MDEV-27229: Estimation for filtered rows less precise ... #5
Fix special handling for values that are right next to buckets with ndv=1.
committed by Sergei Petrunia
parent 67d4d0426f
commit 531dd708ef
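In outline, the patch changes find_bucket() to report how the lookup value compares to the chosen bucket's start value (an integer cmp that is negative, zero, or positive) instead of a plain "equal" flag, and point_selectivity()/range_selectivity() use that to special-case buckets holding a single distinct value (ndv=1). Below is a minimal standalone sketch of that decision for the lower range endpoint; the names Bucket and lower_bound_fraction are illustrative and not part of the patch.

// A simplified model of the ndv==1 handling introduced by this patch;
// not the MariaDB sources.
#include <cassert>

struct Bucket
{
  int ndv;   // number of distinct values stored in the bucket
};

// endp_cmp mirrors the new find_bucket() output parameter:
//   <0  lookup value is below the bucket's start value (possible only for bucket #0)
//    0  lookup value equals the bucket's start value
//   >0  lookup value is above the bucket's start value
static double lower_bound_fraction(const Bucket &b, int endp_cmp,
                                   bool exclusive_endp)
{
  if (b.ndv != 1)
    return 0.5;   // placeholder: the real code interpolates with position_in_interval()
  if (endp_cmp < 0)
    return 0.0;   // the whole single-value bucket lies inside the range
  if (endp_cmp > 0)
    return 1.0;   // the whole single-value bucket lies below the range
  // endp_cmp == 0: "col > const" skips the bucket, "col >= const" keeps it
  return exclusive_endp ? 1.0 : 0.0;
}

int main()
{
  Bucket singleton{1};
  assert(lower_bound_fraction(singleton, 0, /*exclusive_endp=*/true)  == 1.0);
  assert(lower_bound_fraction(singleton, 0, /*exclusive_endp=*/false) == 0.0);
  return 0;
}

The real code then scales this fraction into the bucket's share of rows, min= left_fract + sel * (buckets[idx].cum_fract - left_fract), as the diff below shows.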
@@ -4631,12 +4631,12 @@ test t1_json a a-0 a-9 0.0000 3.0000 1.0000 10 JSON_HB {
 }
 explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 68.71 Using where
+1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 60.00 Using where
 Warnings:
 Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz'
 analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
-1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 68.71 60.00 Using where
+1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 60.00 60.00 Using where
 explain extended select * from t1_json where a < 'b-1a';
 id select_type table type possible_keys key key_len ref rows filtered Extra
 1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 100.00 Using where
@@ -8014,7 +8014,7 @@ test.t1 analyze status OK
 analyze
 select c from t1 where c > '1';
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 80.47 75.00 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where
 drop table t1;
 #
 # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
@@ -8211,3 +8211,33 @@ analyze select COUNT(*) FROM t1 WHERE a < 'a';
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
 1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 50.00 50.00 Using where
 drop table t1;
+#
+# MDEV-27229: Estimation for filtered rows less precise ... #5
+#
+create table t1 (id int, a varchar(8));
+insert into t1 select seq, 'bar' from seq_1_to_100;
+insert into t1 select id, 'qux' from t1;
+set histogram_type=JSON_HB;
+analyze table t1 persistent for all;
+Table Op Msg_type Msg_text
+test.t1 analyze status Engine-independent statistics collected
+test.t1 analyze status OK
+analyze select COUNT(*) FROM t1 WHERE a > 'foo';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a >='aaa';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a > 'bar';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a >='bar';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a <='bar';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
+drop table t1;
@@ -390,3 +390,29 @@ analyze table t1 persistent for all;
 analyze select COUNT(*) FROM t1 WHERE a <> 'a';
 analyze select COUNT(*) FROM t1 WHERE a < 'a';
 drop table t1;
+
+--echo #
+--echo # MDEV-27229: Estimation for filtered rows less precise ... #5
+--echo #
+create table t1 (id int, a varchar(8));
+insert into t1 select seq, 'bar' from seq_1_to_100;
+insert into t1 select id, 'qux' from t1;
+
+set histogram_type=JSON_HB;
+analyze table t1 persistent for all;
+analyze select COUNT(*) FROM t1 WHERE a > 'foo';
+
+analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
+analyze select COUNT(*) FROM t1 WHERE a >='aaa';
+
+analyze select COUNT(*) FROM t1 WHERE a > 'bar';
+analyze select COUNT(*) FROM t1 WHERE a >='bar';
+
+# Can enable these after get_avg_frequency issue is resolved:
+# analyze select COUNT(*) FROM t1 WHERE a < 'aaa';
+# analyze select COUNT(*) FROM t1 WHERE a <='aaa';
+# analyze select COUNT(*) FROM t1 WHERE a < 'bar';
+
+analyze select COUNT(*) FROM t1 WHERE a <='bar';
+
+drop table t1;
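For a quick sanity check of the expected numbers: t1 ends up with 100 'bar' rows and 100 'qux' rows, so if the JSON_HB histogram keeps 'bar' in a bucket with ndv=1, predicates such as a > 'bar' and a <= 'bar' should each qualify exactly one of the two value groups, i.e. filtered = 50.00, while a >= 'bar', a > 'aaa' and a >= 'aaa' keep all 200 rows (100.00). These are the filtered/r_filtered values recorded in the expected-output hunk above.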
@@ -910,12 +910,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,

   // If the value is outside of the histogram's range, this will "clip" it to
   // first or last bucket.
-  bool equal;
-  int idx= find_bucket(field, key, &equal);
+  int endp_cmp;
+  int idx= find_bucket(field, key, &endp_cmp);

   double sel;

-  if (buckets[idx].ndv == 1 && !equal)
+  if (buckets[idx].ndv == 1 && (endp_cmp!=0))
   {
     /*
       The bucket has a single value and it doesn't match! Return a very
@@ -979,22 +979,27 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,

     // Find the leftmost bucket that contains the lookup value.
     // (If the lookup value is to the left of all buckets, find bucket #0)
-    bool equal;
-    int idx= find_bucket(field, min_key, &equal);
-    if (equal && exclusive_endp && buckets[idx].ndv==1 &&
-        idx < (int)buckets.size()-1)
+    int endp_cmp;
+    int idx= find_bucket(field, min_key, &endp_cmp);
+    double sel;
+    // Special handling for buckets with ndv=1:
+    if (buckets[idx].ndv == 1)
     {
-      /*
-        The range is "col > $CONST" and we've found a bucket that contains
-        only the value $CONST. Move to the next bucket.
-      */
-      idx++;
+      if (endp_cmp < 0)
+        sel= 0.0;
+      else if (endp_cmp > 0)
+        sel= 1.0;
+      else // endp_cmp == 0.0
+        sel= (exclusive_endp)? 1.0 : 0.0;
+    }
+    else
+    {
+      sel= position_in_interval(field, min_key, min_key_len,
+                                buckets[idx].start_value,
+                                get_end_value(idx));
     }
     double left_fract= get_left_fract(idx);
-    double sel= position_in_interval(field, min_key, min_key_len,
-                                     buckets[idx].start_value,
-                                     get_end_value(idx));

     min= left_fract + sel * (buckets[idx].cum_fract - left_fract);
   }
   else
@@ -1012,28 +1017,35 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
       max_key++;
       max_key_len--;
     }
-    bool equal;
-    int idx= find_bucket(field, max_key, &equal);
+    int endp_cmp;
+    int idx= find_bucket(field, max_key, &endp_cmp);

-    if (equal && !inclusive_endp && idx > 0)
+    if ((endp_cmp == 0) && !inclusive_endp)
     {
       /*
         The range is "col < $CONST" and we've found a bucket starting with
-        $CONST. Move to the previous bucket.
+        $CONST.
       */
-      idx--;
-      equal= false;
-    }
-    double left_fract= get_left_fract(idx);
-
-    double sel;
-    /* Special handling for singleton buckets */
-    if (buckets[idx].ndv == 1 && equal)
-    {
-      if (inclusive_endp)
-        sel= 1.0;
+      if (idx > 0)
+      {
+        // Move to the previous bucket
+        endp_cmp= 1;
+        idx--;
+      }
       else
+        endp_cmp= -1;
+    }
+    double sel;
+
+    // Special handling for buckets with ndv=1:
+    if (buckets[idx].ndv == 1)
+    {
+      if (endp_cmp < 0)
         sel= 0.0;
+      else if (endp_cmp > 0)
+        sel= 1.0;
+      else // endp_cmp == 0.0
+        sel= inclusive_endp? 1.0 : 0.0;
     }
     else
     {
@@ -1041,13 +1053,13 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
                                 buckets[idx].start_value,
                                 get_end_value(idx));
     }
+    double left_fract= get_left_fract(idx);
     max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
   }
   else
     max= 1.0;

-  double sel = max - min;
-  return sel;
+  return max - min;
 }

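As a toy illustration of how the two bounds combine (the numbers are made up): if the lower endpoint falls into a bucket spanning cumulative fractions 0.25..0.50 at relative position sel= 0.4, then min= 0.25 + 0.4 * (0.50 - 0.25) = 0.35; if the upper endpoint similarly resolves to max= 0.80, the function returns max - min = 0.45, i.e. an estimated 45% of the rows.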
@@ -1057,25 +1069,37 @@ void Histogram_json_hb::serialize(Field *field)
 }


+static int SGN(int x)
+{
+  if (!x)
+    return 0;
+  return (x < 0)? -1 : 1;
+}
+

 /*
   @brief
   Find the leftmost histogram bucket such that "lookup_val >= start_value".

   @param field        Field object (used to do value comparisons)
   @param lookup_val   The lookup value in KeyTupleFormat.
-  @param equal        OUT TRUE<=> the found bucket has left_bound=lookup_val
+  @param cmp          OUT How the lookup_val compares to found_bucket.left_bound:
+                          0  - lookup_val == bucket.left_bound
+                          >0 - lookup_val > bucket.left_bound (the most typical)
+                          <0 - lookup_val < bucket.left_bound. This can only happen
+                               for the first bucket, for all other buckets we would just
+                               pick the previous bucket and have cmp>=0.
   @return
      The bucket index
 */

 int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
-                                   bool *equal)
+                                   int *cmp)
 {
   int res;
   int low= 0;
   int high= (int)buckets.size() - 1;
-  *equal= false;
+  *cmp= 1; // By default, (bucket[retval].start_value < *lookup_val)

   while (low + 1 < high)
   {
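For illustration only, here is a toy model of that contract using plain std::string bucket start values; toy_find_bucket is a hypothetical helper, not part of the patch. It picks the bucket the lookup value falls into (clipping to bucket #0 when the value precedes all buckets) and reports the comparison outcome the way the new *cmp output parameter does.

// Toy model of the find_bucket() contract described above; not the MariaDB code.
#include <cassert>
#include <string>
#include <vector>

static int toy_find_bucket(const std::vector<std::string> &starts,
                           const std::string &lookup, int *cmp)
{
  int idx= 0;
  for (size_t i= 0; i < starts.size(); i++)
  {
    if (starts[i] <= lookup)
      idx= (int)i;     // last bucket that starts at or below the lookup value
    else
      break;
  }
  *cmp= lookup.compare(starts[idx]);  // negative only possible when idx == 0
  return idx;
}

int main()
{
  std::vector<std::string> starts= {"a", "bar", "qux"};
  int cmp;
  assert(toy_find_bucket(starts, "bar", &cmp) == 1 && cmp == 0);  // exact bucket start
  assert(toy_find_bucket(starts, "foo", &cmp) == 1 && cmp > 0);   // inside bucket #1
  assert(toy_find_bucket(starts, "A", &cmp) == 0 && cmp < 0);     // before all buckets
  return 0;
}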
@@ -1083,7 +1107,7 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
     res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
     if (!res)
     {
-      *equal= true;
+      *cmp= res;
       low= middle;
       goto end;
     }
@@ -1104,31 +1128,44 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
   */
   if (low == 0)
   {
-    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
-    if (!res)
-      *equal= true;
-    else if (res < 0) // buckets[0] < lookup_val
+    res= field->key_cmp(lookup_val, (uchar*)buckets[0].start_value.data());
+    if (res <= 0)
+      *cmp= res;
+    else // res>0, lookup_val > buckets[0].start_value
     {
-      res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
-      if (!res)
-        *equal= true;
-      if (res <= 0) // buckets[high] <= lookup_val
+      res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
+      if (res >= 0) // lookup_val >= buckets[high].start_value
+      {
+        // Move to that bucket
         low= high;
+        *cmp= res;
+      }
+      else
+        *cmp= 1;
     }
   }
   else if (high == (int)buckets.size() - 1)
   {
-    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
-    if (!res)
-      *equal= true;
-    if (res <= 0)
+    res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
+    if (res >= 0)
+    {
+      // Ok the value is in the last bucket.
+      *cmp= res;
       low= high;
+    }
+    else
+    {
+      // The value is in the 'low' bucket.
+      res= field->key_cmp(lookup_val, (uchar*)buckets[low].start_value.data());
+      *cmp= res;
+    }
   }

 end:
-  // Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
-  DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
-                                          lookup_val)));
+  // Verification: *cmp has correct value
+  DBUG_ASSERT(SGN(*cmp) ==
+              SGN(field->key_cmp(lookup_val,
+                                 (uchar*)buckets[low].start_value.data())));
   // buckets[low] <= lookup_val, with one exception of the first bucket.
   DBUG_ASSERT(low == 0 ||
               field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
|
@ -144,6 +144,6 @@ private:
|
|||||||
|
|
||||||
double get_left_fract(int idx);
|
double get_left_fract(int idx);
|
||||||
std::string& get_end_value(int idx);
|
std::string& get_end_value(int idx);
|
||||||
int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
|
int find_bucket(const Field *field, const uchar *lookup_val, int *cmp);
|
||||||
};
|
};
|
||||||
|
|
||||||