MDEV-27229: Estimation for filtered rows less precise ... #5
Fix special handling for values that are right next to buckets with ndv=1.
committed by Sergei Petrunia
parent 67d4d0426f
commit 531dd708ef
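In outline, the patch changes find_bucket() to report how the lookup value compares to the chosen bucket's start value (an integer cmp that is negative, zero, or positive) instead of a plain "equal" flag, and point_selectivity()/range_selectivity() use that to special-case buckets holding a single distinct value (ndv=1). Below is a minimal standalone sketch of that decision for the lower range endpoint; the names Bucket and lower_bound_fraction are illustrative and not part of the patch.

// A simplified model of the ndv==1 handling introduced by this patch;
// not the MariaDB sources.
#include <cassert>

struct Bucket
{
  int ndv;   // number of distinct values stored in the bucket
};

// endp_cmp mirrors the new find_bucket() output parameter:
//   <0  lookup value is below the bucket's start value (possible only for bucket #0)
//    0  lookup value equals the bucket's start value
//   >0  lookup value is above the bucket's start value
static double lower_bound_fraction(const Bucket &b, int endp_cmp,
                                   bool exclusive_endp)
{
  if (b.ndv != 1)
    return 0.5;   // placeholder: the real code interpolates with position_in_interval()
  if (endp_cmp < 0)
    return 0.0;   // the whole single-value bucket lies inside the range
  if (endp_cmp > 0)
    return 1.0;   // the whole single-value bucket lies below the range
  // endp_cmp == 0: "col > const" skips the bucket, "col >= const" keeps it
  return exclusive_endp ? 1.0 : 0.0;
}

int main()
{
  Bucket singleton{1};
  assert(lower_bound_fraction(singleton, 0, /*exclusive_endp=*/true)  == 1.0);
  assert(lower_bound_fraction(singleton, 0, /*exclusive_endp=*/false) == 0.0);
  return 0;
}

The real code then scales this fraction into the bucket's share of rows, min= left_fract + sel * (buckets[idx].cum_fract - left_fract), as the diff below shows.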
@@ -4631,12 +4631,12 @@ test t1_json a a-0 a-9 0.0000 3.0000 1.0000 10 JSON_HB {
 }
 explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 68.71 Using where
+1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 60.00 Using where
 Warnings:
 Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz'
 analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
-1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 68.71 60.00 Using where
+1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 60.00 60.00 Using where
 explain extended select * from t1_json where a < 'b-1a';
 id select_type table type possible_keys key key_len ref rows filtered Extra
 1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 100.00 Using where
@@ -8014,7 +8014,7 @@ test.t1 analyze status OK
 analyze
 select c from t1 where c > '1';
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 80.47 75.00 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where
 drop table t1;
 #
 # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
@@ -8211,3 +8211,33 @@ analyze select COUNT(*) FROM t1 WHERE a < 'a';
 id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
 1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 50.00 50.00 Using where
 drop table t1;
+#
+# MDEV-27229: Estimation for filtered rows less precise ... #5
+#
+create table t1 (id int, a varchar(8));
+insert into t1 select seq, 'bar' from seq_1_to_100;
+insert into t1 select id, 'qux' from t1;
+set histogram_type=JSON_HB;
+analyze table t1 persistent for all;
+Table Op Msg_type Msg_text
+test.t1 analyze status Engine-independent statistics collected
+test.t1 analyze status OK
+analyze select COUNT(*) FROM t1 WHERE a > 'foo';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a >='aaa';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a > 'bar';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a >='bar';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
+analyze select COUNT(*) FROM t1 WHERE a <='bar';
+id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
+drop table t1;
@@ -390,3 +390,29 @@ analyze table t1 persistent for all;
 analyze select COUNT(*) FROM t1 WHERE a <> 'a';
 analyze select COUNT(*) FROM t1 WHERE a < 'a';
 drop table t1;
+
+--echo #
+--echo # MDEV-27229: Estimation for filtered rows less precise ... #5
+--echo #
+create table t1 (id int, a varchar(8));
+insert into t1 select seq, 'bar' from seq_1_to_100;
+insert into t1 select id, 'qux' from t1;
+
+set histogram_type=JSON_HB;
+analyze table t1 persistent for all;
+analyze select COUNT(*) FROM t1 WHERE a > 'foo';
+
+analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
+analyze select COUNT(*) FROM t1 WHERE a >='aaa';
+
+analyze select COUNT(*) FROM t1 WHERE a > 'bar';
+analyze select COUNT(*) FROM t1 WHERE a >='bar';
+
+# Can enable these after get_avg_frequency issue is resolved:
+# analyze select COUNT(*) FROM t1 WHERE a < 'aaa';
+# analyze select COUNT(*) FROM t1 WHERE a <='aaa';
+# analyze select COUNT(*) FROM t1 WHERE a < 'bar';
+
+analyze select COUNT(*) FROM t1 WHERE a <='bar';
+
+drop table t1;
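For a quick sanity check of the expected numbers: t1 ends up with 100 'bar' rows and 100 'qux' rows, so if the JSON_HB histogram keeps 'bar' in a bucket with ndv=1, predicates such as a > 'bar' and a <= 'bar' should each qualify exactly one of the two value groups, i.e. filtered = 50.00, while a >= 'bar', a > 'aaa' and a >= 'aaa' keep all 200 rows (100.00). These are the filtered/r_filtered values recorded in the expected-output hunk above.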
@@ -910,12 +910,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,

   // If the value is outside of the histogram's range, this will "clip" it to
   // first or last bucket.
-  bool equal;
-  int idx= find_bucket(field, key, &equal);
+  int endp_cmp;
+  int idx= find_bucket(field, key, &endp_cmp);

   double sel;

-  if (buckets[idx].ndv == 1 && !equal)
+  if (buckets[idx].ndv == 1 && (endp_cmp!=0))
   {
     /*
       The bucket has a single value and it doesn't match! Return a very
@@ -979,22 +979,27 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,

     // Find the leftmost bucket that contains the lookup value.
     // (If the lookup value is to the left of all buckets, find bucket #0)
-    bool equal;
-    int idx= find_bucket(field, min_key, &equal);
-    if (equal && exclusive_endp && buckets[idx].ndv==1 &&
-        idx < (int)buckets.size()-1)
+    int endp_cmp;
+    int idx= find_bucket(field, min_key, &endp_cmp);
+    double sel;
+    // Special handling for buckets with ndv=1:
+    if (buckets[idx].ndv == 1)
     {
-      /*
-        The range is "col > $CONST" and we've found a bucket that contains
-        only the value $CONST. Move to the next bucket.
-      */
-      idx++;
+      if (endp_cmp < 0)
+        sel= 0.0;
+      else if (endp_cmp > 0)
+        sel= 1.0;
+      else // endp_cmp == 0.0
+        sel= (exclusive_endp)? 1.0 : 0.0;
+    }
+    else
+    {
+      sel= position_in_interval(field, min_key, min_key_len,
+                                buckets[idx].start_value,
+                                get_end_value(idx));
     }
     double left_fract= get_left_fract(idx);
-    double sel= position_in_interval(field, min_key, min_key_len,
-                                     buckets[idx].start_value,
-                                     get_end_value(idx));

     min= left_fract + sel * (buckets[idx].cum_fract - left_fract);
   }
   else
@@ -1012,28 +1017,35 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
       max_key++;
       max_key_len--;
     }
-    bool equal;
-    int idx= find_bucket(field, max_key, &equal);
+    int endp_cmp;
+    int idx= find_bucket(field, max_key, &endp_cmp);

-    if (equal && !inclusive_endp && idx > 0)
+    if ((endp_cmp == 0) && !inclusive_endp)
     {
       /*
         The range is "col < $CONST" and we've found a bucket starting with
-        $CONST. Move to the previous bucket.
+        $CONST.
       */
-      idx--;
-      equal= false;
-    }
-    double left_fract= get_left_fract(idx);
-
-    double sel;
-    /* Special handling for singleton buckets */
-    if (buckets[idx].ndv == 1 && equal)
-    {
-      if (inclusive_endp)
-        sel= 1.0;
+      if (idx > 0)
+      {
+        // Move to the previous bucket
+        endp_cmp= 1;
+        idx--;
+      }
       else
+        endp_cmp= -1;
+    }
+    double sel;
+
+    // Special handling for buckets with ndv=1:
+    if (buckets[idx].ndv == 1)
+    {
+      if (endp_cmp < 0)
         sel= 0.0;
+      else if (endp_cmp > 0)
+        sel= 1.0;
+      else // endp_cmp == 0.0
+        sel= inclusive_endp? 1.0 : 0.0;
     }
     else
     {
@@ -1041,13 +1053,13 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
                                 buckets[idx].start_value,
                                 get_end_value(idx));
     }
+    double left_fract= get_left_fract(idx);
     max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
   }
   else
     max= 1.0;

-  double sel = max - min;
-  return sel;
+  return max - min;
 }

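As a toy illustration of how the two bounds combine (the numbers are made up): if the lower endpoint falls into a bucket spanning cumulative fractions 0.25..0.50 at relative position sel= 0.4, then min= 0.25 + 0.4 * (0.50 - 0.25) = 0.35; if the upper endpoint similarly resolves to max= 0.80, the function returns max - min = 0.45, i.e. an estimated 45% of the rows.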
@@ -1057,25 +1069,37 @@ void Histogram_json_hb::serialize(Field *field)
 }


+static int SGN(int x)
+{
+  if (!x)
+    return 0;
+  return (x < 0)? -1 : 1;
+}
+

 /*
   @brief
   Find the leftmost histogram bucket such that "lookup_val >= start_value".

   @param field        Field object (used to do value comparisons)
   @param lookup_val   The lookup value in KeyTupleFormat.
-  @param equal        OUT TRUE<=> the found bucket has left_bound=lookup_val
+  @param cmp          OUT How the lookup_val compares to found_bucket.left_bound:
+                          0  - lookup_val == bucket.left_bound
+                          >0 - lookup_val > bucket.left_bound (the most typical)
+                          <0 - lookup_val < bucket.left_bound. This can only happen
+                               for the first bucket, for all other buckets we would just
+                               pick the previous bucket and have cmp>=0.
   @return
      The bucket index
 */

 int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
-                                   bool *equal)
+                                   int *cmp)
 {
   int res;
   int low= 0;
   int high= (int)buckets.size() - 1;
-  *equal= false;
+  *cmp= 1; // By default, (bucket[retval].start_value < *lookup_val)

   while (low + 1 < high)
   {
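For illustration only, here is a toy model of that contract using plain std::string bucket start values; toy_find_bucket is a hypothetical helper, not part of the patch. It picks the bucket the lookup value falls into (clipping to bucket #0 when the value precedes all buckets) and reports the comparison outcome the way the new *cmp output parameter does.

// Toy model of the find_bucket() contract described above; not the MariaDB code.
#include <cassert>
#include <string>
#include <vector>

static int toy_find_bucket(const std::vector<std::string> &starts,
                           const std::string &lookup, int *cmp)
{
  int idx= 0;
  for (size_t i= 0; i < starts.size(); i++)
  {
    if (starts[i] <= lookup)
      idx= (int)i;     // last bucket that starts at or below the lookup value
    else
      break;
  }
  *cmp= lookup.compare(starts[idx]);  // negative only possible when idx == 0
  return idx;
}

int main()
{
  std::vector<std::string> starts= {"a", "bar", "qux"};
  int cmp;
  assert(toy_find_bucket(starts, "bar", &cmp) == 1 && cmp == 0);  // exact bucket start
  assert(toy_find_bucket(starts, "foo", &cmp) == 1 && cmp > 0);   // inside bucket #1
  assert(toy_find_bucket(starts, "A", &cmp) == 0 && cmp < 0);     // before all buckets
  return 0;
}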
@@ -1083,7 +1107,7 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
     res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
     if (!res)
     {
-      *equal= true;
+      *cmp= res;
       low= middle;
       goto end;
     }
@@ -1104,31 +1128,44 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
   */
   if (low == 0)
   {
-    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
-    if (!res)
-      *equal= true;
-    else if (res < 0) // buckets[0] < lookup_val
+    res= field->key_cmp(lookup_val, (uchar*)buckets[0].start_value.data());
+    if (res <= 0)
+      *cmp= res;
+    else // res>0, lookup_val > buckets[0].start_value
     {
-      res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
-      if (!res)
-        *equal= true;
-      if (res <= 0) // buckets[high] <= lookup_val
+      res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
+      if (res >= 0) // lookup_val >= buckets[high].start_value
+      {
+        // Move to that bucket
         low= high;
+        *cmp= res;
+      }
+      else
+        *cmp= 1;
     }
   }
   else if (high == (int)buckets.size() - 1)
   {
-    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
-    if (!res)
-      *equal= true;
-    if (res <= 0)
+    res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
+    if (res >= 0)
+    {
+      // Ok the value is in the last bucket.
+      *cmp= res;
       low= high;
+    }
+    else
+    {
+      // The value is in the 'low' bucket.
+      res= field->key_cmp(lookup_val, (uchar*)buckets[low].start_value.data());
+      *cmp= res;
+    }
   }

 end:
-  // Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
-  DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
-                                          lookup_val)));
+  // Verification: *cmp has correct value
+  DBUG_ASSERT(SGN(*cmp) ==
+              SGN(field->key_cmp(lookup_val,
+                                 (uchar*)buckets[low].start_value.data())));
   // buckets[low] <= lookup_val, with one exception of the first bucket.
   DBUG_ASSERT(low == 0 ||
               field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
|
@ -144,6 +144,6 @@ private:
|
|||||||
|
|
||||||
double get_left_fract(int idx);
|
double get_left_fract(int idx);
|
||||||
std::string& get_end_value(int idx);
|
std::string& get_end_value(int idx);
|
||||||
int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
|
int find_bucket(const Field *field, const uchar *lookup_val, int *cmp);
|
||||||
};
|
};
|
||||||
|
|
||||||