Implement avg_frequency unsmoothed jacknife estimator

When sampling data through ANALYZE TABLE, use the estimator to get a better estimation of avg_frequency instead of just using the raw sampled data.
2025-07-29 05:21:33 +03:00 · 2019-02-17 00:44:58 +02:00
parent f0773b7842
commit 764429271d
3 changed files with 96 additions and 31 deletions
--- a/mysql-test/main/statistics.result
+++ b/mysql-test/main/statistics.result
@ -1768,6 +1768,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
 #
 set @save_use_stat_tables=@@use_stat_tables;
 set @save_analyze_sample_percentage=@@analyze_sample_percentage;
 set @save_hist_size=@@histogram_size;
 set session rand_seed1=42;
 set session rand_seed2=62;
 set use_stat_tables=PREFERABLY;
@ -1800,7 +1801,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram)
 from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	14.0000	0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
+t1	id	1	17384	0.0000	4.0000	14.0000	0.15705,0.15711,0.21463,0.15705,0.15711,0.15706
 set analyze_sample_percentage=0.1;
 #
 # This query will show an innacurate avg_frequency value.
@ -1813,7 +1814,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram)
 from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	111	17026	0.0000	4.0000	1.0047	0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098
+t1	id	111	17026	0.0000	4.0000	10.4739	0.13649,0.14922,0.16921,0.21141,0.18355,0.15012
 #
 # This query will show a better avg_frequency value.
 #
@ -1826,7 +1827,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram)
 from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	3.5736	0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090
+t1	id	1	17384	0.0000	4.0000	14.0401	0.15566,0.15590,0.15729,0.21538,0.15790,0.15787
 set analyze_sample_percentage=0;
 #
 # Test self adjusting sampling level.
@ -1839,7 +1840,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram)
 from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	7.4523	0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086
+t1	id	1	17384	0.0000	4.0000	13.9812	0.15860,0.15767,0.21515,0.15573,0.15630,0.15654
 #
 # Test record estimation is working properly.
 #
@ -1858,9 +1859,11 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram)
 from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	14.0000	0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
+t1	id	1	17384	0.0000	4.0000	14.0000	0.15705,0.15711,0.21463,0.15705,0.15711,0.15706
 explain select * from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	229376	
 set use_stat_tables=@save_use_stat_tables;
 drop table t1;
 set analyze_sample_percentage=@save_analyze_sample_percentage;
 set histogram_size=@save_hist_size;
 set use_stat_tables=@save_use_stat_tables;
--- a/mysql-test/main/statistics.test
+++ b/mysql-test/main/statistics.test
@ -912,6 +912,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
 --echo #
 set @save_use_stat_tables=@@use_stat_tables;
 set @save_analyze_sample_percentage=@@analyze_sample_percentage;
 set @save_hist_size=@@histogram_size;
 set session rand_seed1=42;
 set session rand_seed2=62;
@ -986,6 +987,8 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 from mysql.column_stats;
 explain select * from t1;
 set use_stat_tables=@save_use_stat_tables;
 drop table t1;
 set analyze_sample_percentage=@save_analyze_sample_percentage;
 set histogram_size=@save_hist_size;
 set use_stat_tables=@save_use_stat_tables;
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@ -325,7 +325,7 @@ public:
  inline void init(THD *thd, Field * table_field);
  inline bool add();
-  inline void finish(ha_rows rows); 
+  inline void finish(ha_rows rows, double sample_fraction);
  inline void cleanup();
 };
@ -1540,6 +1540,8 @@ class Histogram_builder
  uint curr_bucket;        /* number of the current bucket to be built     */
  ulonglong count;         /* number of values retrieved                   */
  ulonglong count_distinct;    /* number of distinct values retrieved      */
  /* number of distinct values that occured only once  */
  ulonglong count_distinct_single_occurence;
 public: 
  Histogram_builder(Field *col, uint col_len, ha_rows rows)
@ -1553,14 +1555,21 @@ public:
    bucket_capacity= (double) records / (hist_width + 1);
    curr_bucket= 0;
    count= 0;
-    count_distinct= 0;    
+    count_distinct= 0;
    count_distinct_single_occurence= 0;
  }
-  ulonglong get_count_distinct() { return count_distinct; }
+  ulonglong get_count_distinct() const { return count_distinct; }
  ulonglong get_count_single_occurence() const
  {
    return count_distinct_single_occurence;
  }
  int next(void *elem, element_count elem_cnt)
  {
    count_distinct++;
    if (elem_cnt == 1)
      count_distinct_single_occurence++;
    count+= elem_cnt;
    if (curr_bucket == hist_width)
      return 0;
@ -1574,7 +1583,7 @@ public:
             count > bucket_capacity * (curr_bucket + 1))
      {
        histogram->set_prev_value(curr_bucket);
-	curr_bucket++;
+        curr_bucket++;
      }
    }
    return 0;
@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
  return hist_builder->next(elem, elem_cnt);
 }
 static int count_distinct_single_occurence_walk(void *elem,
                                                element_count count, void *arg)
 {
  ((ulonglong*)arg)[0]+= 1;
  if (count == 1)
    ((ulonglong*)arg)[1]+= 1;
  return 0;
 }
 C_MODE_END
 /*
  The class Count_distinct_field is a helper class used to calculate
  the number of distinct values for a column. The class employs the
@ -1611,6 +1629,9 @@ protected:
  Unique *tree;       /* The helper object to contain distinct values */
  uint tree_key_length; /* The length of the keys for the elements of 'tree */
  ulonglong distincts;
  ulonglong distincts_single_occurence;
 public:
  Count_distinct_field() {}
@ -1662,30 +1683,40 @@ public:
  {
    return tree->unique_add(table_field->ptr);
  }
-  
+
  /*
    @brief
    Calculate the number of elements accumulated in the container of 'tree'
  */
-  ulonglong get_value()
+  void walk_tree()
  {
-    ulonglong count;
+    ulonglong counts[2] = {0, 0};
-    if (tree->elements == 0)
+    tree->walk(table_field->table,
-      return (ulonglong) tree->elements_in_tree();
+               count_distinct_single_occurence_walk, counts);
-    count= 0;  
+    distincts= counts[0];
-    tree->walk(table_field->table, count_distinct_walk, (void*) &count);
+    distincts_single_occurence= counts[1];
    return count;
  }
  /*
    @brief
-    Build the histogram for the elements accumulated in the container of 'tree'
+    Calculate a histogram of the tree
  */
-  ulonglong get_value_with_histogram(ha_rows rows)
+   void walk_tree_with_histogram(ha_rows rows)
  {
    Histogram_builder hist_builder(table_field, tree_key_length, rows);
    tree->walk(table_field->table,  histogram_build_walk, (void *) &hist_builder);
-    return hist_builder.get_count_distinct();
+    distincts= hist_builder.get_count_distinct();
    distincts_single_occurence= hist_builder.get_count_single_occurence();
  }
  ulonglong get_count_distinct()
  {
    return distincts;
  }
  ulonglong get_count_distinct_single_occurence()
  {
    return distincts_single_occurence;
  }
  /*
@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add()
 */
 inline
-void Column_statistics_collected::finish(ha_rows rows)
+void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
 {
  double val;
@ -2532,16 +2563,44 @@ void Column_statistics_collected::finish(ha_rows rows)
  }
  if (count_distinct)
  {
    ulonglong distincts;
    uint hist_size= count_distinct->get_hist_size();
    /* Compute cardinality statistics and optionally histogram. */
    if (hist_size == 0)
-      distincts= count_distinct->get_value();
+      count_distinct->walk_tree();
    else
-      distincts= count_distinct->get_value_with_histogram(rows - nulls);
+      count_distinct->walk_tree_with_histogram(rows - nulls);
    ulonglong distincts= count_distinct->get_count_distinct();
    ulonglong distincts_single_occurence=
      count_distinct->get_count_distinct_single_occurence();
    if (distincts)
    {
-      val= (double) (rows - nulls) / distincts;
+      /*
-      set_avg_frequency(val); 
+       We use the unsmoothed first-order jackknife estimator" to estimate
       the number of distinct values.
       With a sufficient large percentage of rows sampled (80%), we revert back
       to computing the avg_frequency off of the raw data.
      */
      if (sample_fraction > 0.8)
        val= (double) (rows - nulls) / distincts;
      else
      {
        if (nulls == 1)
          distincts_single_occurence+= 1;
        if (nulls)
          distincts+= 1;
        double fraction_single_occurence=
          static_cast<double>(distincts_single_occurence) / rows;
        double total_number_of_rows= rows / sample_fraction;
        double estimate_total_distincts= total_number_of_rows /
                (distincts /
                 (1.0 - (1.0 - sample_fraction) * fraction_single_occurence));
        val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0);
      }
      set_avg_frequency(val);
      set_not_null(COLUMN_STAT_AVG_FREQUENCY);
    }
    else
@ -2813,7 +2872,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
      continue;
    bitmap_set_bit(table->write_set, table_field->field_index); 
    if (!rc)
-      table_field->collected_stats->finish(rows);
+      table_field->collected_stats->finish(rows, sample_fraction);
    else
      table_field->collected_stats->cleanup();
  }