diff --git a/mysql-test/main/statistics.result b/mysql-test/main/statistics.result
index ee97ec05391..787f0194532 100644
--- a/mysql-test/main/statistics.result
+++ b/mysql-test/main/statistics.result
@@ -1768,6 +1768,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
 #
 set @save_use_stat_tables=@@use_stat_tables;
 set @save_analyze_sample_percentage=@@analyze_sample_percentage;
+set @save_hist_size=@@histogram_size;
 set session rand_seed1=42;
 set session rand_seed2=62;
 set use_stat_tables=PREFERABLY;
@@ -1800,7 +1801,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram) from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	14.0000	0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
+t1	id	1	17384	0.0000	4.0000	14.0000	0.15705,0.15711,0.21463,0.15705,0.15711,0.15706
 set analyze_sample_percentage=0.1;
 #
 # This query will show an innacurate avg_frequency value.
 #
@@ -1813,7 +1814,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram) from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	111	17026	0.0000	4.0000	1.0047	0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098
+t1	id	111	17026	0.0000	4.0000	10.4739	0.13649,0.14922,0.16921,0.21141,0.18355,0.15012
 #
 # This query will show a better avg_frequency value.
 #
@@ -1826,7 +1827,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram) from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	3.5736	0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090
+t1	id	1	17384	0.0000	4.0000	14.0401	0.15566,0.15590,0.15729,0.21538,0.15790,0.15787
 set analyze_sample_percentage=0;
 #
 # Test self adjusting sampling level.
 #
@@ -1839,7 +1840,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram) from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	7.4523	0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086
+t1	id	1	17384	0.0000	4.0000	13.9812	0.15860,0.15767,0.21515,0.15573,0.15630,0.15654
 #
 # Test record estimation is working properly.
 #
@@ -1858,9 +1859,11 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram) from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	14.0000	0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
+t1	id	1	17384	0.0000	4.0000	14.0000	0.15705,0.15711,0.21463,0.15705,0.15711,0.15706
 explain select * from t1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	229376	
-set use_stat_tables=@save_use_stat_tables;
 drop table t1;
+set analyze_sample_percentage=@save_analyze_sample_percentage;
+set histogram_size=@save_hist_size;
+set use_stat_tables=@save_use_stat_tables;
diff --git a/mysql-test/main/statistics.test b/mysql-test/main/statistics.test
index 84bb7c10059..880f2987b50 100644
--- a/mysql-test/main/statistics.test
+++ b/mysql-test/main/statistics.test
@@ -912,6 +912,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
 --echo #
 set @save_use_stat_tables=@@use_stat_tables;
 set @save_analyze_sample_percentage=@@analyze_sample_percentage;
+set @save_hist_size=@@histogram_size;
 set session rand_seed1=42;
 set session rand_seed2=62;
@@ -986,6 +987,8 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 from mysql.column_stats;
 
 explain select * from t1;
 
-set use_stat_tables=@save_use_stat_tables;
 drop table t1;
+set analyze_sample_percentage=@save_analyze_sample_percentage;
+set histogram_size=@save_hist_size;
+set use_stat_tables=@save_use_stat_tables;
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index 27fab974441..8683368e818 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -325,7 +325,7 @@ public:
 
   inline void init(THD *thd, Field * table_field);
   inline bool add();
-  inline void finish(ha_rows rows);
+  inline void finish(ha_rows rows, double sample_fraction);
   inline void cleanup();
 };
 
@@ -1540,6 +1540,8 @@ class Histogram_builder
   uint curr_bucket;         /* number of the current bucket to be built */
   ulonglong count;          /* number of values retrieved */
   ulonglong count_distinct; /* number of distinct values retrieved */
+  /* number of distinct values that occurred only once */
+  ulonglong count_distinct_single_occurence;
 
 public:
   Histogram_builder(Field *col, uint col_len, ha_rows rows)
@@ -1553,14 +1555,21 @@ public:
     bucket_capacity= (double) records / (hist_width + 1);
     curr_bucket= 0;
     count= 0;
-    count_distinct= 0;
+    count_distinct= 0;
+    count_distinct_single_occurence= 0;
   }
 
-  ulonglong get_count_distinct() { return count_distinct; }
+  ulonglong get_count_distinct() const { return count_distinct; }
+  ulonglong get_count_single_occurence() const
+  {
+    return count_distinct_single_occurence;
+  }
 
   int next(void *elem, element_count elem_cnt)
   {
     count_distinct++;
+    if (elem_cnt == 1)
+      count_distinct_single_occurence++;
     count+= elem_cnt;
     if (curr_bucket == hist_width)
       return 0;
@@ -1574,7 +1583,7 @@
              count > bucket_capacity * (curr_bucket + 1))
       {
         histogram->set_prev_value(curr_bucket);
-        curr_bucket++;
+        curr_bucket++;
       }
     }
     return 0;
@@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
   return hist_builder->next(elem, elem_cnt);
 }
+
+
+static int count_distinct_single_occurence_walk(void *elem,
+                                                element_count count, void *arg)
+{
+  ((ulonglong*)arg)[0]+= 1;
+  if (count == 1)
+    ((ulonglong*)arg)[1]+= 1;
+  return 0;
+}
+
 C_MODE_END
-
-
 /*
   The class Count_distinct_field is a helper class used to calculate
   the number of distinct values for a column. The class employs the
@@ -1611,6 +1629,9 @@ protected:
   Unique *tree;         /* The helper object to contain distinct values */
   uint tree_key_length; /* The length of the keys for the elements of 'tree */
 
+  ulonglong distincts;
+  ulonglong distincts_single_occurence;
+
 public:
 
   Count_distinct_field() {}
@@ -1662,30 +1683,40 @@ public:
   {
     return tree->unique_add(table_field->ptr);
   }
-  
+
   /*
     @brief
     Calculate the number of elements accumulated in the container of 'tree'
  */
-  ulonglong get_value()
+  void walk_tree()
   {
-    ulonglong count;
-    if (tree->elements == 0)
-      return (ulonglong) tree->elements_in_tree();
-    count= 0;
-    tree->walk(table_field->table, count_distinct_walk, (void*) &count);
-    return count;
+    ulonglong counts[2] = {0, 0};
+    tree->walk(table_field->table,
+               count_distinct_single_occurence_walk, counts);
+    distincts= counts[0];
+    distincts_single_occurence= counts[1];
   }
 
   /*
     @brief
-    Build the histogram for the elements accumulated in the container of 'tree'
+    Calculate a histogram of the tree
  */
-  ulonglong get_value_with_histogram(ha_rows rows)
+  void walk_tree_with_histogram(ha_rows rows)
   {
     Histogram_builder hist_builder(table_field, tree_key_length, rows);
     tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder);
-    return hist_builder.get_count_distinct();
+    distincts= hist_builder.get_count_distinct();
+    distincts_single_occurence= hist_builder.get_count_single_occurence();
+  }
+
+  ulonglong get_count_distinct()
+  {
+    return distincts;
+  }
+
+  ulonglong get_count_distinct_single_occurence()
+  {
+    return distincts_single_occurence;
   }
 
   /*
@@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add()
 */
 
 inline
-void Column_statistics_collected::finish(ha_rows rows)
+void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
 {
   double val;
 
@@ -2532,16 +2563,44 @@ void Column_statistics_collected::finish(ha_rows rows)
   }
   if (count_distinct)
   {
-    ulonglong distincts;
     uint hist_size= count_distinct->get_hist_size();
+
+    /* Compute cardinality statistics and optionally histogram. */
     if (hist_size == 0)
-      distincts= count_distinct->get_value();
+      count_distinct->walk_tree();
     else
-      distincts= count_distinct->get_value_with_histogram(rows - nulls);
+      count_distinct->walk_tree_with_histogram(rows - nulls);
+
+    ulonglong distincts= count_distinct->get_count_distinct();
+    ulonglong distincts_single_occurence=
+      count_distinct->get_count_distinct_single_occurence();
+
     if (distincts)
     {
-      val= (double) (rows - nulls) / distincts;
-      set_avg_frequency(val);
+      /*
+        We use the unsmoothed first-order jackknife estimator to estimate
+        the number of distinct values.
+        With a sufficiently large percentage of rows sampled (80%), we revert back
+        to computing the avg_frequency off of the raw data.
+      */
+      if (sample_fraction > 0.8)
+        val= (double) (rows - nulls) / distincts;
+      else
+      {
+        if (nulls == 1)
+          distincts_single_occurence+= 1;
+        if (nulls)
+          distincts+= 1;
+        double fraction_single_occurence=
+          static_cast<double>(distincts_single_occurence) / rows;
+        double total_number_of_rows= rows / sample_fraction;
+        double estimate_total_distincts= total_number_of_rows /
+          (distincts /
+           (1.0 - (1.0 - sample_fraction) * fraction_single_occurence));
+        val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0);
+      }
+
+      set_avg_frequency(val);
       set_not_null(COLUMN_STAT_AVG_FREQUENCY);
     }
     else
@@ -2813,7 +2872,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
         continue;
       bitmap_set_bit(table->write_set, table_field->field_index);
       if (!rc)
-        table_field->collected_stats->finish(rows);
+        table_field->collected_stats->finish(rows, sample_fraction);
       else
        table_field->collected_stats->cleanup();
     }
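
Below is a minimal standalone sketch (not part of the patch) of the avg_frequency estimate that the new sample_fraction <= 0.8 branch in Column_statistics_collected::finish() produces. The variable names (singletons, est_distincts, total_rows) and the input numbers are invented for illustration; only the arithmetic follows the patch, factored through the jackknife distinct-value estimate d / (1 - (1 - q) * f1 / n).

// Standalone illustration of the sampled avg_frequency estimate.
// Inputs are made up; only the arithmetic mirrors the patched finish().
#include <cmath>
#include <cstdio>

int main()
{
  double rows= 22937;            // rows actually read (the sample), n
  double nulls= 0;               // NULL values seen in the sample
  double distincts= 13000;       // distinct non-NULL values in the sample, d
  double singletons= 10368;      // values seen exactly once in the sample, f1
  double sample_fraction= 0.1;   // q, fraction of the table that was read

  // The patch folds NULL in as one extra value before estimating.
  if (nulls == 1)
    singletons+= 1;
  if (nulls)
    distincts+= 1;

  // Unsmoothed first-order jackknife estimate of the number of distinct
  // values in the whole table: d / (1 - (1 - q) * f1 / n).
  double est_distincts= distincts /
    (1.0 - (1.0 - sample_fraction) * singletons / rows);

  // avg_frequency = estimated total rows per distinct value, scaled by the
  // non-NULL fraction and clamped to at least 1.  Algebraically the same as
  // the patch's estimate_total_distincts * (rows - nulls) / rows.
  double total_rows= rows / sample_fraction;
  double val= std::fmax(total_rows / est_distincts * (rows - nulls) / rows, 1.0);

  printf("jackknife distinct estimate: %.0f\n", est_distincts);
  printf("avg_frequency estimate:      %.4f\n", val);
  return 0;
}

With these made-up inputs the sketch prints an avg_frequency of roughly 10.5: the estimator scales the sampled counts up to the whole table instead of reporting the per-sample frequency, which is why the updated statistics.result shows values near the true 14 rather than values near 1.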