mirror of
https://github.com/MariaDB/server.git
synced 2025-07-29 05:21:33 +03:00
Implement avg_frequency unsmoothed jacknife estimator
When sampling data through ANALYZE TABLE, use the estimator to get a better estimation of avg_frequency instead of just using the raw sampled data.
This commit is contained in:
@ -1768,6 +1768,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
|
|||||||
#
|
#
|
||||||
set @save_use_stat_tables=@@use_stat_tables;
|
set @save_use_stat_tables=@@use_stat_tables;
|
||||||
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
|
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
|
||||||
|
set @save_hist_size=@@histogram_size;
|
||||||
set session rand_seed1=42;
|
set session rand_seed1=42;
|
||||||
set session rand_seed2=62;
|
set session rand_seed2=62;
|
||||||
set use_stat_tables=PREFERABLY;
|
set use_stat_tables=PREFERABLY;
|
||||||
@ -1800,7 +1801,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
|
|||||||
DECODE_HISTOGRAM(hist_type, histogram)
|
DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
from mysql.column_stats;
|
from mysql.column_stats;
|
||||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
|
t1 id 1 17384 0.0000 4.0000 14.0000 0.15705,0.15711,0.21463,0.15705,0.15711,0.15706
|
||||||
set analyze_sample_percentage=0.1;
|
set analyze_sample_percentage=0.1;
|
||||||
#
|
#
|
||||||
# This query will show an innacurate avg_frequency value.
|
# This query will show an innacurate avg_frequency value.
|
||||||
@ -1813,7 +1814,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
|
|||||||
DECODE_HISTOGRAM(hist_type, histogram)
|
DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
from mysql.column_stats;
|
from mysql.column_stats;
|
||||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
t1 id 111 17026 0.0000 4.0000 1.0047 0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098
|
t1 id 111 17026 0.0000 4.0000 10.4739 0.13649,0.14922,0.16921,0.21141,0.18355,0.15012
|
||||||
#
|
#
|
||||||
# This query will show a better avg_frequency value.
|
# This query will show a better avg_frequency value.
|
||||||
#
|
#
|
||||||
@ -1826,7 +1827,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
|
|||||||
DECODE_HISTOGRAM(hist_type, histogram)
|
DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
from mysql.column_stats;
|
from mysql.column_stats;
|
||||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
t1 id 1 17384 0.0000 4.0000 3.5736 0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090
|
t1 id 1 17384 0.0000 4.0000 14.0401 0.15566,0.15590,0.15729,0.21538,0.15790,0.15787
|
||||||
set analyze_sample_percentage=0;
|
set analyze_sample_percentage=0;
|
||||||
#
|
#
|
||||||
# Test self adjusting sampling level.
|
# Test self adjusting sampling level.
|
||||||
@ -1839,7 +1840,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
|
|||||||
DECODE_HISTOGRAM(hist_type, histogram)
|
DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
from mysql.column_stats;
|
from mysql.column_stats;
|
||||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
t1 id 1 17384 0.0000 4.0000 7.4523 0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086
|
t1 id 1 17384 0.0000 4.0000 13.9812 0.15860,0.15767,0.21515,0.15573,0.15630,0.15654
|
||||||
#
|
#
|
||||||
# Test record estimation is working properly.
|
# Test record estimation is working properly.
|
||||||
#
|
#
|
||||||
@ -1858,9 +1859,11 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
|
|||||||
DECODE_HISTOGRAM(hist_type, histogram)
|
DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
from mysql.column_stats;
|
from mysql.column_stats;
|
||||||
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram)
|
||||||
t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086
|
t1 id 1 17384 0.0000 4.0000 14.0000 0.15705,0.15711,0.21463,0.15705,0.15711,0.15706
|
||||||
explain select * from t1;
|
explain select * from t1;
|
||||||
id select_type table type possible_keys key key_len ref rows Extra
|
id select_type table type possible_keys key key_len ref rows Extra
|
||||||
1 SIMPLE t1 ALL NULL NULL NULL NULL 229376
|
1 SIMPLE t1 ALL NULL NULL NULL NULL 229376
|
||||||
set use_stat_tables=@save_use_stat_tables;
|
|
||||||
drop table t1;
|
drop table t1;
|
||||||
|
set analyze_sample_percentage=@save_analyze_sample_percentage;
|
||||||
|
set histogram_size=@save_hist_size;
|
||||||
|
set use_stat_tables=@save_use_stat_tables;
|
||||||
|
@ -912,6 +912,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
|
|||||||
--echo #
|
--echo #
|
||||||
set @save_use_stat_tables=@@use_stat_tables;
|
set @save_use_stat_tables=@@use_stat_tables;
|
||||||
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
|
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
|
||||||
|
set @save_hist_size=@@histogram_size;
|
||||||
|
|
||||||
set session rand_seed1=42;
|
set session rand_seed1=42;
|
||||||
set session rand_seed2=62;
|
set session rand_seed2=62;
|
||||||
@ -986,6 +987,8 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
|
|||||||
from mysql.column_stats;
|
from mysql.column_stats;
|
||||||
explain select * from t1;
|
explain select * from t1;
|
||||||
|
|
||||||
set use_stat_tables=@save_use_stat_tables;
|
|
||||||
|
|
||||||
drop table t1;
|
drop table t1;
|
||||||
|
set analyze_sample_percentage=@save_analyze_sample_percentage;
|
||||||
|
set histogram_size=@save_hist_size;
|
||||||
|
set use_stat_tables=@save_use_stat_tables;
|
||||||
|
@ -325,7 +325,7 @@ public:
|
|||||||
|
|
||||||
inline void init(THD *thd, Field * table_field);
|
inline void init(THD *thd, Field * table_field);
|
||||||
inline bool add();
|
inline bool add();
|
||||||
inline void finish(ha_rows rows);
|
inline void finish(ha_rows rows, double sample_fraction);
|
||||||
inline void cleanup();
|
inline void cleanup();
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1540,6 +1540,8 @@ class Histogram_builder
|
|||||||
uint curr_bucket; /* number of the current bucket to be built */
|
uint curr_bucket; /* number of the current bucket to be built */
|
||||||
ulonglong count; /* number of values retrieved */
|
ulonglong count; /* number of values retrieved */
|
||||||
ulonglong count_distinct; /* number of distinct values retrieved */
|
ulonglong count_distinct; /* number of distinct values retrieved */
|
||||||
|
/* number of distinct values that occured only once */
|
||||||
|
ulonglong count_distinct_single_occurence;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Histogram_builder(Field *col, uint col_len, ha_rows rows)
|
Histogram_builder(Field *col, uint col_len, ha_rows rows)
|
||||||
@ -1553,14 +1555,21 @@ public:
|
|||||||
bucket_capacity= (double) records / (hist_width + 1);
|
bucket_capacity= (double) records / (hist_width + 1);
|
||||||
curr_bucket= 0;
|
curr_bucket= 0;
|
||||||
count= 0;
|
count= 0;
|
||||||
count_distinct= 0;
|
count_distinct= 0;
|
||||||
|
count_distinct_single_occurence= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
ulonglong get_count_distinct() { return count_distinct; }
|
ulonglong get_count_distinct() const { return count_distinct; }
|
||||||
|
ulonglong get_count_single_occurence() const
|
||||||
|
{
|
||||||
|
return count_distinct_single_occurence;
|
||||||
|
}
|
||||||
|
|
||||||
int next(void *elem, element_count elem_cnt)
|
int next(void *elem, element_count elem_cnt)
|
||||||
{
|
{
|
||||||
count_distinct++;
|
count_distinct++;
|
||||||
|
if (elem_cnt == 1)
|
||||||
|
count_distinct_single_occurence++;
|
||||||
count+= elem_cnt;
|
count+= elem_cnt;
|
||||||
if (curr_bucket == hist_width)
|
if (curr_bucket == hist_width)
|
||||||
return 0;
|
return 0;
|
||||||
@ -1574,7 +1583,7 @@ public:
|
|||||||
count > bucket_capacity * (curr_bucket + 1))
|
count > bucket_capacity * (curr_bucket + 1))
|
||||||
{
|
{
|
||||||
histogram->set_prev_value(curr_bucket);
|
histogram->set_prev_value(curr_bucket);
|
||||||
curr_bucket++;
|
curr_bucket++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
|
|||||||
return hist_builder->next(elem, elem_cnt);
|
return hist_builder->next(elem, elem_cnt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static int count_distinct_single_occurence_walk(void *elem,
|
||||||
|
element_count count, void *arg)
|
||||||
|
{
|
||||||
|
((ulonglong*)arg)[0]+= 1;
|
||||||
|
if (count == 1)
|
||||||
|
((ulonglong*)arg)[1]+= 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
C_MODE_END
|
C_MODE_END
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
The class Count_distinct_field is a helper class used to calculate
|
The class Count_distinct_field is a helper class used to calculate
|
||||||
the number of distinct values for a column. The class employs the
|
the number of distinct values for a column. The class employs the
|
||||||
@ -1611,6 +1629,9 @@ protected:
|
|||||||
Unique *tree; /* The helper object to contain distinct values */
|
Unique *tree; /* The helper object to contain distinct values */
|
||||||
uint tree_key_length; /* The length of the keys for the elements of 'tree */
|
uint tree_key_length; /* The length of the keys for the elements of 'tree */
|
||||||
|
|
||||||
|
ulonglong distincts;
|
||||||
|
ulonglong distincts_single_occurence;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
Count_distinct_field() {}
|
Count_distinct_field() {}
|
||||||
@ -1662,30 +1683,40 @@ public:
|
|||||||
{
|
{
|
||||||
return tree->unique_add(table_field->ptr);
|
return tree->unique_add(table_field->ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@brief
|
@brief
|
||||||
Calculate the number of elements accumulated in the container of 'tree'
|
Calculate the number of elements accumulated in the container of 'tree'
|
||||||
*/
|
*/
|
||||||
ulonglong get_value()
|
void walk_tree()
|
||||||
{
|
{
|
||||||
ulonglong count;
|
ulonglong counts[2] = {0, 0};
|
||||||
if (tree->elements == 0)
|
tree->walk(table_field->table,
|
||||||
return (ulonglong) tree->elements_in_tree();
|
count_distinct_single_occurence_walk, counts);
|
||||||
count= 0;
|
distincts= counts[0];
|
||||||
tree->walk(table_field->table, count_distinct_walk, (void*) &count);
|
distincts_single_occurence= counts[1];
|
||||||
return count;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@brief
|
@brief
|
||||||
Build the histogram for the elements accumulated in the container of 'tree'
|
Calculate a histogram of the tree
|
||||||
*/
|
*/
|
||||||
ulonglong get_value_with_histogram(ha_rows rows)
|
void walk_tree_with_histogram(ha_rows rows)
|
||||||
{
|
{
|
||||||
Histogram_builder hist_builder(table_field, tree_key_length, rows);
|
Histogram_builder hist_builder(table_field, tree_key_length, rows);
|
||||||
tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder);
|
tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder);
|
||||||
return hist_builder.get_count_distinct();
|
distincts= hist_builder.get_count_distinct();
|
||||||
|
distincts_single_occurence= hist_builder.get_count_single_occurence();
|
||||||
|
}
|
||||||
|
|
||||||
|
ulonglong get_count_distinct()
|
||||||
|
{
|
||||||
|
return distincts;
|
||||||
|
}
|
||||||
|
|
||||||
|
ulonglong get_count_distinct_single_occurence()
|
||||||
|
{
|
||||||
|
return distincts_single_occurence;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add()
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
inline
|
inline
|
||||||
void Column_statistics_collected::finish(ha_rows rows)
|
void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
|
||||||
{
|
{
|
||||||
double val;
|
double val;
|
||||||
|
|
||||||
@ -2532,16 +2563,44 @@ void Column_statistics_collected::finish(ha_rows rows)
|
|||||||
}
|
}
|
||||||
if (count_distinct)
|
if (count_distinct)
|
||||||
{
|
{
|
||||||
ulonglong distincts;
|
|
||||||
uint hist_size= count_distinct->get_hist_size();
|
uint hist_size= count_distinct->get_hist_size();
|
||||||
|
|
||||||
|
/* Compute cardinality statistics and optionally histogram. */
|
||||||
if (hist_size == 0)
|
if (hist_size == 0)
|
||||||
distincts= count_distinct->get_value();
|
count_distinct->walk_tree();
|
||||||
else
|
else
|
||||||
distincts= count_distinct->get_value_with_histogram(rows - nulls);
|
count_distinct->walk_tree_with_histogram(rows - nulls);
|
||||||
|
|
||||||
|
ulonglong distincts= count_distinct->get_count_distinct();
|
||||||
|
ulonglong distincts_single_occurence=
|
||||||
|
count_distinct->get_count_distinct_single_occurence();
|
||||||
|
|
||||||
if (distincts)
|
if (distincts)
|
||||||
{
|
{
|
||||||
val= (double) (rows - nulls) / distincts;
|
/*
|
||||||
set_avg_frequency(val);
|
We use the unsmoothed first-order jackknife estimator" to estimate
|
||||||
|
the number of distinct values.
|
||||||
|
With a sufficient large percentage of rows sampled (80%), we revert back
|
||||||
|
to computing the avg_frequency off of the raw data.
|
||||||
|
*/
|
||||||
|
if (sample_fraction > 0.8)
|
||||||
|
val= (double) (rows - nulls) / distincts;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (nulls == 1)
|
||||||
|
distincts_single_occurence+= 1;
|
||||||
|
if (nulls)
|
||||||
|
distincts+= 1;
|
||||||
|
double fraction_single_occurence=
|
||||||
|
static_cast<double>(distincts_single_occurence) / rows;
|
||||||
|
double total_number_of_rows= rows / sample_fraction;
|
||||||
|
double estimate_total_distincts= total_number_of_rows /
|
||||||
|
(distincts /
|
||||||
|
(1.0 - (1.0 - sample_fraction) * fraction_single_occurence));
|
||||||
|
val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
set_avg_frequency(val);
|
||||||
set_not_null(COLUMN_STAT_AVG_FREQUENCY);
|
set_not_null(COLUMN_STAT_AVG_FREQUENCY);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -2813,7 +2872,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
|
|||||||
continue;
|
continue;
|
||||||
bitmap_set_bit(table->write_set, table_field->field_index);
|
bitmap_set_bit(table->write_set, table_field->field_index);
|
||||||
if (!rc)
|
if (!rc)
|
||||||
table_field->collected_stats->finish(rows);
|
table_field->collected_stats->finish(rows, sample_fraction);
|
||||||
else
|
else
|
||||||
table_field->collected_stats->cleanup();
|
table_field->collected_stats->cleanup();
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user