1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-27 18:02:13 +03:00

Implement avg_frequency unsmoothed jacknife estimator

When sampling data through ANALYZE TABLE, use the estimator to get a
better estimation of avg_frequency instead of just using the raw sampled data.
This commit is contained in:
Vicențiu Ciorbaru
2019-02-17 00:44:58 +02:00
parent f0773b7842
commit 764429271d
3 changed files with 96 additions and 31 deletions

View File

@ -325,7 +325,7 @@ public:
inline void init(THD *thd, Field * table_field);
inline bool add();
inline void finish(ha_rows rows);
inline void finish(ha_rows rows, double sample_fraction);
inline void cleanup();
};
@ -1540,6 +1540,8 @@ class Histogram_builder
uint curr_bucket; /* number of the current bucket to be built */
ulonglong count; /* number of values retrieved */
ulonglong count_distinct; /* number of distinct values retrieved */
/* number of distinct values that occured only once */
ulonglong count_distinct_single_occurence;
public:
Histogram_builder(Field *col, uint col_len, ha_rows rows)
@ -1553,14 +1555,21 @@ public:
bucket_capacity= (double) records / (hist_width + 1);
curr_bucket= 0;
count= 0;
count_distinct= 0;
count_distinct= 0;
count_distinct_single_occurence= 0;
}
ulonglong get_count_distinct() { return count_distinct; }
ulonglong get_count_distinct() const { return count_distinct; }
ulonglong get_count_single_occurence() const
{
return count_distinct_single_occurence;
}
int next(void *elem, element_count elem_cnt)
{
count_distinct++;
if (elem_cnt == 1)
count_distinct_single_occurence++;
count+= elem_cnt;
if (curr_bucket == hist_width)
return 0;
@ -1574,7 +1583,7 @@ public:
count > bucket_capacity * (curr_bucket + 1))
{
histogram->set_prev_value(curr_bucket);
curr_bucket++;
curr_bucket++;
}
}
return 0;
@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
return hist_builder->next(elem, elem_cnt);
}
static int count_distinct_single_occurence_walk(void *elem,
element_count count, void *arg)
{
((ulonglong*)arg)[0]+= 1;
if (count == 1)
((ulonglong*)arg)[1]+= 1;
return 0;
}
C_MODE_END
/*
The class Count_distinct_field is a helper class used to calculate
the number of distinct values for a column. The class employs the
@ -1611,6 +1629,9 @@ protected:
Unique *tree; /* The helper object to contain distinct values */
uint tree_key_length; /* The length of the keys for the elements of 'tree */
ulonglong distincts;
ulonglong distincts_single_occurence;
public:
Count_distinct_field() {}
@ -1662,30 +1683,40 @@ public:
{
return tree->unique_add(table_field->ptr);
}
/*
@brief
Calculate the number of elements accumulated in the container of 'tree'
*/
ulonglong get_value()
void walk_tree()
{
ulonglong count;
if (tree->elements == 0)
return (ulonglong) tree->elements_in_tree();
count= 0;
tree->walk(table_field->table, count_distinct_walk, (void*) &count);
return count;
ulonglong counts[2] = {0, 0};
tree->walk(table_field->table,
count_distinct_single_occurence_walk, counts);
distincts= counts[0];
distincts_single_occurence= counts[1];
}
/*
@brief
Build the histogram for the elements accumulated in the container of 'tree'
Calculate a histogram of the tree
*/
ulonglong get_value_with_histogram(ha_rows rows)
void walk_tree_with_histogram(ha_rows rows)
{
Histogram_builder hist_builder(table_field, tree_key_length, rows);
tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder);
return hist_builder.get_count_distinct();
distincts= hist_builder.get_count_distinct();
distincts_single_occurence= hist_builder.get_count_single_occurence();
}
ulonglong get_count_distinct()
{
return distincts;
}
ulonglong get_count_distinct_single_occurence()
{
return distincts_single_occurence;
}
/*
@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add()
*/
inline
void Column_statistics_collected::finish(ha_rows rows)
void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
{
double val;
@ -2532,16 +2563,44 @@ void Column_statistics_collected::finish(ha_rows rows)
}
if (count_distinct)
{
ulonglong distincts;
uint hist_size= count_distinct->get_hist_size();
/* Compute cardinality statistics and optionally histogram. */
if (hist_size == 0)
distincts= count_distinct->get_value();
count_distinct->walk_tree();
else
distincts= count_distinct->get_value_with_histogram(rows - nulls);
count_distinct->walk_tree_with_histogram(rows - nulls);
ulonglong distincts= count_distinct->get_count_distinct();
ulonglong distincts_single_occurence=
count_distinct->get_count_distinct_single_occurence();
if (distincts)
{
val= (double) (rows - nulls) / distincts;
set_avg_frequency(val);
/*
We use the unsmoothed first-order jackknife estimator" to estimate
the number of distinct values.
With a sufficient large percentage of rows sampled (80%), we revert back
to computing the avg_frequency off of the raw data.
*/
if (sample_fraction > 0.8)
val= (double) (rows - nulls) / distincts;
else
{
if (nulls == 1)
distincts_single_occurence+= 1;
if (nulls)
distincts+= 1;
double fraction_single_occurence=
static_cast<double>(distincts_single_occurence) / rows;
double total_number_of_rows= rows / sample_fraction;
double estimate_total_distincts= total_number_of_rows /
(distincts /
(1.0 - (1.0 - sample_fraction) * fraction_single_occurence));
val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0);
}
set_avg_frequency(val);
set_not_null(COLUMN_STAT_AVG_FREQUENCY);
}
else
@ -2813,7 +2872,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
continue;
bitmap_set_bit(table->write_set, table_field->field_index);
if (!rc)
table_field->collected_stats->finish(rows);
table_field->collected_stats->finish(rows, sample_fraction);
else
table_field->collected_stats->cleanup();
}