1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-27 18:02:13 +03:00

Introduce analyze_sample_percentage variable

The variable controls the amount of sampling analyze table performs.

If ANALYZE TABLE with histogram collection is too slow, one can reduce the
time taken by setting analyze_sample_percentage to a lower percentage of the
total number of rows.
Setting it to 0 will use a formula to compute how many rows to sample:

The number of rows collected has a lower bound of 50000 and
increases logarithmically with a coefficient of 4096. The coefficient is
chosen so that we expect an error of less than 3% in our estimations
according to the paper:
"Random Sampling for Histogram Construction: How much is enough?"
– Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998.

The drawback of sampling is that avg_frequency number is computed
imprecisely and will yield a smaller number than the real one.
This commit is contained in:
Vicențiu Ciorbaru
2019-02-15 01:23:00 +02:00
parent 47f15ea73c
commit f0773b7842
8 changed files with 265 additions and 10 deletions

View File

@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
Field *table_field;
ha_rows rows= 0;
handler *file=table->file;
double sample_fraction= thd->variables.sample_percentage / 100;
const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;
DBUG_ENTER("collect_statistics_for_table");
table->collected_stats->cardinality_is_null= TRUE;
table->collected_stats->cardinality= 0;
if (thd->variables.sample_percentage == 0)
{
if (file->records() < MIN_THRESHOLD_FOR_SAMPLING)
{
sample_fraction= 1;
}
else
{
sample_fraction= std::fmin(
(MIN_THRESHOLD_FOR_SAMPLING + 4096 *
log(200 * file->records())) / file->records(), 1);
}
}
for (field_ptr= table->field; *field_ptr; field_ptr++)
{
table_field= *field_ptr;
@ -2747,7 +2763,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
/* Perform a full table scan to collect statistics on 'table's columns */
if (!(rc= file->ha_rnd_init(TRUE)))
{
{
DEBUG_SYNC(table->in_use, "statistics_collection_start");
while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE)
@ -2758,17 +2774,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
if (rc)
break;
for (field_ptr= table->field; *field_ptr; field_ptr++)
if (thd_rnd(thd) <= sample_fraction)
{
table_field= *field_ptr;
if (!bitmap_is_set(table->read_set, table_field->field_index))
continue;
if ((rc= table_field->collected_stats->add()))
for (field_ptr= table->field; *field_ptr; field_ptr++)
{
table_field= *field_ptr;
if (!bitmap_is_set(table->read_set, table_field->field_index))
continue;
if ((rc= table_field->collected_stats->add()))
break;
}
if (rc)
break;
rows++;
}
if (rc)
break;
rows++;
}
file->ha_rnd_end();
}
@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
if (!rc)
{
table->collected_stats->cardinality_is_null= FALSE;
table->collected_stats->cardinality= rows;
table->collected_stats->cardinality=
static_cast<ha_rows>(rows / sample_fraction);
}
bitmap_clear_all(table->write_set);