1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-27 18:02:13 +03:00

Introduce analyze_sample_percentage variable

The variable controls the amount of sampling analyze table performs.

If ANALYZE table with histogram collection is too slow, one can reduce the
time taken by setting analyze_sample_percentage to a lower value of the
total number of rows.
Setting it to 0 will use a formula to compute how many rows to sample:

The number of rows collected is capped to a minimum of 50000 and
increases logarithmically with a coffecient of 4096. The coffecient is
chosen so that we expect an error of less than 3% in our estimations
according to the paper:
"Random Sampling for Histogram Construction: How much is enough?”
– Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998.

The drawback of sampling is that avg_frequency number is computed
imprecisely and will yeild a smaller number than the real one.
This commit is contained in:
Vicențiu Ciorbaru
2019-02-15 01:23:00 +02:00
parent 47f15ea73c
commit f0773b7842
8 changed files with 265 additions and 10 deletions

View File

@ -901,3 +901,91 @@ DROP TABLE t1;
--echo #
set histogram_size=@save_hist_size, histogram_type=@save_hist_type;
--echo #
--echo # Start of 10.4 tests
--echo #
--echo #
--echo # Test analyze_sample_percentage system variable.
--echo #
set @save_use_stat_tables=@@use_stat_tables;
set @save_analyze_sample_percentage=@@analyze_sample_percentage;
set session rand_seed1=42;
set session rand_seed2=62;
set use_stat_tables=PREFERABLY;
set histogram_size=10;
CREATE TABLE t1 (id int);
INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1);
INSERT INTO t1 (id) SELECT id FROM t1;
INSERT INTO t1 SELECT id+1 FROM t1;
INSERT INTO t1 SELECT id+2 FROM t1;
INSERT INTO t1 SELECT id+4 FROM t1;
INSERT INTO t1 SELECT id+8 FROM t1;
INSERT INTO t1 SELECT id+16 FROM t1;
INSERT INTO t1 SELECT id+32 FROM t1;
INSERT INTO t1 SELECT id+64 FROM t1;
INSERT INTO t1 SELECT id+128 FROM t1;
INSERT INTO t1 SELECT id+256 FROM t1;
INSERT INTO t1 SELECT id+512 FROM t1;
INSERT INTO t1 SELECT id+1024 FROM t1;
INSERT INTO t1 SELECT id+2048 FROM t1;
INSERT INTO t1 SELECT id+4096 FROM t1;
INSERT INTO t1 SELECT id+9192 FROM t1;
--echo #
--echo # This query will should show a full table scan analysis.
--echo #
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
set analyze_sample_percentage=0.1;
--echo #
--echo # This query will show an innacurate avg_frequency value.
--echo #
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
--echo #
--echo # This query will show a better avg_frequency value.
--echo #
set analyze_sample_percentage=25;
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
set analyze_sample_percentage=0;
--echo #
--echo # Test self adjusting sampling level.
--echo #
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
-- echo #
-- echo # Test record estimation is working properly.
-- echo #
select count(*) from t1;
explain select * from t1;
set analyze_sample_percentage=100;
ANALYZE TABLE t1;
select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency,
DECODE_HISTOGRAM(hist_type, histogram)
from mysql.column_stats;
explain select * from t1;
set use_stat_tables=@save_use_stat_tables;
drop table t1;