From ad842b5f058d5342c22cdc86542baa2ae9db5e70 Mon Sep 17 00:00:00 2001 From: Sergey Petrunya Date: Wed, 26 Mar 2014 17:55:00 +0400 Subject: [PATCH 1/3] MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong [Attempt #2] - Use a new selectivity calculation formula in Histogram::point_selectivity. The formula is different from the old one because it was developed from scratch. it doesn't have any possible division-by-zero problems. --- mysql-test/r/selectivity.result | 47 +++++++++++-- mysql-test/r/selectivity_innodb.result | 47 +++++++++++-- mysql-test/t/selectivity.test | 25 ++++++- sql/sql_statistics.h | 92 +++++++++++++++++++++++--- 4 files changed, 191 insertions(+), 20 deletions(-) diff --git a/mysql-test/r/selectivity.result b/mysql-test/r/selectivity.result index 2c96ae4ae90..c91e4345f68 100644 --- a/mysql-test/r/selectivity.result +++ b/mysql-test/r/selectivity.result @@ -1,4 +1,4 @@ -drop table if exists t1,t2,t3; +drop table if exists t0,t1,t2,t3; select @@global.use_stat_tables; @@global.use_stat_tables COMPLEMENTARY @@ -826,7 +826,7 @@ flush table t1; set optimizer_use_condition_selectivity=4; explain extended select * from t1 where a=0; id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 49.61 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where Warnings: Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0) drop table t1; @@ -1308,15 +1308,54 @@ test.t2 analyze status OK # The following two must have the same in 'Extra' column: explain extended select * from t2 where col1 IN (20, 180); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where +1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where Warnings: Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180)) explain extended select * from 
t2 where col1 IN (180, 20); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where +1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where Warnings: Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20)) drop table t1, t2; +# +# MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(a int); +insert into t1 select A.a from t0 A, t0 B, t0 C; +set histogram_size=20; +set histogram_type='single_prec_hb'; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status OK +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +# Should select about 10%: +explain extended select * from t1 where a=2; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where +Warnings: +Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 2) +# Should select about 10%: +explain extended select * from t1 where a=1; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where +Warnings: +Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 1) +# Must not have filtered=100%: +explain extended select * from t1 where a=0; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where +Warnings: +Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0) +# Again, must not have filtered=100%: +explain extended select * from t1 where a=-1; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where +Warnings: 
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = (-(1))) +drop table t0, t1; set histogram_type=@save_histogram_type; set histogram_size=@save_histogram_size; set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; diff --git a/mysql-test/r/selectivity_innodb.result b/mysql-test/r/selectivity_innodb.result index 70ce55b50c4..a348836783e 100644 --- a/mysql-test/r/selectivity_innodb.result +++ b/mysql-test/r/selectivity_innodb.result @@ -1,7 +1,7 @@ SET SESSION STORAGE_ENGINE='InnoDB'; set @save_optimizer_switch_for_selectivity_test=@@optimizer_switch; set optimizer_switch='extended_keys=on'; -drop table if exists t1,t2,t3; +drop table if exists t0,t1,t2,t3; select @@global.use_stat_tables; @@global.use_stat_tables COMPLEMENTARY @@ -835,7 +835,7 @@ flush table t1; set optimizer_use_condition_selectivity=4; explain extended select * from t1 where a=0; id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 49.61 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where Warnings: Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0) drop table t1; @@ -1318,15 +1318,54 @@ test.t2 analyze status OK # The following two must have the same in 'Extra' column: explain extended select * from t2 where col1 IN (20, 180); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where +1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where Warnings: Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180)) explain extended select * from t2 where col1 IN (180, 20); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where +1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where Warnings: Note 1003 select `test`.`t2`.`col1` 
AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20)) drop table t1, t2; +# +# MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(a int); +insert into t1 select A.a from t0 A, t0 B, t0 C; +set histogram_size=20; +set histogram_type='single_prec_hb'; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status OK +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +# Should select about 10%: +explain extended select * from t1 where a=2; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where +Warnings: +Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 2) +# Should select about 10%: +explain extended select * from t1 where a=1; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where +Warnings: +Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 1) +# Must not have filtered=100%: +explain extended select * from t1 where a=0; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where +Warnings: +Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0) +# Again, must not have filtered=100%: +explain extended select * from t1 where a=-1; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where +Warnings: +Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = (-(1))) +drop table t0, t1; set histogram_type=@save_histogram_type; set histogram_size=@save_histogram_size; set 
optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; diff --git a/mysql-test/t/selectivity.test b/mysql-test/t/selectivity.test index 8b7dfdff09f..fe35d9652ff 100644 --- a/mysql-test/t/selectivity.test +++ b/mysql-test/t/selectivity.test @@ -1,7 +1,7 @@ --source include/have_stat_tables.inc --disable_warnings -drop table if exists t1,t2,t3; +drop table if exists t0,t1,t2,t3; --enable_warnings select @@global.use_stat_tables; @@ -885,6 +885,29 @@ explain extended select * from t2 where col1 IN (180, 20); drop table t1, t2; +--echo # +--echo # MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong +--echo # +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(a int); +insert into t1 select A.a from t0 A, t0 B, t0 C; +set histogram_size=20; +set histogram_type='single_prec_hb'; +analyze table t1 persistent for all; +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +--echo # Should select about 10%: +explain extended select * from t1 where a=2; +--echo # Should select about 10%: +explain extended select * from t1 where a=1; +--echo # Must not have filtered=100%: +explain extended select * from t1 where a=0; +--echo # Again, must not have filtered=100%: +explain extended select * from t1 where a=-1; + +drop table t0, t1; + set histogram_type=@save_histogram_type; set histogram_size=@save_histogram_size; set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 68aacd69d98..936f23f1091 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -113,7 +113,7 @@ class Histogram private: Histogram_type type; - uint8 size; + uint8 size; /* Size of values array, in bytes */ uchar *values; uint prec_factor() @@ -142,6 +142,7 @@ public: private: uint get_value(uint i) { + DBUG_ASSERT(i < get_width()); switch (type) { case SINGLE_PREC_HB: return (uint) 
(((uint8 *) values)[i]); @@ -150,7 +151,7 @@ private: } return 0; } - + /* Find the bucket which value 'pos' falls into. */ uint find_bucket(double pos, bool first) { uint val= (uint) (pos * prec_factor()); @@ -169,6 +170,10 @@ private: else break; } + + if (val > get_value(i)) + i++; + if (val == get_value(i)) { if (first) @@ -234,24 +239,89 @@ public: sel= bucket_sel * (max - min + 1); return sel; } + + + /* + Estimate selectivity of "col=const" using a histogram + + @param pos Position of the "const" between column's min_value and + max_value. This is a number in [0..1] range. + @param avg_sel Average selectivity of condition "col=const" in this table. + It is calculated as (#non_null_values / #distinct_values). + + @return + Expected condition selectivity (a number between 0 and 1) + */ double point_selectivity(double pos, double avg_sel) { double sel; - double bucket_sel= 1.0/(get_width() + 1); + /* Find the bucket that contains the value 'pos'. */ uint min= find_bucket(pos, TRUE); + uint pos_value= (uint) (pos * prec_factor()); + + /* Find how many buckets this value occupies */ uint max= min; - while (max + 1 < get_width() && get_value(max + 1) == get_value(max)) + while (max + 1 < get_width() && get_value(max + 1) == pos_value) max++; - double inv_prec_factor= (double) 1.0 / prec_factor(); - double width= (max + 1 == get_width() ? - 1.0 : get_value(max) * inv_prec_factor) - - (min == 0 ? - 0.0 : get_value(min-1) * inv_prec_factor); - sel= avg_sel * (bucket_sel * (max + 1 - min)) / width; + + if (max > min) + { + /* + The value occupies multiple buckets. Use start_bucket ... end_bucket as + selectivity. + */ + double bucket_sel= 1.0/(get_width() + 1); + sel= bucket_sel * (max - min + 1); + } + else + { + /* + The value 'pos' fits within one single histogram bucket. + + Histogram buckets have the same numbers of rows, but they cover + different ranges of values. + + We assume that values are uniformly distributed across the [0..1] value + range. 
+ */ + + /* + If all buckets covered value ranges of the same size, the width of + value range would be: + */ + double avg_bucket_width= 1.0 / (get_width() + 1); + + /* + Let's see what is the width of value range that our bucket is covering. + (min==max currently. they are kept in the formula just in case we + will want to extend it to handle multi-bucket case) + */ + double inv_prec_factor= (double) 1.0 / prec_factor(); + double current_bucket_width= + (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) - + (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor)); + + /* + So: + - each bucket has the same #rows + - values are unformly distributed across the [min_value,max_value] domain. + + If a bucket has value range that's N times bigger then average, than + each value will have to have N times fewer rows than average. + */ + DBUG_ASSERT(current_bucket_width); + sel= avg_sel * avg_bucket_width / current_bucket_width; + + /* + (Q: if we just follow this proportion we may end up in a situation + where number of different values we expect to find in this bucket + exceeds the number of rows that this histogram has in a bucket. Are + we ok with this or we would want to have certain caps?) + */ + } return sel; } - }; From dee11f9633be3091bd7d3c0b868e4ea1efe4ac7f Mon Sep 17 00:00:00 2001 From: Sergey Petrunya Date: Wed, 26 Mar 2014 21:05:31 +0400 Subject: [PATCH 2/3] MDEV-4362: {division by zero when lookup constant is outside the value table} - Fix Histogram::point_selectivity() to work in the case where the passed value_pos=0 (or 1) and the first (or the last) bucket in the histogram has zero value-range (i.e one value). 
--- mysql-test/r/selectivity.result | 31 ++++++++++++++++++++++++++ mysql-test/r/selectivity_innodb.result | 31 ++++++++++++++++++++++++++ mysql-test/t/selectivity.test | 16 +++++++++++++ sql/sql_statistics.h | 29 ++++++++++++++++-------- 4 files changed, 98 insertions(+), 9 deletions(-) diff --git a/mysql-test/r/selectivity.result b/mysql-test/r/selectivity.result index c91e4345f68..27e05e22714 100644 --- a/mysql-test/r/selectivity.result +++ b/mysql-test/r/selectivity.result @@ -1356,6 +1356,37 @@ id select_type table type possible_keys key key_len ref rows filtered Extra Warnings: Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = (-(1))) drop table t0, t1; +# +# MDEV-4362: Selectivity estimates for IN (...) do not depend on whether the values are in range +# +create table t1 (col1 int); +set @a=-1; +create table t2 (a int) select (@a:=@a+1) as a from information_schema.session_variables A limit 100; +insert into t1 select A.a from t2 A, t2 B where A.a < 100 and B.a < 100; +select min(col1), max(col1), count(*) from t1; +min(col1) max(col1) count(*) +0 99 10000 +set histogram_size=100; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status OK +explain extended select * from t1 where col1 in (1,2,3); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where +Warnings: +Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in (1,2,3)) +# Must not cause fp division by zero, or produce nonsense numbers: +explain extended select * from t1 where col1 in (-1,-2,-3); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in ((-(1)),(-(2)),(-(3)))) +explain extended select * from t1 where col1<=-1; +id select_type 
table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 1.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` <= (-(1))) +drop table t1, t2; set histogram_type=@save_histogram_type; set histogram_size=@save_histogram_size; set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; diff --git a/mysql-test/r/selectivity_innodb.result b/mysql-test/r/selectivity_innodb.result index a348836783e..104b465b5a4 100644 --- a/mysql-test/r/selectivity_innodb.result +++ b/mysql-test/r/selectivity_innodb.result @@ -1366,6 +1366,37 @@ id select_type table type possible_keys key key_len ref rows filtered Extra Warnings: Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = (-(1))) drop table t0, t1; +# +# MDEV-4362: Selectivity estimates for IN (...) do not depend on whether the values are in range +# +create table t1 (col1 int); +set @a=-1; +create table t2 (a int) select (@a:=@a+1) as a from information_schema.session_variables A limit 100; +insert into t1 select A.a from t2 A, t2 B where A.a < 100 and B.a < 100; +select min(col1), max(col1), count(*) from t1; +min(col1) max(col1) count(*) +0 99 10000 +set histogram_size=100; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status OK +explain extended select * from t1 where col1 in (1,2,3); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where +Warnings: +Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in (1,2,3)) +# Must not cause fp division by zero, or produce nonsense numbers: +explain extended select * from t1 where col1 in (-1,-2,-3); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.00 Using where +Warnings: +Note 1003 select 
`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in ((-(1)),(-(2)),(-(3)))) +explain extended select * from t1 where col1<=-1; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 1.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` <= (-(1))) +drop table t1, t2; set histogram_type=@save_histogram_type; set histogram_size=@save_histogram_size; set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; diff --git a/mysql-test/t/selectivity.test b/mysql-test/t/selectivity.test index fe35d9652ff..3e4940d53e4 100644 --- a/mysql-test/t/selectivity.test +++ b/mysql-test/t/selectivity.test @@ -908,6 +908,22 @@ explain extended select * from t1 where a=-1; drop table t0, t1; +--echo # +--echo # MDEV-4362: Selectivity estimates for IN (...) do not depend on whether the values are in range +--echo # +create table t1 (col1 int); +set @a=-1; +create table t2 (a int) select (@a:=@a+1) as a from information_schema.session_variables A limit 100; +insert into t1 select A.a from t2 A, t2 B where A.a < 100 and B.a < 100; +select min(col1), max(col1), count(*) from t1; +set histogram_size=100; +analyze table t1 persistent for all; +explain extended select * from t1 where col1 in (1,2,3); +--echo # Must not cause fp division by zero, or produce nonsense numbers: +explain extended select * from t1 where col1 in (-1,-2,-3); +explain extended select * from t1 where col1<=-1; +drop table t1, t2; + set histogram_type=@save_histogram_type; set histogram_size=@save_histogram_size; set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 936f23f1091..da6a9035b44 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -302,16 +302,27 @@ public: (max + 1 == get_width() ? 
1.0 : (get_value(max) * inv_prec_factor)) - (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor)); - /* - So: - - each bucket has the same #rows - - values are unformly distributed across the [min_value,max_value] domain. + if (current_bucket_width < 1e-16) + { + /* + A special case: we are at the first (or the last) bucket in the + histogram, the bucket's value range is a singlepoint [x,x], and + pos_value=0 (for the first bucket) or pos_value=1 (for the last). + */ + sel= avg_sel; + } + else + { + /* + So: + - each bucket has the same #rows + - values are unformly distributed across the [min_value,max_value] domain. - If a bucket has value range that's N times bigger then average, than - each value will have to have N times fewer rows than average. - */ - DBUG_ASSERT(current_bucket_width); - sel= avg_sel * avg_bucket_width / current_bucket_width; + If a bucket has value range that's N times bigger then average, than + each value will have to have N times fewer rows than average. + */ + sel= avg_sel * avg_bucket_width / current_bucket_width; + } /* (Q: if we just follow this proportion we may end up in a situation From ab061a2bb3723c00eb5c88ecc1cb90ee7f1458e6 Mon Sep 17 00:00:00 2001 From: Sergey Petrunya Date: Thu, 27 Mar 2014 12:30:49 +0400 Subject: [PATCH 3/3] MDEV-5926, MDEV-4362 post-fixes: - Histogram::find_bucket() should not walk off the end of the value range. - Address review feedback in Histogram::point_selectivity(): different handling for zero-width buckets, and explanations. 
--- mysql-test/r/selectivity.result | 2 +- mysql-test/r/selectivity_innodb.result | 2 +- sql/sql_statistics.h | 64 +++++++++++++++++--------- 3 files changed, 45 insertions(+), 23 deletions(-) diff --git a/mysql-test/r/selectivity.result b/mysql-test/r/selectivity.result index 27e05e22714..7e29b1014fc 100644 --- a/mysql-test/r/selectivity.result +++ b/mysql-test/r/selectivity.result @@ -1378,7 +1378,7 @@ Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1 # Must not cause fp division by zero, or produce nonsense numbers: explain extended select * from t1 where col1 in (-1,-2,-3); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.00 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 5.94 Using where Warnings: Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in ((-(1)),(-(2)),(-(3)))) explain extended select * from t1 where col1<=-1; diff --git a/mysql-test/r/selectivity_innodb.result b/mysql-test/r/selectivity_innodb.result index 104b465b5a4..e91cfa41d73 100644 --- a/mysql-test/r/selectivity_innodb.result +++ b/mysql-test/r/selectivity_innodb.result @@ -1388,7 +1388,7 @@ Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1 # Must not cause fp division by zero, or produce nonsense numbers: explain extended select * from t1 where col1 in (-1,-2,-3); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.00 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 5.94 Using where Warnings: Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in ((-(1)),(-(2)),(-(3)))) explain extended select * from t1 where col1<=-1; diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index da6a9035b44..d0db0a3bf33 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -151,6 +151,7 @@ private: } return 
0; } + /* Find the bucket which value 'pos' falls into. */ uint find_bucket(double pos, bool first) { @@ -171,7 +172,7 @@ private: break; } - if (val > get_value(i)) + if (val > get_value(i) && i < (get_width() - 1)) i++; if (val == get_value(i)) @@ -251,6 +252,27 @@ public: @return Expected condition selectivity (a number between 0 and 1) + + @notes + [re_zero_length_buckets] If a bucket with zero value-length is in the + middle of the histogram, we will not have min==max. Example: suppose, + pos_value=0x12, and the histogram is: + + #n #n+1 #n+2 + ... 0x10 0x12 0x12 0x14 ... + | + +------------- bucket with zero value-length + + Here, we will get min=#n+1, max=#n+2, and use the multi-bucket formula. + + The problem happens at the histogram ends. If pos_value=0, and the + histogram is: + + 0x00 0x10 ... + + then min=0, max=0. This means pos_value is contained within bucket #0, + but on the other hand, histogram data says that the bucket has only one + value. */ double point_selectivity(double pos, double avg_sel) @@ -264,6 +286,16 @@ public: uint max= min; while (max + 1 < get_width() && get_value(max + 1) == pos_value) max++; + + /* + A special case: we're looking at a single bucket, and that bucket has + zero value-length. Use the multi-bucket formula (attempt to use + single-bucket formula will cause division by zero). + + For more details see [re_zero_length_buckets] above. + */ + if (max == min && get_value(max) == ((max==0)? 0 : get_value(max-1))) + max++; if (max > min) { @@ -302,27 +334,17 @@ public: (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) - (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor)); - if (current_bucket_width < 1e-16) - { - /* - A special case: we are at the first (or the last) bucket in the - histogram, the bucket's value range is a singlepoint [x,x], and - pos_value=0 (for the first bucket) or pos_value=1 (for the last). 
- */ - sel= avg_sel; - } - else - { - /* - So: - - each bucket has the same #rows - - values are unformly distributed across the [min_value,max_value] domain. + DBUG_ASSERT(current_bucket_width); /* We shouldn't get a single zero-width bucket */ - If a bucket has value range that's N times bigger then average, than - each value will have to have N times fewer rows than average. - */ - sel= avg_sel * avg_bucket_width / current_bucket_width; - } + /* + So: + - each bucket has the same #rows + - values are uniformly distributed across the [min_value,max_value] domain. + + If a bucket has value range that's N times bigger than average, then + each value will have to have N times fewer rows than average. + */ + sel= avg_sel * avg_bucket_width / current_bucket_width; /* (Q: if we just follow this proportion we may end up in a situation