diff --git a/mysql-test/r/selectivity_no_engine.result b/mysql-test/r/selectivity_no_engine.result index 6516abbe318..3f046b0c2b7 100644 --- a/mysql-test/r/selectivity_no_engine.result +++ b/mysql-test/r/selectivity_no_engine.result @@ -139,6 +139,118 @@ Warnings: Note 1003 select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3))) drop table t2, t1; # +# MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered% +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(key1 int, col1 int, key(key1)); +insert into t1 select A.a, A.a from t0 A, t0 B, t0 C; +set histogram_size=100; +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status Table is already up to date +# 10% is ok +explain extended select * from t1 where col1=2; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.90 Using where +Warnings: +Note 1003 select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` = 2) +# Must show 100%, not 10% +explain extended select * from t1 where key1=2; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ref key1 key1 5 const 98 100.00 +Warnings: +Note 1003 select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`key1` = 2) +drop table t0, t1; +# +# MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities +# +create table t1(a int); +insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t2(a int); +insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C; +create table t3 as select * from t2; +set histogram_size=100; +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +analyze table t2 persistent for all; +Table Op Msg_type Msg_text +test.t2 analyze status Engine-independent statistics collected +test.t2 analyze status OK +analyze table t3 persistent for all; +Table Op Msg_type Msg_text +test.t3 analyze status Engine-independent statistics collected +test.t3 analyze status OK +explain extended select * from t2 A where A.a < 40; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE A ALL NULL NULL NULL NULL 1000 4.95 Using where +Warnings: +Note 1003 select `test`.`A`.`a` AS `a` from `test`.`t2` `A` where (`test`.`A`.`a` < 40) +explain extended select * from t3 B where B.a < 100; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE B ALL NULL NULL NULL NULL 1000 9.90 Using where +Warnings: +Note 1003 select `test`.`B`.`a` AS `a` from `test`.`t3` `B` where (`test`.`B`.`a` < 100) +explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE A ALL NULL NULL NULL NULL 1000 4.95 Using where +1 SIMPLE B ALL NULL NULL NULL NULL 1000 9.90 Using where; Using join buffer (flat, BNL join) +Warnings: +Note 1003 select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`A`.`a` < 40) and (`test`.`B`.`a` < 100)) +explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE A ALL NULL NULL NULL NULL 1000 4.95 Using where +1 SIMPLE B ALL NULL NULL NULL NULL 1000 4.95 Using where; Using join buffer (flat, BNL join) +Warnings: +Note 1003 select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`B`.`a` = `test`.`A`.`a`) and (`test`.`A`.`a` < 40) and (`test`.`A`.`a` < 100)) +drop table t1, t2, t3; +select 1; +1 +1 +# +# MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int, kp2 int, +filler1 char(100), +filler2 char(100), +key(kp1, kp2) +); +insert into t1 +select +A.a, +B.a, +'filler-data-1', +'filler-data-2' +from t0 A, t0 B, t0 C; +set histogram_size=100; +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status Table is already up to date +# NOTE: 10*100%, 10*100% rows is ok +explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t0 ALL NULL NULL NULL NULL 10 100.00 Using where +1 SIMPLE t1 ref kp1 kp1 10 test.t0.a,func 10 100.00 Using index condition +Warnings: +Note 1003 select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = (`test`.`t0`.`a` + 1))) +# NOTE: t0: 10*100% is ok, t1: 10*9.90% is bad. t1 should have 10*100%. +explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t0 ALL NULL NULL NULL NULL 10 100.00 Using where +1 SIMPLE t1 ref kp1 kp1 10 test.t0.a,const 10 100.00 +Warnings: +Note 1003 select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = 4)) +drop table t0, t1; +# # End of the test file # set use_stat_tables= @save_use_stat_tables; diff --git a/mysql-test/t/selectivity_no_engine.test b/mysql-test/t/selectivity_no_engine.test index eb6642fb5c7..d1e34a7ac6f 100644 --- a/mysql-test/t/selectivity_no_engine.test +++ b/mysql-test/t/selectivity_no_engine.test @@ -101,6 +101,78 @@ analyze table t2 persistent for all; explain extended select * from t2 where a in (1,2,3) and b in (1,2,3); drop table t2, t1; + +--echo # +--echo # MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered% +--echo # +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1(key1 int, col1 int, key(key1)); +insert into t1 select A.a, A.a from t0 A, t0 B, t0 C; + +set histogram_size=100; +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +analyze table t1 persistent for all; +--echo # 10% is ok +explain extended select * from t1 where col1=2; +--echo # Must show 100%, not 10% +explain extended select * from t1 where key1=2; +drop table t0, t1; + +##--disable_parsing +--echo # +--echo # MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities +--echo # +create table t1(a int); +insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t2(a int); +insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C; +create table t3 as select * from t2; +set histogram_size=100; +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +analyze table t2 persistent for all; +analyze table t3 persistent for all; +explain extended select * from t2 A where A.a < 40; +explain extended select * from t3 B where B.a < 100; +explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100; +explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a; +drop table t1, t2, t3; +## --enable_parsing +select 1; +##--disable_parsing +--echo # +--echo # MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value +--echo # +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( + kp1 int, kp2 int, + filler1 char(100), + filler2 char(100), + key(kp1, kp2) +); + +insert into t1 +select + A.a, + B.a, + 'filler-data-1', + 'filler-data-2' +from t0 A, t0 B, t0 C; +set histogram_size=100; +set use_stat_tables='preferably'; +set optimizer_use_condition_selectivity=4; +analyze table t1 persistent for all; +--echo # NOTE: 10*100%, 10*100% rows is ok +explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1; + +--echo # NOTE: t0: 10*100% is ok, t1: 10*9.90% is bad. t1 should have 10*100%. +explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4; +drop table t0, t1; +##--enable_parsing + --echo # --echo # End of the test file --echo # diff --git a/sql/opt_range.cc b/sql/opt_range.cc index d4f171321e0..bfcb7e4a54f 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -3406,6 +3406,10 @@ double records_in_column_ranges(PARAM *param, uint idx, selectivity (this is used for conditions like "column LIKE '%val%'" where approaches #1 and #2 do not provide selectivity data). + SEE ALSO + table_cond_selectivity() + matching_candidates_in_table() + NOTE Currently the selectivities of range conditions over different columns are considered independent. diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 235de14c466..c8c120e4f24 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -7165,8 +7165,18 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, the current value of sel by this selectivity */ table_map used_tables= item_equal->used_tables(); + + /* + Equalities that do not include fields in this table do not matter + */ if (!(used_tables & table_bit)) continue; + + /* + Equalities that include a constant are taken into account in + table->cond_selectivity. Selectivity from there is taken into account + in matching_candidates_in_table() and/or table_cond_selectivity(). + */ if (item_equal->get_const()) continue; @@ -7178,14 +7188,23 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, Field *fld= fi.get_curr_field(); if (fld->table->map != table_bit) continue; + if (pos->key == 0) + { + /* + No ref access used (and no const in the multi-equality). We will + need to adjust the selectivity. + */ adjust_sel= TRUE; + } else { + /* Ok, [eq]ref access is used */ uint i; KEYUSE *keyuse= pos->key; uint key= keyuse->key; + /* Find which keypart participates in the equality */ for (i= 0; i < keyparts; i++) { uint fldno; @@ -7196,6 +7215,7 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, if (fld->field_index == fldno) break; } + if (i == keyparts) { /* @@ -7221,6 +7241,7 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, } } } + if (adjust_sel) { /* @@ -7266,6 +7287,10 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, For other access methods, we need to calculate selectivity of the whole condition, "COND(this_table) AND COND(this_table, previous_tables)". + + @seealso + calculate_cond_selectivity_for_table() + matching_candidates_in_table() @retval selectivity of the conditions imposed on the rows of s @@ -7276,9 +7301,7 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, table_map rem_tables) { uint16 ref_keyuse_steps[MAX_REF_PARTS - 1]; - Field *field; TABLE *table= s->table; - MY_BITMAP *read_set= table->read_set; double sel= s->table->cond_selectivity; POSITION *pos= &join->positions[idx]; uint keyparts= 0; @@ -7287,23 +7310,79 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, if (pos->key != 0) { /* - A ref access or hash join is used for this table. + A ref access or hash join is used for this table. ref access is created + from - It could have some parts with "t.key_part=const". Using ref access - means that we will only get records where the condition holds, so we - should remove its selectivity from the condition selectivity. + tbl.keypart1=expr1 AND tbl.keypart2=expr2 AND ... + and it will only return rows for which this condition is satisified. + Suppose, certain expr{i} is a constant. Since ref access only returns + rows that satisfy + + tbl.keypart{i}=const (*) + + then selectivity of this equality should not be counted in return value + of this function. This function uses the value of + + table->cond_selectivity=selectivity(COND(tbl)) + + as a starting point. This value includes selectivity of equality (*). We + should somehow discount it. + + Looking at calculate_cond_selectivity_for_table(), one can see that that + value is not necessarily a direct multiplicand in table->cond_selectivity + + There are three possible ways to discount + 1. There is a range access on t.keypart{i}=const. + (an important special case: multi-keypart ref(const) access) + + 2. The field has a histogram. field[x]->cond_selectivity has the data. + + 3. Use index stats on this index: + rec_per_key[key_part+1]/rec_per_key[key_part] + (TODO: more details about the "t.key=othertable.col" case) */ KEYUSE *keyuse= pos->key; KEYUSE *prev_ref_keyuse= keyuse; uint key= keyuse->key; - do + + /* + Check if we have a prefix of key=const that matches a quick select. + */ + if (!is_hash_join_key_no(key)) + { + table_map quick_key_map= (table_map(1) << table->quick_key_parts[key]) - 1; + if (table->quick_rows[key] && + !(quick_key_map & ~table->const_key_parts[key])) + { + /* + Ok, there is an equality for each of the key parts used by the + quick select. This means, quick select's estimate can be reused to + discount the selectivity of a prefix of a ref access. + */ + for (; quick_key_map & 1 ; quick_key_map>>= 1) + { + while (keyuse->keypart == keyparts) + keyuse++; + keyparts++; + } + sel /= table->quick_rows[key] / table->stat_records(); + } + } + + /* + Go through the "keypart{N}=..." equalities and find those that were + already taken into account in table->cond_selectivity. + */ + //do + while (keyuse->table == table && keyuse->key == key) { if (!(keyuse->used_tables & (rem_tables | table->map))) { if (are_tables_local(s, keyuse->val->used_tables())) - { + { /// ^^ why val->used_tables here but just + /// used_tables above? if (is_hash_join_key_no(key)) { if (keyparts == keyuse->keypart) @@ -7312,22 +7391,40 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, else { if (keyparts == keyuse->keypart && - !(~(keyuse->val->used_tables()) & pos->ref_depend_map) && + !((keyuse->val->used_tables()) & ~pos->ref_depend_map) && !(found_part_ref_or_null & keyuse->optimize)) { + /* Found a KEYUSE object that will be used by ref access */ keyparts++; found_part_ref_or_null|= keyuse->optimize & ~KEY_OPTIMIZE_EQ; } } + + if (keyparts > keyuse->keypart) { + /* Ok this is the keyuse that will be used for ref access */ uint fldno; if (is_hash_join_key_no(key)) fldno= keyuse->keypart; else fldno= table->key_info[key].key_part[keyparts-1].fieldnr - 1; if (keyuse->val->const_item()) - sel*= table->field[fldno]->cond_selectivity; + { + // psergey: not multiply, divide instead. + // before, we've had: + // sel*= table->field[fldno]->cond_selectivity; + + sel /= table->field[fldno]->cond_selectivity; + /* + TODO: we could do better here: + 1. cond_selectivity might be =1 (the default) because quick + select on some index prevented us from analyzing + histogram for this column. + 2. we could get an estimate through this? + rec_per_key[key_part-1] / rec_per_key[key_part] + */ + } if (keyparts > 1) { ref_keyuse_steps[keyparts-2]= keyuse - prev_ref_keyuse; @@ -7337,14 +7434,16 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, } } keyuse++; - } while (keyuse->table == table && keyuse->key == key); + } } else { /* The table is accessed with full table scan, or quick select. - Selectivity of COND(table) is already accounted for in + Selectivity of COND(this_table) is already accounted for in matching_candidates_in_table(). + For COND(this_table, previous_tables) we don't have any meaningful + estimates. */ sel= 1; } @@ -7353,7 +7452,30 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, If the field f from the table is equal to a field from one the earlier joined tables then the selectivity of the range conditions over the field f must be discounted. + + psergey: I think this is wrong. Example: + + ## no keys used + select * from t1, t2 where t1.col=t2.col and t2.col<5 + + ## a variant with key: + select * from t1, t2 where t1.col=t2.col and t2.col<5 and t2.key=t1.col2 + + suppose the join order is t1, t2. Attached conditions: + + t1: t2.col<3 + t2: [t2.col=t1.col] AND t2.col<3 + + Suppose, we're now looking at selectivity for table t2. + - in case t2 uses full table scan (or quick select): all selectivity is + already accounted for in matching_candidates_in_table(). [YES. CHECKED] + - in case t2 uses ref access + = if the equality is used for ref access, we have already + discounted its selectivity above. + = if the equality is not used for ref access, we should still count its + selectivity. */ +#if 0 for (Field **f_ptr=table->field ; (field= *f_ptr) ; f_ptr++) { if (!bitmap_is_set(read_set, field->field_index) || @@ -7371,6 +7493,7 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, } } } +#endif sel*= table_multi_eq_cond_selectivity(join, idx, s, rem_tables, keyparts, ref_keyuse_steps);