From 26a3d567c9435cd593d7afac8a0f1d00def50740 Mon Sep 17 00:00:00 2001 From: Sergey Petrunya Date: Tue, 1 Apr 2014 09:59:51 -0700 Subject: [PATCH] MDEV-5992: EITS: Selectivity of non-indexed condition is counted twice in table's fanout MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access - Fix calculate_cond_selectivity_for_table() to work correctly with range accesses over multi-component keys: = First, take selectivity of all possible range scans into account. Remember which fields were used bt the range scans. = Then, calculate selectivity produced by sargable predicates on fields. If a field was used in a possible range access, assume its selectivity is already taken into account. - Fix table_cond_selectivity(): when quick select is used, selectivity of COND(table) is taken into account in matching_candidates_in_table(). In table_cond_selectivity() we should not apply it for the second time. --- mysql-test/r/selectivity_no_engine.result | 21 ++++ mysql-test/t/selectivity_no_engine.test | 16 +++ sql/opt_range.cc | 138 +++++++++++++++------- sql/sql_select.cc | 58 ++++++--- 4 files changed, 173 insertions(+), 60 deletions(-) diff --git a/mysql-test/r/selectivity_no_engine.result b/mysql-test/r/selectivity_no_engine.result index 05405a8777f..6516abbe318 100644 --- a/mysql-test/r/selectivity_no_engine.result +++ b/mysql-test/r/selectivity_no_engine.result @@ -118,6 +118,27 @@ Warnings: Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` <= (-(1))) drop table t1, t2; # +# MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access +# +create table t1(a int); +insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t2 (a int, b int, col1 varchar(64), col2 varchar(64), key(a,b)); +insert into t2 select A.a+10*B.a, C.a+10*D.a, 'filler-data1', 'filler-data2' from t1 A, t1 B, t1 C, t1 D; +set histogram_size=100; +set optimizer_use_condition_selectivity=4; +set use_stat_tables='preferably'; +analyze table t2 persistent for all; +Table Op Msg_type Msg_text +test.t2 analyze status Engine-independent statistics collected +test.t2 analyze status Table is already up to date +# This must show filtered=100%: +explain extended select * from t2 where a in (1,2,3) and b in (1,2,3); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t2 range a a 10 NULL 9 100.00 Using index condition +Warnings: +Note 1003 select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3))) +drop table t2, t1; +# # End of the test file # set use_stat_tables= @save_use_stat_tables; diff --git a/mysql-test/t/selectivity_no_engine.test b/mysql-test/t/selectivity_no_engine.test index e88d822ce46..eb6642fb5c7 100644 --- a/mysql-test/t/selectivity_no_engine.test +++ b/mysql-test/t/selectivity_no_engine.test @@ -84,7 +84,23 @@ explain extended select * from t1 where col1 in (-1,-2,-3); explain extended select * from t1 where col1<=-1; drop table t1, t2; +--echo # +--echo # MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access +--echo # +create table t1(a int); +insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t2 (a int, b int, col1 varchar(64), col2 varchar(64), key(a,b)); +insert into t2 select A.a+10*B.a, C.a+10*D.a, 'filler-data1', 'filler-data2' from t1 A, t1 B, t1 C, t1 D; + +set histogram_size=100; +set optimizer_use_condition_selectivity=4; +set use_stat_tables='preferably'; +analyze table t2 persistent for all; +--echo # This must show filtered=100%: +explain extended select * from t2 where a in (1,2,3) and b in (1,2,3); + +drop table t2, t1; --echo # --echo # End of the test file --echo # diff --git a/sql/opt_range.cc b/sql/opt_range.cc index cbc3d285d0a..6d2e8c7340f 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -3390,6 +3390,17 @@ double records_in_column_ranges(PARAM *param, uint idx, on the rows of 'table' in the processed query. The calculated selectivity is assigned to the field table->cond_selectivity. + Selectivity is calculated as a product of selectivities imposed by: + + 1. possible range accesses. (if multiple range accesses use the same + restrictions on the same field, we make adjustments for that) + 2. Sargable conditions on fields for which we have column statistics (if + a field is used in a possible range access, we assume that selectivity + is already provided by the range access' estimates) + 3. Reading a few records from the table pages and checking the condition + selectivity (this is used for conditions like "column LIKE '%val%'" + where approaches #1 and #2 do not provide selectivity data). + NOTE Currently the selectivities of range conditions over different columns are considered independent. @@ -3415,14 +3426,90 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item *cond) if (table->pos_in_table_list->schema_table) DBUG_RETURN(FALSE); + MY_BITMAP handled_columns; + my_bitmap_map* buf; + if (!(buf= (my_bitmap_map*)thd->alloc(table->s->column_bitmap_size))) + DBUG_RETURN(TRUE); + my_bitmap_init(&handled_columns, buf, table->s->fields, FALSE); + + /* + First, take into account possible range accesses. + range access estimates are the most precise, we prefer them to any other + estimate sources. + */ + + for (keynr= 0; keynr < table->s->keys; keynr++) + { + if (table->quick_keys.is_set(keynr)) + set_if_bigger(max_quick_key_parts, table->quick_key_parts[keynr]); + } + + /* + Walk through all indexes, indexes where range access uses more keyparts + go first. + */ + for (uint quick_key_parts= max_quick_key_parts; + quick_key_parts; quick_key_parts--) + { + for (keynr= 0; keynr < table->s->keys; keynr++) + { + if (table->quick_keys.is_set(keynr) && + table->quick_key_parts[keynr] == quick_key_parts) + { + uint i; + uint used_key_parts= table->quick_key_parts[keynr]; + double quick_cond_selectivity= table->quick_rows[keynr] / + table_records; + KEY *key_info= table->key_info + keynr; + KEY_PART_INFO* key_part= key_info->key_part; + /* + Suppose, there are range conditions on two keys + KEY1 (col1, col2) + KEY2 (col3, col2) + + we don't want to count selectivity of condition on col2 twice. + + First, find the longest key prefix that's made of columns whose + selectivity wasn't already accounted for. + */ + for (i= 0; i < used_key_parts; i++, key_part++) + { + if (bitmap_is_set(&handled_columns, key_part->fieldnr-1)) + break; + bitmap_set_bit(&handled_columns, key_part->fieldnr-1); + } + if (i) + { + /* + There is at least 1-column prefix of columns whose selectivity has + not yet been accounted for. + */ + table->cond_selectivity*= quick_cond_selectivity; + if (i != used_key_parts) + { + /* + Range access got us estimate for #used_key_parts. + We need estimate for #(i-1) key parts. + */ + double f1= key_info->actual_rec_per_key(i-1); + double f2= key_info->actual_rec_per_key(i); + table->cond_selectivity*= f1 / f2; + } + } + } + } + } + + /* + Second step: calculate the selectivity of the range conditions not + supported by any index + */ + bitmap_subtract(used_fields, &handled_columns); + /* no need to do: my_bitmap_free(&handled_columns); */ + if (thd->variables.optimizer_use_condition_selectivity > 2 && !bitmap_is_clear_all(used_fields)) { - /* - Calculate the selectivity of the range conditions not supported - by any index - */ - PARAM param; MEM_ROOT alloc; SEL_TREE *tree; @@ -3509,47 +3596,8 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item *cond) bitmap_clear_all(used_fields); - for (keynr= 0; keynr < table->s->keys; keynr++) - { - if (table->quick_keys.is_set(keynr)) - set_if_bigger(max_quick_key_parts, table->quick_key_parts[keynr]); - } - for (uint quick_key_parts= max_quick_key_parts; - quick_key_parts; quick_key_parts--) - { - for (keynr= 0; keynr < table->s->keys; keynr++) - { - if (table->quick_keys.is_set(keynr) && - table->quick_key_parts[keynr] == quick_key_parts) - { - uint i; - uint used_key_parts= table->quick_key_parts[keynr]; - double quick_cond_selectivity= table->quick_rows[keynr] / - table_records; - KEY *key_info= table->key_info + keynr; - KEY_PART_INFO* key_part= key_info->key_part; - for (i= 0; i < used_key_parts; i++, key_part++) - { - if (bitmap_is_set(used_fields, key_part->fieldnr-1)) - break; - bitmap_set_bit(used_fields, key_part->fieldnr-1); - } - if (i) - { - table->cond_selectivity*= quick_cond_selectivity; - if (i != used_key_parts) - { - double f1= key_info->actual_rec_per_key(i-1); - double f2= key_info->actual_rec_per_key(i); - table->cond_selectivity*= f1 / f2; - } - } - } - } - } - - /* Calculate selectivity of probably highly selective predicates */ + /* Check if we can improve selectivity estimates by using sampling */ ulong check_rows= MY_MIN(thd->variables.optimizer_selectivity_sampling_limit, (ulong) (table_records * SELECTIVITY_SAMPLING_SHARE)); diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 240fc953f1d..e7c93b89303 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -5557,7 +5557,20 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key) } -/* Estimate of the number matching candidates in the joined table */ +/* + Estimate how many records we will get if we read just this table and apply + a part of WHERE that can be checked for it. + + @detail + Estimate how many records we will get if we + - read the given table with its "independent" access method (either quick + select or full table/index scan), + - apply the part of WHERE that refers only to this table. + + @seealso + table_cond_selectivity() produces selectivity of condition that is checked + after joining rows from this table to rows from preceding tables. +*/ inline double matching_candidates_in_table(JOIN_TAB *s, bool with_found_constraint, @@ -7236,14 +7249,25 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, /** @brief - Get the selectivity of conditions when joining a table + Get the selectivity of conditions when joining a table @param join The optimized join @param s The table to be joined for evaluation @param rem_tables The bitmap of tables to be joined later + @detail + Get selectivity of conditions that can be applied when joining this table + with previous tables. + + For quick selects and full table scans, selectivity of COND(this_table) + is accounted for in matching_candidates_in_table(). Here, we only count + selectivity of COND(this_table, previous_tables). + + For other access methods, we need to calculate selectivity of the whole + condition, "COND(this_table) AND COND(this_table, previous_tables)". + @retval - selectivity of the conditions imposed on the rows of s + selectivity of the conditions imposed on the rows of s */ static @@ -7255,22 +7279,21 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, TABLE *table= s->table; MY_BITMAP *read_set= table->read_set; double sel= s->table->cond_selectivity; - double table_records= table->stat_records(); POSITION *pos= &join->positions[idx]; uint keyparts= 0; uint found_part_ref_or_null= 0; - /* Discount the selectivity of the access method used to join table s */ - if (s->quick && s->quick->index != MAX_KEY) + if (pos->key != 0) { - if (pos->key == 0 && table_records > 0) - { - sel/= table->quick_rows[s->quick->index]/table_records; - } - } - else if (pos->key != 0) - { - /* A ref/ access or hash join is used to join table */ + /* + A ref access or hash join is used for this table. + + It could have some parts with "t.key_part=const". Using ref access + means that we will only get records where the condition holds, so we + should remove its selectivity from the condition selectivity. + + (TODO: more details about the "t.key=othertable.col" case) + */ KEYUSE *keyuse= pos->key; KEYUSE *prev_ref_keyuse= keyuse; uint key= keyuse->key; @@ -7317,9 +7340,14 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s, } else { + /* + The table is accessed with full table scan, or quick select. + Selectivity of COND(table) is already accounted for in + matching_candidates_in_table(). + */ sel= 1; } - + /* If the field f from the table is equal to a field from one the earlier joined tables then the selectivity of the range conditions