MDEV-5992: EITS: Selectivity of non-indexed condition is counted twice in table's fanout

MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access - Fix calculate_cond_selectivity_for_table() to work correctly with range accesses over multi-component keys: = First, take selectivity of all possible range scans into account. Remember which fields were used bt the range scans. = Then, calculate selectivity produced by sargable predicates on fields. If a field was used in a possible range access, assume its selectivity is already taken into account. - Fix table_cond_selectivity(): when quick select is used, selectivity of COND(table) is taken into account in matching_candidates_in_table(). In table_cond_selectivity() we should not apply it for the second time.
2025-07-30 16:24:05 +03:00 · 2014-04-01 09:59:51 -07:00
parent 41a2ca5c16
commit 26a3d567c9
4 changed files with 173 additions and 60 deletions
--- a/mysql-test/r/selectivity_no_engine.result
+++ b/mysql-test/r/selectivity_no_engine.result
@ -118,6 +118,27 @@ Warnings:
 Note	1003	select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` <= <cache>(-(1)))
 drop table t1, t2;
 # 
 # MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access
 # 
 create table t1(a int);
 insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t2 (a int, b int, col1 varchar(64), col2 varchar(64), key(a,b));
 insert into t2 select A.a+10*B.a, C.a+10*D.a, 'filler-data1', 'filler-data2' from t1 A, t1 B, t1 C, t1 D;
 set histogram_size=100;
 set optimizer_use_condition_selectivity=4;
 set use_stat_tables='preferably';
 analyze table t2 persistent for all;
 Table	Op	Msg_type	Msg_text
 test.t2	analyze	status	Engine-independent statistics collected
 test.t2	analyze	status	Table is already up to date
 # This must show filtered=100%:
 explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t2	range	a	a	10	NULL	9	100.00	Using index condition
 Warnings:
 Note	1003	select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3)))
 drop table t2, t1;
 # 
 # End of the test file
 # 
 set use_stat_tables= @save_use_stat_tables;
--- a/mysql-test/t/selectivity_no_engine.test
+++ b/mysql-test/t/selectivity_no_engine.test
@ -84,7 +84,23 @@ explain extended select * from t1 where col1 in (-1,-2,-3);
 explain extended select * from t1 where col1<=-1;
 drop table t1, t2;
 --echo # 
 --echo # MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access
 --echo # 
 create table t1(a int);
 insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t2 (a int, b int, col1 varchar(64), col2 varchar(64), key(a,b));
 insert into t2 select A.a+10*B.a, C.a+10*D.a, 'filler-data1', 'filler-data2' from t1 A, t1 B, t1 C, t1 D;
 set histogram_size=100;
 set optimizer_use_condition_selectivity=4;
 set use_stat_tables='preferably';
 analyze table t2 persistent for all;
 --echo # This must show filtered=100%:
 explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
 drop table t2, t1;
 --echo # 
 --echo # End of the test file
 --echo # 
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@ -3390,6 +3390,17 @@ double records_in_column_ranges(PARAM *param, uint idx,
    on the rows of 'table' in the processed query.
    The calculated selectivity is assigned to the field table->cond_selectivity.
    Selectivity is calculated as a product of selectivities imposed by:
    1. possible range accesses. (if multiple range accesses use the same
       restrictions on the same field, we make adjustments for that)
    2. Sargable conditions on fields for which we have column statistics (if 
       a field is used in a possible range access, we assume that selectivity
       is already provided by the range access' estimates)
    3. Reading a few records from the table pages and checking the condition
       selectivity (this is used for conditions like "column LIKE '%val%'" 
       where approaches #1 and #2 do not provide selectivity data).
  NOTE
    Currently the selectivities of range conditions over different columns are
    considered independent. 
@ -3415,14 +3426,90 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item *cond)
  if (table->pos_in_table_list->schema_table)
    DBUG_RETURN(FALSE);
  MY_BITMAP handled_columns;
  my_bitmap_map* buf;
  if (!(buf= (my_bitmap_map*)thd->alloc(table->s->column_bitmap_size)))
    DBUG_RETURN(TRUE);
  my_bitmap_init(&handled_columns, buf, table->s->fields, FALSE);
  /*
    First, take into account possible range accesses. 
    range access estimates are the most precise, we prefer them to any other
    estimate sources.
  */
  for (keynr= 0;  keynr < table->s->keys; keynr++)
  {
    if (table->quick_keys.is_set(keynr))
      set_if_bigger(max_quick_key_parts, table->quick_key_parts[keynr]);
  }
  /* 
    Walk through all indexes, indexes where range access uses more keyparts 
    go first.
  */
  for (uint quick_key_parts= max_quick_key_parts;
       quick_key_parts; quick_key_parts--)
  {
    for (keynr= 0;  keynr < table->s->keys; keynr++)
    {
      if (table->quick_keys.is_set(keynr) &&
          table->quick_key_parts[keynr] == quick_key_parts)
      {
        uint i;
        uint used_key_parts= table->quick_key_parts[keynr];
        double quick_cond_selectivity= table->quick_rows[keynr] / 
 	                               table_records;
        KEY *key_info= table->key_info + keynr;
        KEY_PART_INFO* key_part= key_info->key_part;
        /*
          Suppose, there are range conditions on two keys
            KEY1 (col1, col2)
            KEY2 (col3, col2)
          we don't want to count selectivity of condition on col2 twice.
          First, find the longest key prefix that's made of columns whose
          selectivity wasn't already accounted for.
        */
        for (i= 0; i < used_key_parts; i++, key_part++)
        {
          if (bitmap_is_set(&handled_columns, key_part->fieldnr-1))
 	    break; 
          bitmap_set_bit(&handled_columns, key_part->fieldnr-1);
        }
        if (i)
        {
          /* 
            There is at least 1-column prefix of columns whose selectivity has
            not yet been accounted for.
          */
          table->cond_selectivity*= quick_cond_selectivity;
          if (i != used_key_parts)
 	  {
            /*
              Range access got us estimate for #used_key_parts.
              We need estimate for #(i-1) key parts.
            */
            double f1= key_info->actual_rec_per_key(i-1);
            double f2= key_info->actual_rec_per_key(i);
            table->cond_selectivity*= f1 / f2;
          }
        }
      }
    }
  }
  /* 
    Second step: calculate the selectivity of the range conditions not 
    supported by any index
  */
  bitmap_subtract(used_fields, &handled_columns);
  /* no need to do: my_bitmap_free(&handled_columns); */
  if (thd->variables.optimizer_use_condition_selectivity > 2 &&
      !bitmap_is_clear_all(used_fields))
  {
    /* 
      Calculate the selectivity of the range conditions not supported
      by any index
    */
    PARAM param;
    MEM_ROOT alloc;
    SEL_TREE *tree;
@ -3509,47 +3596,8 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item *cond)
  bitmap_clear_all(used_fields);
  for (keynr= 0;  keynr < table->s->keys; keynr++)
  {
    if (table->quick_keys.is_set(keynr))
      set_if_bigger(max_quick_key_parts, table->quick_key_parts[keynr]);
  }
-  for (uint quick_key_parts= max_quick_key_parts;
+  /* Check if we can improve selectivity estimates by using sampling */
       quick_key_parts; quick_key_parts--)
  {
    for (keynr= 0;  keynr < table->s->keys; keynr++)
    {
      if (table->quick_keys.is_set(keynr) &&
          table->quick_key_parts[keynr] == quick_key_parts)
      {
        uint i;
        uint used_key_parts= table->quick_key_parts[keynr];
        double quick_cond_selectivity= table->quick_rows[keynr] / 
 	                               table_records;
        KEY *key_info= table->key_info + keynr;
        KEY_PART_INFO* key_part= key_info->key_part;
        for (i= 0; i < used_key_parts; i++, key_part++)
        {
          if (bitmap_is_set(used_fields, key_part->fieldnr-1))
 	    break; 
          bitmap_set_bit(used_fields, key_part->fieldnr-1);
        }
        if (i)
        {
          table->cond_selectivity*= quick_cond_selectivity;
          if (i != used_key_parts)
 	  {
            double f1= key_info->actual_rec_per_key(i-1);
            double f2= key_info->actual_rec_per_key(i);
            table->cond_selectivity*= f1 / f2;
          }
        }
      } 
    }
  }
  /* Calculate selectivity of probably highly selective predicates */
  ulong check_rows=
    MY_MIN(thd->variables.optimizer_selectivity_sampling_limit,
        (ulong) (table_records * SELECTIVITY_SAMPLING_SHARE));
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@ -5557,7 +5557,20 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
 }
-/* Estimate of the number matching candidates in the joined table */
+/*
  Estimate how many records we will get if we read just this table and apply
  a part of WHERE that can be checked for it.
  @detail
  Estimate how many records we will get if we
   - read the given table with its "independent" access method (either quick 
     select or full table/index scan),
   - apply the part of WHERE that refers only to this table.
  @seealso
    table_cond_selectivity() produces selectivity of condition that is checked
    after joining rows from this table to rows from preceding tables.
 */
 inline
 double matching_candidates_in_table(JOIN_TAB *s, bool with_found_constraint,
@ -7236,14 +7249,25 @@ double table_multi_eq_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
 /**
  @brief
-  Get the selectivity of conditions when joining a table
+    Get the selectivity of conditions when joining a table
  @param join       The optimized join
  @param s          The table to be joined for evaluation
  @param rem_tables The bitmap of tables to be joined later
  @detail
    Get selectivity of conditions that can be applied when joining this table
    with previous tables.
    For quick selects and full table scans, selectivity of COND(this_table)
    is accounted for in matching_candidates_in_table(). Here, we only count
    selectivity of COND(this_table, previous_tables). 
    For other access methods, we need to calculate selectivity of the whole
    condition, "COND(this_table) AND COND(this_table, previous_tables)".
  @retval
-  selectivity of the conditions imposed on the rows of s
+    selectivity of the conditions imposed on the rows of s
 */
 static
@ -7255,22 +7279,21 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
  TABLE *table= s->table;
  MY_BITMAP *read_set= table->read_set;
  double sel= s->table->cond_selectivity;
  double table_records= table->stat_records();
  POSITION *pos= &join->positions[idx];
  uint keyparts= 0;
  uint found_part_ref_or_null= 0;
-  /* Discount the selectivity of the access method used to join table s */
+  if (pos->key != 0)
  if (s->quick && s->quick->index != MAX_KEY)
  {
-    if (pos->key == 0 && table_records > 0)
+    /* 
-    {
+      A ref access or hash join is used for this table.
-      sel/= table->quick_rows[s->quick->index]/table_records;
+
-    }
+      It could have some parts with "t.key_part=const". Using ref access
-  }
+      means that we will only get records where the condition holds, so we
-  else if (pos->key != 0)
+      should remove its selectivity from the condition selectivity.
-  {
+      
-    /* A ref/ access or hash join is used to join table */
+      (TODO: more details about the "t.key=othertable.col" case)
    */
    KEYUSE *keyuse= pos->key;
    KEYUSE *prev_ref_keyuse= keyuse;
    uint key= keyuse->key;
@ -7317,9 +7340,14 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
  }
  else
  {
    /*
      The table is accessed with full table scan, or quick select.
      Selectivity of COND(table) is already accounted for in 
      matching_candidates_in_table().
    */
    sel= 1;
  }
-    
+
  /* 
    If the field f from the table is equal to a field from one the
    earlier joined tables then the selectivity of the range conditions