Fixed bug#28404.

This patch adds cost estimation for the queries with ORDER BY / GROUP BY and LIMIT. If there was a ref/range access to the table whose rows were required to be ordered in the result set the optimizer always employed this access though a scan by a different index that was compatible with the required order could be cheaper to produce the first L rows of the result set. Now for such queries the optimizer makes a choice between the cheapest ref/range accesses not compatible with the given order and index scans compatible with it.
2025-07-27 18:02:13 +03:00 · 2007-08-02 12:45:56 -07:00
parent 642b0c00ed
commit cf39429295
14 changed files with 359 additions and 105 deletions
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@ -6450,6 +6450,7 @@ void JOIN_TAB::cleanup()
  quick= 0;
  x_free(cache.buff);
  cache.buff= 0;
+  limit= 0;
  if (table)
  {
    if (table->key_read)
@ -12588,9 +12589,12 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
 {
  int ref_key;
  uint ref_key_parts;
+  int order_direction;
+  uint used_key_parts;
  TABLE *table=tab->table;
  SQL_SELECT *select=tab->select;
  key_map usable_keys;
+  QUICK_SELECT_I *save_quick;
  DBUG_ENTER("test_if_skip_sort_order");
  LINT_INIT(ref_key_parts);

@ -12625,6 +12629,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
  else if (select && select->quick)		// Range found by opt_range
  {
    int quick_type= select->quick->get_type();
+    save_quick= select->quick;
    /* 
      assume results are not ordered when index merge is used 
      TODO: sergeyp: Results of all index merge selects actually are ordered 
@ -12644,8 +12649,6 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
    /*
      We come here when there is a REF key.
    */
-    int order_direction;
-    uint used_key_parts;
    if (!usable_keys.is_set(ref_key))
    {
      /*
@ -12706,63 +12709,30 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
    }
    /* Check if we get the rows in requested sorted order by using the key */
    if (usable_keys.is_set(ref_key) &&
-	(order_direction = test_if_order_by_key(order,table,ref_key,
-						&used_key_parts)))
-    {
-      if (order_direction == -1)		// If ORDER BY ... DESC
-      {
-	if (select && select->quick)
-	{
-	  /*
-	    Don't reverse the sort order, if it's already done.
-	    (In some cases test_if_order_by_key() can be called multiple times
-	  */
-	  if (!select->quick->reverse_sorted())
-	  {
-            QUICK_SELECT_DESC *tmp;
-            int quick_type= select->quick->get_type();
-            if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
-                quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
-                quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION ||
-                quick_type == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX)
-              DBUG_RETURN(0);                   // Use filesort
-            
-            /* ORDER BY range_key DESC */
-	    tmp= new QUICK_SELECT_DESC((QUICK_RANGE_SELECT*)(select->quick),
-                                       used_key_parts);
-	    if (!tmp || tmp->error)
-	    {
-	      delete tmp;
-	      DBUG_RETURN(0);		// Reverse sort not supported
-	    }
-	    select->quick=tmp;
-	  }
-	  DBUG_RETURN(1);
-	}
-	if (tab->ref.key_parts < used_key_parts)
-	{
-	  /*
-	    SELECT * FROM t1 WHERE a=1 ORDER BY a DESC,b DESC
-
-	    Use a traversal function that starts by reading the last row
-	    with key part (A) and then traverse the index backwards.
-	  */
-	  tab->read_first_record=       join_read_last_key;
-	  tab->read_record.read_record= join_read_prev_same;
-	  /* fall through */
-	}
-      }
-      else if (select && select->quick)
-	  select->quick->sorted= 1;
-      DBUG_RETURN(1);			/* No need to sort */
-    }
+        (order_direction= test_if_order_by_key(order,table,ref_key,
+					       &used_key_parts)))
+      goto check_reverse_order;
  }
-  else
  {
-    /* check if we can use a key to resolve the group */
-    /* Tables using JT_NEXT are handled here */
+    /*
+      Check whether there is an index compatible with the given order
+      usage of which is cheaper than usage of the ref_key index (ref_key>=0)
+      or a table scan.
+      It may be the case if ORDER/GROUP BY is used with LIMIT.
+    */
    uint nr;
    key_map keys;
+    uint best_key_parts;
+    int best_key_direction;
+    ha_rows best_records;
+    double read_time;
+    int best_key= -1;
+    bool is_best_covering= FALSE;
+    double fanout= 1;
+    JOIN *join= tab->join;
+    uint tablenr= tab - join->join_tab;
+    ha_rows table_records= table->file->stats.records;
+    bool group= join->group && order == join->group_list; 

    /* 
      filesort() and join cache are usually faster than reading in 
@ -12775,7 +12745,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
      resolved with a key;  This is because filesort() is usually faster than
      retrieving all rows through an index.
    */
-    if (select_limit >= table->file->stats.records)
+    if (select_limit >= table_records)
    {
      keys= *table->file->keys_to_use_for_scanning();
      keys.merge(table->covering_keys);
@ -12786,38 +12756,224 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
        This is to allow users to use index in ORDER BY.
      */
      if (table->force_index) 
-	keys.merge(table->keys_in_use_for_query);
+	keys.merge(group ? table->keys_in_use_for_group_by :
+                           table->keys_in_use_for_order_by);
      keys.intersect(usable_keys);
    }
    else
      keys= usable_keys;

+    read_time= join->best_positions[tablenr].read_time;
+    for (uint i= tablenr+1; i < join->tables; i++)
+      fanout*= join->best_positions[i].records_read; // fanout is always >= 1
+
    for (nr=0; nr < table->s->keys ; nr++)
    {
-      uint not_used;
-      if (keys.is_set(nr))
+      int direction;
+      if (keys.is_set(nr) &&
+          (direction= test_if_order_by_key(order, table, nr, &used_key_parts)))
      {
-	int flag;
-	if ((flag= test_if_order_by_key(order, table, nr, &not_used)))
-	{
-	  if (!no_changes)
-	  {
-	    tab->index=nr;
-	    tab->read_first_record=  (flag > 0 ? join_read_first:
-				      join_read_last);
-	    tab->type=JT_NEXT;	// Read with index_first(), index_next()
-	    if (table->covering_keys.is_set(nr))
-	    {
-	      table->key_read=1;
-	      table->file->extra(HA_EXTRA_KEYREAD);
-	    }
-	  }
-	  DBUG_RETURN(1);
-	}
+        bool is_covering= table->covering_keys.is_set(nr) ||
+                          nr == table->s->primary_key &&
+	                  table->file->primary_key_is_clustered();
+	
+        /* 
+          Don't use an index scan with ORDER BY without limit.
+          For GROUP BY without limit always use index scan
+          if there is a suitable index. 
+          Why we hold to this asymmetry hardly can be explained
+          rationally. It's easy to demonstrate that using
+          temporary table + filesort could be cheaper for grouping
+          queries too.
+	*/ 
+        if (is_covering ||
+            select_limit != HA_POS_ERROR || 
+            ref_key < 0 && (group || table->force_index))
+        { 
+          double rec_per_key;
+          double index_scan_time;
+          KEY *keyinfo= tab->table->key_info+nr;
+          if (select_limit == HA_POS_ERROR)
+            select_limit= table_records;
+          if (group)
+          {
+            rec_per_key= keyinfo->rec_per_key[used_key_parts-1];
+            set_if_bigger(rec_per_key, 1);
+            /*
+              With a grouping query each group containing on average
+              rec_per_key records produces only one row that will
+              be included into the result set.
+	    */  
+            if (select_limit > table_records/rec_per_key)
+                select_limit= table_records;
+            else
+              select_limit= (ha_rows) (select_limit*rec_per_key);
+          }
+          /* 
+            If tab=tk is not the last joined table tn then to get first
+            L records from the result set we can expect to retrieve
+            only L/fanout(tk,tn) where fanout(tk,tn) says how many
+            rows in the record set on average will match each row tk.
+            Usually our estimates for fanouts are too pessimistic.
+            So the estimate for L/fanout(tk,tn) will be too optimistic
+            and as result we'll choose an index scan when using ref/range
+            access + filesort will be cheaper.
+	  */
+          select_limit= (ha_rows) (select_limit < fanout ?
+                                   1 : select_limit/fanout);
+          /*
+            We assume that each of the tested indexes is not correlated
+            with ref_key. Thus, to select first N records we have to scan
+            N/selectivity(ref_key) index entries. 
+            selectivity(ref_key) = #scanned_records/#table_records =
+            table->quick_condition_rows/table_records.
+            In any case we can't select more than #table_records.
+            N/(table->quick_condition_rows/table_records) > table_records 
+            <=> N > table->quick_condition_rows.
+          */ 
+          if (select_limit > table->quick_condition_rows)
+            select_limit= table_records;
+          else
+            select_limit= (ha_rows) (select_limit *
+                                     (double) table_records /
+                                      table->quick_condition_rows);
+          rec_per_key= keyinfo->rec_per_key[keyinfo->key_parts-1];
+          set_if_bigger(rec_per_key, 1);
+          /*
+            Here we take into account the fact that rows are
+            accessed in sequences rec_per_key records in each.
+            Rows in such a sequence are supposed to be ordered
+            by rowid/primary key. When reading the data
+            in a sequence we'll touch not more pages than the
+            table file contains.
+            TODO. Use the formula for a disk sweep sequential access
+            to calculate the cost of accessing data rows for one 
+            index entry.
+	  */
+          index_scan_time= select_limit/rec_per_key *
+	                   min(rec_per_key, table->file->scan_time());
+          if (is_covering || 
+              ref_key < 0 && (group || table->force_index) ||
+              index_scan_time < read_time)
+          {
+            ha_rows quick_records= table_records;
+            if (is_best_covering && !is_covering)
+              continue;
+            if (table->quick_keys.is_set(nr))
+              quick_records= table->quick_rows[nr];
+            if (best_key < 0 ||
+                (select_limit <= min(quick_records,best_records) ?
+                 keyinfo->key_parts < best_key_parts :
+                 quick_records < best_records))
+            {
+              best_key= nr;
+              best_key_parts= keyinfo->key_parts;
+              best_records= quick_records;
+              is_best_covering= is_covering;
+              best_key_direction= direction; 
+            }
+          }   
+	}      
      }
    }
+    if (best_key >= 0)
+    {
+      bool quick_created= FALSE;
+      if (table->quick_keys.is_set(best_key) && best_key != ref_key)
+      {
+        key_map map;
+        map.clear_all();       // Force the creation of quick select
+        map.set_bit(best_key); // only best_key.
+        quick_created=         
+          select->test_quick_select(join->thd, map, 0,
+                                    join->select_options & OPTION_FOUND_ROWS ?
+                                    HA_POS_ERROR :
+                                    join->unit->select_limit_cnt,
+                                    0) > 0;
+      }
+      if (!no_changes)
+      {
+        if (!quick_created)
+	{
+          tab->index= best_key;
+          tab->read_first_record= best_key_direction > 0 ?
+                                  join_read_first:join_read_last;
+          tab->type=JT_NEXT;           // Read with index_first(), index_next()
+          if (table->covering_keys.is_set(best_key))
+          {
+            table->key_read=1;
+            table->file->extra(HA_EXTRA_KEYREAD);
+          }
+          table->file->ha_index_or_rnd_end();
+          if (join->select_options & SELECT_DESCRIBE)
+          {
+            tab->ref.key= -1;
+            tab->ref.key_parts= 0;
+            if (tab->select)
+              tab->select->quick= 0;
+            if (select_limit < table_records) 
+              tab->limit= select_limit;
+          }
+        }
+      }
+      used_key_parts= best_key_parts;
+      order_direction= best_key_direction;
+    }
+    else
+      DBUG_RETURN(0); 
+  } 
+
+check_reverse_order:                  
+  if (order_direction == -1)		// If ORDER BY ... DESC
+  {
+    if (select && select->quick)
+    {
+      /*
+	Don't reverse the sort order, if it's already done.
+        (In some cases test_if_order_by_key() can be called multiple times
+      */
+      if (!select->quick->reverse_sorted())
+      {
+        QUICK_SELECT_DESC *tmp;
+        int quick_type= select->quick->get_type();
+        if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
+            quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
+            quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION ||
+            quick_type == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX)
+        {
+          tab->limit= 0;
+          select->quick= save_quick;
+          DBUG_RETURN(0);                   // Use filesort
+        }
+            
+        /* ORDER BY range_key DESC */
+	tmp= new QUICK_SELECT_DESC((QUICK_RANGE_SELECT*)(select->quick),
+                                    used_key_parts);
+	if (!tmp || tmp->error)
+	{
+	  delete tmp;
+          select->quick= save_quick;
+          tab->limit= 0;
+	  DBUG_RETURN(0);		// Reverse sort not supported
+	}
+	select->quick=tmp;
+      }
+    }
+    else if (tab->ref.key >= 0 && tab->ref.key_parts < used_key_parts)
+    {
+      /*
+	SELECT * FROM t1 WHERE a=1 ORDER BY a DESC,b DESC
+
+	Use a traversal function that starts by reading the last row
+	with key part (A) and then traverse the index backwards.
+      */
+      tab->read_first_record= join_read_last_key;
+      tab->read_record.read_record= join_read_prev_same;
+    }
  }
-  DBUG_RETURN(0);				// Can't use index.
+  else if (select && select->quick)
+    select->quick->sorted= 1;
+  DBUG_RETURN(1);
 }


@ -15524,7 +15680,7 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
      if (tab->select && tab->select->quick)
        examined_rows= tab->select->quick->records;
      else if (tab->type == JT_NEXT || tab->type == JT_ALL)
-        examined_rows= tab->table->file->records();
+        examined_rows= tab->limit ? tab->limit : tab->table->file->records();
      else
        examined_rows=(ha_rows)join->best_positions[i].records_read;