1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-29 05:21:33 +03:00

MDEV-18073: get_range_limit_read_cost() doesn't adjust LIMIT for the range access

The computation of which "fraction" of the range/ref access cost we would
need to pay was incorrect.

Adjusted the computation accordingly.
This commit is contained in:
Sergei Petrunia
2019-01-23 16:26:09 +03:00
parent b7a784ae25
commit 3238f2a6e9
3 changed files with 73 additions and 10 deletions

View File

@ -26685,16 +26685,22 @@ void JOIN::cache_const_exprs()
/*
Get a cost of reading rows_limit rows through index keynr.
Get the cost of using index keynr to read #LIMIT matching rows
@detail
- If there is a quick select, we try to use it.
- if there is a ref(const) access, we try to use it, too.
- quick and ref(const) use different cost formulas, so if both are possible
we should make a cost-based choice.
rows_limit is the number of rows we would need to read when using a full
index scan. This is generally higher than the N from "LIMIT N" clause,
because there's a WHERE condition (a part of which is used to construct a
range access we are considering using here)
@param tab JOIN_TAB with table access (is NULL for single-table
UPDATE/DELETE)
@param rows_limit See explanation above
@param read_time OUT Cost of reading using quick or ref(const) access.
@ -26707,6 +26713,7 @@ void JOIN::cache_const_exprs()
static bool get_range_limit_read_cost(const JOIN_TAB *tab,
const TABLE *table,
ha_rows table_records,
uint keynr,
ha_rows rows_limit,
double *read_time)
@ -26773,8 +26780,32 @@ static bool get_range_limit_read_cost(const JOIN_TAB *tab,
}
}
}
/*
Consider an example:
SELECT *
FROM t1
WHERE key1 BETWEEN 10 AND 20 AND col2='foo'
ORDER BY key1 LIMIT 10
If we were using a full index scan on key1, we would need to read this
many rows to get 10 matches:
10 / selectivity(key1 BETWEEN 10 AND 20 AND col2='foo')
This is the number we get in rows_limit.
But we intend to use range access on key1. The rows returned by quick
select will satisfy the range part of the condition,
"key1 BETWEEN 10 and 20". We will still need to filter them with
the remainder condition, (col2='foo').
The selectivity of the range access is (best_rows/table_records). We need
to discount it from the rows_limit:
*/
double rows_limit_for_quick= rows_limit * (best_rows / table_records);
if (best_rows > rows_limit)
if (best_rows > rows_limit_for_quick)
{
/*
LIMIT clause specifies that we will need to read fewer records than
@ -26783,7 +26814,7 @@ static bool get_range_limit_read_cost(const JOIN_TAB *tab,
only need 1/3rd of records, it will cost us 1/3rd of quick select's
read time)
*/
best_cost *= rows_limit / best_rows;
best_cost *= rows_limit_for_quick / best_rows;
}
*read_time= best_cost;
res= true;
@ -27076,8 +27107,8 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
index_scan_time= select_limit/rec_per_key *
MY_MIN(rec_per_key, table->file->scan_time());
double range_scan_time;
if (get_range_limit_read_cost(tab, table, nr, select_limit,
&range_scan_time))
if (get_range_limit_read_cost(tab, table, table_records, nr,
select_limit, &range_scan_time))
{
if (range_scan_time < index_scan_time)
index_scan_time= range_scan_time;