1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-27 18:02:13 +03:00

Fixed bug#28404.

This patch adds cost estimation for the queries with ORDER BY / GROUP BY
and LIMIT. 
If there was a ref/range access to the table whose rows were required
to be ordered in the result set the optimizer always employed this access
though a scan by a different index that was compatible with the required 
order could be cheaper to produce the first L rows of the result set.
Now for such queries the optimizer makes a choice between the cheapest
ref/range accesses not compatible with the given order and index scans
compatible with it.
This commit is contained in:
igor@olga.mysql.com
2007-08-02 12:45:56 -07:00
parent 642b0c00ed
commit cf39429295
14 changed files with 359 additions and 105 deletions

View File

@ -6450,6 +6450,7 @@ void JOIN_TAB::cleanup()
quick= 0;
x_free(cache.buff);
cache.buff= 0;
limit= 0;
if (table)
{
if (table->key_read)
@ -12588,9 +12589,12 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
{
int ref_key;
uint ref_key_parts;
int order_direction;
uint used_key_parts;
TABLE *table=tab->table;
SQL_SELECT *select=tab->select;
key_map usable_keys;
QUICK_SELECT_I *save_quick;
DBUG_ENTER("test_if_skip_sort_order");
LINT_INIT(ref_key_parts);
@ -12625,6 +12629,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
else if (select && select->quick) // Range found by opt_range
{
int quick_type= select->quick->get_type();
save_quick= select->quick;
/*
assume results are not ordered when index merge is used
TODO: sergeyp: Results of all index merge selects actually are ordered
@ -12644,8 +12649,6 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
/*
We come here when there is a REF key.
*/
int order_direction;
uint used_key_parts;
if (!usable_keys.is_set(ref_key))
{
/*
@ -12706,63 +12709,30 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
}
/* Check if we get the rows in requested sorted order by using the key */
if (usable_keys.is_set(ref_key) &&
(order_direction = test_if_order_by_key(order,table,ref_key,
&used_key_parts)))
{
if (order_direction == -1) // If ORDER BY ... DESC
{
if (select && select->quick)
{
/*
Don't reverse the sort order, if it's already done.
(In some cases test_if_order_by_key() can be called multiple times
*/
if (!select->quick->reverse_sorted())
{
QUICK_SELECT_DESC *tmp;
int quick_type= select->quick->get_type();
if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION ||
quick_type == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX)
DBUG_RETURN(0); // Use filesort
/* ORDER BY range_key DESC */
tmp= new QUICK_SELECT_DESC((QUICK_RANGE_SELECT*)(select->quick),
used_key_parts);
if (!tmp || tmp->error)
{
delete tmp;
DBUG_RETURN(0); // Reverse sort not supported
}
select->quick=tmp;
}
DBUG_RETURN(1);
}
if (tab->ref.key_parts < used_key_parts)
{
/*
SELECT * FROM t1 WHERE a=1 ORDER BY a DESC,b DESC
Use a traversal function that starts by reading the last row
with key part (A) and then traverse the index backwards.
*/
tab->read_first_record= join_read_last_key;
tab->read_record.read_record= join_read_prev_same;
/* fall through */
}
}
else if (select && select->quick)
select->quick->sorted= 1;
DBUG_RETURN(1); /* No need to sort */
}
(order_direction= test_if_order_by_key(order,table,ref_key,
&used_key_parts)))
goto check_reverse_order;
}
else
{
/* check if we can use a key to resolve the group */
/* Tables using JT_NEXT are handled here */
/*
Check whether there is an index compatible with the given order
usage of which is cheaper than usage of the ref_key index (ref_key>=0)
or a table scan.
It may be the case if ORDER/GROUP BY is used with LIMIT.
*/
uint nr;
key_map keys;
uint best_key_parts;
int best_key_direction;
ha_rows best_records;
double read_time;
int best_key= -1;
bool is_best_covering= FALSE;
double fanout= 1;
JOIN *join= tab->join;
uint tablenr= tab - join->join_tab;
ha_rows table_records= table->file->stats.records;
bool group= join->group && order == join->group_list;
/*
filesort() and join cache are usually faster than reading in
@ -12775,7 +12745,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
resolved with a key; This is because filesort() is usually faster than
retrieving all rows through an index.
*/
if (select_limit >= table->file->stats.records)
if (select_limit >= table_records)
{
keys= *table->file->keys_to_use_for_scanning();
keys.merge(table->covering_keys);
@ -12786,38 +12756,224 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
This is to allow users to use index in ORDER BY.
*/
if (table->force_index)
keys.merge(table->keys_in_use_for_query);
keys.merge(group ? table->keys_in_use_for_group_by :
table->keys_in_use_for_order_by);
keys.intersect(usable_keys);
}
else
keys= usable_keys;
read_time= join->best_positions[tablenr].read_time;
for (uint i= tablenr+1; i < join->tables; i++)
fanout*= join->best_positions[i].records_read; // fanout is always >= 1
for (nr=0; nr < table->s->keys ; nr++)
{
uint not_used;
if (keys.is_set(nr))
int direction;
if (keys.is_set(nr) &&
(direction= test_if_order_by_key(order, table, nr, &used_key_parts)))
{
int flag;
if ((flag= test_if_order_by_key(order, table, nr, &not_used)))
{
if (!no_changes)
{
tab->index=nr;
tab->read_first_record= (flag > 0 ? join_read_first:
join_read_last);
tab->type=JT_NEXT; // Read with index_first(), index_next()
if (table->covering_keys.is_set(nr))
{
table->key_read=1;
table->file->extra(HA_EXTRA_KEYREAD);
}
}
DBUG_RETURN(1);
}
bool is_covering= table->covering_keys.is_set(nr) ||
nr == table->s->primary_key &&
table->file->primary_key_is_clustered();
/*
Don't use an index scan with ORDER BY without limit.
For GROUP BY without limit always use index scan
if there is a suitable index.
Why we hold to this asymmetry hardly can be explained
rationally. It's easy to demonstrate that using
temporary table + filesort could be cheaper for grouping
queries too.
*/
if (is_covering ||
select_limit != HA_POS_ERROR ||
ref_key < 0 && (group || table->force_index))
{
double rec_per_key;
double index_scan_time;
KEY *keyinfo= tab->table->key_info+nr;
if (select_limit == HA_POS_ERROR)
select_limit= table_records;
if (group)
{
rec_per_key= keyinfo->rec_per_key[used_key_parts-1];
set_if_bigger(rec_per_key, 1);
/*
With a grouping query each group containing on average
rec_per_key records produces only one row that will
be included into the result set.
*/
if (select_limit > table_records/rec_per_key)
select_limit= table_records;
else
select_limit= (ha_rows) (select_limit*rec_per_key);
}
/*
If tab=tk is not the last joined table tn then to get first
L records from the result set we can expect to retrieve
only L/fanout(tk,tn) where fanout(tk,tn) says how many
rows in the record set on average will match each row tk.
Usually our estimates for fanouts are too pessimistic.
So the estimate for L/fanout(tk,tn) will be too optimistic
and as result we'll choose an index scan when using ref/range
access + filesort will be cheaper.
*/
select_limit= (ha_rows) (select_limit < fanout ?
1 : select_limit/fanout);
/*
We assume that each of the tested indexes is not correlated
with ref_key. Thus, to select first N records we have to scan
N/selectivity(ref_key) index entries.
selectivity(ref_key) = #scanned_records/#table_records =
table->quick_condition_rows/table_records.
In any case we can't select more than #table_records.
N/(table->quick_condition_rows/table_records) > table_records
<=> N > table->quick_condition_rows.
*/
if (select_limit > table->quick_condition_rows)
select_limit= table_records;
else
select_limit= (ha_rows) (select_limit *
(double) table_records /
table->quick_condition_rows);
rec_per_key= keyinfo->rec_per_key[keyinfo->key_parts-1];
set_if_bigger(rec_per_key, 1);
/*
Here we take into account the fact that rows are
accessed in sequences rec_per_key records in each.
Rows in such a sequence are supposed to be ordered
by rowid/primary key. When reading the data
in a sequence we'll touch not more pages than the
table file contains.
TODO. Use the formula for a disk sweep sequential access
to calculate the cost of accessing data rows for one
index entry.
*/
index_scan_time= select_limit/rec_per_key *
min(rec_per_key, table->file->scan_time());
if (is_covering ||
ref_key < 0 && (group || table->force_index) ||
index_scan_time < read_time)
{
ha_rows quick_records= table_records;
if (is_best_covering && !is_covering)
continue;
if (table->quick_keys.is_set(nr))
quick_records= table->quick_rows[nr];
if (best_key < 0 ||
(select_limit <= min(quick_records,best_records) ?
keyinfo->key_parts < best_key_parts :
quick_records < best_records))
{
best_key= nr;
best_key_parts= keyinfo->key_parts;
best_records= quick_records;
is_best_covering= is_covering;
best_key_direction= direction;
}
}
}
}
}
if (best_key >= 0)
{
bool quick_created= FALSE;
if (table->quick_keys.is_set(best_key) && best_key != ref_key)
{
key_map map;
map.clear_all(); // Force the creation of quick select
map.set_bit(best_key); // only best_key.
quick_created=
select->test_quick_select(join->thd, map, 0,
join->select_options & OPTION_FOUND_ROWS ?
HA_POS_ERROR :
join->unit->select_limit_cnt,
0) > 0;
}
if (!no_changes)
{
if (!quick_created)
{
tab->index= best_key;
tab->read_first_record= best_key_direction > 0 ?
join_read_first:join_read_last;
tab->type=JT_NEXT; // Read with index_first(), index_next()
if (table->covering_keys.is_set(best_key))
{
table->key_read=1;
table->file->extra(HA_EXTRA_KEYREAD);
}
table->file->ha_index_or_rnd_end();
if (join->select_options & SELECT_DESCRIBE)
{
tab->ref.key= -1;
tab->ref.key_parts= 0;
if (tab->select)
tab->select->quick= 0;
if (select_limit < table_records)
tab->limit= select_limit;
}
}
}
used_key_parts= best_key_parts;
order_direction= best_key_direction;
}
else
DBUG_RETURN(0);
}
check_reverse_order:
if (order_direction == -1) // If ORDER BY ... DESC
{
if (select && select->quick)
{
/*
Don't reverse the sort order, if it's already done.
(In some cases test_if_order_by_key() can be called multiple times
*/
if (!select->quick->reverse_sorted())
{
QUICK_SELECT_DESC *tmp;
int quick_type= select->quick->get_type();
if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION ||
quick_type == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX)
{
tab->limit= 0;
select->quick= save_quick;
DBUG_RETURN(0); // Use filesort
}
/* ORDER BY range_key DESC */
tmp= new QUICK_SELECT_DESC((QUICK_RANGE_SELECT*)(select->quick),
used_key_parts);
if (!tmp || tmp->error)
{
delete tmp;
select->quick= save_quick;
tab->limit= 0;
DBUG_RETURN(0); // Reverse sort not supported
}
select->quick=tmp;
}
}
else if (tab->ref.key >= 0 && tab->ref.key_parts < used_key_parts)
{
/*
SELECT * FROM t1 WHERE a=1 ORDER BY a DESC,b DESC
Use a traversal function that starts by reading the last row
with key part (A) and then traverse the index backwards.
*/
tab->read_first_record= join_read_last_key;
tab->read_record.read_record= join_read_prev_same;
}
}
DBUG_RETURN(0); // Can't use index.
else if (select && select->quick)
select->quick->sorted= 1;
DBUG_RETURN(1);
}
@ -15524,7 +15680,7 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
if (tab->select && tab->select->quick)
examined_rows= tab->select->quick->records;
else if (tab->type == JT_NEXT || tab->type == JT_ALL)
examined_rows= tab->table->file->records();
examined_rows= tab->limit ? tab->limit : tab->table->file->records();
else
examined_rows=(ha_rows)join->best_positions[i].records_read;