MWL#121: DS-MRR support for clustered primary keys

- Merge with current 5.3
2025-08-08 11:22:35 +03:00 · 2010-07-16 13:38:23 +04:00
parent 75bba30c5a 488d352a66
commit e1006e9e1f
14 changed files with 731 additions and 92 deletions
--- a/mysql-test/r/innodb_mrr_cpk.result
+++ b/mysql-test/r/innodb_mrr_cpk.result
@@ -0,0 +1,148 @@
 drop table if exists t0,t1,t2,t3;
 set @save_join_cache_level=@@join_cache_level;
 set join_cache_level=6;
 set @save_storage_engine=@@storage_engine;
 set storage_engine=innodb;
 create table t0(a int);
 insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1(a char(8), b char(8), filler char(100), primary key(a));
 show create table t1;
 Table	Create Table
 t1	CREATE TABLE `t1` (
  `a` char(8) NOT NULL DEFAULT '',
  `b` char(8) DEFAULT NULL,
  `filler` char(100) DEFAULT NULL,
  PRIMARY KEY (`a`)
 ) ENGINE=InnoDB DEFAULT CHARSET=latin1
 insert into t1 select 
 concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
 concat('b-', 1000 + A.a + B.a*10 + C.a*100, '=B'),
 'filler'
 from t0 A, t0 B, t0 C;
 create table t2 (a char(8));
 insert into t2 values ('a-1010=A'), ('a-1030=A'), ('a-1020=A');
 This should use join buffer:
 explain select * from t1, t2 where t1.a=t2.a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
 1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	8	test.t2.a	1	Using join buffer
 This output must be sorted by value of t1.a:
 select * from t1, t2 where t1.a=t2.a;
 a	b	filler	a
 a-1010=A	b-1010=B	filler	a-1010=A
 a-1020=A	b-1020=B	filler	a-1020=A
 a-1030=A	b-1030=B	filler	a-1030=A
 drop table t1, t2;
 create table t1(
 a char(8) character set utf8, b int, filler char(100), 
 primary key(a,b)
 );
 insert into t1 select 
 concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
 1000 + A.a + B.a*10 + C.a*100,
 'filler'
 from t0 A, t0 B, t0 C;
 create table t2 (a char(8) character set utf8, b int);
 insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
 1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 a	b	filler	a	b
 a-1010=A	1010	filler	a-1010=A	1010
 a-1020=A	1020	filler	a-1020=A	1020
 a-1030=A	1030	filler	a-1030=A	1030
 insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	5	
 1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 a	b	filler	a	b
 a-1010=A	1010	filler	a-1010=A	1010
 a-1020=A	1020	filler	a-1020=A	1020
 a-1020=A	1020	filler	a-1020=A	1020
 a-1030=A	1030	filler	a-1030=A	1030
 a-1030=A	1030	filler	a-1030=A	1030
 drop table t1, t2;
 create table t1(
 a varchar(8) character set utf8, b int, filler char(100), 
 primary key(a,b)
 );
 insert into t1 select 
 concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
 1000 + A.a + B.a*10 + C.a*100,
 'filler'
 from t0 A, t0 B, t0 C;
 create table t2 (a char(8) character set utf8, b int);
 insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
 1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	30	test.t2.a,test.t2.b	1	Using index condition(BKA); Using join buffer
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 a	b	filler	a	b
 a-1010=A	1010	filler	a-1010=A	1010
 a-1020=A	1020	filler	a-1020=A	1020
 a-1030=A	1030	filler	a-1030=A	1030
 explain select * from t1, t2 where t1.a=t2.a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
 1	SIMPLE	t1	ref	PRIMARY	PRIMARY	26	test.t2.a	1	Using index condition(BKA); Using join buffer
 select * from t1, t2 where t1.a=t2.a;
 a	b	filler	a	b
 a-1010=A	1010	filler	a-1010=A	1010
 a-1020=A	1020	filler	a-1020=A	1020
 a-1030=A	1030	filler	a-1030=A	1030
 drop table t1, t2;
 create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
 insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
 insert into t1 values (11, 11, 11,   'filler');
 insert into t1 values (11, 11, 12,   'filler');
 insert into t1 values (11, 11, 13,   'filler');
 insert into t1 values (11, 22, 1234, 'filler');
 insert into t1 values (11, 33, 124,  'filler');
 insert into t1 values (11, 33, 125,  'filler');
 create table t2 (a int, b int);
 insert into t2 values (11,33), (11,22), (11,11);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
 1	SIMPLE	t1	ref	PRIMARY	PRIMARY	8	test.t2.a,test.t2.b	1	Using join buffer
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 a	b	c	filler	a	b
 11	11	11	filler	11	11
 11	11	12	filler	11	11
 11	11	13	filler	11	11
 11	22	1234	filler	11	22
 11	33	124	filler	11	33
 11	33	125	filler	11	33
 set join_cache_level=0;
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 a	b	c	filler	a	b
 11	33	124	filler	11	33
 11	33	125	filler	11	33
 11	22	1234	filler	11	22
 11	11	11	filler	11	11
 11	11	12	filler	11	11
 11	11	13	filler	11	11
 set join_cache_level=6;
 explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
 1	SIMPLE	t1	ref	PRIMARY	PRIMARY	4	test.t2.a	1	Using index condition(BKA); Using join buffer
 select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 a	b	c	filler	a	b
 set optimizer_switch='index_condition_pushdown=off';
 explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
 1	SIMPLE	t1	ref	PRIMARY	PRIMARY	4	test.t2.a	1	Using where; Using join buffer
 select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 a	b	c	filler	a	b
 set optimizer_switch='index_condition_pushdown=on';
 drop table t1,t2;
 set @@join_cache_level= @save_join_cache_level;
 set storage_engine=@save_storage_engine;
 drop table t0;
--- a/mysql-test/t/innodb_mrr_cpk.test
+++ b/mysql-test/t/innodb_mrr_cpk.test
@@ -0,0 +1,137 @@
 # 
 # Tests for DS-MRR over clustered primary key. The only engine that supports
 # this is InnoDB/XtraDB.
 #
 # Basic idea about testing
 #  - DS-MRR/CPK works only with BKA
 #  - Should also test index condition pushdown
 #  - Should also test whatever uses RANGE_SEQ_IF::skip_record() for filtering
 #  - Also test access using prefix of primary key
 # 
 #  - Forget about cost model, BKA's multi_range_read_info() call passes 10 for
 #    #rows, the call is there at all only for applicability check
 # 
 -- source include/have_innodb.inc
 --disable_warnings
 drop table if exists t0,t1,t2,t3;
 --enable_warnings
 set @save_join_cache_level=@@join_cache_level;
 set join_cache_level=6;
 set @save_storage_engine=@@storage_engine;
 set storage_engine=innodb;
 create table t0(a int);
 insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
 create table t1(a char(8), b char(8), filler char(100), primary key(a));
 show create table t1;
 insert into t1 select 
  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
  concat('b-', 1000 + A.a + B.a*10 + C.a*100, '=B'),
  'filler'
 from t0 A, t0 B, t0 C;
 create table t2 (a char(8));
 insert into t2 values ('a-1010=A'), ('a-1030=A'), ('a-1020=A');
 --echo This should use join buffer:
 explain select * from t1, t2 where t1.a=t2.a;
 --echo This output must be sorted by value of t1.a:
 select * from t1, t2 where t1.a=t2.a;
 drop table t1, t2;
 # Try multi-column indexes
 create table t1(
  a char(8) character set utf8, b int, filler char(100), 
  primary key(a,b)
 );
 insert into t1 select 
  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
  1000 + A.a + B.a*10 + C.a*100,
  'filler'
 from t0 A, t0 B, t0 C;
 create table t2 (a char(8) character set utf8, b int);
 insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 # Try with dataset that causes identical lookup keys:
 insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 drop table t1, t2;
 create table t1(
  a varchar(8) character set utf8, b int, filler char(100), 
  primary key(a,b)
 );
 insert into t1 select 
  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
  1000 + A.a + B.a*10 + C.a*100,
  'filler'
 from t0 A, t0 B, t0 C;
 create table t2 (a char(8) character set utf8, b int);
 insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 # 
 # Try scanning on a CPK prefix
 #
 explain select * from t1, t2 where t1.a=t2.a;
 select * from t1, t2 where t1.a=t2.a;
 drop table t1, t2;
 #
 # The above example is not very interesting, as CPK prefix has 
 # only one match.  Create a dataset where scan on CPK prefix 
 # would produce multiple matches:
 #
 create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
 insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
 insert into t1 values (11, 11, 11,   'filler');
 insert into t1 values (11, 11, 12,   'filler');
 insert into t1 values (11, 11, 13,   'filler');
 insert into t1 values (11, 22, 1234, 'filler');
 insert into t1 values (11, 33, 124,  'filler');
 insert into t1 values (11, 33, 125,  'filler');
 create table t2 (a int, b int);
 insert into t2 values (11,33), (11,22), (11,11);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 # Check a real resultset for comaprison:
 set join_cache_level=0;
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 set join_cache_level=6;
 #
 # Check that Index Condition Pushdown (BKA) actually works:
 #
 explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 set optimizer_switch='index_condition_pushdown=off';
 explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 set optimizer_switch='index_condition_pushdown=on';
 drop table t1,t2;
 set @@join_cache_level= @save_join_cache_level;
 set storage_engine=@save_storage_engine;
 drop table t0;
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1278,9 +1278,9 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
                         COST_VECT *cost);
 /*
-  The below two are not used (and not handled) in this milestone of this WL
+  Indicates that all scanned ranges will be singlepoint (aka equality) ranges.
-  entry because there seems to be no use for them at this stage of
+  The ranges may not use the full key but all of them will use the same number
-  implementation.
+  of key parts.
 */
 #define HA_MRR_SINGLE_POINT 1
 #define HA_MRR_FIXED_KEY  2
@@ -1806,9 +1806,10 @@ public:
                                              uint n_ranges, uint *bufsz,
                                              uint *flags, COST_VECT *cost);
  virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                        uint *bufsz, uint *flags, COST_VECT *cost);
+                                        uint key_parts, uint *bufsz, 
                                        uint *flags, COST_VECT *cost);
  virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                    uint n_ranges, uint mode,
+                                    uint n_ranges, uint mode, 
                                    HANDLER_BUFFER *buf);
  virtual int multi_range_read_next(char **range_info);
  virtual int read_range_first(const key_range *start_key,
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -1,4 +1,5 @@
 #include "mysql_priv.h"
 #include <my_bit.h>
 #include "sql_select.h"
 /****************************************************************************
@@ -136,10 +137,16 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 */
 ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
-                                       uint *bufsz, uint *flags, COST_VECT *cost)
+                                       uint key_parts, uint *bufsz, 
                                       uint *flags, COST_VECT *cost)
 {
-  *bufsz= 0; /* Default implementation doesn't need a buffer */
+  /* 
    Currently we expect this function to be called only in preparation of scan
    with HA_MRR_SINGLE_POINT property.
  */
  DBUG_ASSERT(*flags | HA_MRR_SINGLE_POINT);
  *bufsz= 0; /* Default implementation doesn't need a buffer */
  *flags |= HA_MRR_USE_DEFAULT_IMPL;
  cost->zero();
@@ -316,25 +323,39 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
  {
    use_default_impl= TRUE;
    const int retval=
-      h->handler::multi_range_read_init(seq_funcs, seq_init_param,
+      h->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
-                                        n_ranges, mode, buf);
+                                        mode, buf);
    DBUG_RETURN(retval);
  }
-  rowids_buf= buf->buffer;
+  mrr_buf= buf->buffer;
  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
  if (is_mrr_assoc)
    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
-  rowids_buf_end= buf->buffer_end;
+  mrr_buf_end= buf->buffer_end;
  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
  rowids_buf_last= rowids_buf + 
                      ((rowids_buf_end - rowids_buf)/ elem_size)*
                      elem_size;
  rowids_buf_end= rowids_buf_last;
-    /*
+  if ((doing_cpk_scan= check_cpk_scan(h->active_index, mode)))
  {
    /* It's a DS-MRR/CPK scan */
    cpk_tuple_length= 0; /* dummy value telling it needs to be inited */
    cpk_have_range= FALSE;
    use_default_impl= FALSE;
    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
    h->mrr_funcs= *seq_funcs;
    dsmrr_fill_buffer_cpk();
    if (dsmrr_eof)
      buf->end_of_used_area= mrr_buf_last;
    DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */
  }
  /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */
  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
  mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/ elem_size)* elem_size;
  mrr_buf_end= mrr_buf_last;
  /*
    There can be two cases:
    - This is the first call since index_init(), h2==NULL
       Need to setup h2 then.
@@ -406,8 +427,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
      goto error;
  }
-  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
+  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
-                                          mode, buf) || 
+                                         mode, buf) ||
      dsmrr_fill_buffer())
  {
    goto error;
@@ -417,7 +438,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
    adjust *buf to indicate that the remaining buffer space will not be used.
  */
  if (dsmrr_eof) 
-    buf->end_of_used_area= rowids_buf_last;
+    buf->end_of_used_area= mrr_buf_last;
  /*
     h->inited == INDEX may occur when 'range checked for each record' is
@@ -473,6 +494,9 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
  rowid and return.
  The function assumes that rowids buffer is empty when it is invoked. 
  dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
  scanning.
  @param h  Table handler
@@ -487,8 +511,8 @@ int DsMrr_impl::dsmrr_fill_buffer()
  int res;
  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
-  rowids_buf_cur= rowids_buf;
+  mrr_buf_cur= mrr_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
+  while ((mrr_buf_cur < mrr_buf_end) && 
         !(res= h2->handler::multi_range_read_next(&range_info)))
  {
    KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
@@ -498,13 +522,13 @@ int DsMrr_impl::dsmrr_fill_buffer()
    /* Put rowid, or {rowid, range_id} pair into the buffer */
    h2->position(table->record[0]);
-    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
+    memcpy(mrr_buf_cur, h2->ref, h2->ref_length);
-    rowids_buf_cur += h2->ref_length;
+    mrr_buf_cur += h2->ref_length;
    if (is_mrr_assoc)
    {
-      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
+      memcpy(mrr_buf_cur, &range_info, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
+      mrr_buf_cur += sizeof(void*);
    }
  }
@@ -514,16 +538,224 @@ int DsMrr_impl::dsmrr_fill_buffer()
  /* Sort the buffer contents by rowid */
  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
-  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
+  my_qsort2(mrr_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
            (void*)h);
-  rowids_buf_last= rowids_buf_cur;
+  mrr_buf_last= mrr_buf_cur;
-  rowids_buf_cur=  rowids_buf;
+  mrr_buf_cur=  mrr_buf;
  DBUG_RETURN(0);
 }
 /* 
  my_qsort2-compatible function to compare key tuples 
 */
 int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 {
  DsMrr_impl *dsmrr= (DsMrr_impl*)arg;
  TABLE *table= dsmrr->h->table;
  KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part;
  uchar *key1_end= key1 + dsmrr->cpk_tuple_length;
  while (key1 < key1_end)
  {
    Field* f = part->field;
    int len = part->store_length;
    int res = f->cmp(key1, key2);
    if (res)
      return res;
    key1 += len;
    key2 += len;
    part++;
  }
  return 0;
 }
 /*
  DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
  SYNOPSIS
    DsMrr_impl::dsmrr_fill_buffer_cpk()
  DESCRIPTION
    DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
    dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
    we're scanning.
 */
 void DsMrr_impl::dsmrr_fill_buffer_cpk()
 {
  int res;
  KEY_MULTI_RANGE cur_range;
  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer_cpk");
  mrr_buf_cur= mrr_buf;
  while ((mrr_buf_cur < mrr_buf_end) && 
         !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
  {
    DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
    DBUG_ASSERT(!cpk_tuple_length || 
                cpk_tuple_length == cur_range.start_key.length);
    if (!cpk_tuple_length)
    {
      cpk_tuple_length= cur_range.start_key.length;
      cpk_is_unique_scan= test(table->key_info[h->active_index].key_parts == 
                               my_count_bits(cur_range.start_key.keypart_map));
      uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
      mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/elem_size) * elem_size;
      mrr_buf_end= mrr_buf_last;
    }
    /* Put key, or {key, range_id} pair into the buffer */
    memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
    mrr_buf_cur += cpk_tuple_length;
    if (is_mrr_assoc)
    {
      memcpy(mrr_buf_cur, &cur_range.ptr, sizeof(void*));
      mrr_buf_cur += sizeof(void*);
    }
  }
  dsmrr_eof= test(res);
  /* Sort the buffer contents by rowid */
  uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
  my_qsort2(mrr_buf, n_rowids, elem_size, 
            (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
  mrr_buf_last= mrr_buf_cur;
  mrr_buf_cur=  mrr_buf;
  DBUG_VOID_RETURN;
 }
 /*
  DS-MRR/CPK: multi_range_read_next() function
  DESCRIPTION
    DsMrr_impl::dsmrr_next_cpk()
      range_info  OUT  identifier of range that the returned record belongs to
  DESCRIPTION
    DS-MRR/CPK: multi_range_read_next() function. 
    This is similar to DsMrr_impl::dsmrr_next(), the differences are that
     - we get records with index_read(), not with rnd_pos()
     - we may get multiple records for one key (=element of the buffer)
     - unlike dsmrr_fill_buffer(), dsmrr_fill_buffer_cpk() never fails.
  RETURN
    0                   OK, next record was successfully read
    HA_ERR_END_OF_FILE  End of records
    Other               Some other error
 */
 int DsMrr_impl::dsmrr_next_cpk(char **range_info)
 {
  int res;
  while (cpk_have_range)
  {
    if (h->mrr_funcs.skip_record &&
        h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL))
    {
      cpk_have_range= FALSE;
      break;
    }
    res= h->index_next_same(table->record[0], mrr_buf_cur, cpk_tuple_length);
    if (h->mrr_funcs.skip_index_tuple &&
        h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
      continue;
    if (res != HA_ERR_END_OF_FILE)
    {
      if (is_mrr_assoc)
        memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
      return res;
    }
    /* No more records in this range. Exit this loop and go get another range */
    cpk_have_range= FALSE;
  }
  do
  {
    /* First, make sure we have a range at start of the buffer */
    if (mrr_buf_cur == mrr_buf_last)
    {
      if (dsmrr_eof)
      {
        res= HA_ERR_END_OF_FILE;
        goto end;
      }
      dsmrr_fill_buffer_cpk();
    }
    if (mrr_buf_cur == mrr_buf_last)
    {
      res= HA_ERR_END_OF_FILE;
      goto end;
    }
    /* Ok, got the range. Try making a lookup.  */
    uchar *lookup_tuple= mrr_buf_cur;
    mrr_buf_cur += cpk_tuple_length;
    if (is_mrr_assoc)
    {
      memcpy(&cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
      mrr_buf_cur += sizeof(void*) * test(is_mrr_assoc);
    }
    if (h->mrr_funcs.skip_record &&
        h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL))
      continue;
    res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, 
                       HA_READ_KEY_EXACT);
    /*
      Check pushed index condition. Performance-wise, it does not make any
      sense to put this call here (the above call has already accessed the full
      record). That's the best I could do, though, because:
      - ha_innobase doesn't support IndexConditionPushdown on clustered PK
      - MRR interface doesn't allow the storage engine to refuse a pushed index
        condition.
      Having this call here is not fully harmless: EXPLAIN shows "pushed index
      condition", which is technically true but doesn't bring the benefits that
      one might expect.
    */
    if (h->mrr_funcs.skip_index_tuple &&
        h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
      continue;
    if (res && res != HA_ERR_END_OF_FILE)
      goto end;
    if (!res)
    {
      memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
      /* 
        Attempt reading more rows from this range only if there actually can
        be multiple matches:
       */
      cpk_have_range= !cpk_is_unique_scan;
      break;
    }
  } while (true);
 end:
  return res;
 }
 /**
  DS-MRR implementation: multi_range_read_next() function
 */
@@ -536,10 +768,13 @@ int DsMrr_impl::dsmrr_next(char **range_info)
  if (use_default_impl)
    return h->handler::multi_range_read_next(range_info);
  if (doing_cpk_scan)
    return dsmrr_next_cpk(range_info);
  do
  {
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
    {
      if (dsmrr_eof)
      {
@@ -552,17 +787,17 @@ int DsMrr_impl::dsmrr_next(char **range_info)
    }
    /* return eof if there are no rowids in the buffer after re-fill attempt */
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
    {
      res= HA_ERR_END_OF_FILE;
      goto end;
    }
-    rowid= rowids_buf_cur;
+    rowid= mrr_buf_cur;
    if (is_mrr_assoc)
-      memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**));
+      memcpy(&cur_range_info, mrr_buf_cur + h->ref_length, sizeof(uchar**));
-    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+    mrr_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
    if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
      continue;
@@ -582,7 +817,8 @@ end:
 /**
  DS-MRR implementation: multi_range_read_info() function
 */
-ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
+ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, 
                               uint key_parts,
                               uint *bufsz, uint *flags, COST_VECT *cost)
 {  
  ha_rows res;
@@ -590,8 +826,8 @@ ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
  uint def_bufsz= *bufsz;
  /* Get cost/flags/mem_usage of default MRR implementation */
-  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
+  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, key_parts, 
-                                         &def_flags, cost);
+                                         &def_bufsz, &def_flags, cost);
  DBUG_ASSERT(!res);
  if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || 
@@ -683,7 +919,33 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
  return FALSE;
 }
-/**
+
 /*
  Check if key/flags allow DS-MRR/CPK strategy to be used
  SYNOPSIS
   DsMrr_impl::check_cpk_scan()
     keyno      Index that will be used
     mrr_flags  
  DESCRIPTION
    Check if key/flags allow DS-MRR/CPK strategy to be used. 
  RETURN
    TRUE   DS-MRR/CPK should be used
    FALSE  Otherwise
 */
 bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
 {
  return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
              !(mrr_flags & HA_MRR_SORTED) && 
              keyno == table->s->primary_key && 
              h->primary_key_is_clustered());
 }
 /*
  DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
  Make the choice between using Default MRR implementation and DS-MRR.
@@ -706,14 +968,18 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
  @retval FALSE  DS-MRR implementation should be used
 */
 bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
                                 uint *bufsz, COST_VECT *cost)
 {
  COST_VECT dsmrr_cost;
  bool res;
  THD *thd= current_thd;
  doing_cpk_scan= check_cpk_scan(keyno, *flags); 
  if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
-      (keyno == table->s->primary_key && h->primary_key_is_clustered()) ||
+      (keyno == table->s->primary_key && h->primary_key_is_clustered() &&
       !doing_cpk_scan) ||
       key_uses_partial_cols(table, keyno))
  {
    /* Use the default implementation */
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -1,16 +1,76 @@
 /*
-  This file contains declarations for 
+  This file contains declarations for Disk-Sweep MultiRangeRead (DS-MRR) 
-   - Disk-Sweep MultiRangeRead (DS-MRR) implementation
+  implementation
 */
 /**
-  A Disk-Sweep MRR interface implementation
+  A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
-  This implementation makes range (and, in the future, 'ref') scans to read
+  This is a "plugin"(*) for storage engines that allows make index scans 
-  table rows in disk sweeps. 
+  read table rows in rowid order. For disk-based storage engines, this is
-  
+  faster than reading table rows in whatever-SQL-layer-makes-calls-in order.
-  Currently it is used by MyISAM and InnoDB. Potentially it can be used with
+
-  any table handler that has non-clustered indexes and on-disk rows.
+  (*) - only conceptually. No dynamic loading or binary compatibility of any
        kind.
  General scheme of things:
      SQL Layer code
       |   |   |
      -v---v---v---- handler->multi_range_read_XXX() function calls
       |   |   |
      ____________________________________
     / DS-MRR module                      \
     |  (scan indexes, order rowids, do    |
     |   full record reads in rowid order) |
     \____________________________________/
       |   |   |
      -|---|---|----- handler->read_range_first()/read_range_next(), 
       |   |   |      handler->index_read(), handler->rnd_pos() calls.
       |   |   |
       v   v   v
      Storage engine internals
  Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines.
  Potentially it can be used with any table handler that has disk-based data
  storage and has better performance when reading data in rowid order.
 */
 /*
  DS-MRR implementation for one table. Create/use one object of this class for
  each ha_{myisam/innobase/etc} object. That object will be further referred to
  as "the handler"
  There are actually three strategies
   S1. Bypass DS-MRR, pass all calls to default implementation (i.e. to
      MRR-to-non-MRR calls converter)
   S2. Regular DS-MRR 
   S3. DS-MRR/CPK for doing scans on clustered primary keys.
  S1 is used for cases which DS-MRR is unable to handle for some reason.
  S2 is the actual DS-MRR. The basic algorithm is as follows:
    1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 
        fill the buffer with {rowid, range_id} pairs
    2. Sort the buffer by rowid
    3. for each {rowid, range_id} pair in the buffer
         get record by rowid and return the {record, range_id} pair
    4. Repeat the above steps until we've exhausted the list of ranges we're
       scanning.
  S3 is the variant of DS-MRR for use with clustered primary keys (or any
  clustered index). The idea is that in clustered index it is sufficient to 
  access the index in index order, and we don't need an intermediate steps to
  get rowid (like step #1 in S2).
   DS-MRR/CPK's basic algorithm is as follows:
    1. Collect a number of ranges (=lookup keys)
    2. Sort them so that they follow in index order.
    3. for each {lookup_key, range_id} pair in the buffer 
       get record(s) matching the lookup key and return {record, range_id} pairs
    4. Repeat the above steps until we've exhausted the list of ranges we're
       scanning.
 */
 class DsMrr_impl
@@ -21,29 +81,6 @@ public:
  DsMrr_impl()
    : h2(NULL) {};
  /*
    The "owner" handler object (the one that calls dsmrr_XXX functions.
    It is used to retrieve full table rows by calling rnd_pos().
  */
  handler *h;
  TABLE *table; /* Always equal to h->table */
 private:
  /* Secondary handler object.  It is used for scanning the index */
  handler *h2;
  /* Buffer to store rowids, or (rowid, range_id) pairs */
  uchar *rowids_buf;
  uchar *rowids_buf_cur;   /* Current position when reading/writing */
  uchar *rowids_buf_last;  /* When reading: end of used buffer space */
  uchar *rowids_buf_end;   /* End of the buffer */
  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
  bool is_mrr_assoc;
  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
 public:
  void init(handler *h_arg, TABLE *table_arg)
  {
    h= h_arg; 
@@ -52,19 +89,62 @@ public:
  int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, 
                 uint n_ranges, uint mode, HANDLER_BUFFER *buf);
  void dsmrr_close();
  int dsmrr_fill_buffer();
  int dsmrr_next(char **range_info);
-  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz,
+  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 
-                     uint *flags, COST_VECT *cost);
+                     uint *bufsz, uint *flags, COST_VECT *cost);
  ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 
                            void *seq_init_param, uint n_ranges, uint *bufsz,
                            uint *flags, COST_VECT *cost);
 private:
  /*
    The "owner" handler object (the one that calls dsmrr_XXX functions.
    It is used to retrieve full table rows by calling rnd_pos().
  */
  handler *h;
  TABLE *table; /* Always equal to h->table */
  /* Secondary handler object.  It is used for scanning the index */
  handler *h2;
  /* Buffer to store rowids, or (rowid, range_id) pairs */
  uchar *mrr_buf;
  uchar *mrr_buf_cur;   /* Current position when reading/writing */
  uchar *mrr_buf_last;  /* When reading: end of used buffer space */
  uchar *mrr_buf_end;   /* End of the buffer */
  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
  bool is_mrr_assoc;
  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
  bool doing_cpk_scan; /* TRUE <=> DS-MRR/CPK variant is used */
  /** DS-MRR/CPK variables start */
  /* Length of lookup tuple being used, in bytes */
  uint cpk_tuple_length;
  /*
    TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
    can get max. one match for each key 
  */
  bool cpk_is_unique_scan;
  /* TRUE<=> we're in a middle of enumerating records from a range */ 
  bool cpk_have_range;
  /* Valid if cpk_have_range==TRUE: range_id of the range we're enumerating */
  char *cpk_saved_range_info;
  bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                       COST_VECT *cost);
  bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
                               uint *buffer_size, COST_VECT *cost);
  bool check_cpk_scan(uint keyno, uint mrr_flags);
  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
  int dsmrr_fill_buffer();
  void dsmrr_fill_buffer_cpk();
  int dsmrr_next_cpk(char **range_info);
 };
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -8005,6 +8005,7 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
  quick->mrr_buf_size= thd->variables.mrr_buff_size;
  if (table->file->multi_range_read_info(quick->index, 1, (uint)records,
                                         uint(-1), 
                                         &quick->mrr_buf_size,
                                         &quick->mrr_flags, &cost))
    goto err;
--- a/sql/sql_join_cache.cc
+++ b/sql/sql_join_cache.cc
@@ -2390,8 +2390,8 @@ JOIN_CACHE_BKA::init_join_matching_records(RANGE_SEQ_IF *seq_funcs, uint ranges)
  */ 
  if (!file->inited)
    file->ha_index_init(join_tab->ref.key, 1);
-  if ((error= file->multi_range_read_init(seq_funcs, (void*) this, ranges,
+  if ((error= file->multi_range_read_init(seq_funcs, (void*) this, ranges, 
-					  mrr_mode, &mrr_buff)))
+                                          mrr_mode, &mrr_buff)))
    rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
  return rc;
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7508,10 +7508,11 @@ uint check_join_cache_usage(JOIN_TAB *tab,
  case JT_EQ_REF:
    if (cache_level <= 4)
      return 0;
-    flags= HA_MRR_NO_NULL_ENDPOINTS;
+    flags= HA_MRR_NO_NULL_ENDPOINTS | HA_MRR_SINGLE_POINT;
    if (tab->table->covering_keys.is_set(tab->ref.key))
      flags|= HA_MRR_INDEX_ONLY;
    rows= tab->table->file->multi_range_read_info(tab->ref.key, 10, 20,
                                                  tab->ref.key_parts,
                                                  &bufsz, &flags, &cost);
    if ((rows != HA_POS_ERROR) && !(flags & HA_MRR_USE_DEFAULT_IMPL) &&
        (!(flags & HA_MRR_NO_ASSOCIATION) || cache_level > 6) &&
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -3503,8 +3503,8 @@ static SHOW_VAR status_variables[]= {
 ***************************************************************************/
 int ha_maria::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                     uint n_ranges, uint mode, 
+                                    uint n_ranges, uint mode, 
-                                     HANDLER_BUFFER *buf)
+                                    HANDLER_BUFFER *buf)
 {
  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
 }
@@ -3530,11 +3530,11 @@ ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 }
 ha_rows ha_maria::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                        uint *bufsz, uint *flags, 
+                                       uint key_parts, uint *bufsz, 
-                                        COST_VECT *cost)
+                                       uint *flags, COST_VECT *cost)
 {
  ds_mrr.init(this, table);
-  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, flags, cost);
 }
 /* MyISAM MRR implementation ends */
--- a/storage/maria/ha_maria.h
+++ b/storage/maria/ha_maria.h
@@ -183,7 +183,8 @@ public:
                                      uint n_ranges, uint *bufsz,
                                      uint *flags, COST_VECT *cost);
  ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                uint *bufsz, uint *flags, COST_VECT *cost);
+                                uint key_parts, uint *bufsz, 
                                uint *flags, COST_VECT *cost);
  /* Index condition pushdown implementation */
  Item *idx_cond_push(uint keyno, Item* idx_cond);
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -2206,11 +2206,11 @@ ha_rows ha_myisam::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 }
 ha_rows ha_myisam::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                         uint *bufsz, uint *flags,
+                                         uint key_parts, uint *bufsz, 
-                                         COST_VECT *cost)
+                                         uint *flags, COST_VECT *cost)
 {
  ds_mrr.init(this, table);
-  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, flags, cost);
 }
 /* MyISAM MRR implementation ends */
--- a/storage/myisam/ha_myisam.h
+++ b/storage/myisam/ha_myisam.h
@@ -168,7 +168,8 @@ public:
                                      uint n_ranges, uint *bufsz,
                                      uint *flags, COST_VECT *cost);
  ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                uint *bufsz, uint *flags, COST_VECT *cost);
+                                uint key_parts, uint *bufsz, 
                                uint *flags, COST_VECT *cost);
  /* Index condition pushdown implementation */
  Item *idx_cond_push(uint keyno, Item* idx_cond);
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -11206,7 +11206,8 @@ test_innobase_convert_name()
 */
 int ha_innobase::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                          uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+                                       uint n_ranges, uint mode, 
                                       HANDLER_BUFFER *buf)
 {
  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
 }
@@ -11233,12 +11234,13 @@ ha_rows ha_innobase::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
  return res;
 }
-ha_rows ha_innobase::multi_range_read_info(uint keyno, uint n_ranges, 
+ha_rows ha_innobase::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                           uint keys, uint *bufsz, 
+                                           uint key_parts, uint *bufsz, 
                                           uint *flags, COST_VECT *cost)
 {
  ds_mrr.init(this, table);
-  ha_rows res= ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+  ha_rows res= ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, 
                                 flags, cost);
  return res;
 }
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -219,7 +219,8 @@ public:
                                      uint n_ranges, uint *bufsz,
                                      uint *flags, COST_VECT *cost);
  ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                uint *bufsz, uint *flags, COST_VECT *cost);
+                                uint key_parts, uint *bufsz, 
                                uint *flags, COST_VECT *cost);
  DsMrr_impl ds_mrr;
  Item *idx_cond_push(uint keyno, Item* idx_cond);