MWL#121: DS-MRR support for clustered primary keys

- Add testcases - Code cleanup: garbage removal, better comments, make members private where possible
2025-07-30 16:24:05 +03:00 · 2010-06-22 21:24:22 +04:00
parent 82f8ed17e1
commit 16e197f5b1
4 changed files with 577 additions and 189 deletions
--- a/mysql-test/r/innodb_mrr_cpk.result
+++ b/mysql-test/r/innodb_mrr_cpk.result
@ -0,0 +1,134 @@
+drop table if exists t0,t1,t2,t3;
+set @save_join_cache_level=@@join_cache_level;
+set join_cache_level=6;
+set @save_storage_engine=@@storage_engine;
+set storage_engine=innodb;
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a char(8), b char(8), filler char(100), primary key(a));
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` char(8) NOT NULL DEFAULT '',
+  `b` char(8) DEFAULT NULL,
+  `filler` char(100) DEFAULT NULL,
+  PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+concat('b-', 1000 + A.a + B.a*10 + C.a*100, '=B'),
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8));
+insert into t2 values ('a-1010=A'), ('a-1030=A'), ('a-1020=A');
+This should use join buffer:
+explain select * from t1, t2 where t1.a=t2.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	8	test.t2.a	1	Using join buffer
+This output must be sorted by value of t1.a:
+select * from t1, t2 where t1.a=t2.a;
+a	b	filler	a
+a-1010=A	b-1010=B	filler	a-1010=A
+a-1020=A	b-1020=B	filler	a-1020=A
+a-1030=A	b-1030=B	filler	a-1030=A
+drop table t1, t2;
+create table t1(
+a char(8) character set utf8, b int, filler char(100), 
+primary key(a,b)
+);
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+1000 + A.a + B.a*10 + C.a*100,
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	5	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+a-1030=A	1030	filler	a-1030=A	1030
+drop table t1, t2;
+create table t1(
+a varchar(8) character set utf8, b int, filler char(100), 
+primary key(a,b)
+);
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+1000 + A.a + B.a*10 + C.a*100,
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	30	test.t2.a,test.t2.b	1	Using index condition(BKA); Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+explain select * from t1, t2 where t1.a=t2.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	26	test.t2.a	1	Using index condition(BKA); Using join buffer
+select * from t1, t2 where t1.a=t2.a;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+drop table t1, t2;
+create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
+insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
+insert into t1 values (11, 11, 11,   'filler');
+insert into t1 values (11, 11, 12,   'filler');
+insert into t1 values (11, 11, 13,   'filler');
+insert into t1 values (11, 22, 1234, 'filler');
+insert into t1 values (11, 33, 124,  'filler');
+insert into t1 values (11, 33, 125,  'filler');
+create table t2 (a int, b int);
+insert into t2 values (11,33), (11,22), (11,11);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	8	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	c	filler	a	b
+11	11	11	filler	11	11
+11	11	12	filler	11	11
+11	11	13	filler	11	11
+11	22	1234	filler	11	22
+11	33	124	filler	11	33
+11	33	125	filler	11	33
+set join_cache_level=0;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	c	filler	a	b
+11	33	124	filler	11	33
+11	33	125	filler	11	33
+11	22	1234	filler	11	22
+11	11	11	filler	11	11
+11	11	12	filler	11	11
+11	11	13	filler	11	11
+set join_cache_level=6;
+drop table t1,t2;
+set @@join_cache_level= @save_join_cache_level;
+set storage_engine=@save_storage_engine;
+drop table t0;
--- a/mysql-test/t/innodb_mrr_cpk.test
+++ b/mysql-test/t/innodb_mrr_cpk.test
@ -0,0 +1,134 @@
+# 
+# Tests for DS-MRR over clustered primary key. The only engine that supports
+# this is InnoDB/XtraDB.
+#
+# Basic idea about testing
+#  - DS-MRR/CPK works only with BKA
+#  - Should also test index condition pushdown
+#  - Should also test whatever uses RANGE_SEQ_IF::skip_record() for filtering
+#  - Also test access using prefix of primary key
+# 
+#  - Forget about cost model, BKA's multi_range_read_info() call passes 10 for
+#    #rows, the call is there at all only for applicability check
+# 
+-- source include/have_innodb.inc
+
+--disable_warnings
+drop table if exists t0,t1,t2,t3;
+--enable_warnings
+
+set @save_join_cache_level=@@join_cache_level;
+set join_cache_level=6;
+
+set @save_storage_engine=@@storage_engine;
+set storage_engine=innodb;
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a char(8), b char(8), filler char(100), primary key(a));
+show create table t1;
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  concat('b-', 1000 + A.a + B.a*10 + C.a*100, '=B'),
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8));
+insert into t2 values ('a-1010=A'), ('a-1030=A'), ('a-1020=A');
+
+--echo This should use join buffer:
+explain select * from t1, t2 where t1.a=t2.a;
+
+--echo This output must be sorted by value of t1.a:
+select * from t1, t2 where t1.a=t2.a;
+drop table t1, t2;
+
+# Try multi-column indexes
+create table t1(
+  a char(8) character set utf8, b int, filler char(100), 
+  primary key(a,b)
+);
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  1000 + A.a + B.a*10 + C.a*100,
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+# Try with dataset that causes identical lookup keys:
+insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+drop table t1, t2;
+
+create table t1(
+  a varchar(8) character set utf8, b int, filler char(100), 
+  primary key(a,b)
+);
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  1000 + A.a + B.a*10 + C.a*100,
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+# 
+# Try scanning on a CPK prefix
+#
+explain select * from t1, t2 where t1.a=t2.a;
+select * from t1, t2 where t1.a=t2.a;
+drop table t1, t2;
+
+#
+# The above example is not very interesting, as CPK prefix has 
+# only one match.  Create a dataset where scan on CPK prefix 
+# would produce multiple matches:
+#
+create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
+insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
+
+insert into t1 values (11, 11, 11,   'filler');
+insert into t1 values (11, 11, 12,   'filler');
+insert into t1 values (11, 11, 13,   'filler');
+insert into t1 values (11, 22, 1234, 'filler');
+insert into t1 values (11, 33, 124,  'filler');
+insert into t1 values (11, 33, 125,  'filler');
+
+create table t2 (a int, b int);
+insert into t2 values (11,33), (11,22), (11,11);
+
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+set join_cache_level=0;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+set join_cache_level=6;
+
+drop table t1,t2;
+
+#
+# Check that Index Condition Pushdown (BKA) actually works:
+#
+
+# TODO
+
+#
+# Check that record-check-func is done:
+# 
+
+set @@join_cache_level= @save_join_cache_level;
+set storage_engine=@save_storage_engine;
+drop table t0;
+
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@ -139,8 +139,13 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
                                       uint key_parts, uint *bufsz, 
                                       uint *flags, COST_VECT *cost)
 {
+  /* 
+    Currently we expect this function to be called only in preparation of scan
+    with HA_MRR_SINGLE_POINT property.
+  */
+  DBUG_ASSERT(*flags | HA_MRR_SINGLE_POINT);
+
  *bufsz= 0; /* Default implementation doesn't need a buffer */
-  //psergey2-todo: assert for singlepoint ranges here?
  *flags |= HA_MRR_USE_DEFAULT_IMPL;

  cost->zero();
@ -323,22 +328,25 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                                        n_ranges, key_parts, mode, buf);
    DBUG_RETURN(retval);
  }
-  rowids_buf= buf->buffer;
+  mrr_buf= buf->buffer;

  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);

  if (is_mrr_assoc)
    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
 
-  rowids_buf_end= buf->buffer_end;
+  mrr_buf_end= buf->buffer_end;


  doing_cpk_scan= check_cpk_scan(h->active_index, mode); 
  if (doing_cpk_scan)
  {
+    /* 
+      When doing a scan on CPK, the buffer stores {lookup_tuple, range_id}
+      pairs 
+    */
    uint keylen=0;
    DBUG_ASSERT(key_parts != 0);
-    //psergey2-todo: new elem_size here
    for (uint kp= 0; kp < key_parts; kp++)
      keylen += table->key_info[h->active_index].key_part[kp].store_length;

@ -350,12 +358,29 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
    use_default_impl= FALSE;
  }
  else
+  {
+    /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */
    elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+  }

-  rowids_buf_last= rowids_buf + 
-                      ((rowids_buf_end - rowids_buf)/ elem_size)*
+  mrr_buf_last= mrr_buf + 
+                      ((mrr_buf_end - mrr_buf)/ elem_size)*
                      elem_size;
-  rowids_buf_end= rowids_buf_last;
+  mrr_buf_end= mrr_buf_last;
+
+  if (doing_cpk_scan)
+  {
+    /* 
+      DS-MRR/CPK: fill buffer with lookup tuples and sort; also we don't need a
+      secondary handler object.
+    */
+    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+    h->mrr_funcs= *seq_funcs;
+    dsmrr_fill_buffer_cpk();
+    if (dsmrr_eof) 
+      buf->end_of_used_area= mrr_buf_last;
+    DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */
+  }

  /*
    There can be two cases:
@ -365,9 +390,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
       The caller might have called h->index_init(), need to switch h to
       rnd_pos calls.
  */
-  //psergey2-todo: don't create secondary for CPK scan.
-  if (!doing_cpk_scan)
-  {
  if (!h2)
  {
    /* Create a separate handler object to do rndpos() calls. */
@ -431,19 +453,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
    if (res)
      goto error;
  }
-  }
-  else
-  {
-    //doing DS-MRR/CPK
-    // fill-buffer-analog
-    // eof
-    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
-    h->mrr_funcs= *seq_funcs;
-    dsmrr_fill_buffer_cpk();
-    if (dsmrr_eof) 
-      buf->end_of_used_area= rowids_buf_last;
-    DBUG_RETURN(0); // nothing can go wrong while filling the buffer
-  }

  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
                                         key_parts, mode, buf) || 
@ -456,7 +465,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
    adjust *buf to indicate that the remaining buffer space will not be used.
  */
  if (dsmrr_eof) 
-    buf->end_of_used_area= rowids_buf_last;
+    buf->end_of_used_area= mrr_buf_last;

  /*
     h->inited == INDEX may occur when 'range checked for each record' is
@ -513,6 +522,9 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
  
  The function assumes that rowids buffer is empty when it is invoked. 

+  dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
+  scanning.
+  
  @param h  Table handler

  @retval 0      OK, the next portion of rowids is in the buffer,
@ -526,8 +538,8 @@ int DsMrr_impl::dsmrr_fill_buffer()
  int res;
  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");

-  rowids_buf_cur= rowids_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
+  mrr_buf_cur= mrr_buf;
+  while ((mrr_buf_cur < mrr_buf_end) && 
         !(res= h2->handler::multi_range_read_next(&range_info)))
  {
    KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
@ -537,13 +549,13 @@ int DsMrr_impl::dsmrr_fill_buffer()
    
    /* Put rowid, or {rowid, range_id} pair into the buffer */
    h2->position(table->record[0]);
-    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
-    rowids_buf_cur += h2->ref_length;
+    memcpy(mrr_buf_cur, h2->ref, h2->ref_length);
+    mrr_buf_cur += h2->ref_length;

    if (is_mrr_assoc)
    {
-      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
+      memcpy(mrr_buf_cur, &range_info, sizeof(void*));
+      mrr_buf_cur += sizeof(void*);
    }
  }

@ -553,27 +565,29 @@ int DsMrr_impl::dsmrr_fill_buffer()

  /* Sort the buffer contents by rowid */
  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
  
-  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
+  my_qsort2(mrr_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
            (void*)h);
-  rowids_buf_last= rowids_buf_cur;
-  rowids_buf_cur=  rowids_buf;
+  mrr_buf_last= mrr_buf_cur;
+  mrr_buf_cur=  mrr_buf;
  DBUG_RETURN(0);
 }


-/* qsort-compatible function to compare key tuples */
+/* 
+  my_qsort2-compatible function to compare key tuples 
+*/
+
 int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 {
  DsMrr_impl *dsmrr= (DsMrr_impl*)arg;
  TABLE *table= dsmrr->h->table;
  
  KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part;
-  KEY_PART_INFO *part_end= part + dsmrr->cpk_n_parts;
+  uchar *key1_end= key1 + dsmrr->cpk_tuple_length;

-  //uint32 *lengths=item->field_lengths;
-  for (; part < part_end; ++part)
+  while (key1 < key1_end)
  {
    Field* f = part->field;
    int len = part->store_length;
@ -582,33 +596,43 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
      return res;
    key1 += len;
    key2 += len;
+    part++;
  }
  return 0;
 }


-//psergey2:
-int DsMrr_impl::dsmrr_fill_buffer_cpk()
+/*
+  DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+
+  DESCRIPTION
+    DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+
+    dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
+    we're scanning.
+*/
+
+void DsMrr_impl::dsmrr_fill_buffer_cpk()
 {
  int res;
  KEY_MULTI_RANGE cur_range;
  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer_cpk");

-  rowids_buf_cur= rowids_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
+  mrr_buf_cur= mrr_buf;
+  while ((mrr_buf_cur < mrr_buf_end) && 
         !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
  {
    DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
    DBUG_ASSERT(cpk_tuple_length == cur_range.start_key.length);

    /* Put key, or {key, range_id} pair into the buffer */
-    memcpy(rowids_buf_cur, cur_range.start_key.key, cpk_tuple_length);
-    rowids_buf_cur += cpk_tuple_length;
+    memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
+    mrr_buf_cur += cpk_tuple_length;

    if (is_mrr_assoc)
    {
-      memcpy(rowids_buf_cur, &cur_range.ptr, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
+      memcpy(mrr_buf_cur, &cur_range.ptr, sizeof(void*));
+      mrr_buf_cur += sizeof(void*);
    }
  }

@ -616,77 +640,82 @@ int DsMrr_impl::dsmrr_fill_buffer_cpk()

  /* Sort the buffer contents by rowid */
  uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
  
-  my_qsort2(rowids_buf, n_rowids, elem_size, 
+  my_qsort2(mrr_buf, n_rowids, elem_size, 
            (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
-  rowids_buf_last= rowids_buf_cur;
-  rowids_buf_cur=  rowids_buf;
-  DBUG_RETURN(0);
+  mrr_buf_last= mrr_buf_cur;
+  mrr_buf_cur=  mrr_buf;
+  DBUG_VOID_RETURN;
 }


 /*
-  CPK: so, the source is 
-   - buffer exhaustion/re-fill
-   - advance to next range on "record-not-found" error.
-   - if scanning on a prefix, enumerate all records for a key.
+  DS-MRR/CPK: multi_range_read_next() function
+
+  DESCRIPTION
+    DsMrr_impl::dsmrr_next_cpk() 
+
+  DESCRIPTION
+    DS-MRR/CPK: multi_range_read_next() function. 
+    This is similar to DsMrr_impl::dsmrr_next(), the differences are that
+     - we get records with index_read(), not with rnd_pos()
+     - we may get multiple records for one key (=element of the buffer)
+     - unlike dsmrr_fill_buffer(), dsmrr_fill_buffer_cpk() never fails.
+ 
+  RETURN
+    0                   OK, next record was successfully read
+    HA_ERR_END_OF_FILE  End of records
+    Other               Some other error
 */
+
 int DsMrr_impl::dsmrr_next_cpk(char **range_info)
 {
  int res;

  if (cpk_have_range)
  {
-    res= h->index_next_same(table->record[0], rowids_buf_cur, cpk_tuple_length);
+    res= h->index_next_same(table->record[0], mrr_buf_cur, cpk_tuple_length);
    if (res != HA_ERR_END_OF_FILE)
    {
-      // todo
      if (is_mrr_assoc)
        memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
      return res;
    }
-    /* 
-      Ok, we got EOF for records in this range. Fall through to get to another
-      range.
-    */
+    /* No more records in this range. Fall through to get to another range  */
  }

  do
  {
    /* First, make sure we have a range at start of the buffer */
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
    {
      if (dsmrr_eof)
      {
        res= HA_ERR_END_OF_FILE;
        goto end;
      }
-      // TODO: the return values are mix of HA_ERR_ codes and TRUE as "generic
-      //       failure" error. Is this ok?
-      if ((res= dsmrr_fill_buffer_cpk()))
-        goto end;
+      dsmrr_fill_buffer_cpk();
    }
-   
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
    {
      res= HA_ERR_END_OF_FILE;
      goto end;
    }
    
-    //TODO: make skip_index_tuple() calls, too?
-    //TODO: skip-record calls here?
+    //psergey2-todo: make skip_index_tuple() calls, too?
+    //psergey2-todo: skip-record calls here?
    //if (h2->mrr_funcs.skip_record &&
    //	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
    //  continue;
    
    /* Ok, got the range. Try making a lookup.  */
-    uchar *lookup_tuple= rowids_buf_cur;
-    rowids_buf_cur += cpk_tuple_length;
+    uchar *lookup_tuple= mrr_buf_cur;
+    mrr_buf_cur += cpk_tuple_length;
    if (is_mrr_assoc)
    {
-      memcpy(cpk_saved_range_info, rowids_buf_cur, sizeof(void*));
-      rowids_buf_cur += sizeof(void*) * test(is_mrr_assoc);
+      memcpy(cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
+      mrr_buf_cur += sizeof(void*) * test(is_mrr_assoc);
    }
      
    res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, 
@ -698,6 +727,10 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
    if (!res)
    {
      memcpy(range_info, cpk_saved_range_info, sizeof(void*));
+      /* 
+        Attempt reading more rows from this range only if there actually can
+        be multiple matches:
+       */
      cpk_have_range= !cpk_is_unique_scan;
      break;
    }
@ -707,6 +740,7 @@ end:
  return res;
 }

+
 /**
  DS-MRR implementation: multi_range_read_next() function
 */
@ -725,7 +759,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
  
  do
  {
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
    {
      if (dsmrr_eof)
      {
@ -738,17 +772,17 @@ int DsMrr_impl::dsmrr_next(char **range_info)
    }
   
    /* return eof if there are no rowids in the buffer after re-fill attempt */
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
    {
      res= HA_ERR_END_OF_FILE;
      goto end;
    }
-    rowid= rowids_buf_cur;
+    rowid= mrr_buf_cur;

    if (is_mrr_assoc)
-      memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**));
+      memcpy(&cur_range_info, mrr_buf_cur + h->ref_length, sizeof(uchar**));

-    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+    mrr_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
    if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
      continue;
@ -870,7 +904,33 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
  return FALSE;
 }

-/**
+
+/*
+  Check if key/flags allow DS-MRR/CPK strategy to be used
+  
+  SYNOPSIS
+   DsMrr_impl::check_cpk_scan()
+     keyno      Index that will be used
+     mrr_flags  
+  
+  DESCRIPTION
+    Check if key/flags allow DS-MRR/CPK strategy to be used. 
+ 
+  RETURN
+    TRUE   DS-MRR/CPK should be used
+    FALSE  Otherwise
+*/
+
+bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
+{
+  return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
+              !(mrr_flags & HA_MRR_SORTED) && 
+              keyno == table->s->primary_key && 
+              h->primary_key_is_clustered());
+}
+
+
+/*
  DS-MRR Internals: Choose between Default MRR implementation and DS-MRR

  Make the choice between using Default MRR implementation and DS-MRR.
@ -892,13 +952,7 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
  @retval TRUE   Default MRR implementation should be used
  @retval FALSE  DS-MRR implementation should be used
 */
-bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
-{
-  return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
-              !(mrr_flags & HA_MRR_SORTED) && 
-              keyno == table->s->primary_key && 
-              h->primary_key_is_clustered());
-}
+

 bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
                                 uint *bufsz, COST_VECT *cost)
@ -906,9 +960,8 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
  COST_VECT dsmrr_cost;
  bool res;
  THD *thd= current_thd;
-  //psergey2: check the criteria.
-  doing_cpk_scan= check_cpk_scan(keyno, *flags); 

+  doing_cpk_scan= check_cpk_scan(keyno, *flags); 
  if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
      (keyno == table->s->primary_key && h->primary_key_is_clustered() &&
       !doing_cpk_scan) ||
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@ -1,16 +1,76 @@
 /*
-  This file contains declarations for 
-   - Disk-Sweep MultiRangeRead (DS-MRR) implementation
+  This file contains declarations for Disk-Sweep MultiRangeRead (DS-MRR) 
+  implementation
 */

 /**
-  A Disk-Sweep MRR interface implementation
+  A Disk-Sweep implementation of MRR Interface (DS-MRR for short)

-  This implementation makes range (and, in the future, 'ref') scans to read
-  table rows in disk sweeps. 
+  This is a "plugin"(*) for storage engines that allows make index scans 
+  read table rows in rowid order. For disk-based storage engines, this is
+  faster than reading table rows in whatever-SQL-layer-makes-calls-in order.

-  Currently it is used by MyISAM and InnoDB. Potentially it can be used with
-  any table handler that has non-clustered indexes and on-disk rows.
+  (*) - only conceptually. No dynamic loading or binary compatibility of any
+        kind.
+
+  General scheme of things:
+   
+      SQL Layer code
+       |   |   |
+      -v---v---v---- handler->multi_range_read_XXX() function calls
+       |   |   |
+      ____________________________________
+     / DS-MRR module                      \
+     |  (scan indexes, order rowids, do    |
+     |   full record reads in rowid order) |
+     \____________________________________/
+       |   |   |
+      -|---|---|----- handler->read_range_first()/read_range_next(), 
+       |   |   |      handler->index_read(), handler->rnd_pos() calls.
+       |   |   |
+       v   v   v
+      Storage engine internals
+   
+  Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines.
+  Potentially it can be used with any table handler that has disk-based data
+  storage and has better performance when reading data in rowid order.
+*/
+
+
+/*
+  DS-MRR implementation for one table. Create/use one object of this class for
+  each ha_{myisam/innobase/etc} object. That object will be further referred to
+  as "the handler"
+
+  There are actually three strategies
+   S1. Bypass DS-MRR, pass all calls to default implementation (i.e. to
+      MRR-to-non-MRR calls converter)
+   S2. Regular DS-MRR 
+   S3. DS-MRR/CPK for doing scans on clustered primary keys.
+
+  S1 is used for cases which DS-MRR is unable to handle for some reason.
+
+  S2 is the actual DS-MRR. The basic algorithm is as follows:
+    1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 
+        fill the buffer with {rowid, range_id} pairs
+    2. Sort the buffer by rowid
+    3. for each {rowid, range_id} pair in the buffer
+         get record by rowid and return the {record, range_id} pair
+    4. Repeat the above steps until we've exhausted the list of ranges we're
+       scanning.
+
+  S3 is the variant of DS-MRR for use with clustered primary keys (or any
+  clustered index). The idea is that in clustered index it is sufficient to 
+  access the index in index order, and we don't need an intermediate steps to
+  get rowid (like step #1 in S2).
+
+   DS-MRR/CPK's basic algorithm is as follows:
+    1. Collect a number of ranges (=lookup keys)
+    2. Sort them so that they follow in index order.
+    3. for each {lookup_key, range_id} pair in the buffer 
+       get record(s) matching the lookup key and return {record, range_id} pairs
+    4. Repeat the above steps until we've exhausted the list of ranges we're
+       scanning.
 */

 class DsMrr_impl
@ -21,40 +81,6 @@ public:
  DsMrr_impl()
    : h2(NULL) {};
  
-  /*
-    The "owner" handler object (the one that calls dsmrr_XXX functions.
-    It is used to retrieve full table rows by calling rnd_pos().
-  */
-  handler *h;
-  TABLE *table; /* Always equal to h->table */
-private:
-  /* Secondary handler object.  It is used for scanning the index */
-  handler *h2;
-
-  /* Buffer to store rowids, or (rowid, range_id) pairs */
-  uchar *rowids_buf;
-  uchar *rowids_buf_cur;   /* Current position when reading/writing */
-  uchar *rowids_buf_last;  /* When reading: end of used buffer space */
-  uchar *rowids_buf_end;   /* End of the buffer */
-
-  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
-
-  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
-  bool is_mrr_assoc;
-
-  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
-
-  bool doing_cpk_scan;
-  uint cpk_tuple_length;
-  uint cpk_n_parts;
-  bool cpk_is_unique_scan;
-  char *cpk_saved_range_info;
-  bool cpk_have_range;
-
-
-  bool check_cpk_scan(uint keyno, uint mrr_flags);
-  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
-public:
  void init(handler *h_arg, TABLE *table_arg)
  {
    h= h_arg; 
@ -64,10 +90,7 @@ public:
                 uint n_ranges, uint key_parts, uint mode, 
                 HANDLER_BUFFER *buf);
  void dsmrr_close();
-  int dsmrr_fill_buffer();
-  int dsmrr_fill_buffer_cpk();
  int dsmrr_next(char **range_info);
-  int dsmrr_next_cpk(char **range_info);

  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 
                     uint *bufsz, uint *flags, COST_VECT *cost);
@ -76,9 +99,53 @@ public:
                            void *seq_init_param, uint n_ranges, uint *bufsz,
                            uint *flags, COST_VECT *cost);
 private:
+  /*
+    The "owner" handler object (the one that calls dsmrr_XXX functions.
+    It is used to retrieve full table rows by calling rnd_pos().
+  */
+  handler *h;
+  TABLE *table; /* Always equal to h->table */
+
+  /* Secondary handler object.  It is used for scanning the index */
+  handler *h2;
+
+  /* Buffer to store rowids, or (rowid, range_id) pairs */
+  uchar *mrr_buf;
+  uchar *mrr_buf_cur;   /* Current position when reading/writing */
+  uchar *mrr_buf_last;  /* When reading: end of used buffer space */
+  uchar *mrr_buf_end;   /* End of the buffer */
+
+  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
+
+  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+
+  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
+
+  bool doing_cpk_scan; /* TRUE <=> DS-MRR/CPK variant is used */
+
+  /** DS-MRR/CPK variables start */
+
+  /* Length of lookup tuple being used, in bytes */
+  uint cpk_tuple_length;
+  /*
+    TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
+    can get max. one match for each key 
+  */
+  bool cpk_is_unique_scan;
+  /* TRUE<=> we're in a middle of enumerating records from a range */ 
+  bool cpk_have_range;
+  /* Valid if cpk_have_range==TRUE: range_id of the range we're enumerating */
+  char *cpk_saved_range_info;
+
  bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                       COST_VECT *cost);
  bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
                               uint *buffer_size, COST_VECT *cost);
+  bool check_cpk_scan(uint keyno, uint mrr_flags);
+  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
+  int dsmrr_fill_buffer();
+  void dsmrr_fill_buffer_cpk();
+  int dsmrr_next_cpk(char **range_info);
 };