From 16e197f5b10fdee23703b94c5549bc17cd81c6f8 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 22 Jun 2010 21:24:22 +0400
Subject: [PATCH] MWL#121: DS-MRR support for clustered primary keys - Add
 testcases - Code cleanup: garbage removal, better comments, make members
 private where possible

---
 mysql-test/r/innodb_mrr_cpk.result | 134 ++++++++++++
 mysql-test/t/innodb_mrr_cpk.test   | 134 ++++++++++++
 sql/multi_range_read.cc            | 341 +++++++++++++++++------------
 sql/multi_range_read.h             | 157 +++++++++----
 4 files changed, 577 insertions(+), 189 deletions(-)
 create mode 100644 mysql-test/r/innodb_mrr_cpk.result
 create mode 100644 mysql-test/t/innodb_mrr_cpk.test

diff --git a/mysql-test/r/innodb_mrr_cpk.result b/mysql-test/r/innodb_mrr_cpk.result
new file mode 100644
index 00000000000..f93807e14d8
--- /dev/null
+++ b/mysql-test/r/innodb_mrr_cpk.result
@@ -0,0 +1,134 @@
+drop table if exists t0,t1,t2,t3;
+set @save_join_cache_level=@@join_cache_level;
+set join_cache_level=6;
+set @save_storage_engine=@@storage_engine;
+set storage_engine=innodb;
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a char(8), b char(8), filler char(100), primary key(a));
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` char(8) NOT NULL DEFAULT '',
+  `b` char(8) DEFAULT NULL,
+  `filler` char(100) DEFAULT NULL,
+  PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+concat('b-', 1000 + A.a + B.a*10 + C.a*100, '=B'),
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8));
+insert into t2 values ('a-1010=A'), ('a-1030=A'), ('a-1020=A');
+This should use join buffer:
+explain select * from t1, t2 where t1.a=t2.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	8	test.t2.a	1	Using join buffer
+This output must be sorted by value of t1.a:
+select * from t1, t2 where t1.a=t2.a;
+a	b	filler	a
+a-1010=A	b-1010=B	filler	a-1010=A
+a-1020=A	b-1020=B	filler	a-1020=A
+a-1030=A	b-1030=B	filler	a-1030=A
+drop table t1, t2;
+create table t1(
+a char(8) character set utf8, b int, filler char(100), 
+primary key(a,b)
+);
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+1000 + A.a + B.a*10 + C.a*100,
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	5	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+a-1030=A	1030	filler	a-1030=A	1030
+drop table t1, t2;
+create table t1(
+a varchar(8) character set utf8, b int, filler char(100), 
+primary key(a,b)
+);
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+1000 + A.a + B.a*10 + C.a*100,
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	30	test.t2.a,test.t2.b	1	Using index condition(BKA); Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+explain select * from t1, t2 where t1.a=t2.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	26	test.t2.a	1	Using index condition(BKA); Using join buffer
+select * from t1, t2 where t1.a=t2.a;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+drop table t1, t2;
+create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
+insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
+insert into t1 values (11, 11, 11,   'filler');
+insert into t1 values (11, 11, 12,   'filler');
+insert into t1 values (11, 11, 13,   'filler');
+insert into t1 values (11, 22, 1234, 'filler');
+insert into t1 values (11, 33, 124,  'filler');
+insert into t1 values (11, 33, 125,  'filler');
+create table t2 (a int, b int);
+insert into t2 values (11,33), (11,22), (11,11);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	8	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	c	filler	a	b
+11	11	11	filler	11	11
+11	11	12	filler	11	11
+11	11	13	filler	11	11
+11	22	1234	filler	11	22
+11	33	124	filler	11	33
+11	33	125	filler	11	33
+set join_cache_level=0;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	c	filler	a	b
+11	33	124	filler	11	33
+11	33	125	filler	11	33
+11	22	1234	filler	11	22
+11	11	11	filler	11	11
+11	11	12	filler	11	11
+11	11	13	filler	11	11
+set join_cache_level=6;
+drop table t1,t2;
+set @@join_cache_level= @save_join_cache_level;
+set storage_engine=@save_storage_engine;
+drop table t0;
diff --git a/mysql-test/t/innodb_mrr_cpk.test b/mysql-test/t/innodb_mrr_cpk.test
new file mode 100644
index 00000000000..84b37840880
--- /dev/null
+++ b/mysql-test/t/innodb_mrr_cpk.test
@@ -0,0 +1,134 @@
+# 
+# Tests for DS-MRR over clustered primary key. The only engine that supports
+# this is InnoDB/XtraDB.
+#
+# Basic idea about testing
+#  - DS-MRR/CPK works only with BKA
+#  - Should also test index condition pushdown
+#  - Should also test whatever uses RANGE_SEQ_IF::skip_record() for filtering
+#  - Also test access using prefix of primary key
+# 
+#  - Forget about cost model, BKA's multi_range_read_info() call passes 10 for
+#    #rows, the call is there at all only for applicability check
+# 
+-- source include/have_innodb.inc
+
+--disable_warnings
+drop table if exists t0,t1,t2,t3;
+--enable_warnings
+
+set @save_join_cache_level=@@join_cache_level;
+set join_cache_level=6;
+
+set @save_storage_engine=@@storage_engine;
+set storage_engine=innodb;
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a char(8), b char(8), filler char(100), primary key(a));
+show create table t1;
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  concat('b-', 1000 + A.a + B.a*10 + C.a*100, '=B'),
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8));
+insert into t2 values ('a-1010=A'), ('a-1030=A'), ('a-1020=A');
+
+--echo This should use join buffer:
+explain select * from t1, t2 where t1.a=t2.a;
+
+--echo This output must be sorted by value of t1.a:
+select * from t1, t2 where t1.a=t2.a;
+drop table t1, t2;
+
+# Try multi-column indexes
+create table t1(
+  a char(8) character set utf8, b int, filler char(100), 
+  primary key(a,b)
+);
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  1000 + A.a + B.a*10 + C.a*100,
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+# Try with dataset that causes identical lookup keys:
+insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+drop table t1, t2;
+
+create table t1(
+  a varchar(8) character set utf8, b int, filler char(100), 
+  primary key(a,b)
+);
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  1000 + A.a + B.a*10 + C.a*100,
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+# 
+# Try scanning on a CPK prefix
+#
+explain select * from t1, t2 where t1.a=t2.a;
+select * from t1, t2 where t1.a=t2.a;
+drop table t1, t2;
+
+#
+# The above example is not very interesting, as CPK prefix has 
+# only one match.  Create a dataset where scan on CPK prefix 
+# would produce multiple matches:
+#
+create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
+insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
+
+insert into t1 values (11, 11, 11,   'filler');
+insert into t1 values (11, 11, 12,   'filler');
+insert into t1 values (11, 11, 13,   'filler');
+insert into t1 values (11, 22, 1234, 'filler');
+insert into t1 values (11, 33, 124,  'filler');
+insert into t1 values (11, 33, 125,  'filler');
+
+create table t2 (a int, b int);
+insert into t2 values (11,33), (11,22), (11,11);
+
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+set join_cache_level=0;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+set join_cache_level=6;
+
+drop table t1,t2;
+
+#
+# Check that Index Condition Pushdown (BKA) actually works:
+#
+
+# TODO
+
+#
+# Check that record-check-func is done:
+# 
+
+set @@join_cache_level= @save_join_cache_level;
+set storage_engine=@save_storage_engine;
+drop table t0;
+
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 72c85ec11bb..46790adee9e 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -139,8 +139,13 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
                                        uint key_parts, uint *bufsz, 
                                        uint *flags, COST_VECT *cost)
 {
+  /* 
+    Currently we expect this function to be called only in preparation of scan
+    with HA_MRR_SINGLE_POINT property.
+  */
+  DBUG_ASSERT(*flags | HA_MRR_SINGLE_POINT);
+
   *bufsz= 0; /* Default implementation doesn't need a buffer */
-  //psergey2-todo: assert for singlepoint ranges here?
   *flags |= HA_MRR_USE_DEFAULT_IMPL;
 
   cost->zero();
@@ -323,22 +328,25 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                                         n_ranges, key_parts, mode, buf);
     DBUG_RETURN(retval);
   }
-  rowids_buf= buf->buffer;
+  mrr_buf= buf->buffer;
 
   is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
 
   if (is_mrr_assoc)
     status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
  
-  rowids_buf_end= buf->buffer_end;
+  mrr_buf_end= buf->buffer_end;
 
 
   doing_cpk_scan= check_cpk_scan(h->active_index, mode); 
   if (doing_cpk_scan)
   {
+    /* 
+      When doing a scan on CPK, the buffer stores {lookup_tuple, range_id}
+      pairs 
+    */
     uint keylen=0;
     DBUG_ASSERT(key_parts != 0);
-    //psergey2-todo: new elem_size here
     for (uint kp= 0; kp < key_parts; kp++)
       keylen += table->key_info[h->active_index].key_part[kp].store_length;
 
@@ -350,12 +358,29 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     use_default_impl= FALSE;
   }
   else
+  {
+    /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */
     elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+  }
 
-  rowids_buf_last= rowids_buf + 
-                      ((rowids_buf_end - rowids_buf)/ elem_size)*
+  mrr_buf_last= mrr_buf + 
+                      ((mrr_buf_end - mrr_buf)/ elem_size)*
                       elem_size;
-  rowids_buf_end= rowids_buf_last;
+  mrr_buf_end= mrr_buf_last;
+
+  if (doing_cpk_scan)
+  {
+    /* 
+      DS-MRR/CPK: fill buffer with lookup tuples and sort; also we don't need a
+      secondary handler object.
+    */
+    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+    h->mrr_funcs= *seq_funcs;
+    dsmrr_fill_buffer_cpk();
+    if (dsmrr_eof) 
+      buf->end_of_used_area= mrr_buf_last;
+    DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */
+  }
 
   /*
     There can be two cases:
@@ -365,84 +390,68 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
        The caller might have called h->index_init(), need to switch h to
        rnd_pos calls.
   */
-  //psergey2-todo: don't create secondary for CPK scan.
-  if (!doing_cpk_scan)
+  if (!h2)
   {
-    if (!h2)
+    /* Create a separate handler object to do rndpos() calls. */
+    THD *thd= current_thd;
+    /*
+      ::clone() takes up a lot of stack, especially on 64 bit platforms.
+      The constant 5 is an empiric result.
+    */
+    if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
+      DBUG_RETURN(1);
+    DBUG_ASSERT(h->active_index != MAX_KEY);
+    uint mrr_keyno= h->active_index;
+
+    /* Create a separate handler object to do rndpos() calls. */
+    if (!(new_h2= h->clone(thd->mem_root)) || 
+        new_h2->ha_external_lock(thd, F_RDLCK))
     {
-      /* Create a separate handler object to do rndpos() calls. */
-      THD *thd= current_thd;
-      /*
-        ::clone() takes up a lot of stack, especially on 64 bit platforms.
-        The constant 5 is an empiric result.
-      */
-      if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
-        DBUG_RETURN(1);
-      DBUG_ASSERT(h->active_index != MAX_KEY);
-      uint mrr_keyno= h->active_index;
-
-      /* Create a separate handler object to do rndpos() calls. */
-      if (!(new_h2= h->clone(thd->mem_root)) || 
-          new_h2->ha_external_lock(thd, F_RDLCK))
-      {
-        delete new_h2;
-        DBUG_RETURN(1);
-      }
-
-      if (mrr_keyno == h->pushed_idx_cond_keyno)
-        pushed_cond= h->pushed_idx_cond;
-
-      /*
-        Caution: this call will invoke this->dsmrr_close(). Do not put the
-        created secondary table handler into this->h2 or it will delete it.
-      */
-      if (h->ha_index_end())
-      {
-        h2=new_h2;
-        goto error;
-      }
-
-      h2= new_h2; /* Ok, now can put it into h2 */
-      table->prepare_for_position();
-      h2->extra(HA_EXTRA_KEYREAD);
-    
-      if (h2->ha_index_init(mrr_keyno, FALSE))
-        goto error;
-
-      use_default_impl= FALSE;
-      if (pushed_cond)
-        h2->idx_cond_push(mrr_keyno, pushed_cond);
+      delete new_h2;
+      DBUG_RETURN(1);
     }
-    else
+
+    if (mrr_keyno == h->pushed_idx_cond_keyno)
+      pushed_cond= h->pushed_idx_cond;
+
+    /*
+      Caution: this call will invoke this->dsmrr_close(). Do not put the
+      created secondary table handler into this->h2 or it will delete it.
+    */
+    if (h->ha_index_end())
     {
-      /* 
-        We get here when the access alternates betwen MRR scan(s) and non-MRR
-        scans.
-
-        Calling h->index_end() will invoke dsmrr_close() for this object,
-        which will delete h2. We need to keep it, so save put it away and dont
-        let it be deleted:
-      */
-      handler *save_h2= h2;
-      h2= NULL;
-      int res= (h->inited == handler::INDEX && h->ha_index_end());
-      h2= save_h2;
-      use_default_impl= FALSE;
-      if (res)
-        goto error;
+      h2=new_h2;
+      goto error;
     }
+
+    h2= new_h2; /* Ok, now can put it into h2 */
+    table->prepare_for_position();
+    h2->extra(HA_EXTRA_KEYREAD);
+  
+    if (h2->ha_index_init(mrr_keyno, FALSE))
+      goto error;
+
+    use_default_impl= FALSE;
+    if (pushed_cond)
+      h2->idx_cond_push(mrr_keyno, pushed_cond);
   }
   else
   {
-    //doing DS-MRR/CPK
-    // fill-buffer-analog
-    // eof
-    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
-    h->mrr_funcs= *seq_funcs;
-    dsmrr_fill_buffer_cpk();
-    if (dsmrr_eof) 
-      buf->end_of_used_area= rowids_buf_last;
-    DBUG_RETURN(0); // nothing can go wrong while filling the buffer
+    /* 
+      We get here when the access alternates betwen MRR scan(s) and non-MRR
+      scans.
+
+      Calling h->index_end() will invoke dsmrr_close() for this object,
+      which will delete h2. We need to keep it, so save put it away and dont
+      let it be deleted:
+    */
+    handler *save_h2= h2;
+    h2= NULL;
+    int res= (h->inited == handler::INDEX && h->ha_index_end());
+    h2= save_h2;
+    use_default_impl= FALSE;
+    if (res)
+      goto error;
   }
 
   if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
@@ -456,7 +465,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     adjust *buf to indicate that the remaining buffer space will not be used.
   */
   if (dsmrr_eof) 
-    buf->end_of_used_area= rowids_buf_last;
+    buf->end_of_used_area= mrr_buf_last;
 
   /*
      h->inited == INDEX may occur when 'range checked for each record' is
@@ -512,6 +521,9 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
   rowid and return.
   
   The function assumes that rowids buffer is empty when it is invoked. 
+
+  dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
+  scanning.
   
   @param h  Table handler
 
@@ -526,8 +538,8 @@ int DsMrr_impl::dsmrr_fill_buffer()
   int res;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
 
-  rowids_buf_cur= rowids_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
+  mrr_buf_cur= mrr_buf;
+  while ((mrr_buf_cur < mrr_buf_end) && 
          !(res= h2->handler::multi_range_read_next(&range_info)))
   {
     KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
@@ -537,13 +549,13 @@ int DsMrr_impl::dsmrr_fill_buffer()
     
     /* Put rowid, or {rowid, range_id} pair into the buffer */
     h2->position(table->record[0]);
-    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
-    rowids_buf_cur += h2->ref_length;
+    memcpy(mrr_buf_cur, h2->ref, h2->ref_length);
+    mrr_buf_cur += h2->ref_length;
 
     if (is_mrr_assoc)
     {
-      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
+      memcpy(mrr_buf_cur, &range_info, sizeof(void*));
+      mrr_buf_cur += sizeof(void*);
     }
   }
 
@@ -553,27 +565,29 @@ int DsMrr_impl::dsmrr_fill_buffer()
 
   /* Sort the buffer contents by rowid */
   uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
   
-  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
+  my_qsort2(mrr_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
             (void*)h);
-  rowids_buf_last= rowids_buf_cur;
-  rowids_buf_cur=  rowids_buf;
+  mrr_buf_last= mrr_buf_cur;
+  mrr_buf_cur=  mrr_buf;
   DBUG_RETURN(0);
 }
 
 
-/* qsort-compatible function to compare key tuples */
+/* 
+  my_qsort2-compatible function to compare key tuples 
+*/
+
 int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 {
   DsMrr_impl *dsmrr= (DsMrr_impl*)arg;
   TABLE *table= dsmrr->h->table;
   
   KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part;
-  KEY_PART_INFO *part_end= part + dsmrr->cpk_n_parts;
+  uchar *key1_end= key1 + dsmrr->cpk_tuple_length;
 
-  //uint32 *lengths=item->field_lengths;
-  for (; part < part_end; ++part)
+  while (key1 < key1_end)
   {
     Field* f = part->field;
     int len = part->store_length;
@@ -582,33 +596,43 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
       return res;
     key1 += len;
     key2 += len;
+    part++;
   }
   return 0;
 }
 
 
-//psergey2:
-int DsMrr_impl::dsmrr_fill_buffer_cpk()
+/*
+  DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+
+  DESCRIPTION
+    DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+
+    dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
+    we're scanning.
+*/
+
+void DsMrr_impl::dsmrr_fill_buffer_cpk()
 {
   int res;
   KEY_MULTI_RANGE cur_range;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer_cpk");
 
-  rowids_buf_cur= rowids_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
+  mrr_buf_cur= mrr_buf;
+  while ((mrr_buf_cur < mrr_buf_end) && 
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
     DBUG_ASSERT(cpk_tuple_length == cur_range.start_key.length);
 
     /* Put key, or {key, range_id} pair into the buffer */
-    memcpy(rowids_buf_cur, cur_range.start_key.key, cpk_tuple_length);
-    rowids_buf_cur += cpk_tuple_length;
+    memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
+    mrr_buf_cur += cpk_tuple_length;
 
     if (is_mrr_assoc)
     {
-      memcpy(rowids_buf_cur, &cur_range.ptr, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
+      memcpy(mrr_buf_cur, &cur_range.ptr, sizeof(void*));
+      mrr_buf_cur += sizeof(void*);
     }
   }
 
@@ -616,77 +640,82 @@ int DsMrr_impl::dsmrr_fill_buffer_cpk()
 
   /* Sort the buffer contents by rowid */
   uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
   
-  my_qsort2(rowids_buf, n_rowids, elem_size, 
+  my_qsort2(mrr_buf, n_rowids, elem_size, 
             (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
-  rowids_buf_last= rowids_buf_cur;
-  rowids_buf_cur=  rowids_buf;
-  DBUG_RETURN(0);
+  mrr_buf_last= mrr_buf_cur;
+  mrr_buf_cur=  mrr_buf;
+  DBUG_VOID_RETURN;
 }
 
 
 /*
-  CPK: so, the source is 
-   - buffer exhaustion/re-fill
-   - advance to next range on "record-not-found" error.
-   - if scanning on a prefix, enumerate all records for a key.
+  DS-MRR/CPK: multi_range_read_next() function
+
+  DESCRIPTION
+    DsMrr_impl::dsmrr_next_cpk() 
+
+  DESCRIPTION
+    DS-MRR/CPK: multi_range_read_next() function. 
+    This is similar to DsMrr_impl::dsmrr_next(), the differences are that
+     - we get records with index_read(), not with rnd_pos()
+     - we may get multiple records for one key (=element of the buffer)
+     - unlike dsmrr_fill_buffer(), dsmrr_fill_buffer_cpk() never fails.
+ 
+  RETURN
+    0                   OK, next record was successfully read
+    HA_ERR_END_OF_FILE  End of records
+    Other               Some other error
 */
+
 int DsMrr_impl::dsmrr_next_cpk(char **range_info)
 {
   int res;
 
   if (cpk_have_range)
   {
-    res= h->index_next_same(table->record[0], rowids_buf_cur, cpk_tuple_length);
+    res= h->index_next_same(table->record[0], mrr_buf_cur, cpk_tuple_length);
     if (res != HA_ERR_END_OF_FILE)
     {
-      // todo
       if (is_mrr_assoc)
         memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
       return res;
     }
-    /* 
-      Ok, we got EOF for records in this range. Fall through to get to another
-      range.
-    */
+    /* No more records in this range. Fall through to get to another range  */
   }
 
   do
   {
-    /* First, make sure we have a range at start of the buffer*/
-    if (rowids_buf_cur == rowids_buf_last)
+    /* First, make sure we have a range at start of the buffer */
+    if (mrr_buf_cur == mrr_buf_last)
     {
       if (dsmrr_eof)
       {
         res= HA_ERR_END_OF_FILE;
         goto end;
       }
-      // TODO: the return values are mix of HA_ERR_ codes and TRUE as "generic
-      //       failure" error. Is this ok?
-      if ((res= dsmrr_fill_buffer_cpk()))
-        goto end;
+      dsmrr_fill_buffer_cpk();
     }
-   
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
     {
       res= HA_ERR_END_OF_FILE;
       goto end;
     }
     
-    //TODO: make skip_index_tuple() calls, too?
-    //TODO: skip-record calls here?
+    //psergey2-todo: make skip_index_tuple() calls, too?
+    //psergey2-todo: skip-record calls here?
     //if (h2->mrr_funcs.skip_record &&
     //	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
     //  continue;
     
     /* Ok, got the range. Try making a lookup.  */
-    uchar *lookup_tuple= rowids_buf_cur;
-    rowids_buf_cur += cpk_tuple_length;
+    uchar *lookup_tuple= mrr_buf_cur;
+    mrr_buf_cur += cpk_tuple_length;
     if (is_mrr_assoc)
     {
-      memcpy(cpk_saved_range_info, rowids_buf_cur, sizeof(void*));
-      rowids_buf_cur += sizeof(void*) * test(is_mrr_assoc);
+      memcpy(cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
+      mrr_buf_cur += sizeof(void*) * test(is_mrr_assoc);
     }
       
     res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, 
@@ -698,6 +727,10 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
     if (!res)
     {
       memcpy(range_info, cpk_saved_range_info, sizeof(void*));
+      /* 
+        Attempt reading more rows from this range only if there actually can
+        be multiple matches:
+       */
       cpk_have_range= !cpk_is_unique_scan;
       break;
     }
@@ -707,6 +740,7 @@ end:
   return res;
 }
 
+
 /**
   DS-MRR implementation: multi_range_read_next() function
 */
@@ -725,7 +759,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
   
   do
   {
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
     {
       if (dsmrr_eof)
       {
@@ -738,17 +772,17 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     }
    
     /* return eof if there are no rowids in the buffer after re-fill attempt */
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
     {
       res= HA_ERR_END_OF_FILE;
       goto end;
     }
-    rowid= rowids_buf_cur;
+    rowid= mrr_buf_cur;
 
     if (is_mrr_assoc)
-      memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**));
+      memcpy(&cur_range_info, mrr_buf_cur + h->ref_length, sizeof(uchar**));
 
-    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+    mrr_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
     if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
       continue;
@@ -870,7 +904,33 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
   return FALSE;
 }
 
-/**
+
+/*
+  Check if key/flags allow DS-MRR/CPK strategy to be used
+  
+  SYNOPSIS
+   DsMrr_impl::check_cpk_scan()
+     keyno      Index that will be used
+     mrr_flags  
+  
+  DESCRIPTION
+    Check if key/flags allow DS-MRR/CPK strategy to be used. 
+ 
+  RETURN
+    TRUE   DS-MRR/CPK should be used
+    FALSE  Otherwise
+*/
+
+bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
+{
+  return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
+              !(mrr_flags & HA_MRR_SORTED) && 
+              keyno == table->s->primary_key && 
+              h->primary_key_is_clustered());
+}
+
+
+/*
   DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
 
   Make the choice between using Default MRR implementation and DS-MRR.
@@ -892,13 +952,7 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
   @retval TRUE   Default MRR implementation should be used
   @retval FALSE  DS-MRR implementation should be used
 */
-bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
-{
-  return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
-              !(mrr_flags & HA_MRR_SORTED) && 
-              keyno == table->s->primary_key && 
-              h->primary_key_is_clustered());
-}
+
 
 bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
                                  uint *bufsz, COST_VECT *cost)
@@ -906,9 +960,8 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
   COST_VECT dsmrr_cost;
   bool res;
   THD *thd= current_thd;
-  //psergey2: check the criteria.
-  doing_cpk_scan= check_cpk_scan(keyno, *flags); 
 
+  doing_cpk_scan= check_cpk_scan(keyno, *flags); 
   if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
       (keyno == table->s->primary_key && h->primary_key_is_clustered() &&
        !doing_cpk_scan) ||
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index b379d4f517d..5dd2e0d6adf 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -1,16 +1,76 @@
 /*
-  This file contains declarations for 
-   - Disk-Sweep MultiRangeRead (DS-MRR) implementation
+  This file contains declarations for Disk-Sweep MultiRangeRead (DS-MRR) 
+  implementation
 */
 
 /**
-  A Disk-Sweep MRR interface implementation
+  A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
 
-  This implementation makes range (and, in the future, 'ref') scans to read
-  table rows in disk sweeps. 
-  
-  Currently it is used by MyISAM and InnoDB. Potentially it can be used with
-  any table handler that has non-clustered indexes and on-disk rows.
+  This is a "plugin"(*) for storage engines that allows make index scans 
+  read table rows in rowid order. For disk-based storage engines, this is
+  faster than reading table rows in whatever-SQL-layer-makes-calls-in order.
+
+  (*) - only conceptually. No dynamic loading or binary compatibility of any
+        kind.
+
+  General scheme of things:
+   
+      SQL Layer code
+       |   |   |
+      -v---v---v---- handler->multi_range_read_XXX() function calls
+       |   |   |
+      ____________________________________
+     / DS-MRR module                      \
+     |  (scan indexes, order rowids, do    |
+     |   full record reads in rowid order) |
+     \____________________________________/
+       |   |   |
+      -|---|---|----- handler->read_range_first()/read_range_next(), 
+       |   |   |      handler->index_read(), handler->rnd_pos() calls.
+       |   |   |
+       v   v   v
+      Storage engine internals
+   
+  Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines.
+  Potentially it can be used with any table handler that has disk-based data
+  storage and has better performance when reading data in rowid order.
+*/
+
+
+/*
+  DS-MRR implementation for one table. Create/use one object of this class for
+  each ha_{myisam/innobase/etc} object. That object will be further referred to
+  as "the handler"
+
+  There are actually three strategies
+   S1. Bypass DS-MRR, pass all calls to default implementation (i.e. to
+      MRR-to-non-MRR calls converter)
+   S2. Regular DS-MRR 
+   S3. DS-MRR/CPK for doing scans on clustered primary keys.
+
+  S1 is used for cases which DS-MRR is unable to handle for some reason.
+
+  S2 is the actual DS-MRR. The basic algorithm is as follows:
+    1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 
+        fill the buffer with {rowid, range_id} pairs
+    2. Sort the buffer by rowid
+    3. for each {rowid, range_id} pair in the buffer
+         get record by rowid and return the {record, range_id} pair
+    4. Repeat the above steps until we've exhausted the list of ranges we're
+       scanning.
+
+  S3 is the variant of DS-MRR for use with clustered primary keys (or any
+  clustered index). The idea is that in clustered index it is sufficient to 
+  access the index in index order, and we don't need an intermediate steps to
+  get rowid (like step #1 in S2).
+
+   DS-MRR/CPK's basic algorithm is as follows:
+    1. Collect a number of ranges (=lookup keys)
+    2. Sort them so that they follow in index order.
+    3. for each {lookup_key, range_id} pair in the buffer 
+       get record(s) matching the lookup key and return {record, range_id} pairs
+    4. Repeat the above steps until we've exhausted the list of ranges we're
+       scanning.
 */
 
 class DsMrr_impl
@@ -21,40 +81,6 @@ public:
   DsMrr_impl()
     : h2(NULL) {};
   
-  /*
-    The "owner" handler object (the one that calls dsmrr_XXX functions.
-    It is used to retrieve full table rows by calling rnd_pos().
-  */
-  handler *h;
-  TABLE *table; /* Always equal to h->table */
-private:
-  /* Secondary handler object.  It is used for scanning the index */
-  handler *h2;
-
-  /* Buffer to store rowids, or (rowid, range_id) pairs */
-  uchar *rowids_buf;
-  uchar *rowids_buf_cur;   /* Current position when reading/writing */
-  uchar *rowids_buf_last;  /* When reading: end of used buffer space */
-  uchar *rowids_buf_end;   /* End of the buffer */
-
-  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
-
-  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
-  bool is_mrr_assoc;
-
-  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
-
-  bool doing_cpk_scan;
-  uint cpk_tuple_length;
-  uint cpk_n_parts;
-  bool cpk_is_unique_scan;
-  char *cpk_saved_range_info;
-  bool cpk_have_range;
-
-
-  bool check_cpk_scan(uint keyno, uint mrr_flags);
-  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
-public:
   void init(handler *h_arg, TABLE *table_arg)
   {
     h= h_arg; 
@@ -64,10 +90,7 @@ public:
                  uint n_ranges, uint key_parts, uint mode, 
                  HANDLER_BUFFER *buf);
   void dsmrr_close();
-  int dsmrr_fill_buffer();
-  int dsmrr_fill_buffer_cpk();
   int dsmrr_next(char **range_info);
-  int dsmrr_next_cpk(char **range_info);
 
   ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 
                      uint *bufsz, uint *flags, COST_VECT *cost);
@@ -76,9 +99,53 @@ public:
                             void *seq_init_param, uint n_ranges, uint *bufsz,
                             uint *flags, COST_VECT *cost);
 private:
+  /*
+    The "owner" handler object (the one that calls dsmrr_XXX functions.
+    It is used to retrieve full table rows by calling rnd_pos().
+  */
+  handler *h;
+  TABLE *table; /* Always equal to h->table */
+
+  /* Secondary handler object.  It is used for scanning the index */
+  handler *h2;
+
+  /* Buffer to store rowids, or (rowid, range_id) pairs */
+  uchar *mrr_buf;
+  uchar *mrr_buf_cur;   /* Current position when reading/writing */
+  uchar *mrr_buf_last;  /* When reading: end of used buffer space */
+  uchar *mrr_buf_end;   /* End of the buffer */
+
+  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
+
+  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+
+  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
+
+  bool doing_cpk_scan; /* TRUE <=> DS-MRR/CPK variant is used */
+
+  /** DS-MRR/CPK variables start */
+
+  /* Length of lookup tuple being used, in bytes */
+  uint cpk_tuple_length;
+  /*
+    TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
+    can get max. one match for each key 
+  */
+  bool cpk_is_unique_scan;
+  /* TRUE<=> we're in a middle of enumerating records from a range */ 
+  bool cpk_have_range;
+  /* Valid if cpk_have_range==TRUE: range_id of the range we're enumerating */
+  char *cpk_saved_range_info;
+
   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                        COST_VECT *cost);
   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
                                uint *buffer_size, COST_VECT *cost);
+  bool check_cpk_scan(uint keyno, uint mrr_flags);
+  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
+  int dsmrr_fill_buffer();
+  void dsmrr_fill_buffer_cpk();
+  int dsmrr_next_cpk(char **range_info);
 };