From 925e508a2e76c39173bb058ad8891202ce895d8f Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sat, 19 Jun 2010 15:40:19 +0400
Subject: [PATCH 01/49] MWL#121: DS-MRR support for clustered primary keys -
 First code (will need code cleanup)

---
 sql/handler.h                       |   5 +-
 sql/multi_range_read.cc             | 329 ++++++++++++++++++++++------
 sql/multi_range_read.h              |  20 +-
 sql/opt_range.cc                    |   2 +
 sql/sql_join_cache.cc               |   3 +-
 sql/sql_select.cc                   |   3 +-
 storage/maria/ha_maria.cc           |  11 +-
 storage/maria/ha_maria.h            |   6 +-
 storage/myisam/ha_myisam.cc         |  10 +-
 storage/myisam/ha_myisam.h          |   6 +-
 storage/xtradb/handler/ha_innodb.cc |  12 +-
 storage/xtradb/handler/ha_innodb.h  |   6 +-
 12 files changed, 319 insertions(+), 94 deletions(-)

diff --git a/sql/handler.h b/sql/handler.h
index 124bd40711d..f2cc50de38a 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1752,9 +1752,10 @@ public:
                                               uint n_ranges, uint *bufsz,
                                               uint *flags, COST_VECT *cost);
   virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                        uint *bufsz, uint *flags, COST_VECT *cost);
+                                        uint key_parts, uint *bufsz, 
+                                        uint *flags, COST_VECT *cost);
   virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                    uint n_ranges, uint mode,
+                                    uint n_ranges, uint key_parts, uint mode,
                                     HANDLER_BUFFER *buf);
   virtual int multi_range_read_next(char **range_info);
   virtual int read_range_first(const key_range *start_key,
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 644634c3d74..e0bccb9bf90 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -136,10 +136,11 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 */
 
 ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
-                                       uint *bufsz, uint *flags, COST_VECT *cost)
+                                       uint key_parts, uint *bufsz, 
+                                       uint *flags, COST_VECT *cost)
 {
   *bufsz= 0; /* Default implementation doesn't need a buffer */
-
+  //psergey2-todo: assert for singlepoint ranges here?
   *flags |= HA_MRR_USE_DEFAULT_IMPL;
 
   cost->zero();
@@ -197,7 +198,8 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
 
 int
 handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
-                               uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+                               uint n_ranges, uint key_parts, uint mode, 
+                               HANDLER_BUFFER *buf)
 {
   DBUG_ENTER("handler::multi_range_read_init");
   mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
@@ -299,7 +301,8 @@ scan_it_again:
 */
 
 int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
-                           void *seq_init_param, uint n_ranges, uint mode,
+                           void *seq_init_param, uint n_ranges, uint key_parts,
+                           uint mode,
                            HANDLER_BUFFER *buf)
 {
   uint elem_size;
@@ -317,7 +320,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     use_default_impl= TRUE;
     const int retval=
       h->handler::multi_range_read_init(seq_funcs, seq_init_param,
-                                        n_ranges, mode, buf);
+                                        n_ranges, key_parts, mode, buf);
     DBUG_RETURN(retval);
   }
   rowids_buf= buf->buffer;
@@ -328,13 +331,33 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
  
   rowids_buf_end= buf->buffer_end;
-  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+
+
+  doing_cpk_scan= check_cpk_scan(h->active_index, mode); 
+  if (doing_cpk_scan)
+  {
+    uint keylen=0;
+    DBUG_ASSERT(key_parts != 0);
+    //psergey2-todo: new elem_size here
+    for (uint kp= 0; kp < key_parts; kp++)
+      keylen += table->key_info[h->active_index].key_part[kp].store_length;
+
+    cpk_tuple_length= keylen;
+    cpk_is_unique_scan= test(table->key_info[h->active_index].key_parts == 
+                             key_parts);
+    cpk_have_range= FALSE;
+    elem_size= keylen + (int)is_mrr_assoc * sizeof(void*);
+    use_default_impl= FALSE;
+  }
+  else
+    elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+
   rowids_buf_last= rowids_buf + 
                       ((rowids_buf_end - rowids_buf)/ elem_size)*
                       elem_size;
   rowids_buf_end= rowids_buf_last;
 
-    /*
+  /*
     There can be two cases:
     - This is the first call since index_init(), h2==NULL
        Need to setup h2 then.
@@ -342,72 +365,88 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
        The caller might have called h->index_init(), need to switch h to
        rnd_pos calls.
   */
-  if (!h2)
+  //psergey2-todo: don't create secondary for CPK scan.
+  if (!doing_cpk_scan)
   {
-    /* Create a separate handler object to do rndpos() calls. */
-    THD *thd= current_thd;
-    /*
-      ::clone() takes up a lot of stack, especially on 64 bit platforms.
-      The constant 5 is an empiric result.
-    */
-    if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
-      DBUG_RETURN(1);
-    DBUG_ASSERT(h->active_index != MAX_KEY);
-    uint mrr_keyno= h->active_index;
-
-    /* Create a separate handler object to do rndpos() calls. */
-    if (!(new_h2= h->clone(thd->mem_root)) || 
-        new_h2->ha_external_lock(thd, F_RDLCK))
+    if (!h2)
     {
-      delete new_h2;
-      DBUG_RETURN(1);
+      /* Create a separate handler object to do rndpos() calls. */
+      THD *thd= current_thd;
+      /*
+        ::clone() takes up a lot of stack, especially on 64 bit platforms.
+        The constant 5 is an empiric result.
+      */
+      if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
+        DBUG_RETURN(1);
+      DBUG_ASSERT(h->active_index != MAX_KEY);
+      uint mrr_keyno= h->active_index;
+
+      /* Create a separate handler object to do rndpos() calls. */
+      if (!(new_h2= h->clone(thd->mem_root)) || 
+          new_h2->ha_external_lock(thd, F_RDLCK))
+      {
+        delete new_h2;
+        DBUG_RETURN(1);
+      }
+
+      if (mrr_keyno == h->pushed_idx_cond_keyno)
+        pushed_cond= h->pushed_idx_cond;
+
+      /*
+        Caution: this call will invoke this->dsmrr_close(). Do not put the
+        created secondary table handler into this->h2 or it will delete it.
+      */
+      if (h->ha_index_end())
+      {
+        h2=new_h2;
+        goto error;
+      }
+
+      h2= new_h2; /* Ok, now can put it into h2 */
+      table->prepare_for_position();
+      h2->extra(HA_EXTRA_KEYREAD);
+    
+      if (h2->ha_index_init(mrr_keyno, FALSE))
+        goto error;
+
+      use_default_impl= FALSE;
+      if (pushed_cond)
+        h2->idx_cond_push(mrr_keyno, pushed_cond);
     }
-
-    if (mrr_keyno == h->pushed_idx_cond_keyno)
-      pushed_cond= h->pushed_idx_cond;
-
-    /*
-      Caution: this call will invoke this->dsmrr_close(). Do not put the
-      created secondary table handler into this->h2 or it will delete it.
-    */
-    if (h->ha_index_end())
+    else
     {
-      h2=new_h2;
-      goto error;
+      /* 
+        We get here when the access alternates between MRR scan(s) and non-MRR
+        scans.
+
+        Calling h->index_end() will invoke dsmrr_close() for this object,
+        which will delete h2. We need to keep it, so put it away and don't
+        let it be deleted:
+      */
+      handler *save_h2= h2;
+      h2= NULL;
+      int res= (h->inited == handler::INDEX && h->ha_index_end());
+      h2= save_h2;
+      use_default_impl= FALSE;
+      if (res)
+        goto error;
     }
-
-    h2= new_h2; /* Ok, now can put it into h2 */
-    table->prepare_for_position();
-    h2->extra(HA_EXTRA_KEYREAD);
-  
-    if (h2->ha_index_init(mrr_keyno, FALSE))
-      goto error;
-
-    use_default_impl= FALSE;
-    if (pushed_cond)
-      h2->idx_cond_push(mrr_keyno, pushed_cond);
   }
   else
   {
-    /* 
-      We get here when the access alternates betwen MRR scan(s) and non-MRR
-      scans.
-
-      Calling h->index_end() will invoke dsmrr_close() for this object,
-      which will delete h2. We need to keep it, so save put it away and dont
-      let it be deleted:
-    */
-    handler *save_h2= h2;
-    h2= NULL;
-    int res= (h->inited == handler::INDEX && h->ha_index_end());
-    h2= save_h2;
-    use_default_impl= FALSE;
-    if (res)
-      goto error;
+    //doing DS-MRR/CPK
+    // fill-buffer-analog
+    // eof
+    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+    h->mrr_funcs= *seq_funcs;
+    dsmrr_fill_buffer_cpk();
+    if (dsmrr_eof) 
+      buf->end_of_used_area= rowids_buf_last;
+    DBUG_RETURN(0); // nothing can go wrong while filling the buffer
   }
 
   if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
-                                          mode, buf) || 
+                                         key_parts, mode, buf) || 
       dsmrr_fill_buffer())
   {
     goto error;
@@ -524,6 +563,149 @@ int DsMrr_impl::dsmrr_fill_buffer()
 }
 
 
+/*
+  qsort2-compatible comparator for two clustered-PK lookup tuples. "arg" is
+  the owning DsMrr_impl. Key parts are walked until cpk_tuple_length bytes are
+  consumed; key_cmp() is used because the tuples are in key image format.
+*/
+int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
+{
+  DsMrr_impl *dsmrr= (DsMrr_impl*)arg;
+  TABLE *table= dsmrr->h->table;
+  KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part;
+  uchar *key1_end= key1 + dsmrr->cpk_tuple_length;
+
+  for (; key1 < key1_end; ++part)
+  {
+    int res= part->field->key_cmp(key1, key2);
+    if (res)
+      return res;
+    key1 += part->store_length;
+    key2 += part->store_length;
+  }
+  return 0;
+}
+
+
+/*
+  DS-MRR/CPK: fill the buffer with lookup keys taken from the range sequence
+  and sort them. An element is a key tuple, plus the range pointer when
+  is_mrr_assoc is set. dsmrr_eof is set if next() failed. Always returns 0.
+*/
+int DsMrr_impl::dsmrr_fill_buffer_cpk()
+{
+  int res= 0; /* stays 0 if the buffer has no room for a single element */
+  KEY_MULTI_RANGE cur_range;
+  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer_cpk");
+
+  rowids_buf_cur= rowids_buf;
+  while ((rowids_buf_cur < rowids_buf_end) && 
+         !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
+  {
+    DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
+    DBUG_ASSERT(cpk_tuple_length == cur_range.start_key.length);
+    /* Put key, or {key, range_id} pair into the buffer */
+    memcpy(rowids_buf_cur, cur_range.start_key.key, cpk_tuple_length);
+    rowids_buf_cur += cpk_tuple_length;
+    if (is_mrr_assoc)
+    {
+      memcpy(rowids_buf_cur, &cur_range.ptr, sizeof(void*));
+      rowids_buf_cur += sizeof(void*);
+    }
+  }
+  dsmrr_eof= test(res);
+
+  /* Sort the buffer contents by key tuple */
+  uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
+  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  my_qsort2(rowids_buf, n_rowids, elem_size, 
+            (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
+  rowids_buf_last= rowids_buf_cur;
+  rowids_buf_cur=  rowids_buf;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  DS-MRR/CPK implementation of multi_range_read_next().
+
+  Returns the next row: first any remaining rows of the current key (only
+  when scanning on a key prefix), then the first row of the next buffered
+  key, re-filling the buffer when it is exhausted. Keys that match no rows
+  are skipped. Returns 0, HA_ERR_END_OF_FILE or another handler error code.
+*/
+int DsMrr_impl::dsmrr_next_cpk(char **range_info)
+{
+  int res;
+
+  if (cpk_have_range)
+  {
+    /* rowids_buf_cur already points past the element of the current key */
+    uchar *cur_tuple= rowids_buf_cur - cpk_tuple_length -
+                      (int)is_mrr_assoc * sizeof(void*);
+    res= h->index_next_same(table->record[0], cur_tuple, cpk_tuple_length);
+    if (res != HA_ERR_END_OF_FILE)
+    {
+      if (is_mrr_assoc)
+        memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
+      return res;
+    }
+    /* No more rows for this key. Fall through to get to another key. */
+    cpk_have_range= FALSE;
+  }
+
+  do
+  {
+    /* First, make sure we have a range at start of the buffer */
+    if (rowids_buf_cur == rowids_buf_last)
+    {
+      if (dsmrr_eof)
+      {
+        res= HA_ERR_END_OF_FILE;
+        goto end;
+      }
+      if ((res= dsmrr_fill_buffer_cpk()))
+        goto end;
+    }
+
+    if (rowids_buf_cur == rowids_buf_last)
+    {
+      res= HA_ERR_END_OF_FILE;
+      goto end;
+    }
+    
+    //TODO: skip-record calls here?
+    //if (h2->mrr_funcs.skip_record &&
+    //	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
+    //  continue;
+    /* Ok, got the range. Try making a lookup.  */
+    uchar *lookup_tuple= rowids_buf_cur;
+    rowids_buf_cur += cpk_tuple_length;
+    if (is_mrr_assoc)
+    {
+      /* The buffer holds the pointer value itself: copy it into the member */
+      memcpy(&cpk_saved_range_info, rowids_buf_cur, sizeof(void*));
+      rowids_buf_cur += sizeof(void*);
+    }
+
+    res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, 
+                       HA_READ_KEY_EXACT);
+    /* A key with no matching rows is not an error: move on to the next key */
+    if (res && res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+      goto end;
+    if (!res)
+    {
+      if (is_mrr_assoc)
+        memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
+      cpk_have_range= !cpk_is_unique_scan;
+      break;
+    }
+  } while (true);
+ 
+end:
+  return res;
+}
+
 /**
   DS-MRR implementation: multi_range_read_next() function
 */
@@ -536,6 +718,9 @@ int DsMrr_impl::dsmrr_next(char **range_info)
 
   if (use_default_impl)
     return h->handler::multi_range_read_next(range_info);
+
+  if (doing_cpk_scan)
+    return dsmrr_next_cpk(range_info);
   
   do
   {
@@ -582,7 +767,8 @@ end:
 /**
   DS-MRR implementation: multi_range_read_info() function
 */
-ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
+ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, 
+                               uint key_parts,
                                uint *bufsz, uint *flags, COST_VECT *cost)
 {  
   ha_rows res;
@@ -590,8 +776,8 @@ ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
   uint def_bufsz= *bufsz;
 
   /* Get cost/flags/mem_usage of default MRR implementation */
-  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
-                                         &def_flags, cost);
+  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, key_parts, 
+                                         &def_bufsz, &def_flags, cost);
   DBUG_ASSERT(!res);
 
   if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || 
@@ -705,6 +891,13 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
   @retval TRUE   Default MRR implementation should be used
   @retval FALSE  DS-MRR implementation should be used
 */
+/* TRUE <=> unordered single-point MRR scan over the clustered primary key */
+bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
+{
+  uint point_flags= mrr_flags & (HA_MRR_SINGLE_POINT | HA_MRR_SORTED);
+  return test(point_flags == HA_MRR_SINGLE_POINT &&
+              keyno == table->s->primary_key && h->primary_key_is_clustered());
+}
 
 bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
                                  uint *bufsz, COST_VECT *cost)
@@ -712,8 +905,12 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
   COST_VECT dsmrr_cost;
   bool res;
   THD *thd= current_thd;
+  //psergey2: check the criteria.
+  doing_cpk_scan= check_cpk_scan(keyno, *flags); 
+
   if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
-      (keyno == table->s->primary_key && h->primary_key_is_clustered()) ||
+      (keyno == table->s->primary_key && h->primary_key_is_clustered() &&
+       !doing_cpk_scan) ||
        key_uses_partial_cols(table, keyno))
   {
     /* Use the default implementation */
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 90e2e4c93d6..b379d4f517d 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -43,6 +43,17 @@ private:
   bool is_mrr_assoc;
 
   bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
+
+  bool doing_cpk_scan;
+  uint cpk_tuple_length;
+  uint cpk_n_parts;
+  bool cpk_is_unique_scan;
+  char *cpk_saved_range_info;
+  bool cpk_have_range;
+
+
+  bool check_cpk_scan(uint keyno, uint mrr_flags);
+  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
 public:
   void init(handler *h_arg, TABLE *table_arg)
   {
@@ -50,13 +61,16 @@ public:
     table= table_arg;
   }
   int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, 
-                 uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+                 uint n_ranges, uint key_parts, uint mode, 
+                 HANDLER_BUFFER *buf);
   void dsmrr_close();
   int dsmrr_fill_buffer();
+  int dsmrr_fill_buffer_cpk();
   int dsmrr_next(char **range_info);
+  int dsmrr_next_cpk(char **range_info);
 
-  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz,
-                     uint *flags, COST_VECT *cost);
+  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 
+                     uint *bufsz, uint *flags, COST_VECT *cost);
 
   ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 
                             void *seq_init_param, uint n_ranges, uint *bufsz,
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index 27ecdea9568..25c4259295f 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -8006,6 +8006,7 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
 
   quick->mrr_buf_size= thd->variables.mrr_buff_size;
   if (table->file->multi_range_read_info(quick->index, 1, (uint)records,
+                                         uint(-1), 
                                          &quick->mrr_buf_size,
                                          &quick->mrr_flags, &cost))
     goto err;
@@ -8367,6 +8368,7 @@ int QUICK_RANGE_SELECT::reset()
  
   RANGE_SEQ_IF seq_funcs= {quick_range_seq_init, quick_range_seq_next, 0, 0};
   error= file->multi_range_read_init(&seq_funcs, (void*)this, ranges.elements,
+                                     uint(-1),
                                      mrr_flags, mrr_buf_desc? mrr_buf_desc: 
                                                               &empty_buf);
   DBUG_RETURN(error);
diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc
index d88cc7a9f7f..120b109d8ff 100644
--- a/sql/sql_join_cache.cc
+++ b/sql/sql_join_cache.cc
@@ -2377,7 +2377,8 @@ JOIN_CACHE_BKA::init_join_matching_records(RANGE_SEQ_IF *seq_funcs, uint ranges)
   if (!file->inited)
     file->ha_index_init(join_tab->ref.key, 1);
   if ((error= file->multi_range_read_init(seq_funcs, (void*) this, ranges,
-					  mrr_mode, &mrr_buff)))
+					  join_tab->ref.key_parts,
+                                          mrr_mode, &mrr_buff)))
     rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
   
   return rc;
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 8a8952550c0..1c1b054a2ea 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7318,10 +7318,11 @@ uint check_join_cache_usage(JOIN_TAB *tab,
   case JT_EQ_REF:
     if (cache_level <= 4)
       return 0;
-    flags= HA_MRR_NO_NULL_ENDPOINTS;
+    flags= HA_MRR_NO_NULL_ENDPOINTS | HA_MRR_SINGLE_POINT;
     if (tab->table->covering_keys.is_set(tab->ref.key))
       flags|= HA_MRR_INDEX_ONLY;
     rows= tab->table->file->multi_range_read_info(tab->ref.key, 10, 20,
+                                                  tab->ref.key_parts,
                                                   &bufsz, &flags, &cost);
     if ((rows != HA_POS_ERROR) && !(flags & HA_MRR_USE_DEFAULT_IMPL) &&
         (!(flags & HA_MRR_NO_ASSOCIATION) || cache_level > 6) &&
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
index 7c34a5f7595..43c6cd6606a 100644
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -3501,10 +3501,11 @@ static SHOW_VAR status_variables[]= {
  ***************************************************************************/
 
 int ha_maria::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                     uint n_ranges, uint mode, 
+                                     uint n_ranges, uint key_parts, uint mode, 
                                      HANDLER_BUFFER *buf)
 {
-  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
+  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, key_parts, 
+                           mode, buf);
 }
 
 int ha_maria::multi_range_read_next(char **range_info)
@@ -3528,11 +3529,11 @@ ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 }
 
 ha_rows ha_maria::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                        uint *bufsz, uint *flags, 
-                                        COST_VECT *cost)
+                                       uint key_parts, uint *bufsz, 
+                                       uint *flags, COST_VECT *cost)
 {
   ds_mrr.init(this, table);
-  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, flags, cost);
 }
 
 /* MyISAM MRR implementation ends */
diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h
index c2ff99fab0e..177008f422a 100644
--- a/storage/maria/ha_maria.h
+++ b/storage/maria/ha_maria.h
@@ -174,14 +174,16 @@ public:
    * Multi Range Read interface
    */
   int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+                            uint n_ranges, uint key_parts, uint mode, 
+                            HANDLER_BUFFER *buf);
   int multi_range_read_next(char **range_info);
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param, 
                                       uint n_ranges, uint *bufsz,
                                       uint *flags, COST_VECT *cost);
   ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                uint *bufsz, uint *flags, COST_VECT *cost);
+                                uint key_parts, uint *bufsz, 
+                                uint *flags, COST_VECT *cost);
   
   /* Index condition pushdown implementation */
   Item *idx_cond_push(uint keyno, Item* idx_cond);
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index 0a4229c2ab2..bb6ac446a4f 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -2217,10 +2217,10 @@ static int myisam_init(void *p)
  ***************************************************************************/
 
 int ha_myisam::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                     uint n_ranges, uint mode, 
+                                     uint n_ranges, uint key_parts, uint mode, 
                                      HANDLER_BUFFER *buf)
 {
-  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
+  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, key_parts, mode, buf);
 }
 
 int ha_myisam::multi_range_read_next(char **range_info)
@@ -2244,11 +2244,11 @@ ha_rows ha_myisam::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 }
 
 ha_rows ha_myisam::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                         uint *bufsz, uint *flags,
-                                         COST_VECT *cost)
+                                         uint key_parts, uint *bufsz, 
+                                         uint *flags, COST_VECT *cost)
 {
   ds_mrr.init(this, table);
-  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, flags, cost);
 }
 
 /* MyISAM MRR implementation ends */
diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h
index 76db0e89536..d37870b861b 100644
--- a/storage/myisam/ha_myisam.h
+++ b/storage/myisam/ha_myisam.h
@@ -162,14 +162,16 @@ public:
    * Multi Range Read interface
    */
   int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+                            uint n_ranges, uint key_parts, uint mode, 
+                            HANDLER_BUFFER *buf);
   int multi_range_read_next(char **range_info);
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param, 
                                       uint n_ranges, uint *bufsz,
                                       uint *flags, COST_VECT *cost);
   ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                uint *bufsz, uint *flags, COST_VECT *cost);
+                                uint key_parts, uint *bufsz, 
+                                uint *flags, COST_VECT *cost);
   
   /* Index condition pushdown implementation */
   Item *idx_cond_push(uint keyno, Item* idx_cond);
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index db25b39caab..a8ccb426aa5 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -11025,9 +11025,10 @@ test_innobase_convert_name()
  */
 
 int ha_innobase::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                          uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+                                       uint n_ranges, uint key_parts, uint mode,
+                                       HANDLER_BUFFER *buf)
 {
-  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
+  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, key_parts, mode, buf);
 }
 
 int ha_innobase::multi_range_read_next(char **range_info)
@@ -11052,12 +11053,13 @@ ha_rows ha_innobase::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
   return res;
 }
 
-ha_rows ha_innobase::multi_range_read_info(uint keyno, uint n_ranges, 
-                                           uint keys, uint *bufsz, 
+ha_rows ha_innobase::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                           uint key_parts, uint *bufsz, 
                                            uint *flags, COST_VECT *cost)
 {
   ds_mrr.init(this, table);
-  ha_rows res= ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+  ha_rows res= ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, 
+                                 flags, cost);
   return res;
 }
 
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
index 6c7098560b9..0c1f2b42dd6 100644
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -210,14 +210,16 @@ public:
    * Multi Range Read interface
    */
   int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+                            uint n_ranges, uint key_parts, uint mode, 
+                            HANDLER_BUFFER *buf);
   int multi_range_read_next(char **range_info);
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param, 
                                       uint n_ranges, uint *bufsz,
                                       uint *flags, COST_VECT *cost);
   ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                uint *bufsz, uint *flags, COST_VECT *cost);
+                                uint key_parts, uint *bufsz, 
+                                uint *flags, COST_VECT *cost);
   DsMrr_impl ds_mrr;
 
   Item *idx_cond_push(uint keyno, Item* idx_cond);

From 82f8ed17e1172f949857385a7bc7bebde82a1602 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 21 Jun 2010 12:34:31 +0400
Subject: [PATCH 02/49] MWL#121: DS-MRR support for clustered primary keys -
 Add testcases

---
 sql/multi_range_read.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index e0bccb9bf90..72c85ec11bb 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -674,6 +674,7 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
       goto end;
     }
     
+    //TODO: make skip_index_tuple() calls, too?
     //TODO: skip-record calls here?
     //if (h2->mrr_funcs.skip_record &&
     //	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))

From 16e197f5b10fdee23703b94c5549bc17cd81c6f8 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 22 Jun 2010 21:24:22 +0400
Subject: [PATCH 03/49] MWL#121: DS-MRR support for clustered primary keys -
 Add testcases - Code cleanup: garbage removal, better comments, make members
 private where possible

---
 mysql-test/r/innodb_mrr_cpk.result | 134 ++++++++++++
 mysql-test/t/innodb_mrr_cpk.test   | 134 ++++++++++++
 sql/multi_range_read.cc            | 341 +++++++++++++++++------------
 sql/multi_range_read.h             | 157 +++++++++----
 4 files changed, 577 insertions(+), 189 deletions(-)
 create mode 100644 mysql-test/r/innodb_mrr_cpk.result
 create mode 100644 mysql-test/t/innodb_mrr_cpk.test

diff --git a/mysql-test/r/innodb_mrr_cpk.result b/mysql-test/r/innodb_mrr_cpk.result
new file mode 100644
index 00000000000..f93807e14d8
--- /dev/null
+++ b/mysql-test/r/innodb_mrr_cpk.result
@@ -0,0 +1,134 @@
+drop table if exists t0,t1,t2,t3;
+set @save_join_cache_level=@@join_cache_level;
+set join_cache_level=6;
+set @save_storage_engine=@@storage_engine;
+set storage_engine=innodb;
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a char(8), b char(8), filler char(100), primary key(a));
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` char(8) NOT NULL DEFAULT '',
+  `b` char(8) DEFAULT NULL,
+  `filler` char(100) DEFAULT NULL,
+  PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+concat('b-', 1000 + A.a + B.a*10 + C.a*100, '=B'),
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8));
+insert into t2 values ('a-1010=A'), ('a-1030=A'), ('a-1020=A');
+This should use join buffer:
+explain select * from t1, t2 where t1.a=t2.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	8	test.t2.a	1	Using join buffer
+This output must be sorted by value of t1.a:
+select * from t1, t2 where t1.a=t2.a;
+a	b	filler	a
+a-1010=A	b-1010=B	filler	a-1010=A
+a-1020=A	b-1020=B	filler	a-1020=A
+a-1030=A	b-1030=B	filler	a-1030=A
+drop table t1, t2;
+create table t1(
+a char(8) character set utf8, b int, filler char(100), 
+primary key(a,b)
+);
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+1000 + A.a + B.a*10 + C.a*100,
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	5	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	28	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+a-1030=A	1030	filler	a-1030=A	1030
+drop table t1, t2;
+create table t1(
+a varchar(8) character set utf8, b int, filler char(100), 
+primary key(a,b)
+);
+insert into t1 select 
+concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+1000 + A.a + B.a*10 + C.a*100,
+'filler'
+from t0 A, t0 B, t0 C;
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	eq_ref	PRIMARY	PRIMARY	30	test.t2.a,test.t2.b	1	Using index condition(BKA); Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+explain select * from t1, t2 where t1.a=t2.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	26	test.t2.a	1	Using index condition(BKA); Using join buffer
+select * from t1, t2 where t1.a=t2.a;
+a	b	filler	a	b
+a-1010=A	1010	filler	a-1010=A	1010
+a-1020=A	1020	filler	a-1020=A	1020
+a-1030=A	1030	filler	a-1030=A	1030
+drop table t1, t2;
+create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
+insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
+insert into t1 values (11, 11, 11,   'filler');
+insert into t1 values (11, 11, 12,   'filler');
+insert into t1 values (11, 11, 13,   'filler');
+insert into t1 values (11, 22, 1234, 'filler');
+insert into t1 values (11, 33, 124,  'filler');
+insert into t1 values (11, 33, 125,  'filler');
+create table t2 (a int, b int);
+insert into t2 values (11,33), (11,22), (11,11);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	8	test.t2.a,test.t2.b	1	Using join buffer
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	c	filler	a	b
+11	11	11	filler	11	11
+11	11	12	filler	11	11
+11	11	13	filler	11	11
+11	22	1234	filler	11	22
+11	33	124	filler	11	33
+11	33	125	filler	11	33
+set join_cache_level=0;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+a	b	c	filler	a	b
+11	33	124	filler	11	33
+11	33	125	filler	11	33
+11	22	1234	filler	11	22
+11	11	11	filler	11	11
+11	11	12	filler	11	11
+11	11	13	filler	11	11
+set join_cache_level=6;
+drop table t1,t2;
+set @@join_cache_level= @save_join_cache_level;
+set storage_engine=@save_storage_engine;
+drop table t0;
diff --git a/mysql-test/t/innodb_mrr_cpk.test b/mysql-test/t/innodb_mrr_cpk.test
new file mode 100644
index 00000000000..84b37840880
--- /dev/null
+++ b/mysql-test/t/innodb_mrr_cpk.test
@@ -0,0 +1,134 @@
+# 
+# Tests for DS-MRR over clustered primary key. The only engine that supports
+# this is InnoDB/XtraDB.
+#
+# Basic idea about testing
+#  - DS-MRR/CPK works only with BKA
+#  - Should also test index condition pushdown
+#  - Should also test whatever uses RANGE_SEQ_IF::skip_record() for filtering
+#  - Also test access using prefix of primary key
+# 
+#  - Ignore the cost model: BKA's multi_range_read_info() call passes 10 for
+#    #rows, and that call is made only as an applicability check
+# 
+-- source include/have_innodb.inc
+
+--disable_warnings
+drop table if exists t0,t1,t2,t3;
+--enable_warnings
+
+set @save_join_cache_level=@@join_cache_level;
+set join_cache_level=6;
+
+set @save_storage_engine=@@storage_engine;
+set storage_engine=innodb;
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a char(8), b char(8), filler char(100), primary key(a));
+show create table t1;
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  concat('b-', 1000 + A.a + B.a*10 + C.a*100, '=B'),
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8));
+insert into t2 values ('a-1010=A'), ('a-1030=A'), ('a-1020=A');
+
+--echo This should use join buffer:
+explain select * from t1, t2 where t1.a=t2.a;
+
+--echo This output must be sorted by value of t1.a:
+select * from t1, t2 where t1.a=t2.a;
+drop table t1, t2;
+
+# Try multi-column indexes
+create table t1(
+  a char(8) character set utf8, b int, filler char(100), 
+  primary key(a,b)
+);
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  1000 + A.a + B.a*10 + C.a*100,
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+# Try with dataset that causes identical lookup keys:
+insert into t2 values ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+drop table t1, t2;
+
+create table t1(
+  a varchar(8) character set utf8, b int, filler char(100), 
+  primary key(a,b)
+);
+
+insert into t1 select 
+  concat('a-', 1000 + A.a + B.a*10 + C.a*100, '=A'),
+  1000 + A.a + B.a*10 + C.a*100,
+  'filler'
+from t0 A, t0 B, t0 C;
+
+create table t2 (a char(8) character set utf8, b int);
+insert into t2 values ('a-1010=A', 1010), ('a-1030=A', 1030), ('a-1020=A', 1020);
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+# 
+# Try scanning on a CPK prefix
+#
+explain select * from t1, t2 where t1.a=t2.a;
+select * from t1, t2 where t1.a=t2.a;
+drop table t1, t2;
+
+#
+# The above example is not very interesting, as CPK prefix has 
+# only one match.  Create a dataset where scan on CPK prefix 
+# would produce multiple matches:
+#
+create table t1 (a int, b int, c int, filler char(100), primary key(a,b,c));
+insert into t1 select A.a, B.a, C.a, 'filler' from t0 A, t0 B, t0 C;
+
+insert into t1 values (11, 11, 11,   'filler');
+insert into t1 values (11, 11, 12,   'filler');
+insert into t1 values (11, 11, 13,   'filler');
+insert into t1 values (11, 22, 1234, 'filler');
+insert into t1 values (11, 33, 124,  'filler');
+insert into t1 values (11, 33, 125,  'filler');
+
+create table t2 (a int, b int);
+insert into t2 values (11,33), (11,22), (11,11);
+
+explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+
+set join_cache_level=0;
+select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
+set join_cache_level=6;
+
+drop table t1,t2;
+
+#
+# Check that Index Condition Pushdown (BKA) actually works:
+#
+
+# TODO
+
+#
+# Check that record-check-func is done:
+# 
+
+set @@join_cache_level= @save_join_cache_level;
+set storage_engine=@save_storage_engine;
+drop table t0;
+
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 72c85ec11bb..46790adee9e 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -139,8 +139,13 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
                                        uint key_parts, uint *bufsz, 
                                        uint *flags, COST_VECT *cost)
 {
+  /* 
+    Currently we expect this function to be called only in preparation of scan
+    with HA_MRR_SINGLE_POINT property.
+  */
+  DBUG_ASSERT(*flags | HA_MRR_SINGLE_POINT);
+
   *bufsz= 0; /* Default implementation doesn't need a buffer */
-  //psergey2-todo: assert for singlepoint ranges here?
   *flags |= HA_MRR_USE_DEFAULT_IMPL;
 
   cost->zero();
@@ -323,22 +328,25 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                                         n_ranges, key_parts, mode, buf);
     DBUG_RETURN(retval);
   }
-  rowids_buf= buf->buffer;
+  mrr_buf= buf->buffer;
 
   is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
 
   if (is_mrr_assoc)
     status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
  
-  rowids_buf_end= buf->buffer_end;
+  mrr_buf_end= buf->buffer_end;
 
 
   doing_cpk_scan= check_cpk_scan(h->active_index, mode); 
   if (doing_cpk_scan)
   {
+    /* 
+      When doing a scan on CPK, the buffer stores {lookup_tuple, range_id}
+      pairs 
+    */
     uint keylen=0;
     DBUG_ASSERT(key_parts != 0);
-    //psergey2-todo: new elem_size here
     for (uint kp= 0; kp < key_parts; kp++)
       keylen += table->key_info[h->active_index].key_part[kp].store_length;
 
@@ -350,12 +358,29 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     use_default_impl= FALSE;
   }
   else
+  {
+    /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */
     elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+  }
 
-  rowids_buf_last= rowids_buf + 
-                      ((rowids_buf_end - rowids_buf)/ elem_size)*
+  mrr_buf_last= mrr_buf + 
+                      ((mrr_buf_end - mrr_buf)/ elem_size)*
                       elem_size;
-  rowids_buf_end= rowids_buf_last;
+  mrr_buf_end= mrr_buf_last;
+
+  if (doing_cpk_scan)
+  {
+    /* 
+      DS-MRR/CPK: fill buffer with lookup tuples and sort; also we don't need a
+      secondary handler object.
+    */
+    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+    h->mrr_funcs= *seq_funcs;
+    dsmrr_fill_buffer_cpk();
+    if (dsmrr_eof) 
+      buf->end_of_used_area= mrr_buf_last;
+    DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */
+  }
 
   /*
     There can be two cases:
@@ -365,84 +390,68 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
        The caller might have called h->index_init(), need to switch h to
        rnd_pos calls.
   */
-  //psergey2-todo: don't create secondary for CPK scan.
-  if (!doing_cpk_scan)
+  if (!h2)
   {
-    if (!h2)
+    /* Create a separate handler object to do rndpos() calls. */
+    THD *thd= current_thd;
+    /*
+      ::clone() takes up a lot of stack, especially on 64 bit platforms.
+      The constant 5 is an empiric result.
+    */
+    if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
+      DBUG_RETURN(1);
+    DBUG_ASSERT(h->active_index != MAX_KEY);
+    uint mrr_keyno= h->active_index;
+
+    /* Create a separate handler object to do rndpos() calls. */
+    if (!(new_h2= h->clone(thd->mem_root)) || 
+        new_h2->ha_external_lock(thd, F_RDLCK))
     {
-      /* Create a separate handler object to do rndpos() calls. */
-      THD *thd= current_thd;
-      /*
-        ::clone() takes up a lot of stack, especially on 64 bit platforms.
-        The constant 5 is an empiric result.
-      */
-      if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
-        DBUG_RETURN(1);
-      DBUG_ASSERT(h->active_index != MAX_KEY);
-      uint mrr_keyno= h->active_index;
-
-      /* Create a separate handler object to do rndpos() calls. */
-      if (!(new_h2= h->clone(thd->mem_root)) || 
-          new_h2->ha_external_lock(thd, F_RDLCK))
-      {
-        delete new_h2;
-        DBUG_RETURN(1);
-      }
-
-      if (mrr_keyno == h->pushed_idx_cond_keyno)
-        pushed_cond= h->pushed_idx_cond;
-
-      /*
-        Caution: this call will invoke this->dsmrr_close(). Do not put the
-        created secondary table handler into this->h2 or it will delete it.
-      */
-      if (h->ha_index_end())
-      {
-        h2=new_h2;
-        goto error;
-      }
-
-      h2= new_h2; /* Ok, now can put it into h2 */
-      table->prepare_for_position();
-      h2->extra(HA_EXTRA_KEYREAD);
-    
-      if (h2->ha_index_init(mrr_keyno, FALSE))
-        goto error;
-
-      use_default_impl= FALSE;
-      if (pushed_cond)
-        h2->idx_cond_push(mrr_keyno, pushed_cond);
+      delete new_h2;
+      DBUG_RETURN(1);
     }
-    else
+
+    if (mrr_keyno == h->pushed_idx_cond_keyno)
+      pushed_cond= h->pushed_idx_cond;
+
+    /*
+      Caution: this call will invoke this->dsmrr_close(). Do not put the
+      created secondary table handler into this->h2 or it will delete it.
+    */
+    if (h->ha_index_end())
     {
-      /* 
-        We get here when the access alternates betwen MRR scan(s) and non-MRR
-        scans.
-
-        Calling h->index_end() will invoke dsmrr_close() for this object,
-        which will delete h2. We need to keep it, so save put it away and dont
-        let it be deleted:
-      */
-      handler *save_h2= h2;
-      h2= NULL;
-      int res= (h->inited == handler::INDEX && h->ha_index_end());
-      h2= save_h2;
-      use_default_impl= FALSE;
-      if (res)
-        goto error;
+      h2=new_h2;
+      goto error;
     }
+
+    h2= new_h2; /* Ok, now can put it into h2 */
+    table->prepare_for_position();
+    h2->extra(HA_EXTRA_KEYREAD);
+  
+    if (h2->ha_index_init(mrr_keyno, FALSE))
+      goto error;
+
+    use_default_impl= FALSE;
+    if (pushed_cond)
+      h2->idx_cond_push(mrr_keyno, pushed_cond);
   }
   else
   {
-    //doing DS-MRR/CPK
-    // fill-buffer-analog
-    // eof
-    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
-    h->mrr_funcs= *seq_funcs;
-    dsmrr_fill_buffer_cpk();
-    if (dsmrr_eof) 
-      buf->end_of_used_area= rowids_buf_last;
-    DBUG_RETURN(0); // nothing can go wrong while filling the buffer
+    /* 
+      We get here when the access alternates between MRR scan(s) and non-MRR
+      scans.
+
+      Calling h->index_end() will invoke dsmrr_close() for this object,
+      which will delete h2. We need to keep it, so put it away and don't
+      let it be deleted:
+    */
+    handler *save_h2= h2;
+    h2= NULL;
+    int res= (h->inited == handler::INDEX && h->ha_index_end());
+    h2= save_h2;
+    use_default_impl= FALSE;
+    if (res)
+      goto error;
   }
 
   if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
@@ -456,7 +465,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     adjust *buf to indicate that the remaining buffer space will not be used.
   */
   if (dsmrr_eof) 
-    buf->end_of_used_area= rowids_buf_last;
+    buf->end_of_used_area= mrr_buf_last;
 
   /*
      h->inited == INDEX may occur when 'range checked for each record' is
@@ -512,6 +521,9 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
   rowid and return.
   
   The function assumes that rowids buffer is empty when it is invoked. 
+
+  dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
+  scanning.
   
   @param h  Table handler
 
@@ -526,8 +538,8 @@ int DsMrr_impl::dsmrr_fill_buffer()
   int res;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
 
-  rowids_buf_cur= rowids_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
+  mrr_buf_cur= mrr_buf;
+  while ((mrr_buf_cur < mrr_buf_end) && 
          !(res= h2->handler::multi_range_read_next(&range_info)))
   {
     KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
@@ -537,13 +549,13 @@ int DsMrr_impl::dsmrr_fill_buffer()
     
     /* Put rowid, or {rowid, range_id} pair into the buffer */
     h2->position(table->record[0]);
-    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
-    rowids_buf_cur += h2->ref_length;
+    memcpy(mrr_buf_cur, h2->ref, h2->ref_length);
+    mrr_buf_cur += h2->ref_length;
 
     if (is_mrr_assoc)
     {
-      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
+      memcpy(mrr_buf_cur, &range_info, sizeof(void*));
+      mrr_buf_cur += sizeof(void*);
     }
   }
 
@@ -553,27 +565,29 @@ int DsMrr_impl::dsmrr_fill_buffer()
 
   /* Sort the buffer contents by rowid */
   uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
   
-  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
+  my_qsort2(mrr_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
             (void*)h);
-  rowids_buf_last= rowids_buf_cur;
-  rowids_buf_cur=  rowids_buf;
+  mrr_buf_last= mrr_buf_cur;
+  mrr_buf_cur=  mrr_buf;
   DBUG_RETURN(0);
 }
 
 
-/* qsort-compatible function to compare key tuples */
+/* 
+  my_qsort2-compatible function to compare key tuples 
+*/
+
 int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 {
   DsMrr_impl *dsmrr= (DsMrr_impl*)arg;
   TABLE *table= dsmrr->h->table;
   
   KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part;
-  KEY_PART_INFO *part_end= part + dsmrr->cpk_n_parts;
+  uchar *key1_end= key1 + dsmrr->cpk_tuple_length;
 
-  //uint32 *lengths=item->field_lengths;
-  for (; part < part_end; ++part)
+  while (key1 < key1_end)
   {
     Field* f = part->field;
     int len = part->store_length;
@@ -582,33 +596,43 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
       return res;
     key1 += len;
     key2 += len;
+    part++;
   }
   return 0;
 }
 
 
-//psergey2:
-int DsMrr_impl::dsmrr_fill_buffer_cpk()
+/*
+  DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+
+  DESCRIPTION
+    DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+
+    dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
+    we're scanning.
+*/
+
+void DsMrr_impl::dsmrr_fill_buffer_cpk()
 {
   int res;
   KEY_MULTI_RANGE cur_range;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer_cpk");
 
-  rowids_buf_cur= rowids_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
+  mrr_buf_cur= mrr_buf;
+  while ((mrr_buf_cur < mrr_buf_end) && 
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
     DBUG_ASSERT(cpk_tuple_length == cur_range.start_key.length);
 
     /* Put key, or {key, range_id} pair into the buffer */
-    memcpy(rowids_buf_cur, cur_range.start_key.key, cpk_tuple_length);
-    rowids_buf_cur += cpk_tuple_length;
+    memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
+    mrr_buf_cur += cpk_tuple_length;
 
     if (is_mrr_assoc)
     {
-      memcpy(rowids_buf_cur, &cur_range.ptr, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
+      memcpy(mrr_buf_cur, &cur_range.ptr, sizeof(void*));
+      mrr_buf_cur += sizeof(void*);
     }
   }
 
@@ -616,77 +640,82 @@ int DsMrr_impl::dsmrr_fill_buffer_cpk()
 
   /* Sort the buffer contents by rowid */
   uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
+  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
   
-  my_qsort2(rowids_buf, n_rowids, elem_size, 
+  my_qsort2(mrr_buf, n_rowids, elem_size, 
             (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
-  rowids_buf_last= rowids_buf_cur;
-  rowids_buf_cur=  rowids_buf;
-  DBUG_RETURN(0);
+  mrr_buf_last= mrr_buf_cur;
+  mrr_buf_cur=  mrr_buf;
+  DBUG_VOID_RETURN;
 }
 
 
 /*
-  CPK: so, the source is 
-   - buffer exhaustion/re-fill
-   - advance to next range on "record-not-found" error.
-   - if scanning on a prefix, enumerate all records for a key.
+  DS-MRR/CPK: multi_range_read_next() function
+
+  SYNOPSIS
+    DsMrr_impl::dsmrr_next_cpk() 
+
+  DESCRIPTION
+    DS-MRR/CPK: multi_range_read_next() function. 
+    This is similar to DsMrr_impl::dsmrr_next(), the differences are that
+     - we get records with index_read(), not with rnd_pos()
+     - we may get multiple records for one key (=element of the buffer)
+     - unlike dsmrr_fill_buffer(), dsmrr_fill_buffer_cpk() never fails.
+ 
+  RETURN
+    0                   OK, next record was successfully read
+    HA_ERR_END_OF_FILE  End of records
+    Other               Some other error
 */
+
 int DsMrr_impl::dsmrr_next_cpk(char **range_info)
 {
   int res;
 
   if (cpk_have_range)
   {
-    res= h->index_next_same(table->record[0], rowids_buf_cur, cpk_tuple_length);
+    res= h->index_next_same(table->record[0], mrr_buf_cur, cpk_tuple_length);
     if (res != HA_ERR_END_OF_FILE)
     {
-      // todo
       if (is_mrr_assoc)
         memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
       return res;
     }
-    /* 
-      Ok, we got EOF for records in this range. Fall through to get to another
-      range.
-    */
+    /* No more records in this range. Fall through to get to another range  */
   }
 
   do
   {
-    /* First, make sure we have a range at start of the buffer*/
-    if (rowids_buf_cur == rowids_buf_last)
+    /* First, make sure we have a range at start of the buffer */
+    if (mrr_buf_cur == mrr_buf_last)
     {
       if (dsmrr_eof)
       {
         res= HA_ERR_END_OF_FILE;
         goto end;
       }
-      // TODO: the return values are mix of HA_ERR_ codes and TRUE as "generic
-      //       failure" error. Is this ok?
-      if ((res= dsmrr_fill_buffer_cpk()))
-        goto end;
+      dsmrr_fill_buffer_cpk();
     }
-   
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
     {
       res= HA_ERR_END_OF_FILE;
       goto end;
     }
     
-    //TODO: make skip_index_tuple() calls, too?
-    //TODO: skip-record calls here?
+    //psergey2-todo: make skip_index_tuple() calls, too?
+    //psergey2-todo: skip-record calls here?
     //if (h2->mrr_funcs.skip_record &&
     //	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
     //  continue;
     
     /* Ok, got the range. Try making a lookup.  */
-    uchar *lookup_tuple= rowids_buf_cur;
-    rowids_buf_cur += cpk_tuple_length;
+    uchar *lookup_tuple= mrr_buf_cur;
+    mrr_buf_cur += cpk_tuple_length;
     if (is_mrr_assoc)
     {
-      memcpy(cpk_saved_range_info, rowids_buf_cur, sizeof(void*));
-      rowids_buf_cur += sizeof(void*) * test(is_mrr_assoc);
+      memcpy(cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
+      mrr_buf_cur += sizeof(void*) * test(is_mrr_assoc);
     }
       
     res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, 
@@ -698,6 +727,10 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
     if (!res)
     {
       memcpy(range_info, cpk_saved_range_info, sizeof(void*));
+      /* 
+        Attempt reading more rows from this range only if there actually can
+        be multiple matches:
+       */
       cpk_have_range= !cpk_is_unique_scan;
       break;
     }
@@ -707,6 +740,7 @@ end:
   return res;
 }
 
+
 /**
   DS-MRR implementation: multi_range_read_next() function
 */
@@ -725,7 +759,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
   
   do
   {
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
     {
       if (dsmrr_eof)
       {
@@ -738,17 +772,17 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     }
    
     /* return eof if there are no rowids in the buffer after re-fill attempt */
-    if (rowids_buf_cur == rowids_buf_last)
+    if (mrr_buf_cur == mrr_buf_last)
     {
       res= HA_ERR_END_OF_FILE;
       goto end;
     }
-    rowid= rowids_buf_cur;
+    rowid= mrr_buf_cur;
 
     if (is_mrr_assoc)
-      memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**));
+      memcpy(&cur_range_info, mrr_buf_cur + h->ref_length, sizeof(uchar**));
 
-    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+    mrr_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
     if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
       continue;
@@ -870,7 +904,33 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
   return FALSE;
 }
 
-/**
+
+/*
+  Check if key/flags allow DS-MRR/CPK strategy to be used
+  
+  SYNOPSIS
+   DsMrr_impl::check_cpk_scan()
+     keyno      Index that will be used
+     mrr_flags  MRR mode flags (HA_MRR_* bits) of the scan
+  
+  DESCRIPTION
+    Check if key/flags allow DS-MRR/CPK strategy to be used. 
+ 
+  RETURN
+    TRUE   DS-MRR/CPK should be used
+    FALSE  Otherwise
+*/
+
+bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
+{
+  return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
+              !(mrr_flags & HA_MRR_SORTED) && 
+              keyno == table->s->primary_key && 
+              h->primary_key_is_clustered());
+}
+
+
+/*
   DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
 
   Make the choice between using Default MRR implementation and DS-MRR.
@@ -892,13 +952,7 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
   @retval TRUE   Default MRR implementation should be used
   @retval FALSE  DS-MRR implementation should be used
 */
-bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
-{
-  return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
-              !(mrr_flags & HA_MRR_SORTED) && 
-              keyno == table->s->primary_key && 
-              h->primary_key_is_clustered());
-}
+
 
 bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
                                  uint *bufsz, COST_VECT *cost)
@@ -906,9 +960,8 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
   COST_VECT dsmrr_cost;
   bool res;
   THD *thd= current_thd;
-  //psergey2: check the criteria.
-  doing_cpk_scan= check_cpk_scan(keyno, *flags); 
 
+  doing_cpk_scan= check_cpk_scan(keyno, *flags); 
   if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
       (keyno == table->s->primary_key && h->primary_key_is_clustered() &&
        !doing_cpk_scan) ||
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index b379d4f517d..5dd2e0d6adf 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -1,16 +1,76 @@
 /*
-  This file contains declarations for 
-   - Disk-Sweep MultiRangeRead (DS-MRR) implementation
+  This file contains declarations for Disk-Sweep MultiRangeRead (DS-MRR) 
+  implementation
 */
 
 /**
-  A Disk-Sweep MRR interface implementation
+  A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
 
-  This implementation makes range (and, in the future, 'ref') scans to read
-  table rows in disk sweeps. 
-  
-  Currently it is used by MyISAM and InnoDB. Potentially it can be used with
-  any table handler that has non-clustered indexes and on-disk rows.
+  This is a "plugin"(*) for storage engines that allows index scans to
+  read table rows in rowid order. For disk-based storage engines, this is
+  faster than reading table rows in whatever order the SQL layer makes calls.
+
+  (*) - only conceptually. No dynamic loading or binary compatibility of any
+        kind.
+
+  General scheme of things:
+   
+      SQL Layer code
+       |   |   |
+      -v---v---v---- handler->multi_range_read_XXX() function calls
+       |   |   |
+      ____________________________________
+     / DS-MRR module                      \
+     |  (scan indexes, order rowids, do    |
+     |   full record reads in rowid order) |
+     \____________________________________/
+       |   |   |
+      -|---|---|----- handler->read_range_first()/read_range_next(), 
+       |   |   |      handler->index_read(), handler->rnd_pos() calls.
+       |   |   |
+       v   v   v
+      Storage engine internals
+   
+  Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines.
+  Potentially it can be used with any table handler that has disk-based data
+  storage and has better performance when reading data in rowid order.
+*/
+
+
+/*
+  DS-MRR implementation for one table. Create/use one object of this class for
+  each ha_{myisam/innobase/etc} object. That object will be further referred to
+  as "the handler"
+
+  There are actually three strategies
+   S1. Bypass DS-MRR, pass all calls to default implementation (i.e. to
+      MRR-to-non-MRR calls converter)
+   S2. Regular DS-MRR 
+   S3. DS-MRR/CPK for doing scans on clustered primary keys.
+
+  S1 is used for cases which DS-MRR is unable to handle for some reason.
+
+  S2 is the actual DS-MRR. The basic algorithm is as follows:
+    1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 
+        fill the buffer with {rowid, range_id} pairs
+    2. Sort the buffer by rowid
+    3. for each {rowid, range_id} pair in the buffer
+         get record by rowid and return the {record, range_id} pair
+    4. Repeat the above steps until we've exhausted the list of ranges we're
+       scanning.
+
+  S3 is the variant of DS-MRR for use with clustered primary keys (or any
+  clustered index). The idea is that in a clustered index it is sufficient to
+  access the index in index order, and we don't need an intermediate step to
+  get the rowid (like step #1 in S2).
+
+   DS-MRR/CPK's basic algorithm is as follows:
+    1. Collect a number of ranges (=lookup keys)
+    2. Sort them so that they follow in index order.
+    3. for each {lookup_key, range_id} pair in the buffer 
+       get record(s) matching the lookup key and return {record, range_id} pairs
+    4. Repeat the above steps until we've exhausted the list of ranges we're
+       scanning.
 */
 
 class DsMrr_impl
@@ -21,40 +81,6 @@ public:
   DsMrr_impl()
     : h2(NULL) {};
   
-  /*
-    The "owner" handler object (the one that calls dsmrr_XXX functions.
-    It is used to retrieve full table rows by calling rnd_pos().
-  */
-  handler *h;
-  TABLE *table; /* Always equal to h->table */
-private:
-  /* Secondary handler object.  It is used for scanning the index */
-  handler *h2;
-
-  /* Buffer to store rowids, or (rowid, range_id) pairs */
-  uchar *rowids_buf;
-  uchar *rowids_buf_cur;   /* Current position when reading/writing */
-  uchar *rowids_buf_last;  /* When reading: end of used buffer space */
-  uchar *rowids_buf_end;   /* End of the buffer */
-
-  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
-
-  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
-  bool is_mrr_assoc;
-
-  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
-
-  bool doing_cpk_scan;
-  uint cpk_tuple_length;
-  uint cpk_n_parts;
-  bool cpk_is_unique_scan;
-  char *cpk_saved_range_info;
-  bool cpk_have_range;
-
-
-  bool check_cpk_scan(uint keyno, uint mrr_flags);
-  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
-public:
   void init(handler *h_arg, TABLE *table_arg)
   {
     h= h_arg; 
@@ -64,10 +90,7 @@ public:
                  uint n_ranges, uint key_parts, uint mode, 
                  HANDLER_BUFFER *buf);
   void dsmrr_close();
-  int dsmrr_fill_buffer();
-  int dsmrr_fill_buffer_cpk();
   int dsmrr_next(char **range_info);
-  int dsmrr_next_cpk(char **range_info);
 
   ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 
                      uint *bufsz, uint *flags, COST_VECT *cost);
@@ -76,9 +99,53 @@ public:
                             void *seq_init_param, uint n_ranges, uint *bufsz,
                             uint *flags, COST_VECT *cost);
 private:
+  /*
+    The "owner" handler object (the one that calls dsmrr_XXX functions).
+    It is used to retrieve full table rows by calling rnd_pos().
+  */
+  handler *h;
+  TABLE *table; /* Always equal to h->table */
+
+  /* Secondary handler object.  It is used for scanning the index */
+  handler *h2;
+
+  /* Buffer to store rowids, or (rowid, range_id) pairs */
+  uchar *mrr_buf;
+  uchar *mrr_buf_cur;   /* Current position when reading/writing */
+  uchar *mrr_buf_last;  /* When reading: end of used buffer space */
+  uchar *mrr_buf_end;   /* End of the buffer */
+
+  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
+
+  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+
+  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
+
+  bool doing_cpk_scan; /* TRUE <=> DS-MRR/CPK variant is used */
+
+  /** DS-MRR/CPK variables start */
+
+  /* Length of lookup tuple being used, in bytes */
+  uint cpk_tuple_length;
+  /*
+    TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
+    can get max. one match for each key 
+  */
+  bool cpk_is_unique_scan;
+  /* TRUE <=> we're in the middle of enumerating records from a range */
+  bool cpk_have_range;
+  /* Valid if cpk_have_range==TRUE: range_id of the range we're enumerating */
+  char *cpk_saved_range_info;
+
   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                        COST_VECT *cost);
   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
                                uint *buffer_size, COST_VECT *cost);
+  bool check_cpk_scan(uint keyno, uint mrr_flags);
+  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
+  int dsmrr_fill_buffer();
+  void dsmrr_fill_buffer_cpk();
+  int dsmrr_next_cpk(char **range_info);
 };
 

From b45748f058e00b941d7b737cc43dcc3ad237d5d3 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 22 Jun 2010 22:38:52 +0400
Subject: [PATCH 04/49] MWL#121: DS-MRR support for clustered primary keys -
 Fix the code to work with IndexConditionPushdown+BKA (EXPLAIN is still  
 incorrect, see comments in the patch) - Test coverage for ICP+BKA

---
 mysql-test/r/innodb_mrr_cpk.result | 14 ++++++++
 mysql-test/t/innodb_mrr_cpk.test   | 13 +++++---
 sql/multi_range_read.cc            | 52 +++++++++++++++++++++++-------
 3 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/mysql-test/r/innodb_mrr_cpk.result b/mysql-test/r/innodb_mrr_cpk.result
index f93807e14d8..469d78e8e45 100644
--- a/mysql-test/r/innodb_mrr_cpk.result
+++ b/mysql-test/r/innodb_mrr_cpk.result
@@ -128,6 +128,20 @@ a	b	c	filler	a	b
 11	11	12	filler	11	11
 11	11	13	filler	11	11
 set join_cache_level=6;
+explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	4	test.t2.a	1	Using index condition(BKA); Using join buffer
+select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
+a	b	c	filler	a	b
+set optimizer_switch='index_condition_pushdown=off';
+explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	3	
+1	SIMPLE	t1	ref	PRIMARY	PRIMARY	4	test.t2.a	1	Using where; Using join buffer
+select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
+a	b	c	filler	a	b
+set optimizer_switch='index_condition_pushdown=on';
 drop table t1,t2;
 set @@join_cache_level= @save_join_cache_level;
 set storage_engine=@save_storage_engine;
diff --git a/mysql-test/t/innodb_mrr_cpk.test b/mysql-test/t/innodb_mrr_cpk.test
index 84b37840880..69eeef9618f 100644
--- a/mysql-test/t/innodb_mrr_cpk.test
+++ b/mysql-test/t/innodb_mrr_cpk.test
@@ -112,21 +112,24 @@ insert into t2 values (11,33), (11,22), (11,11);
 explain select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 
+# Check a real resultset for comparison:
 set join_cache_level=0;
 select * from t1, t2 where t1.a=t2.a and t1.b=t2.b;
 set join_cache_level=6;
 
-drop table t1,t2;
 
 #
 # Check that Index Condition Pushdown (BKA) actually works:
 #
+explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
+select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
 
-# TODO
+set optimizer_switch='index_condition_pushdown=off';
+explain select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
+select * from t1, t2 where t1.a=t2.a and t2.b + t1.b > 100;
+set optimizer_switch='index_condition_pushdown=on';
 
-#
-# Check that record-check-func is done:
-# 
+drop table t1,t2;
 
 set @@join_cache_level= @save_join_cache_level;
 set storage_engine=@save_storage_engine;
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 46790adee9e..9c0a0233e0e 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -624,7 +624,6 @@ void DsMrr_impl::dsmrr_fill_buffer_cpk()
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
     DBUG_ASSERT(cpk_tuple_length == cur_range.start_key.length);
-
     /* Put key, or {key, range_id} pair into the buffer */
     memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
     mrr_buf_cur += cpk_tuple_length;
@@ -654,7 +653,8 @@ void DsMrr_impl::dsmrr_fill_buffer_cpk()
   DS-MRR/CPK: multi_range_read_next() function
 
   DESCRIPTION
-    DsMrr_impl::dsmrr_next_cpk() 
+    DsMrr_impl::dsmrr_next_cpk()
+      range_info  OUT  identifier of range that the returned record belongs to
 
   DESCRIPTION
     DS-MRR/CPK: multi_range_read_next() function. 
@@ -673,16 +673,31 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
 {
   int res;
 
-  if (cpk_have_range)
+  while (cpk_have_range)
   {
+
+    if (h->mrr_funcs.skip_record &&
+        h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL))
+    {
+      cpk_have_range= FALSE;
+      break;
+    }
+
     res= h->index_next_same(table->record[0], mrr_buf_cur, cpk_tuple_length);
+
+    if (h->mrr_funcs.skip_index_tuple &&
+        h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
+      continue;
+
     if (res != HA_ERR_END_OF_FILE)
     {
       if (is_mrr_assoc)
         memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
       return res;
     }
-    /* No more records in this range. Fall through to get to another range  */
+
+    /* No more records in this range. Exit this loop and go get another range */
+    cpk_have_range= FALSE;
   }
 
   do
@@ -703,30 +718,43 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
       goto end;
     }
     
-    //psergey2-todo: make skip_index_tuple() calls, too?
-    //psergey2-todo: skip-record calls here?
-    //if (h2->mrr_funcs.skip_record &&
-    //	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
-    //  continue;
-    
     /* Ok, got the range. Try making a lookup.  */
     uchar *lookup_tuple= mrr_buf_cur;
     mrr_buf_cur += cpk_tuple_length;
     if (is_mrr_assoc)
     {
-      memcpy(cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
+      memcpy(&cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
       mrr_buf_cur += sizeof(void*) * test(is_mrr_assoc);
     }
       
+    if (h->mrr_funcs.skip_record &&
+        h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL))
+      continue;
+    
     res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, 
                        HA_READ_KEY_EXACT);
 
+    /*
+      Check pushed index condition. Performance-wise, it does not make any
+      sense to put this call here (the above call has already accessed the full
+      record). That's the best I could do, though, because:
+      - ha_innobase doesn't support IndexConditionPushdown on clustered PK
+      - MRR interface doesn't allow the storage engine to refuse a pushed index
+        condition.
+      Having this call here is not fully harmless: EXPLAIN shows "pushed index
+      condition", which is technically true but doesn't bring the benefits that
+      one might expect.
+    */
+    if (h->mrr_funcs.skip_index_tuple &&
+        h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
+      continue;
+
     if (res && res != HA_ERR_END_OF_FILE)
       goto end;
 
     if (!res)
     {
-      memcpy(range_info, cpk_saved_range_info, sizeof(void*));
+      memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
       /* 
         Attempt reading more rows from this range only if there actually can
         be multiple matches:

From 488d352a662438a3c34fd1b41261a94be7572f38 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 22 Jun 2010 23:26:11 +0400
Subject: [PATCH 05/49] MWL#121: DS-MRR support for clustered primary keys -
 Remove back key_parts from multi_range_read_init() parameters - Related code
 simplification/cleanup

---
 sql/handler.h                       |  8 +--
 sql/multi_range_read.cc             | 75 ++++++++++++-----------------
 sql/multi_range_read.h              |  3 +-
 sql/opt_range.cc                    |  1 -
 sql/sql_join_cache.cc               |  3 +-
 storage/maria/ha_maria.cc           |  7 ++-
 storage/maria/ha_maria.h            |  3 +-
 storage/myisam/ha_myisam.cc         |  4 +-
 storage/myisam/ha_myisam.h          |  3 +-
 storage/xtradb/handler/ha_innodb.cc |  4 +-
 storage/xtradb/handler/ha_innodb.h  |  3 +-
 11 files changed, 47 insertions(+), 67 deletions(-)

diff --git a/sql/handler.h b/sql/handler.h
index f2cc50de38a..3e173905f66 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1168,9 +1168,9 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
                          COST_VECT *cost);
 
 /*
-  The below two are not used (and not handled) in this milestone of this WL
-  entry because there seems to be no use for them at this stage of
-  implementation.
+  Indicates that all scanned ranges will be singlepoint (aka equality) ranges.
+  The ranges may not use the full key but all of them will use the same number
+  of key parts.
 */
 #define HA_MRR_SINGLE_POINT 1
 #define HA_MRR_FIXED_KEY  2
@@ -1755,7 +1755,7 @@ public:
                                         uint key_parts, uint *bufsz, 
                                         uint *flags, COST_VECT *cost);
   virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                    uint n_ranges, uint key_parts, uint mode,
+                                    uint n_ranges, uint mode, 
                                     HANDLER_BUFFER *buf);
   virtual int multi_range_read_next(char **range_info);
   virtual int read_range_first(const key_range *start_key,
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 9c0a0233e0e..c86143c4a12 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -1,4 +1,5 @@
 #include "mysql_priv.h"
+#include <my_bit.h>
 #include "sql_select.h"
 
 /****************************************************************************
@@ -203,8 +204,7 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
 
 int
 handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
-                               uint n_ranges, uint key_parts, uint mode, 
-                               HANDLER_BUFFER *buf)
+                               uint n_ranges, uint mode, HANDLER_BUFFER *buf)
 {
   DBUG_ENTER("handler::multi_range_read_init");
   mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
@@ -306,8 +306,7 @@ scan_it_again:
 */
 
 int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
-                           void *seq_init_param, uint n_ranges, uint key_parts,
-                           uint mode,
+                           void *seq_init_param, uint n_ranges, uint mode,
                            HANDLER_BUFFER *buf)
 {
   uint elem_size;
@@ -324,8 +323,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   {
     use_default_impl= TRUE;
     const int retval=
-      h->handler::multi_range_read_init(seq_funcs, seq_init_param,
-                                        n_ranges, key_parts, mode, buf);
+      h->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
+                                        mode, buf);
     DBUG_RETURN(retval);
   }
   mrr_buf= buf->buffer;
@@ -337,51 +336,25 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
  
   mrr_buf_end= buf->buffer_end;
 
-
-  doing_cpk_scan= check_cpk_scan(h->active_index, mode); 
-  if (doing_cpk_scan)
+  if ((doing_cpk_scan= check_cpk_scan(h->active_index, mode)))
   {
-    /* 
-      When doing a scan on CPK, the buffer stores {lookup_tuple, range_id}
-      pairs 
-    */
-    uint keylen=0;
-    DBUG_ASSERT(key_parts != 0);
-    for (uint kp= 0; kp < key_parts; kp++)
-      keylen += table->key_info[h->active_index].key_part[kp].store_length;
-
-    cpk_tuple_length= keylen;
-    cpk_is_unique_scan= test(table->key_info[h->active_index].key_parts == 
-                             key_parts);
+    /* It's a DS-MRR/CPK scan */
+    cpk_tuple_length= 0; /* dummy value telling it needs to be inited */
     cpk_have_range= FALSE;
-    elem_size= keylen + (int)is_mrr_assoc * sizeof(void*);
     use_default_impl= FALSE;
-  }
-  else
-  {
-    /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */
-    elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  }
-
-  mrr_buf_last= mrr_buf + 
-                      ((mrr_buf_end - mrr_buf)/ elem_size)*
-                      elem_size;
-  mrr_buf_end= mrr_buf_last;
-
-  if (doing_cpk_scan)
-  {
-    /* 
-      DS-MRR/CPK: fill buffer with lookup tuples and sort; also we don't need a
-      secondary handler object.
-    */
     h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
     h->mrr_funcs= *seq_funcs;
     dsmrr_fill_buffer_cpk();
-    if (dsmrr_eof) 
+    if (dsmrr_eof)
       buf->end_of_used_area= mrr_buf_last;
     DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */
   }
 
+  /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */
+  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
+  mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/ elem_size)* elem_size;
+  mrr_buf_end= mrr_buf_last;
+
   /*
     There can be two cases:
     - This is the first call since index_init(), h2==NULL
@@ -454,8 +427,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
       goto error;
   }
 
-  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
-                                         key_parts, mode, buf) || 
+  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
+                                         mode, buf) ||
       dsmrr_fill_buffer())
   {
     goto error;
@@ -604,6 +577,9 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 
 /*
   DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+  
+  SYNOPSIS
+    DsMrr_impl::dsmrr_fill_buffer_cpk()
 
   DESCRIPTION
     DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
@@ -623,7 +599,18 @@ void DsMrr_impl::dsmrr_fill_buffer_cpk()
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
-    DBUG_ASSERT(cpk_tuple_length == cur_range.start_key.length);
+    DBUG_ASSERT(!cpk_tuple_length || 
+                cpk_tuple_length == cur_range.start_key.length);
+    if (!cpk_tuple_length)
+    {
+      cpk_tuple_length= cur_range.start_key.length;
+      cpk_is_unique_scan= test(table->key_info[h->active_index].key_parts == 
+                               my_count_bits(cur_range.start_key.keypart_map));
+      uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
+      mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/elem_size) * elem_size;
+      mrr_buf_end= mrr_buf_last;
+    }
+
     /* Put key, or {key, range_id} pair into the buffer */
     memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
     mrr_buf_cur += cpk_tuple_length;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 5dd2e0d6adf..7a5e57e490e 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -87,8 +87,7 @@ public:
     table= table_arg;
   }
   int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, 
-                 uint n_ranges, uint key_parts, uint mode, 
-                 HANDLER_BUFFER *buf);
+                 uint n_ranges, uint mode, HANDLER_BUFFER *buf);
   void dsmrr_close();
   int dsmrr_next(char **range_info);
 
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index 25c4259295f..ad0f9301b7f 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -8368,7 +8368,6 @@ int QUICK_RANGE_SELECT::reset()
  
   RANGE_SEQ_IF seq_funcs= {quick_range_seq_init, quick_range_seq_next, 0, 0};
   error= file->multi_range_read_init(&seq_funcs, (void*)this, ranges.elements,
-                                     uint(-1),
                                      mrr_flags, mrr_buf_desc? mrr_buf_desc: 
                                                               &empty_buf);
   DBUG_RETURN(error);
diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc
index 120b109d8ff..c536026214c 100644
--- a/sql/sql_join_cache.cc
+++ b/sql/sql_join_cache.cc
@@ -2376,8 +2376,7 @@ JOIN_CACHE_BKA::init_join_matching_records(RANGE_SEQ_IF *seq_funcs, uint ranges)
   */ 
   if (!file->inited)
     file->ha_index_init(join_tab->ref.key, 1);
-  if ((error= file->multi_range_read_init(seq_funcs, (void*) this, ranges,
-					  join_tab->ref.key_parts,
+  if ((error= file->multi_range_read_init(seq_funcs, (void*) this, ranges, 
                                           mrr_mode, &mrr_buff)))
     rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
   
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
index 43c6cd6606a..e27983989d8 100644
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -3501,11 +3501,10 @@ static SHOW_VAR status_variables[]= {
  ***************************************************************************/
 
 int ha_maria::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                     uint n_ranges, uint key_parts, uint mode, 
-                                     HANDLER_BUFFER *buf)
+                                    uint n_ranges, uint mode, 
+                                    HANDLER_BUFFER *buf)
 {
-  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, key_parts, 
-                           mode, buf);
+  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
 }
 
 int ha_maria::multi_range_read_next(char **range_info)
diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h
index 177008f422a..6901229bb44 100644
--- a/storage/maria/ha_maria.h
+++ b/storage/maria/ha_maria.h
@@ -174,8 +174,7 @@ public:
    * Multi Range Read interface
    */
   int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                            uint n_ranges, uint key_parts, uint mode, 
-                            HANDLER_BUFFER *buf);
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
   int multi_range_read_next(char **range_info);
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param, 
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index bb6ac446a4f..95ab5cb167e 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -2217,10 +2217,10 @@ static int myisam_init(void *p)
  ***************************************************************************/
 
 int ha_myisam::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                     uint n_ranges, uint key_parts, uint mode, 
+                                     uint n_ranges, uint mode, 
                                      HANDLER_BUFFER *buf)
 {
-  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, key_parts, mode, buf);
+  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
 }
 
 int ha_myisam::multi_range_read_next(char **range_info)
diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h
index d37870b861b..f5428e653c4 100644
--- a/storage/myisam/ha_myisam.h
+++ b/storage/myisam/ha_myisam.h
@@ -162,8 +162,7 @@ public:
    * Multi Range Read interface
    */
   int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                            uint n_ranges, uint key_parts, uint mode, 
-                            HANDLER_BUFFER *buf);
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
   int multi_range_read_next(char **range_info);
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param, 
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index a8ccb426aa5..8aff0103e20 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -11025,10 +11025,10 @@ test_innobase_convert_name()
  */
 
 int ha_innobase::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                       uint n_ranges, uint key_parts, uint mode,
+                                       uint n_ranges, uint mode, 
                                        HANDLER_BUFFER *buf)
 {
-  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, key_parts, mode, buf);
+  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
 }
 
 int ha_innobase::multi_range_read_next(char **range_info)
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
index 0c1f2b42dd6..41a073e4374 100644
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -210,8 +210,7 @@ public:
    * Multi Range Read interface
    */
   int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                            uint n_ranges, uint key_parts, uint mode, 
-                            HANDLER_BUFFER *buf);
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
   int multi_range_read_next(char **range_info);
   ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                       void *seq_init_param, 

From e0999cdf7c2222f37573d50ecd7eeb9612d51a49 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sat, 17 Jul 2010 18:03:50 +0400
Subject: [PATCH 06/49] DS-MRR support improvements (MWL#123, MWL#124, MWL#125)
 - Lots of TODO comments - add mrr_sort_keys flag to @@optimizer_switch -
 [from Igor] SQL layer part passes HA_MRR_MATERIALIZED_KEYS flag - Don't call
 rnd_pos() many times in a row if sorted rowid buffer   has the same rowid
 value for multiple consecutive (rowid, range_id) pairs.

---
 mysql-test/r/optimizer_switch.result |  34 +++---
 sql/handler.h                        |   6 +
 sql/multi_range_read.cc              | 160 +++++++++++++++++++++++----
 sql/multi_range_read.h               |   6 +-
 sql/mysql_priv.h                     |  13 ++-
 sql/mysqld.cc                        |   5 +-
 sql/sql_join_cache.cc                |   5 +
 7 files changed, 181 insertions(+), 48 deletions(-)

diff --git a/mysql-test/r/optimizer_switch.result b/mysql-test/r/optimizer_switch.result
index 6bccefe54be..13a5f1ab232 100644
--- a/mysql-test/r/optimizer_switch.result
+++ b/mysql-test/r/optimizer_switch.result
@@ -4,19 +4,19 @@
 #
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='index_merge=off,index_merge_union=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=off,index_merge_union=off,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=off,index_merge_union=off,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='index_merge_union=on';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=off,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=off,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,index_merge_sort_union=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=off,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=off,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch=4;
 ERROR 42000: Variable 'optimizer_switch' can't be set to the value of '4'
 set optimizer_switch=NULL;
@@ -43,57 +43,57 @@ set optimizer_switch=default;
 set optimizer_switch='index_merge=off,index_merge_union=off,default';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=off,index_merge_union=off,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=off,index_merge_union=off,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch=default;
 select @@global.optimizer_switch;
 @@global.optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set @@global.optimizer_switch=default;
 select @@global.optimizer_switch;
 @@global.optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 #
 # Check index_merge's @@optimizer_switch flags
 #
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 
 BUG#37120 optimizer_switch allowable values not according to specification
 
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,materialization=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=off,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=off,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,semijoin=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=on,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,loosescan=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=off,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=off,materialization=on,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,semijoin=off,materialization=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=off,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=off,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,materialization=off,semijoin=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=off,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=on,materialization=off,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,semijoin=off,materialization=off,loosescan=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=off,materialization=off,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=off,materialization=off,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,semijoin=off,loosescan=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=off,materialization=on,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=off,materialization=on,semijoin=off,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch='default,materialization=off,loosescan=off';
 select @@optimizer_switch;
 @@optimizer_switch
-index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=off,materialization=off,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on
+index_merge=on,index_merge_union=on,index_merge_sort_union=on,index_merge_intersection=on,index_condition_pushdown=on,firstmatch=on,loosescan=off,materialization=off,semijoin=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on,mrr_sort_keys=on
 set optimizer_switch=default;
diff --git a/sql/handler.h b/sql/handler.h
index c471aa6e4d3..2eae66fd741 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1322,6 +1322,12 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
 */
 #define HA_MRR_NO_NULL_ENDPOINTS 128
 
+/*
+  The MRR user has materialized range keys somewhere in the user's buffer.
+  This can be used for optimization of the procedure that sorts these keys
+  since in this case key values don't have to be copied into the MRR buffer.
+*/
+#define HA_MRR_MATERIALIZED_KEYS 256
 
 
 /*
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 14d7722d2c8..f6417beb786 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -327,24 +327,62 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                                         mode, buf);
     DBUG_RETURN(retval);
   }
-  mrr_buf= buf->buffer;
-
+  use_default_impl= FALSE;
   is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
 
+  // psergey2: split the buffer:
+  /*
+
+  psergey2-note: we can't split the buffer here because we don't know how key
+  length. we'll only be able to do it when we've got the first range.
+
+  if ((mrr_flags & HA_MRR_SINGLE_POINT) && 
+       optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
+  {
+    do_sort_keys= TRUE; // will use key buffer to sort keys;
+    bool use_key_pointers= test(mrr_flags & HA_MRR_MATERIALIZED_KEYS);
+  }
+  
+  do_rowid_fetch= FALSE;
+  if (!doing_cpk_scan && !index_only_read)
+  {
+    do_rowid_fetch= TRUE; //will use rowid buffer to store/sort rowids, etc
+  }
+
+
+  if (do_sort_keys && do_rowid_fetch)
+  {
+    split buffer space proportionally
+  }
+  else
+  {
+    // give all space to one buffer
+    if (do_sort_keys)
+    {
+      //sort_buffer_start= ...;
+    }
+    else 
+    {
+      DBUG_ASSERT(do_rowid_fetch);
+      //rowid_buffer_start= ...;
+    }
+  }
+  */
+  mrr_buf= buf->buffer;
+  mrr_buf_end= buf->buffer_end;
+
   if (is_mrr_assoc)
     status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
- 
-  mrr_buf_end= buf->buffer_end;
 
   if ((doing_cpk_scan= check_cpk_scan(h->active_index, mode)))
   {
     /* It's a DS-MRR/CPK scan */
     cpk_tuple_length= 0; /* dummy value telling it needs to be inited */
     cpk_have_range= FALSE;
-    use_default_impl= FALSE;
     h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
     h->mrr_funcs= *seq_funcs;
-    dsmrr_fill_buffer_cpk();
+    dsmrr_fill_key_buffer();
+    
     if (dsmrr_eof)
       buf->end_of_used_area= mrr_buf_last;
     DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */
@@ -355,6 +393,11 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/ elem_size)* elem_size;
   mrr_buf_end= mrr_buf_last;
 
+  /*
+    psergey2: this is only needed when 
+      - doing a rowid-to-row scan
+      - the buffer wasn't exhausted on the first pass.
+  */
   /*
     There can be two cases:
     - This is the first call since index_init(), h2==NULL
@@ -365,7 +408,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   */
   if (!h2)
   {
-    /* Create a separate handler object to do rndpos() calls. */
+    /* Create a separate handler object to do rnd_pos() calls. */
     THD *thd= current_thd;
     /*
       ::clone() takes up a lot of stack, especially on 64 bit platforms.
@@ -376,7 +419,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     DBUG_ASSERT(h->active_index != MAX_KEY);
     uint mrr_keyno= h->active_index;
 
-    /* Create a separate handler object to do rndpos() calls. */
+    /* Create a separate handler object to do rnd_pos() calls. */
     if (!(new_h2= h->clone(thd->mem_root)) || 
         new_h2->ha_external_lock(thd, F_RDLCK))
     {
@@ -397,6 +440,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
       goto error;
     }
 
+    use_default_impl= FALSE;
     h2= new_h2; /* Ok, now can put it into h2 */
     table->prepare_for_position();
     h2->extra(HA_EXTRA_KEYREAD);
@@ -404,7 +448,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     if (h2->ha_index_init(mrr_keyno, FALSE))
       goto error;
 
-    use_default_impl= FALSE;
     if (pushed_cond)
       h2->idx_cond_push(mrr_keyno, pushed_cond);
   }
@@ -422,14 +465,13 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     h2= NULL;
     int res= (h->inited == handler::INDEX && h->ha_index_end());
     h2= save_h2;
-    use_default_impl= FALSE;
     if (res)
       goto error;
   }
 
   if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
                                          mode, buf) ||
-      dsmrr_fill_buffer())
+      dsmrr_fill_rowid_buffer())
   {
     goto error;
   }
@@ -449,7 +491,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
        (h->ha_rnd_init(FALSE))))
       goto error;
 
-  use_default_impl= FALSE;
   h->mrr_funcs= *seq_funcs;
   
   DBUG_RETURN(0);
@@ -497,6 +538,9 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
 
   dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
   scanning.
+
+  psergey2: this func will 'fill the rowid buffer'. If filling the rowid buffer 
+  requires that key buffer is filled/sorted first, will do that, too.
   
   @param h  Table handler
 
@@ -505,13 +549,27 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
   @retval other  Error
 */
 
-int DsMrr_impl::dsmrr_fill_buffer()
+int DsMrr_impl::dsmrr_fill_rowid_buffer()
 {
   char *range_info;
   int res;
-  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
-
+  DBUG_ENTER("DsMrr_impl::dsmrr_fill_rowid_buffer");
+  
   mrr_buf_cur= mrr_buf;
+  mrr_buf_next_identical= mrr_buf_cur;
+  /*
+    psergey2-todo:
+      - call here fill/sort key buffer, if needed.
+
+    psergey2-todo: then, get keys either from
+      - multi_range_read_next()
+      - sorted key buffer
+    
+    psergey2-todo: if we're traversing an ordered key sequence,
+     check if next keys are the same as previous.
+     (note that it's easy as ordered sequence allows forward/backward
+     navigation so we don't need to buffer things)
+  */
   while ((mrr_buf_cur < mrr_buf_end) && 
          !(res= h2->handler::multi_range_read_next(&range_info)))
   {
@@ -520,6 +578,7 @@ int DsMrr_impl::dsmrr_fill_buffer()
         h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr))
       continue;
     
+
     /* Put rowid, or {rowid, range_id} pair into the buffer */
     h2->position(table->record[0]);
     memcpy(mrr_buf_cur, h2->ref, h2->ref_length);
@@ -579,20 +638,26 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
   DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
   
   SYNOPSIS
-    DsMrr_impl::dsmrr_fill_buffer_cpk()
+    DsMrr_impl::dsmrr_fill_key_buffer()
 
   DESCRIPTION
     DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
 
     dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
     we're scanning.
+
+  psergey2-q: can this be used for filling/sorting key buffer in general case?
+   a: yes. 
+  qq: can we push sequence iteration init down into here?
 */
 
-void DsMrr_impl::dsmrr_fill_buffer_cpk()
+void DsMrr_impl::dsmrr_fill_key_buffer()
 {
+  //psergey2: here, no identicals detection is necessary since we always scan
+  //  the unordered sequence.
   int res;
   KEY_MULTI_RANGE cur_range;
-  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer_cpk");
+  DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");
 
   mrr_buf_cur= mrr_buf;
   while ((mrr_buf_cur < mrr_buf_end) && 
@@ -611,6 +676,8 @@ void DsMrr_impl::dsmrr_fill_buffer_cpk()
       mrr_buf_end= mrr_buf_last;
     }
 
+    //psergey2: if keys are materialized, store pointers, not copy keys
+
     /* Put key, or {key, range_id} pair into the buffer */
     memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
     mrr_buf_cur += cpk_tuple_length;
@@ -648,12 +715,14 @@ void DsMrr_impl::dsmrr_fill_buffer_cpk()
     This is similar to DsMrr_impl::dsmrr_next(), the differences are that
      - we get records with index_read(), not with rnd_pos()
      - we may get multiple records for one key (=element of the buffer)
-     - unlike dsmrr_fill_buffer(), dsmrr_fill_buffer_cpk() never fails.
+     - unlike dsmrr_fill_rowid_buffer(), dsmrr_fill_key_buffer() never fails.
  
   RETURN
     0                   OK, next record was successfully read
     HA_ERR_END_OF_FILE  End of records
     Other               Some other error
+
+  psergey2-todo: this should detect identical keys.
 */
 
 int DsMrr_impl::dsmrr_next_cpk(char **range_info)
@@ -697,7 +766,7 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
         res= HA_ERR_END_OF_FILE;
         goto end;
       }
-      dsmrr_fill_buffer_cpk();
+      dsmrr_fill_key_buffer();
     }
     if (mrr_buf_cur == mrr_buf_last)
     {
@@ -758,6 +827,9 @@ end:
 
 /**
   DS-MRR implementation: multi_range_read_next() function
+
+  psergey2-todo: put identical rowid detection code here
+    it should always work because rowid sequences are always sorted
 */
 
 int DsMrr_impl::dsmrr_next(char **range_info)
@@ -772,6 +844,23 @@ int DsMrr_impl::dsmrr_next(char **range_info)
   if (doing_cpk_scan)
     return dsmrr_next_cpk(range_info);
   
+  if (mrr_buf_next_identical != mrr_buf_cur)
+  {
+    /*
+      There are multiple rowids. Return the record again, now with different
+      range_id 
+    */
+    do 
+    {
+      if (is_mrr_assoc)
+        memcpy(range_info, mrr_buf_next_identical + h->ref_length, sizeof(uchar*));
+    } while (!h2->mrr_funcs.skip_record ||
+             !h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) range_info, rowid));
+
+    mrr_buf_next_identical += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+    return 0;
+  }
+
   do
   {
     if (mrr_buf_cur == mrr_buf_last)
@@ -781,7 +870,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
         res= HA_ERR_END_OF_FILE;
         goto end;
       }
-      res= dsmrr_fill_buffer();
+      res= dsmrr_fill_rowid_buffer();
       if (res)
         goto end;
     }
@@ -796,13 +885,34 @@ int DsMrr_impl::dsmrr_next(char **range_info)
 
     if (is_mrr_assoc)
       memcpy(&cur_range_info, mrr_buf_cur + h->ref_length, sizeof(uchar**));
+    
+    size_t element_size= h->ref_length + sizeof(void*) * test(is_mrr_assoc);
+    mrr_buf_cur += element_size;
+    mrr_buf_next_identical= mrr_buf_cur;
 
-    mrr_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
     if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
       continue;
     res= h->ha_rnd_pos(table->record[0], rowid);
+
+    if (res == HA_ERR_RECORD_DELETED)
+      continue;
+    
+    if (0)//(!res)
+    {
+      /* 
+        Note: this implies that SQL layer doesn't touch table->record[0]
+        between calls.
+      */
+      uchar *current_el= mrr_buf_cur - element_size;
+      while (mrr_buf_cur != mrr_buf_last && 
+             !h2->cmp_ref(current_el, mrr_buf_cur))
+      {
+        mrr_buf_cur += element_size;
+      }
+    }
     break;
+
   } while (true);
  
   if (is_mrr_assoc)
@@ -986,7 +1096,7 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
     *flags |= HA_MRR_USE_DEFAULT_IMPL;
     return TRUE;
   }
-  
+
   uint add_len= table->key_info[keyno].key_length + h->ref_length; 
   *bufsz -= add_len;
   if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
@@ -1010,6 +1120,10 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
     *flags &= ~HA_MRR_SORTED;          /* We will return unordered output */
     *cost= dsmrr_cost;
     res= FALSE;
+
+    if ((*flags & HA_MRR_SINGLE_POINT) && 
+         optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
+      *flags |= HA_MRR_MATERIALIZED_KEYS;
   }
   else
   {
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 7a5e57e490e..9d0e7dc096e 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -114,6 +114,8 @@ private:
   uchar *mrr_buf_last;  /* When reading: end of used buffer space */
   uchar *mrr_buf_end;   /* End of the buffer */
 
+  uchar *mrr_buf_next_identical;
+
   bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
 
   /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
@@ -143,8 +145,8 @@ private:
                                uint *buffer_size, COST_VECT *cost);
   bool check_cpk_scan(uint keyno, uint mrr_flags);
   static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
-  int dsmrr_fill_buffer();
-  void dsmrr_fill_buffer_cpk();
+  int dsmrr_fill_rowid_buffer();
+  void dsmrr_fill_key_buffer();
   int dsmrr_next_cpk(char **range_info);
 };
 
diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h
index ba60bab9b50..925840e49f3 100644
--- a/sql/mysql_priv.h
+++ b/sql/mysql_priv.h
@@ -571,12 +571,13 @@ protected:
 #define OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE 512
 #define OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN 1024
 #define OPTIMIZER_SWITCH_SUBQUERY_CACHE (1<<11)
+#define OPTIMIZER_SWITCH_MRR_SORT_KEYS (1<<12)
 
 #ifdef DBUG_OFF
-#  define OPTIMIZER_SWITCH_LAST (1<<12)
-#else
-#  define OPTIMIZER_SWITCH_TABLE_ELIMINATION (1<<12)
 #  define OPTIMIZER_SWITCH_LAST (1<<13)
+#else
+#  define OPTIMIZER_SWITCH_TABLE_ELIMINATION (1<<13)
+#  define OPTIMIZER_SWITCH_LAST (1<<14)
 #endif
 
 #ifdef DBUG_OFF 
@@ -592,7 +593,8 @@ protected:
                                     OPTIMIZER_SWITCH_SEMIJOIN | \
                                     OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE|\
                                     OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN|\
-                                    OPTIMIZER_SWITCH_SUBQUERY_CACHE)
+                                    OPTIMIZER_SWITCH_SUBQUERY_CACHE|\
+                                    OPTIMIZER_SWITCH_MRR_SORT_KEYS)
 #else
 #  define OPTIMIZER_SWITCH_DEFAULT (OPTIMIZER_SWITCH_INDEX_MERGE | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \
@@ -606,7 +608,8 @@ protected:
                                     OPTIMIZER_SWITCH_SEMIJOIN | \
                                     OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE|\
                                     OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN|\
-                                    OPTIMIZER_SWITCH_SUBQUERY_CACHE)
+                                    OPTIMIZER_SWITCH_SUBQUERY_CACHE|\
+                                    OPTIMIZER_SWITCH_MRR_SORT_KEYS)
 #endif
 
 /*
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 1593a584454..e53c455c3ca 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -308,6 +308,7 @@ static const char *optimizer_switch_names[]=
   "partial_match_rowid_merge",
   "partial_match_table_scan",
   "subquery_cache",
+  "mrr_sort_keys",
 #ifndef DBUG_OFF
   "table_elimination",
 #endif
@@ -329,6 +330,7 @@ static const unsigned int optimizer_switch_names_len[]=
   sizeof("partial_match_rowid_merge") - 1,
   sizeof("partial_match_table_scan") - 1,
   sizeof("subquery_cache") - 1,
+  sizeof("mrr_sort_keys") - 1,
 #ifndef DBUG_OFF
   sizeof("table_elimination") - 1,
 #endif
@@ -415,7 +417,8 @@ static const char *optimizer_switch_str="index_merge=on,index_merge_union=on,"
                                         "semijoin=on,"
                                         "partial_match_rowid_merge=on,"
                                         "partial_match_table_scan=on,"
-                                        "subquery_cache=on"
+                                        "subquery_cache=on,"
+                                        "mrr_sort_keys=on"
 #ifndef DBUG_OFF
                                         ",table_elimination=on";
 #else
diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc
index 96850eb99b8..14730eac8c6 100644
--- a/sql/sql_join_cache.cc
+++ b/sql/sql_join_cache.cc
@@ -651,6 +651,9 @@ int JOIN_CACHE_BKA::init()
 
   use_emb_key= check_emb_key_usage();
 
+  if (use_emb_key)
+    mrr_mode|= HA_MRR_MATERIALIZED_KEYS;
+
   create_remaining_fields(FALSE);
 
   set_constants();
@@ -2631,6 +2634,8 @@ int JOIN_CACHE_BKA_UNIQUE::init()
       data_fields_offset+= copy->length;
   } 
 
+  mrr_mode|= HA_MRR_MATERIALIZED_KEYS;
+
   DBUG_RETURN(rc);
 }
 

From 7391175d7ee296fb95001870b5d56ac9dbbd499e Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 18 Jul 2010 01:05:44 +0400
Subject: [PATCH 07/49] DS-MRR support improvements (MWL#123, MWL#124, MWL#125)
 - Let DS-MRR/CPK take advantage of materialized keys, when possible

---
 sql/multi_range_read.cc | 47 +++++++++++++++++++++++++++--------------
 sql/multi_range_read.h  |  3 +++
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index f6417beb786..91697633ac9 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -312,6 +312,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   uint elem_size;
   Item *pushed_cond= NULL;
   handler *new_h2= 0;
+  THD *thd= current_thd;
   DBUG_ENTER("DsMrr_impl::dsmrr_init");
 
   /*
@@ -332,17 +333,16 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
 
   // psergey2: split the buffer:
   /*
-
   psergey2-note: we can't split the buffer here because we don't know how key
   length. we'll only be able to do it when we've got the first range.
-
-  if ((mrr_flags & HA_MRR_SINGLE_POINT) && 
+  */
+  if ((mode & HA_MRR_SINGLE_POINT) && 
        optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
   {
-    do_sort_keys= TRUE; // will use key buffer to sort keys;
-    bool use_key_pointers= test(mrr_flags & HA_MRR_MATERIALIZED_KEYS);
+    //do_sort_keys= TRUE; // will use key buffer to sort keys;
+    use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS);
   }
-  
+  /*
   do_rowid_fetch= FALSE;
   if (!doing_cpk_scan && !index_only_read)
   {
@@ -409,7 +409,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   if (!h2)
   {
     /* Create a separate handler object to do rnd_pos() calls. */
-    THD *thd= current_thd;
     /*
       ::clone() takes up a lot of stack, especially on 64 bit platforms.
       The constant 5 is an empiric result.
@@ -609,6 +608,8 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
 
 /* 
   my_qsort2-compatible function to compare key tuples 
+
+  If dsmrr->use_key_pointers==FALSE, key1/key2 point at key tuples, else at pointers to them
 */
 
 int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
@@ -617,6 +618,14 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
   TABLE *table= dsmrr->h->table;
   
   KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part;
+  
+  if (dsmrr->use_key_pointers)
+  {
+    /* the buffer stores pointers to keys, get to the keys */
+    key1= *((uchar**)key1);
+    key2= *((uchar**)key2);  // todo is this alignment-safe?
+  }
+
   uchar *key1_end= key1 + dsmrr->cpk_tuple_length;
 
   while (key1 < key1_end)
@@ -664,14 +673,15 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
-    DBUG_ASSERT(!cpk_tuple_length || 
-                cpk_tuple_length == cur_range.start_key.length);
     if (!cpk_tuple_length)
     {
       cpk_tuple_length= cur_range.start_key.length;
+      key_buf_element_size= use_key_pointers ? sizeof(char*) : 
+                                           cpk_tuple_length;
+
       cpk_is_unique_scan= test(table->key_info[h->active_index].key_parts == 
                                my_count_bits(cur_range.start_key.keypart_map));
-      uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
+      uint elem_size= key_buf_element_size + (int)is_mrr_assoc * sizeof(void*);
       mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/elem_size) * elem_size;
       mrr_buf_end= mrr_buf_last;
     }
@@ -679,8 +689,12 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
     //psergey2: if keys are materialized, store pointers, not copy keys
 
     /* Put key, or {key, range_id} pair into the buffer */
-    memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
-    mrr_buf_cur += cpk_tuple_length;
+    if (use_key_pointers)
+      memcpy(mrr_buf_cur, &cur_range.start_key.key, sizeof(char*));
+    else
+      memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
+
+    mrr_buf_cur += key_buf_element_size;
 
     if (is_mrr_assoc)
     {
@@ -692,7 +706,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
   dsmrr_eof= test(res);
 
   /* Sort the buffer contents by rowid */
-  uint elem_size= cpk_tuple_length + (int)is_mrr_assoc * sizeof(void*);
+  uint elem_size= key_buf_element_size + (int)is_mrr_assoc * sizeof(void*);
   uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
   
   my_qsort2(mrr_buf, n_rowids, elem_size, 
@@ -739,7 +753,8 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
       break;
     }
 
-    res= h->index_next_same(table->record[0], mrr_buf_cur, cpk_tuple_length);
+    uchar *lookup_tuple= use_key_pointers? (*((uchar**)mrr_buf_cur)) : mrr_buf_cur;
+    res= h->index_next_same(table->record[0], lookup_tuple, cpk_tuple_length);
 
     if (h->mrr_funcs.skip_index_tuple &&
         h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
@@ -775,8 +790,8 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
     }
     
     /* Ok, got the range. Try making a lookup.  */
-    uchar *lookup_tuple= mrr_buf_cur;
-    mrr_buf_cur += cpk_tuple_length;
+    uchar *lookup_tuple= use_key_pointers? (*((uchar**)mrr_buf_cur)) : mrr_buf_cur;
+    mrr_buf_cur += key_buf_element_size;
     if (is_mrr_assoc)
     {
       memcpy(&cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 9d0e7dc096e..aa33147e2db 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -115,6 +115,7 @@ private:
   uchar *mrr_buf_end;   /* End of the buffer */
 
   uchar *mrr_buf_next_identical;
+  bool use_key_pointers;
 
   bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
 
@@ -129,6 +130,8 @@ private:
 
   /* Length of lookup tuple being used, in bytes */
   uint cpk_tuple_length;
+
+  uint key_buf_element_size;
   /*
     TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
     can get max. one match for each key 

From e25db795bd48e806f10a0baaa8abceb1eb408502 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 8 Aug 2010 11:13:54 +0400
Subject: [PATCH 08/49] DS-MRR, key-ordered retrievals: commit for buildbot

---
 sql/multi_range_read.cc | 695 ++++++++++++++++++++++++++--------------
 sql/multi_range_read.h  | 151 ++++++++-
 2 files changed, 586 insertions(+), 260 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 91697633ac9..35530a24aae 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -287,6 +287,84 @@ scan_it_again:
  * DS-MRR implementation 
  ***************************************************************************/
 
+void SimpleBuffer::write(const uchar *data, size_t bytes)
+{
+  DBUG_ASSERT(have_space_for(bytes));
+  /* Reverse buffer: step write_pos back first, so the copy lands before it */
+  if (direction == -1)
+    write_pos -= bytes;
+
+  memcpy(write_pos, data, bytes);
+  /* Forward buffer: advance write_pos past the bytes just copied */
+  if (direction == 1)
+    write_pos += bytes;
+}
+
+bool SimpleBuffer::have_space_for(size_t bytes)
+{
+  /* Compare sizes: forming write_pos +/- bytes outside the buffer is UB */
+  if (direction == 1)
+    return bytes <= (size_t)(end - write_pos);
+  return bytes <= (size_t)(write_pos - start);
+}
+
+size_t SimpleBuffer::used_size()
+{ /* Bytes between read_pos and write_pos: written but not yet read */
+  return (direction == 1)? write_pos - read_pos : read_pos - write_pos;
+}
+
+uchar *SimpleBuffer::read(size_t bytes)
+{
+  DBUG_ASSERT(have_data(bytes));
+
+  if (direction == 1)
+  {
+    /* Forward buffer: the element starts at the current read position */
+    uchar *element= read_pos;
+    read_pos += bytes;
+    return element;
+  }
+
+  /* Reverse buffer: the element ends at read_pos; step back to its start */
+  read_pos -= bytes;
+  return read_pos;
+}
+
+bool SimpleBuffer::have_data(size_t bytes)
+{
+  ptrdiff_t left= (direction == 1)? write_pos - read_pos : read_pos - write_pos;
+  return left >= (ptrdiff_t)bytes; /* TRUE <=> at least 'bytes' are unread */
+}
+
+void SimpleBuffer::reset_for_writing()
+{
+  /* An empty buffer has both positions at the edge where filling begins */
+  uchar *origin= (direction == 1)? start : end;
+  read_pos= origin;
+  write_pos= origin;
+}
+
+void SimpleBuffer::reset_for_reading()
+{
+/*
+  Currently a no-op. TODO: decide whether this function is needed at all.
+  An earlier draft repositioned a single cursor here:
+  if (direction == 1)
+    pos= start;
+  else
+    pos= end;
+*/
+}
+
+uchar *SimpleBuffer::end_of_space()
+{
+  /* Forward buffer: data ends at write_pos. Reverse: it extends up to end */
+  if (direction == 1)
+    return write_pos;
+  else
+    return end;
+}
+
 /**
   DS-MRR: Initialize and start MRR scan
 
@@ -309,7 +387,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                            void *seq_init_param, uint n_ranges, uint mode,
                            HANDLER_BUFFER *buf)
 {
-  uint elem_size;
   Item *pushed_cond= NULL;
   handler *new_h2= 0;
   THD *thd= current_thd;
@@ -330,69 +407,69 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   }
   use_default_impl= FALSE;
   is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
-
-  // psergey2: split the buffer:
+  
   /*
-  psergey2-note: we can't split the buffer here because we don't know how key
-  length. we'll only be able to do it when we've got the first range.
+    Figure out what steps we'll need to do
   */
+  do_sort_keys= FALSE;
   if ((mode & HA_MRR_SINGLE_POINT) && 
        optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
   {
-    //do_sort_keys= TRUE; // will use key buffer to sort keys;
+    do_sort_keys= TRUE;
     use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS);
   }
-  /*
+
   do_rowid_fetch= FALSE;
-  if (!doing_cpk_scan && !index_only_read)
+  doing_cpk_scan= check_cpk_scan(h->active_index, mode);
+  if (!doing_cpk_scan /* && !index_only_read */)
   {
-    do_rowid_fetch= TRUE; //will use rowid buffer to store/sort rowids, etc
+    /* Will use rowid buffer to store/sort rowids, etc */
+    do_rowid_fetch= TRUE;
   }
+  DBUG_ASSERT(do_sort_keys || do_rowid_fetch);
 
-
-  if (do_sort_keys && do_rowid_fetch)
-  {
-    split buffer space proportionally
-  }
-  else
-  {
-    // give all space to one buffer
-    if (do_sort_keys)
-    {
-      //sort_buffer_start= ...;
-    }
-    else 
-    {
-      DBUG_ASSERT(do_rowid_fetch);
-      //rowid_buffer_start= ...;
-    }
-  }
+  full_buf= buf->buffer;
+  full_buf_end= buf->buffer_end;
+  
+  /* 
+    At start, alloc all of the buffer for rowids. Key sorting code will grab a
+    piece if necessary.
   */
-  mrr_buf= buf->buffer;
-  mrr_buf_end= buf->buffer_end;
+  rowid_buffer.set_buffer_space(full_buf, full_buf_end, 1);
 
   if (is_mrr_assoc)
     status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
-
-  if ((doing_cpk_scan= check_cpk_scan(h->active_index, mode)))
+  
+  /*
+    psergey2-todo: for CPK scans:
+     - use MRR irrespectively of @@mrr_sort_keys setting,
+     - dont do rowid retrieval.
+  */
+  if (do_sort_keys)
   {
     /* It's a DS-MRR/CPK scan */
-    cpk_tuple_length= 0; /* dummy value telling it needs to be inited */
-    cpk_have_range= FALSE;
+    key_tuple_length= 0; /* dummy value telling it needs to be inited */
+    key_buff_elem_size= 0;
+    in_index_range= FALSE;
     h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
     h->mrr_funcs= *seq_funcs;
+    keyno= h->active_index != MAX_KEY? h->active_index : h2->active_index;
     dsmrr_fill_key_buffer();
     
-    if (dsmrr_eof)
-      buf->end_of_used_area= mrr_buf_last;
-    DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */
+    if (dsmrr_eof && !do_rowid_fetch)
+      buf->end_of_used_area= key_buffer.end_of_space();
   }
 
-  /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */
-  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/ elem_size)* elem_size;
-  mrr_buf_end= mrr_buf_last;
+  if (!do_rowid_fetch)
+  {
+    /* 
+      We have the keys and won't need to fetch rowids, as key lookup will be
+      the last operation, done in multi_range_read_next().
+    */
+    DBUG_RETURN(0);
+  }
 
+  rowid_buff_elem_size= h->ref_length + (is_mrr_assoc? sizeof(char*) : 0);
   /*
     psergey2: this is only needed when 
       - doing a rowid-to-row scan
@@ -416,7 +493,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
       DBUG_RETURN(1);
     DBUG_ASSERT(h->active_index != MAX_KEY);
-    uint mrr_keyno= h->active_index;
+    keyno= h->active_index;
 
     /* Create a separate handler object to do rnd_pos() calls. */
     if (!(new_h2= h->clone(thd->mem_root)) || 
@@ -426,7 +503,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
       DBUG_RETURN(1);
     }
 
-    if (mrr_keyno == h->pushed_idx_cond_keyno)
+    if (keyno == h->pushed_idx_cond_keyno)
       pushed_cond= h->pushed_idx_cond;
 
     /*
@@ -443,12 +520,14 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     h2= new_h2; /* Ok, now can put it into h2 */
     table->prepare_for_position();
     h2->extra(HA_EXTRA_KEYREAD);
-  
-    if (h2->ha_index_init(mrr_keyno, FALSE))
+    h2->mrr_funcs= *seq_funcs; //psergey3-todo: sort out where to store
+    h2->mrr_iter= h->mrr_iter;
+
+    if (h2->ha_index_init(keyno, FALSE))
       goto error;
 
     if (pushed_cond)
-      h2->idx_cond_push(mrr_keyno, pushed_cond);
+      h2->idx_cond_push(keyno, pushed_cond);
   }
   else
   {
@@ -467,10 +546,15 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     if (res)
       goto error;
   }
+  
+  if (!do_sort_keys && 
+      h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
+                                         mode, buf))
+  {
+    goto error;
+  }
 
-  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
-                                         mode, buf) ||
-      dsmrr_fill_rowid_buffer())
+  if (dsmrr_fill_rowid_buffer())
   {
     goto error;
   }
@@ -479,7 +563,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     adjust *buf to indicate that the remaining buffer space will not be used.
   */
   if (dsmrr_eof) 
-    buf->end_of_used_area= mrr_buf_last;
+    buf->end_of_used_area= rowid_buffer.end_of_space();
 
   /*
      h->inited == INDEX may occur when 'range checked for each record' is
@@ -526,21 +610,24 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
 
 
 /**
-  DS-MRR: Fill the buffer with rowids and sort it by rowid
+  DS-MRR: Fill and sort the rowid buffer
 
   {This is an internal function of DiskSweep MRR implementation}
+
   Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
   buffer. When the buffer is full or scan is completed, sort the buffer by 
   rowid and return.
   
   The function assumes that rowids buffer is empty when it is invoked. 
 
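+  The {record, range_id} pairs to store are obtained either
+   - from the source range sequence (handler::multi_range_read_next()), or
+   - from the key buffer, using dsmrr_next_from_index() (when keys are
+     sorted).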
+
   dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
   scanning.
 
-  psergey2: this func will 'fill the rowid buffer'. If filling the rowid buffer 
-  requires that key buffer is filled/sorted first, will do that, too.
-  
   @param h  Table handler
 
   @retval 0      OK, the next portion of rowids is in the buffer,
@@ -554,40 +641,32 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
   int res;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_rowid_buffer");
   
-  mrr_buf_cur= mrr_buf;
-  mrr_buf_next_identical= mrr_buf_cur;
-  /*
-    psergey2-todo:
-      - call here fill/sort key buffer, if needed.
+  rowid_buffer.reset_for_writing();
+  identical_rowid_ptr= NULL;
 
-    psergey2-todo: then, get keys either from
-      - multi_range_read_next()
-      - sorted key buffer
-    
-    psergey2-todo: if we're traversing an ordered key sequence,
-     check if next keys are the same as previous.
-     (note that it's easy as ordered sequence allows forward/backward
-     navigation so we don't need to buffer things)
-  */
-  while ((mrr_buf_cur < mrr_buf_end) && 
-         !(res= h2->handler::multi_range_read_next(&range_info)))
+  while (rowid_buffer.have_space_for(rowid_buff_elem_size))
   {
+    if (do_sort_keys)
+      res= dsmrr_next_from_index(&range_info);
+    else 
+      res= h2->handler::multi_range_read_next(&range_info);
+
+    if (res)
+      break;
+    
+
     KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
-    if (h2->mrr_funcs.skip_index_tuple &&
+    if (!do_sort_keys && /* If keys are sorted then this check is already done */
+        h2->mrr_funcs.skip_index_tuple &&
         h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr))
       continue;
-    
 
     /* Put rowid, or {rowid, range_id} pair into the buffer */
     h2->position(table->record[0]);
-    memcpy(mrr_buf_cur, h2->ref, h2->ref_length);
-    mrr_buf_cur += h2->ref_length;
+    rowid_buffer.write(h2->ref, h2->ref_length);
 
     if (is_mrr_assoc)
-    {
-      memcpy(mrr_buf_cur, &range_info, sizeof(void*));
-      mrr_buf_cur += sizeof(void*);
-    }
+      rowid_buffer.write((uchar*)&range_info, sizeof(void*));
   }
 
   if (res && res != HA_ERR_END_OF_FILE)
@@ -596,12 +675,11 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
 
   /* Sort the buffer contents by rowid */
   uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
+  uint n_rowids= rowid_buffer.used_size() / elem_size;
   
-  my_qsort2(mrr_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
-            (void*)h);
-  mrr_buf_last= mrr_buf_cur;
-  mrr_buf_cur=  mrr_buf;
+  my_qsort2(rowid_buffer.used_area(), n_rowids, elem_size, 
+            (qsort2_cmp)rowid_cmp, (void*)h);
+
   DBUG_RETURN(0);
 }
 
@@ -616,8 +694,8 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 {
   DsMrr_impl *dsmrr= (DsMrr_impl*)arg;
   TABLE *table= dsmrr->h->table;
-  
-  KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part;
+  int res;
+  KEY_PART_INFO *part= table->key_info[dsmrr->keyno].key_part;
   
   if (dsmrr->use_key_pointers)
   {
@@ -626,15 +704,31 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
     key2= *((uchar**)key2);  // todo is this alignment-safe?
   }
 
-  uchar *key1_end= key1 + dsmrr->cpk_tuple_length;
+  uchar *key1_end= key1 + dsmrr->key_tuple_length;
 
   while (key1 < key1_end)
   {
     Field* f = part->field;
     int len = part->store_length;
-    int res = f->cmp(key1, key2);
-    if (res)
+    if (part->null_bit)
+    {
+      if (*key1) // key1 == NULL
+      {
+        if (!*key2) // key1(NULL) < key2(notNULL)
+          return -1;
+        goto equals;
+      }
+      else if (*key2) // key1(notNULL) > key2 (NULL)
+        return 1;
+      // Step over NULL byte for f->cmp().
+      key1++;
+      key2++;
+      len--;
+    }
+    
+    if ((res= f->key_cmp(key1, key2)))
       return res;
+equals:
     key1 += len;
     key2 += len;
     part++;
@@ -643,6 +737,68 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 }
 
 
+/*
+  Setup key/rowid buffer sizes based on sample_key
+
+  DESCRIPTION
+    Setup key/rowid buffer sizes based on sample_key and its length.
+
+    This function must be called when all buffer space is empty.
+*/
+
+void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
+{
+  key_tuple_length= sample_key->length;
+  key_tuple_map= sample_key->keypart_map;
+  key_size_in_keybuf= use_key_pointers ? sizeof(char*) : 
+                                       key_tuple_length;
+  key_buff_elem_size= key_size_in_keybuf + 
+                      (int)is_mrr_assoc * sizeof(void*);
+  /* Size of one rowid buffer element: rowid [+ range association pointer] */
+  uint rowid_buf_elem_size= h->ref_length + 
+                            (int)is_mrr_assoc * sizeof(char*);
+  
+  KEY *key_info= &h->table->key_info[keyno];
+  /*
+    Use rec_per_key statistics as a basis to find out how many rowids 
+    we'll get for each key value, and scale the rowid element size by it.
+     TODO: are we guaranteed to get rec_per_key==1 for unique keys?
+     TODO: what should be the default value to use when there is no 
+           statistics? (a zero rec_per_key currently leaves the size as is)
+  */
+  uint parts= my_count_bits(key_tuple_map);
+  ulong rpc;
+  if ((rpc= key_info->rec_per_key[parts - 1]))
+  {
+    rowid_buf_elem_size *= rpc;
+  }
+  /* Split the buffer in proportion to the space one key needs on each side */
+  double fraction_for_rowids=
+    ((double) rowid_buf_elem_size / 
+         ((double)rowid_buf_elem_size + key_buff_elem_size));
+
+  uint bytes_for_rowids= 
+    round(fraction_for_rowids * (full_buf_end - full_buf));
+  
+  uint bytes_for_keys= (full_buf_end - full_buf) - bytes_for_rowids;
+  /* Give the key part at least key_buff_elem_size + 1 bytes, at rowids' cost */
+  if (bytes_for_keys < key_buff_elem_size + 1)
+  {
+    uint add= key_buff_elem_size + 1 - bytes_for_keys;
+    bytes_for_rowids -= add;
+    DBUG_ASSERT(bytes_for_rowids >= 
+                (h->ref_length + (int)is_mrr_assoc * sizeof(char*) + 1));
+  }
+  /* Rowids get the head of the buffer, keys the tail; both have direction 1 */
+  rowid_buffer.set_buffer_space(full_buf, full_buf + bytes_for_rowids, 1);
+  key_buffer.set_buffer_space(full_buf + bytes_for_rowids, full_buf_end, 1);
+  /* Unique index with all of its key parts used in the lookup key */
+  index_ranges_unique= test(key_info->flags & HA_NOSAME && 
+                            key_info->key_parts == 
+                              my_count_bits(sample_key->keypart_map));
+}
+
+
 /*
   DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
   
@@ -650,69 +806,55 @@ int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
     DsMrr_impl::dsmrr_fill_key_buffer()
 
   DESCRIPTION
-    DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+    DS-MRR/CPK: Enumerate the input range (=key) sequence, fill the key buffer
+    with (lookup_key, range_id) pairs and sort.
 
     dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
     we're scanning.
-
-  psergey2-q: can this be used for filling/sorting key buffer in general case?
-   a: yes. 
-  qq: can we push sequence iteration init down into here?
 */
 
 void DsMrr_impl::dsmrr_fill_key_buffer()
 {
-  //psergey2: here, no identicals detection is necessary since we always scan
-  //  the unordered sequence.
   int res;
   KEY_MULTI_RANGE cur_range;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");
 
-  mrr_buf_cur= mrr_buf;
-  while ((mrr_buf_cur < mrr_buf_end) && 
+  // reset the buffer for writing.
+  key_buffer.reset_for_writing();
+
+  while ((key_buffer.have_space_for(key_buff_elem_size)) && 
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
-    if (!cpk_tuple_length)
+    if (!key_tuple_length)
     {
-      cpk_tuple_length= cur_range.start_key.length;
-      key_buf_element_size= use_key_pointers ? sizeof(char*) : 
-                                           cpk_tuple_length;
-
-      cpk_is_unique_scan= test(table->key_info[h->active_index].key_parts == 
-                               my_count_bits(cur_range.start_key.keypart_map));
-      uint elem_size= key_buf_element_size + (int)is_mrr_assoc * sizeof(void*);
-      mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/elem_size) * elem_size;
-      mrr_buf_end= mrr_buf_last;
+      /* This only happens when we've just started filling the buffer */
+      DBUG_ASSERT(key_buffer.used_size() == 0);
+      setup_buffer_sizes(&cur_range.start_key);
     }
 
-    //psergey2: if keys are materialized, store pointers, not copy keys
-
     /* Put key, or {key, range_id} pair into the buffer */
     if (use_key_pointers)
-      memcpy(mrr_buf_cur, &cur_range.start_key.key, sizeof(char*));
+      key_buffer.write((uchar*)&cur_range.start_key.key, sizeof(char*));
     else
-      memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length);
-
-    mrr_buf_cur += key_buf_element_size;
-
+      key_buffer.write(cur_range.start_key.key, key_tuple_length);
+ 
     if (is_mrr_assoc)
-    {
-      memcpy(mrr_buf_cur, &cur_range.ptr, sizeof(void*));
-      mrr_buf_cur += sizeof(void*);
-    }
+      key_buffer.write((uchar*)&cur_range.ptr, sizeof(void*));
   }
 
   dsmrr_eof= test(res);
 
   /* Sort the buffer contents by rowid */
-  uint elem_size= key_buf_element_size + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size;
+  uint key_elem_size= key_size_in_keybuf + (int)is_mrr_assoc * sizeof(void*);
+  uint n_keys= key_buffer.used_size() / key_elem_size;
   
-  my_qsort2(mrr_buf, n_rowids, elem_size, 
+  my_qsort2(key_buffer.used_area(), n_keys, key_elem_size,
             (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
-  mrr_buf_last= mrr_buf_cur;
-  mrr_buf_cur=  mrr_buf;
+  
+  last_identical_key_ptr= NULL;
+  in_identical_keys_range= FALSE;
+
   DBUG_VOID_RETURN;
 }
 
@@ -721,60 +863,105 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
   DS-MRR/CPK: multi_range_read_next() function
 
   DESCRIPTION
-    DsMrr_impl::dsmrr_next_cpk()
+    DsMrr_impl::dsmrr_next_from_index()
       range_info  OUT  identifier of range that the returned record belongs to
 
   DESCRIPTION
-    DS-MRR/CPK: multi_range_read_next() function. 
-    This is similar to DsMrr_impl::dsmrr_next(), the differences are that
-     - we get records with index_read(), not with rnd_pos()
-     - we may get multiple records for one key (=element of the buffer)
-     - unlike dsmrr_fill_rowid_buffer(), dsmrr_fill_key_buffer() never fails.
- 
+  
+  This function walks over key buffer and does index reads, i.e. it produces
+  {current_record, range_id} pairs.
+
+  The function has the same call contract as multi_range_read_next().
+
+  We actually iterate nested sequences:
+  
+  - a disjoint sequence of index ranges
+    - each range has multiple records
+      - each record goes into multiple identical ranges.
+
   RETURN
     0                   OK, next record was successfully read
     HA_ERR_END_OF_FILE  End of records
     Other               Some other error
-
-  psergey2-todo: this should detect identical keys.
 */
 
-int DsMrr_impl::dsmrr_next_cpk(char **range_info)
+int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 {
   int res;
+  uchar *key_in_buf;
+  handler *file= do_rowid_fetch? h2: h;
 
-  while (cpk_have_range)
+  while (in_identical_keys_range)
   {
+//read_and_check:
+    /* Read record/key pointer from the buffer */
+    key_in_buf= identical_key_it.get_next(key_size_in_keybuf);
+    if (is_mrr_assoc)
+      cur_range_info= (char*)identical_key_it.get_next(sizeof(void*));
 
-    if (h->mrr_funcs.skip_record &&
-        h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL))
+    if (key_in_buf == last_identical_key_ptr)
     {
-      cpk_have_range= FALSE;
+      /* We're looking at the last of the identical keys */
+      in_identical_keys_range= FALSE;
+    }
+check_record:
+    if ((h->mrr_funcs.skip_index_tuple &&
+         h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) || 
+        (h->mrr_funcs.skip_record &&
+         h->mrr_funcs.skip_record(h->mrr_iter, *(char**)cur_range_info, NULL)))
+    {
+      continue;
+    }
+    memcpy(range_info_arg, cur_range_info, sizeof(void*));
+
+    return 0;
+  }
+  
+  /* Try returning next record from the current range */
+  while (in_index_range)
+  {
+    res= file->ha_index_next_same(table->record[0], cur_index_tuple, 
+                                  key_tuple_length);
+    
+    if (res)
+    {
+      if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+        return res;  /* Fatal error */
+
+      in_index_range= FALSE; /* no more records here */
       break;
     }
-
-    uchar *lookup_tuple= use_key_pointers? (*((uchar**)mrr_buf_cur)) : mrr_buf_cur;
-    res= h->index_next_same(table->record[0], lookup_tuple, cpk_tuple_length);
-
-    if (h->mrr_funcs.skip_index_tuple &&
-        h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
-      continue;
-
-    if (res != HA_ERR_END_OF_FILE)
+    
+    if (last_identical_key_ptr)
     {
-      if (is_mrr_assoc)
-        memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
-      return res;
+      in_identical_keys_range= TRUE;
+      identical_key_it.init(&key_buffer);
+      cur_range_info= first_identical_range_info;
     }
 
-    /* No more records in this range. Exit this loop and go get another range */
-    cpk_have_range= FALSE;
+    goto check_record;
+   //  goto read_and_check;
   }
 
-  do
+  while(1)
   {
+    DBUG_ASSERT(!in_identical_keys_range && !in_index_range);
+
+    /* Jump over the keys that were handled by identical key processing */
+    if (last_identical_key_ptr)
+    {
+      while (key_buffer.read(key_size_in_keybuf) != last_identical_key_ptr)
+      {
+        if (is_mrr_assoc)
+          key_buffer.read(sizeof(void*));
+      }
+      if (is_mrr_assoc)
+        key_buffer.read(sizeof(void*));
+      last_identical_key_ptr= NULL;
+    }
+
     /* First, make sure we have a range at start of the buffer */
-    if (mrr_buf_cur == mrr_buf_last)
+    if (!key_buffer.have_data(key_buff_elem_size))
     {
       if (dsmrr_eof)
       {
@@ -782,58 +969,56 @@ int DsMrr_impl::dsmrr_next_cpk(char **range_info)
         goto end;
       }
       dsmrr_fill_key_buffer();
-    }
-    if (mrr_buf_cur == mrr_buf_last)
-    {
-      res= HA_ERR_END_OF_FILE;
-      goto end;
+      if (!key_buffer.have_data(key_buff_elem_size))
+      {
+        res= HA_ERR_END_OF_FILE;
+        goto end;
+      }
     }
     
-    /* Ok, got the range. Try making a lookup.  */
-    uchar *lookup_tuple= use_key_pointers? (*((uchar**)mrr_buf_cur)) : mrr_buf_cur;
-    mrr_buf_cur += key_buf_element_size;
+    /* Get the next range to scan*/
+    cur_index_tuple= key_in_buf= key_buffer.read(key_size_in_keybuf);
+    if (use_key_pointers)
+      cur_index_tuple= *((uchar**)cur_index_tuple);
+
     if (is_mrr_assoc)
-    {
-      memcpy(&cpk_saved_range_info, mrr_buf_cur, sizeof(void*));
-      mrr_buf_cur += sizeof(void*) * test(is_mrr_assoc);
-    }
+      cur_range_info= (char*)key_buffer.read(sizeof(void*));
       
-    if (h->mrr_funcs.skip_record &&
-        h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL))
-      continue;
-    
-    res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, 
-                       HA_READ_KEY_EXACT);
-
-    /*
-      Check pushed index condition. Performance-wise, it does not make any
-      sense to put this call here (the above call has already accessed the full
-      record). That's the best I could do, though, because:
-      - ha_innobase doesn't support IndexConditionPushdown on clustered PK
-      - MRR interface doesn't allow the storage engine to refuse a pushed index
-        condition.
-      Having this call here is not fully harmless: EXPLAIN shows "pushed index
-      condition", which is technically true but doesn't bring the benefits that
-      one might expect.
-    */
-    if (h->mrr_funcs.skip_index_tuple &&
-        h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info))
-      continue;
-
-    if (res && res != HA_ERR_END_OF_FILE)
-      goto end;
-
-    if (!res)
+    /* Do index lookup */
+    if ((res= file->ha_index_read_map(table->record[0], cur_index_tuple, 
+                                      key_tuple_map, HA_READ_KEY_EXACT)))
     {
-      memcpy(range_info, &cpk_saved_range_info, sizeof(void*));
-      /* 
-        Attempt reading more rows from this range only if there actually can
-        be multiple matches:
-       */
-      cpk_have_range= !cpk_is_unique_scan;
-      break;
+      if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+        return res;
+      continue; /* to next key and make another lookup */
     }
-  } while (true);
+
+    /* Check if subsequent keys in the key buffer are the same as this one */
+    {
+      uchar *ptr;
+      identical_key_it.init(&key_buffer);
+      last_identical_key_ptr= NULL;
+      while ((ptr= identical_key_it.get_next(key_size_in_keybuf)))
+      {
+        if (is_mrr_assoc)
+          identical_key_it.get_next(sizeof(void*));
+
+        if (key_tuple_cmp(this, key_in_buf, ptr))
+          break;
+
+        last_identical_key_ptr= ptr;
+      }
+      if (last_identical_key_ptr)
+      {
+        in_identical_keys_range= TRUE;
+        identical_key_it.init(&key_buffer);
+        first_identical_range_info= cur_range_info;
+      }
+    }
+
+    in_index_range= !index_ranges_unique;
+    goto check_record;
+  }
  
 end:
   return res;
@@ -842,9 +1027,6 @@ end:
 
 /**
   DS-MRR implementation: multi_range_read_next() function
-
-  psergey2-todo: put identical rowid detection code here
-    it should always work because rowid sequences are always sorted
 */
 
 int DsMrr_impl::dsmrr_next(char **range_info)
@@ -852,89 +1034,108 @@ int DsMrr_impl::dsmrr_next(char **range_info)
   int res;
   uchar *cur_range_info= 0;
   uchar *rowid;
+  uchar *range_id;
 
   if (use_default_impl)
     return h->handler::multi_range_read_next(range_info);
 
-  if (doing_cpk_scan)
-    return dsmrr_next_cpk(range_info);
+  if (!do_rowid_fetch)
+    return dsmrr_next_from_index(range_info);
   
-  if (mrr_buf_next_identical != mrr_buf_cur)
+  while (identical_rowid_ptr)
   {
     /*
-      There are multiple rowids. Return the record again, now with different
-      range_id 
+      Current record (the one we've returned in previous call) was obtained
+      from a rowid that matched multiple range_ids. Return this record again,
+      with next matching range_id.
     */
-    do 
+    rowid= rowid_buffer.read(h->ref_length);
+    if (is_mrr_assoc)
     {
-      if (is_mrr_assoc)
-        memcpy(range_info, mrr_buf_next_identical + h->ref_length, sizeof(uchar*));
-    } while (!h2->mrr_funcs.skip_record ||
-             !h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) range_info, rowid));
+      uchar *range_ptr= rowid_buffer.read(sizeof(uchar*));
+      memcpy(range_info, range_ptr, sizeof(uchar*));
+    }
 
-    mrr_buf_next_identical += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
-    return 0;
+    if (rowid == identical_rowid_ptr)
+    {
+      identical_rowid_ptr= NULL; /* reached the last of identical rowids */
+    }
+
+    if (!h2->mrr_funcs.skip_record ||
+        !h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) *range_info, rowid))
+    {
+      return 0;
+    }
   }
 
-  do
+  while (1)
   {
-    if (mrr_buf_cur == mrr_buf_last)
+    if (!rowid_buffer.have_data(1))
     {
       if (dsmrr_eof)
-      {
-        res= HA_ERR_END_OF_FILE;
-        goto end;
-      }
-      res= dsmrr_fill_rowid_buffer();
-      if (res)
-        goto end;
+        return HA_ERR_END_OF_FILE;
+
+      if (do_sort_keys && key_buffer.used_size() == 0)
+        dsmrr_fill_key_buffer();
+
+      if ((res= dsmrr_fill_rowid_buffer()))
+        return res;
     }
    
-    /* return eof if there are no rowids in the buffer after re-fill attempt */
-    if (mrr_buf_cur == mrr_buf_last)
-    {
-      res= HA_ERR_END_OF_FILE;
-      goto end;
-    }
-    rowid= mrr_buf_cur;
+    /* Return eof if there are no rowids in the buffer after re-fill attempt */
+    if (!rowid_buffer.have_data(1))
+      return HA_ERR_END_OF_FILE;
+
+    rowid= rowid_buffer.read(h->ref_length);
+    identical_rowid_ptr= NULL;
 
     if (is_mrr_assoc)
-      memcpy(&cur_range_info, mrr_buf_cur + h->ref_length, sizeof(uchar**));
+    {
+      range_id= rowid_buffer.read(sizeof(uchar*));
+      memcpy(&cur_range_info, range_id, sizeof(uchar*));
+      memcpy(range_info, range_id, sizeof(uchar*));
+    }
     
-    size_t element_size= h->ref_length + sizeof(void*) * test(is_mrr_assoc);
-    mrr_buf_cur += element_size;
-    mrr_buf_next_identical= mrr_buf_cur;
-
+    //psergey2-note: the below isn't right- we won't want to skip over this 
+    // rowid because this (rowid, range_id) pair has nothing.. the next 
+    // identical rowids might have something.. (but we set identicals later,
+    // dont we?)
     if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
       continue;
+
     res= h->ha_rnd_pos(table->record[0], rowid);
 
     if (res == HA_ERR_RECORD_DELETED)
       continue;
     
-    if (0)//(!res)
+    /* 
+      Check if subsequent buffer elements have the same rowid value as this
+      one. If yes, remember this fact so that we don't make any more rnd_pos()
+      calls with this value.
+    */
+    if (!res)
     {
       /* 
         Note: this implies that SQL layer doesn't touch table->record[0]
         between calls.
       */
-      uchar *current_el= mrr_buf_cur - element_size;
-      while (mrr_buf_cur != mrr_buf_last && 
-             !h2->cmp_ref(current_el, mrr_buf_cur))
+      uchar *ptr;
+      SimpleBuffer::PeekIterator identical_rowid_it;
+      identical_rowid_it.init(&rowid_buffer);
+      while ((ptr= identical_rowid_it.get_next(h->ref_length)))
       {
-        mrr_buf_cur += element_size;
+        if (is_mrr_assoc)
+          identical_rowid_it.get_next(sizeof(void*));
+
+        if (h2->cmp_ref(rowid, ptr))
+          break;
+        identical_rowid_ptr= ptr;
       }
     }
-    break;
-
-  } while (true);
- 
-  if (is_mrr_assoc)
-  {
-    memcpy(range_info, rowid + h->ref_length, sizeof(void*));
+    return 0;
   }
-end:
+
   return res;
 }
 
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index aa33147e2db..39d23649c56 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -37,6 +37,96 @@
 */
 
 
+/*
+  A simple memory buffer for reading and writing.
+
+  When writing, there is no user-visible "current" position, although
+  internally write_pos points to just after the end of the used area (or at
+  the start of it for a reverse buffer).
+
+  When reading, read_pos points at the start (for a reverse buffer, at the
+  end) of the element that will be read next.
+   ^^ why end for reverse? it's more logical to point at start 
+
+  One can peek at the elements behind that one by using PeekIterator.
+
+  TODO: will the reverse buffer store {tuple; rowid} or {rowid; tuple} pairs?
+    (why does it matter??? Read and write in the same order and then it
+    shouldn't matter.)
+*/
+
+class SimpleBuffer
+{
+  uchar *start;      /* Start of the buffer space */
+  uchar *end;        /* End of the buffer space */
+  uchar *read_pos;   /* Where the next read() takes its element from */
+  uchar *write_pos;  /* Where the next write() puts its data */
+  
+  /*
+     1 <=> buffer grows/is filled/is read  from start to end
+    -1 <=> everything is done from end to start instead.
+  */
+  int direction;
+public:
+  /* Write-mode functions */
+  void reset_for_writing();
+  void write(const uchar *data, size_t bytes);
+  bool have_space_for(size_t bytes);
+
+  uchar *used_area() { return (direction == 1)? read_pos : write_pos; }
+  size_t used_size();
+
+  /* Read-mode functions */
+  void reset_for_reading();
+
+  uchar *read(size_t bytes);
+  bool have_data(size_t bytes);
+  uchar *end_of_space();
+
+  /* Control functions */
+  void set_buffer_space(uchar *start_arg, uchar *end_arg, int direction_arg) 
+  {
+    start= start_arg;
+    end= end_arg;
+    direction= direction_arg;
+    reset_for_writing();
+  }
+
+  friend class PeekIterator;
+  class PeekIterator
+  {
+    // if direction==1 : pointer to what to return next
+    // if direction==-1: pointer to the end of what is to be returned next
+    uchar *pos;
+    SimpleBuffer *sb;
+  public:
+    void init(SimpleBuffer *sb_arg)
+    {
+      sb= sb_arg;
+      pos= sb->read_pos;
+    }
+    /* Return the next nbytes-long chunk and advance over it, or NULL if none */
+    uchar *get_next(size_t nbytes)
+    {
+      if (sb->direction == 1)
+      {
+        if (pos + nbytes > sb->write_pos)
+          return NULL;
+        uchar *res= pos;
+        pos += nbytes;
+        return res;
+      }
+      else
+      {
+        if (pos - nbytes < sb->write_pos) /* chunk may start at write_pos */
+          return NULL;
+        pos -= nbytes;
+        return pos;
+      }
+    }
+  };
+};
+
 /*
   DS-MRR implementation for one table. Create/use one object of this class for
   each ha_{myisam/innobase/etc} object. That object will be further referred to
@@ -73,6 +163,8 @@
        scanning.
 */
 
+
+
 class DsMrr_impl
 {
 public:
@@ -108,14 +200,27 @@ private:
   /* Secondary handler object.  It is used for scanning the index */
   handler *h2;
 
-  /* Buffer to store rowids, or (rowid, range_id) pairs */
-  uchar *mrr_buf;
-  uchar *mrr_buf_cur;   /* Current position when reading/writing */
-  uchar *mrr_buf_last;  /* When reading: end of used buffer space */
-  uchar *mrr_buf_end;   /* End of the buffer */
+  uchar *full_buf;
+  uchar *full_buf_end;
 
-  uchar *mrr_buf_next_identical;
+  /* Buffer to store rowids, or (rowid, range_id) pairs */
+  SimpleBuffer rowid_buffer;
+  
+  uchar *identical_rowid_ptr;
+  
+  /* Identical keys */
+  bool in_identical_keys_range;
+  uchar *last_identical_key_ptr;
+  SimpleBuffer::PeekIterator identical_key_it;
+
+  SimpleBuffer key_buffer;
+  
+  uint keyno;
+
+  /* Execution control */
+  bool do_sort_keys;
   bool use_key_pointers;
+  bool do_rowid_fetch;
 
   bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
 
@@ -129,18 +234,33 @@ private:
   /** DS-MRR/CPK variables start */
 
   /* Length of lookup tuple being used, in bytes */
-  uint cpk_tuple_length;
+  uint key_tuple_length;
+  key_part_map key_tuple_map; 
+  /*
+    This is 
+      = key_tuple_length   if we copy keys to buffer
+      = sizeof(void*)      if we're using pointers to materialized keys.
+  */
+  uint key_size_in_keybuf;
+  
+  /* = key_size_in_keybuf [ + sizeof(range_assoc_info) ] */
+  uint key_buff_elem_size;
+  
+  /* = h->ref_length  [ + sizeof(range_assoc_info) ] */
+  uint rowid_buff_elem_size;
 
-  uint key_buf_element_size;
   /*
     TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
     can get max. one match for each key 
   */
-  bool cpk_is_unique_scan;
+  bool index_ranges_unique;
   /* TRUE<=> we're in a middle of enumerating records from a range */ 
-  bool cpk_have_range;
-  /* Valid if cpk_have_range==TRUE: range_id of the range we're enumerating */
-  char *cpk_saved_range_info;
+  bool in_index_range;
+  uchar *cur_index_tuple;
+  /* if in_index_range==TRUE: range_id of the range we're enumerating */
+  char *cur_range_info;
+
+  char *first_identical_range_info;
 
   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                        COST_VECT *cost);
@@ -150,6 +270,11 @@ private:
   static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
   int dsmrr_fill_rowid_buffer();
   void dsmrr_fill_key_buffer();
-  int dsmrr_next_cpk(char **range_info);
+  int dsmrr_next_from_index(char **range_info);
+
+  void setup_buffer_sizes(key_range *sample_key);
+
+  static range_seq_t key_buf_seq_init(void *init_param, uint n_ranges, uint flags);
+  static uint key_buf_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
 };
 

From 758b68a9807280d96da12c1dfbcfe153947fc6cb Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 8 Aug 2010 12:30:48 +0400
Subject: [PATCH 09/49] Fix valgrind failure: don't access key_buffer if we
 haven't set it up yet.

---
 sql/multi_range_read.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 35530a24aae..22f8133f309 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -822,7 +822,8 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
   // reset the buffer for writing.
   key_buffer.reset_for_writing();
 
-  while ((key_buffer.have_space_for(key_buff_elem_size)) && 
+  while ((key_tuple_length == 0 || 
+          key_buffer.have_space_for(key_buff_elem_size)) && 
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);

From a2e29a83aacead6ac082a95acd44a8b2ad2f141d Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 8 Aug 2010 15:44:32 +0400
Subject: [PATCH 10/49] We can't check that assertion at the first iteration

---
 sql/multi_range_read.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 22f8133f309..ab833f5fdc9 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -830,7 +830,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
     if (!key_tuple_length)
     {
       /* This only happens when we've just started filling the buffer */
-      DBUG_ASSERT(key_buffer.used_size() == 0);
+      //DBUG_ASSERT(key_buffer.used_size() == 0);
       setup_buffer_sizes(&cur_range.start_key);
     }
 

From 57c17e18af47c7cd200a604410f6389d0ad01d52 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 9 Aug 2010 00:38:42 +0400
Subject: [PATCH 11/49] Fix valgrind failure

---
 sql/multi_range_read.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index ab833f5fdc9..e233b45792b 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -820,7 +820,8 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");
 
   // reset the buffer for writing.
-  key_buffer.reset_for_writing();
+  if (key_tuple_length)
+    key_buffer.reset_for_writing();
 
   while ((key_tuple_length == 0 || 
           key_buffer.have_space_for(key_buff_elem_size)) && 

From 8dc81f47efc6d11c664edc0afa2455ac11736522 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Wed, 11 Aug 2010 14:54:34 +0400
Subject: [PATCH 12/49] Use reverse/backwards buffer for keys (now works) -
 don't allocate space for rowid buffer when we don't really need it. - fix
 buffer iterator

---
 mysql-test/r/join_outer_jcl6.result |  8 ++++----
 sql/multi_range_read.cc             | 23 ++++++++++++++++++++---
 sql/multi_range_read.h              |  2 +-
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/mysql-test/r/join_outer_jcl6.result b/mysql-test/r/join_outer_jcl6.result
index 854fc725845..624f94438ba 100644
--- a/mysql-test/r/join_outer_jcl6.result
+++ b/mysql-test/r/join_outer_jcl6.result
@@ -352,14 +352,14 @@ Thimble Smith	Happy	3	3
 Lilliana Angelovska	NULL	NULL	NULL
 select t1.name, t2.name, t2.id,t3.id from t1 right join t2 on (t1.id = t2.owner) right join t1 as t3 on t3.id=t2.owner;
 name	name	id	id
-Antonio Paz	El Gato	1	1
 Antonio Paz	Perrito	2	1
+Antonio Paz	El Gato	1	1
 Thimble Smith	Happy	3	3
 NULL	NULL	NULL	2
 select t1.name, t2.name, t2.id, t2.owner, t3.id from t1 left join t2 on (t1.id = t2.owner) right join t1 as t3 on t3.id=t2.owner;
 name	name	id	owner	id
-Antonio Paz	El Gato	1	1	1
 Antonio Paz	Perrito	2	1	1
+Antonio Paz	El Gato	1	1	1
 Thimble Smith	Happy	3	3	3
 NULL	NULL	NULL	NULL	2
 drop table t1,t2;
@@ -413,9 +413,9 @@ insert into t2 values (1, 2, 3),(2, 2, 8), (4,3,9),(3,2,10);
 select t1.*, t2.* from t1 left join t2 on t1.n = t2.n and
 t1.m = t2.m where t1.n = 1;
 n	m	o	n	m	o
-1	2	11	1	2	3
-1	2	7	1	2	3
 1	2	9	1	2	3
+1	2	7	1	2	3
+1	2	11	1	2	3
 1	3	9	NULL	NULL	NULL
 select t1.*, t2.* from t1 left join t2 on t1.n = t2.n and
 t1.m = t2.m where t1.n = 1 order by t1.o;
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index e233b45792b..faf70a97aad 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -420,7 +420,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   }
 
   do_rowid_fetch= FALSE;
-  doing_cpk_scan= check_cpk_scan(h->active_index, mode);
+  doing_cpk_scan= check_cpk_scan(h->inited == handler::INDEX? 
+                                 h->active_index: h2->active_index, mode);
   if (!doing_cpk_scan /* && !index_only_read */)
   {
     /* Will use rowid buffer to store/sort rowids, etc */
@@ -754,7 +755,21 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
                                        key_tuple_length;
   key_buff_elem_size= key_size_in_keybuf + 
                       (int)is_mrr_assoc * sizeof(void*);
+  
+  if (!do_rowid_fetch)
+  {
+    /* Give all space to key buffer. */
+    key_buffer.set_buffer_space(full_buf, full_buf_end, 1);
 
+    /* Just in case, tell rowid buffer that it has zero size: */
+    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end, 1);
+    return;
+  }
+  
+  /* 
+    Ok if we got here we need to allocate one part of the buffer 
+    for keys and another part for rowids.
+  */
   uint rowid_buf_elem_size= h->ref_length + 
                             (int)is_mrr_assoc * sizeof(char*);
   
@@ -790,9 +805,11 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
                 (h->ref_length + (int)is_mrr_assoc * sizeof(char*) + 1));
   }
 
+  //rowid_buffer.set_buffer_space(full_buf, full_buf + bytes_for_rowids, 1);
+  //key_buffer.set_buffer_space(full_buf + bytes_for_rowids, full_buf_end, 1);
   rowid_buffer.set_buffer_space(full_buf, full_buf + bytes_for_rowids, 1);
-  key_buffer.set_buffer_space(full_buf + bytes_for_rowids, full_buf_end, 1);
-
+  key_buffer.set_buffer_space(full_buf + bytes_for_rowids, full_buf_end, -1);
+  
   index_ranges_unique= test(key_info->flags & HA_NOSAME && 
                             key_info->key_parts == 
                               my_count_bits(sample_key->keypart_map));
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 39d23649c56..5b1fb991aee 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -118,7 +118,7 @@ public:
       }
       else
       {
-        if (pos - nbytes <= sb->write_pos)
+        if (pos - nbytes < sb->write_pos)
           return NULL;
         pos -= nbytes;
         return pos;

From 8d07c16ad91190e812907723d5dcaee0b6511e70 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Thu, 12 Aug 2010 21:18:41 +0400
Subject: [PATCH 13/49] Do dynamic buffer growing/shrinking.

---
 sql/multi_range_read.cc | 39 ++++++++++++++++++++++++++++-------
 sql/multi_range_read.h  | 45 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index faf70a97aad..c1f06594e70 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -792,7 +792,7 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
     ((double) rowid_buf_elem_size / 
          ((double)rowid_buf_elem_size + key_buff_elem_size));
 
-  uint bytes_for_rowids= 
+  size_t bytes_for_rowids= 
     round(fraction_for_rowids * (full_buf_end - full_buf));
   
   uint bytes_for_keys= (full_buf_end - full_buf) - bytes_for_rowids;
@@ -805,10 +805,10 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
                 (h->ref_length + (int)is_mrr_assoc * sizeof(char*) + 1));
   }
 
-  //rowid_buffer.set_buffer_space(full_buf, full_buf + bytes_for_rowids, 1);
-  //key_buffer.set_buffer_space(full_buf + bytes_for_rowids, full_buf_end, 1);
-  rowid_buffer.set_buffer_space(full_buf, full_buf + bytes_for_rowids, 1);
-  key_buffer.set_buffer_space(full_buf + bytes_for_rowids, full_buf_end, -1);
+  rowid_buffer_end= full_buf + bytes_for_rowids;
+  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 1);
+  key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1);
+
   
   index_ranges_unique= test(key_info->flags & HA_NOSAME && 
                             key_info->key_parts == 
@@ -838,7 +838,15 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
 
   // reset the buffer for writing.
   if (key_tuple_length)
+  {
+    if (do_rowid_fetch)
+    {
+      /* Restore original buffer sizes */
+      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 1);
+      key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1);
+    }
     key_buffer.reset_for_writing();
+  }
 
   while ((key_tuple_length == 0 || 
           key_buffer.have_space_for(key_buff_elem_size)) && 
@@ -912,7 +920,6 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 
   while (in_identical_keys_range)
   {
-//read_and_check:
     /* Read record/key pointer from the buffer */
     key_in_buf= identical_key_it.get_next(key_size_in_keybuf);
     if (is_mrr_assoc)
@@ -980,6 +987,11 @@ check_record:
     }
 
     /* First, make sure we have a range at start of the buffer */
+
+    //psergey-todo: why would we re-fill it here in the case when
+    // we're doing rowid retrieval?
+    // - need to check if this is really happening.
+
     if (!key_buffer.have_data(key_buff_elem_size))
     {
       if (dsmrr_eof)
@@ -995,6 +1007,18 @@ check_record:
       }
     }
     
+    if (do_rowid_fetch)
+    {
+      /*
+        At this point we're not using anything beyond what we've read from key
+        buffer. Shrik the key buffer and grow the rowid buffer.
+      */
+      uchar *unused_start;
+      uchar *unused_end;
+      key_buffer.remove_unused_space(&unused_start, &unused_end);
+      rowid_buffer.grow(unused_start, unused_end);
+    }
+
     /* Get the next range to scan*/
     cur_index_tuple= key_in_buf= key_buffer.read(key_size_in_keybuf);
     if (use_key_pointers)
@@ -1002,7 +1026,8 @@ check_record:
 
     if (is_mrr_assoc)
       cur_range_info= (char*)key_buffer.read(sizeof(void*));
-      
+    
+
     /* Do index lookup */
     if ((res= file->ha_index_read_map(table->record[0], cur_index_tuple, 
                                       key_tuple_map, HA_READ_KEY_EXACT)))
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 5b1fb991aee..e49b1ab914d 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -91,6 +91,46 @@ public:
     direction= direction_arg;
     reset_for_writing();
   }
+  
+  /*
+    Stop using and return the unneeded space (the part that we have written
+    to and have already read from).
+  */
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    if (direction == 1)
+    {
+      *unused_start= start;
+      *unused_end= read_pos;
+    }
+    else
+    {
+      *unused_start=read_pos;
+      *unused_end=end;
+    }
+  }
+
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    /*
+      Passed memory area can be meaningfully used for growing the buffer if:
+      - it is adjacent to buffer space we're using
+      - it is on the end towards which we grow.
+    */
+    if (direction == 1 && end == unused_start)
+    {
+      end= unused_end;
+    }
+    else if (direction == -1 && start == unused_end)
+    {
+      start= unused_start;
+    }
+    else
+      DBUG_ASSERT(0); /* Attempt to grow buffer in wrong direction */
+  }
+  
+  /* */
+  void grow();
 
   friend class PeekIterator;
   class PeekIterator
@@ -202,6 +242,9 @@ private:
 
   uchar *full_buf;
   uchar *full_buf_end;
+  
+  /* Valid when using both rowid and key buffer: the original bound between them */
+  uchar *rowid_buffer_end;
 
   /* Buffer to store rowids, or (rowid, range_id) pairs */
   SimpleBuffer rowid_buffer;
@@ -248,7 +291,7 @@ private:
   
   /* = h->ref_length  [ + sizeof(range_assoc_info) ] */
   uint rowid_buff_elem_size;
-
+  
   /*
     TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
     can get max. one match for each key 

From d098596ba5466b02823dc2431b632a43a077c2d5 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Thu, 12 Aug 2010 23:59:29 +0400
Subject: [PATCH 14/49] Fix a number of problems with reverse buffer use

---
 sql/multi_range_read.cc | 11 ++++++++---
 sql/multi_range_read.h  | 15 +++++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index c1f06594e70..212f04dc766 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -644,6 +644,8 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
   
   rowid_buffer.reset_for_writing();
   identical_rowid_ptr= NULL;
+  if (do_sort_keys)
+    key_buffer.flip();
 
   while (rowid_buffer.have_space_for(rowid_buff_elem_size))
   {
@@ -860,13 +862,16 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
       setup_buffer_sizes(&cur_range.start_key);
     }
 
+    if (key_buffer.is_reverse() && is_mrr_assoc)
+      key_buffer.write((uchar*)&cur_range.ptr, sizeof(void*));
+
     /* Put key, or {key, range_id} pair into the buffer */
     if (use_key_pointers)
       key_buffer.write((uchar*)&cur_range.start_key.key, sizeof(char*));
     else
       key_buffer.write(cur_range.start_key.key, key_tuple_length);
  
-    if (is_mrr_assoc)
+    if (!key_buffer.is_reverse() && is_mrr_assoc)
       key_buffer.write((uchar*)&cur_range.ptr, sizeof(void*));
   }
 
@@ -966,7 +971,6 @@ check_record:
     }
 
     goto check_record;
-   //  goto read_and_check;
   }
 
   while(1)
@@ -999,7 +1003,8 @@ check_record:
         res= HA_ERR_END_OF_FILE;
         goto end;
       }
-      dsmrr_fill_key_buffer();
+      if (!do_rowid_fetch)
+        dsmrr_fill_key_buffer();
       if (!key_buffer.have_data(key_buff_elem_size))
       {
         res= HA_ERR_END_OF_FILE;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index e49b1ab914d..1589e65b49f 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -102,14 +102,25 @@ public:
     {
       *unused_start= start;
       *unused_end= read_pos;
+      start= read_pos;
     }
     else
     {
-      *unused_start=read_pos;
-      *unused_end=end;
+      *unused_start= read_pos;
+      *unused_end= end;
+      end= read_pos;
     }
   }
 
+  void flip()
+  {
+    uchar *tmp= read_pos;
+    read_pos= write_pos;
+    write_pos= tmp;
+    direction= -direction;
+  }
+  bool is_reverse() { return direction == -1; }
+
   void grow(uchar *unused_start, uchar *unused_end)
   {
     /*

From c964cb1b626d9ff1c995b42fde406342aa35547a Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sat, 14 Aug 2010 18:56:37 +0400
Subject: [PATCH 15/49] key/rowid buffer overflow fixes for various tricky
 cases.

---
 mysql-test/r/join_nested_jcl6.result |   6 +-
 sql/multi_range_read.cc              | 108 ++++++++++++++++-----------
 sql/multi_range_read.h               |   8 ++
 3 files changed, 77 insertions(+), 45 deletions(-)

diff --git a/mysql-test/r/join_nested_jcl6.result b/mysql-test/r/join_nested_jcl6.result
index 0b83bd7cd6e..9683c7c854a 100644
--- a/mysql-test/r/join_nested_jcl6.result
+++ b/mysql-test/r/join_nested_jcl6.result
@@ -865,12 +865,12 @@ LEFT JOIN
 (t1,t2)
 ON t3.a=1 AND t3.b=t2.b AND t2.b=t4.b;
 a	b	a	b	a	b
-4	2	1	2	3	2
 4	2	1	2	4	2
 4	2	1	2	3	2
 4	2	1	2	4	2
 4	2	1	2	3	2
 4	2	1	2	4	2
+4	2	1	2	3	2
 NULL	NULL	2	2	3	2
 NULL	NULL	2	2	4	2
 EXPLAIN EXTENDED
@@ -1105,8 +1105,8 @@ t0.b=t1.b AND
 (t8.b=t9.b OR t8.c IS NULL) AND
 (t9.a=1);
 a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b
-1	2	3	2	4	2	1	2	3	2	2	2	6	2	2	2	0	2	1	2
 1	2	3	2	4	2	1	2	4	2	2	2	6	2	2	2	0	2	1	2
+1	2	3	2	4	2	1	2	3	2	2	2	6	2	2	2	0	2	1	2
 1	2	3	2	4	2	1	2	3	2	3	1	6	2	1	1	NULL	NULL	1	1
 1	2	3	2	4	2	1	2	4	2	3	1	6	2	1	1	NULL	NULL	1	1
 1	2	3	2	4	2	1	2	3	2	3	1	6	2	1	1	NULL	NULL	1	2
@@ -1785,8 +1785,8 @@ ON t7.b=t8.b AND t6.b < 10
 ON t6.b >= 2 AND t5.b=t7.b AND
 (t8.a > 0 OR t8.c IS NULL);
 a	b	a	b	a	b	a	b
-2	2	1	2	2	2	1	2
 2	2	3	2	2	2	1	2
+2	2	1	2	2	2	1	2
 1	1	1	2	1	1	NULL	NULL
 1	1	3	2	1	1	NULL	NULL
 3	3	NULL	NULL	NULL	NULL	NULL	NULL
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 212f04dc766..6cb9ebfee12 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -563,8 +563,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     If the above call has scanned through all intervals in *seq, then
     adjust *buf to indicate that the remaining buffer space will not be used.
   */
-  if (dsmrr_eof) 
-    buf->end_of_used_area= rowid_buffer.end_of_space();
+//  if (dsmrr_eof) 
+//    buf->end_of_used_area= rowid_buffer.end_of_space();
 
   /*
      h->inited == INDEX may occur when 'range checked for each record' is
@@ -619,21 +619,18 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
   buffer. When the buffer is full or scan is completed, sort the buffer by 
   rowid and return.
   
-  The function assumes that rowids buffer is empty when it is invoked. 
-
-  New2:
-    we will need to scan either 
-     - the source sequence getting records
-     - use dsmrr_next_from_index..
-
   dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
-  scanning.
+  scanning. This function never returns HA_ERR_END_OF_FILE.
+
+  post-condition:
+   rowid buffer is not empty, or key source is exhausted.
 
   @param h  Table handler
 
   @retval 0      OK, the next portion of rowids is in the buffer,
                  properly ordered
   @retval other  Error
+  
 */
 
 int DsMrr_impl::dsmrr_fill_rowid_buffer()
@@ -642,9 +639,11 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
   int res;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_rowid_buffer");
   
+  DBUG_ASSERT(rowid_buffer.is_empty());
   rowid_buffer.reset_for_writing();
   identical_rowid_ptr= NULL;
-  if (do_sort_keys)
+
+  if (key_buffer.is_reverse())
     key_buffer.flip();
 
   while (rowid_buffer.have_space_for(rowid_buff_elem_size))
@@ -656,7 +655,6 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
 
     if (res)
       break;
-    
 
     KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
     if (!do_sort_keys && /* If keys are sorted then this check is already done */
@@ -674,7 +672,9 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
 
   if (res && res != HA_ERR_END_OF_FILE)
     DBUG_RETURN(res); 
-  dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
+
+  if (!do_sort_keys)
+    dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
 
   /* Sort the buffer contents by rowid */
   uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
@@ -830,6 +830,10 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
 
     dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
     we're scanning.
+
+  post-condition:
+   - key buffer is non-empty, or
+   - key buffer is empty and source range sequence is exhausted
 */
 
 void DsMrr_impl::dsmrr_fill_key_buffer()
@@ -838,12 +842,16 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
   KEY_MULTI_RANGE cur_range;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");
 
-  // reset the buffer for writing.
+  DBUG_ASSERT(!key_tuple_length || key_buffer.is_empty());
+
   if (key_tuple_length)
   {
-    if (do_rowid_fetch)
+    if (do_rowid_fetch && rowid_buffer.is_empty())
     {
-      /* Restore original buffer sizes */
+      /*
+        We're using two buffers and both of them are empty now. Restore the
+        original sizes
+      */
       rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 1);
       key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1);
     }
@@ -858,7 +866,6 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
     if (!key_tuple_length)
     {
       /* This only happens when we've just started filling the buffer */
-      //DBUG_ASSERT(key_buffer.used_size() == 0);
       setup_buffer_sizes(&cur_range.start_key);
     }
 
@@ -991,21 +998,21 @@ check_record:
     }
 
     /* First, make sure we have a range at start of the buffer */
-
-    //psergey-todo: why would we re-fill it here in the case when
-    // we're doing rowid retrieval?
-    // - need to check if this is really happening.
-
-    if (!key_buffer.have_data(key_buff_elem_size))
+    if (key_buffer.is_empty())
     {
       if (dsmrr_eof)
       {
         res= HA_ERR_END_OF_FILE;
         goto end;
       }
+      /*
+        When rowid fetching is used, it controls all buffer refills. When we're
+        on our own, try refilling our buffer.
+      */
       if (!do_rowid_fetch)
         dsmrr_fill_key_buffer();
-      if (!key_buffer.have_data(key_buff_elem_size))
+
+      if (key_buffer.is_empty())
       {
         res= HA_ERR_END_OF_FILE;
         goto end;
@@ -1015,16 +1022,16 @@ check_record:
     if (do_rowid_fetch)
     {
       /*
-        At this point we're not using anything beyond what we've read from key
-        buffer. Shrik the key buffer and grow the rowid buffer.
+        At this point we're not using anything that we've read from key
+        buffer. Cut off unused key buffer space and give it to the rowid
+        buffer.
       */
-      uchar *unused_start;
-      uchar *unused_end;
+      uchar *unused_start, *unused_end;
       key_buffer.remove_unused_space(&unused_start, &unused_end);
       rowid_buffer.grow(unused_start, unused_end);
     }
 
-    /* Get the next range to scan*/
+    /* Get the next range to scan */
     cur_index_tuple= key_in_buf= key_buffer.read(key_size_in_keybuf);
     if (use_key_pointers)
       cur_index_tuple= *((uchar**)cur_index_tuple);
@@ -1119,20 +1126,41 @@ int DsMrr_impl::dsmrr_next(char **range_info)
 
   while (1)
   {
-    if (!rowid_buffer.have_data(1))
+    if (rowid_buffer.is_empty())
     {
-      if (dsmrr_eof)
-        return HA_ERR_END_OF_FILE;
+      if (do_sort_keys)
+      {
+        if (!key_buffer.is_empty() || in_index_range) 
+        {
+          /* There are some sorted keys left. Use them to get rowids */
+          if ((res= dsmrr_fill_rowid_buffer()))
+            return res; /* for fatal errors */
+        }
+        if (rowid_buffer.is_empty())
+        {
+          if (dsmrr_eof)
+            return HA_ERR_END_OF_FILE;
+          dsmrr_fill_key_buffer();
+          if ((res= dsmrr_fill_rowid_buffer()))
+            return res;
+        }
+      }
+      else
+      {
+        /* 
+          There is no buffer with sorted keys. If fill_rowid_buffer() hasn't
+          reached eof condition before, try refilling the buffer.
+        */
+        if (dsmrr_eof)
+          return HA_ERR_END_OF_FILE;
 
-      if (do_sort_keys && key_buffer.used_size() == 0)
-        dsmrr_fill_key_buffer();
-
-      if ((res= dsmrr_fill_rowid_buffer()))
-        return res;
+        if ((res= dsmrr_fill_rowid_buffer()))
+          return res;
+      }
     }
    
     /* Return eof if there are no rowids in the buffer after re-fill attempt */
-    if (!rowid_buffer.have_data(1))
+    if (rowid_buffer.is_empty())
       return HA_ERR_END_OF_FILE;
 
     rowid= rowid_buffer.read(h->ref_length);
@@ -1145,10 +1173,6 @@ int DsMrr_impl::dsmrr_next(char **range_info)
       memcpy(range_info, range_id, sizeof(uchar*));
     }
     
-    //psergey2-note: the below isn't right- we won't want to skip over this 
-    // rowid because this (rowid, range_id) pair has nothing.. the next 
-    // identical rowids might have something.. (but we set identicals later,
-    // dont we?)
     if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
       continue;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 1589e65b49f..4941cac688d 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -75,6 +75,7 @@ public:
 
   uchar *used_area() { return (direction == 1)? read_pos : write_pos; }
   size_t used_size();
+  bool is_empty() { return used_size() == 0; }
 
   /* Read-mode functions */
   void reset_for_reading();
@@ -277,6 +278,13 @@ private:
   bool do_rowid_fetch;
 
   bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
+  
+  /* 
+    TRUE <=> key buffer is exhausted (we need this because we may have a situation
+    where we've read everything from the key buffer but haven't finished with
+    scanning the last range)
+  */
+  bool key_eof;
 
   /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
   bool is_mrr_assoc;

From 937db4bff4575971996e401f0a5b969ce3475801 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sat, 14 Aug 2010 22:35:50 +0400
Subject: [PATCH 16/49] - Remove out-of-date comments - Make testcase stable

---
 mysql-test/suite/vcol/r/vcol_misc.result |  2 +-
 mysql-test/suite/vcol/t/vcol_misc.test   |  3 ++-
 sql/multi_range_read.h                   | 13 ++-----------
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/mysql-test/suite/vcol/r/vcol_misc.result b/mysql-test/suite/vcol/r/vcol_misc.result
index 57460b1d669..f72373c6d3d 100644
--- a/mysql-test/suite/vcol/r/vcol_misc.result
+++ b/mysql-test/suite/vcol/r/vcol_misc.result
@@ -13,8 +13,8 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	ref	idx	idx	5	test.t1.b	2	Using where; Using join buffer
 select * from t1,t2 where t1.b=t2.c and d <= 100;
 a	b	c	d	v
-4	20	20	100	101
 1	20	20	100	101
 3	30	30	100	101
+4	20	20	100	101
 set join_cache_level=default;
 drop table t1, t2;
diff --git a/mysql-test/suite/vcol/t/vcol_misc.test b/mysql-test/suite/vcol/t/vcol_misc.test
index afe6f838268..5031a23aa49 100644
--- a/mysql-test/suite/vcol/t/vcol_misc.test
+++ b/mysql-test/suite/vcol/t/vcol_misc.test
@@ -17,7 +17,8 @@ set join_cache_level=6;
 explain
 select * from t1,t2 where t1.b=t2.c and d <= 100;
 
+--sorted_result
 select * from t1,t2 where t1.b=t2.c and d <= 100;
 set join_cache_level=default;
 
-drop table t1, t2;
\ No newline at end of file
+drop table t1, t2;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 4941cac688d..0696dde21ca 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -47,12 +47,6 @@
   When reading, there is current position pointing at start (for reverse
   buffer, end) of the element that will be read next.
    ^^ why end for reverse? it's more logical to point at start 
-
-  One can peek at what's behind that element by using advance_ptr function.
-
-  TODO: will the reverse buffer store {tuple; rowid} or {rowid; tuple} pairs?
-    (why does it matter??? Read and write in the same order and then it
-    shouldn't matter.)
 */
 
 class SimpleBuffer
@@ -63,7 +57,7 @@ class SimpleBuffer
   uchar *write_pos;
   
   /*
-     1 <=> buffer grows/is filled/is read  from start to end
+     1 <=> buffer grows/is filled/is read from start to end
     -1 <=> everthing is done from end to start instead.
   */
   int direction;
@@ -141,10 +135,7 @@ public:
       DBUG_ASSERT(0); /* Attempt to grow buffer in wrong direction */
   }
   
-  /* */
-  void grow();
-
-  friend class PeekIterator;
+  //friend class PeekIterator;
   class PeekIterator
   {
     // if direction==1 : pointer to what to return next

From 889e6170fe10a8d9529db2bf409a10c7adbac668 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 15 Aug 2010 06:59:23 +0400
Subject: [PATCH 17/49] Fix [harmless] valgrind failure

---
 sql/multi_range_read.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 6cb9ebfee12..4af35dec308 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -643,7 +643,7 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
   rowid_buffer.reset_for_writing();
   identical_rowid_ptr= NULL;
 
-  if (key_buffer.is_reverse())
+  if (do_sort_keys && key_buffer.is_reverse())
     key_buffer.flip();
 
   while (rowid_buffer.have_space_for(rowid_buff_elem_size))

From 3b85e019ab30974a41558ba0473c2a85f29e3210 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 15 Aug 2010 07:59:39 +0400
Subject: [PATCH 18/49] Fix another (possibly dangerous) valgrind failure.

---
 sql/multi_range_read.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 4af35dec308..209c13a13cf 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -454,7 +454,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     in_index_range= FALSE;
     h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
     h->mrr_funcs= *seq_funcs;
-    keyno= h->active_index != MAX_KEY? h->active_index : h2->active_index;
+    keyno= (h->inited == handler::INDEX)? h->active_index : h2->active_index;
     dsmrr_fill_key_buffer();
     
     if (dsmrr_eof && !do_rowid_fetch)

From d4f057f2565d43cb5471df790ca79e6ff04f4920 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 15 Aug 2010 22:30:18 +0400
Subject: [PATCH 19/49] More valgrind fixes

---
 sql/multi_range_read.cc | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 209c13a13cf..07c44fcde5d 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -758,6 +758,10 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   key_buff_elem_size= key_size_in_keybuf + 
                       (int)is_mrr_assoc * sizeof(void*);
   
+  KEY *key_info= &h->table->key_info[keyno];
+  index_ranges_unique= test(key_info->flags & HA_NOSAME && 
+                            key_info->key_parts == 
+                              my_count_bits(sample_key->keypart_map));
   if (!do_rowid_fetch)
   {
     /* Give all space to key buffer. */
@@ -775,7 +779,6 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   uint rowid_buf_elem_size= h->ref_length + 
                             (int)is_mrr_assoc * sizeof(char*);
   
-  KEY *key_info= &h->table->key_info[keyno];
   /*
     Use rec_per_key statistics as a basis to find out how many rowids 
     we'll get for each key value.
@@ -809,12 +812,7 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
 
   rowid_buffer_end= full_buf + bytes_for_rowids;
   rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 1);
-  key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1);
-
-  
-  index_ranges_unique= test(key_info->flags & HA_NOSAME && 
-                            key_info->key_parts == 
-                              my_count_bits(sample_key->keypart_map));
+  key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1); 
 }
 
 

From fae27347d1750a6d61f5cdb170df6cf10d3387b0 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Thu, 19 Aug 2010 19:52:58 +0200
Subject: [PATCH 20/49] Fix one more problem with buffer exhaustion scenario

---
 sql/multi_range_read.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 07c44fcde5d..2ab2b6a7604 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -1134,7 +1134,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
           if ((res= dsmrr_fill_rowid_buffer()))
             return res; /* for fatal errors */
         }
-        if (rowid_buffer.is_empty())
+        while (rowid_buffer.is_empty())
         {
           if (dsmrr_eof)
             return HA_ERR_END_OF_FILE;

From dbc63bed225503ae2ec90300ce05588234b565a5 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 5 Sep 2010 14:32:14 +0400
Subject: [PATCH 21/49] MWL#121-125: DS-MRR improvements - Address review
 feedback, step 1

---
 sql/handler.h           |   4 +
 sql/multi_range_read.cc | 183 +++++++++++++++++++++++-----------------
 sql/multi_range_read.h  |  70 +++++++++++++--
 3 files changed, 174 insertions(+), 83 deletions(-)

diff --git a/sql/handler.h b/sql/handler.h
index 2eae66fd741..2c3af0e8150 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1807,6 +1807,10 @@ public:
   inline int ha_index_first(uchar * buf);
   inline int ha_index_last(uchar * buf);
   inline int ha_index_next_same(uchar *buf, const uchar *key, uint keylen);
+  /*
+    TODO: should we make for those functions non-virtual ha_func_name wrappers,
+    too?
+  */
   virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                               void *seq_init_param, 
                                               uint n_ranges, uint *bufsz,
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 2ab2b6a7604..beb06098f84 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -286,6 +286,28 @@ scan_it_again:
 /****************************************************************************
  * DS-MRR implementation 
  ***************************************************************************/
+void SimpleBuffer::setup_writing(uchar **data1, size_t len1, 
+                                 uchar **data2, size_t len2)
+{
+  write_ptr1= data1;
+  write_size1= len1;
+
+  write_ptr2= data2;
+  write_size2= len2;
+}
+
+
+void SimpleBuffer::write()
+{
+  if (is_reverse() && write_ptr2)
+    write(*write_ptr2, write_size2);
+
+  write(*write_ptr1, write_size1);
+
+  if (!is_reverse() && write_ptr2)
+    write(*write_ptr2, write_size2);
+}
+
 
 void SimpleBuffer::write(const uchar *data, size_t bytes)
 {
@@ -313,6 +335,27 @@ size_t SimpleBuffer::used_size()
   return (direction == 1)? write_pos - read_pos : read_pos - write_pos;
 }
 
+
+void SimpleBuffer::setup_reading(uchar **data1, size_t len1, 
+                                 uchar **data2, size_t len2)
+{
+  read_ptr1= data1;
+  read_size1= len1;
+
+  read_ptr2= data2;
+  read_size2= len2;
+}
+
+bool SimpleBuffer::read()
+{
+  if (!have_data(read_size1 + read_ptr2? read_size2 : 0))
+    return TRUE;
+  *read_ptr1 =read(read_size1);
+  if (read_ptr2)
+    *read_ptr2= read(read_size2);
+  return FALSE;
+}
+
 uchar *SimpleBuffer::read(size_t bytes)
 {
   DBUG_ASSERT(have_data(bytes));
@@ -636,12 +679,16 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
 int DsMrr_impl::dsmrr_fill_rowid_buffer()
 {
   char *range_info;
+  uchar **range_info_ptr= (uchar**)&range_info;
   int res;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_rowid_buffer");
   
   DBUG_ASSERT(rowid_buffer.is_empty());
   rowid_buffer.reset_for_writing();
-  identical_rowid_ptr= NULL;
+  rowid_buffer.setup_writing(&h2->ref, h2->ref_length,
+                             is_mrr_assoc? (uchar**)&range_info_ptr: NULL, sizeof(void*));
+
+  last_identical_rowid= NULL;
 
   if (do_sort_keys && key_buffer.is_reverse())
     key_buffer.flip();
@@ -664,10 +711,8 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
 
     /* Put rowid, or {rowid, range_id} pair into the buffer */
     h2->position(table->record[0]);
-    rowid_buffer.write(h2->ref, h2->ref_length);
 
-    if (is_mrr_assoc)
-      rowid_buffer.write((uchar*)&range_info, sizeof(void*));
+    rowid_buffer.write();
   }
 
   if (res && res != HA_ERR_END_OF_FILE)
@@ -677,15 +722,19 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
     dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
 
   /* Sort the buffer contents by rowid */
-  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= rowid_buffer.used_size() / elem_size;
-  
-  my_qsort2(rowid_buffer.used_area(), n_rowids, elem_size, 
-            (qsort2_cmp)rowid_cmp, (void*)h);
+  rowid_buffer.sort((qsort2_cmp)rowid_cmp, (void*)h);
 
+  rowid_buffer.setup_reading(&rowid, h->ref_length,
+                             is_mrr_assoc? (uchar**)&rowids_range_id: NULL, sizeof(void*));
   DBUG_RETURN(0);
 }
 
+void SimpleBuffer::sort(qsort2_cmp cmp_func, void *cmp_func_arg)
+{
+  uint elem_size=write_size1 + (write_ptr2 ? write_size2 : 0);
+  uint n_elements= used_size() / elem_size;
+  my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
+}
 
 /* 
   my_qsort2-compatible function to compare key tuples 
@@ -838,6 +887,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
 {
   int res;
   KEY_MULTI_RANGE cur_range;
+  uchar **range_info_ptr= (uchar**)&cur_range.ptr;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");
 
   DBUG_ASSERT(!key_tuple_length || key_buffer.is_empty());
@@ -856,6 +906,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
     key_buffer.reset_for_writing();
   }
 
+  uchar *key_ptr;
   while ((key_tuple_length == 0 || 
           key_buffer.have_space_for(key_buff_elem_size)) && 
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
@@ -865,33 +916,29 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
     {
       /* This only happens when we've just started filling the buffer */
       setup_buffer_sizes(&cur_range.start_key);
+      key_buffer.setup_writing(&key_ptr, key_size_in_keybuf,
+                               is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
+                               sizeof(uchar*));
     }
-
-    if (key_buffer.is_reverse() && is_mrr_assoc)
-      key_buffer.write((uchar*)&cur_range.ptr, sizeof(void*));
-
+    
     /* Put key, or {key, range_id} pair into the buffer */
     if (use_key_pointers)
-      key_buffer.write((uchar*)&cur_range.start_key.key, sizeof(char*));
+      key_ptr=(uchar*) &cur_range.start_key.key;
     else
-      key_buffer.write(cur_range.start_key.key, key_tuple_length);
- 
-    if (!key_buffer.is_reverse() && is_mrr_assoc)
-      key_buffer.write((uchar*)&cur_range.ptr, sizeof(void*));
+      key_ptr=(uchar*) cur_range.start_key.key;
+
+    key_buffer.write();
   }
 
   dsmrr_eof= test(res);
 
-  /* Sort the buffer contents by rowid */
-  uint key_elem_size= key_size_in_keybuf + (int)is_mrr_assoc * sizeof(void*);
-  uint n_keys= key_buffer.used_size() / key_elem_size;
-  
-  my_qsort2(key_buffer.used_area(), n_keys, key_elem_size,
-            (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
+  key_buffer.sort((qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
   
+  key_buffer.setup_reading(&cur_index_tuple, key_size_in_keybuf,
+                           is_mrr_assoc? (uchar**)&cur_range_info: NULL, sizeof(void*));
+
   last_identical_key_ptr= NULL;
   in_identical_keys_range= FALSE;
-
   DBUG_VOID_RETURN;
 }
 
@@ -927,15 +974,15 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
   int res;
   uchar *key_in_buf;
   handler *file= do_rowid_fetch? h2: h;
+  bool res2;
 
   while (in_identical_keys_range)
   {
-    /* Read record/key pointer from the buffer */
-    key_in_buf= identical_key_it.get_next(key_size_in_keybuf);
-    if (is_mrr_assoc)
-      cur_range_info= (char*)identical_key_it.get_next(sizeof(void*));
+    /* This will read to (cur_index_tuple, cur_range_info): */
+    res2= identical_key_it.read_next();
+    DBUG_ASSERT(!res2);
 
-    if (key_in_buf == last_identical_key_ptr)
+    if (cur_index_tuple == last_identical_key_ptr)
     {
       /* We're looking at the last of the identical keys */
       in_identical_keys_range= FALSE;
@@ -985,13 +1032,8 @@ check_record:
     /* Jump over the keys that were handled by identical key processing */
     if (last_identical_key_ptr)
     {
-      while (key_buffer.read(key_size_in_keybuf) != last_identical_key_ptr)
-      {
-        if (is_mrr_assoc)
-          key_buffer.read(sizeof(void*));
-      }
-      if (is_mrr_assoc)
-        key_buffer.read(sizeof(void*));
+      /* key_buffer.read() reads to (cur_index_tuple, cur_range_info) */
+      while (!key_buffer.read() && (cur_index_tuple != last_identical_key_ptr)) {}
       last_identical_key_ptr= NULL;
     }
 
@@ -1030,14 +1072,12 @@ check_record:
     }
 
     /* Get the next range to scan */
-    cur_index_tuple= key_in_buf= key_buffer.read(key_size_in_keybuf);
+    key_buffer.read(); // reads to (cur_index_tuple, cur_range_info)
+    key_in_buf= cur_index_tuple;
+
     if (use_key_pointers)
       cur_index_tuple= *((uchar**)cur_index_tuple);
 
-    if (is_mrr_assoc)
-      cur_range_info= (char*)key_buffer.read(sizeof(void*));
-    
-
     /* Do index lookup */
     if ((res= file->ha_index_read_map(table->record[0], cur_index_tuple, 
                                       key_tuple_map, HA_READ_KEY_EXACT)))
@@ -1049,19 +1089,17 @@ check_record:
 
     /* Check if subsequent keys in the key buffer are the same as this one */
     {
-      uchar *ptr;
+      char *save_cur_range_info= cur_range_info;
       identical_key_it.init(&key_buffer);
       last_identical_key_ptr= NULL;
-      while ((ptr= identical_key_it.get_next(key_size_in_keybuf)))
+      while (!identical_key_it.read_next())
       {
-        if (is_mrr_assoc)
-          identical_key_it.get_next(sizeof(void*));
-
-        if (key_tuple_cmp(this, key_in_buf, ptr))
+        if (key_tuple_cmp(this, key_in_buf, cur_index_tuple))
           break;
 
-        last_identical_key_ptr= ptr;
+        last_identical_key_ptr= cur_index_tuple;
       }
+      cur_range_info= save_cur_range_info;
       if (last_identical_key_ptr)
       {
         in_identical_keys_range= TRUE;
@@ -1086,9 +1124,6 @@ end:
 int DsMrr_impl::dsmrr_next(char **range_info)
 {
   int res;
-  uchar *cur_range_info= 0;
-  uchar *rowid;
-  uchar *range_id;
 
   if (use_default_impl)
     return h->handler::multi_range_read_next(range_info);
@@ -1096,23 +1131,22 @@ int DsMrr_impl::dsmrr_next(char **range_info)
   if (!do_rowid_fetch)
     return dsmrr_next_from_index(range_info);
   
-  while (identical_rowid_ptr)
+  while (last_identical_rowid)
   {
     /*
       Current record (the one we've returned in previous call) was obtained
       from a rowid that matched multiple range_ids. Return this record again,
       with next matching range_id.
     */
-    rowid= rowid_buffer.read(h->ref_length);
-    if (is_mrr_assoc)
-    {
-      uchar *range_ptr= rowid_buffer.read(sizeof(uchar*));
-      memcpy(range_info, range_ptr, sizeof(uchar*));
-    }
+    bool bres= rowid_buffer.read();
+    DBUG_ASSERT(!bres);
 
-    if (rowid == identical_rowid_ptr)
+    if (is_mrr_assoc)
+      memcpy(range_info, rowids_range_id, sizeof(uchar*));
+
+    if (rowid == last_identical_rowid)
     {
-      identical_rowid_ptr= NULL; /* reached the last of identical rowids */
+      last_identical_rowid= NULL; /* reached the last of identical rowids */
     }
 
     if (!h2->mrr_funcs.skip_record ||
@@ -1157,20 +1191,18 @@ int DsMrr_impl::dsmrr_next(char **range_info)
       }
     }
    
-    /* Return eof if there are no rowids in the buffer after re-fill attempt */
-    if (rowid_buffer.is_empty())
-      return HA_ERR_END_OF_FILE;
+    last_identical_rowid= NULL;
 
-    rowid= rowid_buffer.read(h->ref_length);
-    identical_rowid_ptr= NULL;
+    /* Return eof if there are no rowids in the buffer after re-fill attempt */
+    if (rowid_buffer.read())
+      return HA_ERR_END_OF_FILE;
 
     if (is_mrr_assoc)
     {
-      range_id= rowid_buffer.read(sizeof(uchar*));
-      memcpy(&cur_range_info, range_id, sizeof(uchar*));
-      memcpy(range_info, range_id, sizeof(uchar*));
+      memcpy(range_info, rowids_range_id, sizeof(uchar*));
+      memcpy(&cur_range_info, rowids_range_id, sizeof(uchar*));
     }
-    
+
     if (h2->mrr_funcs.skip_record &&
 	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
       continue;
@@ -1187,21 +1219,18 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     */
     if (!res)
     {
+      uchar *cur_rowid= rowid;
       /* 
         Note: this implies that SQL layer doesn't touch table->record[0]
         between calls.
       */
-      uchar *ptr;
       SimpleBuffer::PeekIterator identical_rowid_it;
       identical_rowid_it.init(&rowid_buffer);
-      while ((ptr= identical_rowid_it.get_next(h->ref_length)))
+      while (!identical_rowid_it.read_next()) // reads to (rowid, ...)
       {
-        if (is_mrr_assoc)
-          identical_rowid_it.get_next(sizeof(void*));
-
-        if (h2->cmp_ref(rowid, ptr))
+        if (h2->cmp_ref(rowid, cur_rowid))
           break;
-        identical_rowid_ptr= ptr;
+        last_identical_rowid= rowid;
       }
     }
     return 0;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 0696dde21ca..84775f64a19 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -61,20 +61,51 @@ class SimpleBuffer
     -1 <=> everthing is done from end to start instead.
   */
   int direction;
+  
+  /* Pointers to read data from */
+  uchar **write_ptr1;
+  size_t write_size1;
+  /* Same as above, but may be NULL */
+  uchar **write_ptr2;
+  size_t write_size2;
+
+  /* Pointers to write data to */
+  uchar **read_ptr1;
+  size_t read_size1;
+  /* Same as above, but may be NULL */
+  uchar **read_ptr2;
+  size_t read_size2;
+
 public:
+  /* Set up writing*/
+  void setup_writing(uchar **data1, size_t len1, 
+                     uchar **data2, size_t len2);
+
+  void sort(qsort2_cmp cmp_func, void *cmp_func_arg);
+
   /* Write-mode functions */
   void reset_for_writing();
-  void write(const uchar *data, size_t bytes);
+  void write();
   bool have_space_for(size_t bytes);
 
+private:
+  void write(const uchar *data, size_t bytes);
   uchar *used_area() { return (direction == 1)? read_pos : write_pos; }
   size_t used_size();
+public:
+
   bool is_empty() { return used_size() == 0; }
 
   /* Read-mode functions */
   void reset_for_reading();
-
+  
+  // todo: join with setup-writing?
+  void setup_reading(uchar **data1, size_t len1, 
+                     uchar **data2, size_t len2);
+  bool read();
+private:
   uchar *read(size_t bytes);
+public:
   bool have_data(size_t bytes);
   uchar *end_of_space();
 
@@ -135,7 +166,6 @@ public:
       DBUG_ASSERT(0); /* Attempt to grow buffer in wrong direction */
   }
   
-  //friend class PeekIterator;
   class PeekIterator
   {
     // if direction==1 : pointer to what to return next
@@ -148,6 +178,26 @@ public:
       sb= sb_arg;
       pos= sb->read_pos;
     }
+    
+    /*
+      If the buffer stores tuples, this call will return pointer to the first
+      component.
+    */
+    bool read_next()
+    {
+      // Always read the first component first? (because we do inverted-writes
+      // if needed, so no measures need to be taken here).
+      uchar *res;
+      if ((res= get_next(sb->read_size1)))
+      {
+        *(sb->read_ptr1)= res;
+        if (sb->read_ptr2)
+          *sb->read_ptr2= get_next(sb->read_size2);
+        return FALSE;
+      }
+      return TRUE; /* EOF */
+    }
+  private:
     /* Return pointer to next chunk of nbytes bytes and avance over it */
     uchar *get_next(size_t nbytes)
     {
@@ -170,6 +220,7 @@ public:
   };
 };
 
+
 /*
   DS-MRR implementation for one table. Create/use one object of this class for
   each ha_{myisam/innobase/etc} object. That object will be further referred to
@@ -206,8 +257,6 @@ public:
        scanning.
 */
 
-
-
 class DsMrr_impl
 {
 public:
@@ -252,7 +301,16 @@ private:
   /* Buffer to store rowids, or (rowid, range_id) pairs */
   SimpleBuffer rowid_buffer;
   
-  uchar *identical_rowid_ptr;
+  /*  Reads from rowid buffer go to here: */
+  uchar *rowid;
+  uchar *rowids_range_id;
+  
+  /*
+    not-NULL: we're traversing a group of (rowid, range_id) pairs with
+              identical rowid values, and this is the pointer to the last one.
+    NULL: we're not in the group of indentical rowids.
+  */
+  uchar *last_identical_rowid;
   
   /* Identical keys */
   bool in_identical_keys_range;

From 9b04caffd4ee6a84c61835826095a8fee5993238 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Fri, 10 Sep 2010 20:48:11 +0400
Subject: [PATCH 22/49] Commit for buildbot checks

---
 sql/multi_range_read.cc | 64 ++++++++++++++++++++++++-----------------
 sql/multi_range_read.h  | 33 ++++++++++++---------
 2 files changed, 56 insertions(+), 41 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index beb06098f84..ba12b520a25 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -283,8 +283,9 @@ scan_it_again:
   DBUG_RETURN(result);
 }
 
+
 /****************************************************************************
- * DS-MRR implementation 
+ * SimpleBuffer class implementation (used by DS-MRR code)
  ***************************************************************************/
 void SimpleBuffer::setup_writing(uchar **data1, size_t len1, 
                                  uchar **data2, size_t len2)
@@ -322,6 +323,13 @@ void SimpleBuffer::write(const uchar *data, size_t bytes)
     write_pos += bytes;
 }
 
+
+bool SimpleBuffer::can_write()
+{
+  return have_space_for(write_size1 + (write_ptr2? write_size2:0));
+}
+
+
 bool SimpleBuffer::have_space_for(size_t bytes)
 {
   if (direction == 1)
@@ -405,9 +413,12 @@ uchar *SimpleBuffer::end_of_space()
     return start;
   else
     return end;
-//TODO: check this.
 }
 
+/****************************************************************************
+ * DS-MRR implementation 
+ ***************************************************************************/
+
 /**
   DS-MRR: Initialize and start MRR scan
 
@@ -472,28 +483,21 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   }
   DBUG_ASSERT(do_sort_keys || do_rowid_fetch);
 
-  full_buf= buf->buffer;
-  full_buf_end= buf->buffer_end;
   
+  if (is_mrr_assoc)
+    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
+
   /* 
     At start, alloc all of the buffer for rowids. Key sorting code will grab a
     piece if necessary.
   */
+  full_buf= buf->buffer;
+  full_buf_end= buf->buffer_end;
   rowid_buffer.set_buffer_space(full_buf, full_buf_end, 1);
-
-  if (is_mrr_assoc)
-    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
   
-  /*
-    psergey2-todo: for CPK scans:
-     - use MRR irrespectively of @@mrr_sort_keys setting,
-     - dont do rowid retrieval.
-  */
   if (do_sort_keys)
   {
-    /* It's a DS-MRR/CPK scan */
-    key_tuple_length= 0; /* dummy value telling it needs to be inited */
-    key_buff_elem_size= 0;
+    know_key_tuple_params= FALSE;
     in_index_range= FALSE;
     h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
     h->mrr_funcs= *seq_funcs;
@@ -693,7 +697,7 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
   if (do_sort_keys && key_buffer.is_reverse())
     key_buffer.flip();
 
-  while (rowid_buffer.have_space_for(rowid_buff_elem_size))
+  while (rowid_buffer.can_write())
   {
     if (do_sort_keys)
       res= dsmrr_next_from_index(&range_info);
@@ -729,6 +733,7 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
   DBUG_RETURN(0);
 }
 
+
 void SimpleBuffer::sort(qsort2_cmp cmp_func, void *cmp_func_arg)
 {
   uint elem_size=write_size1 + (write_ptr2 ? write_size2 : 0);
@@ -736,10 +741,9 @@ void SimpleBuffer::sort(qsort2_cmp cmp_func, void *cmp_func_arg)
   my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
 }
 
+
 /* 
   my_qsort2-compatible function to compare key tuples 
-
-  If dsmrr->use_key_pointers==FALSE
 */
 
 int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
@@ -890,9 +894,10 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
   uchar **range_info_ptr= (uchar**)&cur_range.ptr;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");
 
-  DBUG_ASSERT(!key_tuple_length || key_buffer.is_empty());
+  DBUG_ASSERT(!know_key_tuple_params || key_buffer.is_empty());
 
-  if (key_tuple_length)
+  uchar *key_ptr;
+  if (know_key_tuple_params)
   {
     if (do_rowid_fetch && rowid_buffer.is_empty())
     {
@@ -904,21 +909,24 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
       key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1);
     }
     key_buffer.reset_for_writing();
+    key_buffer.setup_writing(&key_ptr, key_size_in_keybuf,
+                             is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
+                             sizeof(uchar*));
   }
 
-  uchar *key_ptr;
-  while ((key_tuple_length == 0 || 
-          key_buffer.have_space_for(key_buff_elem_size)) && 
+  while ((!know_key_tuple_params || key_buffer.can_write()) && 
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
-    if (!key_tuple_length)
+    if (!know_key_tuple_params)
     {
       /* This only happens when we've just started filling the buffer */
       setup_buffer_sizes(&cur_range.start_key);
+      know_key_tuple_params= TRUE;
       key_buffer.setup_writing(&key_ptr, key_size_in_keybuf,
                                is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
                                sizeof(uchar*));
+      DBUG_ASSERT(key_buffer.can_write());
     }
     
     /* Put key, or {key, range_id} pair into the buffer */
@@ -934,6 +942,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
 
   key_buffer.sort((qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
   
+  //psergey4: cur_range_info will point to range-info bytes.
   key_buffer.setup_reading(&cur_index_tuple, key_size_in_keybuf,
                            is_mrr_assoc? (uchar**)&cur_range_info: NULL, sizeof(void*));
 
@@ -995,7 +1004,7 @@ check_record:
     {
       continue;
     }
-    memcpy(range_info_arg, cur_range_info, sizeof(void*));
+    memcpy(range_info_arg, cur_range_info, sizeof(void*)); //psergey4: this copyies junk there
 
     return 0;
   }
@@ -1200,11 +1209,12 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     if (is_mrr_assoc)
     {
       memcpy(range_info, rowids_range_id, sizeof(uchar*));
-      memcpy(&cur_range_info, rowids_range_id, sizeof(uchar*));
+      //psergey5: memcpy(&cur_range_info, rowids_range_id, sizeof(uchar*)); // psergey: ???
     }
 
     if (h2->mrr_funcs.skip_record &&
-	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
+	h2->mrr_funcs.skip_record(h2->mrr_iter, /* psergey5 (char *)
+        cur_range_info */ *range_info, rowid))
       continue;
 
     res= h->ha_rnd_pos(table->record[0], rowid);
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 84775f64a19..b433dd1b219 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -76,6 +76,13 @@ class SimpleBuffer
   uchar **read_ptr2;
   size_t read_size2;
 
+  bool have_space_for(size_t bytes);
+  uchar *used_area() { return (direction == 1)? read_pos : write_pos; }
+  size_t used_size();
+
+  void write(const uchar *data, size_t bytes);
+  uchar *read(size_t bytes);
+
 public:
   /* Set up writing*/
   void setup_writing(uchar **data1, size_t len1, 
@@ -86,26 +93,17 @@ public:
   /* Write-mode functions */
   void reset_for_writing();
   void write();
-  bool have_space_for(size_t bytes);
-
-private:
-  void write(const uchar *data, size_t bytes);
-  uchar *used_area() { return (direction == 1)? read_pos : write_pos; }
-  size_t used_size();
-public:
+  bool can_write();
 
   bool is_empty() { return used_size() == 0; }
 
   /* Read-mode functions */
   void reset_for_reading();
-  
-  // todo: join with setup-writing?
+  // todo: join with setup-writing? (but what for?)
   void setup_reading(uchar **data1, size_t len1, 
                      uchar **data2, size_t len2);
   bool read();
-private:
-  uchar *read(size_t bytes);
-public:
+
   bool have_data(size_t bytes);
   uchar *end_of_space();
 
@@ -115,6 +113,7 @@ public:
     start= start_arg;
     end= end_arg;
     direction= direction_arg;
+  //  TRASH(start, end - start);
     reset_for_writing();
   }
   
@@ -166,6 +165,10 @@ public:
       DBUG_ASSERT(0); /* Attempt to grow buffer in wrong direction */
   }
   
+  /*
+    An iterator to do look at what we're about to read from the buffer without
+    actually reading it.
+  */
   class PeekIterator
   {
     // if direction==1 : pointer to what to return next
@@ -342,8 +345,9 @@ private:
 
   bool doing_cpk_scan; /* TRUE <=> DS-MRR/CPK variant is used */
 
-  /** DS-MRR/CPK variables start */
-
+  
+  /* Initially FALSE, becomes TRUE when we've set key_tuple_xxx members */
+  bool know_key_tuple_params;
   /* Length of lookup tuple being used, in bytes */
   uint key_tuple_length;
   key_part_map key_tuple_map; 
@@ -368,6 +372,7 @@ private:
   /* TRUE<=> we're in a middle of enumerating records from a range */ 
   bool in_index_range;
   uchar *cur_index_tuple;
+
   /* if in_index_range==TRUE: range_id of the range we're enumerating */
   char *cur_range_info;
 

From a730bb9cafd7f58b3c5cbce0fa6d4bc8c31fffdf Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sat, 11 Sep 2010 21:56:43 +0400
Subject: [PATCH 23/49] Fix a typo bug in SimpleBuffer.read()

---
 sql/multi_range_read.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index ba12b520a25..2b43a880564 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -356,7 +356,7 @@ void SimpleBuffer::setup_reading(uchar **data1, size_t len1,
 
 bool SimpleBuffer::read()
 {
-  if (!have_data(read_size1 + read_ptr2? read_size2 : 0))
+  if (!have_data(read_size1 + (read_ptr2? read_size2 : 0)))
     return TRUE;
   *read_ptr1 =read(read_size1);
   if (read_ptr2)

From b9e5125050a749324819e9e7910673337a706e83 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 12 Sep 2010 13:21:25 +0400
Subject: [PATCH 24/49] Remove garbage comments, coding style conformance (no
 functional changes).

---
 sql/multi_range_read.cc | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 2b43a880564..9ae86a287ac 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -942,9 +942,9 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
 
   key_buffer.sort((qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
   
-  //psergey4: cur_range_info will point to range-info bytes.
   key_buffer.setup_reading(&cur_index_tuple, key_size_in_keybuf,
-                           is_mrr_assoc? (uchar**)&cur_range_info: NULL, sizeof(void*));
+                           is_mrr_assoc? (uchar**)&cur_range_info: NULL,
+                           sizeof(void*));
 
   last_identical_key_ptr= NULL;
   in_identical_keys_range= FALSE;
@@ -1004,8 +1004,7 @@ check_record:
     {
       continue;
     }
-    memcpy(range_info_arg, cur_range_info, sizeof(void*)); //psergey4: this copyies junk there
-
+    memcpy(range_info_arg, cur_range_info, sizeof(void*));
     return 0;
   }
   
@@ -1209,12 +1208,10 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     if (is_mrr_assoc)
     {
       memcpy(range_info, rowids_range_id, sizeof(uchar*));
-      //psergey5: memcpy(&cur_range_info, rowids_range_id, sizeof(uchar*)); // psergey: ???
     }
 
     if (h2->mrr_funcs.skip_record &&
-	h2->mrr_funcs.skip_record(h2->mrr_iter, /* psergey5 (char *)
-        cur_range_info */ *range_info, rowid))
+	h2->mrr_funcs.skip_record(h2->mrr_iter, *range_info, rowid))
       continue;
 
     res= h->ha_rnd_pos(table->record[0], rowid);

From ac3756a1d0412114c1e2cb9bb28c940751c8bf94 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 12 Sep 2010 18:28:19 +0400
Subject: [PATCH 25/49] Debug measure: trash unused buffer space

---
 sql/multi_range_read.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index b433dd1b219..4b33b85c199 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -113,7 +113,7 @@ public:
     start= start_arg;
     end= end_arg;
     direction= direction_arg;
-  //  TRASH(start, end - start);
+    TRASH(start, end - start);
     reset_for_writing();
   }
   
@@ -153,6 +153,8 @@ public:
       - it is adjacent to buffer space we're using
       - it is on the end towards which we grow.
     */
+    DBUG_ASSERT(unused_end > unused_start);
+    TRASH(unused_start, unused_end - unused_start);
     if (direction == 1 && end == unused_start)
     {
       end= unused_end;

From 188de43db1db9a88b45b0de5371294ab0f8b2bfe Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 13 Sep 2010 19:45:41 +0400
Subject: [PATCH 26/49] - Fixed BUG#623285: Crash in quick_range_seq_next() in
 maria-5.3-dsmrr-cpk - Relax overly-strict assert added in previous commit -
 Run generic MRR testsuite for Maria engine, too.

---
 mysql-test/r/maria_mrr.result | 324 ++++++++++++++++++++++++++++++++++
 mysql-test/t/maria_mrr.test   |  47 +++++
 sql/multi_range_read.cc       |   1 +
 sql/multi_range_read.h        |   2 +-
 4 files changed, 373 insertions(+), 1 deletion(-)
 create mode 100644 mysql-test/r/maria_mrr.result
 create mode 100644 mysql-test/t/maria_mrr.test

diff --git a/mysql-test/r/maria_mrr.result b/mysql-test/r/maria_mrr.result
new file mode 100644
index 00000000000..1c18be47ce3
--- /dev/null
+++ b/mysql-test/r/maria_mrr.result
@@ -0,0 +1,324 @@
+drop table if exists t1, t2, t3;
+set @mrr_buffer_size_save= @@mrr_buffer_size;
+set @save_storage_engine= @@storage_engine;
+set storage_engine=Maria;
+create table t1(a int);
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `a` int(11) DEFAULT NULL
+) ENGINE=MARIA DEFAULT CHARSET=latin1 PAGE_CHECKSUM=1
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t2(a int);
+insert into t2 select A.a + 10*(B.a + 10*C.a) from t1 A, t1 B, t1 C;
+create table t3 (
+a char(8) not null, b char(8) not null, filler char(200),
+key(a)
+);
+insert into t3 select @a:=concat('c-', 1000+ A.a, '=w'), @a, 'filler' from t2 A;
+insert into t3 select concat('c-', 1000+A.a, '=w'), concat('c-', 2000+A.a, '=w'), 
+'filler-1' from t2 A;
+insert into t3 select concat('c-', 1000+A.a, '=w'), concat('c-', 3000+A.a, '=w'), 
+'filler-2' from t2 A;
+select a,filler from t3 where a >= 'c-9011=w';
+a	filler
+select a,filler from t3 where a >= 'c-1011=w' and a <= 'c-1015=w';
+a	filler
+c-1011=w	filler
+c-1012=w	filler
+c-1013=w	filler
+c-1014=w	filler
+c-1015=w	filler
+c-1011=w	filler-1
+c-1012=w	filler-1
+c-1013=w	filler-1
+c-1014=w	filler-1
+c-1015=w	filler-1
+c-1011=w	filler-2
+c-1012=w	filler-2
+c-1013=w	filler-2
+c-1014=w	filler-2
+c-1015=w	filler-2
+select a,filler from t3 where (a>='c-1011=w' and a <= 'c-1013=w') or
+(a>='c-1014=w' and a <= 'c-1015=w');
+a	filler
+c-1011=w	filler
+c-1012=w	filler
+c-1013=w	filler
+c-1014=w	filler
+c-1015=w	filler
+c-1011=w	filler-1
+c-1012=w	filler-1
+c-1013=w	filler-1
+c-1014=w	filler-1
+c-1015=w	filler-1
+c-1011=w	filler-2
+c-1012=w	filler-2
+c-1013=w	filler-2
+c-1014=w	filler-2
+c-1015=w	filler-2
+insert into t3 values ('c-1013=z', 'c-1013=z', 'err');
+insert into t3 values ('a-1014=w', 'a-1014=w', 'err');
+select a,filler from t3 where (a>='c-1011=w' and a <= 'c-1013=w') or
+(a>='c-1014=w' and a <= 'c-1015=w');
+a	filler
+c-1011=w	filler
+c-1012=w	filler
+c-1013=w	filler
+c-1014=w	filler
+c-1015=w	filler
+c-1011=w	filler-1
+c-1012=w	filler-1
+c-1013=w	filler-1
+c-1014=w	filler-1
+c-1015=w	filler-1
+c-1011=w	filler-2
+c-1012=w	filler-2
+c-1013=w	filler-2
+c-1014=w	filler-2
+c-1015=w	filler-2
+delete from t3 where b in ('c-1013=z', 'a-1014=w');
+select a,filler from t3 where a='c-1011=w' or a='c-1012=w' or a='c-1013=w' or
+a='c-1014=w' or a='c-1015=w';
+a	filler
+c-1011=w	filler
+c-1012=w	filler
+c-1013=w	filler
+c-1014=w	filler
+c-1015=w	filler
+c-1011=w	filler-1
+c-1012=w	filler-1
+c-1013=w	filler-1
+c-1014=w	filler-1
+c-1015=w	filler-1
+c-1011=w	filler-2
+c-1012=w	filler-2
+c-1013=w	filler-2
+c-1014=w	filler-2
+c-1015=w	filler-2
+insert into t3 values ('c-1013=w', 'del-me', 'inserted');
+select a,filler from t3 where a='c-1011=w' or a='c-1012=w' or a='c-1013=w' or
+a='c-1014=w' or a='c-1015=w';
+a	filler
+c-1011=w	filler
+c-1012=w	filler
+c-1013=w	filler
+c-1014=w	filler
+c-1015=w	filler
+c-1011=w	filler-1
+c-1012=w	filler-1
+c-1013=w	filler-1
+c-1014=w	filler-1
+c-1015=w	filler-1
+c-1011=w	filler-2
+c-1012=w	filler-2
+c-1013=w	filler-2
+c-1014=w	filler-2
+c-1015=w	filler-2
+c-1013=w	inserted
+delete from t3 where b='del-me';
+alter table t3 add primary key(b);
+select b,filler from t3 where (b>='c-1011=w' and b<= 'c-1018=w') or 
+b IN ('c-1019=w', 'c-1020=w', 'c-1021=w', 
+'c-1022=w', 'c-1023=w', 'c-1024=w');
+b	filler
+c-1011=w	filler
+c-1012=w	filler
+c-1013=w	filler
+c-1014=w	filler
+c-1015=w	filler
+c-1016=w	filler
+c-1017=w	filler
+c-1018=w	filler
+c-1019=w	filler
+c-1020=w	filler
+c-1021=w	filler
+c-1022=w	filler
+c-1023=w	filler
+c-1024=w	filler
+select b,filler from t3 where (b>='c-1011=w' and b<= 'c-1020=w') or 
+b IN ('c-1021=w', 'c-1022=w', 'c-1023=w');
+b	filler
+c-1011=w	filler
+c-1012=w	filler
+c-1013=w	filler
+c-1014=w	filler
+c-1015=w	filler
+c-1016=w	filler
+c-1017=w	filler
+c-1018=w	filler
+c-1019=w	filler
+c-1020=w	filler
+c-1021=w	filler
+c-1022=w	filler
+c-1023=w	filler
+select b,filler from t3 where (b>='c-1011=w' and b<= 'c-1018=w') or 
+b IN ('c-1019=w', 'c-1020=w') or 
+(b>='c-1021=w' and b<= 'c-1023=w');
+b	filler
+c-1011=w	filler
+c-1012=w	filler
+c-1013=w	filler
+c-1014=w	filler
+c-1015=w	filler
+c-1016=w	filler
+c-1017=w	filler
+c-1018=w	filler
+c-1019=w	filler
+c-1020=w	filler
+c-1021=w	filler
+c-1022=w	filler
+c-1023=w	filler
+create table t4 (a varchar(10), b int, c char(10), filler char(200),
+key idx1 (a, b, c));
+insert into t4 (filler) select concat('NULL-', 15-a) from t2 order by a limit 15;
+insert into t4 (a,b,c,filler) 
+select 'b-1',NULL,'c-1', concat('NULL-', 15-a) from t2 order by a limit 15;
+insert into t4 (a,b,c,filler) 
+select 'b-1',NULL,'c-222', concat('NULL-', 15-a) from t2 order by a limit 15;
+insert into t4 (a,b,c,filler) 
+select 'bb-1',NULL,'cc-2', concat('NULL-', 15-a) from t2 order by a limit 15;
+insert into t4 (a,b,c,filler) 
+select 'zz-1',NULL,'cc-2', 'filler-data' from t2 order by a limit 500;
+explain 
+select * from t4 where a IS NULL and b IS NULL and (c IS NULL or c='no-such-row1'
+                                                      or c='no-such-row2');
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t4	range	idx1	idx1	29	NULL	16	Using index condition; Using MRR
+select * from t4 where a IS NULL and b IS NULL and (c IS NULL or c='no-such-row1'
+                                                    or c='no-such-row2');
+a	b	c	filler
+NULL	NULL	NULL	NULL-15
+NULL	NULL	NULL	NULL-14
+NULL	NULL	NULL	NULL-13
+NULL	NULL	NULL	NULL-12
+NULL	NULL	NULL	NULL-11
+NULL	NULL	NULL	NULL-10
+NULL	NULL	NULL	NULL-9
+NULL	NULL	NULL	NULL-8
+NULL	NULL	NULL	NULL-7
+NULL	NULL	NULL	NULL-6
+NULL	NULL	NULL	NULL-5
+NULL	NULL	NULL	NULL-4
+NULL	NULL	NULL	NULL-3
+NULL	NULL	NULL	NULL-2
+NULL	NULL	NULL	NULL-1
+explain 
+select * from t4 where (a ='b-1' or a='bb-1') and b IS NULL and (c='c-1' or c='cc-2');
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t4	range	idx1	idx1	29	NULL	32	Using index condition; Using MRR
+select * from t4 where (a ='b-1' or a='bb-1') and b IS NULL and (c='c-1' or c='cc-2');
+a	b	c	filler
+b-1	NULL	c-1	NULL-15
+b-1	NULL	c-1	NULL-14
+b-1	NULL	c-1	NULL-13
+b-1	NULL	c-1	NULL-12
+b-1	NULL	c-1	NULL-11
+b-1	NULL	c-1	NULL-10
+b-1	NULL	c-1	NULL-9
+b-1	NULL	c-1	NULL-8
+b-1	NULL	c-1	NULL-7
+b-1	NULL	c-1	NULL-6
+b-1	NULL	c-1	NULL-5
+b-1	NULL	c-1	NULL-4
+b-1	NULL	c-1	NULL-3
+b-1	NULL	c-1	NULL-2
+b-1	NULL	c-1	NULL-1
+bb-1	NULL	cc-2	NULL-15
+bb-1	NULL	cc-2	NULL-14
+bb-1	NULL	cc-2	NULL-13
+bb-1	NULL	cc-2	NULL-12
+bb-1	NULL	cc-2	NULL-11
+bb-1	NULL	cc-2	NULL-10
+bb-1	NULL	cc-2	NULL-9
+bb-1	NULL	cc-2	NULL-8
+bb-1	NULL	cc-2	NULL-7
+bb-1	NULL	cc-2	NULL-6
+bb-1	NULL	cc-2	NULL-5
+bb-1	NULL	cc-2	NULL-4
+bb-1	NULL	cc-2	NULL-3
+bb-1	NULL	cc-2	NULL-2
+bb-1	NULL	cc-2	NULL-1
+select * from t4 ignore index(idx1) where (a ='b-1' or a='bb-1') and b IS NULL and (c='c-1' or c='cc-2');
+a	b	c	filler
+b-1	NULL	c-1	NULL-15
+b-1	NULL	c-1	NULL-14
+b-1	NULL	c-1	NULL-13
+b-1	NULL	c-1	NULL-12
+b-1	NULL	c-1	NULL-11
+b-1	NULL	c-1	NULL-10
+b-1	NULL	c-1	NULL-9
+b-1	NULL	c-1	NULL-8
+b-1	NULL	c-1	NULL-7
+b-1	NULL	c-1	NULL-6
+b-1	NULL	c-1	NULL-5
+b-1	NULL	c-1	NULL-4
+b-1	NULL	c-1	NULL-3
+b-1	NULL	c-1	NULL-2
+b-1	NULL	c-1	NULL-1
+bb-1	NULL	cc-2	NULL-15
+bb-1	NULL	cc-2	NULL-14
+bb-1	NULL	cc-2	NULL-13
+bb-1	NULL	cc-2	NULL-12
+bb-1	NULL	cc-2	NULL-11
+bb-1	NULL	cc-2	NULL-10
+bb-1	NULL	cc-2	NULL-9
+bb-1	NULL	cc-2	NULL-8
+bb-1	NULL	cc-2	NULL-7
+bb-1	NULL	cc-2	NULL-6
+bb-1	NULL	cc-2	NULL-5
+bb-1	NULL	cc-2	NULL-4
+bb-1	NULL	cc-2	NULL-3
+bb-1	NULL	cc-2	NULL-2
+bb-1	NULL	cc-2	NULL-1
+drop table t1, t2, t3, t4;
+create table t1 (a int, b int not null,unique key (a,b),index(b));
+insert ignore into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(null,7),(9,9),(8,8),(7,7),(null,9),(null,9),(6,6);
+create table t2 like t1;
+insert into t2 select * from t1;
+alter table t1 modify b blob not null, add c int not null, drop key a, add unique key (a,b(20),c), drop key b, add key (b(10));
+select * from t1 where a is null;
+a	b	c
+NULL	7	0
+NULL	9	0
+NULL	9	0
+select * from t1 where (a is null or a > 0 and a < 3) and b > 7 limit 3;
+a	b	c
+NULL	9	0
+NULL	9	0
+select * from t1 where a is null and b=9 or a is null and b=7 limit 3;
+a	b	c
+NULL	7	0
+NULL	9	0
+NULL	9	0
+drop table t1, t2;
+set storage_engine= @save_storage_engine;
+set @@mrr_buffer_size= @mrr_buffer_size_save;
+# 
+# Crash in quick_range_seq_next() in maria-5.3-dsmrr-cpk with join_cache_level = {8,1}
+# 
+set @save_join_cache_level= @@join_cache_level;
+SET SESSION join_cache_level = 8;
+CREATE TABLE `t1` (
+`col_int_key` int(11) DEFAULT NULL,
+`col_datetime_key` datetime DEFAULT NULL,
+`col_varchar_key` varchar(1) DEFAULT NULL,
+`col_varchar_nokey` varchar(1) DEFAULT NULL,
+KEY `col_varchar_key` (`col_varchar_key`,`col_int_key`)
+) ENGINE=MARIA DEFAULT CHARSET=latin1 PAGE_CHECKSUM=1;
+INSERT INTO `t1` VALUES (6,'2005-10-07 00:00:00','e','e');
+INSERT INTO `t1` VALUES (51,'2000-07-15 05:00:34','f','f');
+CREATE TABLE `t2` (
+`col_int_key` int(11) DEFAULT NULL,
+`col_datetime_key` datetime DEFAULT NULL,
+`col_varchar_key` varchar(1) DEFAULT NULL,
+`col_varchar_nokey` varchar(1) DEFAULT NULL,
+KEY `col_varchar_key` (`col_varchar_key`,`col_int_key`)
+) ENGINE=MARIA DEFAULT CHARSET=latin1 PAGE_CHECKSUM=1;
+INSERT INTO `t2` VALUES (2,'2004-10-11 18:13:16','w','w');
+INSERT INTO `t2` VALUES (2,'1900-01-01 00:00:00','d','d');
+SELECT table2 .`col_datetime_key`
+FROM t2 JOIN ( t1 table2 JOIN t2 table3 ON table3 .`col_varchar_key` < table2 .`col_varchar_key` ) ON table3 .`col_varchar_nokey` ;
+col_datetime_key
+drop table t1, t2;
+set join_cache_level=@save_join_cache_level;
diff --git a/mysql-test/t/maria_mrr.test b/mysql-test/t/maria_mrr.test
new file mode 100644
index 00000000000..5549e6808f6
--- /dev/null
+++ b/mysql-test/t/maria_mrr.test
@@ -0,0 +1,47 @@
+-- source include/have_maria.inc
+#
+# MRR/Maria tests.
+#
+
+--disable_warnings
+drop table if exists t1, t2, t3;
+--enable_warnings
+
+set @mrr_buffer_size_save= @@mrr_buffer_size;
+
+set @save_storage_engine= @@storage_engine;
+set storage_engine=Maria;
+-- source include/mrr_tests.inc
+set storage_engine= @save_storage_engine;
+
+set @@mrr_buffer_size= @mrr_buffer_size_save;
+
+--echo # 
+--echo # Crash in quick_range_seq_next() in maria-5.3-dsmrr-cpk with join_cache_level = {8,1}
+--echo # 
+set @save_join_cache_level= @@join_cache_level;
+SET SESSION join_cache_level = 8;
+CREATE TABLE `t1` (
+  `col_int_key` int(11) DEFAULT NULL,
+  `col_datetime_key` datetime DEFAULT NULL,
+  `col_varchar_key` varchar(1) DEFAULT NULL,
+  `col_varchar_nokey` varchar(1) DEFAULT NULL,
+  KEY `col_varchar_key` (`col_varchar_key`,`col_int_key`)
+) ENGINE=MARIA DEFAULT CHARSET=latin1 PAGE_CHECKSUM=1;
+INSERT INTO `t1` VALUES (6,'2005-10-07 00:00:00','e','e');
+INSERT INTO `t1` VALUES (51,'2000-07-15 05:00:34','f','f');
+CREATE TABLE `t2` (
+  `col_int_key` int(11) DEFAULT NULL,
+  `col_datetime_key` datetime DEFAULT NULL,
+  `col_varchar_key` varchar(1) DEFAULT NULL,
+  `col_varchar_nokey` varchar(1) DEFAULT NULL,
+  KEY `col_varchar_key` (`col_varchar_key`,`col_int_key`)
+) ENGINE=MARIA DEFAULT CHARSET=latin1 PAGE_CHECKSUM=1;
+INSERT INTO `t2` VALUES (2,'2004-10-11 18:13:16','w','w');
+INSERT INTO `t2` VALUES (2,'1900-01-01 00:00:00','d','d');
+SELECT table2 .`col_datetime_key`
+FROM t2 JOIN ( t1 table2 JOIN t2 table3 ON table3 .`col_varchar_key` < table2 .`col_varchar_key` ) ON table3 .`col_varchar_nokey` ;
+
+drop table t1, t2;
+set join_cache_level=@save_join_cache_level;
+
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 9ae86a287ac..31d3faa3205 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -591,6 +591,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     h2= NULL;
     int res= (h->inited == handler::INDEX && h->ha_index_end());
     h2= save_h2;
+    use_default_impl= FALSE;
     if (res)
       goto error;
   }
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 4b33b85c199..5eaff483c47 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -153,7 +153,7 @@ public:
       - it is adjacent to buffer space we're using
       - it is on the end towards which we grow.
     */
-    DBUG_ASSERT(unused_end > unused_start);
+    DBUG_ASSERT(unused_end >= unused_start);
     TRASH(unused_start, unused_end - unused_start);
     if (direction == 1 && end == unused_start)
     {

From 7f41516f4f3980a68808b4498f76b54c8bbe1969 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 13 Sep 2010 20:05:51 +0400
Subject: [PATCH 27/49] BUG#629684: Unreachable code in multi_range_read.cc in
 maria-5.3-dsmrr-cpk - More test coverage

---
 mysql-test/r/myisam_mrr.result | 23 +++++++++++++++++++++++
 mysql-test/t/myisam_mrr.test   | 16 ++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/mysql-test/r/myisam_mrr.result b/mysql-test/r/myisam_mrr.result
index 5db03db85ac..87a099426f5 100644
--- a/mysql-test/r/myisam_mrr.result
+++ b/mysql-test/r/myisam_mrr.result
@@ -413,4 +413,27 @@ explain select * from t1 where a < 20;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	5	NULL	20	Using index condition; Using MRR
 set optimizer_switch=@save_optimizer_switch;
+# 
+# BUG#629684: Unreachable code in multi_range_read.cc in maria-5.3-dsmrr-cpk
+#
+delete from t0 where a > 2;
+insert into t0 values (NULL),(NULL);
+insert into t1 values (NULL, 1234), (NULL, 5678);
+set @save_join_cache_level=@@join_cache_level;
+set @@join_cache_level=6;
+explain 
+select * from t0, t1 where t0.a<=>t1.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	5	
+1	SIMPLE	t1	ref	a	a	5	test.t0.a	1	Using index condition(BKA); Using join buffer
+select * from t0, t1 where t0.a<=>t1.a;
+a	a	b
+0	0	0
+1	1	1
+2	2	2
+NULL	NULL	1234
+NULL	NULL	1234
+NULL	NULL	5678
+NULL	NULL	5678
+set @@join_cache_level=@save_join_cache_level;
 drop table t0, t1;
diff --git a/mysql-test/t/myisam_mrr.test b/mysql-test/t/myisam_mrr.test
index d9afdf3140d..3ac414e8ca8 100644
--- a/mysql-test/t/myisam_mrr.test
+++ b/mysql-test/t/myisam_mrr.test
@@ -123,4 +123,20 @@ explain select * from t1 where a < 20;
 
 set optimizer_switch=@save_optimizer_switch;
 
+
+--echo # 
+--echo # BUG#629684: Unreachable code in multi_range_read.cc in maria-5.3-dsmrr-cpk
+--echo #
+
+delete from t0 where a > 2;
+insert into t0 values (NULL),(NULL);
+insert into t1 values (NULL, 1234), (NULL, 5678);
+
+set @save_join_cache_level=@@join_cache_level;
+set @@join_cache_level=6;
+explain 
+select * from t0, t1 where t0.a<=>t1.a;
+select * from t0, t1 where t0.a<=>t1.a;
+
+set @@join_cache_level=@save_join_cache_level;
 drop table t0, t1;

From 3a5c004bb5c860a337ec7469596d18ce1ae28702 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Wed, 15 Sep 2010 16:14:19 +0400
Subject: [PATCH 28/49] BUG#625841: Assertion `!table || (!table->read_set ||
 bitmap_is_set - When find_all_keys() checks which table columns are needed
 for table scan   that is done before the sorting, it should also analyze
 pushed index condition.   This is achieved by remembering/checking
 pre-index-pushed condition.

---
 mysql-test/r/myisam_mrr.result | 25 +++++++++++++++++++++++++
 mysql-test/t/myisam_mrr.test   | 29 +++++++++++++++++++++++++++++
 sql/filesort.cc                | 15 ++++++++++-----
 sql/opt_index_cond_pushdown.cc |  1 +
 sql/opt_range.cc               |  2 +-
 sql/opt_range.h                |  7 +++++++
 6 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/mysql-test/r/myisam_mrr.result b/mysql-test/r/myisam_mrr.result
index 87a099426f5..5b3e966f83d 100644
--- a/mysql-test/r/myisam_mrr.result
+++ b/mysql-test/r/myisam_mrr.result
@@ -437,3 +437,28 @@ NULL	NULL	5678
 NULL	NULL	5678
 set @@join_cache_level=@save_join_cache_level;
 drop table t0, t1;
+#
+# BUG#625841: Assertion `!table || (!table->read_set || bitmap_is_set
+#             (table->read_set, field_index))' on REPLACE ... SELECT with MRR
+#
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+key1 varchar(10),
+col1 char(255), col2 char(255),
+col3 char(244), col4 char(255),
+key(key1)
+);
+create table t2 like t1;
+insert into t1
+select
+1000+A.a+100*B.a + 10*C.a,
+'col1val', 'col2val',
+'col3val', 'col4val'
+from t0 A, t0 B, t0 C;
+REPLACE INTO t2(col2,col3,col4)
+SELECT col2,col3,col4
+FROM t1
+WHERE `key1` LIKE CONCAT( LEFT( '1' , 7 ) , '%' )
+ORDER BY col1 LIMIT 7;
+drop table t0, t1, t2;
diff --git a/mysql-test/t/myisam_mrr.test b/mysql-test/t/myisam_mrr.test
index 3ac414e8ca8..a9433337a10 100644
--- a/mysql-test/t/myisam_mrr.test
+++ b/mysql-test/t/myisam_mrr.test
@@ -140,3 +140,32 @@ select * from t0, t1 where t0.a<=>t1.a;
 
 set @@join_cache_level=@save_join_cache_level;
 drop table t0, t1;
+
+--echo #
+--echo # BUG#625841: Assertion `!table || (!table->read_set || bitmap_is_set
+--echo #             (table->read_set, field_index))' on REPLACE ... SELECT with MRR
+--echo #
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+  key1 varchar(10),
+  col1 char(255), col2 char(255),
+  col3 char(244), col4 char(255),
+  key(key1)
+);
+create table t2 like t1;
+
+insert into t1
+select
+  1000+A.a+100*B.a + 10*C.a,
+  'col1val', 'col2val',
+  'col3val', 'col4val'
+from t0 A, t0 B, t0 C;
+
+REPLACE INTO t2(col2,col3,col4)
+SELECT col2,col3,col4
+FROM t1
+WHERE `key1` LIKE CONCAT( LEFT( '1' , 7 ) , '%' )
+ORDER BY col1 LIMIT 7;
+drop table t0, t1, t2;
diff --git a/sql/filesort.cc b/sql/filesort.cc
index 41410929f15..f5c0b18ceb3 100644
--- a/sql/filesort.cc
+++ b/sql/filesort.cc
@@ -542,11 +542,6 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
 		    current_thd->variables.read_buff_size);
   }
 
-  if (quick_select)
-  {
-    if (select->quick->reset())
-      DBUG_RETURN(HA_POS_ERROR);
-  }
 
   /* Remember original bitmaps */
   save_read_set=  sort_form->read_set;
@@ -559,8 +554,18 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
   if (select && select->cond)
     select->cond->walk(&Item::register_field_in_read_map, 1,
                        (uchar*) sort_form);
+  if (select && select->pre_idx_push_select_cond)
+    select->pre_idx_push_select_cond->walk(&Item::register_field_in_read_map,
+                                           1, (uchar*) sort_form);
+
   sort_form->column_bitmaps_set(&sort_form->tmp_set, &sort_form->tmp_set);
 
+  if (quick_select)
+  {
+    if (select->quick->reset())
+      DBUG_RETURN(HA_POS_ERROR);
+  }
+
   for (;;)
   {
     if (quick_select)
diff --git a/sql/opt_index_cond_pushdown.cc b/sql/opt_index_cond_pushdown.cc
index 277343b81a5..2e5ad795668 100644
--- a/sql/opt_index_cond_pushdown.cc
+++ b/sql/opt_index_cond_pushdown.cc
@@ -378,6 +378,7 @@ void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
                                  QT_ORDINARY););
 
         tab->select->cond= tab->select_cond;
+        tab->select->pre_idx_push_select_cond= tab->pre_idx_push_select_cond;
       }
     }
   }
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index 62e39faa272..a64dd67a2cd 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -1119,7 +1119,7 @@ SQL_SELECT *make_select(TABLE *head, table_map const_tables,
 }
 
 
-SQL_SELECT::SQL_SELECT() :quick(0),cond(0),free_cond(0)
+SQL_SELECT::SQL_SELECT() :quick(0),cond(0),pre_idx_push_select_cond(NULL),free_cond(0)
 {
   quick_keys.clear_all(); needed_reg.clear_all();
   my_b_clear(&file);
diff --git a/sql/opt_range.h b/sql/opt_range.h
index 5abad749b58..ac2aaf08057 100644
--- a/sql/opt_range.h
+++ b/sql/opt_range.h
@@ -738,6 +738,13 @@ class SQL_SELECT :public Sql_alloc {
  public:
   QUICK_SELECT_I *quick;	// If quick-select used
   COND		*cond;		// where condition
+
+  /*
+    When using Index Condition Pushdown: condition that we've had before
+    extracting and pushing index condition.
+    In other cases, NULL.
+  */
+  Item *pre_idx_push_select_cond;
   TABLE	*head;
   IO_CACHE file;		// Positions to used records
   ha_rows records;		// Records in use if read from file

From 499b142ad512fcaf3b0bf2bf8073a4983445dc06 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Wed, 15 Sep 2010 16:58:01 +0400
Subject: [PATCH 29/49] BUG#628785: multi_range_read.cc:430: int
 DsMrr_impl::dsmrr_init(): Assertion `do_sort_keys || do_rowid_fetch' failed -
 Make DsMrr_impl::check_cpk_scan() follow the execution code's logic: don't
 do MRR scans on clustered PK when mrr_sort_keys=off.

---
 mysql-test/r/innodb_mrr.result | 30 ++++++++++++++++++++++++++++++
 mysql-test/t/innodb_mrr.test   | 31 +++++++++++++++++++++++++++++++
 sql/multi_range_read.cc        | 17 +++++++++--------
 sql/multi_range_read.h         |  2 +-
 4 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/mysql-test/r/innodb_mrr.result b/mysql-test/r/innodb_mrr.result
index 7b1c18d2523..1e50a8b84ed 100644
--- a/mysql-test/r/innodb_mrr.result
+++ b/mysql-test/r/innodb_mrr.result
@@ -402,3 +402,33 @@ SELECT * FROM t1 WHERE parent_id IS NOT NULL ORDER BY id DESC LIMIT 1;
 id	parent_id	name
 60	40	F
 drop table t1;
+#
+# BUG#628785: multi_range_read.cc:430: int DsMrr_impl::dsmrr_init(): Assertion `do_sort_keys || do_rowid_fetch' failed 
+#
+set @save_join_cache_level= @@join_cache_level;
+set @save_optimizer_switch= @@optimizer_switch;
+SET SESSION join_cache_level=9;
+Warnings:
+Warning	1292	Truncated incorrect join_cache_level value: '9'
+SET SESSION optimizer_switch='mrr_sort_keys=off';
+CREATE TABLE `t1` (
+`pk` int(11) NOT NULL AUTO_INCREMENT,
+`col_int_nokey` int(11) DEFAULT NULL,
+`col_int_key` int(11) DEFAULT NULL,
+`col_varchar_key` varchar(1) DEFAULT NULL,
+`col_varchar_nokey` varchar(1) DEFAULT NULL,
+PRIMARY KEY (`pk`),
+KEY `col_varchar_key` (`col_varchar_key`,`col_int_key`)
+) ENGINE=InnoDB AUTO_INCREMENT=101 DEFAULT CHARSET=latin1;
+INSERT INTO `t1` VALUES (1,6,NULL,'r','r');
+INSERT INTO `t1` VALUES (2,8,0,'c','c');
+INSERT INTO `t1` VALUES (97,7,0,'z','z');
+INSERT INTO `t1` VALUES (98,1,1,'j','j');
+INSERT INTO `t1` VALUES (99,7,8,'c','c');
+INSERT INTO `t1` VALUES (100,2,5,'f','f');
+SELECT table1 .`col_varchar_key`
+FROM t1 table1 STRAIGHT_JOIN ( t1 table3 JOIN t1 table4 ON table4 .`pk` = table3 .`col_int_nokey` ) ON table4 .`col_varchar_nokey` ;
+col_varchar_key
+DROP TABLE t1;
+set join_cache_level=@save_join_cache_level;
+set optimizer_switch=@save_optimizer_switch;
diff --git a/mysql-test/t/innodb_mrr.test b/mysql-test/t/innodb_mrr.test
index 0f5b41cef27..f2c7a83e068 100644
--- a/mysql-test/t/innodb_mrr.test
+++ b/mysql-test/t/innodb_mrr.test
@@ -123,3 +123,34 @@ SELECT id FROM t1 WHERE parent_id IS NOT NULL ORDER BY id DESC LIMIT 1;
 explain SELECT * FROM t1 FORCE INDEX (PRIMARY) WHERE parent_id IS NOT NULL ORDER BY id DESC LIMIT 1;
 SELECT * FROM t1 WHERE parent_id IS NOT NULL ORDER BY id DESC LIMIT 1;
 drop table t1;
+
+
+-- echo #
+-- echo # BUG#628785: multi_range_read.cc:430: int DsMrr_impl::dsmrr_init(): Assertion `do_sort_keys || do_rowid_fetch' failed 
+-- echo #
+set @save_join_cache_level= @@join_cache_level;
+set @save_optimizer_switch= @@optimizer_switch;
+SET SESSION join_cache_level=9;
+SET SESSION optimizer_switch='mrr_sort_keys=off';
+
+CREATE TABLE `t1` (
+  `pk` int(11) NOT NULL AUTO_INCREMENT,
+  `col_int_nokey` int(11) DEFAULT NULL,
+  `col_int_key` int(11) DEFAULT NULL,
+  `col_varchar_key` varchar(1) DEFAULT NULL,
+  `col_varchar_nokey` varchar(1) DEFAULT NULL,
+  PRIMARY KEY (`pk`),
+  KEY `col_varchar_key` (`col_varchar_key`,`col_int_key`)
+) ENGINE=InnoDB AUTO_INCREMENT=101 DEFAULT CHARSET=latin1;
+INSERT INTO `t1` VALUES (1,6,NULL,'r','r');
+INSERT INTO `t1` VALUES (2,8,0,'c','c');
+INSERT INTO `t1` VALUES (97,7,0,'z','z');
+INSERT INTO `t1` VALUES (98,1,1,'j','j');
+INSERT INTO `t1` VALUES (99,7,8,'c','c');
+INSERT INTO `t1` VALUES (100,2,5,'f','f');
+SELECT table1 .`col_varchar_key`
+FROM t1 table1 STRAIGHT_JOIN ( t1 table3 JOIN t1 table4 ON table4 .`pk` = table3 .`col_int_nokey` ) ON table4 .`col_varchar_nokey` ;
+DROP TABLE t1;
+set join_cache_level=@save_join_cache_level;
+set optimizer_switch=@save_optimizer_switch;
+
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 31d3faa3205..b6c9a5e16ab 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -474,8 +474,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   }
 
   do_rowid_fetch= FALSE;
-  doing_cpk_scan= check_cpk_scan(h->inited == handler::INDEX? 
-                                 h->active_index: h2->active_index, mode);
+  doing_cpk_scan= check_cpk_scan(thd, h->inited == handler::INDEX? 
+                                      h->active_index: h2->active_index, mode);
   if (!doing_cpk_scan /* && !index_only_read */)
   {
     /* Will use rowid buffer to store/sort rowids, etc */
@@ -1370,12 +1370,13 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
     FALSE  Otherwise
 */
 
-bool DsMrr_impl::check_cpk_scan(uint keyno, uint mrr_flags)
+bool DsMrr_impl::check_cpk_scan(THD *thd, uint keyno, uint mrr_flags)
 {
   return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
               !(mrr_flags & HA_MRR_SORTED) && 
               keyno == table->s->primary_key && 
-              h->primary_key_is_clustered());
+              h->primary_key_is_clustered() && 
+              optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS));
 }
 
 
@@ -1410,11 +1411,11 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
   bool res;
   THD *thd= current_thd;
 
-  doing_cpk_scan= check_cpk_scan(keyno, *flags); 
+  doing_cpk_scan= check_cpk_scan(thd, keyno, *flags); 
+  bool using_cpk= test(keyno == table->s->primary_key &&
+                       h->primary_key_is_clustered());
   if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
-      (keyno == table->s->primary_key && h->primary_key_is_clustered() &&
-       !doing_cpk_scan) ||
-       key_uses_partial_cols(table, keyno))
+      (using_cpk && !doing_cpk_scan) || key_uses_partial_cols(table, keyno))
   {
     /* Use the default implementation */
     *flags |= HA_MRR_USE_DEFAULT_IMPL;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 5eaff483c47..9cd1503596f 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -384,7 +384,7 @@ private:
                        COST_VECT *cost);
   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
                                uint *buffer_size, COST_VECT *cost);
-  bool check_cpk_scan(uint keyno, uint mrr_flags);
+  bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags);
   static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
   int dsmrr_fill_rowid_buffer();
   void dsmrr_fill_key_buffer();

From 7b9df6aab2d5cfbb899b8d17860f3e3de95e5154 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Wed, 15 Sep 2010 20:58:38 +0400
Subject: [PATCH 30/49] BUG#623300: Query with join_cache_level = 6 returns
 extra rows in maria-5.3-dsmrr-cpk - First part of the fix: enable Early NULLs
 filtering to work when WHERE clause is present

---
 mysql-test/r/order_by.result         |  2 +-
 mysql-test/r/select.result           | 14 +++++++-------
 mysql-test/r/select_jcl6.result      | 14 +++++++-------
 mysql-test/r/select_pkeycache.result | 14 +++++++-------
 mysql-test/r/subselect3.result       |  2 +-
 mysql-test/r/subselect3_jcl6.result  |  2 +-
 mysql-test/r/table_elim.result       |  6 +++---
 sql/sql_select.cc                    |  2 +-
 8 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/mysql-test/r/order_by.result b/mysql-test/r/order_by.result
index dcd40f66365..fc1cd57dd28 100644
--- a/mysql-test/r/order_by.result
+++ b/mysql-test/r/order_by.result
@@ -1489,7 +1489,7 @@ SELECT d FROM t1, t2
 WHERE t2.b=14 AND t2.a=t1.a AND 5.1<t2.c AND t1.b='DE'
 ORDER BY t2.c LIMIT 1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ref	a,b	b	4	const	4	Using index condition; Using temporary; Using filesort
+1	SIMPLE	t1	ref	a,b	b	4	const	4	Using index condition; Using where; Using temporary; Using filesort
 1	SIMPLE	t2	ref	a,b,c	a	40	test.t1.a,const	11	Using index condition
 SELECT d FROM t1, t2
 WHERE t2.b=14 AND t2.a=t1.a AND 5.1<t2.c AND t1.b='DE'
diff --git a/mysql-test/r/select.result b/mysql-test/r/select.result
index 5e9a336ca9e..a9745599615 100644
--- a/mysql-test/r/select.result
+++ b/mysql-test/r/select.result
@@ -3562,19 +3562,19 @@ EXPLAIN SELECT t2.*
 FROM t1 JOIN t2 ON t2.fk=t1.pk
 WHERE t2.fk < 'c' AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	3	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	3	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where
 EXPLAIN SELECT t2.* 
 FROM t1 JOIN t2 ON t2.fk=t1.pk 
 WHERE t2.fk BETWEEN 'a' AND 'b' AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where
 EXPLAIN SELECT t2.* 
 FROM t1 JOIN t2 ON t2.fk=t1.pk 
 WHERE t2.fk IN ('a','b') AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where
 DROP TABLE t1,t2;
 CREATE TABLE t1 (a int, b varchar(20) NOT NULL, PRIMARY KEY(a));
@@ -3608,7 +3608,7 @@ WHERE t1.id = 8 AND t2.i BETWEEN t1.b AND t1.e AND
 t3.a=t2.a AND t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si	si	5	NULL	4	Using index condition; Using MRR
+1	SIMPLE	t2	range	si	si	5	NULL	4	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where
 EXPLAIN
 SELECT t3.a FROM t1,t2,t3
@@ -3616,7 +3616,7 @@ WHERE t1.id = 8 AND t2.i BETWEEN t1.b AND t1.e AND
 t3.a=t2.a AND t3.c IN ('bb','ee') ;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si,ai	si	5	NULL	4	Using index condition; Using MRR
+1	SIMPLE	t2	range	si,ai	si	5	NULL	4	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where
 EXPLAIN 
 SELECT t3.a FROM t1,t2 FORCE INDEX (si),t3
@@ -3624,7 +3624,7 @@ WHERE t1.id = 8 AND (t2.i=t1.b OR t2.i=t1.e) AND t3.a=t2.a AND
 t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si	si	5	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t2	range	si	si	5	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where
 EXPLAIN 
 SELECT t3.a FROM t1,t2,t3
@@ -3632,7 +3632,7 @@ WHERE t1.id = 8 AND (t2.i=t1.b OR t2.i=t1.e) AND t3.a=t2.a AND
 t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si,ai	si	5	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t2	range	si,ai	si	5	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where
 DROP TABLE t1,t2,t3;
 CREATE TABLE t1 ( f1 int primary key, f2 int, f3 int, f4 int, f5 int, f6 int, checked_out int);
diff --git a/mysql-test/r/select_jcl6.result b/mysql-test/r/select_jcl6.result
index ff09e79a511..68579f43345 100644
--- a/mysql-test/r/select_jcl6.result
+++ b/mysql-test/r/select_jcl6.result
@@ -3566,19 +3566,19 @@ EXPLAIN SELECT t2.*
 FROM t1 JOIN t2 ON t2.fk=t1.pk
 WHERE t2.fk < 'c' AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	3	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	3	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where; Using join buffer
 EXPLAIN SELECT t2.* 
 FROM t1 JOIN t2 ON t2.fk=t1.pk 
 WHERE t2.fk BETWEEN 'a' AND 'b' AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where; Using join buffer
 EXPLAIN SELECT t2.* 
 FROM t1 JOIN t2 ON t2.fk=t1.pk 
 WHERE t2.fk IN ('a','b') AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where; Using join buffer
 DROP TABLE t1,t2;
 CREATE TABLE t1 (a int, b varchar(20) NOT NULL, PRIMARY KEY(a));
@@ -3612,7 +3612,7 @@ WHERE t1.id = 8 AND t2.i BETWEEN t1.b AND t1.e AND
 t3.a=t2.a AND t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si	si	5	NULL	4	Using index condition; Using MRR
+1	SIMPLE	t2	range	si	si	5	NULL	4	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where; Using join buffer
 EXPLAIN
 SELECT t3.a FROM t1,t2,t3
@@ -3620,7 +3620,7 @@ WHERE t1.id = 8 AND t2.i BETWEEN t1.b AND t1.e AND
 t3.a=t2.a AND t3.c IN ('bb','ee') ;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si,ai	si	5	NULL	4	Using index condition; Using MRR
+1	SIMPLE	t2	range	si,ai	si	5	NULL	4	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where; Using join buffer
 EXPLAIN 
 SELECT t3.a FROM t1,t2 FORCE INDEX (si),t3
@@ -3628,7 +3628,7 @@ WHERE t1.id = 8 AND (t2.i=t1.b OR t2.i=t1.e) AND t3.a=t2.a AND
 t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si	si	5	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t2	range	si	si	5	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where; Using join buffer
 EXPLAIN 
 SELECT t3.a FROM t1,t2,t3
@@ -3636,7 +3636,7 @@ WHERE t1.id = 8 AND (t2.i=t1.b OR t2.i=t1.e) AND t3.a=t2.a AND
 t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si,ai	si	5	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t2	range	si,ai	si	5	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where; Using join buffer
 DROP TABLE t1,t2,t3;
 CREATE TABLE t1 ( f1 int primary key, f2 int, f3 int, f4 int, f5 int, f6 int, checked_out int);
diff --git a/mysql-test/r/select_pkeycache.result b/mysql-test/r/select_pkeycache.result
index 5e9a336ca9e..a9745599615 100644
--- a/mysql-test/r/select_pkeycache.result
+++ b/mysql-test/r/select_pkeycache.result
@@ -3562,19 +3562,19 @@ EXPLAIN SELECT t2.*
 FROM t1 JOIN t2 ON t2.fk=t1.pk
 WHERE t2.fk < 'c' AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	3	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	3	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where
 EXPLAIN SELECT t2.* 
 FROM t1 JOIN t2 ON t2.fk=t1.pk 
 WHERE t2.fk BETWEEN 'a' AND 'b' AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where
 EXPLAIN SELECT t2.* 
 FROM t1 JOIN t2 ON t2.fk=t1.pk 
 WHERE t2.fk IN ('a','b') AND t2.pk=t1.fk;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t1	range	PRIMARY	PRIMARY	12	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t2	eq_ref	PRIMARY	PRIMARY	18	test.t1.fk	1	Using where
 DROP TABLE t1,t2;
 CREATE TABLE t1 (a int, b varchar(20) NOT NULL, PRIMARY KEY(a));
@@ -3608,7 +3608,7 @@ WHERE t1.id = 8 AND t2.i BETWEEN t1.b AND t1.e AND
 t3.a=t2.a AND t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si	si	5	NULL	4	Using index condition; Using MRR
+1	SIMPLE	t2	range	si	si	5	NULL	4	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where
 EXPLAIN
 SELECT t3.a FROM t1,t2,t3
@@ -3616,7 +3616,7 @@ WHERE t1.id = 8 AND t2.i BETWEEN t1.b AND t1.e AND
 t3.a=t2.a AND t3.c IN ('bb','ee') ;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si,ai	si	5	NULL	4	Using index condition; Using MRR
+1	SIMPLE	t2	range	si,ai	si	5	NULL	4	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where
 EXPLAIN 
 SELECT t3.a FROM t1,t2 FORCE INDEX (si),t3
@@ -3624,7 +3624,7 @@ WHERE t1.id = 8 AND (t2.i=t1.b OR t2.i=t1.e) AND t3.a=t2.a AND
 t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si	si	5	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t2	range	si	si	5	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where
 EXPLAIN 
 SELECT t3.a FROM t1,t2,t3
@@ -3632,7 +3632,7 @@ WHERE t1.id = 8 AND (t2.i=t1.b OR t2.i=t1.e) AND t3.a=t2.a AND
 t3.c IN ('bb','ee');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	const	PRIMARY	PRIMARY	4	const	1	
-1	SIMPLE	t2	range	si,ai	si	5	NULL	2	Using index condition; Using MRR
+1	SIMPLE	t2	range	si,ai	si	5	NULL	2	Using index condition; Using where; Using MRR
 1	SIMPLE	t3	eq_ref	PRIMARY,ci	PRIMARY	4	test.t2.a	1	Using where
 DROP TABLE t1,t2,t3;
 CREATE TABLE t1 ( f1 int primary key, f2 int, f3 int, f4 int, f5 int, f6 int, checked_out int);
diff --git a/mysql-test/r/subselect3.result b/mysql-test/r/subselect3.result
index 665bb9a2bde..440d90c2866 100644
--- a/mysql-test/r/subselect3.result
+++ b/mysql-test/r/subselect3.result
@@ -1130,7 +1130,7 @@ insert into t4 select a from t3;
 explain select * from t3 where a in (select t1.kp1 from t1,t4 where kp1<20
 and t4.pk=t1.c);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1	range	kp1	kp1	5	NULL	48	Using index condition; Using MRR; LooseScan
+1	PRIMARY	t1	range	kp1	kp1	5	NULL	48	Using index condition; Using where; Using MRR; LooseScan
 1	PRIMARY	t4	eq_ref	PRIMARY	PRIMARY	4	test.t1.c	1	Using index; FirstMatch(t1)
 1	PRIMARY	t3	ALL	NULL	NULL	NULL	NULL	100	Using where; Using join buffer
 drop table t1, t3, t4;
diff --git a/mysql-test/r/subselect3_jcl6.result b/mysql-test/r/subselect3_jcl6.result
index d25ca436311..a45197aa77e 100644
--- a/mysql-test/r/subselect3_jcl6.result
+++ b/mysql-test/r/subselect3_jcl6.result
@@ -1135,7 +1135,7 @@ insert into t4 select a from t3;
 explain select * from t3 where a in (select t1.kp1 from t1,t4 where kp1<20
 and t4.pk=t1.c);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	PRIMARY	t1	range	kp1	kp1	5	NULL	48	Using index condition; Using MRR; LooseScan
+1	PRIMARY	t1	range	kp1	kp1	5	NULL	48	Using index condition; Using where; Using MRR; LooseScan
 1	PRIMARY	t4	eq_ref	PRIMARY	PRIMARY	4	test.t1.c	1	Using index; FirstMatch(t1)
 1	PRIMARY	t3	ALL	NULL	NULL	NULL	NULL	100	Using where; Using join buffer
 drop table t1, t3, t4;
diff --git a/mysql-test/r/table_elim.result b/mysql-test/r/table_elim.result
index a1b8ba0018d..d2ec09429da 100644
--- a/mysql-test/r/table_elim.result
+++ b/mysql-test/r/table_elim.result
@@ -128,7 +128,7 @@ Note	1003	select `f`.`id` AS `id` from `test`.`t0` `f` where (`f`.`id` in (1,2,3
 This should use facts and a1 tables:
 explain extended select id from v1 where attr1 between 12 and 14;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	PRIMARY	a1	range	PRIMARY,attr1	attr1	5	NULL	2	100.00	Using index condition; Using MRR
+1	PRIMARY	a1	range	PRIMARY,attr1	attr1	5	NULL	2	100.00	Using index condition; Using where; Using MRR
 1	PRIMARY	f	eq_ref	PRIMARY	PRIMARY	4	test.a1.id	1	100.00	Using index
 Warnings:
 Note	1276	Field or reference 'test.a2.id' of SELECT #3 was resolved in SELECT #1
@@ -156,7 +156,7 @@ Note	1003	select `f`.`id` AS `id` from `test`.`t0` `f` where (`f`.`id` in (1,2,3
 This should use facts and a1 tables:
 explain extended select id from v2 where attr1 between 12 and 14;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	PRIMARY	a1	range	PRIMARY,attr1	attr1	5	NULL	2	100.00	Using index condition; Using MRR
+1	PRIMARY	a1	range	PRIMARY,attr1	attr1	5	NULL	2	100.00	Using index condition; Using where; Using MRR
 1	PRIMARY	f	eq_ref	PRIMARY	PRIMARY	4	test.a1.id	1	100.00	Using index
 Warnings:
 Note	1276	Field or reference 'test.f.id' of SELECT #3 was resolved in SELECT #1
@@ -164,7 +164,7 @@ Note	1003	select `f`.`id` AS `id` from `test`.`t0` `f` join `test`.`t1` `a1` whe
 This should use facts, a2 and its subquery:
 explain extended select id from v2 where attr2 between 12 and 14;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	PRIMARY	a2	range	PRIMARY,attr2	attr2	5	NULL	5	100.00	Using index condition; Using MRR
+1	PRIMARY	a2	range	PRIMARY,attr2	attr2	5	NULL	5	100.00	Using index condition; Using where; Using MRR
 1	PRIMARY	f	eq_ref	PRIMARY	PRIMARY	4	test.a2.id	1	100.00	Using where; Using index
 3	DEPENDENT SUBQUERY	t2	ref	PRIMARY	PRIMARY	4	test.f.id	2	100.00	Using index
 Warnings:
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 7e9e880a37a..51f847911f9 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -6637,7 +6637,6 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
   DBUG_ENTER("make_join_select");
   if (select)
   {
-    add_not_null_conds(join);
     table_map used_tables;
     /*
       Step #1: Extract constant condition
@@ -7082,6 +7081,7 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
       }
 
     }
+    add_not_null_conds(join);
   }
   DBUG_RETURN(0);
 }

From 18a348503a3124f3fcf73fc236948a10d4c3c29d Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 19 Sep 2010 01:05:47 +0400
Subject: [PATCH 31/49] DS-MRR improvements: better comments, use symbolic name
 instead of +1/-1 constants.

---
 sql/multi_range_read.cc |  26 ++++--
 sql/multi_range_read.h  | 183 ++++++++++++++++++++++++++--------------
 2 files changed, 137 insertions(+), 72 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index b6c9a5e16ab..e86bd8470b7 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -338,6 +338,7 @@ bool SimpleBuffer::have_space_for(size_t bytes)
     return (write_pos - bytes >= start);
 }
 
+
 size_t SimpleBuffer::used_size()
 {
   return (direction == 1)? write_pos - read_pos : read_pos - write_pos;
@@ -354,16 +355,18 @@ void SimpleBuffer::setup_reading(uchar **data1, size_t len1,
   read_size2= len2;
 }
 
+
 bool SimpleBuffer::read()
 {
   if (!have_data(read_size1 + (read_ptr2? read_size2 : 0)))
     return TRUE;
-  *read_ptr1 =read(read_size1);
+  *read_ptr1= read(read_size1);
   if (read_ptr2)
     *read_ptr2= read(read_size2);
   return FALSE;
 }
 
+
 uchar *SimpleBuffer::read(size_t bytes)
 {
   DBUG_ASSERT(have_data(bytes));
@@ -381,12 +384,14 @@ uchar *SimpleBuffer::read(size_t bytes)
   }
 }
 
+
 bool SimpleBuffer::have_data(size_t bytes)
 {
   return (direction == 1)? (write_pos - read_pos >= (ptrdiff_t)bytes) : 
                            (read_pos - write_pos >= (ptrdiff_t)bytes);
 }
 
+
 void SimpleBuffer::reset_for_writing()
 {
   if (direction == 1)
@@ -493,7 +498,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   */
   full_buf= buf->buffer;
   full_buf_end= buf->buffer_end;
-  rowid_buffer.set_buffer_space(full_buf, full_buf_end, 1);
+  rowid_buffer.set_buffer_space(full_buf, full_buf_end, SimpleBuffer::FORWARD);
   
   if (do_sort_keys)
   {
@@ -819,10 +824,11 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   if (!do_rowid_fetch)
   {
     /* Give all space to key buffer. */
-    key_buffer.set_buffer_space(full_buf, full_buf_end, 1);
+    key_buffer.set_buffer_space(full_buf, full_buf_end, SimpleBuffer::FORWARD);
 
     /* Just in case, tell rowid buffer that it has zero size: */
-    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end, 1);
+    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end, 
+                                  SimpleBuffer::FORWARD);
     return;
   }
   
@@ -865,8 +871,10 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   }
 
   rowid_buffer_end= full_buf + bytes_for_rowids;
-  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 1);
-  key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1); 
+  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 
+                                SimpleBuffer::FORWARD);
+  key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, 
+                              SimpleBuffer::BACKWARD); 
 }
 
 
@@ -906,8 +914,10 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
         We're using two buffers and both of them are empty now. Restore the
         original sizes
       */
-      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 1);
-      key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, -1);
+      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end,
+                                    SimpleBuffer::FORWARD);
+      key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end,
+                                  SimpleBuffer::BACKWARD);
     }
     key_buffer.reset_for_writing();
     key_buffer.setup_writing(&key_ptr, key_size_in_keybuf,
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 9cd1503596f..21cc0a49d74 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -6,9 +6,14 @@
 /**
   A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
 
-  This is a "plugin"(*) for storage engines that allows make index scans 
-  read table rows in rowid order. For disk-based storage engines, this is
-  faster than reading table rows in whatever-SQL-layer-makes-calls-in order.
+  This is a "plugin"(*) for storage engines that allows to
+    1. When doing index scans, read table rows in rowid order;
+    2. when making many index lookups, do them in key order and don't
+       lookup the same key value multiple times;
+    3. Do both #1 and #2, when applicable.
+  These changes are expected to speed up query execution for disk-based 
+  storage engines running io-bound loads and "big" queries (ie. queries that
+  do joins and enumerate lots of records).
 
   (*) - only conceptually. No dynamic loading or binary compatibility of any
         kind.
@@ -17,20 +22,25 @@
    
       SQL Layer code
        |   |   |
-      -v---v---v---- handler->multi_range_read_XXX() function calls
+       v   v   v 
+      -|---|---|---- handler->multi_range_read_XXX() function calls
        |   |   |
-      ____________________________________
-     / DS-MRR module                      \
-     |  (scan indexes, order rowids, do    |
-     |   full record reads in rowid order) |
-     \____________________________________/
+      _____________________________________
+     / DS-MRR module                       \
+     | (order/de-duplicate lookup keys,    |
+     | scan indexes in key order,          |
+     | order/de-duplicate rowids,          |
+     | retrieve full record reads in rowid |
+     | order)                              |
+     \_____________________________________/
        |   |   |
       -|---|---|----- handler->read_range_first()/read_range_next(), 
        |   |   |      handler->index_read(), handler->rnd_pos() calls.
        |   |   |
        v   v   v
       Storage engine internals
-   
+
+
   Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines.
   Potentially it can be used with any table handler that has disk-based data
   storage and has better performance when reading data in rowid order.
@@ -38,77 +48,104 @@
 
 
 /*
-  A simple memory buffer for reading and writing.
+  An in-memory buffer used by DS-MRR implementation. 
+  - The buffer contains fixed-size elements. The elements are either atomic
+    byte sequences or pairs.
+  - The buffer resides in memory provided by the user. It is possible to
+     = dynamically (ie. between write operations) add ajacent memory space to
+       the buffer
+     = dynamically remove unused space from the buffer.
+  - Buffer can be set to be either "forward" or "backward". 
 
-  when writing, there is no user-visible "current" position, although
-  internally 'pos' points to just after the end of used area  (or at the 
-  start of it for reverse buffer).
+  The intent of the last two properties is to allow to have two buffers on
+  adjacent memory space, one is being read from (and so its space shrinks)
+  while the other is being written to (and so it needs more and more space).
+
+  Illustration of forward buffer operation:
+
+                         +-- next read will read from here
+                         |
+                         |               +-- next write will write to here
+                         v               v
+        *--------------*===============*----------------*
+        |       ^      |          ^    |                |
+        |       |      read_pos   |    write_pos        |
+        start   |                 |                     end
+                |                 |            
+              usused space         user data
 
-  When reading, there is current position pointing at start (for reverse
-  buffer, end) of the element that will be read next.
-   ^^ why end for reverse? it's more logical to point at start 
 */
 
 class SimpleBuffer
 {
-  uchar *start;
-  uchar *end;
-  uchar *read_pos;
-  uchar *write_pos;
-  
+public:
+
+  enum enum_direction {
+    BACKWARD=-1, /* buffer is filled/read from bigger to smaller memory addresses */
+    FORWARD=1  /* buffer is filled/read from smaller to bigger memory addresses */
+  };
+
+private:
+  enum_direction direction;
+
+  uchar *start; /* points to start of buffer space */
+  uchar *end;   /* points to just beyond the end of buffer space */
   /*
-     1 <=> buffer grows/is filled/is read from start to end
-    -1 <=> everthing is done from end to start instead.
+    Forward buffer: points to the start of the data that will be read next
+    Backward buffer: points to just beyond the end of the data that will be 
+    read next.
+  */
+  uchar *read_pos;
+  /*
+    Forward buffer: points to just after the end of the used area.
+    Backward buffer: points to the start of used area.
+  */
+  uchar *write_pos;
+
+  /* 
+    Data to be written. write() call will assume that (*write_ptr1) points to 
+    write_size1 bytes of data to be written.
+    If write_ptr2!=NULL then the buffer stores pairs, and (*write_ptr2) points
+    to write_size2 bytes of data that form the second component.
   */
-  int direction;
-  
-  /* Pointers to read data from */
   uchar **write_ptr1;
   size_t write_size1;
-  /* Same as above, but may be NULL */
   uchar **write_ptr2;
   size_t write_size2;
 
-  /* Pointers to write data to */
+  /*
+    read() will do reading by storing pointer to read data into *read_ptr1 (if
+    the buffer stores atomic elements), or into {*read_ptr1, *read_ptr2} (if
+    the buffer stores pairs).
+  */
+  //TODO if write_size1 == read_size1 why have two variables??
   uchar **read_ptr1;
   size_t read_size1;
-  /* Same as above, but may be NULL */
   uchar **read_ptr2;
   size_t read_size2;
 
-  bool have_space_for(size_t bytes);
-  uchar *used_area() { return (direction == 1)? read_pos : write_pos; }
-  size_t used_size();
-
-  void write(const uchar *data, size_t bytes);
-  uchar *read(size_t bytes);
-
 public:
-  /* Set up writing*/
+  /* Write-mode functions */
   void setup_writing(uchar **data1, size_t len1, 
                      uchar **data2, size_t len2);
-
-  void sort(qsort2_cmp cmp_func, void *cmp_func_arg);
-
-  /* Write-mode functions */
   void reset_for_writing();
-  void write();
   bool can_write();
-
-  bool is_empty() { return used_size() == 0; }
+  void write();
 
   /* Read-mode functions */
+  bool is_empty() { return used_size() == 0; }
   void reset_for_reading();
-  // todo: join with setup-writing? (but what for?)
   void setup_reading(uchar **data1, size_t len1, 
                      uchar **data2, size_t len2);
   bool read();
 
-  bool have_data(size_t bytes);
+  /* Misc functions */
+  void sort(qsort2_cmp cmp_func, void *cmp_func_arg);
+  bool is_reverse() { return direction == BACKWARD; }
   uchar *end_of_space();
 
-  /* Control functions */
-  void set_buffer_space(uchar *start_arg, uchar *end_arg, int direction_arg) 
+  /* Buffer space control functions */
+  void set_buffer_space(uchar *start_arg, uchar *end_arg, enum_direction direction_arg) 
   {
     start= start_arg;
     end= end_arg;
@@ -116,10 +153,10 @@ public:
     TRASH(start, end - start);
     reset_for_writing();
   }
-  
+
   /*
-    Stop/return the unneded space (the one that we have wrote to and have read
-    from.
+    Stop using/return the unneded space (the one that we have already wrote 
+    to read from).
   */
   void remove_unused_space(uchar **unused_start, uchar **unused_end)
   {
@@ -142,9 +179,8 @@ public:
     uchar *tmp= read_pos;
     read_pos= write_pos;
     write_pos= tmp;
-    direction= -direction;
+    direction= (direction == FORWARD)? BACKWARD: FORWARD;
   }
-  bool is_reverse() { return direction == -1; }
 
   void grow(uchar *unused_start, uchar *unused_end)
   {
@@ -173,10 +209,13 @@ public:
   */
   class PeekIterator
   {
-    // if direction==1 : pointer to what to return next
-    // if direction==-1: pointer to the end of what is to be returned next
+    /*
+      if sb->direction==1 : pointer to what to return next
+      if sb->direction==-1: pointer to the end of what is to be returned next
+    */
     uchar *pos;
     SimpleBuffer *sb;
+    
   public:
     void init(SimpleBuffer *sb_arg)
     {
@@ -190,8 +229,10 @@ public:
     */
     bool read_next()
     {
-      // Always read the first component first? (because we do inverted-writes
-      // if needed, so no measures need to be taken here).
+      /* 
+        Always read the first component first (if the buffer is backwards, we
+        have written the second component first).
+      */
       uchar *res;
       if ((res= get_next(sb->read_size1)))
       {
@@ -223,6 +264,16 @@ public:
       }
     }
   };
+
+private:
+  bool have_space_for(size_t bytes);
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return (direction == FORWARD)? read_pos : write_pos; }
+  size_t used_size();
+
+  void write(const uchar *data, size_t bytes);
+  uchar *read(size_t bytes);
+  bool have_data(size_t bytes);
 };
 
 
@@ -231,11 +282,11 @@ public:
   each ha_{myisam/innobase/etc} object. That object will be further referred to
   as "the handler"
 
-  There are actually three strategies
-   S1. Bypass DS-MRR, pass all calls to default implementation (i.e. to
+  DsMrr_impl has the following execution strategies:
+   S1. Bypass DS-MRR, pass all calls to default MRR implementation (i.e. to
       MRR-to-non-MRR calls converter)
-   S2. Regular DS-MRR 
-   S3. DS-MRR/CPK for doing scans on clustered primary keys.
+   S2. Sort Keys
+   S3. Sort Rowids
 
   S1 is used for cases which DS-MRR is unable to handle for some reason.
 
@@ -294,9 +345,13 @@ private:
   handler *h;
   TABLE *table; /* Always equal to h->table */
 
-  /* Secondary handler object.  It is used for scanning the index */
+  /*
+    Secondary handler object, if needed (we need it when we need to both scan
+    the index and return rows).
+  */
   handler *h2;
-
+  
+  /* Full buffer that we're using (the buffer is obtained from SQL layer) */
   uchar *full_buf;
   uchar *full_buf_end;
   

From 189555f39a8a5edeb2ca4f780cee13c44144fa5c Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 20 Sep 2010 13:02:17 +0400
Subject: [PATCH 32/49] DS-MRR improvements: more code cleanup - better
 comments - rename variables to better reflect their meaning

---
 sql/multi_range_read.cc |  75 +++++++-------
 sql/multi_range_read.h  | 213 ++++++++++++++++++++++++----------------
 2 files changed, 165 insertions(+), 123 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index e86bd8470b7..083f18f86c8 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -400,17 +400,6 @@ void SimpleBuffer::reset_for_writing()
     write_pos= read_pos= end;
 }
 
-void SimpleBuffer::reset_for_reading()
-{
-/*
-Do we need this at all?
-  if (direction == 1)
-    pos= start;
-  else
-    pos= end;
-//end?
-*/
-}
 
 uchar *SimpleBuffer::end_of_space()
 {
@@ -478,15 +467,20 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS);
   }
 
-  do_rowid_fetch= FALSE;
-  doing_cpk_scan= check_cpk_scan(thd, h->inited == handler::INDEX? 
+  do_rndpos_scan= FALSE;
+  bool doing_cpk_scan= check_cpk_scan(thd, h->inited == handler::INDEX? 
                                       h->active_index: h2->active_index, mode);
   if (!doing_cpk_scan /* && !index_only_read */)
   {
     /* Will use rowid buffer to store/sort rowids, etc */
-    do_rowid_fetch= TRUE;
+    do_rndpos_scan= TRUE;
   }
-  DBUG_ASSERT(do_sort_keys || do_rowid_fetch);
+
+  /* 
+    We should either sort keys, or do ordered rnd_pos scan, or both. If we
+    decide to do neither, we should have used default MRR implementation.
+  */
+  DBUG_ASSERT(do_sort_keys || do_rndpos_scan);
 
   
   if (is_mrr_assoc)
@@ -509,11 +503,11 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     keyno= (h->inited == handler::INDEX)? h->active_index : h2->active_index;
     dsmrr_fill_key_buffer();
     
-    if (dsmrr_eof && !do_rowid_fetch)
+    if (dsmrr_eof && !do_rndpos_scan)
       buf->end_of_used_area= key_buffer.end_of_space();
   }
 
-  if (!do_rowid_fetch)
+  if (!do_rndpos_scan)
   {
     /* 
       We have the keys and won't need to fetch rowids, as key lookup will be
@@ -523,11 +517,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   }
 
   rowid_buff_elem_size= h->ref_length + (is_mrr_assoc? sizeof(char*) : 0);
-  /*
-    psergey2: this is only needed when 
-      - doing a rowid-to-row scan
-      - the buffer wasn't exhausted on the first pass.
-  */
   /*
     There can be two cases:
     - This is the first call since index_init(), h2==NULL
@@ -821,7 +810,7 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   index_ranges_unique= test(key_info->flags & HA_NOSAME && 
                             key_info->key_parts == 
                               my_count_bits(sample_key->keypart_map));
-  if (!do_rowid_fetch)
+  if (!do_rndpos_scan)
   {
     /* Give all space to key buffer. */
     key_buffer.set_buffer_space(full_buf, full_buf_end, SimpleBuffer::FORWARD);
@@ -908,7 +897,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
   uchar *key_ptr;
   if (know_key_tuple_params)
   {
-    if (do_rowid_fetch && rowid_buffer.is_empty())
+    if (do_rndpos_scan && rowid_buffer.is_empty())
     {
       /*
         We're using two buffers and both of them are empty now. Restore the
@@ -963,6 +952,18 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
 }
 
 
+/*
+  Take unused space from key buffer and give it to rowid buffer.
+*/
+
+void DsMrr_impl::reallocate_buffer_space()
+{
+  uchar *unused_start, *unused_end;
+  key_buffer.remove_unused_space(&unused_start, &unused_end);
+  rowid_buffer.grow(unused_start, unused_end);
+}
+
+
 /*
   DS-MRR/CPK: multi_range_read_next() function
 
@@ -993,7 +994,7 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 {
   int res;
   uchar *key_in_buf;
-  handler *file= do_rowid_fetch? h2: h;
+  handler *file= do_rndpos_scan? h2: h;
   bool res2;
 
   while (in_identical_keys_range)
@@ -1068,7 +1069,7 @@ check_record:
         When rowid fetching is used, it controls all buffer refills. When we're
         on our own, try refilling our buffer.
       */
-      if (!do_rowid_fetch)
+      if (!do_rndpos_scan)
         dsmrr_fill_key_buffer();
 
       if (key_buffer.is_empty())
@@ -1078,17 +1079,13 @@ check_record:
       }
     }
     
-    if (do_rowid_fetch)
-    {
-      /*
-        At this point we're not using anything what we've read from key
-        buffer. Cut off unused key buffer space and give it to the rowid
-        buffer.
-      */
-      uchar *unused_start, *unused_end;
-      key_buffer.remove_unused_space(&unused_start, &unused_end);
-      rowid_buffer.grow(unused_start, unused_end);
-    }
+    /*
+      At this point we're not using anything what we've read from key
+      buffer. Cut off unused key buffer space and give it to the rowid
+      buffer.
+    */
+    if (do_rndpos_scan)
+      reallocate_buffer_space();
 
     /* Get the next range to scan */
     key_buffer.read(); // reads to (cur_index_tuple, cur_range_info)
@@ -1147,7 +1144,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
   if (use_default_impl)
     return h->handler::multi_range_read_next(range_info);
 
-  if (!do_rowid_fetch)
+  if (!do_rndpos_scan)
     return dsmrr_next_from_index(range_info);
   
   while (last_identical_rowid)
@@ -1421,7 +1418,7 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
   bool res;
   THD *thd= current_thd;
 
-  doing_cpk_scan= check_cpk_scan(thd, keyno, *flags); 
+  bool doing_cpk_scan= check_cpk_scan(thd, keyno, *flags); 
   bool using_cpk= test(keyno == table->s->primary_key &&
                        h->primary_key_is_clustered());
   if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 21cc0a49d74..06809c22c5e 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -73,7 +73,9 @@
         start   |                 |                     end
                 |                 |            
               usused space         user data
-
+  
+  For reverse buffer, start/end have the same meaning, but reading and 
+  writing is done from end to start.
 */
 
 class SimpleBuffer
@@ -134,7 +136,6 @@ public:
 
   /* Read-mode functions */
   bool is_empty() { return used_size() == 0; }
-  void reset_for_reading();
   void setup_reading(uchar **data1, size_t len1, 
                      uchar **data2, size_t len2);
   bool read();
@@ -209,23 +210,31 @@ public:
   */
   class PeekIterator
   {
+    SimpleBuffer *buf; /* The buffer we're iterating over*/
     /*
-      if sb->direction==1 : pointer to what to return next
-      if sb->direction==-1: pointer to the end of what is to be returned next
+      if buf->direction==FORWARD  : pointer to what to return next
+      if buf->direction==BACKWARD : pointer to the end of what is to be 
+                                   returned next
     */
     uchar *pos;
-    SimpleBuffer *sb;
-    
   public:
-    void init(SimpleBuffer *sb_arg)
+    /* 
+      Initialize the iterator. After initialization, the first read_next() call
+      will read what buf_arg->read() would read.
+    */
+    void init(SimpleBuffer *buf_arg)
     {
-      sb= sb_arg;
-      pos= sb->read_pos;
+      buf= buf_arg;
+      pos= buf->read_pos;
     }
     
     /*
-      If the buffer stores tuples, this call will return pointer to the first
-      component.
+      Read the next value. The calling convention is the same as buf->read()
+      has.
+
+      RETURN
+        FALSE - Ok
+        TRUE  - EOF, reached the end of the buffer
     */
     bool read_next()
     {
@@ -234,11 +243,11 @@ public:
         have written the second component first).
       */
       uchar *res;
-      if ((res= get_next(sb->read_size1)))
+      if ((res= get_next(buf->read_size1)))
       {
-        *(sb->read_ptr1)= res;
-        if (sb->read_ptr2)
-          *sb->read_ptr2= get_next(sb->read_size2);
+        *(buf->read_ptr1)= res;
+        if (buf->read_ptr2)
+          *buf->read_ptr2= get_next(buf->read_size2);
         return FALSE;
       }
       return TRUE; /* EOF */
@@ -247,9 +256,9 @@ public:
     /* Return pointer to next chunk of nbytes bytes and avance over it */
     uchar *get_next(size_t nbytes)
     {
-      if (sb->direction == 1)
+      if (buf->direction == 1)
       {
-        if (pos + nbytes > sb->write_pos)
+        if (pos + nbytes > buf->write_pos)
           return NULL;
         uchar *res= pos;
         pos += nbytes;
@@ -257,7 +266,7 @@ public:
       }
       else
       {
-        if (pos - nbytes < sb->write_pos)
+        if (pos - nbytes < buf->write_pos)
           return NULL;
         pos -= nbytes;
         return pos;
@@ -288,6 +297,8 @@ private:
    S2. Sort Keys
    S3. Sort Rowids
 
+  psergey-TODO.
+
   S1 is used for cases which DS-MRR is unable to handle for some reason.
 
   S2 is the actual DS-MRR. The basic algorithm is as follows:
@@ -339,75 +350,78 @@ public:
                             uint *flags, COST_VECT *cost);
 private:
   /*
-    The "owner" handler object (the one that calls dsmrr_XXX functions.
-    It is used to retrieve full table rows by calling rnd_pos().
+    The "owner" handler object (the one that is expected to "own" this object
+    and call its functions).
   */
   handler *h;
   TABLE *table; /* Always equal to h->table */
 
   /*
-    Secondary handler object, if needed (we need it when we need to both scan
-    the index and return rows).
+    Secondary handler object. (created when needed, we need it when we need 
+    to run both index scan and rnd_pos() at the same time)
   */
   handler *h2;
   
-  /* Full buffer that we're using (the buffer is obtained from SQL layer) */
+  /** Properties of current MRR scan **/
+
+  uint keyno; /* index we're running the scan on */
+  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+  /* TRUE <=> sort the keys before making index lookups */
+  bool do_sort_keys;
+  /* TRUE <=> sort rowids and use rnd_pos() to get and return full records */
+  bool do_rndpos_scan;
+
+  /*
+    (if do_sort_keys==TRUE) don't copy key values, use pointers to them 
+    instead.
+  */
+  bool use_key_pointers;
+
+
+  /* The whole buffer space that we're using */
   uchar *full_buf;
   uchar *full_buf_end;
   
-  /* Valid when using both rowid and key buffer: the original bound between them */
-  uchar *rowid_buffer_end;
-
-  /* Buffer to store rowids, or (rowid, range_id) pairs */
-  SimpleBuffer rowid_buffer;
-  
-  /*  Reads from rowid buffer go to here: */
-  uchar *rowid;
-  uchar *rowids_range_id;
-  
-  /*
-    not-NULL: we're traversing a group of (rowid, range_id) pairs with
-              identical rowid values, and this is the pointer to the last one.
-    NULL: we're not in the group of indentical rowids.
-  */
-  uchar *last_identical_rowid;
-  
-  /* Identical keys */
-  bool in_identical_keys_range;
-  uchar *last_identical_key_ptr;
-  SimpleBuffer::PeekIterator identical_key_it;
-
-  SimpleBuffer key_buffer;
-  
-  uint keyno;
-
-  /* Execution control */
-  bool do_sort_keys;
-  bool use_key_pointers;
-  bool do_rowid_fetch;
-
-  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
-  
   /* 
-    TRUE <=> key buffer is exhausted (we need this because we may have a situation
-    where we've read everything from the key buffer but haven't finished with
-    scanning the last range)
+    When using both rowid and key buffers: the bound between key and rowid
+    parts of the buffer. This is the "original" value, actual memory ranges 
+    used by key and rowid parts may be different because of dynamic space 
+    reallocation between them.
+  */
+  uchar *rowid_buffer_end;
+ 
+
+  /** Index scanning and key buffer-related members **/
+  
+  /* TRUE <=> We can get at most one index tuple for a lookup key */
+  bool index_ranges_unique;
+
+  /* TRUE <=> we're in the middle of enumerating records for a key range */
+  bool in_index_range;
+  
+  /* Buffer to store (key, range_id) pairs */
+  SimpleBuffer key_buffer;
+   
+  /* key_buffer.read() reads to here: */
+  uchar *cur_index_tuple;
+
+  /* if in_index_range==TRUE: range_id of the range we're enumerating */
+  char *cur_range_info;
+
+  /* 
+    TRUE <=> we've got index tuples/rowids for all keys (need this flag because 
+    we may have a situation where we've read everything from the key buffer but 
+    haven't finished with getting index tuples for the last key)
   */
   bool key_eof;
 
-  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
-  bool is_mrr_assoc;
-
-  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
-
-  bool doing_cpk_scan; /* TRUE <=> DS-MRR/CPK variant is used */
-
-  
   /* Initially FALSE, becomes TRUE when we've set key_tuple_xxx members */
   bool know_key_tuple_params;
-  /* Length of lookup tuple being used, in bytes */
-  uint key_tuple_length;
-  key_part_map key_tuple_map; 
+  uint         key_tuple_length; /* Length of index lookup tuple, in bytes */
+  key_part_map key_tuple_map;    /* keyparts used in index lookup tuples */
+
   /*
     This is 
       = key_tuple_length   if we copy keys to buffer
@@ -418,23 +432,52 @@ private:
   /* = key_size_in_keybuf [ + sizeof(range_assoc_info) ] */
   uint key_buff_elem_size;
   
+  /* 
+    TRUE <=> we're doing key-ordered index scan and right now several
+    subsequent key values are the same as the one we've already retrieved and
+    returned index tuple for.
+  */
+  bool in_identical_keys_range;
+
+  /* range_id of the first of the identical keys */
+  char *first_identical_range_info;
+
+  /* Pointer to the last of the identical key values */
+  uchar *last_identical_key_ptr;
+
+  /* 
+    key_buffer iterator for walking the identical key range (we need to
+    enumerate the set of (identical_key, range_id) pairs multiple times,
+    and do that by walking from the current buffer read position until we
+    reach last_identical_key_ptr).
+  */
+  SimpleBuffer::PeekIterator identical_key_it;
+
+
+  /** rnd_pos() scan and rowid buffer-related members **/
+
+  /*
+    Buffer to store (rowid, range_id) pairs, or just rowids if 
+    is_mrr_assoc==FALSE
+  */
+  SimpleBuffer rowid_buffer;
+  
+  /* rowid_buffer.read() will set the following:  */
+  uchar *rowid;
+  uchar *rowids_range_id;
+  
+  /*
+    not-NULL: we're traversing a group of (rowid, range_id) pairs with
+              identical rowid values, and this is the pointer to the last one.
+    NULL: we're not in a group of identical rowids.
+  */
+  uchar *last_identical_rowid;
+
+  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
+  
   /* = h->ref_length  [ + sizeof(range_assoc_info) ] */
   uint rowid_buff_elem_size;
   
-  /*
-    TRUE <=> We're scanning on a full primary key (and not on prefix), and so 
-    can get max. one match for each key 
-  */
-  bool index_ranges_unique;
-  /* TRUE<=> we're in a middle of enumerating records from a range */ 
-  bool in_index_range;
-  uchar *cur_index_tuple;
-
-  /* if in_index_range==TRUE: range_id of the range we're enumerating */
-  char *cur_range_info;
-
-  char *first_identical_range_info;
-
   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                        COST_VECT *cost);
   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
@@ -446,8 +489,10 @@ private:
   int dsmrr_next_from_index(char **range_info);
 
   void setup_buffer_sizes(key_range *sample_key);
+  void reallocate_buffer_space();
 
   static range_seq_t key_buf_seq_init(void *init_param, uint n_ranges, uint flags);
   static uint key_buf_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
 };
 
+

From 2121ab1eb420dd49784e02315e9acd1d0298f44d Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 20 Sep 2010 13:23:51 +0400
Subject: [PATCH 33/49] DS-MRR improvements: remove write_size/read_size, have
 the same size   for writing and reading

---
 sql/multi_range_read.cc | 24 ++++++++++++------------
 sql/multi_range_read.h  | 17 +++++++----------
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 083f18f86c8..e2f70288b94 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -291,22 +291,22 @@ void SimpleBuffer::setup_writing(uchar **data1, size_t len1,
                                  uchar **data2, size_t len2)
 {
   write_ptr1= data1;
-  write_size1= len1;
+  size1= len1;
 
   write_ptr2= data2;
-  write_size2= len2;
+  size2= len2;
 }
 
 
 void SimpleBuffer::write()
 {
   if (is_reverse() && write_ptr2)
-    write(*write_ptr2, write_size2);
+    write(*write_ptr2, size2);
 
-  write(*write_ptr1, write_size1);
+  write(*write_ptr1, size1);
 
   if (!is_reverse() && write_ptr2)
-    write(*write_ptr2, write_size2);
+    write(*write_ptr2, size2);
 }
 
 
@@ -326,7 +326,7 @@ void SimpleBuffer::write(const uchar *data, size_t bytes)
 
 bool SimpleBuffer::can_write()
 {
-  return have_space_for(write_size1 + (write_ptr2? write_size2:0));
+  return have_space_for(size1 + (write_ptr2 ? size2 : 0));
 }
 
 
@@ -349,20 +349,20 @@ void SimpleBuffer::setup_reading(uchar **data1, size_t len1,
                                  uchar **data2, size_t len2)
 {
   read_ptr1= data1;
-  read_size1= len1;
+  DBUG_ASSERT(len1 == size1);
 
   read_ptr2= data2;
-  read_size2= len2;
+  DBUG_ASSERT(len2 == size2);
 }
 
 
 bool SimpleBuffer::read()
 {
-  if (!have_data(read_size1 + (read_ptr2? read_size2 : 0)))
+  if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
     return TRUE;
-  *read_ptr1= read(read_size1);
+  *read_ptr1= read(size1);
   if (read_ptr2)
-    *read_ptr2= read(read_size2);
+    *read_ptr2= read(size2);
   return FALSE;
 }
 
@@ -731,7 +731,7 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
 
 void SimpleBuffer::sort(qsort2_cmp cmp_func, void *cmp_func_arg)
 {
-  uint elem_size=write_size1 + (write_ptr2 ? write_size2 : 0);
+  uint elem_size= size1 + (write_ptr2 ? size2 : 0);
   uint n_elements= used_size() / elem_size;
   my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
 }
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 06809c22c5e..986409047b7 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -106,25 +106,22 @@ private:
 
   /* 
     Data to be written. write() call will assume that (*write_ptr1) points to 
-    write_size1 bytes of data to be written.
-    If write_ptr2!=NULL then the buffer stores pairs, and (*write_ptr2) points
-    to write_size2 bytes of data that form the second component.
+    size1 bytes of data to be written.
+    If write_ptr2 != NULL then the buffer stores pairs, and (*write_ptr2) 
+    points to size2 bytes of data that form the second component.
   */
   uchar **write_ptr1;
-  size_t write_size1;
+  size_t size1;
   uchar **write_ptr2;
-  size_t write_size2;
+  size_t size2;
 
   /*
     read() will do reading by storing pointer to read data into *read_ptr1 (if
     the buffer stores atomic elements), or into {*read_ptr1, *read_ptr2} (if
     the buffer stores pairs).
   */
-  //TODO if write_size1 == read_size1 why have two variables??
   uchar **read_ptr1;
-  size_t read_size1;
   uchar **read_ptr2;
-  size_t read_size2;
 
 public:
   /* Write-mode functions */
@@ -243,11 +240,11 @@ public:
         have written the second component first).
       */
       uchar *res;
-      if ((res= get_next(buf->read_size1)))
+      if ((res= get_next(buf->size1)))
       {
         *(buf->read_ptr1)= res;
         if (buf->read_ptr2)
-          *buf->read_ptr2= get_next(buf->read_size2);
+          *buf->read_ptr2= get_next(buf->size2);
         return FALSE;
       }
       return TRUE; /* EOF */

From 51f90976083382d53f8c8d76bc0c2d71b72290de Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 20 Sep 2010 23:13:28 +0400
Subject: [PATCH 34/49] More comments

---
 sql/multi_range_read.h | 87 +++++++++++++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 23 deletions(-)

diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 986409047b7..6fb33ab486e 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -288,37 +288,78 @@ private:
   each ha_{myisam/innobase/etc} object. That object will be further referred to
   as "the handler"
 
-  DsMrr_impl has the following execution strategies:
-   S1. Bypass DS-MRR, pass all calls to default MRR implementation (i.e. to
-      MRR-to-non-MRR calls converter)
-   S2. Sort Keys
-   S3. Sort Rowids
+  DsMrr_impl supports the following execution strategies:
 
-  psergey-TODO.
+  - Bypass DS-MRR, pass all calls to default MRR implementation, which is 
+    an MRR-to-non-MRR call converter.
+  - Key-Ordered Retrieval
+  - Rowid-Ordered Retrieval
 
-  S1 is used for cases which DS-MRR is unable to handle for some reason.
+  DsMrr_impl will use one of the above strategies, or combination of them, 
+  according to the following diagram:
+
+         (mrr function calls)
+                |
+                +----------------->-----------------+
+                |                                   |
+     ___________v______________      _______________v________________
+    / default: use lookup keys \    / KEY-ORDERED RETRIEVAL:         \
+    | (or ranges) in whatever  |    | sort lookup keys and then make | 
+    | order they are supplied  |    | index lookups in index order   |
+    \__________________________/    \________________________________/
+              | |  |                           |    |
+      +---<---+ |  +--------------->-----------|----+
+      |         |                              |    |
+      |         |              +---------------+    |
+      |   ______v__________    |     _______________v_______________
+      |  / default: read   \   |    / ROWID-ORDERED RETRIEVAL:      \
+      |  | table records   |   |    | Before reading table records, |
+      v  | in random order |   v    | sort their rowids and then    |
+      |  \_________________/   |    | read them in rowid order      |
+      |         |              |    \_______________________________/
+      |         |              |                    |
+      |         |              |                    |
+      +-->---+  |  +----<------+-----------<--------+
+             |  |  |                                
+             v  v  v
+      (table records and range_ids)
+
+  The choice of strategy depends on MRR scan properties, table properties
+  (whether we're scanning clustered primary key), and @@optimizer_flag
+  settings.
+  
+  Key-Ordered Retrieval
+  ---------------------
+  The idea is: if MRR scan is essentially a series of lookups on 
+   
+    tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN
+  
+  then it makes sense to collect and order the set of lookup values, i.e.
+   
+     sort(value1, value2, .. valueN)
+
+  and then do index lookups in index order. This results in fewer index page
+  fetch operations, and we also can avoid making multiple index lookups for the
+  same value. That is, if value1=valueN we can easily discover that after
+  sorting and make one index lookup for them instead of two.
+
+  Rowid-Ordered Retrieval
+  -----------------------
+  If we do a regular index scan or a series of index lookups, we'll be hitting
+  table records at random. For disk-based engines, this is much slower than 
+  reading the same records in disk order. We assume that disk ordering of
+  rows is the same as ordering of their rowids (which is provided by 
+  handler::cmp_ref()).
+  In order to retrieve records in a different order, we must separate index
+  scanning and record fetching, that is, MRR scan uses the following steps:
 
-  S2 is the actual DS-MRR. The basic algorithm is as follows:
     1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 
-        fill the buffer with {rowid, range_id} pairs
-    2. Sort the buffer by rowid
+        fill a buffer with {rowid, range_id} pairs
+    2. Sort the buffer by rowid value
     3. for each {rowid, range_id} pair in the buffer
          get record by rowid and return the {record, range_id} pair
     4. Repeat the above steps until we've exhausted the list of ranges we're
        scanning.
-
-  S3 is the variant of DS-MRR for use with clustered primary keys (or any
-  clustered index). The idea is that in clustered index it is sufficient to 
-  access the index in index order, and we don't need an intermediate steps to
-  get rowid (like step #1 in S2).
-
-   DS-MRR/CPK's basic algorithm is as follows:
-    1. Collect a number of ranges (=lookup keys)
-    2. Sort them so that they follow in index order.
-    3. for each {lookup_key, range_id} pair in the buffer 
-       get record(s) matching the lookup key and return {record, range_id} pairs
-    4. Repeat the above steps until we've exhausted the list of ranges we're
-       scanning.
 */
 
 class DsMrr_impl

From 3066c37718ec16e8b38cb231e938755c68d0a1e1 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 21 Sep 2010 20:19:54 +0400
Subject: [PATCH 35/49] DS-MRR improvements: review feedback - Switch from one
 bi-directional buffer class to two   virtual inheritance-based forward and
 backward buffer classes.

---
 mysql-test/r/join_nested_jcl6.result |   6 +-
 mysql-test/r/join_outer_jcl6.result  |   8 +-
 sql/multi_range_read.cc              | 235 +++--------
 sql/multi_range_read.h               | 572 +++++++++++++++++----------
 4 files changed, 425 insertions(+), 396 deletions(-)

diff --git a/mysql-test/r/join_nested_jcl6.result b/mysql-test/r/join_nested_jcl6.result
index 9683c7c854a..0b83bd7cd6e 100644
--- a/mysql-test/r/join_nested_jcl6.result
+++ b/mysql-test/r/join_nested_jcl6.result
@@ -865,12 +865,12 @@ LEFT JOIN
 (t1,t2)
 ON t3.a=1 AND t3.b=t2.b AND t2.b=t4.b;
 a	b	a	b	a	b
-4	2	1	2	4	2
 4	2	1	2	3	2
 4	2	1	2	4	2
 4	2	1	2	3	2
 4	2	1	2	4	2
 4	2	1	2	3	2
+4	2	1	2	4	2
 NULL	NULL	2	2	3	2
 NULL	NULL	2	2	4	2
 EXPLAIN EXTENDED
@@ -1105,8 +1105,8 @@ t0.b=t1.b AND
 (t8.b=t9.b OR t8.c IS NULL) AND
 (t9.a=1);
 a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b	a	b
-1	2	3	2	4	2	1	2	4	2	2	2	6	2	2	2	0	2	1	2
 1	2	3	2	4	2	1	2	3	2	2	2	6	2	2	2	0	2	1	2
+1	2	3	2	4	2	1	2	4	2	2	2	6	2	2	2	0	2	1	2
 1	2	3	2	4	2	1	2	3	2	3	1	6	2	1	1	NULL	NULL	1	1
 1	2	3	2	4	2	1	2	4	2	3	1	6	2	1	1	NULL	NULL	1	1
 1	2	3	2	4	2	1	2	3	2	3	1	6	2	1	1	NULL	NULL	1	2
@@ -1785,8 +1785,8 @@ ON t7.b=t8.b AND t6.b < 10
 ON t6.b >= 2 AND t5.b=t7.b AND
 (t8.a > 0 OR t8.c IS NULL);
 a	b	a	b	a	b	a	b
-2	2	3	2	2	2	1	2
 2	2	1	2	2	2	1	2
+2	2	3	2	2	2	1	2
 1	1	1	2	1	1	NULL	NULL
 1	1	3	2	1	1	NULL	NULL
 3	3	NULL	NULL	NULL	NULL	NULL	NULL
diff --git a/mysql-test/r/join_outer_jcl6.result b/mysql-test/r/join_outer_jcl6.result
index 624f94438ba..854fc725845 100644
--- a/mysql-test/r/join_outer_jcl6.result
+++ b/mysql-test/r/join_outer_jcl6.result
@@ -352,14 +352,14 @@ Thimble Smith	Happy	3	3
 Lilliana Angelovska	NULL	NULL	NULL
 select t1.name, t2.name, t2.id,t3.id from t1 right join t2 on (t1.id = t2.owner) right join t1 as t3 on t3.id=t2.owner;
 name	name	id	id
-Antonio Paz	Perrito	2	1
 Antonio Paz	El Gato	1	1
+Antonio Paz	Perrito	2	1
 Thimble Smith	Happy	3	3
 NULL	NULL	NULL	2
 select t1.name, t2.name, t2.id, t2.owner, t3.id from t1 left join t2 on (t1.id = t2.owner) right join t1 as t3 on t3.id=t2.owner;
 name	name	id	owner	id
-Antonio Paz	Perrito	2	1	1
 Antonio Paz	El Gato	1	1	1
+Antonio Paz	Perrito	2	1	1
 Thimble Smith	Happy	3	3	3
 NULL	NULL	NULL	NULL	2
 drop table t1,t2;
@@ -413,9 +413,9 @@ insert into t2 values (1, 2, 3),(2, 2, 8), (4,3,9),(3,2,10);
 select t1.*, t2.* from t1 left join t2 on t1.n = t2.n and
 t1.m = t2.m where t1.n = 1;
 n	m	o	n	m	o
-1	2	9	1	2	3
-1	2	7	1	2	3
 1	2	11	1	2	3
+1	2	7	1	2	3
+1	2	9	1	2	3
 1	3	9	NULL	NULL	NULL
 select t1.*, t2.* from t1 left join t2 on t1.n = t2.n and
 t1.m = t2.m where t1.n = 1 order by t1.o;
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index e2f70288b94..ccf0affec93 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -284,131 +284,6 @@ scan_it_again:
 }
 
 
-/****************************************************************************
- * SimpleBuffer class implementation (used by DS-MRR code)
- ***************************************************************************/
-void SimpleBuffer::setup_writing(uchar **data1, size_t len1, 
-                                 uchar **data2, size_t len2)
-{
-  write_ptr1= data1;
-  size1= len1;
-
-  write_ptr2= data2;
-  size2= len2;
-}
-
-
-void SimpleBuffer::write()
-{
-  if (is_reverse() && write_ptr2)
-    write(*write_ptr2, size2);
-
-  write(*write_ptr1, size1);
-
-  if (!is_reverse() && write_ptr2)
-    write(*write_ptr2, size2);
-}
-
-
-void SimpleBuffer::write(const uchar *data, size_t bytes)
-{
-  DBUG_ASSERT(have_space_for(bytes));
-
-  if (direction == -1)
-    write_pos -= bytes;
-
-  memcpy(write_pos, data, bytes);
-
-  if (direction == 1)
-    write_pos += bytes;
-}
-
-
-bool SimpleBuffer::can_write()
-{
-  return have_space_for(size1 + (write_ptr2 ? size2 : 0));
-}
-
-
-bool SimpleBuffer::have_space_for(size_t bytes)
-{
-  if (direction == 1)
-    return (write_pos + bytes < end);
-  else
-    return (write_pos - bytes >= start);
-}
-
-
-size_t SimpleBuffer::used_size()
-{
-  return (direction == 1)? write_pos - read_pos : read_pos - write_pos;
-}
-
-
-void SimpleBuffer::setup_reading(uchar **data1, size_t len1, 
-                                 uchar **data2, size_t len2)
-{
-  read_ptr1= data1;
-  DBUG_ASSERT(len1 == size1);
-
-  read_ptr2= data2;
-  DBUG_ASSERT(len2 == size2);
-}
-
-
-bool SimpleBuffer::read()
-{
-  if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
-    return TRUE;
-  *read_ptr1= read(size1);
-  if (read_ptr2)
-    *read_ptr2= read(size2);
-  return FALSE;
-}
-
-
-uchar *SimpleBuffer::read(size_t bytes)
-{
-  DBUG_ASSERT(have_data(bytes));
-  uchar *res;
-  if (direction == 1)
-  {
-    res= read_pos;
-    read_pos += bytes;
-    return res;
-  }
-  else
-  {
-    read_pos= read_pos - bytes;
-    return read_pos;
-  }
-}
-
-
-bool SimpleBuffer::have_data(size_t bytes)
-{
-  return (direction == 1)? (write_pos - read_pos >= (ptrdiff_t)bytes) : 
-                           (read_pos - write_pos >= (ptrdiff_t)bytes);
-}
-
-
-void SimpleBuffer::reset_for_writing()
-{
-  if (direction == 1)
-    write_pos= read_pos= start;
-  else
-    write_pos= read_pos= end;
-}
-
-
-uchar *SimpleBuffer::end_of_space()
-{
-  if (direction == 1)
-    return start;
-  else
-    return end;
-}
-
 /****************************************************************************
  * DS-MRR implementation 
  ***************************************************************************/
@@ -492,7 +367,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   */
   full_buf= buf->buffer;
   full_buf_end= buf->buffer_end;
-  rowid_buffer.set_buffer_space(full_buf, full_buf_end, SimpleBuffer::FORWARD);
+  rowid_buffer.set_buffer_space(full_buf, full_buf_end);
   
   if (do_sort_keys)
   {
@@ -504,7 +379,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     dsmrr_fill_key_buffer();
     
     if (dsmrr_eof && !do_rndpos_scan)
-      buf->end_of_used_area= key_buffer.end_of_space();
+      buf->end_of_used_area= key_buffer->end_of_space();
   }
 
   if (!do_rndpos_scan)
@@ -646,9 +521,9 @@ void DsMrr_impl::dsmrr_close()
 }
 
 
-static int rowid_cmp(void *h, uchar *a, uchar *b)
+static int rowid_cmp_reverse(void *h, uchar *a, uchar *b)
 {
-  return ((handler*)h)->cmp_ref(a, b);
+  return - ((handler*)h)->cmp_ref(a, b);
 }
 
 
@@ -667,8 +542,6 @@ static int rowid_cmp(void *h, uchar *a, uchar *b)
   post-condition:
    rowid buffer is not empty, or key source is exhausted.
 
-  @param h  Table handler
-
   @retval 0      OK, the next portion of rowids is in the buffer,
                  properly ordered
   @retval other  Error
@@ -689,8 +562,8 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
 
   last_identical_rowid= NULL;
 
-  if (do_sort_keys && key_buffer.is_reverse())
-    key_buffer.flip();
+  //if (do_sort_keys && key_buffer.is_reverse())
+  //  key_buffer.flip();
 
   while (rowid_buffer.can_write())
   {
@@ -721,7 +594,7 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
     dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
 
   /* Sort the buffer contents by rowid */
-  rowid_buffer.sort((qsort2_cmp)rowid_cmp, (void*)h);
+  rowid_buffer.sort((qsort2_cmp)rowid_cmp_reverse, (void*)h);
 
   rowid_buffer.setup_reading(&rowid, h->ref_length,
                              is_mrr_assoc? (uchar**)&rowids_range_id: NULL, sizeof(void*));
@@ -729,14 +602,6 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
 }
 
 
-void SimpleBuffer::sort(qsort2_cmp cmp_func, void *cmp_func_arg)
-{
-  uint elem_size= size1 + (write_ptr2 ? size2 : 0);
-  uint n_elements= used_size() / elem_size;
-  my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
-}
-
-
 /* 
   my_qsort2-compatible function to compare key tuples 
 */
@@ -787,6 +652,10 @@ equals:
   return 0;
 }
 
+int DsMrr_impl::key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2)
+{
+  return -key_tuple_cmp(arg, key1, key2);
+}
 
 /*
   Setup key/rowid buffer sizes based on sample_key
@@ -812,12 +681,13 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
                               my_count_bits(sample_key->keypart_map));
   if (!do_rndpos_scan)
   {
-    /* Give all space to key buffer. */
-    key_buffer.set_buffer_space(full_buf, full_buf_end, SimpleBuffer::FORWARD);
+    /* Give all space to forward key buffer. */
+    key_buffer= &forward_key_buf;
+    identical_key_it= &forward_key_it;
+    key_buffer->set_buffer_space(full_buf, full_buf_end);
 
     /* Just in case, tell rowid buffer that it has zero size: */
-    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end, 
-                                  SimpleBuffer::FORWARD);
+    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end);
     return;
   }
   
@@ -860,10 +730,10 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   }
 
   rowid_buffer_end= full_buf + bytes_for_rowids;
-  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end, 
-                                SimpleBuffer::FORWARD);
-  key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end, 
-                              SimpleBuffer::BACKWARD); 
+  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
+  key_buffer= &backward_key_buf;
+  identical_key_it= &backward_key_it;
+  key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end); 
 }
 
 
@@ -892,7 +762,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
   uchar **range_info_ptr= (uchar**)&cur_range.ptr;
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");
 
-  DBUG_ASSERT(!know_key_tuple_params || key_buffer.is_empty());
+  DBUG_ASSERT(!know_key_tuple_params || key_buffer->is_empty());
 
   uchar *key_ptr;
   if (know_key_tuple_params)
@@ -903,18 +773,18 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
         We're using two buffers and both of them are empty now. Restore the
         original sizes
       */
-      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end,
-                                    SimpleBuffer::FORWARD);
-      key_buffer.set_buffer_space(rowid_buffer_end, full_buf_end,
-                                  SimpleBuffer::BACKWARD);
+      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
+      key_buffer= &backward_key_buf;
+      identical_key_it= &backward_key_it;
+      key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
     }
-    key_buffer.reset_for_writing();
-    key_buffer.setup_writing(&key_ptr, key_size_in_keybuf,
-                             is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
-                             sizeof(uchar*));
+    key_buffer->reset_for_writing();
+    key_buffer->setup_writing(&key_ptr, key_size_in_keybuf,
+                              is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
+                              sizeof(uchar*));
   }
 
-  while ((!know_key_tuple_params || key_buffer.can_write()) && 
+  while ((!know_key_tuple_params || key_buffer->can_write()) && 
          !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
   {
     DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
@@ -923,10 +793,10 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
       /* This only happens when we've just started filling the buffer */
       setup_buffer_sizes(&cur_range.start_key);
       know_key_tuple_params= TRUE;
-      key_buffer.setup_writing(&key_ptr, key_size_in_keybuf,
+      key_buffer->setup_writing(&key_ptr, key_size_in_keybuf,
                                is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
                                sizeof(uchar*));
-      DBUG_ASSERT(key_buffer.can_write());
+      DBUG_ASSERT(key_buffer->can_write());
     }
     
     /* Put key, or {key, range_id} pair into the buffer */
@@ -935,16 +805,19 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
     else
       key_ptr=(uchar*) cur_range.start_key.key;
 
-    key_buffer.write();
+    key_buffer->write();
   }
 
   dsmrr_eof= test(res);
 
-  key_buffer.sort((qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this);
+  key_buffer->sort((key_buffer->type() == Lifo_buffer::FORWARD)? 
+                     (qsort2_cmp)DsMrr_impl::key_tuple_cmp_reverse : 
+                     (qsort2_cmp)DsMrr_impl::key_tuple_cmp, 
+                   (void*)this);
   
-  key_buffer.setup_reading(&cur_index_tuple, key_size_in_keybuf,
-                           is_mrr_assoc? (uchar**)&cur_range_info: NULL,
-                           sizeof(void*));
+  key_buffer->setup_reading(&cur_index_tuple, key_size_in_keybuf,
+                            is_mrr_assoc? (uchar**)&cur_range_info: NULL,
+                            sizeof(void*));
 
   last_identical_key_ptr= NULL;
   in_identical_keys_range= FALSE;
@@ -959,7 +832,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
 void DsMrr_impl::reallocate_buffer_space()
 {
   uchar *unused_start, *unused_end;
-  key_buffer.remove_unused_space(&unused_start, &unused_end);
+  key_buffer->remove_unused_space(&unused_start, &unused_end);
   rowid_buffer.grow(unused_start, unused_end);
 }
 
@@ -1000,7 +873,7 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
   while (in_identical_keys_range)
   {
     /* This will read to (cur_index_tuple, cur_range_info): */
-    res2= identical_key_it.read_next();
+    res2= identical_key_it->read_next();
     DBUG_ASSERT(!res2);
 
     if (cur_index_tuple == last_identical_key_ptr)
@@ -1038,7 +911,7 @@ check_record:
     if (last_identical_key_ptr)
     {
       in_identical_keys_range= TRUE;
-      identical_key_it.init(&key_buffer);
+      identical_key_it->init(key_buffer);
       cur_range_info= first_identical_range_info;
     }
 
@@ -1053,12 +926,12 @@ check_record:
     if (last_identical_key_ptr)
     {
       /* key_buffer.read() reads to (cur_index_tuple, cur_range_info) */
-      while (!key_buffer.read() && (cur_index_tuple != last_identical_key_ptr)) {}
+      while (!key_buffer->read() && (cur_index_tuple != last_identical_key_ptr)) {}
       last_identical_key_ptr= NULL;
     }
 
     /* First, make sure we have a range at start of the buffer */
-    if (key_buffer.is_empty())
+    if (key_buffer->is_empty())
     {
       if (dsmrr_eof)
       {
@@ -1072,7 +945,7 @@ check_record:
       if (!do_rndpos_scan)
         dsmrr_fill_key_buffer();
 
-      if (key_buffer.is_empty())
+      if (key_buffer->is_empty())
       {
         res= HA_ERR_END_OF_FILE;
         goto end;
@@ -1088,7 +961,7 @@ check_record:
       reallocate_buffer_space();
 
     /* Get the next range to scan */
-    key_buffer.read(); // reads to (cur_index_tuple, cur_range_info)
+    key_buffer->read(); // reads to (cur_index_tuple, cur_range_info)
     key_in_buf= cur_index_tuple;
 
     if (use_key_pointers)
@@ -1106,9 +979,9 @@ check_record:
     /* Check if subsequent keys in the key buffer are the same as this one */
     {
       char *save_cur_range_info= cur_range_info;
-      identical_key_it.init(&key_buffer);
+      identical_key_it->init(key_buffer);
       last_identical_key_ptr= NULL;
-      while (!identical_key_it.read_next())
+      while (!identical_key_it->read_next())
       {
         if (key_tuple_cmp(this, key_in_buf, cur_index_tuple))
           break;
@@ -1119,7 +992,7 @@ check_record:
       if (last_identical_key_ptr)
       {
         in_identical_keys_range= TRUE;
-        identical_key_it.init(&key_buffer);
+        identical_key_it->init(key_buffer);
         first_identical_range_info= cur_range_info;
       }
     }
@@ -1178,7 +1051,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     {
       if (do_sort_keys)
       {
-        if (!key_buffer.is_empty() || in_index_range) 
+        if (!key_buffer->is_empty() || in_index_range) 
         {
           /* There are some sorted keys left. Use them to get rowids */
           if ((res= dsmrr_fill_rowid_buffer()))
@@ -1239,9 +1112,9 @@ int DsMrr_impl::dsmrr_next(char **range_info)
         Note: this implies that SQL layer doesn't touch table->record[0]
         between calls.
       */
-      SimpleBuffer::PeekIterator identical_rowid_it;
-      identical_rowid_it.init(&rowid_buffer);
-      while (!identical_rowid_it.read_next()) // reads to (rowid, ...)
+      Forward_iterator it;
+      it.init(&rowid_buffer);
+      while (!it.read_next()) // reads to (rowid, ...)
       {
         if (h2->cmp_ref(rowid, cur_rowid))
           break;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 6fb33ab486e..5d69ee2b6ce 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -46,6 +46,356 @@
   storage and has better performance when reading data in rowid order.
 */
 
+class Forward_lifo_buffer;
+class Backward_lifo_buffer;
+
+class Lifo_buffer 
+{
+protected:
+  /* 
+    Data to be written. write() call will assume that (*write_ptr1) points to 
+    size1 bytes of data to be written.
+    If write_ptr2 != NULL then the buffer stores pairs, and (*write_ptr2) 
+    points to size2 bytes of data that form the second component.
+  */
+  uchar **write_ptr1;
+  size_t size1;
+  uchar **write_ptr2;
+  size_t size2;
+
+  /*
+    read() will do reading by storing pointer to read data into *read_ptr1 (if
+    the buffer stores atomic elements), or into {*read_ptr1, *read_ptr2} (if
+    the buffer stores pairs).
+  */
+  uchar **read_ptr1;
+  uchar **read_ptr2;
+
+  uchar *start; /* points to start of buffer space */
+  uchar *end;   /* points to just beyond the end of buffer space */
+public:
+
+  enum enum_direction {
+    BACKWARD=-1, /* buffer is filled/read from bigger to smaller memory addresses */
+    FORWARD=1  /* buffer is filled/read from smaller to bigger memory addresses */
+  };
+
+  virtual enum_direction type() = 0;
+
+  /* Buffer space control functions */
+  void set_buffer_space(uchar *start_arg, uchar *end_arg) 
+  {
+    start= start_arg;
+    end= end_arg;
+    TRASH(start, end - start);
+    reset_for_writing();
+  }
+
+  void setup_writing(uchar **data1, size_t len1, uchar **data2, size_t len2)
+  {
+    write_ptr1= data1;
+    size1= len1;
+    write_ptr2= data2;
+    size2= len2;
+  }
+
+  void setup_reading(uchar **data1, size_t len1, uchar **data2, size_t len2)
+  {
+    read_ptr1= data1;
+    DBUG_ASSERT(len1 == size1);
+    read_ptr2= data2;
+    DBUG_ASSERT(len2 == size2);
+  }
+  
+  //virtual void write_bytes(const uchar *data, size_t bytes)=0;
+
+  virtual bool read() = 0;
+  virtual void write() = 0;
+  bool can_write()
+  {
+    return have_space_for(size1 + (write_ptr2 ? size2 : 0));
+  }
+  
+  bool is_empty() { return used_size() == 0; }
+  virtual size_t used_size() = 0;
+
+  void sort(qsort2_cmp cmp_func, void *cmp_func_arg)
+  {
+    uint elem_size= size1 + (write_ptr2 ? size2 : 0);
+    uint n_elements= used_size() / elem_size;
+    my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
+  }
+
+
+  virtual void reset_for_writing() = 0;
+  virtual uchar *end_of_space() = 0;
+  bool have_data(size_t bytes)
+  {
+    return (used_size() >= bytes);
+  }
+  virtual bool have_space_for(size_t bytes) = 0;
+  //virtual uchar *read_bytes(size_t bytes) = 0;
+
+  virtual void remove_unused_space(uchar **unused_start, uchar **unused_end)=0;
+  virtual uchar *used_area() = 0;
+
+  class Iterator
+  {
+  public:
+    virtual void init(Lifo_buffer *buf) = 0;
+    /*
+      Read the next value. The calling convention is the same as buf->read()
+      has.
+
+      RETURN
+        FALSE - Ok
+        TRUE  - EOF, reached the end of the buffer
+    */
+    virtual bool read_next()= 0;
+    virtual ~Iterator() {}
+  protected:
+    Lifo_buffer *buf;
+    virtual uchar *get_next(size_t nbytes)=0;
+  };
+  virtual ~Lifo_buffer() {};
+
+  friend class Forward_iterator;
+  friend class Backward_iterator;
+};
+
+
+class Forward_lifo_buffer: public Lifo_buffer
+{
+  uchar *pos;
+public:
+  enum_direction type() { return FORWARD; }
+  size_t used_size()
+  {
+    return pos - start;
+  }
+  void reset_for_writing()
+  {
+    pos= start;
+  }
+  uchar *end_of_space() { return pos; }
+  bool have_space_for(size_t bytes)
+  {
+    return (pos + bytes < end);
+  }
+
+  void write()
+  {
+    write_bytes(*write_ptr1, size1);
+    if (write_ptr2)
+      write_bytes(*write_ptr2, size2);
+  }
+  void write_bytes(const uchar *data, size_t bytes)
+  {
+    DBUG_ASSERT(have_space_for(bytes));
+    memcpy(pos, data, bytes);
+    pos += bytes;
+  }
+  uchar *read_bytes(size_t bytes)
+  {
+    DBUG_ASSERT(have_data(bytes));
+    pos= pos - bytes;
+    return pos;
+  }
+  bool read()
+  {
+    if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
+      return TRUE;
+    if (read_ptr2)
+      *read_ptr2= read_bytes(size2);
+    *read_ptr1= read_bytes(size1);
+    return FALSE;
+  }
+  /*
+    Stop using/return the unneded space (the one that we have already wrote 
+    to read from).
+  */
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    DBUG_ASSERT(0); /* Don't need this yet */
+  }
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    /*
+      Passed memory area can be meaningfully used for growing the buffer if:
+      - it is adjacent to buffer space we're using
+      - it is on the end towards which we grow.
+    */
+    DBUG_ASSERT(unused_end >= unused_start);
+    TRASH(unused_start, unused_end - unused_start);
+    DBUG_ASSERT(end == unused_start);
+    end= unused_end;
+  }
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return start; }
+  friend class Forward_iterator;
+};
+
+
+class Forward_iterator : public Lifo_buffer::Iterator
+{
+  uchar *pos;
+
+  /* Return pointer to next chunk of nbytes bytes and avance over it */
+  uchar *get_next(size_t nbytes)
+  {
+    if (pos - nbytes < ((Forward_lifo_buffer*)buf)->start)
+      return NULL;
+    pos -= nbytes;
+    return pos;
+  }
+public:
+  bool read_next()
+  {
+    uchar *res;
+    if (buf->read_ptr2)
+    {
+      if ((res= get_next(buf->size2)))
+      {
+        *(buf->read_ptr2)= res;
+        *buf->read_ptr1= get_next(buf->size1);
+        return FALSE;
+      }
+    }
+    else
+    {
+      if ((res= get_next(buf->size1)))
+      {
+        *(buf->read_ptr1)= res;
+        return FALSE;
+      }
+    }
+    return TRUE; /* EOF */
+  }
+
+  void init(Lifo_buffer *buf_arg)
+  {
+    DBUG_ASSERT(buf_arg->type() == Lifo_buffer::FORWARD);
+    buf= buf_arg;
+    pos= ((Forward_lifo_buffer*)buf)->pos;
+  }
+};
+
+
+class Backward_lifo_buffer: public Lifo_buffer
+{
+  uchar *pos;
+public:
+  enum_direction type() { return BACKWARD; }
+ 
+  size_t used_size()
+  {
+    return end - pos;
+  }
+  void reset_for_writing()
+  {
+    pos= end;
+  }
+  uchar *end_of_space() { return end; }
+  bool have_space_for(size_t bytes)
+  {
+    return (pos - bytes >= start);
+  }
+  void write()
+  {
+    if (write_ptr2)
+      write_bytes(*write_ptr2, size2);
+    write_bytes(*write_ptr1, size1);
+  }
+  void write_bytes(const uchar *data, size_t bytes)
+  {
+    DBUG_ASSERT(have_space_for(bytes));
+    pos -= bytes;
+    memcpy(pos, data, bytes);
+  }
+  bool read()
+  {
+    if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
+      return TRUE;
+    *read_ptr1= read_bytes(size1);
+    if (read_ptr2)
+      *read_ptr2= read_bytes(size2);
+    return FALSE;
+  }
+  uchar *read_bytes(size_t bytes)
+  {
+    DBUG_ASSERT(have_data(bytes));
+    uchar *ret= pos;
+    pos= pos + bytes;
+    return ret;
+  }
+  /*
+    Stop using/return the unneded space (the one that we have already wrote 
+    to and have read from).
+  */
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    *unused_start= start;
+    *unused_end= pos;
+    start= pos;
+  }
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    /*
+      Passed memory area can be meaningfully used for growing the buffer if:
+      - it is adjacent to buffer space we're using
+      - it is on the end towards which we grow.
+    */
+    /*
+    DBUG_ASSERT(unused_end >= unused_start);
+    TRASH(unused_start, unused_end - unused_start);
+    DBUG_ASSERT(start == unused_end);
+    start= unused_start;
+    */
+    DBUG_ASSERT(0); //Not used
+  }
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return pos; }
+  friend class Backward_iterator;
+};
+
+
+class Backward_iterator : public Lifo_buffer::Iterator
+{
+  uchar *pos;
+  /* Return pointer to next chunk of nbytes bytes and advance over it */
+  uchar *get_next(size_t nbytes)
+  {
+    if (pos + nbytes > ((Backward_lifo_buffer*)buf)->end)
+      return NULL;
+    uchar *res= pos;
+    pos += nbytes;
+    return res;
+  }
+public:
+  bool read_next()
+  {
+    /*
+      Always read the first component first (if the buffer is backwards, we
+      have written the second component first).
+    */
+    uchar *res;
+    if ((res= get_next(buf->size1)))
+    {
+      *(buf->read_ptr1)= res;
+      if (buf->read_ptr2)
+        *buf->read_ptr2= get_next(buf->size2);
+      return FALSE;
+    }
+    return TRUE; /* EOF */
+  }
+  void init(Lifo_buffer *buf_arg)
+  {
+    DBUG_ASSERT(buf_arg->type() == Lifo_buffer::BACKWARD);
+    buf= buf_arg;
+    pos= ((Backward_lifo_buffer*)buf)->pos;
+  }
+};
+
 
 /*
   An in-memory buffer used by DS-MRR implementation. 
@@ -78,211 +428,6 @@
   writing is done from end to start.
 */
 
-class SimpleBuffer
-{
-public:
-
-  enum enum_direction {
-    BACKWARD=-1, /* buffer is filled/read from bigger to smaller memory addresses */
-    FORWARD=1  /* buffer is filled/read from smaller to bigger memory addresses */
-  };
-
-private:
-  enum_direction direction;
-
-  uchar *start; /* points to start of buffer space */
-  uchar *end;   /* points to just beyond the end of buffer space */
-  /*
-    Forward buffer: points to the start of the data that will be read next
-    Backward buffer: points to just beyond the end of the data that will be 
-    read next.
-  */
-  uchar *read_pos;
-  /*
-    Forward buffer: points to just after the end of the used area.
-    Backward buffer: points to the start of used area.
-  */
-  uchar *write_pos;
-
-  /* 
-    Data to be written. write() call will assume that (*write_ptr1) points to 
-    size1 bytes of data to be written.
-    If write_ptr2 != NULL then the buffer stores pairs, and (*write_ptr2) 
-    points to size2 bytes of data that form the second component.
-  */
-  uchar **write_ptr1;
-  size_t size1;
-  uchar **write_ptr2;
-  size_t size2;
-
-  /*
-    read() will do reading by storing pointer to read data into *read_ptr1 (if
-    the buffer stores atomic elements), or into {*read_ptr1, *read_ptr2} (if
-    the buffer stores pairs).
-  */
-  uchar **read_ptr1;
-  uchar **read_ptr2;
-
-public:
-  /* Write-mode functions */
-  void setup_writing(uchar **data1, size_t len1, 
-                     uchar **data2, size_t len2);
-  void reset_for_writing();
-  bool can_write();
-  void write();
-
-  /* Read-mode functions */
-  bool is_empty() { return used_size() == 0; }
-  void setup_reading(uchar **data1, size_t len1, 
-                     uchar **data2, size_t len2);
-  bool read();
-
-  /* Misc functions */
-  void sort(qsort2_cmp cmp_func, void *cmp_func_arg);
-  bool is_reverse() { return direction == BACKWARD; }
-  uchar *end_of_space();
-
-  /* Buffer space control functions */
-  void set_buffer_space(uchar *start_arg, uchar *end_arg, enum_direction direction_arg) 
-  {
-    start= start_arg;
-    end= end_arg;
-    direction= direction_arg;
-    TRASH(start, end - start);
-    reset_for_writing();
-  }
-
-  /*
-    Stop using/return the unneded space (the one that we have already wrote 
-    to read from).
-  */
-  void remove_unused_space(uchar **unused_start, uchar **unused_end)
-  {
-    if (direction == 1)
-    {
-      *unused_start= start;
-      *unused_end= read_pos;
-      start= read_pos;
-    }
-    else
-    {
-      *unused_start= read_pos;
-      *unused_end= end;
-      end= read_pos;
-    }
-  }
-
-  void flip()
-  {
-    uchar *tmp= read_pos;
-    read_pos= write_pos;
-    write_pos= tmp;
-    direction= (direction == FORWARD)? BACKWARD: FORWARD;
-  }
-
-  void grow(uchar *unused_start, uchar *unused_end)
-  {
-    /*
-      Passed memory area can be meaningfully used for growing the buffer if:
-      - it is adjacent to buffer space we're using
-      - it is on the end towards which we grow.
-    */
-    DBUG_ASSERT(unused_end >= unused_start);
-    TRASH(unused_start, unused_end - unused_start);
-    if (direction == 1 && end == unused_start)
-    {
-      end= unused_end;
-    }
-    else if (direction == -1 && start == unused_end)
-    {
-      start= unused_start;
-    }
-    else
-      DBUG_ASSERT(0); /* Attempt to grow buffer in wrong direction */
-  }
-  
-  /*
-    An iterator to do look at what we're about to read from the buffer without
-    actually reading it.
-  */
-  class PeekIterator
-  {
-    SimpleBuffer *buf; /* The buffer we're iterating over*/
-    /*
-      if buf->direction==FORWARD  : pointer to what to return next
-      if buf->direction==BACKWARD : pointer to the end of what is to be 
-                                   returned next
-    */
-    uchar *pos;
-  public:
-    /* 
-      Initialize the iterator. After intiialization, the first read_next() call
-      will read what buf_arg->read() would read.
-    */
-    void init(SimpleBuffer *buf_arg)
-    {
-      buf= buf_arg;
-      pos= buf->read_pos;
-    }
-    
-    /*
-      Read the next value. The calling convention is the same as buf->read()
-      has.
-
-      RETURN
-        FALSE - Ok
-        TRUE  - EOF, reached the end of the buffer
-    */
-    bool read_next()
-    {
-      /* 
-        Always read the first component first (if the buffer is backwards, we
-        have written the second component first).
-      */
-      uchar *res;
-      if ((res= get_next(buf->size1)))
-      {
-        *(buf->read_ptr1)= res;
-        if (buf->read_ptr2)
-          *buf->read_ptr2= get_next(buf->size2);
-        return FALSE;
-      }
-      return TRUE; /* EOF */
-    }
-  private:
-    /* Return pointer to next chunk of nbytes bytes and avance over it */
-    uchar *get_next(size_t nbytes)
-    {
-      if (buf->direction == 1)
-      {
-        if (pos + nbytes > buf->write_pos)
-          return NULL;
-        uchar *res= pos;
-        pos += nbytes;
-        return res;
-      }
-      else
-      {
-        if (pos - nbytes < buf->write_pos)
-          return NULL;
-        pos -= nbytes;
-        return pos;
-      }
-    }
-  };
-
-private:
-  bool have_space_for(size_t bytes);
-  /* Return pointer to start of the memory area that is occupied by the data */
-  uchar *used_area() { return (direction == FORWARD)? read_pos : write_pos; }
-  size_t used_size();
-
-  void write(const uchar *data, size_t bytes);
-  uchar *read(size_t bytes);
-  bool have_data(size_t bytes);
-};
-
-
 /*
   DS-MRR implementation for one table. Create/use one object of this class for
   each ha_{myisam/innobase/etc} object. That object will be further referred to
@@ -439,8 +584,18 @@ private:
   /* TRUE<=> we're in a middle of enumerating records for a key range */
   bool in_index_range;
   
+  /*
+    One of the following two is used for key buffer: forward is used when 
+    we only need key buffer, backward is used when we need both key and rowid
+    buffers.
+  */
+  Forward_lifo_buffer forward_key_buf;
+  Forward_iterator forward_key_it;
+  Backward_lifo_buffer backward_key_buf;
+  Backward_iterator backward_key_it;
+
   /* Buffer to store (key, range_id) pairs */
-  SimpleBuffer key_buffer;
+  Lifo_buffer *key_buffer;
    
   /* key_buffer.read() reads */
   uchar *cur_index_tuple;
@@ -489,7 +644,7 @@ private:
     and do that by walking from current buffer read position until we get
     last_identical_key_ptr.
   */
-  SimpleBuffer::PeekIterator identical_key_it;
+  Lifo_buffer::Iterator *identical_key_it;
 
 
   /** rnd_pos() scan and rowid buffer-related members **/
@@ -498,7 +653,7 @@ private:
     Buffer to store (rowid, range_id) pairs, or just rowids if 
     is_mrr_assoc==FALSE
   */
-  SimpleBuffer rowid_buffer;
+  Forward_lifo_buffer rowid_buffer;
   
   /* rowid_buffer.read() will set the following:  */
   uchar *rowid;
@@ -522,6 +677,7 @@ private:
                                uint *buffer_size, COST_VECT *cost);
   bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags);
   static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
+  static int key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2);
   int dsmrr_fill_rowid_buffer();
   void dsmrr_fill_key_buffer();
   int dsmrr_next_from_index(char **range_info);

From 4f56acb676f273b3e7f92311fd557d4fb7c4068c Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 28 Sep 2010 12:19:50 +0400
Subject: [PATCH 36/49] Better comments, move Lifo_buffer to separate file.

---
 sql/CMakeLists.txt      |   1 +
 sql/Makefile.am         |   1 +
 sql/handler.h           |   4 +
 sql/multi_range_read.cc | 135 ++++++--------
 sql/multi_range_read.h  | 400 ++--------------------------------------
 5 files changed, 77 insertions(+), 464 deletions(-)

diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index bae77f80b22..0d65a0fc701 100755
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -63,6 +63,7 @@ SET (SQL_SOURCE
                sql_cache.cc sql_class.cc sql_client.cc sql_crypt.cc sql_crypt.h 
                sql_cursor.cc sql_db.cc sql_delete.cc sql_derived.cc sql_do.cc 
                sql_error.cc sql_handler.cc sql_help.cc sql_insert.cc
+               sql_lifo_buffer.h
                sql_join_cache.cc sql_lex.cc sql_list.cc sql_load.cc sql_manager.cc
                sql_map.cc sql_parse.cc  sql_partition.cc sql_plugin.cc
                sql_prepare.cc sql_rename.cc 
diff --git a/sql/Makefile.am b/sql/Makefile.am
index 13a60ba5c79..b1b7ab17f3c 100644
--- a/sql/Makefile.am
+++ b/sql/Makefile.am
@@ -66,6 +66,7 @@ noinst_HEADERS =	item.h item_func.h item_sum.h item_cmpfunc.h \
 			log.h log_slow.h sql_show.h rpl_rli.h rpl_mi.h \
 			sql_select.h structs.h table.h sql_udf.h hash_filo.h \
 			lex.h lex_symbol.h sql_acl.h sql_crypt.h  \
+                        sql_lifo_buffer.h \
 			sql_repl.h slave.h rpl_filter.h rpl_injector.h \
 			log_event.h rpl_record.h \
 			log_event_old.h rpl_record_old.h \
diff --git a/sql/handler.h b/sql/handler.h
index 2c3af0e8150..40f2d321241 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1326,6 +1326,10 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
   The MRR user has materialized range keys somewhere in the user's buffer.
   This can be used for optimization of the procedure that sorts these keys
   since in this case key values don't have to be copied into the MRR buffer.
+
+  In other words, it is guaranteed that after RANGE_SEQ_IF::next() call the 
+  pointer in range->start_key.key will point to a key value that will remain 
+  there until the end of the MRR scan.
 */
 #define HA_MRR_MATERIALIZED_KEYS 256
 
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index ccf0affec93..cf257ec4c7f 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -332,7 +332,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
   
   /*
-    Figure out what steps we'll need to do
+    Determine whether we'll need to do key sorting and/or rnd_pos() scan
   */
   do_sort_keys= FALSE;
   if ((mode & HA_MRR_SINGLE_POINT) && 
@@ -362,8 +362,9 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
 
   /* 
-    At start, alloc all of the buffer for rowids. Key sorting code will grab a
-    piece if necessary.
+    At start, alloc all of the buffer for rowids. When/if key sorting code
+    figures how much buffer space it needs, it will call setup_buffer_sizes()
+    to re-distribute the buffer space.
   */
   full_buf= buf->buffer;
   full_buf_end= buf->buffer_end;
@@ -530,22 +531,19 @@ static int rowid_cmp_reverse(void *h, uchar *a, uchar *b)
 /**
   DS-MRR: Fill and sort the rowid buffer
 
-  {This is an internal function of DiskSweep MRR implementation}
-
   Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
   buffer. When the buffer is full or scan is completed, sort the buffer by 
   rowid and return.
+
+  When this function returns, either rowid buffer is not empty, or the source
+  of lookup keys (i.e. ranges) is exhaused.
   
   dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
   scanning. This function never returns HA_ERR_END_OF_FILE.
 
-  post-condition:
-   rowid buffer is not empty, or key source is exhausted.
-
   @retval 0      OK, the next portion of rowids is in the buffer,
                  properly ordered
   @retval other  Error
-  
 */
 
 int DsMrr_impl::dsmrr_fill_rowid_buffer()
@@ -556,15 +554,13 @@ int DsMrr_impl::dsmrr_fill_rowid_buffer()
   DBUG_ENTER("DsMrr_impl::dsmrr_fill_rowid_buffer");
   
   DBUG_ASSERT(rowid_buffer.is_empty());
-  rowid_buffer.reset_for_writing();
+  rowid_buffer.reset();
   rowid_buffer.setup_writing(&h2->ref, h2->ref_length,
-                             is_mrr_assoc? (uchar**)&range_info_ptr: NULL, sizeof(void*));
+                             is_mrr_assoc? (uchar**)&range_info_ptr: NULL,
+                             sizeof(void*));
 
   last_identical_rowid= NULL;
 
-  //if (do_sort_keys && key_buffer.is_reverse())
-  //  key_buffer.flip();
-
   while (rowid_buffer.can_write())
   {
     if (do_sort_keys)
@@ -652,18 +648,21 @@ equals:
   return 0;
 }
 
+
 int DsMrr_impl::key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2)
 {
   return -key_tuple_cmp(arg, key1, key2);
 }
 
-/*
-  Setup key/rowid buffer sizes based on sample_key
 
-  DESCRIPTION
-    Setup key/rowid buffer sizes based on sample_key and its length.
-
-    This function must be called when all buffer space is empty.
+/**
+  Setup key/rowid buffer sizes based on sample_key and its length.
+  
+  @param
+    sample_key  A lookup key to use as a sample. It is assumed that
+                all other keys will have the same length/etc.
+  @note
+    This function must be called when all buffers are empty
 */
 
 void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
@@ -737,22 +736,19 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
 }
 
 
-/*
+/**
   DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
   
-  SYNOPSIS
-    DsMrr_impl::dsmrr_fill_key_buffer()
-
-  DESCRIPTION
-    DS-MRR/CPK: Enumerate the input range (=key) sequence, fill the key buffer
-    (lookup_key, range_id) pairs and sort.
+  Enumerate the input range (=key) sequence, fill the key buffer with 
+  (lookup_key, range_id) pairs and sort it.
 
+  When this function returns, either
+   - key buffer is non-empty, or
+   - key buffer is empty and source range sequence is exhausted
+  
+  @note
     dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
     we're scanning.
-
-  post-condition:
-   - key buffer is non-empty
-   - key buffer is empty and source range sequence is exhausted
 */
 
 void DsMrr_impl::dsmrr_fill_key_buffer()
@@ -778,7 +774,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
       identical_key_it= &backward_key_it;
       key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
     }
-    key_buffer->reset_for_writing();
+    key_buffer->reset();
     key_buffer->setup_writing(&key_ptr, key_size_in_keybuf,
                               is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
                               sizeof(uchar*));
@@ -825,8 +821,8 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
 }
 
 
-/*
-  Take unused space from key buffer and give it to rowid buffer.
+/**
+  Take unused space from the key buffer and give it to the rowid buffer
 */
 
 void DsMrr_impl::reallocate_buffer_space()
@@ -837,30 +833,25 @@ void DsMrr_impl::reallocate_buffer_space()
 }
 
 
-/*
+/**
   DS-MRR/CPK: multi_range_read_next() function
-
-  DESCRIPTION
-    DsMrr_impl::dsmrr_next_from_index()
-      range_info  OUT  identifier of range that the returned record belongs to
-
-  DESCRIPTION
   
-  This function walks over key buffer and does index reads, i.e. it produces
-  {current_record, range_id} pairs.
-
-  The function has the same call contract like multi_range_read_next()'s.
-
-  We actually iterate nested sequences:
+  @param range_info  OUT  identifier of range that the returned record belongs to
   
-  - a disjoint sequence of index ranges
-    - each range has multiple records
-      - each record goes into multiple identical ranges.
+  @note
+    This function walks over key buffer and does index reads, i.e. it produces
+    {current_record, range_id} pairs.
 
-  RETURN
-    0                   OK, next record was successfully read
-    HA_ERR_END_OF_FILE  End of records
-    Other               Some other error
+    The function has the same call contract like multi_range_read_next()'s.
+
+    We actually iterate over nested sequences:
+    - a disjoint sequence of index ranges
+      - each range has multiple records
+        - each record goes into multiple identical ranges.
+
+  @retval 0                   OK, next record was successfully read
+  @retval HA_ERR_END_OF_FILE  End of records
+  @retval Other               Some other error
 */
 
 int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
@@ -1007,7 +998,9 @@ end:
 
 
 /**
-  DS-MRR implementation: multi_range_read_next() function
+  DS-MRR implementation: multi_range_read_next() function.
+
+  Calling convention is like multi_range_read_next() has.
 */
 
 int DsMrr_impl::dsmrr_next(char **range_info)
@@ -1237,17 +1230,12 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
 /*
   Check if key/flags allow DS-MRR/CPK strategy to be used
   
-  SYNOPSIS
-   DsMrr_impl::check_cpk_scan()
-     keyno      Index that will be used
-     mrr_flags  
+  @param thd
+  @param keyno      Index that will be used
+  @param  mrr_flags  
   
-  DESCRIPTION
-    Check if key/flags allow DS-MRR/CPK strategy to be used. 
- 
-  RETURN
-    TRUE   DS-MRR/CPK should be used
-    FALSE  Otherwise
+  @retval TRUE   DS-MRR/CPK should be used
+  @retval FALSE  Otherwise
 */
 
 bool DsMrr_impl::check_cpk_scan(THD *thd, uint keyno, uint mrr_flags)
@@ -1413,17 +1401,14 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
 
 /* 
   Get cost of one sort-and-sweep step
+  
+  It consists of two parts:
+   - sort an array of #nrows ROWIDs using qsort
+   - read #nrows records from table in a sweep.
 
-  SYNOPSIS
-    get_sort_and_sweep_cost()
-      table       Table being accessed
-      nrows       Number of rows to be sorted and retrieved
-      cost   OUT  The cost
-
-  DESCRIPTION
-    Get cost of these operations:
-     - sort an array of #nrows ROWIDs using qsort
-     - read #nrows records from table in a sweep.
+  @param table       Table being accessed
+  @param nrows       Number of rows to be sorted and retrieved
+  @param cost   OUT  The cost of scan
 */
 
 static 
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 5d69ee2b6ce..2ac477a0949 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -1,6 +1,6 @@
-/*
-  This file contains declarations for Disk-Sweep MultiRangeRead (DS-MRR) 
-  implementation
+/**
+  @defgroup DS-MRR declarations
+  @{
 */
 
 /**
@@ -46,387 +46,7 @@
   storage and has better performance when reading data in rowid order.
 */
 
-class Forward_lifo_buffer;
-class Backward_lifo_buffer;
-
-class Lifo_buffer 
-{
-protected:
-  /* 
-    Data to be written. write() call will assume that (*write_ptr1) points to 
-    size1 bytes of data to be written.
-    If write_ptr2 != NULL then the buffer stores pairs, and (*write_ptr2) 
-    points to size2 bytes of data that form the second component.
-  */
-  uchar **write_ptr1;
-  size_t size1;
-  uchar **write_ptr2;
-  size_t size2;
-
-  /*
-    read() will do reading by storing pointer to read data into *read_ptr1 (if
-    the buffer stores atomic elements), or into {*read_ptr1, *read_ptr2} (if
-    the buffer stores pairs).
-  */
-  uchar **read_ptr1;
-  uchar **read_ptr2;
-
-  uchar *start; /* points to start of buffer space */
-  uchar *end;   /* points to just beyond the end of buffer space */
-public:
-
-  enum enum_direction {
-    BACKWARD=-1, /* buffer is filled/read from bigger to smaller memory addresses */
-    FORWARD=1  /* buffer is filled/read from smaller to bigger memory addresses */
-  };
-
-  virtual enum_direction type() = 0;
-
-  /* Buffer space control functions */
-  void set_buffer_space(uchar *start_arg, uchar *end_arg) 
-  {
-    start= start_arg;
-    end= end_arg;
-    TRASH(start, end - start);
-    reset_for_writing();
-  }
-
-  void setup_writing(uchar **data1, size_t len1, uchar **data2, size_t len2)
-  {
-    write_ptr1= data1;
-    size1= len1;
-    write_ptr2= data2;
-    size2= len2;
-  }
-
-  void setup_reading(uchar **data1, size_t len1, uchar **data2, size_t len2)
-  {
-    read_ptr1= data1;
-    DBUG_ASSERT(len1 == size1);
-    read_ptr2= data2;
-    DBUG_ASSERT(len2 == size2);
-  }
-  
-  //virtual void write_bytes(const uchar *data, size_t bytes)=0;
-
-  virtual bool read() = 0;
-  virtual void write() = 0;
-  bool can_write()
-  {
-    return have_space_for(size1 + (write_ptr2 ? size2 : 0));
-  }
-  
-  bool is_empty() { return used_size() == 0; }
-  virtual size_t used_size() = 0;
-
-  void sort(qsort2_cmp cmp_func, void *cmp_func_arg)
-  {
-    uint elem_size= size1 + (write_ptr2 ? size2 : 0);
-    uint n_elements= used_size() / elem_size;
-    my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
-  }
-
-
-  virtual void reset_for_writing() = 0;
-  virtual uchar *end_of_space() = 0;
-  bool have_data(size_t bytes)
-  {
-    return (used_size() >= bytes);
-  }
-  virtual bool have_space_for(size_t bytes) = 0;
-  //virtual uchar *read_bytes(size_t bytes) = 0;
-
-  virtual void remove_unused_space(uchar **unused_start, uchar **unused_end)=0;
-  virtual uchar *used_area() = 0;
-
-  class Iterator
-  {
-  public:
-    virtual void init(Lifo_buffer *buf) = 0;
-    /*
-      Read the next value. The calling convention is the same as buf->read()
-      has.
-
-      RETURN
-        FALSE - Ok
-        TRUE  - EOF, reached the end of the buffer
-    */
-    virtual bool read_next()= 0;
-    virtual ~Iterator() {}
-  protected:
-    Lifo_buffer *buf;
-    virtual uchar *get_next(size_t nbytes)=0;
-  };
-  virtual ~Lifo_buffer() {};
-
-  friend class Forward_iterator;
-  friend class Backward_iterator;
-};
-
-
-class Forward_lifo_buffer: public Lifo_buffer
-{
-  uchar *pos;
-public:
-  enum_direction type() { return FORWARD; }
-  size_t used_size()
-  {
-    return pos - start;
-  }
-  void reset_for_writing()
-  {
-    pos= start;
-  }
-  uchar *end_of_space() { return pos; }
-  bool have_space_for(size_t bytes)
-  {
-    return (pos + bytes < end);
-  }
-
-  void write()
-  {
-    write_bytes(*write_ptr1, size1);
-    if (write_ptr2)
-      write_bytes(*write_ptr2, size2);
-  }
-  void write_bytes(const uchar *data, size_t bytes)
-  {
-    DBUG_ASSERT(have_space_for(bytes));
-    memcpy(pos, data, bytes);
-    pos += bytes;
-  }
-  uchar *read_bytes(size_t bytes)
-  {
-    DBUG_ASSERT(have_data(bytes));
-    pos= pos - bytes;
-    return pos;
-  }
-  bool read()
-  {
-    if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
-      return TRUE;
-    if (read_ptr2)
-      *read_ptr2= read_bytes(size2);
-    *read_ptr1= read_bytes(size1);
-    return FALSE;
-  }
-  /*
-    Stop using/return the unneded space (the one that we have already wrote 
-    to read from).
-  */
-  void remove_unused_space(uchar **unused_start, uchar **unused_end)
-  {
-    DBUG_ASSERT(0); /* Don't need this yet */
-  }
-  void grow(uchar *unused_start, uchar *unused_end)
-  {
-    /*
-      Passed memory area can be meaningfully used for growing the buffer if:
-      - it is adjacent to buffer space we're using
-      - it is on the end towards which we grow.
-    */
-    DBUG_ASSERT(unused_end >= unused_start);
-    TRASH(unused_start, unused_end - unused_start);
-    DBUG_ASSERT(end == unused_start);
-    end= unused_end;
-  }
-  /* Return pointer to start of the memory area that is occupied by the data */
-  uchar *used_area() { return start; }
-  friend class Forward_iterator;
-};
-
-
-class Forward_iterator : public Lifo_buffer::Iterator
-{
-  uchar *pos;
-
-  /* Return pointer to next chunk of nbytes bytes and avance over it */
-  uchar *get_next(size_t nbytes)
-  {
-    if (pos - nbytes < ((Forward_lifo_buffer*)buf)->start)
-      return NULL;
-    pos -= nbytes;
-    return pos;
-  }
-public:
-  bool read_next()
-  {
-    uchar *res;
-    if (buf->read_ptr2)
-    {
-      if ((res= get_next(buf->size2)))
-      {
-        *(buf->read_ptr2)= res;
-        *buf->read_ptr1= get_next(buf->size1);
-        return FALSE;
-      }
-    }
-    else
-    {
-      if ((res= get_next(buf->size1)))
-      {
-        *(buf->read_ptr1)= res;
-        return FALSE;
-      }
-    }
-    return TRUE; /* EOF */
-  }
-
-  void init(Lifo_buffer *buf_arg)
-  {
-    DBUG_ASSERT(buf_arg->type() == Lifo_buffer::FORWARD);
-    buf= buf_arg;
-    pos= ((Forward_lifo_buffer*)buf)->pos;
-  }
-};
-
-
-class Backward_lifo_buffer: public Lifo_buffer
-{
-  uchar *pos;
-public:
-  enum_direction type() { return BACKWARD; }
- 
-  size_t used_size()
-  {
-    return end - pos;
-  }
-  void reset_for_writing()
-  {
-    pos= end;
-  }
-  uchar *end_of_space() { return end; }
-  bool have_space_for(size_t bytes)
-  {
-    return (pos - bytes >= start);
-  }
-  void write()
-  {
-    if (write_ptr2)
-      write_bytes(*write_ptr2, size2);
-    write_bytes(*write_ptr1, size1);
-  }
-  void write_bytes(const uchar *data, size_t bytes)
-  {
-    DBUG_ASSERT(have_space_for(bytes));
-    pos -= bytes;
-    memcpy(pos, data, bytes);
-  }
-  bool read()
-  {
-    if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
-      return TRUE;
-    *read_ptr1= read_bytes(size1);
-    if (read_ptr2)
-      *read_ptr2= read_bytes(size2);
-    return FALSE;
-  }
-  uchar *read_bytes(size_t bytes)
-  {
-    DBUG_ASSERT(have_data(bytes));
-    uchar *ret= pos;
-    pos= pos + bytes;
-    return ret;
-  }
-  /*
-    Stop using/return the unneded space (the one that we have already wrote 
-    to and have read from).
-  */
-  void remove_unused_space(uchar **unused_start, uchar **unused_end)
-  {
-    *unused_start= start;
-    *unused_end= pos;
-    start= pos;
-  }
-  void grow(uchar *unused_start, uchar *unused_end)
-  {
-    /*
-      Passed memory area can be meaningfully used for growing the buffer if:
-      - it is adjacent to buffer space we're using
-      - it is on the end towards which we grow.
-    */
-    /*
-    DBUG_ASSERT(unused_end >= unused_start);
-    TRASH(unused_start, unused_end - unused_start);
-    DBUG_ASSERT(start == unused_end);
-    start= unused_start;
-    */
-    DBUG_ASSERT(0); //Not used
-  }
-  /* Return pointer to start of the memory area that is occupied by the data */
-  uchar *used_area() { return pos; }
-  friend class Backward_iterator;
-};
-
-
-class Backward_iterator : public Lifo_buffer::Iterator
-{
-  uchar *pos;
-  /* Return pointer to next chunk of nbytes bytes and advance over it */
-  uchar *get_next(size_t nbytes)
-  {
-    if (pos + nbytes > ((Backward_lifo_buffer*)buf)->end)
-      return NULL;
-    uchar *res= pos;
-    pos += nbytes;
-    return res;
-  }
-public:
-  bool read_next()
-  {
-    /*
-      Always read the first component first (if the buffer is backwards, we
-      have written the second component first).
-    */
-    uchar *res;
-    if ((res= get_next(buf->size1)))
-    {
-      *(buf->read_ptr1)= res;
-      if (buf->read_ptr2)
-        *buf->read_ptr2= get_next(buf->size2);
-      return FALSE;
-    }
-    return TRUE; /* EOF */
-  }
-  void init(Lifo_buffer *buf_arg)
-  {
-    DBUG_ASSERT(buf_arg->type() == Lifo_buffer::BACKWARD);
-    buf= buf_arg;
-    pos= ((Backward_lifo_buffer*)buf)->pos;
-  }
-};
-
-
-/*
-  An in-memory buffer used by DS-MRR implementation. 
-  - The buffer contains fixed-size elements. The elements are either atomic
-    byte sequences or pairs.
-  - The buffer resides in memory provided by the user. It is possible to
-     = dynamically (ie. between write operations) add ajacent memory space to
-       the buffer
-     = dynamically remove unused space from the buffer.
-  - Buffer can be set to be either "forward" or "backward". 
-
-  The intent of the last two properties is to allow to have two buffers on
-  adjacent memory space, one is being read from (and so its space shrinks)
-  while the other is being written to (and so it needs more and more space).
-
-  Illustration of forward buffer operation:
-
-                         +-- next read will read from here
-                         |
-                         |               +-- next write will write to here
-                         v               v
-        *--------------*===============*----------------*
-        |       ^      |          ^    |                |
-        |       |      read_pos   |    write_pos        |
-        start   |                 |                     end
-                |                 |            
-              usused space         user data
-  
-  For reverse buffer, start/end have the same meaning, but reading and 
-  writing is done from end to start.
-*/
+#include "sql_lifo_buffer.h"
 
 /*
   DS-MRR implementation for one table. Create/use one object of this class for
@@ -440,7 +60,7 @@ public:
   - Key-Ordered Retrieval
   - Rowid-Ordered Retrieval
 
-  DsMrr_impl will use one of the above strategies, or combination of them, 
+  DsMrr_impl will use one of the above strategies, or a combination of them, 
   according to the following diagram:
 
          (mrr function calls)
@@ -470,7 +90,7 @@ public:
       (table records and range_ids)
 
   The choice of strategy depends on MRR scan properties, table properties
-  (whether we're scanning clustered primary key), and @@optimizer_flag
+  (whether we're scanning clustered primary key), and @@optimizer_switch
   settings.
   
   Key-Ordered Retrieval
@@ -541,7 +161,7 @@ private:
 
   /*
     Secondary handler object. (created when needed, we need it when we need 
-    to run both index scan and rnd_pos() at the same time)
+    to run both index scan and rnd_pos() scan at the same time)
   */
   handler *h2;
   
@@ -568,14 +188,13 @@ private:
   uchar *full_buf_end;
   
   /* 
-    When using both rowid and key buffers: the bound between key and rowid
+    When using both rowid and key buffers: the boundary between key and rowid
     parts of the buffer. This is the "original" value, actual memory ranges 
     used by key and rowid parts may be different because of dynamic space 
     reallocation between them.
   */
   uchar *rowid_buffer_end;
  
-
   /** Index scaning and key buffer-related members **/
   
   /* TRUE <=> We can get at most one index tuple for a lookup key */
@@ -689,4 +308,7 @@ private:
   static uint key_buf_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
 };
 
+/**
+  @} (end of group DS-MRR declarations)
+*/
 

From 8e59978b08569395a5ae24bd46c8db8548d18dd9 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 28 Sep 2010 12:20:16 +0400
Subject: [PATCH 37/49] Move Lifo_buffer to separate file.

---
 sql/sql_lifo_buffer.h | 412 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 412 insertions(+)
 create mode 100644 sql/sql_lifo_buffer.h

diff --git a/sql/sql_lifo_buffer.h b/sql/sql_lifo_buffer.h
new file mode 100644
index 00000000000..dc0ed30ab43
--- /dev/null
+++ b/sql/sql_lifo_buffer.h
@@ -0,0 +1,412 @@
+/**
+  @defgroup Bi-directional LIFO buffers used by DS-MRR implementation
+  @{
+*/
+
+class Forward_lifo_buffer;
+class Backward_lifo_buffer;
+
+
+/*
+  A base class for in-memory buffer used by DS-MRR implementation. Common
+  properties:
+  - The buffer is last-in-first-out, i.e. elements that are written last are
+    read first.
+  - The buffer contains fixed-size elements. The elements are either atomic
+    byte sequences or pairs of them.
+  - The buffer resides in the memory provided by the user. It is possible to
+     = dynamically (i.e. between write operations) add adjacent memory space to
+       the buffer
+     = dynamically remove unused space from the buffer.
+    The intent of this is to allow to have two buffers on adjacent memory
+    space, one is being read from (and so its space shrinks), while the other 
+    is being written to (and so it needs more and more space).
+
+  There are two concrete classes, Forward_lifo_buffer and Backward_lifo_buffer.
+*/
+
+class Lifo_buffer 
+{
+protected:
+  /**
+    Pointers to data to be written. write() call will assume that 
+    (*write_ptr1) points to size1 bytes of data to be written.
+    If write_ptr2 != NULL then the buffer stores pairs, and (*write_ptr2) 
+    points to size2 bytes of data that form the second component.
+  */
+  uchar **write_ptr1;
+  size_t size1;
+  uchar **write_ptr2;
+  size_t size2;
+
+  /**
+    read() will do reading by storing pointer to read data into *read_ptr1 (if
+    the buffer stores atomic elements), or into {*read_ptr1, *read_ptr2} (if
+    the buffer stores pairs).
+  */
+  uchar **read_ptr1;
+  uchar **read_ptr2;
+
+  uchar *start; /**< points to start of buffer space */
+  uchar *end;   /**< points to just beyond the end of buffer space */
+public:
+
+  enum enum_direction {
+    BACKWARD=-1, /**< buffer is filled/read from bigger to smaller memory addresses */
+    FORWARD=1  /**< buffer is filled/read from smaller to bigger memory addresses */
+  };
+
+  virtual enum_direction type() = 0;
+
+  /* Buffer space control functions */
+
+  /** Let the buffer store data in the given space. */
+  void set_buffer_space(uchar *start_arg, uchar *end_arg) 
+  {
+    start= start_arg;
+    end= end_arg;
+    TRASH(start, end - start);
+    reset();
+  }
+  
+  /** 
+    Specify where write() should get the source data from, as well as source
+    data size.
+  */
+  void setup_writing(uchar **data1, size_t len1, uchar **data2, size_t len2)
+  {
+    write_ptr1= data1;
+    size1= len1;
+    write_ptr2= data2;
+    size2= len2;
+  }
+
+  /** 
+    Specify where read() should store pointers to read data, as well as read
+    data size. The sizes must match those passed to setup_writing().
+  */
+  void setup_reading(uchar **data1, size_t len1, uchar **data2, size_t len2)
+  {
+    read_ptr1= data1;
+    DBUG_ASSERT(len1 == size1);
+    read_ptr2= data2;
+    DBUG_ASSERT(len2 == size2);
+  }
+  
+  bool can_write()
+  {
+    return have_space_for(size1 + (write_ptr2 ? size2 : 0));
+  }
+  virtual void write() = 0;
+
+  bool is_empty() { return used_size() == 0; }
+  virtual bool read() = 0;
+  
+  void sort(qsort2_cmp cmp_func, void *cmp_func_arg)
+  {
+    uint elem_size= size1 + (write_ptr2 ? size2 : 0);
+    uint n_elements= used_size() / elem_size;
+    my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
+  }
+
+  virtual void reset() = 0;
+  virtual uchar *end_of_space() = 0;
+protected:
+  bool have_data(size_t bytes)
+  {
+    return (used_size() >= bytes);
+  }
+  virtual bool have_space_for(size_t bytes) = 0;
+  virtual size_t used_size() = 0;
+
+public:
+
+  virtual void remove_unused_space(uchar **unused_start, uchar **unused_end)=0;
+  virtual uchar *used_area() = 0;
+   
+  /** Iterator to walk over contents of the buffer without reading it. */
+  class Iterator
+  {
+  public:
+    virtual void init(Lifo_buffer *buf) = 0;
+    /*
+      Read the next value. The calling convention is the same as buf->read()
+      has.
+
+      @retval FALSE - ok
+      @retval TRUE  - EOF, reached the end of the buffer
+    */
+    virtual bool read_next()= 0;
+    virtual ~Iterator() {}
+  protected:
+    Lifo_buffer *buf;
+    virtual uchar *get_next(size_t nbytes)=0;
+  };
+  virtual ~Lifo_buffer() {};
+
+  friend class Forward_iterator;
+  friend class Backward_iterator;
+};
+
+
+/**
+  Forward LIFO buffer
+
+  The buffer that is being written to from start to end and read in the
+  reverse.  'pos' points to just beyond the end of used space.
+
+  It is possible to grow/shrink the buffer at the end bound
+
+     used space      unused space  
+   *==============*-----------------*
+   ^              ^                 ^
+   |              |                 +--- end
+   |              +---- pos              
+   +--- start           
+*/
+
+class Forward_lifo_buffer: public Lifo_buffer
+{
+  uchar *pos;
+public:
+  enum_direction type() { return FORWARD; }
+  size_t used_size()
+  {
+    return pos - start;
+  }
+  void reset()
+  {
+    pos= start;
+  }
+  uchar *end_of_space() { return pos; }
+  bool have_space_for(size_t bytes)
+  {
+    return (pos + bytes < end);
+  }
+
+  void write()
+  {
+    write_bytes(*write_ptr1, size1);
+    if (write_ptr2)
+      write_bytes(*write_ptr2, size2);
+  }
+  void write_bytes(const uchar *data, size_t bytes)
+  {
+    DBUG_ASSERT(have_space_for(bytes));
+    memcpy(pos, data, bytes);
+    pos += bytes;
+  }
+  uchar *read_bytes(size_t bytes)
+  {
+    DBUG_ASSERT(have_data(bytes));
+    pos= pos - bytes;
+    return pos;
+  }
+  bool read()
+  {
+    if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
+      return TRUE;
+    if (read_ptr2)
+      *read_ptr2= read_bytes(size2);
+    *read_ptr1= read_bytes(size1);
+    return FALSE;
+  }
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    DBUG_ASSERT(0); /* Don't need this yet */
+  }
+  /**
+    Add more space to the buffer. The caller must ensure that the space
+    being added is adjacent to the end of the buffer.
+
+    @param unused_start Start of space
+    @param unused_end   End of space
+  */
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    DBUG_ASSERT(unused_end >= unused_start);
+    DBUG_ASSERT(end == unused_start);
+    TRASH(unused_start, unused_end - unused_start);
+    end= unused_end;
+  }
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return start; }
+  friend class Forward_iterator;
+};
+
+
+/**
+  Iterator for Forward_lifo_buffer
+*/
+
+class Forward_iterator : public Lifo_buffer::Iterator
+{
+  uchar *pos;
+
+  /** Return pointer to next chunk of nbytes bytes and advance over it */
+  uchar *get_next(size_t nbytes)
+  {
+    if (pos - nbytes < ((Forward_lifo_buffer*)buf)->start)
+      return NULL;
+    pos -= nbytes;
+    return pos;
+  }
+public:
+  bool read_next()
+  {
+    uchar *res;
+    if (buf->read_ptr2)
+    {
+      if ((res= get_next(buf->size2)))
+      {
+        *(buf->read_ptr2)= res;
+        *buf->read_ptr1= get_next(buf->size1);
+        return FALSE;
+      }
+    }
+    else
+    {
+      if ((res= get_next(buf->size1)))
+      {
+        *(buf->read_ptr1)= res;
+        return FALSE;
+      }
+    }
+    return TRUE; /* EOF */
+  }
+
+  void init(Lifo_buffer *buf_arg)
+  {
+    DBUG_ASSERT(buf_arg->type() == Lifo_buffer::FORWARD);
+    buf= buf_arg;
+    pos= ((Forward_lifo_buffer*)buf)->pos;
+  }
+};
+
+
+/**
+  Backward LIFO buffer
+
+  The buffer that is being written to from end to start and read in the
+  reverse.  'pos' points to the start of used space.
+
+  It is possible to grow/shrink the buffer at the start.
+
+     unused space      used space  
+   *--------------*=================*
+   ^              ^                 ^
+   |              |                 +--- end
+   |              +---- pos              
+   +--- start           
+*/
+class Backward_lifo_buffer: public Lifo_buffer
+{
+  uchar *pos;
+public:
+  enum_direction type() { return BACKWARD; }
+ 
+  size_t used_size()
+  {
+    return end - pos;
+  }
+  void reset()
+  {
+    pos= end;
+  }
+  uchar *end_of_space() { return end; }
+  bool have_space_for(size_t bytes)
+  {
+    return (pos - bytes >= start);
+  }
+  void write()
+  {
+    if (write_ptr2)
+      write_bytes(*write_ptr2, size2);
+    write_bytes(*write_ptr1, size1);
+  }
+  void write_bytes(const uchar *data, size_t bytes)
+  {
+    DBUG_ASSERT(have_space_for(bytes));
+    pos -= bytes;
+    memcpy(pos, data, bytes);
+  }
+  bool read()
+  {
+    if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
+      return TRUE;
+    *read_ptr1= read_bytes(size1);
+    if (read_ptr2)
+      *read_ptr2= read_bytes(size2);
+    return FALSE;
+  }
+  uchar *read_bytes(size_t bytes)
+  {
+    DBUG_ASSERT(have_data(bytes));
+    uchar *ret= pos;
+    pos= pos + bytes;
+    return ret;
+  }
+  /**
+    Stop using/return the unused part of the space
+    @param unused_start  OUT Start of the unused space
+    @param unused_end    OUT End of the unused space
+  */
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    *unused_start= start;
+    *unused_end= pos;
+    start= pos;
+  }
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    DBUG_ASSERT(0); /* Not used for backward buffers */
+  }
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return pos; }
+  friend class Backward_iterator;
+};
+
+
+/**
+  Iterator for Backward_lifo_buffer
+*/
+
+class Backward_iterator : public Lifo_buffer::Iterator
+{
+  uchar *pos;
+  /* Return pointer to next chunk of nbytes bytes and advance over it */
+  uchar *get_next(size_t nbytes)
+  {
+    if (pos + nbytes > ((Backward_lifo_buffer*)buf)->end)
+      return NULL;
+    uchar *res= pos;
+    pos += nbytes;
+    return res;
+  }
+public:
+  bool read_next()
+  {
+    /*
+      Always read the first component first (if the buffer is backwards, we
+      have written the second component first).
+    */
+    uchar *res;
+    if ((res= get_next(buf->size1)))
+    {
+      *(buf->read_ptr1)= res;
+      if (buf->read_ptr2)
+        *buf->read_ptr2= get_next(buf->size2);
+      return FALSE;
+    }
+    return TRUE; /* EOF */
+  }
+  void init(Lifo_buffer *buf_arg)
+  {
+    DBUG_ASSERT(buf_arg->type() == Lifo_buffer::BACKWARD);
+    buf= buf_arg;
+    pos= ((Backward_lifo_buffer*)buf)->pos;
+  }
+};
+
+
+

From 61f26f0c62d2bfacb9c250baa1177e8186d5f0e7 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 28 Sep 2010 20:20:09 +0400
Subject: [PATCH 38/49] DS-MRR improvements: address review feedback - change
 dsmrr_next_from_index() to a switch-based state automaton-like structure.

---
 sql/multi_range_read.cc | 183 +++++++++++++++++++++++++++++++++++++++-
 sql/multi_range_read.h  |  17 +++-
 2 files changed, 195 insertions(+), 5 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index cf257ec4c7f..326344f1be8 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -373,10 +373,11 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   if (do_sort_keys)
   {
     know_key_tuple_params= FALSE;
-    in_index_range= FALSE;
+    //in_index_range= FALSE;
     h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
     h->mrr_funcs= *seq_funcs;
     keyno= (h->inited == handler::INDEX)? h->active_index : h2->active_index;
+    index_scan_state= IN_RANGE_LIST;
     dsmrr_fill_key_buffer();
     
     if (dsmrr_eof && !do_rndpos_scan)
@@ -816,7 +817,8 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
                             sizeof(void*));
 
   last_identical_key_ptr= NULL;
-  in_identical_keys_range= FALSE;
+  //in_identical_keys_range= FALSE;
+  index_scan_state= IN_RANGE_LIST;
   DBUG_VOID_RETURN;
 }
 
@@ -833,6 +835,23 @@ void DsMrr_impl::reallocate_buffer_space()
 }
 
 
+/**
+  Read out ranges from the buffer until we've reached the range with 
+  last_identical_key_ptr. 
+*/
+
+void DsMrr_impl::read_out_identical_ranges()
+{
+  if (last_identical_key_ptr)
+  {
+    /* key_buffer.read() reads to (cur_index_tuple, cur_range_info) */
+    while (!key_buffer->read() && (cur_index_tuple != last_identical_key_ptr)) {}
+    last_identical_key_ptr= NULL;
+  }
+}
+
+
+
 /**
   DS-MRR/CPK: multi_range_read_next() function
   
@@ -854,6 +873,163 @@ void DsMrr_impl::reallocate_buffer_space()
   @retval Other               Some other error
 */
 
+int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
+{
+  DBUG_ENTER("DsMrr_impl::dsmrr_next_from_index");
+  int res;
+  handler *file= do_rndpos_scan? h2: h;
+  
+  while (1)
+  {
+    bool have_record= FALSE;
+    switch (index_scan_state)
+    {
+      case IN_IDENTICAL_KEYS_RANGE:
+      {
+        /* Get the next range_id for the current record */ 
+
+        /* read to (cur_index_tuple, cur_range_info) */
+        bool bres= identical_key_it->read_next();
+        DBUG_ASSERT(!bres);
+
+        if (cur_index_tuple == last_identical_key_ptr)
+        {
+          /* 
+            We've just got to the last of identical ranges. Next step is to
+            go next record
+          */
+          index_scan_state= index_ranges_unique? IN_RANGE_LIST : IN_INDEX_RANGE;
+        }
+        have_record= TRUE;
+        break;
+      }
+      case IN_INDEX_RANGE:
+      {
+        /* Get the next record from the range */
+        res= file->ha_index_next_same(table->record[0], cur_index_tuple, 
+                                      key_tuple_length);
+        if (res)
+        {
+          if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+            return res;  /* Fatal error */
+
+          /* Got EOF for this range, go get the next range */
+          index_scan_state= IN_RANGE_LIST;
+          break;
+        }
+        
+        have_record= TRUE;
+        if (last_identical_key_ptr)
+        {
+          /* 
+            If the range we're scanning is one of the set of identical ranges,
+            return this record with range_id of each range
+          */
+          index_scan_state= IN_IDENTICAL_KEYS_RANGE;
+          identical_key_it->init(key_buffer);
+          cur_range_info= first_identical_range_info;
+        }
+        break;
+      }
+      case IN_RANGE_LIST:
+      {
+        if (do_rndpos_scan)
+          reallocate_buffer_space();
+
+        /* Get the next range to scan */
+        if (key_buffer->read()) /* read to (cur_index_tuple,cur_range_info) */
+        {
+          index_scan_state= NEED_MORE_RANGES;
+          break;
+        }
+        uchar *key_in_buf= cur_index_tuple;
+
+        if (use_key_pointers)
+          cur_index_tuple= *((uchar**)cur_index_tuple);
+
+        res= file->ha_index_read_map(table->record[0], cur_index_tuple, 
+                                     key_tuple_map, HA_READ_KEY_EXACT);
+
+        if (res && res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+          return res; /* Fatal error */
+        
+        /* 
+          Check if subsequent elements in the key buffer are the same as this
+          one
+        */
+        char *save_cur_range_info= cur_range_info;
+        identical_key_it->init(key_buffer);
+        last_identical_key_ptr= NULL;
+        while (!identical_key_it->read_next())
+        {
+          if (key_tuple_cmp(this, key_in_buf, cur_index_tuple))
+            break;
+          last_identical_key_ptr= cur_index_tuple;
+        }
+        cur_range_info= save_cur_range_info;
+
+        if (last_identical_key_ptr)
+        {
+          index_scan_state= IN_IDENTICAL_KEYS_RANGE;
+          identical_key_it->init(key_buffer);
+          first_identical_range_info= cur_range_info;
+        }
+        else
+          index_scan_state= index_ranges_unique? IN_RANGE_LIST : IN_INDEX_RANGE;
+
+        if (res)
+        {
+          read_out_identical_ranges();
+          index_scan_state= IN_RANGE_LIST;
+        }
+
+        have_record= TRUE;
+        break;
+      }
+      case NEED_MORE_RANGES:
+      {
+        if (dsmrr_eof)
+        {
+          index_scan_state= SCAN_FINISHED;
+          return HA_ERR_END_OF_FILE;
+        }
+
+        /*
+          When rowid fetching is used, it controls all buffer refills. When we're
+          on our own, try refilling our buffer.
+        */
+        if (!do_rndpos_scan)
+          dsmrr_fill_key_buffer();
+
+        if (key_buffer->is_empty())
+        {
+          index_scan_state= SCAN_FINISHED;
+          return HA_ERR_END_OF_FILE;
+        }
+
+        index_scan_state= IN_RANGE_LIST;
+      }
+      default:
+        DBUG_ASSERT(0);
+        break;
+    }
+    
+    if (have_record &&
+        (!h->mrr_funcs.skip_index_tuple ||
+         h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) 
+        && 
+        (!h->mrr_funcs.skip_record ||
+         h->mrr_funcs.skip_record(h->mrr_iter, *(char**)cur_range_info, NULL)))
+    {
+      break;
+    }
+    /* Go get another (record, range_id) combination */
+  } /* while */
+  
+  DBUG_RETURN(0);
+}
+
+#if 0
 int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 {
   int res;
@@ -995,6 +1171,7 @@ check_record:
 end:
   return res;
 }
+#endif
 
 
 /**
@@ -1044,7 +1221,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     {
       if (do_sort_keys)
       {
-        if (!key_buffer->is_empty() || in_index_range) 
+        if (index_scan_state != SCAN_FINISHED) 
         {
           /* There are some sorted keys left. Use them to get rowids */
           if ((res= dsmrr_fill_rowid_buffer()))
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 2ac477a0949..dfe3f99195a 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -197,11 +197,22 @@ private:
  
   /** Index scaning and key buffer-related members **/
   
+
+  enum enum_index_scan_state {
+    NEED_MORE_RANGES,
+    IN_RANGE_LIST,
+    IN_INDEX_RANGE,
+    IN_IDENTICAL_KEYS_RANGE,
+    SCAN_FINISHED
+  };
+
+  enum enum_index_scan_state index_scan_state;
+
   /* TRUE <=> We can get at most one index tuple for a lookup key */
   bool index_ranges_unique;
 
   /* TRUE<=> we're in a middle of enumerating records for a key range */
-  bool in_index_range;
+  //bool in_index_range;
   
   /*
     One of the following two is used for key buffer: forward is used when 
@@ -249,7 +260,7 @@ private:
     subsequent key values are the same as the one we've already retrieved and
     returned index tuple for.
   */
-  bool in_identical_keys_range;
+  //bool in_identical_keys_range;
 
   /* range_id of the first of the identical keys */
   char *first_identical_range_info;
@@ -303,6 +314,8 @@ private:
 
   void setup_buffer_sizes(key_range *sample_key);
   void reallocate_buffer_space();
+  
+  void read_out_identical_ranges();
 
   static range_seq_t key_buf_seq_init(void *init_param, uint n_ranges, uint flags);
   static uint key_buf_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);

From 22d5323fac784a0c3f337d1d47e8011bf161422f Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Fri, 1 Oct 2010 15:54:35 +0400
Subject: [PATCH 39/49] DS-MRR/CPK improvements: more of addressing review
 feedback

---
 sql/multi_range_read.cc | 48 +++++++++++++++++++++++------------------
 sql/multi_range_read.h  | 10 ++++-----
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 326344f1be8..9ffc9fed3df 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -377,7 +377,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
     h->mrr_funcs= *seq_funcs;
     keyno= (h->inited == handler::INDEX)? h->active_index : h2->active_index;
-    index_scan_state= IN_RANGE_LIST;
     dsmrr_fill_key_buffer();
     
     if (dsmrr_eof && !do_rndpos_scan)
@@ -818,7 +817,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
 
   last_identical_key_ptr= NULL;
   //in_identical_keys_range= FALSE;
-  index_scan_state= IN_RANGE_LIST;
+  index_scan_state= GET_NEXT_RANGE;
   DBUG_VOID_RETURN;
 }
 
@@ -884,7 +883,7 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
     bool have_record= FALSE;
     switch (index_scan_state)
     {
-      case IN_IDENTICAL_KEYS_RANGE:
+      case GET_NEXT_IDENTICAL_KEY:
       {
         /* Get the next range_id for the current record */ 
 
@@ -898,12 +897,12 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
             We've just got to the last of identical ranges. Next step is to
             go next record
           */
-          index_scan_state= index_ranges_unique? IN_RANGE_LIST : IN_INDEX_RANGE;
+          index_scan_state= index_ranges_unique? GET_NEXT_RANGE : GET_NEXT_RECORD;
         }
         have_record= TRUE;
         break;
       }
-      case IN_INDEX_RANGE:
+      case GET_NEXT_RECORD:
       {
         /* Get the next record from the range */
         res= file->ha_index_next_same(table->record[0], cur_index_tuple, 
@@ -911,10 +910,10 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
         if (res)
         {
           if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
-            return res;  /* Fatal error */
+            DBUG_RETURN(res);  /* Fatal error */
 
           /* Got EOF for this range, go get the next range */
-          index_scan_state= IN_RANGE_LIST;
+          index_scan_state= GET_NEXT_RANGE;
           break;
         }
         
@@ -925,21 +924,23 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
             If the range we're scanning is one of the set of identical ranges,
             return this record with range_id of each range
           */
-          index_scan_state= IN_IDENTICAL_KEYS_RANGE;
+          index_scan_state= GET_NEXT_IDENTICAL_KEY;
           identical_key_it->init(key_buffer);
           cur_range_info= first_identical_range_info;
+          have_record= FALSE; //psergey4
         }
         break;
       }
-      case IN_RANGE_LIST:
+      case GET_NEXT_RANGE:
       {
+        read_out_identical_ranges();
         if (do_rndpos_scan)
           reallocate_buffer_space();
 
         /* Get the next range to scan */
         if (key_buffer->read()) /* read to (cur_index_tuple,cur_range_info) */
         {
-          index_scan_state= NEED_MORE_RANGES;
+          index_scan_state= REFILL_KEY_BUFFER;
           break;
         }
         uchar *key_in_buf= cur_index_tuple;
@@ -951,7 +952,7 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
                                      key_tuple_map, HA_READ_KEY_EXACT);
 
         if (res && res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
-          return res; /* Fatal error */
+          DBUG_RETURN(res); /* Fatal error */
         
         /* 
           Check if subsequent elements in the key buffer are the same as this
@@ -970,28 +971,32 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 
         if (last_identical_key_ptr)
         {
-          index_scan_state= IN_IDENTICAL_KEYS_RANGE;
+          index_scan_state= GET_NEXT_IDENTICAL_KEY;
           identical_key_it->init(key_buffer);
           first_identical_range_info= cur_range_info;
+          have_record= FALSE; //psergey4
         }
         else
-          index_scan_state= index_ranges_unique? IN_RANGE_LIST : IN_INDEX_RANGE;
+        {
+          index_scan_state= index_ranges_unique? GET_NEXT_RANGE : GET_NEXT_RECORD;
+          have_record= TRUE;
+        }
 
         if (res)
         {
           read_out_identical_ranges();
-          index_scan_state= IN_RANGE_LIST;
+          index_scan_state= GET_NEXT_RANGE;
+          have_record= FALSE;
         }
 
-        have_record= TRUE;
         break;
       }
-      case NEED_MORE_RANGES:
+      case REFILL_KEY_BUFFER:
       {
         if (dsmrr_eof)
         {
           index_scan_state= SCAN_FINISHED;
-          return HA_ERR_END_OF_FILE;
+          DBUG_RETURN(HA_ERR_END_OF_FILE);
         }
 
         /*
@@ -1004,10 +1009,10 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
         if (key_buffer->is_empty())
         {
           index_scan_state= SCAN_FINISHED;
-          return HA_ERR_END_OF_FILE;
+          DBUG_RETURN(HA_ERR_END_OF_FILE);
         }
 
-        index_scan_state= IN_RANGE_LIST;
+        index_scan_state= GET_NEXT_RANGE;
       }
       default:
         DBUG_ASSERT(0);
@@ -1016,16 +1021,17 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
     
     if (have_record &&
         (!h->mrr_funcs.skip_index_tuple ||
-         h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) 
+         !h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) 
         && 
         (!h->mrr_funcs.skip_record ||
-         h->mrr_funcs.skip_record(h->mrr_iter, *(char**)cur_range_info, NULL)))
+         !h->mrr_funcs.skip_record(h->mrr_iter, *(char**)cur_range_info, NULL)))
     {
       break;
     }
     /* Go get another (record, range_id) combination */
   } /* while */
   
+  memcpy(range_info_arg, cur_range_info, sizeof(void*));
   DBUG_RETURN(0);
 }
 
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index dfe3f99195a..2a0b4d6a59e 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -199,10 +199,10 @@ private:
   
 
   enum enum_index_scan_state {
-    NEED_MORE_RANGES,
-    IN_RANGE_LIST,
-    IN_INDEX_RANGE,
-    IN_IDENTICAL_KEYS_RANGE,
+    REFILL_KEY_BUFFER,
+    GET_NEXT_RANGE,
+    GET_NEXT_RECORD,
+    GET_NEXT_IDENTICAL_KEY,
     SCAN_FINISHED
   };
 
@@ -238,7 +238,7 @@ private:
     we may have a situation where we've read everything from the key buffer but 
     haven't finished with getting index tuples for the last key)
   */
-  bool key_eof;
+  //bool key_eof;
 
   /* Initially FALSE, becomes TRUE when we've set key_tuple_xxx members */
   bool know_key_tuple_params;

From 51564575068494463d6591356c28d771858e280f Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 3 Oct 2010 14:48:42 +0400
Subject: [PATCH 40/49] Address review feedback - One iterator class - Switch
 back from state automaton to the two-nested-iterators approach.

---
 mysql-test/t/join_nested.test   |   1 -
 mysql-test/t/subselect_sj2.test |   1 +
 sql/multi_range_read.cc         | 419 ++++++++++----------------------
 sql/multi_range_read.h          |  95 ++++----
 sql/sql_lifo_buffer.h           | 179 ++++----------
 5 files changed, 220 insertions(+), 475 deletions(-)

diff --git a/mysql-test/t/join_nested.test b/mysql-test/t/join_nested.test
index 5b07d8966f1..97404b0440c 100644
--- a/mysql-test/t/join_nested.test
+++ b/mysql-test/t/join_nested.test
@@ -462,7 +462,6 @@ SELECT t2.a,t2.b,t3.a,t3.b,t4.a,t4.b
        LEFT JOIN              
        (t1,t2)
        ON t3.a=1 AND t3.b=t2.b AND t2.b=t4.b;
-
 SELECT t2.a,t2.b,t3.a,t3.b,t4.a,t4.b
   FROM (t3,t4)
        LEFT JOIN              
diff --git a/mysql-test/t/subselect_sj2.test b/mysql-test/t/subselect_sj2.test
index e73e7cfade2..5c40da0f56f 100644
--- a/mysql-test/t/subselect_sj2.test
+++ b/mysql-test/t/subselect_sj2.test
@@ -586,6 +586,7 @@ if (`select @@join_cache_level=6`)
   --echo # Not anymore:
   --echo # The following query gives wrong result due to Bug#49129
 }
+select sin(0);
 select * from t0 where t0.a in 
   (select t1.a from t1, t2 where t2.a=t0.a and t1.b=t2.b);
 
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 9ffc9fed3df..0cf31c222c6 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -682,7 +682,7 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   {
     /* Give all space to forward key buffer. */
     key_buffer= &forward_key_buf;
-    identical_key_it= &forward_key_it;
+    //identical_key_it= &forward_key_it;
     key_buffer->set_buffer_space(full_buf, full_buf_end);
 
     /* Just in case, tell rowid buffer that it has zero size: */
@@ -731,7 +731,7 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   rowid_buffer_end= full_buf + bytes_for_rowids;
   rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
   key_buffer= &backward_key_buf;
-  identical_key_it= &backward_key_it;
+  //identical_key_it= &backward_key_it;
   key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end); 
 }
 
@@ -771,7 +771,7 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
       */
       rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
       key_buffer= &backward_key_buf;
-      identical_key_it= &backward_key_it;
+      //identical_key_it= &backward_key_it;
       key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
     }
     key_buffer->reset();
@@ -815,9 +815,12 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
                             is_mrr_assoc? (uchar**)&cur_range_info: NULL,
                             sizeof(void*));
 
-  last_identical_key_ptr= NULL;
+  //last_identical_key_ptr= NULL;
   //in_identical_keys_range= FALSE;
-  index_scan_state= GET_NEXT_RANGE;
+  //index_scan_state= GET_NEXT_RANGE;
+  scanning_key_val_iter= FALSE;
+  index_scan_eof= FALSE; 
+
   DBUG_VOID_RETURN;
 }
 
@@ -834,22 +837,88 @@ void DsMrr_impl::reallocate_buffer_space()
 }
 
 
-/**
-  Read out ranges from the buffer until we've reached the range with 
-  last_identical_key_ptr. 
-*/
-
-void DsMrr_impl::read_out_identical_ranges()
+//////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+bool Key_value_records_iterator::init(DsMrr_impl *dsmrr_arg)
 {
-  if (last_identical_key_ptr)
+  int res;
+  dsmrr= dsmrr_arg;
+  handler *file= dsmrr->do_rndpos_scan? dsmrr->h2 : dsmrr->h;
+
+  identical_key_it.init(dsmrr->key_buffer);
+  /* Get the first pair into (cur_index_tuple, cur_range_info) */ 
+  if (identical_key_it.read())
+    return TRUE;
+
+  uchar *key_in_buf= dsmrr->cur_index_tuple;
+
+  if (dsmrr->use_key_pointers)
+    dsmrr->cur_index_tuple= *((uchar**)dsmrr->cur_index_tuple);
+  
+  /* Check out how many more identical keys are following */
+  //char *save_cur_range_info= cur_range_info;
+  uchar *save_cur_index_tuple= dsmrr->cur_index_tuple;
+  last_identical_key_ptr= dsmrr->cur_index_tuple;
+  while (!identical_key_it.read())
   {
-    /* key_buffer.read() reads to (cur_index_tuple, cur_range_info) */
-    while (!key_buffer->read() && (cur_index_tuple != last_identical_key_ptr)) {}
-    last_identical_key_ptr= NULL;
+    if (DsMrr_impl::key_tuple_cmp(dsmrr, key_in_buf, dsmrr->cur_index_tuple))
+      break;
+    last_identical_key_ptr= dsmrr->cur_index_tuple;
   }
+  identical_key_it.init(dsmrr->key_buffer);
+  dsmrr->cur_index_tuple= save_cur_index_tuple;
+  //cur_range_info= save_cur_range_info;
+  res= file->ha_index_read_map(dsmrr->table->record[0], 
+                               dsmrr->cur_index_tuple, 
+                               dsmrr->key_tuple_map, 
+                               HA_READ_KEY_EXACT);
+
+  if (res)
+  {
+    close();
+    return res; /* Fatal error */
+  }
+  get_next_row= FALSE;
+  return 0;
 }
 
 
+int Key_value_records_iterator::get_next()
+{
+  handler *file= dsmrr->do_rndpos_scan? dsmrr->h2 : dsmrr->h;
+  int res;
+
+  if (get_next_row)
+  {
+    if (dsmrr->index_ranges_unique)
+      return HA_ERR_END_OF_FILE;  /* Max one match */
+
+    if ((res= file->ha_index_next_same(dsmrr->table->record[0], 
+                                       dsmrr->cur_index_tuple, 
+                                       dsmrr->key_tuple_length)))
+    {
+      /* EOF is EOF for iterator, also, any error means EOF on the iterator */
+      return res;
+    }
+    identical_key_it.init(dsmrr->key_buffer);
+  }
+
+  identical_key_it.read(); // This gets us next range_id.
+  if (!last_identical_key_ptr || (dsmrr->cur_index_tuple == last_identical_key_ptr))
+  {
+    get_next_row= TRUE;
+  }
+  return 0;
+}
+
+void Key_value_records_iterator::close()
+{
+  while (!dsmrr->key_buffer->read() && 
+         (dsmrr->cur_index_tuple != last_identical_key_ptr)) {}
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
 
 /**
   DS-MRR/CPK: multi_range_read_next() function
@@ -871,154 +940,55 @@ void DsMrr_impl::read_out_identical_ranges()
   @retval HA_ERR_END_OF_FILE  End of records
   @retval Other               Some other error
 */
-
 int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 {
   DBUG_ENTER("DsMrr_impl::dsmrr_next_from_index");
-  int res;
-  handler *file= do_rndpos_scan? h2: h;
-  
+  //handler *file= do_rndpos_scan? h2: h;
+
   while (1)
   {
     bool have_record= FALSE;
-    switch (index_scan_state)
+    if (scanning_key_val_iter)
     {
-      case GET_NEXT_IDENTICAL_KEY:
+      if (kv_it.get_next())
       {
-        /* Get the next range_id for the current record */ 
-
-        /* read to (cur_index_tuple, cur_range_info) */
-        bool bres= identical_key_it->read_next();
-        DBUG_ASSERT(!bres);
-
-        if (cur_index_tuple == last_identical_key_ptr)
-        {
-          /* 
-            We've just got to the last of identical ranges. Next step is to
-            go next record
-          */
-          index_scan_state= index_ranges_unique? GET_NEXT_RANGE : GET_NEXT_RECORD;
-        }
+        kv_it.close();
+        scanning_key_val_iter= FALSE;
+      }
+      else
         have_record= TRUE;
-        break;
-      }
-      case GET_NEXT_RECORD:
+    }
+    else
+    {
+      while (kv_it.init(this))
       {
-        /* Get the next record from the range */
-        res= file->ha_index_next_same(table->record[0], cur_index_tuple, 
-                                      key_tuple_length);
-        if (res)
-        {
-          if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
-            DBUG_RETURN(res);  /* Fatal error */
-
-          /* Got EOF for this range, go get the next range */
-          index_scan_state= GET_NEXT_RANGE;
-          break;
-        }
-        
-        have_record= TRUE;
-        if (last_identical_key_ptr)
-        {
-          /* 
-            If the range we're scanning is one of the set of identical ranges,
-            return this record with range_id of each range
-          */
-          index_scan_state= GET_NEXT_IDENTICAL_KEY;
-          identical_key_it->init(key_buffer);
-          cur_range_info= first_identical_range_info;
-          have_record= FALSE; //psergey4
-        }
-        break;
-      }
-      case GET_NEXT_RANGE:
-      {
-        read_out_identical_ranges();
-        if (do_rndpos_scan)
-          reallocate_buffer_space();
-
-        /* Get the next range to scan */
-        if (key_buffer->read()) /* read to (cur_index_tuple,cur_range_info) */
-        {
-          index_scan_state= REFILL_KEY_BUFFER;
-          break;
-        }
-        uchar *key_in_buf= cur_index_tuple;
-
-        if (use_key_pointers)
-          cur_index_tuple= *((uchar**)cur_index_tuple);
-
-        res= file->ha_index_read_map(table->record[0], cur_index_tuple, 
-                                     key_tuple_map, HA_READ_KEY_EXACT);
-
-        if (res && res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
-          DBUG_RETURN(res); /* Fatal error */
-        
-        /* 
-          Check if subsequent elements in the key buffer are the same as this
-          one
-        */
-        char *save_cur_range_info= cur_range_info;
-        identical_key_it->init(key_buffer);
-        last_identical_key_ptr= NULL;
-        while (!identical_key_it->read_next())
-        {
-          if (key_tuple_cmp(this, key_in_buf, cur_index_tuple))
-            break;
-          last_identical_key_ptr= cur_index_tuple;
-        }
-        cur_range_info= save_cur_range_info;
-
-        if (last_identical_key_ptr)
-        {
-          index_scan_state= GET_NEXT_IDENTICAL_KEY;
-          identical_key_it->init(key_buffer);
-          first_identical_range_info= cur_range_info;
-          have_record= FALSE; //psergey4
-        }
-        else
-        {
-          index_scan_state= index_ranges_unique? GET_NEXT_RANGE : GET_NEXT_RECORD;
-          have_record= TRUE;
-        }
-
-        if (res)
-        {
-          read_out_identical_ranges();
-          index_scan_state= GET_NEXT_RANGE;
-          have_record= FALSE;
-        }
-
-        break;
-      }
-      case REFILL_KEY_BUFFER:
-      {
-        if (dsmrr_eof)
-        {
-          index_scan_state= SCAN_FINISHED;
-          DBUG_RETURN(HA_ERR_END_OF_FILE);
-        }
-
-        /*
-          When rowid fetching is used, it controls all buffer refills. When we're
-          on our own, try refilling our buffer.
-        */
-        if (!do_rndpos_scan)
-          dsmrr_fill_key_buffer();
-
+        /* Failed to initialize iterator */
         if (key_buffer->is_empty())
         {
-          index_scan_state= SCAN_FINISHED;
-          DBUG_RETURN(HA_ERR_END_OF_FILE);
-        }
+          if (dsmrr_eof)
+          {
+            index_scan_eof= TRUE;
+            DBUG_RETURN(HA_ERR_END_OF_FILE);
+          }
 
-        index_scan_state= GET_NEXT_RANGE;
+          /*
+            When rowid fetching is used, it controls all buffer refills. When we're
+            on our own, try refilling our buffer.
+          */
+          if (!do_rndpos_scan)
+            dsmrr_fill_key_buffer();
+
+          if (key_buffer->is_empty())
+          {
+            index_scan_eof= TRUE;
+            DBUG_RETURN(HA_ERR_END_OF_FILE);
+          }
+        }
       }
-      default:
-        DBUG_ASSERT(0);
-        break;
+      /* if we got here, it means iterator was successfully initialized */
+      scanning_key_val_iter= TRUE;
     }
-    
+
     if (have_record &&
         (!h->mrr_funcs.skip_index_tuple ||
          !h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) 
@@ -1030,156 +1000,11 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
     }
     /* Go get another (record, range_id) combination */
   } /* while */
-  
+
   memcpy(range_info_arg, cur_range_info, sizeof(void*));
   DBUG_RETURN(0);
 }
 
-#if 0
-int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
-{
-  int res;
-  uchar *key_in_buf;
-  handler *file= do_rndpos_scan? h2: h;
-  bool res2;
-
-  while (in_identical_keys_range)
-  {
-    /* This will read to (cur_index_tuple, cur_range_info): */
-    res2= identical_key_it->read_next();
-    DBUG_ASSERT(!res2);
-
-    if (cur_index_tuple == last_identical_key_ptr)
-    {
-      /* We're looking at the last of the identical keys */
-      in_identical_keys_range= FALSE;
-    }
-check_record:
-    if ((h->mrr_funcs.skip_index_tuple &&
-         h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) || 
-        (h->mrr_funcs.skip_record &&
-         h->mrr_funcs.skip_record(h->mrr_iter, *(char**)cur_range_info, NULL)))
-    {
-      continue;
-    }
-    memcpy(range_info_arg, cur_range_info, sizeof(void*));
-    return 0;
-  }
-  
-  /* Try returrning next record from the current range */
-  while (in_index_range)
-  {
-    res= file->ha_index_next_same(table->record[0], cur_index_tuple, 
-                                  key_tuple_length);
-    
-    if (res)
-    {
-      if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
-        return res;  /* Fatal error */
-
-      in_index_range= FALSE; /* no more records here */
-      break;
-    }
-    
-    if (last_identical_key_ptr)
-    {
-      in_identical_keys_range= TRUE;
-      identical_key_it->init(key_buffer);
-      cur_range_info= first_identical_range_info;
-    }
-
-    goto check_record;
-  }
-
-  while(1)
-  {
-    DBUG_ASSERT(!in_identical_keys_range && !in_index_range);
-
-    /* Jump over the keys that were handled by identical key processing */
-    if (last_identical_key_ptr)
-    {
-      /* key_buffer.read() reads to (cur_index_tuple, cur_range_info) */
-      while (!key_buffer->read() && (cur_index_tuple != last_identical_key_ptr)) {}
-      last_identical_key_ptr= NULL;
-    }
-
-    /* First, make sure we have a range at start of the buffer */
-    if (key_buffer->is_empty())
-    {
-      if (dsmrr_eof)
-      {
-        res= HA_ERR_END_OF_FILE;
-        goto end;
-      }
-      /*
-        When rowid fetching is used, it controls all buffer refills. When we're
-        on our own, try refilling our buffer.
-      */
-      if (!do_rndpos_scan)
-        dsmrr_fill_key_buffer();
-
-      if (key_buffer->is_empty())
-      {
-        res= HA_ERR_END_OF_FILE;
-        goto end;
-      }
-    }
-    
-    /*
-      At this point we're not using anything what we've read from key
-      buffer. Cut off unused key buffer space and give it to the rowid
-      buffer.
-    */
-    if (do_rndpos_scan)
-      reallocate_buffer_space();
-
-    /* Get the next range to scan */
-    key_buffer->read(); // reads to (cur_index_tuple, cur_range_info)
-    key_in_buf= cur_index_tuple;
-
-    if (use_key_pointers)
-      cur_index_tuple= *((uchar**)cur_index_tuple);
-
-    /* Do index lookup */
-    if ((res= file->ha_index_read_map(table->record[0], cur_index_tuple, 
-                                      key_tuple_map, HA_READ_KEY_EXACT)))
-    {
-      if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
-        return res;
-      continue; /* to next key and make another lookup */
-    }
-
-    /* Check if subsequent keys in the key buffer are the same as this one */
-    {
-      char *save_cur_range_info= cur_range_info;
-      identical_key_it->init(key_buffer);
-      last_identical_key_ptr= NULL;
-      while (!identical_key_it->read_next())
-      {
-        if (key_tuple_cmp(this, key_in_buf, cur_index_tuple))
-          break;
-
-        last_identical_key_ptr= cur_index_tuple;
-      }
-      cur_range_info= save_cur_range_info;
-      if (last_identical_key_ptr)
-      {
-        in_identical_keys_range= TRUE;
-        identical_key_it->init(key_buffer);
-        first_identical_range_info= cur_range_info;
-      }
-    }
-
-    in_index_range= !index_ranges_unique;
-    goto check_record;
-  }
- 
-end:
-  return res;
-}
-#endif
-
-
 /**
   DS-MRR implementation: multi_range_read_next() function.
 
@@ -1227,7 +1052,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     {
       if (do_sort_keys)
       {
-        if (index_scan_state != SCAN_FINISHED) 
+        if (index_scan_eof) 
         {
           /* There are some sorted keys left. Use them to get rowids */
           if ((res= dsmrr_fill_rowid_buffer()))
@@ -1288,9 +1113,9 @@ int DsMrr_impl::dsmrr_next(char **range_info)
         Note: this implies that SQL layer doesn't touch table->record[0]
         between calls.
       */
-      Forward_iterator it;
+      Lifo_buffer_iterator it;
       it.init(&rowid_buffer);
-      while (!it.read_next()) // reads to (rowid, ...)
+      while (!it.read()) // reads to (rowid, ...)
       {
         if (h2->cmp_ref(rowid, cur_rowid))
           break;
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 2a0b4d6a59e..e93276055e5 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -48,6 +48,37 @@
 
 #include "sql_lifo_buffer.h"
 
+class DsMrr_impl;
+
+/**
+  Iterator over (record, range_id) pairs that match given key value.
+  
+  We may need to scan multiple (key_val, range_id) pairs with the same 
+  key value. A key value may have multiple matching records, so we'll need to
+  produce a cross-product of sets of matching records and range_id-s.
+*/
+
+class Key_value_records_iterator
+{
+  /* Scan parameters */
+  DsMrr_impl *dsmrr;
+  Lifo_buffer_iterator identical_key_it;
+  uchar *last_identical_key_ptr;
+  bool get_next_row;
+public:
+  /*
+  */
+  bool init(DsMrr_impl *dsmrr);
+
+  /*
+    Get next (key_val, range_id) pair.
+  */
+  int get_next();
+
+  void close();
+};
+
+
 /*
   DS-MRR implementation for one table. Create/use one object of this class for
   each ha_{myisam/innobase/etc} object. That object will be further referred to
@@ -196,17 +227,6 @@ private:
   uchar *rowid_buffer_end;
  
   /** Index scaning and key buffer-related members **/
-  
-
-  enum enum_index_scan_state {
-    REFILL_KEY_BUFFER,
-    GET_NEXT_RANGE,
-    GET_NEXT_RECORD,
-    GET_NEXT_IDENTICAL_KEY,
-    SCAN_FINISHED
-  };
-
-  enum enum_index_scan_state index_scan_state;
 
   /* TRUE <=> We can get at most one index tuple for a lookup key */
   bool index_ranges_unique;
@@ -220,25 +240,26 @@ private:
     buffers.
   */
   Forward_lifo_buffer forward_key_buf;
-  Forward_iterator forward_key_it;
   Backward_lifo_buffer backward_key_buf;
-  Backward_iterator backward_key_it;
 
   /* Buffer to store (key, range_id) pairs */
   Lifo_buffer *key_buffer;
-   
-  /* key_buffer.read() reads */
-  uchar *cur_index_tuple;
-
-  /* if in_index_range==TRUE: range_id of the range we're enumerating */
-  char *cur_range_info;
-
+  
+  /* Index scan state */
+  bool scanning_key_val_iter;
   /* 
     TRUE <=> we've got index tuples/rowids for all keys (need this flag because 
     we may have a situation where we've read everything from the key buffer but 
     haven't finished with getting index tuples for the last key)
   */
-  //bool key_eof;
+  bool index_scan_eof;  
+  Key_value_records_iterator kv_it;
+  
+  /* key_buffer.read() reads to here */
+  uchar *cur_index_tuple;
+
+  /* if in_index_range==TRUE: range_id of the range we're enumerating */
+  char *cur_range_info;
 
   /* Initially FALSE, becomes TRUE when we've set key_tuple_xxx members */
   bool know_key_tuple_params;
@@ -255,28 +276,6 @@ private:
   /* = key_size_in_keybuf [ + sizeof(range_assoc_info) ] */
   uint key_buff_elem_size;
   
-  /* 
-    TRUE <=> we're doing key-ordered index scan and right now several
-    subsequent key values are the same as the one we've already retrieved and
-    returned index tuple for.
-  */
-  //bool in_identical_keys_range;
-
-  /* range_id of the first of the identical keys */
-  char *first_identical_range_info;
-
-  /* Pointer to the last of the identical key values */
-  uchar *last_identical_key_ptr;
-
-  /* 
-    key_buffer iterator for walking the identical key range (we need to
-    enumerate the set of (identical_key, range_id) pairs multiple times,
-    and do that by walking from current buffer read position until we get
-    last_identical_key_ptr.
-  */
-  Lifo_buffer::Iterator *identical_key_it;
-
-
   /** rnd_pos() scan and rowid buffer-related members **/
 
   /*
@@ -288,12 +287,7 @@ private:
   /* rowid_buffer.read() will set the following:  */
   uchar *rowid;
   uchar *rowids_range_id;
-  
-  /*
-    not-NULL: we're traversing a group of (rowid, range_id) pairs with
-              identical rowid values, and this is the pointer to the last one.
-    NULL: we're not in the group of indentical rowids.
-  */
+
   uchar *last_identical_rowid;
 
   bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
@@ -315,10 +309,9 @@ private:
   void setup_buffer_sizes(key_range *sample_key);
   void reallocate_buffer_space();
   
-  void read_out_identical_ranges();
-
   static range_seq_t key_buf_seq_init(void *init_param, uint n_ranges, uint flags);
   static uint key_buf_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+  friend class Key_value_records_iterator;
 };
 
 /**
diff --git a/sql/sql_lifo_buffer.h b/sql/sql_lifo_buffer.h
index dc0ed30ab43..89b520484e4 100644
--- a/sql/sql_lifo_buffer.h
+++ b/sql/sql_lifo_buffer.h
@@ -112,40 +112,17 @@ public:
   virtual void reset() = 0;
   virtual uchar *end_of_space() = 0;
 protected:
-  bool have_data(size_t bytes)
-  {
-    return (used_size() >= bytes);
-  }
   virtual bool have_space_for(size_t bytes) = 0;
   virtual size_t used_size() = 0;
-
+  
+  /* To be used only by iterator class: */
+  virtual uchar *get_pos()= 0;
+  virtual bool read(uchar **position)= 0;
+  friend class Lifo_buffer_iterator;
 public:
-
   virtual void remove_unused_space(uchar **unused_start, uchar **unused_end)=0;
-  virtual uchar *used_area() = 0;
-   
-  /** Iterator to walk over contents of the buffer without reading it. */
-  class Iterator
-  {
-  public:
-    virtual void init(Lifo_buffer *buf) = 0;
-    /*
-      Read the next value. The calling convention is the same as buf->read()
-      has.
-
-      @retval FALSE - ok
-      @retval TRUE  - EOF, reached the end of the buffer
-    */
-    virtual bool read_next()= 0;
-    virtual ~Iterator() {}
-  protected:
-    Lifo_buffer *buf;
-    virtual uchar *get_next(size_t nbytes)=0;
-  };
+  virtual uchar *used_area() = 0; 
   virtual ~Lifo_buffer() {};
-
-  friend class Forward_iterator;
-  friend class Backward_iterator;
 };
 
 
@@ -196,19 +173,24 @@ public:
     memcpy(pos, data, bytes);
     pos += bytes;
   }
-  uchar *read_bytes(size_t bytes)
+  bool have_data(uchar *position, size_t bytes)
   {
-    DBUG_ASSERT(have_data(bytes));
-    pos= pos - bytes;
-    return pos;
+    return ((position - start) >= (ptrdiff_t)bytes);
   }
-  bool read()
+  uchar *read_bytes(uchar **position, size_t bytes)
   {
-    if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
+    DBUG_ASSERT(have_data(*position, bytes));
+    *position= (*position) - bytes;
+    return *position;
+  }
+  bool read() { return read(&pos); }
+  bool read(uchar **position)
+  {
+    if (!have_data(*position, size1 + (read_ptr2 ? size2 : 0)))
       return TRUE;
     if (read_ptr2)
-      *read_ptr2= read_bytes(size2);
-    *read_ptr1= read_bytes(size1);
+      *read_ptr2= read_bytes(position, size2);
+    *read_ptr1= read_bytes(position, size1);
     return FALSE;
   }
   void remove_unused_space(uchar **unused_start, uchar **unused_end)
@@ -231,58 +213,11 @@ public:
   }
   /* Return pointer to start of the memory area that is occupied by the data */
   uchar *used_area() { return start; }
-  friend class Forward_iterator;
+  friend class Lifo_buffer_iterator;
+  uchar *get_pos() { return pos; }
 };
 
 
-/**
-  Iterator for Forward_lifo_buffer
-*/
-
-class Forward_iterator : public Lifo_buffer::Iterator
-{
-  uchar *pos;
-
-  /** Return pointer to next chunk of nbytes bytes and avance over it */
-  uchar *get_next(size_t nbytes)
-  {
-    if (pos - nbytes < ((Forward_lifo_buffer*)buf)->start)
-      return NULL;
-    pos -= nbytes;
-    return pos;
-  }
-public:
-  bool read_next()
-  {
-    uchar *res;
-    if (buf->read_ptr2)
-    {
-      if ((res= get_next(buf->size2)))
-      {
-        *(buf->read_ptr2)= res;
-        *buf->read_ptr1= get_next(buf->size1);
-        return FALSE;
-      }
-    }
-    else
-    {
-      if ((res= get_next(buf->size1)))
-      {
-        *(buf->read_ptr1)= res;
-        return FALSE;
-      }
-    }
-    return TRUE; /* EOF */
-  }
-
-  void init(Lifo_buffer *buf_arg)
-  {
-    DBUG_ASSERT(buf_arg->type() == Lifo_buffer::FORWARD);
-    buf= buf_arg;
-    pos= ((Forward_lifo_buffer*)buf)->pos;
-  }
-};
-
 
 /**
   Backward LIFO buffer
@@ -332,18 +267,26 @@ public:
   }
   bool read()
   {
-    if (!have_data(size1 + (read_ptr2 ? size2 : 0)))
+    return read(&pos);
+  }
+  bool read(uchar **position)
+  {
+    if (!have_data(*position, size1 + (read_ptr2 ? size2 : 0)))
       return TRUE;
-    *read_ptr1= read_bytes(size1);
+    *read_ptr1= read_bytes(position, size1);
     if (read_ptr2)
-      *read_ptr2= read_bytes(size2);
+      *read_ptr2= read_bytes(position, size2);
     return FALSE;
   }
-  uchar *read_bytes(size_t bytes)
+  bool have_data(uchar *position, size_t bytes)
   {
-    DBUG_ASSERT(have_data(bytes));
-    uchar *ret= pos;
-    pos= pos + bytes;
+    return ((end - position) >= (ptrdiff_t)bytes);
+  }
+  uchar *read_bytes(uchar **position, size_t bytes)
+  {
+    DBUG_ASSERT(have_data(*position, bytes));
+    uchar *ret= *position;
+    *position= *position + bytes;
     return ret;
   }
   /**
@@ -363,50 +306,34 @@ public:
   }
   /* Return pointer to start of the memory area that is occupied by the data */
   uchar *used_area() { return pos; }
-  friend class Backward_iterator;
+  friend class Lifo_buffer_iterator;
+  uchar *get_pos() { return pos; }
 };
 
 
-/**
-  Iterator for Backward_lifo_buffer
-*/
 
-class Backward_iterator : public Lifo_buffer::Iterator
+/** Iterator to walk over contents of the buffer without reading it. */
+class Lifo_buffer_iterator
 {
   uchar *pos;
-  /* Return pointer to next chunk of nbytes bytes and advance over it */
-  uchar *get_next(size_t nbytes)
-  {
-    if (pos + nbytes > ((Backward_lifo_buffer*)buf)->end)
-      return NULL;
-    uchar *res= pos;
-    pos += nbytes;
-    return res;
-  }
+  Lifo_buffer *buf;
 public:
-  bool read_next()
-  {
-    /*
-      Always read the first component first (if the buffer is backwards, we
-      have written the second component first).
-    */
-    uchar *res;
-    if ((res= get_next(buf->size1)))
-    {
-      *(buf->read_ptr1)= res;
-      if (buf->read_ptr2)
-        *buf->read_ptr2= get_next(buf->size2);
-      return FALSE;
-    }
-    return TRUE; /* EOF */
-  }
   void init(Lifo_buffer *buf_arg)
   {
-    DBUG_ASSERT(buf_arg->type() == Lifo_buffer::BACKWARD);
     buf= buf_arg;
-    pos= ((Backward_lifo_buffer*)buf)->pos;
+    pos= buf->get_pos();
+  }
+  /*
+    Read the next value. The calling convention is the same as buf->read()
+    has.
+
+    @retval FALSE - ok
+    @retval TRUE  - EOF, reached the end of the buffer
+  */
+  bool read() 
+  {
+    return buf->read(&pos);
   }
 };
 
 
-

From d4f2e7a9a931912999f9d126fb1476f40a861635 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 4 Oct 2010 00:37:30 +0400
Subject: [PATCH 41/49] Fix failures introduced in the previous push

---
 mysql-test/t/subselect_sj2.test | 2 +-
 sql/multi_range_read.cc         | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/mysql-test/t/subselect_sj2.test b/mysql-test/t/subselect_sj2.test
index 5c40da0f56f..67452b5a69e 100644
--- a/mysql-test/t/subselect_sj2.test
+++ b/mysql-test/t/subselect_sj2.test
@@ -586,7 +586,7 @@ if (`select @@join_cache_level=6`)
   --echo # Not anymore:
   --echo # The following query gives wrong result due to Bug#49129
 }
-select sin(0);
+
 select * from t0 where t0.a in 
   (select t1.a from t1, t2 where t2.a=t0.a and t1.b=t2.b);
 
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 0cf31c222c6..b5ec4b075df 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -852,13 +852,13 @@ bool Key_value_records_iterator::init(DsMrr_impl *dsmrr_arg)
 
   uchar *key_in_buf= dsmrr->cur_index_tuple;
 
+  last_identical_key_ptr= dsmrr->cur_index_tuple;
   if (dsmrr->use_key_pointers)
     dsmrr->cur_index_tuple= *((uchar**)dsmrr->cur_index_tuple);
   
   /* Check out how many more identical keys are following */
   //char *save_cur_range_info= cur_range_info;
   uchar *save_cur_index_tuple= dsmrr->cur_index_tuple;
-  last_identical_key_ptr= dsmrr->cur_index_tuple;
   while (!identical_key_it.read())
   {
     if (DsMrr_impl::key_tuple_cmp(dsmrr, key_in_buf, dsmrr->cur_index_tuple))
@@ -901,6 +901,7 @@ int Key_value_records_iterator::get_next()
       return res;
     }
     identical_key_it.init(dsmrr->key_buffer);
+    get_next_row= FALSE;
   }
 
   identical_key_it.read(); // This gets us next range_id.
@@ -1052,7 +1053,7 @@ int DsMrr_impl::dsmrr_next(char **range_info)
     {
       if (do_sort_keys)
       {
-        if (index_scan_eof) 
+        if (!index_scan_eof) 
         {
           /* There are some sorted keys left. Use them to get rowids */
           if ((res= dsmrr_fill_rowid_buffer()))

From ac8a79b944546ffcbaf366f789873b28e05896f8 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 4 Oct 2010 10:31:40 +0400
Subject: [PATCH 42/49] Code cleanup

---
 sql/multi_range_read.cc | 14 ++------------
 sql/multi_range_read.h  |  2 --
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index b5ec4b075df..58b5cd3da50 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -373,7 +373,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   if (do_sort_keys)
   {
     know_key_tuple_params= FALSE;
-    //in_index_range= FALSE;
     h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
     h->mrr_funcs= *seq_funcs;
     keyno= (h->inited == handler::INDEX)? h->active_index : h2->active_index;
@@ -771,7 +770,6 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
       */
       rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
       key_buffer= &backward_key_buf;
-      //identical_key_it= &backward_key_it;
       key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
     }
     key_buffer->reset();
@@ -815,9 +813,6 @@ void DsMrr_impl::dsmrr_fill_key_buffer()
                             is_mrr_assoc? (uchar**)&cur_range_info: NULL,
                             sizeof(void*));
 
-  //last_identical_key_ptr= NULL;
-  //in_identical_keys_range= FALSE;
-  //index_scan_state= GET_NEXT_RANGE;
   scanning_key_val_iter= FALSE;
   index_scan_eof= FALSE; 
 
@@ -857,7 +852,6 @@ bool Key_value_records_iterator::init(DsMrr_impl *dsmrr_arg)
     dsmrr->cur_index_tuple= *((uchar**)dsmrr->cur_index_tuple);
   
   /* Check out how many more identical keys are following */
-  //char *save_cur_range_info= cur_range_info;
   uchar *save_cur_index_tuple= dsmrr->cur_index_tuple;
   while (!identical_key_it.read())
   {
@@ -867,7 +861,6 @@ bool Key_value_records_iterator::init(DsMrr_impl *dsmrr_arg)
   }
   identical_key_it.init(dsmrr->key_buffer);
   dsmrr->cur_index_tuple= save_cur_index_tuple;
-  //cur_range_info= save_cur_range_info;
   res= file->ha_index_read_map(dsmrr->table->record[0], 
                                dsmrr->cur_index_tuple, 
                                dsmrr->key_tuple_map, 
@@ -918,8 +911,6 @@ void Key_value_records_iterator::close()
          (dsmrr->cur_index_tuple != last_identical_key_ptr)) {}
 }
 
-//////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
 
 /**
   DS-MRR/CPK: multi_range_read_next() function
@@ -941,10 +932,10 @@ void Key_value_records_iterator::close()
   @retval HA_ERR_END_OF_FILE  End of records
   @retval Other               Some other error
 */
+
 int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 {
   DBUG_ENTER("DsMrr_impl::dsmrr_next_from_index");
-  //handler *file= do_rndpos_scan? h2: h;
 
   while (1)
   {
@@ -963,7 +954,6 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
     {
       while (kv_it.init(this))
       {
-        /* Failed to initialize iterator */
         if (key_buffer->is_empty())
         {
           if (dsmrr_eof)
@@ -986,7 +976,6 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
           }
         }
       }
-      /* if we got here, it means iterator was successfully initialized */
       scanning_key_val_iter= TRUE;
     }
 
@@ -1006,6 +995,7 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
   DBUG_RETURN(0);
 }
 
+
 /**
   DS-MRR implementation: multi_range_read_next() function.
 
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index e93276055e5..3c92dcd2950 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -66,8 +66,6 @@ class Key_value_records_iterator
   uchar *last_identical_key_ptr;
   bool get_next_row;
 public:
-  /*
-  */
   bool init(DsMrr_impl *dsmrr);
 
   /*

From d8efc3b155dc7791b00d5e87ccb8b34d1a12156b Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Tue, 26 Oct 2010 15:35:13 +0400
Subject: [PATCH 43/49] DS-MRR improvements: address review feedback for R3
 version of the patch

---
 sql/handler.h           |    4 +-
 sql/multi_range_read.cc | 1130 +++++++++++++++++++++------------------
 sql/multi_range_read.h  |  346 +++++++++---
 3 files changed, 882 insertions(+), 598 deletions(-)

diff --git a/sql/handler.h b/sql/handler.h
index 40f2d321241..6ff252632ee 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -2177,7 +2177,8 @@ public:
       TRUE    if the engine supports virtual columns
   */
   virtual bool check_if_supported_virtual_columns(void) { return FALSE;}
-
+  
+  TABLE* get_table() { return table; }
 protected:
   /* deprecated, don't use in new engines */
   inline void ha_statistic_increment(ulong SSV::*offset) const { }
@@ -2370,7 +2371,6 @@ private:
   virtual int rename_partitions(const char *path)
   { return HA_ERR_WRONG_COMMAND; }
   friend class ha_partition;
-  friend class DsMrr_impl;
 public:
   /* XXX to be removed, see ha_partition::partition_ht() */
   virtual handlerton *partition_ht() const
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 58b5cd3da50..4df2c4209d9 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -214,7 +214,6 @@ handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
   DBUG_RETURN(0);
 }
 
-
 /**
   Get next record in MRR scan
 
@@ -230,7 +229,7 @@ handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
 
 int handler::multi_range_read_next(char **range_info)
 {
-  int UNINIT_VAR(result);
+  int result= HA_ERR_END_OF_FILE;
   int range_res;
   DBUG_ENTER("handler::multi_range_read_next");
 
@@ -284,6 +283,416 @@ scan_it_again:
 }
 
 
+/***** MRR_impl classes ****************************************************/
+
+int Mrr_simple_index_reader::get_next(char **range_info)
+{
+  while (!(res= h->handler::multi_range_read_next(range_info)))
+  {
+    KEY_MULTI_RANGE *curr_range= &h->handler::mrr_cur_range;
+    if (!h->mrr_funcs.skip_index_tuple ||
+        !h->mrr_funcs.skip_index_tuple(h->mrr_iter, curr_range->ptr))
+      break;
+  }
+  return res;
+}
+
+int Mrr_simple_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                                  void *seq_init_param, uint n_ranges,
+                                  uint mode, Buffer_manager *buf_manager_arg)
+{
+  HANDLER_BUFFER no_buffer = {NULL, NULL, NULL};
+  h= h_arg;
+  return h->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
+                                           mode, &no_buffer);
+}
+
+/**
+  DS-MRR/CPK: multi_range_read_next() function
+  
+  @param range_info  OUT  identifier of range that the returned record belongs to
+  
+  @note
+    This function walks over key buffer and does index reads, i.e. it produces
+    {current_record, range_id} pairs.
+
+    The function has the same call contract like multi_range_read_next()'s.
+
+    We actually iterate over nested sequences:
+    - a disjoint sequence of index ranges
+      - each range has multiple records
+        - each record goes into multiple identical ranges.
+
+  @retval 0                   OK, next record was successfully read
+  @retval HA_ERR_END_OF_FILE  End of records
+  @retval Other               Some other error
+*/
+
+int Mrr_ordered_index_reader::get_next(char **range_info_arg)
+{
+  DBUG_ENTER("Mrr_ordered_index_reader::get_next");
+
+  while (1)
+  {
+    bool have_record= FALSE;
+    if (scanning_key_val_iter)
+    {
+      if (kv_it.get_next())
+      {
+        kv_it.close();
+        scanning_key_val_iter= FALSE;
+      }
+      else
+        have_record= TRUE;
+    }
+    else
+    {
+      while (kv_it.init(this))
+      {
+        if (key_buffer->is_empty())
+        {
+          if (auto_refill)
+          {
+            int res;
+            if ((res= refill_buffer()))
+              DBUG_RETURN(res); /* keep DBUG_ENTER/DBUG_RETURN balanced */
+            if (key_buffer->is_empty())
+            {
+              index_scan_eof= TRUE;
+              DBUG_RETURN(HA_ERR_END_OF_FILE);
+            }
+          }
+          else
+          {
+            /* Buffer refills are managed by somebody else for us */
+            index_scan_eof= TRUE;
+            DBUG_RETURN(HA_ERR_END_OF_FILE);
+          }
+        }
+      }
+      scanning_key_val_iter= TRUE;
+    }
+
+    if (have_record &&
+        (!mrr_funcs.skip_index_tuple ||
+         !mrr_funcs.skip_index_tuple(mrr_iter, *(char**)cur_range_info))
+        && 
+        (!mrr_funcs.skip_record ||
+         !mrr_funcs.skip_record(mrr_iter, *(char**)cur_range_info, NULL)))
+    {
+      break;
+    }
+    /* Go get another (record, range_id) combination */
+  } /* while */
+
+  memcpy(range_info_arg, cur_range_info, sizeof(void*));
+  DBUG_RETURN(0);
+}
+
+
+/**
+  DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
+  
+  Enumerate the input range (=key) sequence, fill the key buffer with 
+  (lookup_key, range_id) pairs and sort it.
+
+  When this function returns, either
+   - key buffer is non-empty, or
+   - key buffer is empty and source range sequence is exhausted
+  
+  @note
+    dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
+    we're scanning.
+*/
+
+int Mrr_ordered_index_reader::refill_buffer()
+{
+  int res= 0; /* stays 0 if the loop below never calls mrr_funcs.next() */
+  KEY_MULTI_RANGE cur_range;
+  uchar **range_info_ptr= (uchar**)&cur_range.ptr;
+  uchar *key_ptr;
+  DBUG_ENTER("Mrr_ordered_index_reader::refill_buffer");
+
+  DBUG_ASSERT(!know_key_tuple_params || key_buffer->is_empty());
+  if (know_key_tuple_params)
+  {
+    buf_manager->reset_buffer_sizes();
+    key_buffer->reset();
+    key_buffer->setup_writing(&key_ptr, keypar.key_size_in_keybuf,
+                              is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
+                              sizeof(uchar*));
+  }
+#if 0
+
+  if (know_key_tuple_params)
+  {
+    if (do_rndpos_scan && rowid_buffer.is_empty())
+    {
+      /*
+        We're using two buffers and both of them are empty now. Restore the
+        original sizes
+      */
+      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
+      key_buffer= &backward_key_buf;
+      key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
+    }
+  }
+  is all of the ifdef-ed stuff is handled above?
+#endif
+  while ((!know_key_tuple_params || key_buffer->can_write()) && 
+         !(res= mrr_funcs.next(mrr_iter, &cur_range)))
+  {
+    DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
+
+    if (!know_key_tuple_params)
+    {
+      /* This only happens when we've just started filling the buffer */
+      key_range *sample_key= &cur_range.start_key;
+      know_key_tuple_params= TRUE;
+      keypar.key_tuple_length= sample_key->length;
+      keypar.key_tuple_map= sample_key->keypart_map;
+      keypar.key_size_in_keybuf= keypar.use_key_pointers ? sizeof(char*) : keypar.key_tuple_length;
+      KEY *key_info= &h->get_table()->key_info[h->active_index];
+      keypar.index_ranges_unique= test(key_info->flags & HA_NOSAME && 
+                                       key_info->key_parts == 
+                                       my_count_bits(sample_key->keypart_map));
+      buf_manager->setup_buffer_sizes(keypar.key_size_in_keybuf, keypar.key_tuple_map);
+      key_buffer= buf_manager->get_key_buffer();
+      key_buffer->setup_writing(&key_ptr, keypar.key_size_in_keybuf,
+                               is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
+                               sizeof(uchar*));
+      DBUG_ASSERT(key_buffer->can_write());
+    }
+    
+    /* Put key, or {key, range_id} pair into the buffer */
+    if (keypar.use_key_pointers)
+      key_ptr=(uchar*) &cur_range.start_key.key;
+    else
+      key_ptr=(uchar*) cur_range.start_key.key;
+
+    key_buffer->write();
+  }
+
+  no_more_keys= test(res);
+
+  key_buffer->sort((key_buffer->type() == Lifo_buffer::FORWARD)? 
+                     (qsort2_cmp)Mrr_ordered_index_reader::key_tuple_cmp_reverse : 
+                     (qsort2_cmp)Mrr_ordered_index_reader::key_tuple_cmp, 
+                   (void*)this);
+  
+  scanning_key_val_iter= FALSE;
+  index_scan_eof= FALSE; 
+
+  DBUG_RETURN(0);
+}
+
+
+int Mrr_ordered_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
+                                   void *seq_init_param, uint n_ranges,
+                                   uint mode, Buffer_manager *buf_manager_arg)
+{
+  h= h_arg;
+  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+  keypar.use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS);
+  is_mrr_assoc=    !test(mode & HA_MRR_NO_ASSOCIATION);
+  mrr_funcs= *seq_funcs;
+  know_key_tuple_params= FALSE;
+  buf_manager= buf_manager_arg;
+  return 0;
+}
+
+
+static int rowid_cmp_reverse(void *h, uchar *a, uchar *b)
+{
+  return - ((handler*)h)->cmp_ref(a, b);
+}
+
+
+int Mrr_ordered_rndpos_reader::init(handler *h_arg, 
+                                    Mrr_index_reader *index_reader_arg,
+                                    uint mode,
+                                    Lifo_buffer *buf)
+{
+  h= h_arg;
+  index_reader= index_reader_arg;
+  rowid_buffer= buf;
+  is_mrr_assoc=    !test(mode & HA_MRR_NO_ASSOCIATION);
+  //rowid_buff_elem_size= h->ref_length;
+  //if (!(mode & HA_MRR_NO_ASSOCIATION))
+  //  rowid_buff_elem_size += sizeof(char*);
+
+  return index_reader->refill_buffer();
+}
+
+
+/**
+  DS-MRR: Fill and sort the rowid buffer
+
+  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
+  buffer. When the buffer is full or scan is completed, sort the buffer by 
+  rowid and return.
+
+  When this function returns, either rowid buffer is not empty, or the source
+  of lookup keys (i.e. ranges) is exhaused.
+  
+  dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
+  scanning. This function never returns HA_ERR_END_OF_FILE.
+
+  @retval 0      OK, the next portion of rowids is in the buffer,
+                 properly ordered
+  @retval other  Error
+*/
+
+int Mrr_ordered_rndpos_reader::refill_buffer()
+{
+  char *range_info;
+  uchar **range_info_ptr= (uchar**)&range_info;
+  int res= 0; /* stays 0 if the fill loop below is never entered */
+  DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_buffer");
+  
+  DBUG_ASSERT(rowid_buffer->is_empty());
+  index_rowid= index_reader->get_rowid_ptr();
+  rowid_buffer->reset();
+  rowid_buffer->setup_writing(&index_rowid, h->ref_length,
+                              is_mrr_assoc? (uchar**)&range_info_ptr: NULL,
+                              sizeof(void*));
+
+  last_identical_rowid= NULL;
+
+  while (rowid_buffer->can_write())
+  {
+    res= index_reader->get_next(&range_info);
+
+    if (res)
+      break;
+
+    /* Put rowid, or {rowid, range_id} pair into the buffer */
+    index_reader->h->position(index_reader->h->get_table()->record[0]);
+
+    rowid_buffer->write();
+  }
+
+  if (res && res != HA_ERR_END_OF_FILE)
+    DBUG_RETURN(res); 
+
+   
+  /* Sort the buffer contents by rowid */
+  rowid_buffer->sort((qsort2_cmp)rowid_cmp_reverse, (void*)h);
+
+  rowid_buffer->setup_reading(&rowid, h->ref_length,
+                              is_mrr_assoc? (uchar**)&rowids_range_id: NULL, 
+                              sizeof(void*));
+  DBUG_RETURN(0);
+}
+
+
+/**
+  DS-MRR implementation: multi_range_read_next() function.
+
+  Calling convention is like multi_range_read_next() has.
+*/
+
+int Mrr_ordered_rndpos_reader::get_next(char **range_info)
+{
+  int res;
+
+  while (last_identical_rowid)
+  {
+    /*
+      Current record (the one we've returned in previous call) was obtained
+      from a rowid that matched multiple range_ids. Return this record again,
+      with next matching range_id.
+    */
+    bool UNINIT_VAR(bres);
+    bres= rowid_buffer->read();
+    DBUG_ASSERT(!bres);
+
+    if (is_mrr_assoc)
+      memcpy(range_info, rowids_range_id, sizeof(uchar*));
+
+    if (rowid == last_identical_rowid)
+    {
+      last_identical_rowid= NULL; /* reached the last of identical rowids */
+    }
+
+    if (!index_reader->skip_record((char*)*range_info, rowid))
+    {
+      return 0;
+    }
+  }
+
+  while (1)
+  {
+    if (rowid_buffer->is_empty())
+    {
+      /*
+        We're out of rowids. If there are still some sorted keys, use up them
+        first (that is, don't call re-fill for keys when we still have some).
+      */
+      if (!index_reader->eof())
+      {
+        if ((res= refill_buffer()))
+          return res; /* for fatal errors */
+      }
+      else
+      {
+        //TODO: here: redistribute the buffer space, then refill the index
+        //reader, then refill us.
+      }
+    }
+   
+    last_identical_rowid= NULL;
+
+    /* Return eof if there are no rowids in the buffer after re-fill attempt */
+    if (rowid_buffer->read())
+      return HA_ERR_END_OF_FILE;
+
+    if (is_mrr_assoc)
+    {
+      memcpy(range_info, rowids_range_id, sizeof(uchar*));
+    }
+
+    if (index_reader->skip_record(*range_info, rowid))
+      continue;
+
+    res= h->ha_rnd_pos(h->get_table()->record[0], rowid);
+
+    if (res == HA_ERR_RECORD_DELETED)
+      continue;
+    
+    /* 
+      Check if subsequent buffer elements have the same rowid value as this
+      one. If yes, remember this fact so that we don't make any more rnd_pos()
+      calls with this value.
+    */
+    if (!res)
+    {
+      uchar *cur_rowid= rowid;
+      /* 
+        Note: this implies that SQL layer doesn't touch table->record[0]
+        between calls.
+      */
+      Lifo_buffer_iterator it;
+      it.init(rowid_buffer);
+      while (!it.read()) // reads to (rowid, ...)
+      {
+        if (h->cmp_ref(rowid, cur_rowid))
+          break;
+        last_identical_rowid= rowid;
+      }
+    }
+    return res; /* 0, or the error reported by ha_rnd_pos() */
+  }
+
+  return res;
+}
+
+
+
+
+/************ MRR_impl classes end *********************************************/
+
+
 /****************************************************************************
  * DS-MRR implementation 
  ***************************************************************************/
@@ -310,9 +719,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                            void *seq_init_param, uint n_ranges, uint mode,
                            HANDLER_BUFFER *buf)
 {
-  Item *pushed_cond= NULL;
-  handler *new_h2= 0;
   THD *thd= current_thd;
+  int res;
   DBUG_ENTER("DsMrr_impl::dsmrr_init");
 
   /*
@@ -320,88 +728,130 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     has not been called, so set the owner handler here as well.
   */
   h= h_arg;
-  if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
+  is_mrr_assoc=    !test(mode & HA_MRR_NO_ASSOCIATION);
+
+  if ((mode & HA_MRR_USE_DEFAULT_IMPL) || (mode & HA_MRR_SORTED))
   {
-    use_default_impl= TRUE;
-    const int retval=
-      h->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
-                                        mode, buf);
-    DBUG_RETURN(retval);
+    DBUG_ASSERT(h->inited == handler::INDEX);
+    Mrr_simple_index_reader *s= &strategy_factory.simple_index_reader;
+    res= s->init(h, seq_funcs, seq_init_param, n_ranges, mode, this);
+    strategy= s;
+    DBUG_RETURN(res);
   }
-  use_default_impl= FALSE;
-  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
   
+  /* Neither of strategies used below can handle sorting */
+  DBUG_ASSERT(!(mode & HA_MRR_SORTED));
+
   /*
     Determine whether we'll need to do key sorting and/or rnd_pos() scan
   */
-  do_sort_keys= FALSE;
-  if ((mode & HA_MRR_SINGLE_POINT) && 
-       optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
+  index_strategy= NULL;
+  Mrr_ordered_index_reader *ordered_idx_reader= NULL;
+  if ((mode & HA_MRR_SINGLE_POINT) &&
+      optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
   {
-    do_sort_keys= TRUE;
-    use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS);
+    index_strategy= ordered_idx_reader= &strategy_factory.ordered_index_reader;
   }
+  else
+    index_strategy= &strategy_factory.simple_index_reader;
 
-  do_rndpos_scan= FALSE;
-  bool doing_cpk_scan= check_cpk_scan(thd, h->inited == handler::INDEX? 
-                                      h->active_index: h2->active_index, mode);
-  if (!doing_cpk_scan /* && !index_only_read */)
-  {
-    /* Will use rowid buffer to store/sort rowids, etc */
-    do_rndpos_scan= TRUE;
-  }
-
-  /* 
-    We should either sort keys, or do ordered rnd_pos scan, or both. If we
-    decide to do neither, we should have used default MRR implementation.
+  strategy= index_strategy;
+  /*
+    We don't need a rowid-to-rndpos step if
+     - We're doing a scan on clustered primary key
+     - [In the future] We're doing an index_only read
   */
-  DBUG_ASSERT(do_sort_keys || do_rndpos_scan);
+  DBUG_ASSERT(h->inited == handler::INDEX || 
+              (h->inited == handler::RND && h2 && 
+               h2->inited == handler::INDEX));
+
+  handler *h_idx= (h->inited == handler::INDEX)? h: h2;
+  keyno= h_idx->active_index;
+
+  Mrr_ordered_rndpos_reader *disk_strategy= NULL;
+  if (!(keyno == table->s->primary_key && h_idx->primary_key_is_clustered()))
+  {
+    strategy= disk_strategy= &strategy_factory.ordered_rndpos_reader;
+  }
 
-  
   if (is_mrr_assoc)
-    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
+    status_var_increment(thd->status_var.ha_multi_range_read_init_count);
 
-  /* 
-    At start, alloc all of the buffer for rowids. When/if key sorting code
-    figures how much buffer space it needs, it will call setup_buffer_sizes()
-    to re-distribute the buffer space.
-  */
   full_buf= buf->buffer;
   full_buf_end= buf->buffer_end;
-  rowid_buffer.set_buffer_space(full_buf, full_buf_end);
   
-  if (do_sort_keys)
+  if (strategy == index_strategy)
   {
-    know_key_tuple_params= FALSE;
-    h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
-    h->mrr_funcs= *seq_funcs;
-    keyno= (h->inited == handler::INDEX)? h->active_index : h2->active_index;
-    dsmrr_fill_key_buffer();
-    
-    if (dsmrr_eof && !do_rndpos_scan)
-      buf->end_of_used_area= key_buffer->end_of_space();
+    if (ordered_idx_reader)
+      ordered_idx_reader->auto_refill= TRUE;
+    /* Index strategy serves it all. We don't need two handlers, etc */
+    /* Give the buffer to index strategy */
+    if ((res= index_strategy->init(h, seq_funcs, seq_init_param, n_ranges,
+                                   mode, this)))
+      goto error;
   }
-
-  if (!do_rndpos_scan)
+  else
   {
-    /* 
-      We have the keys and won't need to fetch rowids, as key lookup will be
-      the last operation, done in multi_range_read_next().
+    /*
+      If we got here the request is served by both index and rndpos strategies
+      working together.
+
     */
-    DBUG_RETURN(0);
+    rowid_buffer.set_buffer_space(buf->buffer, buf->buffer_end);
+
+    if ((res= setup_two_handlers()))
+      DBUG_RETURN(res);
+
+    if (ordered_idx_reader)
+      ordered_idx_reader->auto_refill= FALSE;
+
+    if ((res= index_strategy->init(h2, seq_funcs, seq_init_param, n_ranges, 
+                                   mode, this)) || 
+        (res= disk_strategy->init(h, index_strategy, mode, &rowid_buffer)))
+    {
+      goto error;
+    }
   }
 
-  rowid_buff_elem_size= h->ref_length + (is_mrr_assoc? sizeof(char*) : 0);
+  if (strategy->refill_buffer())
+    goto error;
+
   /*
-    There can be two cases:
-    - This is the first call since index_init(), h2==NULL
-       Need to setup h2 then.
-    - This is not the first call, h2 is initalized and set up appropriately.
-       The caller might have called h->index_init(), need to switch h to
-       rnd_pos calls.
+    If we have scanned through all intervals in *seq, then adjust *buf to 
+    indicate that the remaining buffer space will not be used.
   */
+//  if (dsmrr_eof) 
+//    buf->end_of_used_area= rowid_buffer.end_of_space();
+
+  
+  DBUG_RETURN(0);
+error:
+  close_second_handler();
+  strategy= NULL;
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Whatever the current state is, make it so that we have two handler objects:
+  - h (the primary)    -  initialized for rnd_pos() scan
+  - h2 (the secondary) -  initialized for scanning the index specified in
+                          this->keyno
+  RETURN 
+    0        OK
+    HA_XXX   Error code
+*/
+
+int DsMrr_impl::setup_two_handlers()
+{
+  int res;
+  THD *thd= current_thd;
+  DBUG_ENTER("DsMrr_impl::setup_two_handlers");
   if (!h2)
   {
+    handler *new_h2;
+    Item *pushed_cond= NULL;
+    DBUG_ASSERT(h->inited == handler::INDEX);
     /* Create a separate handler object to do rnd_pos() calls. */
     /*
       ::clone() takes up a lot of stack, especially on 64 bit platforms.
@@ -409,8 +859,6 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     */
     if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
       DBUG_RETURN(1);
-    DBUG_ASSERT(h->active_index != MAX_KEY);
-    keyno= h->active_index;
 
     /* Create a separate handler object to do rnd_pos() calls. */
     if (!(new_h2= h->clone(thd->mem_root)) || 
@@ -422,25 +870,27 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
 
     if (keyno == h->pushed_idx_cond_keyno)
       pushed_cond= h->pushed_idx_cond;
-
+    
+    Mrr_strategy *save_strategy= strategy;
+    strategy= NULL;
     /*
       Caution: this call will invoke this->dsmrr_close(). Do not put the
-      created secondary table handler into this->h2 or it will delete it.
+      created secondary table handler new_h2 into this->h2 or it will delete 
+      it. Also, save the picked strategy
     */
-    if (h->ha_index_end())
-    {
-      h2=new_h2;
-      goto error;
-    }
+    res= h->ha_index_end();
 
-    use_default_impl= FALSE;
+    strategy= save_strategy;
     h2= new_h2; /* Ok, now can put it into h2 */
+
+    if (res || (res= (h->ha_rnd_init(FALSE))))
+      goto error;
+
     table->prepare_for_position();
     h2->extra(HA_EXTRA_KEYREAD);
-    h2->mrr_funcs= *seq_funcs; //psergey3-todo: sort out where to store
     h2->mrr_iter= h->mrr_iter;
 
-    if (h2->ha_index_init(keyno, FALSE))
+    if ((res= h2->ha_index_init(keyno, FALSE)))
       goto error;
 
     if (pushed_cond)
@@ -448,66 +898,39 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   }
   else
   {
+    DBUG_ASSERT(h2 && h2->inited==handler::INDEX);
     /* 
       We get here when the access alternates betwen MRR scan(s) and non-MRR
       scans.
 
       Calling h->index_end() will invoke dsmrr_close() for this object,
-      which will delete h2. We need to keep it, so save put it away and dont
+      which will delete h2. We need to keep it, so put it away and don't
       let it be deleted:
     */
-    handler *save_h2= h2;
-    h2= NULL;
-    int res= (h->inited == handler::INDEX && h->ha_index_end());
-    h2= save_h2;
-    use_default_impl= FALSE;
-    if (res)
+    if (h->inited == handler::INDEX)
+    {
+      handler *save_h2= h2;
+      Mrr_strategy *save_strategy= strategy;
+      h2= NULL;
+      strategy= NULL;
+      res= h->ha_index_end();
+      h2= save_h2;
+      strategy= save_strategy;
+      if (res)
+        goto error;
+    }
+    if ((h->inited == handler::RND) && h->ha_rnd_init(FALSE))
       goto error;
   }
-  
-  if (!do_sort_keys && 
-      h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, 
-                                         mode, buf))
-  {
-    goto error;
-  }
-
-  if (dsmrr_fill_rowid_buffer())
-  {
-    goto error;
-  }
-  /*
-    If the above call has scanned through all intervals in *seq, then
-    adjust *buf to indicate that the remaining buffer space will not be used.
-  */
-//  if (dsmrr_eof) 
-//    buf->end_of_used_area= rowid_buffer.end_of_space();
-
-  /*
-     h->inited == INDEX may occur when 'range checked for each record' is
-     used.
-  */
-  if ((h->inited != handler::RND) && 
-      ((h->inited==handler::INDEX? h->ha_index_end(): FALSE) || 
-       (h->ha_rnd_init(FALSE))))
-      goto error;
-
-  h->mrr_funcs= *seq_funcs;
-  
   DBUG_RETURN(0);
 error:
-  h2->ha_index_or_rnd_end();
-  h2->ha_external_lock(current_thd, F_UNLCK);
-  h2->close();
-  delete h2;
-  h2= NULL;
-  DBUG_RETURN(1);
+  //close_second_handler(); -- caller does that
+  DBUG_RETURN(res);
 }
 
 
-void DsMrr_impl::dsmrr_close()
+void DsMrr_impl::close_second_handler()
 {
-  DBUG_ENTER("DsMrr_impl::dsmrr_close");
   if (h2)
   {
     h2->ha_index_or_rnd_end();
@@ -516,106 +939,37 @@ void DsMrr_impl::dsmrr_close()
     delete h2;
     h2= NULL;
   }
-  use_default_impl= TRUE;
+}
+
+
+void DsMrr_impl::dsmrr_close()
+{
+  DBUG_ENTER("DsMrr_impl::dsmrr_close");
+  close_second_handler();
+  strategy= NULL;
   DBUG_VOID_RETURN;
 }
 
 
-static int rowid_cmp_reverse(void *h, uchar *a, uchar *b)
-{
-  return - ((handler*)h)->cmp_ref(a, b);
-}
-
-
-/**
-  DS-MRR: Fill and sort the rowid buffer
-
-  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
-  buffer. When the buffer is full or scan is completed, sort the buffer by 
-  rowid and return.
-
-  When this function returns, either rowid buffer is not empty, or the source
-  of lookup keys (i.e. ranges) is exhaused.
-  
-  dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're
-  scanning. This function never returns HA_ERR_END_OF_FILE.
-
-  @retval 0      OK, the next portion of rowids is in the buffer,
-                 properly ordered
-  @retval other  Error
-*/
-
-int DsMrr_impl::dsmrr_fill_rowid_buffer()
-{
-  char *range_info;
-  uchar **range_info_ptr= (uchar**)&range_info;
-  int res;
-  DBUG_ENTER("DsMrr_impl::dsmrr_fill_rowid_buffer");
-  
-  DBUG_ASSERT(rowid_buffer.is_empty());
-  rowid_buffer.reset();
-  rowid_buffer.setup_writing(&h2->ref, h2->ref_length,
-                             is_mrr_assoc? (uchar**)&range_info_ptr: NULL,
-                             sizeof(void*));
-
-  last_identical_rowid= NULL;
-
-  while (rowid_buffer.can_write())
-  {
-    if (do_sort_keys)
-      res= dsmrr_next_from_index(&range_info);
-    else 
-      res= h2->handler::multi_range_read_next(&range_info);
-
-    if (res)
-      break;
-
-    KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
-    if (!do_sort_keys && /* If keys are sorted then this check is already done */
-        h2->mrr_funcs.skip_index_tuple &&
-        h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr))
-      continue;
-
-    /* Put rowid, or {rowid, range_id} pair into the buffer */
-    h2->position(table->record[0]);
-
-    rowid_buffer.write();
-  }
-
-  if (res && res != HA_ERR_END_OF_FILE)
-    DBUG_RETURN(res); 
-
-  if (!do_sort_keys)
-    dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
-
-  /* Sort the buffer contents by rowid */
-  rowid_buffer.sort((qsort2_cmp)rowid_cmp_reverse, (void*)h);
-
-  rowid_buffer.setup_reading(&rowid, h->ref_length,
-                             is_mrr_assoc? (uchar**)&rowids_range_id: NULL, sizeof(void*));
-  DBUG_RETURN(0);
-}
-
-
 /* 
   my_qsort2-compatible function to compare key tuples 
 */
 
-int DsMrr_impl::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
+int Mrr_ordered_index_reader::key_tuple_cmp(void* arg, uchar* key1, uchar* key2)
 {
-  DsMrr_impl *dsmrr= (DsMrr_impl*)arg;
-  TABLE *table= dsmrr->h->table;
+  Mrr_ordered_index_reader *this_= (Mrr_ordered_index_reader*)arg;
+  TABLE *table= this_->h->get_table();
   int res;
-  KEY_PART_INFO *part= table->key_info[dsmrr->keyno].key_part;
+  KEY_PART_INFO *part= table->key_info[this_->h->active_index].key_part;
   
-  if (dsmrr->use_key_pointers)
+  if (this_->keypar.use_key_pointers)
   {
     /* the buffer stores pointers to keys, get to the keys */
     key1= *((uchar**)key1);
     key2= *((uchar**)key2);  // todo is this alignment-safe?
   }
 
-  uchar *key1_end= key1 + dsmrr->key_tuple_length;
+  uchar *key1_end= key1 + this_->keypar.key_tuple_length;
 
   while (key1 < key1_end)
   {
@@ -648,7 +1002,7 @@ equals:
 }
 
 
-int DsMrr_impl::key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2)
+int Mrr_ordered_index_reader::key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2)
 {
   return -key_tuple_cmp(arg, key1, key2);
 }
@@ -664,24 +1018,17 @@ int DsMrr_impl::key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2)
     This function must be called when all buffers are empty
 */
 
-void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
+void DsMrr_impl::setup_buffer_sizes(uint key_size_in_keybuf, 
+                                    key_part_map key_tuple_map)
 {
-  key_tuple_length= sample_key->length;
-  key_tuple_map= sample_key->keypart_map;
-  key_size_in_keybuf= use_key_pointers ? sizeof(char*) : 
-                                       key_tuple_length;
-  key_buff_elem_size= key_size_in_keybuf + 
-                      (int)is_mrr_assoc * sizeof(void*);
+  uint key_buff_elem_size= key_size_in_keybuf + 
+                           (int)is_mrr_assoc * sizeof(void*);
   
-  KEY *key_info= &h->table->key_info[keyno];
-  index_ranges_unique= test(key_info->flags & HA_NOSAME && 
-                            key_info->key_parts == 
-                              my_count_bits(sample_key->keypart_map));
-  if (!do_rndpos_scan)
+  KEY *key_info= &h->get_table()->key_info[keyno];
+  if (strategy == index_strategy)
   {
-    /* Give all space to forward key buffer. */
+    /* Give all space to the key buffer, key buffer must be forward */
     key_buffer= &forward_key_buf;
-    //identical_key_it= &forward_key_it;
     key_buffer->set_buffer_space(full_buf, full_buf_end);
 
     /* Just in case, tell rowid buffer that it has zero size: */
@@ -730,100 +1077,21 @@ void DsMrr_impl::setup_buffer_sizes(key_range *sample_key)
   rowid_buffer_end= full_buf + bytes_for_rowids;
   rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
   key_buffer= &backward_key_buf;
-  //identical_key_it= &backward_key_it;
   key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end); 
 }
 
 
-/**
-  DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort
-  
-  Enumerate the input range (=key) sequence, fill the key buffer with 
-  (lookup_key, range_id) pairs and sort it.
-
-  When this function returns, either
-   - key buffer is non-empty, or
-   - key buffer is empty and source range sequence is exhausted
-  
-  @note
-    dsmrr_eof is set to indicate whether we've exhausted the list of ranges 
-    we're scanning.
-*/
-
-void DsMrr_impl::dsmrr_fill_key_buffer()
+void DsMrr_impl::reset_buffer_sizes()
 {
-  int res;
-  KEY_MULTI_RANGE cur_range;
-  uchar **range_info_ptr= (uchar**)&cur_range.ptr;
-  DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer");
-
-  DBUG_ASSERT(!know_key_tuple_params || key_buffer->is_empty());
-
-  uchar *key_ptr;
-  if (know_key_tuple_params)
-  {
-    if (do_rndpos_scan && rowid_buffer.is_empty())
-    {
-      /*
-        We're using two buffers and both of them are empty now. Restore the
-        original sizes
-      */
-      rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
-      key_buffer= &backward_key_buf;
-      key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
-    }
-    key_buffer->reset();
-    key_buffer->setup_writing(&key_ptr, key_size_in_keybuf,
-                              is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
-                              sizeof(uchar*));
-  }
-
-  while ((!know_key_tuple_params || key_buffer->can_write()) && 
-         !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range)))
-  {
-    DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
-    if (!know_key_tuple_params)
-    {
-      /* This only happens when we've just started filling the buffer */
-      setup_buffer_sizes(&cur_range.start_key);
-      know_key_tuple_params= TRUE;
-      key_buffer->setup_writing(&key_ptr, key_size_in_keybuf,
-                               is_mrr_assoc? (uchar**)&range_info_ptr : NULL,
-                               sizeof(uchar*));
-      DBUG_ASSERT(key_buffer->can_write());
-    }
-    
-    /* Put key, or {key, range_id} pair into the buffer */
-    if (use_key_pointers)
-      key_ptr=(uchar*) &cur_range.start_key.key;
-    else
-      key_ptr=(uchar*) cur_range.start_key.key;
-
-    key_buffer->write();
-  }
-
-  dsmrr_eof= test(res);
-
-  key_buffer->sort((key_buffer->type() == Lifo_buffer::FORWARD)? 
-                     (qsort2_cmp)DsMrr_impl::key_tuple_cmp_reverse : 
-                     (qsort2_cmp)DsMrr_impl::key_tuple_cmp, 
-                   (void*)this);
-  
-  key_buffer->setup_reading(&cur_index_tuple, key_size_in_keybuf,
-                            is_mrr_assoc? (uchar**)&cur_range_info: NULL,
-                            sizeof(void*));
-
-  scanning_key_val_iter= FALSE;
-  index_scan_eof= FALSE; 
-
-  DBUG_VOID_RETURN;
+  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
+  key_buffer= &backward_key_buf;
+  key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
 }
 
-
 /**
   Take unused space from the key buffer and give it to the rowid buffer
 */
-
+//psergey-todo: do invoke this function.
 void DsMrr_impl::reallocate_buffer_space()
 {
   uchar *unused_start, *unused_end;
@@ -834,37 +1102,43 @@ void DsMrr_impl::reallocate_buffer_space()
 
 //////////////////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
-bool Key_value_records_iterator::init(DsMrr_impl *dsmrr_arg)
+
+bool Key_value_records_iterator::init(Mrr_ordered_index_reader *owner_arg)
 {
   int res;
-  dsmrr= dsmrr_arg;
-  handler *file= dsmrr->do_rndpos_scan? dsmrr->h2 : dsmrr->h;
+  //h= h_arg;
+  //param= param_arg;
+  owner= owner_arg;
 
-  identical_key_it.init(dsmrr->key_buffer);
+  identical_key_it.init(owner->key_buffer);
   /* Get the first pair into (cur_index_tuple, cur_range_info) */ 
+  owner->key_buffer->setup_reading(&cur_index_tuple, owner->keypar.key_size_in_keybuf,
+                            owner->is_mrr_assoc? (uchar**)&owner->cur_range_info: NULL,
+                            sizeof(void*));
+
   if (identical_key_it.read())
     return TRUE;
 
-  uchar *key_in_buf= dsmrr->cur_index_tuple;
+  uchar *key_in_buf= cur_index_tuple;
 
-  last_identical_key_ptr= dsmrr->cur_index_tuple;
-  if (dsmrr->use_key_pointers)
-    dsmrr->cur_index_tuple= *((uchar**)dsmrr->cur_index_tuple);
+  last_identical_key_ptr= cur_index_tuple;
+  if (owner->keypar.use_key_pointers)
+    cur_index_tuple= *((uchar**)cur_index_tuple);
   
   /* Check out how many more identical keys are following */
-  uchar *save_cur_index_tuple= dsmrr->cur_index_tuple;
+  uchar *save_cur_index_tuple= cur_index_tuple;
   while (!identical_key_it.read())
   {
-    if (DsMrr_impl::key_tuple_cmp(dsmrr, key_in_buf, dsmrr->cur_index_tuple))
+    if (Mrr_ordered_index_reader::key_tuple_cmp(owner, key_in_buf, cur_index_tuple))
       break;
-    last_identical_key_ptr= dsmrr->cur_index_tuple;
+    last_identical_key_ptr= cur_index_tuple;
   }
-  identical_key_it.init(dsmrr->key_buffer);
-  dsmrr->cur_index_tuple= save_cur_index_tuple;
-  res= file->ha_index_read_map(dsmrr->table->record[0], 
-                               dsmrr->cur_index_tuple, 
-                               dsmrr->key_tuple_map, 
-                               HA_READ_KEY_EXACT);
+  identical_key_it.init(owner->key_buffer);
+  cur_index_tuple= save_cur_index_tuple;
+  res= owner->h->ha_index_read_map(owner->h->get_table()->record[0], 
+                            cur_index_tuple, 
+                            owner->keypar.key_tuple_map, 
+                            HA_READ_KEY_EXACT);
 
   if (res)
   {
@@ -878,27 +1152,27 @@ bool Key_value_records_iterator::init(DsMrr_impl *dsmrr_arg)
 
 int Key_value_records_iterator::get_next()
 {
-  handler *file= dsmrr->do_rndpos_scan? dsmrr->h2 : dsmrr->h;
   int res;
 
   if (get_next_row)
   {
-    if (dsmrr->index_ranges_unique)
+    if (owner->keypar.index_ranges_unique)
       return HA_ERR_END_OF_FILE;  /* Max one match */
-
-    if ((res= file->ha_index_next_same(dsmrr->table->record[0], 
-                                       dsmrr->cur_index_tuple, 
-                                       dsmrr->key_tuple_length)))
+    
+    handler *h= owner->h;
+    if ((res= h->ha_index_next_same(h->get_table()->record[0], 
+                                    cur_index_tuple, 
+                                    owner->keypar.key_tuple_length)))
     {
       /* EOF is EOF for iterator, also, any error means EOF on the iterator */
       return res;
     }
-    identical_key_it.init(dsmrr->key_buffer);
+    identical_key_it.init(owner->key_buffer);
     get_next_row= FALSE;
   }
 
   identical_key_it.read(); // This gets us next range_id.
-  if (!last_identical_key_ptr || (dsmrr->cur_index_tuple == last_identical_key_ptr))
+  if (!last_identical_key_ptr || (cur_index_tuple == last_identical_key_ptr))
   {
     get_next_row= TRUE;
   }
@@ -907,92 +1181,8 @@ int Key_value_records_iterator::get_next()
 
 void Key_value_records_iterator::close()
 {
-  while (!dsmrr->key_buffer->read() && 
-         (dsmrr->cur_index_tuple != last_identical_key_ptr)) {}
-}
-
-
-/**
-  DS-MRR/CPK: multi_range_read_next() function
-  
-  @param range_info  OUT  identifier of range that the returned record belongs to
-  
-  @note
-    This function walks over key buffer and does index reads, i.e. it produces
-    {current_record, range_id} pairs.
-
-    The function has the same call contract like multi_range_read_next()'s.
-
-    We actually iterate over nested sequences:
-    - a disjoint sequence of index ranges
-      - each range has multiple records
-        - each record goes into multiple identical ranges.
-
-  @retval 0                   OK, next record was successfully read
-  @retval HA_ERR_END_OF_FILE  End of records
-  @retval Other               Some other error
-*/
-
-int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
-{
-  DBUG_ENTER("DsMrr_impl::dsmrr_next_from_index");
-
-  while (1)
-  {
-    bool have_record= FALSE;
-    if (scanning_key_val_iter)
-    {
-      if (kv_it.get_next())
-      {
-        kv_it.close();
-        scanning_key_val_iter= FALSE;
-      }
-      else
-        have_record= TRUE;
-    }
-    else
-    {
-      while (kv_it.init(this))
-      {
-        if (key_buffer->is_empty())
-        {
-          if (dsmrr_eof)
-          {
-            index_scan_eof= TRUE;
-            DBUG_RETURN(HA_ERR_END_OF_FILE);
-          }
-
-          /*
-            When rowid fetching is used, it controls all buffer refills. When we're
-            on our own, try refilling our buffer.
-          */
-          if (!do_rndpos_scan)
-            dsmrr_fill_key_buffer();
-
-          if (key_buffer->is_empty())
-          {
-            index_scan_eof= TRUE;
-            DBUG_RETURN(HA_ERR_END_OF_FILE);
-          }
-        }
-      }
-      scanning_key_val_iter= TRUE;
-    }
-
-    if (have_record &&
-        (!h->mrr_funcs.skip_index_tuple ||
-         !h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) 
-        && 
-        (!h->mrr_funcs.skip_record ||
-         !h->mrr_funcs.skip_record(h->mrr_iter, *(char**)cur_range_info, NULL)))
-    {
-      break;
-    }
-    /* Go get another (record, range_id) combination */
-  } /* while */
-
-  memcpy(range_info_arg, cur_range_info, sizeof(void*));
-  DBUG_RETURN(0);
+  while (!owner->key_buffer->read() && 
+         (cur_index_tuple != last_identical_key_ptr)) {}
 }
 
 
@@ -1004,119 +1194,7 @@ int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg)
 
 int DsMrr_impl::dsmrr_next(char **range_info)
 {
-  int res;
-
-  if (use_default_impl)
-    return h->handler::multi_range_read_next(range_info);
-
-  if (!do_rndpos_scan)
-    return dsmrr_next_from_index(range_info);
-  
-  while (last_identical_rowid)
-  {
-    /*
-      Current record (the one we've returned in previous call) was obtained
-      from a rowid that matched multiple range_ids. Return this record again,
-      with next matching range_id.
-    */
-    bool bres= rowid_buffer.read();
-    DBUG_ASSERT(!bres);
-
-    if (is_mrr_assoc)
-      memcpy(range_info, rowids_range_id, sizeof(uchar*));
-
-    if (rowid == last_identical_rowid)
-    {
-      last_identical_rowid= NULL; /* reached the last of identical rowids */
-    }
-
-    if (!h2->mrr_funcs.skip_record ||
-        !h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) *range_info, rowid))
-    {
-      return 0;
-    }
-  }
-
-  while (1)
-  {
-    if (rowid_buffer.is_empty())
-    {
-      if (do_sort_keys)
-      {
-        if (!index_scan_eof) 
-        {
-          /* There are some sorted keys left. Use them to get rowids */
-          if ((res= dsmrr_fill_rowid_buffer()))
-            return res; /* for fatal errors */
-        }
-        while (rowid_buffer.is_empty())
-        {
-          if (dsmrr_eof)
-            return HA_ERR_END_OF_FILE;
-          dsmrr_fill_key_buffer();
-          if ((res= dsmrr_fill_rowid_buffer()))
-            return res;
-        }
-      }
-      else
-      {
-        /* 
-          There is no buffer with sorted keys. If fill_rowid_buffer() haven't
-          reached eof condition before, try refilling the buffer.
-        */
-        if (dsmrr_eof)
-          return HA_ERR_END_OF_FILE;
-
-        if ((res= dsmrr_fill_rowid_buffer()))
-          return res;
-      }
-    }
-   
-    last_identical_rowid= NULL;
-
-    /* Return eof if there are no rowids in the buffer after re-fill attempt */
-    if (rowid_buffer.read())
-      return HA_ERR_END_OF_FILE;
-
-    if (is_mrr_assoc)
-    {
-      memcpy(range_info, rowids_range_id, sizeof(uchar*));
-    }
-
-    if (h2->mrr_funcs.skip_record &&
-	h2->mrr_funcs.skip_record(h2->mrr_iter, *range_info, rowid))
-      continue;
-
-    res= h->ha_rnd_pos(table->record[0], rowid);
-
-    if (res == HA_ERR_RECORD_DELETED)
-      continue;
-    
-    /* 
-      Check if subsequent buffer elements have the same rowid value as this
-      one. If yes, remember this fact so that we don't make any more rnd_pos()
-      calls with this value.
-    */
-    if (!res)
-    {
-      uchar *cur_rowid= rowid;
-      /* 
-        Note: this implies that SQL layer doesn't touch table->record[0]
-        between calls.
-      */
-      Lifo_buffer_iterator it;
-      it.init(&rowid_buffer);
-      while (!it.read()) // reads to (rowid, ...)
-      {
-        if (h2->cmp_ref(rowid, cur_rowid))
-          break;
-        last_identical_rowid= rowid;
-      }
-    }
-    return 0;
-  }
-
-  return res;
+  return strategy->get_next(range_info);
 }
 
 
@@ -1239,11 +1317,11 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
 
 bool DsMrr_impl::check_cpk_scan(THD *thd, uint keyno, uint mrr_flags)
 {
-  return test((mrr_flags & HA_MRR_SINGLE_POINT) && 
-              !(mrr_flags & HA_MRR_SORTED) && 
+  return test((mrr_flags & HA_MRR_SINGLE_POINT) &&  // check
+        //      !(mrr_flags & HA_MRR_SORTED) && 
               keyno == table->s->primary_key && 
               h->primary_key_is_clustered() && 
-              optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS));
+              optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)); //check
 }
 
 
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 3c92dcd2950..fb2a67b6af1 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -50,6 +50,26 @@
 
 class DsMrr_impl;
 
+class Key_parameters
+{
+public:
+  /* TRUE <=> We can get at most one index tuple for a lookup key */
+  bool index_ranges_unique;
+
+  uint         key_tuple_length; /* Length of index lookup tuple, in bytes */
+  key_part_map key_tuple_map;    /* keyparts used in index lookup tuples */
+
+  /*
+    This is 
+      = key_tuple_length   if we copy keys to buffer
+      = sizeof(void*)      if we're using pointers to materialized keys.
+  */
+  uint key_size_in_keybuf;
+
+  /* TRUE <=> don't copy key values, use pointers to them instead.  */
+  bool use_key_pointers;
+};
+
 /**
   Iterator over (record, range_id) pairs that match given key value.
   
@@ -57,16 +77,23 @@ class DsMrr_impl;
   key value. A key value may have multiple matching records, so we'll need to
   produce a cross-product of sets of matching records and range_id-s.
 */
-
+class Mrr_ordered_index_reader;
 class Key_value_records_iterator
 {
   /* Scan parameters */
-  DsMrr_impl *dsmrr;
+  Key_parameters *param;
   Lifo_buffer_iterator identical_key_it;
   uchar *last_identical_key_ptr;
   bool get_next_row;
+  //handler *h;
+  /* TRUE <=> We can get at most one index tuple for a lookup key */
+  //bool index_ranges_unique;
+  
+  Mrr_ordered_index_reader *owner;
+  /* key_buffer.read() reads to here */
+  uchar *cur_index_tuple;
 public:
-  bool init(DsMrr_impl *dsmrr);
+  bool init(Mrr_ordered_index_reader *owner_arg);
 
   /*
     Get next (key_val, range_id) pair.
@@ -74,9 +101,184 @@ public:
   int get_next();
 
   void close();
+  friend class Mrr_ordered_index_reader;
 };
 
 
+/*
+  Something that will manage buffers for those that call it
+*/
+class Buffer_manager
+{
+public:
+  virtual void reset_buffer_sizes()= 0;
+  virtual void setup_buffer_sizes(uint key_size_in_keybuf, 
+                                  key_part_map key_tuple_map)=0;
+  virtual Lifo_buffer* get_key_buffer()= 0;
+  virtual ~Buffer_manager(){}
+};
+
+
+/* 
+  Abstract MRR execution strategy
+  
+  An object of this class produces (R, range_info) pairs where R can be an
+  index tuple or a table record.
+
+  Getting HA_ERR_END_OF_FILE from get_next() means that the source should be
+  re-filled. If eof() returns true after refill attempt, then end of stream has
+  been reached and get_next() must not be called anymore.
+*/
+
+class Mrr_strategy 
+{
+public:
+  virtual int get_next(char **range_info) = 0;
+  virtual int refill_buffer()=0;
+
+  virtual ~Mrr_strategy() {};
+};
+
+
+/* A common base for strategies that do index scans and produce index tuples */
+class Mrr_index_reader : public Mrr_strategy
+{
+public:
+  handler *h;
+
+  virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                   void *seq_init_param, uint n_ranges,
+                   uint mode, Buffer_manager *buf_manager_arg) = 0;
+  virtual bool eof() = 0; 
+  virtual uchar *get_rowid_ptr()= 0;
+  virtual bool skip_record(char *range_id, uchar *rowid)=0;
+};
+
+
+/*
+  A "bypass" strategy that uses default MRR implementation (i.e.
+  handler::multi_range_read_XXX() calls) to produce rows.
+*/
+
+class Mrr_simple_index_reader : public Mrr_index_reader
+{
+  int res; 
+public:
+  int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+           void *seq_init_param, uint n_ranges,
+           uint mode, Buffer_manager *buf_manager_arg);
+  int get_next(char **range_info);
+  int refill_buffer() { return 0; }
+  bool eof() { return test(res); }
+  uchar *get_rowid_ptr() { return h->ref; }
+  bool skip_record(char *range_id, uchar *rowid)
+  {
+    return (h->mrr_funcs.skip_record &&
+            h->mrr_funcs.skip_record(h->mrr_iter, range_id, rowid));
+  }
+};
+
+
+
+/* 
+  A strategy that sorts index lookup keys before scanning the index
+*/
+
+class Mrr_ordered_index_reader : public Mrr_index_reader
+{
+public:
+  int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+           void *seq_init_param, uint n_ranges,
+           uint mode, Buffer_manager *buf_manager_arg);
+  int get_next(char **range_info);
+  int refill_buffer();
+  bool eof() { return index_scan_eof; }
+  uchar *get_rowid_ptr() { return h->ref; }
+  
+  bool skip_record(char *range_info, uchar *rowid)
+  {
+    return (mrr_funcs.skip_record &&
+            mrr_funcs.skip_record(mrr_iter, range_info, rowid));
+  }
+private:
+  Key_value_records_iterator kv_it;
+
+  bool scanning_key_val_iter;
+  
+  char *cur_range_info;
+
+  /* Buffer to store (key, range_id) pairs */
+  Lifo_buffer *key_buffer;
+
+  Buffer_manager *buf_manager;
+
+  /* Initially FALSE, becomes TRUE when we've set key_tuple_xxx members */
+  bool know_key_tuple_params;
+
+ // bool use_key_pointers;
+  
+  Key_parameters  keypar;
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+
+  bool no_more_keys;
+  RANGE_SEQ_IF mrr_funcs;
+  range_seq_t mrr_iter;
+
+  bool auto_refill;
+
+  bool index_scan_eof;
+
+  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
+  static int key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2);
+  //void cleanup();
+  
+  friend class Key_value_records_iterator; 
+  friend class DsMrr_impl;
+  friend class Mrr_ordered_rndpos_reader;
+};
+
+
+/* MRR strategy that fetches rowids */
+
+class Mrr_ordered_rndpos_reader : public Mrr_strategy 
+{
+public:
+  int init(handler *h, Mrr_index_reader *index_reader, uint mode,
+           Lifo_buffer *buf);
+  int get_next(char **range_info);
+  int refill_buffer();
+  void cleanup();
+private:
+  handler *h;
+  
+  DsMrr_impl *dsmrr;
+  /* This is what we get (rowid, range_info) pairs from */
+  Mrr_index_reader *index_reader;
+  uchar *index_rowid;
+  
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+
+  uchar *last_identical_rowid;
+  Lifo_buffer *rowid_buffer;
+  
+  /* = h->ref_length  [ + sizeof(range_assoc_info) ] */
+  //uint rowid_buff_elem_size;
+
+  /* rowid_buffer.read() will set the following:  */
+  uchar *rowid;
+  uchar *rowids_range_id;
+};
+
+class Mrr_strategy_factory
+{
+public:
+  Mrr_ordered_rndpos_reader ordered_rndpos_reader;
+  Mrr_ordered_index_reader  ordered_index_reader;
+  Mrr_simple_index_reader   simple_index_reader;
+};
+
 /*
   DS-MRR implementation for one table. Create/use one object of this class for
   each ha_{myisam/innobase/etc} object. That object will be further referred to
@@ -154,9 +356,58 @@ public:
          get record by rowid and return the {record, range_id} pair
     4. Repeat the above steps until we've exhausted the list of ranges we're
        scanning.
+
+  Buffer space management considerations
+  --------------------------------------
+  With regard to buffer/memory management, the MRR interface specifies that
+   - SQL layer provides multi_range_read_init() with buffer of certain size.
+   - MRR implementation may use (i.e. have at its disposal till the end of 
+     the MRR scan) all of the buffer, or return the unused end of the buffer 
+     to SQL layer.
+
+  DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When
+  we need to accumulate/sort only keys (or only rowids), it is fairly trivial.
+
+  When we need to accumulate/sort both keys and rowids, efficient buffer use
+  gets complicated. We need to:
+   - First, accumulate keys and sort them
+   - Then use the keys (smaller values go first) to obtain rowids. A key is not
+     needed after we've got matching rowids for it.
+   - Make sure that rowids are accumulated at the front of the buffer, so that we
+     can return the end part of the buffer to SQL layer, should there be too
+     few rowid values to occupy the buffer.
+
+  All of these goals are achieved by using the following scheme:
+
+     |                    |   We get an empty buffer from SQL layer.   
+
+     |                  *-|    
+     |               *----|   First, we fill the buffer with keys. Key_buffer
+     |            *-------|   part grows from end of the buffer space to start
+     |         *----------|   (In this picture, the buffer is big enough to
+     |      *-------------|    accommodate all keys and even have some space left)
+
+     |      *=============|   We want to do key-ordered index scan, so we sort
+                              the keys
+
+     |-x      *===========|   Then we use the keys to get rowids. Rowids are
+     |----x      *========|   stored from start of buffer space towards the end.
+     |--------x     *=====|   The part of the buffer occupied with keys
+     |------------x   *===|   gradually frees up space for rowids. In this
+     |--------------x   *=|   picture we run out of keys before we've run out
+     |----------------x   |   of buffer space (it can be other way as well).
+
+     |================x   |   Then we sort the rowids.
+                     
+     |                |~~~|   The unused part of the buffer is at the end, so
+                              we can return it to the SQL layer.
+
+     |================*       Sorted rowids are then used to read table records 
+                              in disk order
+
 */
 
-class DsMrr_impl
+class DsMrr_impl : public Buffer_manager
 {
 public:
   typedef void (handler::*range_check_toggle_func_t)(bool on);
@@ -181,6 +432,9 @@ public:
                             void *seq_init_param, uint n_ranges, uint *bufsz,
                             uint *flags, COST_VECT *cost);
 private:
+  /* Buffer to store (key, range_id) pairs */
+  Lifo_buffer *key_buffer;
+
   /*
     The "owner" handler object (the one that is expected to "own" this object
     and call its functions).
@@ -197,20 +451,16 @@ private:
   /** Properties of current MRR scan **/
 
   uint keyno; /* index we're running the scan on */
-  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
   /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
   bool is_mrr_assoc;
   /* TRUE <=> sort the keys before making index lookups */
-  bool do_sort_keys;
+  //bool do_sort_keys;
   /* TRUE <=> sort rowids and use rnd_pos() to get and return full records */
-  bool do_rndpos_scan;
-
-  /*
-    (if do_sort_keys==TRUE) don't copy key values, use pointers to them 
-    instead.
-  */
-  bool use_key_pointers;
+  //bool do_rndpos_scan;
 
+  Mrr_strategy_factory strategy_factory;
+  Mrr_strategy *strategy;
+  Mrr_index_reader *index_strategy;
 
   /* The whole buffer space that we're using */
   uchar *full_buf;
@@ -226,12 +476,6 @@ private:
  
   /** Index scaning and key buffer-related members **/
 
-  /* TRUE <=> We can get at most one index tuple for a lookup key */
-  bool index_ranges_unique;
-
-  /* TRUE<=> we're in a middle of enumerating records for a key range */
-  //bool in_index_range;
-  
   /*
     One of the following two is used for key buffer: forward is used when 
     we only need key buffer, backward is used when we need both key and rowid
@@ -240,39 +484,10 @@ private:
   Forward_lifo_buffer forward_key_buf;
   Backward_lifo_buffer backward_key_buf;
 
-  /* Buffer to store (key, range_id) pairs */
-  Lifo_buffer *key_buffer;
-  
-  /* Index scan state */
-  bool scanning_key_val_iter;
-  /* 
-    TRUE <=> we've got index tuples/rowids for all keys (need this flag because 
-    we may have a situation where we've read everything from the key buffer but 
-    haven't finished with getting index tuples for the last key)
-  */
-  bool index_scan_eof;  
-  Key_value_records_iterator kv_it;
-  
-  /* key_buffer.read() reads to here */
-  uchar *cur_index_tuple;
-
-  /* if in_index_range==TRUE: range_id of the range we're enumerating */
-  char *cur_range_info;
-
-  /* Initially FALSE, becomes TRUE when we've set key_tuple_xxx members */
-  bool know_key_tuple_params;
-  uint         key_tuple_length; /* Length of index lookup tuple, in bytes */
-  key_part_map key_tuple_map;    /* keyparts used in index lookup tuples */
-
-  /*
-    This is 
-      = key_tuple_length   if we copy keys to buffer
-      = sizeof(void*)      if we're using pointers to materialized keys.
-  */
-  uint key_size_in_keybuf;
+  Forward_lifo_buffer rowid_buffer;
   
   /* = key_size_in_keybuf [ + sizeof(range_assoc_info) ] */
-  uint key_buff_elem_size;
+  //uint key_buff_elem_size_;
   
   /** rnd_pos() scan and rowid buffer-related members **/
 
@@ -280,36 +495,27 @@ private:
     Buffer to store (rowid, range_id) pairs, or just rowids if 
     is_mrr_assoc==FALSE
   */
-  Forward_lifo_buffer rowid_buffer;
-  
-  /* rowid_buffer.read() will set the following:  */
-  uchar *rowid;
-  uchar *rowids_range_id;
-
-  uchar *last_identical_rowid;
-
-  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
-  
-  /* = h->ref_length  [ + sizeof(range_assoc_info) ] */
-  uint rowid_buff_elem_size;
+  //Forward_lifo_buffer rowid_buffer;
   
   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                        COST_VECT *cost);
   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
                                uint *buffer_size, COST_VECT *cost);
   bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags);
-  static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2);
-  static int key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2);
-  int dsmrr_fill_rowid_buffer();
-  void dsmrr_fill_key_buffer();
-  int dsmrr_next_from_index(char **range_info);
 
-  void setup_buffer_sizes(key_range *sample_key);
   void reallocate_buffer_space();
   
-  static range_seq_t key_buf_seq_init(void *init_param, uint n_ranges, uint flags);
-  static uint key_buf_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+  /* Buffer_manager implementation */
+  void setup_buffer_sizes(uint key_size_in_keybuf, key_part_map key_tuple_map);
+  void reset_buffer_sizes();
+  Lifo_buffer* get_key_buffer() { return key_buffer; }
+
   friend class Key_value_records_iterator;
+  friend class Mrr_ordered_index_reader;
+  friend class Mrr_ordered_rndpos_reader;
+
+  int  setup_two_handlers();
+  void close_second_handler();
 };
 
 /**

From a88bd5cf56f44f7e1a2714b471f13f123735f864 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Thu, 28 Oct 2010 20:48:16 +0400
Subject: [PATCH 44/49] DS-MRR improvements: - address the review feedback
 (class/var renames) - fix wrong test result (bug in buffer refill logic)

---
 sql/multi_range_read.cc | 27 ++++++++++++++++-----------
 sql/multi_range_read.h  | 16 ++++++++--------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 4df2c4209d9..9e8ffef529a 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -623,21 +623,26 @@ int Mrr_ordered_rndpos_reader::get_next(char **range_info)
 
   while (1)
   {
-    if (rowid_buffer->is_empty())
+    if (rowid_buffer->is_empty()) /* We're out of rowids */
     {
-      /*
-        We're out of rowids. If there are still some sorted keys, use up them
-        first (that is, don't call re-fill for keys when we still have some).
-      */
+      /* First, finish off the sorted keys we have */ 
       if (!index_reader->eof())
       {
         if ((res= refill_buffer()))
           return res; /* for fatal errors */
       }
-      else
+
+      if (rowid_buffer->is_empty())
       {
-        //TODO: here: redistribute the buffer space, then refill the index
-        //reader, then refill us.
+        /*
+          Ok neither index_reader nor us have any records. Refill index
+          reader, then refill us.
+        */
+        // TODO: if key buffer is empty, too, redistribute the buffer space.
+  
+        if ((res= index_reader->refill_buffer()) ||
+            (res= refill_buffer()))
+          return res;
       }
     }
    
@@ -871,7 +876,7 @@ int DsMrr_impl::setup_two_handlers()
     if (keyno == h->pushed_idx_cond_keyno)
       pushed_cond= h->pushed_idx_cond;
     
-    Mrr_strategy *save_strategy= strategy;
+    Mrr_reader *save_strategy= strategy;
     strategy= NULL;
     /*
       Caution: this call will invoke this->dsmrr_close(). Do not put the
@@ -910,7 +915,7 @@ int DsMrr_impl::setup_two_handlers()
     if (h->inited == handler::INDEX)
     {
       handler *save_h2= h2;
-      Mrr_strategy *save_strategy= strategy;
+      Mrr_reader *save_strategy= strategy;
       h2= NULL;
       strategy= NULL;
       res= h->ha_index_end();
@@ -919,7 +924,7 @@ int DsMrr_impl::setup_two_handlers()
       if (res)
         goto error;
     }
-    if ((h->inited == handler::RND) && h->ha_rnd_init(FALSE))
+    if ((h->inited != handler::RND) && h->ha_rnd_init(FALSE))
       goto error;
   }
   DBUG_RETURN(0);
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index fb2a67b6af1..1f53b212c93 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -130,18 +130,18 @@ public:
   been reached and get_next() must not be called anymore.
 */
 
-class Mrr_strategy 
+class Mrr_reader 
 {
 public:
   virtual int get_next(char **range_info) = 0;
   virtual int refill_buffer()=0;
-
-  virtual ~Mrr_strategy() {};
+  
+  virtual ~Mrr_reader() {}; /* just to remove compiler warning */
 };
 
 
 /* A common base for strategies that do index scans and produce index tuples */
-class Mrr_index_reader : public Mrr_strategy
+class Mrr_index_reader : public Mrr_reader
 {
 public:
   handler *h;
@@ -241,7 +241,7 @@ private:
 
 /* MRR strategy that fetches rowids */
 
-class Mrr_ordered_rndpos_reader : public Mrr_strategy 
+class Mrr_ordered_rndpos_reader : public Mrr_reader 
 {
 public:
   int init(handler *h, Mrr_index_reader *index_reader, uint mode,
@@ -271,7 +271,7 @@ private:
   uchar *rowids_range_id;
 };
 
-class Mrr_strategy_factory
+class Mrr_reader_factory
 {
 public:
   Mrr_ordered_rndpos_reader ordered_rndpos_reader;
@@ -458,8 +458,8 @@ private:
   /* TRUE <=> sort rowids and use rnd_pos() to get and return full records */
   //bool do_rndpos_scan;
 
-  Mrr_strategy_factory strategy_factory;
-  Mrr_strategy *strategy;
+  Mrr_reader_factory strategy_factory;
+  Mrr_reader *strategy;
   Mrr_index_reader *index_strategy;
 
   /* The whole buffer space that we're using */

From 1e82320d27ab0c5de7cffcef49ead619b7893038 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Fri, 29 Oct 2010 13:42:48 +0400
Subject: [PATCH 45/49] DS-MRR improvements - buildbot test failure fixes:
 don't try to get more records from SimpleIndexReader   if we've already got
 EOF for it.

---
 sql/multi_range_read.cc | 4 ++++
 sql/multi_range_read.h  | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 9e8ffef529a..c1b166cb4e7 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -303,6 +303,7 @@ int Mrr_simple_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
 {
   HANDLER_BUFFER no_buffer = {NULL, NULL, NULL};
   h= h_arg;
+  res= 0;
   return h->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
                                            mode, &no_buffer);
 }
@@ -559,6 +560,9 @@ int Mrr_ordered_rndpos_reader::refill_buffer()
 
   last_identical_rowid= NULL;
 
+  if (index_reader->eof())
+    DBUG_RETURN(0);
+
   while (rowid_buffer->can_write())
   {
     res= index_reader->get_next(&range_info);
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 1f53b212c93..a7748a32b6b 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -126,8 +126,8 @@ public:
   index tuple or a table record.
 
   Getting HA_ERR_END_OF_FILE from get_next() means that the source should be
-  re-filled. if eof() returns true after refill attempt, then end of stream has
-  been reached and get_next() must not be called anymore.
+  re-filled. if eof() returns true after refill attempt, then the end of 
+  stream has been reached and get_next() must not be called anymore.
 */
 
 class Mrr_reader 

From 8e4c627ef8bff737e072fde733ee39351c17f70b Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Fri, 29 Oct 2010 20:21:26 +0400
Subject: [PATCH 46/49] DS-MRR improvements: - Correct buffer re-allocation

---
 sql/multi_range_read.cc | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index c1b166cb4e7..a968cf2da52 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -1092,9 +1092,16 @@ void DsMrr_impl::setup_buffer_sizes(uint key_size_in_keybuf,
 
 void DsMrr_impl::reset_buffer_sizes()
 {
-  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
-  key_buffer= &backward_key_buf;
-  key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
+  if (strategy != index_strategy)
+  {
+    /*
+      Ok, we have both an ordered index reader and a disk reader.
+      Redistribute the buffer space.
+    */
+    rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
+    key_buffer= &backward_key_buf;
+    key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end);
+  }
 }
 
 /**

From d9a8dd22b2e88e9ef0844c32fa442ed6d3f8bc56 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Sun, 31 Oct 2010 22:00:15 +0300
Subject: [PATCH 47/49] DS-MRR/CPK improvements: correct buffer exhaustion
 handling

---
 sql/multi_range_read.cc | 25 +++++++++++++++----------
 sql/multi_range_read.h  | 12 ++++++++++--
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index a968cf2da52..c9f3785b231 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -484,7 +484,7 @@ int Mrr_ordered_index_reader::refill_buffer()
   scanning_key_val_iter= FALSE;
   index_scan_eof= FALSE; 
 
-  DBUG_RETURN(0);
+  DBUG_RETURN((no_more_keys && key_buffer->is_empty())? HA_ERR_END_OF_FILE:0);
 }
 
 
@@ -521,8 +521,11 @@ int Mrr_ordered_rndpos_reader::init(handler *h_arg,
   //rowid_buff_elem_size= h->ref_length;
   //if (!(mode & HA_MRR_NO_ASSOCIATION))
   //  rowid_buff_elem_size += sizeof(char*);
-
-  return index_reader->refill_buffer();
+  
+  int res= index_reader->refill_buffer();
+  if (res && res!=HA_ERR_END_OF_FILE)
+    return res;
+  return 0;
 }
 
 
@@ -561,7 +564,7 @@ int Mrr_ordered_rndpos_reader::refill_buffer()
   last_identical_rowid= NULL;
 
   if (index_reader->eof())
-    DBUG_RETURN(0);
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
 
   while (rowid_buffer->can_write())
   {
@@ -584,9 +587,9 @@ int Mrr_ordered_rndpos_reader::refill_buffer()
   rowid_buffer->sort((qsort2_cmp)rowid_cmp_reverse, (void*)h);
 
   rowid_buffer->setup_reading(&rowid, h->ref_length,
-                              is_mrr_assoc? (uchar**)&rowids_range_id: NULL, 
+                              is_mrr_assoc? (uchar**)&rowids_range_id: NULL,
                               sizeof(void*));
-  DBUG_RETURN(0);
+  DBUG_RETURN((rowid_buffer->is_empty() && res) ? HA_ERR_END_OF_FILE : 0);
 }
 
 
@@ -632,8 +635,9 @@ int Mrr_ordered_rndpos_reader::get_next(char **range_info)
       /* First, finish off the sorted keys we have */ 
       if (!index_reader->eof())
       {
-        if ((res= refill_buffer()))
-          return res; /* for fatal errors */
+        res= refill_buffer();
+        if (res && res != HA_ERR_END_OF_FILE)
+          return res;
       }
 
       if (rowid_buffer->is_empty())
@@ -821,8 +825,9 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
       goto error;
     }
   }
-
-  if (strategy->refill_buffer())
+  
+  res= strategy->refill_buffer();
+  if (res && res != HA_ERR_END_OF_FILE)
     goto error;
 
   /*
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index a7748a32b6b..b4f2a699dc5 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -126,8 +126,16 @@ public:
   index tuple or a table record.
 
   Getting HA_ERR_END_OF_FILE from get_next() means that the source should be
-  re-filled. if eof() returns true after refill attempt, then the end of 
+  re-filled. 
+  
+  Was:
+  if eof() returns true after refill attempt, then the end of 
   stream has been reached and get_next() must not be called anymore.
+
+  Now:
+  if refill_buffer() returns HA_ERR_END_OF_FILE that means the stream is 
+  really exhausted.
+
 */
 
 class Mrr_reader 
@@ -168,7 +176,7 @@ public:
            void *seq_init_param, uint n_ranges,
            uint mode, Buffer_manager *buf_manager_arg);
   int get_next(char **range_info);
-  int refill_buffer() { return 0; }
+  int refill_buffer() { return HA_ERR_END_OF_FILE; }
   bool eof() { return test(res); }
   uchar *get_rowid_ptr() { return h->ref; }
   bool skip_record(char *range_id, uchar *rowid)

From 430f63c2713f89d2d692838f4af29b17a153d391 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 1 Nov 2010 00:04:34 +0300
Subject: [PATCH 48/49] use DBUG_RETURN with DBUG_ENTER

---
 sql/multi_range_read.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index c9f3785b231..8dc6ce246f6 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -356,7 +356,7 @@ int Mrr_ordered_index_reader::get_next(char **range_info_arg)
           {
             int res;
             if ((res= refill_buffer()))
-              return res;
+              DBUG_RETURN(res);
             if (key_buffer->is_empty())
             {
               index_scan_eof= TRUE;

From b76a8595c611bedf512b19a7c4ccc260f0d0a8f6 Mon Sep 17 00:00:00 2001
From: Sergey Petrunya <psergey@askmonty.org>
Date: Mon, 1 Nov 2010 13:52:10 +0300
Subject: [PATCH 49/49] Re-initialization reworked

---
 sql/multi_range_read.cc | 73 ++++++++++++++++++++++++++++-------------
 sql/multi_range_read.h  |  4 ++-
 2 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 8dc6ce246f6..3edd0eb0f7c 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -332,6 +332,9 @@ int Mrr_simple_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
 int Mrr_ordered_index_reader::get_next(char **range_info_arg)
 {
   DBUG_ENTER("Mrr_ordered_index_reader::get_next");
+  
+  if (!know_key_tuple_params) /* We're in startup phase */
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
 
   while (1)
   {
@@ -352,7 +355,7 @@ int Mrr_ordered_index_reader::get_next(char **range_info_arg)
       {
         if (key_buffer->is_empty())
         {
-          if (auto_refill)
+          /*if (auto_refill)
           {
             int res;
             if ((res= refill_buffer()))
@@ -364,6 +367,7 @@ int Mrr_ordered_index_reader::get_next(char **range_info_arg)
             }
           }
           else
+          */
           {
             /* Buffer refills are managed by somebody else for us */
             index_scan_eof= TRUE;
@@ -521,10 +525,11 @@ int Mrr_ordered_rndpos_reader::init(handler *h_arg,
   //rowid_buff_elem_size= h->ref_length;
   //if (!(mode & HA_MRR_NO_ASSOCIATION))
   //  rowid_buff_elem_size += sizeof(char*);
-  
-  int res= index_reader->refill_buffer();
-  if (res && res!=HA_ERR_END_OF_FILE)
-    return res;
+
+  index_reader_exhausted= FALSE;
+  ///int res= index_reader->refill_buffer();
+  ///if (res && res!=HA_ERR_END_OF_FILE)
+  ///  return res;
   return 0;
 }
 
@@ -547,13 +552,36 @@ int Mrr_ordered_rndpos_reader::init(handler *h_arg,
   @retval other  Error
 */
 
+
 int Mrr_ordered_rndpos_reader::refill_buffer()
+{
+  int res;
+  DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_buffer");
+  /* Once the index reader has reported EOF there is nothing left to fetch */
+  if (index_reader_exhausted)
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+  /* Parenthesize the assignment so res holds refill2()'s code, not a bool */
+  while ((res= refill2()) == HA_ERR_END_OF_FILE)
+  {
+    if ((res= index_reader->refill_buffer()))
+    {
+      if (res == HA_ERR_END_OF_FILE)
+        index_reader_exhausted= TRUE;
+      break;
+    }
+  }
+  DBUG_RETURN(res);
+}
+
+
+/* This one refills without calling index_reader->refill_buffer(). */
+int Mrr_ordered_rndpos_reader::refill2()
 {
   char *range_info;
   uchar **range_info_ptr= (uchar**)&range_info;
   int res;
-  DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_buffer");
-  
+  DBUG_ENTER("Mrr_ordered_rndpos_reader::refill2");
+
   DBUG_ASSERT(rowid_buffer->is_empty());
   index_rowid= index_reader->get_rowid_ptr();
   rowid_buffer->reset();
@@ -563,9 +591,6 @@ int Mrr_ordered_rndpos_reader::refill_buffer()
 
   last_identical_rowid= NULL;
 
-  if (index_reader->eof())
-    DBUG_RETURN(HA_ERR_END_OF_FILE);
-
   while (rowid_buffer->can_write())
   {
     res= index_reader->get_next(&range_info);
@@ -578,10 +603,6 @@ int Mrr_ordered_rndpos_reader::refill_buffer()
 
     rowid_buffer->write();
   }
-
-  if (res && res != HA_ERR_END_OF_FILE)
-    DBUG_RETURN(res); 
-
    
   /* Sort the buffer contents by rowid */
   rowid_buffer->sort((qsort2_cmp)rowid_cmp_reverse, (void*)h);
@@ -589,7 +610,7 @@ int Mrr_ordered_rndpos_reader::refill_buffer()
   rowid_buffer->setup_reading(&rowid, h->ref_length,
                               is_mrr_assoc? (uchar**)&rowids_range_id: NULL,
                               sizeof(void*));
-  DBUG_RETURN((rowid_buffer->is_empty() && res) ? HA_ERR_END_OF_FILE : 0);
+  DBUG_RETURN(rowid_buffer->is_empty()? HA_ERR_END_OF_FILE : 0);
 }
 
 
@@ -630,6 +651,7 @@ int Mrr_ordered_rndpos_reader::get_next(char **range_info)
 
   while (1)
   {
+#if 0      
     if (rowid_buffer->is_empty()) /* We're out of rowids */
     {
       /* First, finish off the sorted keys we have */ 
@@ -647,12 +669,12 @@ int Mrr_ordered_rndpos_reader::get_next(char **range_info)
           reader, then refill us.
         */
         // TODO: if key buffer is empty, too, redistribute the buffer space.
-  
         if ((res= index_reader->refill_buffer()) ||
             (res= refill_buffer()))
           return res;
       }
     }
+#endif
    
     last_identical_rowid= NULL;
 
@@ -795,8 +817,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   
   if (strategy == index_strategy)
   {
-    if (ordered_idx_reader)
-      ordered_idx_reader->auto_refill= TRUE;
+    ///if (ordered_idx_reader)
+    //  ordered_idx_reader->auto_refill= TRUE;
     /* Index strategy serves it all. We don't need two handlers, etc */
     /* Give the buffer to index strategy */
     if ((res= index_strategy->init(h, seq_funcs, seq_init_param, n_ranges,
@@ -815,8 +837,8 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
     if ((res= setup_two_handlers()))
       DBUG_RETURN(res);
 
-    if (ordered_idx_reader)
-      ordered_idx_reader->auto_refill= FALSE;
+    ///if (ordered_idx_reader)
+    ///  ordered_idx_reader->auto_refill= FALSE;
 
     if ((res= index_strategy->init(h2, seq_funcs, seq_init_param, n_ranges, 
                                    mode, this)) || 
@@ -827,7 +849,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
   }
   
   res= strategy->refill_buffer();
-  if (res && res != HA_ERR_END_OF_FILE)
+  if (res && res != HA_ERR_END_OF_FILE) //psergey-todo: remove EOF check here
     goto error;
 
   /*
@@ -1215,7 +1237,14 @@ void Key_value_records_iterator::close()
 
 int DsMrr_impl::dsmrr_next(char **range_info)
 {
-  return strategy->get_next(range_info);
+  int res; /* EOF from get_next() means "buffer drained": refill and retry */
+  while ((res= strategy->get_next(range_info)) == HA_ERR_END_OF_FILE)
+  {
+    if ((res= strategy->refill_buffer()))
+      break; /* EOF or error */
+  }
+  return res;
+  //return strategy->get_next(range_info);
 }
 
 
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index b4f2a699dc5..14bcdd5de6c 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -233,7 +233,7 @@ private:
   RANGE_SEQ_IF mrr_funcs;
   range_seq_t mrr_iter;
 
-  bool auto_refill;
+  //bool auto_refill;
 
   bool index_scan_eof;
 
@@ -256,6 +256,7 @@ public:
            Lifo_buffer *buf);
   int get_next(char **range_info);
   int refill_buffer();
+  int refill2();
   void cleanup();
 private:
   handler *h;
@@ -264,6 +265,7 @@ private:
   /* This what we get (rowid, range_info) pairs from */
   Mrr_index_reader *index_reader;
   uchar *index_rowid;
+  bool index_reader_exhausted;
   
   /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
   bool is_mrr_assoc;