Automatic merge

2025-08-08 11:22:35 +03:00 · 2011-05-19 19:23:06 +03:00
parent 73f4d0c6e4 29b751c796
commit 54a3d1869c
12 changed files with 157 additions and 77 deletions
--- a/.bzr-mysql/default.conf
+++ b/.bzr-mysql/default.conf
@@ -1,6 +1,6 @@
 [MYSQL]
-tree_location = lp:maria/5.2
+tree_location = lp:maria/5.3
 post_commit_to = commits@mariadb.org
-post_commit_url = lp:maria/5.2
-tree_name = maria/5.2
-project_name = "Mariadb 5.2, with Maria 2.0"
+post_commit_url = lp:maria/5.3
+tree_name = maria/5.3
+project_name = "Mariadb 5.3, with Aria 2.0"
--- a/include/my_base.h
+++ b/include/my_base.h
@@ -446,7 +446,8 @@ enum ha_base_keytype {
 #define HA_ERR_ROW_NOT_VISIBLE    177
 #define HA_ERR_TOO_MANY_CONCURRENT_TRXS 178 /*Too many active concurrent transactions */
 #define HA_ERR_ABORTED_BY_USER    179
-#define HA_ERR_LAST               179    /* Copy of last error nr */
+#define HA_ERR_DISK_FULL          180
+#define HA_ERR_LAST               180    /* Copy of last error nr */

 /* Number of different errors */
 #define HA_ERR_ERRORS            (HA_ERR_LAST - HA_ERR_FIRST + 1)
--- a/mysys/my_handler_errors.h
+++ b/mysys/my_handler_errors.h
@@ -64,5 +64,6 @@ static const char *handler_error_messages[]=
  "Read page with wrong checksum",
  "Too many active concurrent transactions",
  "Row is not visible by the current transaction",
-  "Operation was interrupted by end user (probably kill command?)"
+  "Operation was interrupted by end user (probably kill command?)",
+  "Disk full"
 };
--- a/sql/ha_partition.h
+++ b/sql/ha_partition.h
@@ -877,6 +877,10 @@ public:
  */
  virtual ulong index_flags(uint inx, uint part, bool all_parts) const
  {
+    /*
+      The following code is not safe if you are using different
+      storage engines or different index types per partition.
+    */
    return m_file[0]->index_flags(inx, part, all_parts);
  }

--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -345,6 +345,7 @@ int ha_init_errors(void)
  SETMSG(HA_ERR_AUTOINC_READ_FAILED,    ER(ER_AUTOINC_READ_FAILED));
  SETMSG(HA_ERR_AUTOINC_ERANGE,         ER(ER_WARN_DATA_OUT_OF_RANGE));
  SETMSG(HA_ERR_TOO_MANY_CONCURRENT_TRXS, ER(ER_TOO_MANY_CONCURRENT_TRXS));
+  SETMSG(HA_ERR_DISK_FULL,              ER(ER_DISK_FULL));

  /* Register the error messages for use with my_error(). */
  return my_error_register(errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
@@ -2759,6 +2760,11 @@ void handler::print_error(int error, myf errflag)
  case ENOENT:
    textno=ER_FILE_NOT_FOUND;
    break;
+  case ENOSPC:
+  case HA_ERR_DISK_FULL:
+    textno= ER_DISK_FULL;
+    SET_FATAL_ERROR;                            // Ensure error is logged
+    break;
  case HA_ERR_KEY_NOT_FOUND:
  case HA_ERR_NO_ACTIVE_RECORD:
  case HA_ERR_END_OF_FILE:
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -161,8 +161,11 @@
 */
 #define HA_KEY_SCAN_NOT_ROR     128 
 #define HA_DO_INDEX_COND_PUSHDOWN  256 /* Supports Index Condition Pushdown */
-
-
+/*
+  Data is clustered on this key. This means that when you read the key
+  you also get the row data without any additional disk reads.
+*/
+#define HA_CLUSTERED_INDEX      512

 /*
  bits in alter_table_flags:
@@ -2311,9 +2314,28 @@ public:


 /*
-   @retval TRUE   Primary key (if there is one) is clustered
-                  key covering all fields
-   @retval FALSE  otherwise
+   Check if the primary key (if there is one) is both a clustered key and a
+   reference key. This means:
+
+   - Data is stored together with the primary key (no secondary lookup
+     needed to find the row data). The optimizer uses this to find out
+     the cost of fetching data.
+   - The primary key is part of each secondary key and is used
+     to find the row data in the primary index when reading through
+     secondary indexes.
+   - When doing a HA_KEYREAD_ONLY we also get all the primary key parts
+     into the row. This is a critical property used by index_merge.
+
+   All the above is usually true for engines that store the row
+   data in the primary key index (e.g. in a b-tree), and use the primary
+   key value as a position().  InnoDB is an example of such an engine.
+
+   For such a clustered primary key, the following should also hold:
+   index_flags() should contain HA_CLUSTERED_INDEX
+   table_flags() should contain HA_TABLE_SCAN_ON_INDEX
+
+   @retval TRUE   The primary key is clustered and is a reference key
+   @retval FALSE  Otherwise
 */
 virtual bool primary_key_is_clustered() { return FALSE; }
 virtual int cmp_ref(const uchar *ref1, const uchar *ref2)
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -1816,6 +1816,12 @@ QUICK_RANGE_SELECT::~QUICK_RANGE_SELECT()
  DBUG_VOID_RETURN;
 }

+/*
+  QUICK_INDEX_SORT_SELECT works as follows:
+  - Do index scans, accumulate rowids in the Unique object 
+    (Unique will also sort and de-duplicate rowids)
+  - Use rowids from unique to run a disk-ordered sweep
+*/

 QUICK_INDEX_SORT_SELECT::QUICK_INDEX_SORT_SELECT(THD *thd_param,
                                                 TABLE *table)
@@ -1848,7 +1854,18 @@ QUICK_INDEX_SORT_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick_sel_range)
  if (head->file->primary_key_is_clustered() &&
      quick_sel_range->index == head->s->primary_key)
  {
-   /* A quick_select over a clustered primary key is handled specifically */
+   /*
+     A quick_select over a clustered primary key is handled specially.
+     Here we assume:
+     - PK columns are included in any other merged index
+     - Scan on the PK is disk-ordered.
+       (violating the second assumption only degrades performance)
+
+       We could treat clustered PK as any other index, but that would
+       be inefficient. There is no point in doing scan on
+       CPK, remembering the rowid, then making rnd_pos() call with
+       that rowid.
+    */
    pk_quick_select= quick_sel_range;
    DBUG_RETURN(0);
  }
@@ -4298,11 +4315,19 @@ double get_sweep_read_cost(const PARAM *param, ha_rows records)
  DBUG_ENTER("get_sweep_read_cost");
  if (param->table->file->primary_key_is_clustered())
  {
+    /*
+      We are using the primary key to find the rows.
+      Calculate the cost for this.
+    */
    result= param->table->file->read_time(param->table->s->primary_key,
                                          (uint)records, records);
  }
  else
  {
+    /*
+      Rows will be retrieved with rnd_pos(). Calculate the expected
+      cost for this.
+    */
    double n_blocks=
      ceil(ulonglong2double(param->table->file->stats.data_file_length) /
           IO_SIZE);
@@ -5013,7 +5038,7 @@ bool prepare_search_best_index_intersect(PARAM *param,
      if ((*index_scan)->keynr == table->s->primary_key)
      {
        common->cpk_scan= cpk_scan= *index_scan;
-          break;
+        break;
      }
    }
  }
@@ -6187,7 +6212,6 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
  ROR_SCAN_INFO **cur_ror_scan;
  ROR_SCAN_INFO *cpk_scan= NULL;
  uint cpk_no;
-  bool cpk_scan_used= FALSE;

  if (!(tree->ror_scans= (ROR_SCAN_INFO**)alloc_root(param->mem_root,
                                                     sizeof(ROR_SCAN_INFO*)*
@@ -6199,11 +6223,20 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
  for (idx= 0, cur_ror_scan= tree->ror_scans; idx < param->keys; idx++)
  {
    ROR_SCAN_INFO *scan;
+    uint key_no;
    if (!tree->ror_scans_map.is_set(idx))
      continue;
+    key_no= param->real_keynr[idx];
+    if (key_no != cpk_no &&
+        param->table->file->index_flags(key_no,0,0) & HA_CLUSTERED_INDEX)
+    {
+      /* Ignore clustering keys */
+      tree->n_ror_scans--;
+      continue;
+    }
    if (!(scan= make_ror_scan(param, idx, tree->keys[idx])))
      return NULL;
-    if (param->real_keynr[idx] == cpk_no)
+    if (key_no == cpk_no)
    {
      cpk_scan= scan;
      tree->n_ror_scans--;
@@ -6289,15 +6322,14 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
  {
    if (ror_intersect_add(intersect, cpk_scan, TRUE) && 
        (intersect->total_cost < min_cost))
-    {
-      cpk_scan_used= TRUE;
      intersect_best= intersect; //just set pointer here
-    }
  }
+  else
+    cpk_scan= 0;                                // Don't use cpk_scan

  /* Ok, return ROR-intersect plan if we have found one */
  TRP_ROR_INTERSECT *trp= NULL;
-  if (min_cost < read_time && (cpk_scan_used || best_num > 1))
+  if (min_cost < read_time && (cpk_scan || best_num > 1))
  {
    if (!(trp= new (param->mem_root) TRP_ROR_INTERSECT))
      DBUG_RETURN(trp);
@@ -6316,7 +6348,7 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
    set_if_smaller(param->table->quick_condition_rows, best_rows);
    trp->records= best_rows;
    trp->index_scan_costs= intersect_best->index_scan_costs;
-    trp->cpk_scan= cpk_scan_used? cpk_scan: NULL;
+    trp->cpk_scan= cpk_scan;
    DBUG_PRINT("info", ("Returning non-covering ROR-intersect plan:"
                        "cost %g, records %lu",
                        trp->read_cost, (ulong) trp->records));
@@ -9511,10 +9543,10 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
  bool pk_is_clustered= file->primary_key_is_clustered();
  if (index_only && 
      (file->index_flags(keynr, param->max_key_part, 1) & HA_KEYREAD_ONLY) &&
-      !(pk_is_clustered && keynr == param->table->s->primary_key))
+      !(file->index_flags(keynr, param->max_key_part, 1) & HA_CLUSTERED_INDEX))
     *mrr_flags |= HA_MRR_INDEX_ONLY;
  
-  if (current_thd->lex->sql_command != SQLCOM_SELECT)
+  if (param->thd->lex->sql_command != SQLCOM_SELECT)
    *mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;

  *bufsize= param->thd->variables.mrr_buff_size;
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -8550,17 +8550,19 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
 	  else if (!table->covering_keys.is_clear_all() &&
 		   !(tab->select && tab->select->quick))
 	  {					// Only read index tree
+#ifdef BAD_OPTIMIZATION
 	    /*
-            It has turned out that the below change, while speeding things
-            up for disk-bound loads, slows them down for cases when the data
-            is in disk cache (see BUG#35850):
-	    //  See bug #26447: "Using the clustered index for a table scan
-	    //  is always faster than using a secondary index".
+              It has turned out that the below change, while speeding things
+              up for disk-bound loads, slows them down for cases when the data
+              is in disk cache (see BUG#35850):
+              See bug #26447: "Using the clustered index for a table scan
+              is always faster than using a secondary index".
+            */
            if (table->s->primary_key != MAX_KEY &&
                table->file->primary_key_is_clustered())
              tab->index= table->s->primary_key;
            else
-	    */
+#endif
              tab->index=find_shortest_key(table, & table->covering_keys);
 	    tab->read_first_record= join_read_first;
            /* Read with index_first / index_next */
@@ -16308,7 +16310,7 @@ find_field_in_item_list (Field *field, void *data)
 */

 static bool
-test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
+test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit_arg,
 			bool no_changes, const key_map *map)
 {
  int ref_key;
@@ -16320,8 +16322,11 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
  key_map usable_keys;
  QUICK_SELECT_I *save_quick= select ? select->quick : 0;
  int best_key= -1;
+  ha_rows best_select_limit;
  DBUG_ENTER("test_if_skip_sort_order");
+
  LINT_INIT(ref_key_parts);
+  LINT_INIT(best_select_limit);

  /*
    Keys disabled by ALTER TABLE ... DISABLE KEYS should have already
@@ -16486,7 +16491,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
      resolved with a key;  This is because filesort() is usually faster than
      retrieving all rows through an index.
    */
-    if (select_limit >= table_records)
+    if (select_limit_arg >= table_records)
    {
      keys= *table->file->keys_to_use_for_scanning();
      keys.merge(table->covering_keys);
@@ -16514,6 +16519,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
    for (nr=0; nr < table->s->keys ; nr++)
    {
      int direction;
+      ha_rows select_limit= select_limit_arg;

      if (keys.is_set(nr) &&
          (direction= test_if_order_by_key(order, table, nr, &used_key_parts)))
@@ -16525,9 +16531,9 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
        */
        DBUG_ASSERT (ref_key != (int) nr);

-        bool is_covering= table->covering_keys.is_set(nr) ||
-                          (nr == table->s->primary_key &&
-                          table->file->primary_key_is_clustered());
+        bool is_covering= (table->covering_keys.is_set(nr) ||
+                           (table->file->index_flags(nr, 0, 1) &
+                            HA_CLUSTERED_INDEX));
 	
        /* 
          Don't use an index scan with ORDER BY without limit.
@@ -16606,7 +16612,8 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
                select_limit= table_records;
            else
              select_limit= (ha_rows) (select_limit*rec_per_key);
-          }
+          } /* group */
+
          /* 
            If tab=tk is not the last joined table tn then to get first
            L records from the result set we can expect to retrieve
@@ -16650,8 +16657,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
 	  */
          index_scan_time= select_limit/rec_per_key *
 	                   min(rec_per_key, table->file->scan_time());
-          if ((ref_key < 0 && is_covering) || 
-              (ref_key < 0 && (group || table->force_index)) ||
+          if ((ref_key < 0 && (group || table->force_index || is_covering)) ||
              index_scan_time < read_time)
          {
            ha_rows quick_records= table_records;
@@ -16663,7 +16669,8 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
            if (best_key < 0 ||
                (select_limit <= min(quick_records,best_records) ?
                 keyinfo->key_parts < best_key_parts :
-                 quick_records < best_records))
+                 quick_records < best_records) ||
+                (!is_best_covering && is_covering))
            {
              best_key= nr;
              best_key_parts= keyinfo->key_parts;
@@ -16671,6 +16678,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
              best_records= quick_records;
              is_best_covering= is_covering;
              best_key_direction= direction; 
+              best_select_limit= select_limit;
            }
          }   
 	}      
@@ -16680,42 +16688,37 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
    /*
      filesort() and join cache are usually faster than reading in 
      index order and not using join cache, except in case that chosen
-      index is clustered primary key.
+      index is a clustered key.
    */
-    if ((select_limit >= table_records) &&
-        (tab->type == JT_ALL &&
-         tab->join->tables > tab->join->const_tables + 1) &&
-         ((unsigned) best_key != table->s->primary_key ||
-          !table->file->primary_key_is_clustered()))
+    if (best_key < 0 ||
+        ((select_limit_arg >= table_records) &&
+         (tab->type == JT_ALL &&
+          tab->join->tables > tab->join->const_tables + 1) &&
+         !(table->file->index_flags(best_key, 0, 1) & HA_CLUSTERED_INDEX)))
      goto use_filesort;

-    if (best_key >= 0)
+    if (table->quick_keys.is_set(best_key) && best_key != ref_key)
    {
-      if (table->quick_keys.is_set(best_key) && best_key != ref_key)
-      {
-        key_map map;
-        map.clear_all();       // Force the creation of quick select
-        map.set_bit(best_key); // only best_key.
-        select->quick= 0;
-        select->test_quick_select(join->thd, map, 0,
-                                  join->select_options & OPTION_FOUND_ROWS ?
-                                  HA_POS_ERROR :
-                                  join->unit->select_limit_cnt,
-                                  TRUE, FALSE);
-      }
-      order_direction= best_key_direction;
-      /*
-        saved_best_key_parts is actual number of used keyparts found by the
-        test_if_order_by_key function. It could differ from keyinfo->key_parts,
-        thus we have to restore it in case of desc order as it affects
-        QUICK_SELECT_DESC behaviour.
-      */
-      used_key_parts= (order_direction == -1) ?
-        saved_best_key_parts :  best_key_parts;
+      key_map map;
+      map.clear_all();       // Force the creation of quick select
+      map.set_bit(best_key); // only best_key.
+      select->quick= 0;
+      select->test_quick_select(join->thd, map, 0,
+                                join->select_options & OPTION_FOUND_ROWS ?
+                                HA_POS_ERROR :
+                                join->unit->select_limit_cnt,
+                                TRUE, FALSE);
    }
-    else
-      goto use_filesort;
-  } 
+    order_direction= best_key_direction;
+    /*
+      saved_best_key_parts is actual number of used keyparts found by the
+      test_if_order_by_key function. It could differ from keyinfo->key_parts,
+      thus we have to restore it in case of desc order as it affects
+      QUICK_SELECT_DESC behaviour.
+    */
+    used_key_parts= (order_direction == -1) ?
+      saved_best_key_parts :  best_key_parts;
+  }

 check_reverse_order:                  
  DBUG_ASSERT(order_direction != 0);
@@ -16791,8 +16794,8 @@ check_reverse_order:
        {
          tab->ref.key= -1;
          tab->ref.key_parts= 0;
-          if (select_limit < table->file->stats.records) 
-            tab->limit= select_limit;
+          if (best_select_limit < table->file->stats.records)
+            tab->limit= best_select_limit;
        }
      }
      else if (tab->type != JT_ALL)
--- a/sql/sql_table.cc
+++ b/sql/sql_table.cc
@@ -7983,7 +7983,8 @@ copy_data_between_tables(TABLE *from,TABLE *to,

  if (order)
  {
-    if (to->s->primary_key != MAX_KEY && to->file->primary_key_is_clustered())
+    if (to->s->primary_key != MAX_KEY &&
+        to->file->ha_table_flags() & HA_TABLE_SCAN_ON_INDEX)
    {
      char warn_buff[MYSQL_ERRMSG_SIZE];
      my_snprintf(warn_buff, sizeof(warn_buff), 
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -98,10 +98,14 @@ class ha_innobase: public handler
 	Table_flags table_flags() const;
 	ulong index_flags(uint idx, uint part, bool all_parts) const
 	{
+          ulong extra_flag= 0;
+          if (table && idx == table->s->primary_key)
+             extra_flag= HA_CLUSTERED_INDEX;
 	  return (HA_READ_NEXT |
 		  HA_READ_PREV |
 		  HA_READ_ORDER |
 		  HA_READ_RANGE |
+                  extra_flag |
 		  HA_KEYREAD_ONLY);
 	}
 	uint max_supported_keys()	   const { return MAX_KEY; }
--- a/storage/innodb_plugin/handler/ha_innodb.cc
+++ b/storage/innodb_plugin/handler/ha_innodb.cc
@@ -2995,12 +2995,15 @@ UNIV_INTERN
 ulong
 ha_innobase::index_flags(
 /*=====================*/
-	uint,
+	uint index,
 	uint,
 	bool)
 const
 {
-	return(HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+       ulong extra_flag= 0;
+       if (table && index == table->s->primary_key)
+             extra_flag= HA_CLUSTERED_INDEX;
+	return(HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER | extra_flag
 	       | HA_READ_RANGE | HA_KEYREAD_ONLY);
 }

--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -3408,12 +3408,15 @@ UNIV_INTERN
 ulong
 ha_innobase::index_flags(
 /*=====================*/
-	uint,
-	uint,
-	bool)
+	uint index,
+	uint part,
+	bool all_parts)
 const
 {
-	return(HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+       ulong extra_flag= 0;
+       if (table && index == table->s->primary_key)
+             extra_flag= HA_CLUSTERED_INDEX;
+	return(HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER | extra_flag 
 	       | HA_READ_RANGE | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN);
 }