use tree for count(distinct) when possible

2025-08-01 03:47:19 +03:00 · 2001-05-11 15:07:34 -06:00
parent a12117f036
commit c706bf40f3
4 changed files with 116 additions and 4 deletions
--- a/include/my_tree.h
+++ b/include/my_tree.h
@ -57,7 +57,7 @@ typedef struct st_tree {
  void (*free)(void *);
 } TREE;
-	/* Functions on hole tree */
+	/* Functions on whole tree */
 void init_tree(TREE *tree,uint default_alloc_size, int element_size,
 	       qsort_cmp2 compare, my_bool with_delete,
 	       void (*free_element)(void*));
--- a/mysys/tree.c
+++ b/mysys/tree.c
@ -84,7 +84,7 @@ void init_tree(TREE *tree, uint default_alloc_size, int size,
      ((uint) size <= sizeof(void*) || ((uint) size & (sizeof(void*)-1))))
  {
    tree->offset_to_key=sizeof(TREE_ELEMENT); /* Put key after element */
-    /* Fix allocation size so that we don't loose any memory */
+    /* Fix allocation size so that we don't lose any memory */
    default_alloc_size/=(sizeof(TREE_ELEMENT)+size);
    if (!default_alloc_size)
      default_alloc_size=1;
--- a/sql/item_sum.cc
+++ b/sql/item_sum.cc
@ -788,11 +788,56 @@ String *Item_std_field::val_str(String *str)
 #include "sql_select.h"
 static int simple_raw_key_cmp(void* arg, byte* key1, byte* key2)
 {
  return memcmp(key1, key2, (int)arg);
 }
 static int simple_str_key_cmp(void* arg, byte* key1, byte* key2)
 {
  return my_sortcmp(key1, key2, (int)arg);
 }
 // did not make this one static - at least gcc gets confused when
 // I try to declare a static function as a friend. If you can figure
 // out the syntax to make a static function a friend, make this one
 // static
 int composite_key_cmp(void* arg, byte* key1, byte* key2)
 {
  Item_sum_count_distinct* item = (Item_sum_count_distinct*)arg;
  Field** field = item->table->field, **field_end;
  field_end = field + item->table->fields;
  for(; field < field_end; ++field)
    {
      int res;
      int len = (*field)->field_length;
      switch((*field)->type())
 	{
 	case FIELD_TYPE_STRING:
 	case FIELD_TYPE_VAR_STRING:
 	  res = my_sortcmp(key1, key2, len);
 	  break;
 	default:
 	  res = memcmp(key1, key2, len);
 	  break;
 	}
      if(res)
 	return res;
      key1 += len;
      key2 += len;
    }
  return 0;
 }
 Item_sum_count_distinct::~Item_sum_count_distinct()
 {
  if (table)
    free_tmp_table(current_thd, table);
  delete tmp_table_param;
  if(use_tree)
    delete_tree(&tree);
 }
@ -821,6 +866,53 @@ bool Item_sum_count_distinct::setup(THD *thd)
 			       0, 0, current_lex->options | thd->options)))
    return 1;
  table->file->extra(HA_EXTRA_NO_ROWS);		// Don't update rows
  if(table->db_type == DB_TYPE_HEAP) // no blobs, otherwise it would be
    // MyISAM
    {
      qsort_cmp2 compare_key;
      void* cmp_arg;
      int key_len;
      if(table->fields == 1) // if we have only one field, which is
 	// the most common use of count(distinct), it is much faster
 	// to use a simpler key compare method that can take advantage
 	// of not having to worry about other fields
 	{
 	  switch(table->field[0]->type())
 	    {
 	      // if we have a string, we must take care of charsets
 	      // and case sensitivity
 	    case FIELD_TYPE_STRING:
 	    case FIELD_TYPE_VAR_STRING:
 	      compare_key = (qsort_cmp2)simple_str_key_cmp;
 	      break;
 	    default: // since at this point we cannot have blobs
 	      // anything else can be compared with memcmp
 	      compare_key = (qsort_cmp2)simple_raw_key_cmp;
 	      break;
 	    }
 	  cmp_arg = (void*)(key_len = table->field[0]->field_length);
 	  rec_offset = 1;
 	}
      else // too bad, cannot cheat - there is more than one field
 	{
 	  cmp_arg = (void*)this;
 	  compare_key = (qsort_cmp2)composite_key_cmp;
 	  Field** field, **field_end;
 	  field_end = (field = table->field) + table->fields;
 	  for(key_len = 0; field < field_end; ++field)
 	    {
 	      key_len += (*field)->field_length;
 	    }
 	  rec_offset = table->reclength - key_len;
 	}
      init_tree(&tree, 0, key_len, compare_key, 0, 0);
      tree.cmp_arg = cmp_arg;
      use_tree = 1;
    }
  return 0;
 }
@ -830,6 +922,8 @@ void Item_sum_count_distinct::reset()
  table->file->extra(HA_EXTRA_NO_CACHE);
  table->file->delete_all_rows();
  table->file->extra(HA_EXTRA_WRITE_CACHE);
  if(use_tree)
    delete_tree(&tree);
  (void) add();
 }
@ -843,7 +937,12 @@ bool Item_sum_count_distinct::add()
    if ((*field)->is_real_null(0))
      return 0;					// Don't count NULL
-  if ((error=table->file->write_row(table->record[0])))
+  if(use_tree)
    {
      if(!tree_insert(&tree, table->record[0] + rec_offset, 0))
 	return 1;
    }
  else if ((error=table->file->write_row(table->record[0])))
  {
    if (error != HA_ERR_FOUND_DUPP_KEY &&
 	error != HA_ERR_FOUND_DUPP_UNIQUE)
@ -859,6 +958,8 @@ longlong Item_sum_count_distinct::val_int()
 {
  if (!table)					// Empty query
    return LL(0);
  if(use_tree)
    return tree.elements_in_tree;
  table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK);
  return table->file->records;
 }
--- a/sql/item_sum.h
+++ b/sql/item_sum.h
@ -21,6 +21,8 @@
 #pragma interface			/* gcc class implementation */
 #endif
 #include <my_tree.h>
 class Item_sum :public Item_result_field
 {
 public:
@ -145,11 +147,20 @@ class Item_sum_count_distinct :public Item_sum_int
  table_map used_table_cache;
  bool fix_fields(THD *thd,TABLE_LIST *tables);
  TMP_TABLE_PARAM *tmp_table_param;
  TREE tree;
  bool use_tree; // If there are no blobs, we can use a tree, which
  // is faster than heap table. In that case, we still use the table
  // to help get things set up, but we insert nothing in it
  int rec_offset; // the first few bytes of record ( at least one)
  // are just markers for deleted and NULLs. We want to skip them since
  // they will just bloat the tree without providing any valuable info
  friend int composite_key_cmp(void* arg, byte* key1, byte* key2);
  public:
  Item_sum_count_distinct(List<Item> &list)
    :Item_sum_int(list),table(0),used_table_cache(~(table_map) 0),
-    tmp_table_param(0)
+     tmp_table_param(0),use_tree(0)
  { quick_group=0; }
  ~Item_sum_count_distinct();
  table_map used_tables() const { return used_table_cache; }