Move JSON histograms code into its own files
sql/CMakeLists.txt
@@ -151,6 +151,7 @@ SET (SQL_SOURCE
                sql_analyze_stmt.cc
                sql_join_cache.cc
                create_options.cc multi_range_read.cc
+               opt_histogram_json.cc
                opt_index_cond_pushdown.cc opt_subselect.cc
                opt_table_elimination.cc sql_expression_cache.cc
                gcalc_slicescan.cc gcalc_tools.cc
sql/opt_histogram_json.cc (new file, 391 lines)
@@ -0,0 +1,391 @@
/*
   Copyright (c) 2021, MariaDB Corporation.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */

#include "mariadb.h"
#include "sql_base.h"
#include "my_json_writer.h"
#include "sql_statistics.h"
#include "opt_histogram_json.h"


class Histogram_json_builder : public Histogram_builder
{
  Histogram_json_hb *histogram;
  uint hist_width;        /* the number of points in the histogram       */
  double bucket_capacity; /* number of rows in a bucket of the histogram */
  uint curr_bucket;       /* number of the current bucket to be built    */

  std::vector<std::string> bucket_bounds;
  bool first_value= true;
public:

  Histogram_json_builder(Histogram_json_hb *hist, Field *col, uint col_len,
                         ha_rows rows)
    : Histogram_builder(col, col_len, rows), histogram(hist)
  {
    bucket_capacity= (double)records / histogram->get_width();
    hist_width= histogram->get_width();
    curr_bucket= 0;
  }

  ~Histogram_json_builder() override = default;

  /*
    @brief
      Add data to the histogram. This call adds elem_cnt rows, each
      of which has value of *elem.

    @detail
      Subsequent next() calls will add values that are greater than *elem.
  */
  int next(void *elem, element_count elem_cnt) override
  {
    counters.next(elem, elem_cnt);
    ulonglong count= counters.get_count();

    if (curr_bucket == hist_width)
      return 0;
    if (first_value)
    {
      first_value= false;
      column->store_field_value((uchar*) elem, col_length);
      StringBuffer<MAX_FIELD_WIDTH> val;
      column->val_str(&val);
      bucket_bounds.push_back(std::string(val.ptr(), val.length()));
    }

    if (count > bucket_capacity * (curr_bucket + 1))
    {
      column->store_field_value((uchar*) elem, col_length);
      StringBuffer<MAX_FIELD_WIDTH> val;
      column->val_str(&val);
      bucket_bounds.emplace_back(val.ptr(), val.length());

      curr_bucket++;
      while (curr_bucket != hist_width &&
             count > bucket_capacity * (curr_bucket + 1))
      {
        bucket_bounds.push_back(std::string(val.ptr(), val.length()));
        curr_bucket++;
      }
    }

    if (records == count && bucket_bounds.size() == hist_width)
    {
      column->store_field_value((uchar*) elem, col_length);
      StringBuffer<MAX_FIELD_WIDTH> val;
      column->val_str(&val);
      bucket_bounds.push_back(std::string(val.ptr(), val.length()));
    }
    return 0;
  }

  /*
    @brief
      Finalize the creation of histogram
  */
  void finalize() override
  {
    Json_writer writer;
    writer.start_object();
    writer.add_member(Histogram_json_hb::JSON_NAME).start_array();

    for(auto& value: bucket_bounds) {
      writer.add_str(value.c_str());
    }
    writer.end_array();
    writer.end_object();
    Binary_string *json_string= (Binary_string *) writer.output.get_string();
    histogram->set_json_text(bucket_bounds.size()-1,
                             (uchar *) json_string->c_ptr());
  }
};


Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
                                                     ha_rows rows)
{
  return new Histogram_json_builder(this, col, col_len, rows);
}
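Editor's note (not part of the patch): the bucket-boundary rule in Histogram_json_builder::next() above is easiest to see on a toy data set. The self-contained sketch below uses made-up values and plain std::string in place of Field/StringBuffer, applies the same equi-height rule, and prints the bounds in the JSON shape that finalize() writes.

// Editor's illustration only -- toy values, not MariaDB code.
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main()
{
  // (value, occurrence count) pairs in ascending value order -- the same
  // stream of next(elem, elem_cnt) calls the builder receives.
  std::vector<std::pair<std::string, unsigned long long>> data=
    {{"a",3}, {"b",1}, {"c",5}, {"d",2}, {"e",4}, {"f",1}, {"g",4}};

  unsigned long long records= 0;
  for (auto &p : data)
    records+= p.second;                        // 20 rows in total

  unsigned hist_width= 4;                      // number of buckets
  double bucket_capacity= (double)records / hist_width;  // 5 rows per bucket

  std::vector<std::string> bounds;
  unsigned long long count= 0;
  unsigned curr_bucket= 0;
  bool first_value= true;

  for (auto &p : data)
  {
    count+= p.second;
    if (curr_bucket == hist_width)
      break;
    if (first_value)                           // overall minimum
    {
      first_value= false;
      bounds.push_back(p.first);
    }
    if (count > bucket_capacity * (curr_bucket + 1))
    {                                          // close the current bucket
      bounds.push_back(p.first);
      curr_bucket++;
      while (curr_bucket != hist_width &&
             count > bucket_capacity * (curr_bucket + 1))
      {                                        // a very frequent value may close several buckets
        bounds.push_back(p.first);
        curr_bucket++;
      }
    }
    if (records == count && bounds.size() == hist_width)
      bounds.push_back(p.first);               // overall maximum
  }

  // Print in the shape finalize() produces: n_buckets + 1 bounds.
  printf("{\"histogram_hb_v1\": [");
  for (size_t i= 0; i < bounds.size(); i++)
    printf("%s\"%s\"", i ? ", " : "", bounds[i].c_str());
  printf("]}\n");   // {"histogram_hb_v1": ["a", "c", "d", "f", "g"]}
  return 0;
}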
void Histogram_json_hb::init_for_collection(MEM_ROOT *mem_root,
                                            Histogram_type htype_arg,
                                            ulonglong size_arg)
{
  DBUG_ASSERT(htype_arg == JSON_HB);
  size= (uint8) size_arg;
}


/*
  @brief
    Parse the histogram from its on-disk representation

  @return
     false  OK
     True   Error
*/

bool Histogram_json_hb::parse(MEM_ROOT *mem_root, Field *field,
                              Histogram_type type_arg, const char *hist_data,
                              size_t hist_data_len)
{
  DBUG_ENTER("Histogram_json_hb::parse");
  DBUG_ASSERT(type_arg == JSON_HB);
  const char *err;
  json_engine_t je;
  json_string_t key_name;

  json_scan_start(&je, &my_charset_utf8mb4_bin,
                  (const uchar*)hist_data,
                  (const uchar*)hist_data+hist_data_len);

  if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
  {
    err= "Root JSON element must be a JSON object";
    goto error;
  }

  json_string_set_str(&key_name, (const uchar*)JSON_NAME,
                      (const uchar*)JSON_NAME + strlen(JSON_NAME));
  json_string_set_cs(&key_name, system_charset_info);

  if (json_scan_next(&je) || je.state != JST_KEY ||
      !json_key_matches(&je, &key_name))
  {
    err= "The first key in the object must be histogram_hb_v1";
    goto error;
  }

  // The value must be a JSON array
  if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
  {
    err= "A JSON array expected";
    goto error;
  }

  // Read the array
  while (!json_scan_next(&je))
  {
    switch(je.state)
    {
      case JST_VALUE:
      {
        const char *val;
        int val_len;
        json_smart_read_value(&je, &val, &val_len);
        if (je.value_type != JSON_VALUE_STRING &&
            je.value_type != JSON_VALUE_NUMBER &&
            je.value_type != JSON_VALUE_TRUE &&
            je.value_type != JSON_VALUE_FALSE)
        {
          err= "Scalar value expected";
          goto error;
        }
        uchar buf[MAX_KEY_LENGTH];
        uint len_to_copy= field->key_length();
        field->store_text(val, val_len, &my_charset_bin);
        uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
        histogram_bounds.push_back(std::string((char*)buf, bytes));
        // TODO: Should we also compare this endpoint with the previous
        // to verify that the ordering is right?
        break;
      }
      case JST_ARRAY_END:
        break;
    }
  }
  // n_buckets = n_bounds - 1 :
  size= histogram_bounds.size()-1;
  DBUG_RETURN(false);

error:
  my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
           je.s.c_str - (const uchar*)hist_data);
  DBUG_RETURN(true);
}


static
void store_key_image_to_rec_no_null(Field *field, const uchar *ptr)
{
  MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
                                               &field->table->write_set);
  field->set_key_image(ptr, field->key_length());
  dbug_tmp_restore_column_map(&field->table->write_set, old_map);
}


static
double position_in_interval(Field *field, const uchar *key,
                            const std::string& left, const std::string& right)
{
  double res;
  if (field->pos_through_val_str())
  {
    uint32 min_len= uint2korr(left.data());
    uint32 max_len= uint2korr(right.data());
    uint32 midp_len= uint2korr(key);

    res= pos_in_interval_for_string(field->charset(),
                                    key + HA_KEY_BLOB_LENGTH,
                                    midp_len,
                                    (const uchar*)left.data() + HA_KEY_BLOB_LENGTH,
                                    min_len,
                                    (const uchar*)right.data() + HA_KEY_BLOB_LENGTH,
                                    max_len);
  }
  else
  {
    store_key_image_to_rec_no_null(field, (const uchar*)left.data());
    double min_val_real= field->val_real();

    store_key_image_to_rec_no_null(field, (const uchar*)right.data());
    double max_val_real= field->val_real();

    store_key_image_to_rec_no_null(field, key);
    double midp_val_real= field->val_real();

    res= pos_in_interval_for_double(midp_val_real, min_val_real, max_val_real);
  }
  return res;
}


double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
                                            double avg_sel)
{
  double sel;
  store_key_image_to_rec(field, (uchar *) endpoint->key,
                         field->key_length());
  const uchar *min_key = endpoint->key;
  if (field->real_maybe_null())
    min_key++;
  uint min_idx= find_bucket(field, min_key, false);

  uint max_idx= find_bucket(field, min_key, true);
#if 0
  // find how many buckets this value occupies
  while ((max_idx + 1 < get_width() ) &&
         (field->key_cmp((uchar *)histogram_bounds[max_idx + 1].data(), min_key) == 0)) {
    max_idx++;
  }
#endif
  if (max_idx > min_idx)
  {
    // value spans multiple buckets
    double bucket_sel= 1.0/(get_width() + 1);
    sel= bucket_sel * (max_idx - min_idx + 1);
  }
  else
  {
    // the value fits within a single bucket
    sel = MY_MIN(avg_sel, 1.0/get_width());
  }
  return sel;
}


/*
  @param field    The table field histogram is for.  We don't care about the
                  field's current value, we only need its virtual functions to
                  perform various operations

  @param min_endp Left endpoint, or NULL if there is none
  @param max_endp Right endpoint, or NULL if there is none
*/

double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
                                            key_range *max_endp)
{
  double min, max;
  double width= 1.0 / histogram_bounds.size();

  if (min_endp && !(field->null_ptr && min_endp->key[0]))
  {
    bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY)? true: false;
    const uchar *min_key= min_endp->key;
    if (field->real_maybe_null())
      min_key++;

    // Find the leftmost bucket that contains the lookup value.
    // (If the lookup value is to the left of all buckets, find bucket #0)
    int idx= find_bucket(field, min_key, exclusive_endp);
    double min_sel= position_in_interval(field, (const uchar*)min_key,
                                         histogram_bounds[idx],
                                         histogram_bounds[idx+1]);
    min= idx*width + min_sel*width;
  }
  else
    min= 0.0;

  if (max_endp)
  {
    // The right endpoint cannot be NULL
    DBUG_ASSERT(!(field->null_ptr && max_endp->key[0]));
    bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY)? true: false;
    const uchar *max_key= max_endp->key;
    if (field->real_maybe_null())
      max_key++;

    int idx= find_bucket(field, max_key, inclusive_endp);
    double max_sel= position_in_interval(field, (const uchar*)max_key,
                                         histogram_bounds[idx],
                                         histogram_bounds[idx+1]);
    max= idx*width + max_sel*width;
  }
  else
    max= 1.0;

  double sel = max - min;
  return sel;
}


void Histogram_json_hb::serialize(Field *field)
{
  field->store(json_text.data(), json_text.size(), &my_charset_bin);
}


/*
  Find the histogram bucket that contains the value.

  @param equal_is_less Controls what to do if a histogram bound is equal to the
                       lookup_val.
*/

int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
                                   bool equal_is_less)
{
  int low= 0;
  int high= histogram_bounds.size() - 1;
  int middle;

  while (low + 1 < high)
  {
    middle= (low + high) / 2;
    int res= field->key_cmp((uchar*)histogram_bounds[middle].data(), lookup_val);
    if (!res)
      res= equal_is_less? -1: 1;
    if (res < 0)
      low= middle;
    else //res > 0
      high= middle;
  }

  return low;
}
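Editor's note (not part of the patch): find_bucket() above returns the index of the bucket whose [bounds[i], bounds[i+1]) interval holds the lookup value, and equal_is_less decides which side a value that exactly equals a bound falls on; point_selectivity() calls it once with each setting to get the first and last bucket a value can occupy. A self-contained sketch with made-up string bounds, using std::string::compare() in place of Field::key_cmp():

// Editor's illustration only -- toy values, not MariaDB code.
#include <cstdio>
#include <string>
#include <vector>

// Same binary search as Histogram_json_hb::find_bucket().
static int find_bucket(const std::vector<std::string> &bounds,
                       const std::string &lookup_val, bool equal_is_less)
{
  int low= 0;
  int high= (int)bounds.size() - 1;
  while (low + 1 < high)
  {
    int middle= (low + high) / 2;
    int res= bounds[middle].compare(lookup_val);
    if (!res)
      res= equal_is_less ? -1 : 1;
    if (res < 0)
      low= middle;
    else
      high= middle;
  }
  return low;
}

int main()
{
  // 5 bounds = 4 buckets: ["a","c"), ["c","d"), ["d","f"), ["f","g"]
  std::vector<std::string> bounds= {"a", "c", "d", "f", "g"};

  printf("%d\n", find_bucket(bounds, "e", false)); // 2: "e" lies in ["d","f")
  // A lookup that equals the bound "d":
  printf("%d\n", find_bucket(bounds, "d", false)); // 1: the bound counts as greater, search goes left
  printf("%d\n", find_bucket(bounds, "d", true));  // 2: the bound counts as less, search goes right
  return 0;
}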
sql/opt_histogram_json.h (new file, 95 lines)
@@ -0,0 +1,95 @@
/*
   Copyright (c) 2021, MariaDB Corporation.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */

#include "sql_statistics.h"

/*
  An equi-height histogram which stores real values for bucket bounds.

  Handles @@histogram_type=JSON_HB
*/

class Histogram_json_hb : public Histogram_base
{
  size_t size; /* Number of elements in the histogram */

  /* Collection-time only: collected histogram in the JSON form. */
  std::string json_text;

  // Array of histogram bucket endpoints in KeyTupleFormat.
  std::vector<std::string> histogram_bounds;

public:
  static constexpr const char* JSON_NAME="histogram_hb_v1";

  bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
             const char *hist_data, size_t hist_data_len) override;

  void serialize(Field *field) override;

  Histogram_builder *create_builder(Field *col, uint col_len,
                                    ha_rows rows) override;

  // returns number of buckets in the histogram
  uint get_width() override
  {
    return (uint)size;
  }

  Histogram_type get_type() override
  {
    return JSON_HB;
  }

  /*
    @brief
      Legacy: this returns the size of the histogram on disk.

    @detail
      This is only called at collection time when json_text is non-empty.
  */
  uint get_size() override
  {
    return json_text.size();
  }

  void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
                           ulonglong size) override;

  bool is_available() override {return true; }

  bool is_usable(THD *thd) override
  {
    return thd->variables.optimizer_use_condition_selectivity > 3 &&
           is_available();
  }

  double point_selectivity(Field *field, key_range *endpoint,
                           double avg_selection) override;
  double range_selectivity(Field *field, key_range *min_endp,
                           key_range *max_endp) override;

  void set_json_text(ulonglong sz, uchar *json_text_arg)
  {
    size = (uint8) sz;
    json_text.assign((const char*)json_text_arg,
                     strlen((const char*)json_text_arg));
  }

private:
  int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
};
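Editor's note (not part of the patch): range_selectivity() in the new .cc file above treats each stored bound interval as holding an equal share of the rows: with n stored bounds, width = 1.0/n, and each endpoint maps to bucket_index*width plus its interpolated fraction within that bucket (from position_in_interval()). A self-contained sketch of that arithmetic with made-up numbers:

// Editor's illustration only -- toy numbers, not MariaDB code.
#include <cstdio>

int main()
{
  double width= 1.0 / 5;   // histogram_bounds.size() == 5, i.e. 4 buckets

  // Lower endpoint: falls in bucket #1, 30% of the way through it.
  double min= 1*width + 0.30*width;

  // Upper endpoint: falls in bucket #3, 50% of the way through it.
  double max= 3*width + 0.50*width;

  printf("estimated selectivity = %.3f\n", max - min);  // 0.700 - 0.260 = 0.440
  return 0;
}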
sql/sql_statistics.cc
@@ -28,11 +28,11 @@
 #include "sql_base.h"
 #include "key.h"
 #include "sql_statistics.h"
+#include "opt_histogram_json.h"
 #include "opt_range.h"
 #include "uniques.h"
 #include "sql_show.h"
 #include "sql_partition.h"
-#include "my_json_writer.h"
 
 #include <vector>
 #include <string>
sql/sql_statistics.cc
@@ -1276,273 +1276,6 @@ void Histogram_binary::init_for_collection(MEM_ROOT *mem_root,
 }
 
[267 removed lines, elided here: the definitions of
 Histogram_json_hb::init_for_collection(), Histogram_json_hb::parse(),
 Histogram_json_hb::point_selectivity(), store_key_image_to_rec_no_null(),
 position_in_interval(), Histogram_json_hb::range_selectivity(),
 Histogram_json_hb::serialize() and Histogram_json_hb::find_bucket().
 They reappear unchanged in the new sql/opt_histogram_json.cc shown above.]
 
 /*
   An object of the class Index_stat is created to read statistical
   data on tables from the statistical table table_stat, to update
sql/sql_statistics.cc
@@ -1853,73 +1586,6 @@ public:
   }
 };
 
[67 removed lines, elided here: the Basic_stats_collector and
 Histogram_builder class definitions. They were moved, unchanged, into
 sql/sql_statistics.h; see the @@ -353,75 +367,72 @@ hunk below.]
 
 class Histogram_binary_builder : public Histogram_builder
 {
   Field *min_value;        /* pointer to the minimal value for the field   */
sql/sql_statistics.cc
@@ -1974,101 +1640,6 @@ Histogram_builder *Histogram_binary::create_builder(Field *col, uint col_len,
 }
 
 
-class Histogram_json_builder : public Histogram_builder
-{
-  Histogram_json_hb *histogram;
-  uint hist_width;        /* the number of points in the histogram       */
-  double bucket_capacity; /* number of rows in a bucket of the histogram */
-  uint curr_bucket;       /* number of the current bucket to be built    */
-
-  std::vector<std::string> bucket_bounds;
-  bool first_value= true;
-public:
-  Histogram_json_builder(Field *col, uint col_len, ha_rows rows)
-    : Histogram_builder(col, col_len, rows)
-  {
-    histogram= (Histogram_json_hb*)col->collected_stats->histogram;
-    bucket_capacity= (double)records / histogram->get_width();
-    hist_width= histogram->get_width();
-    curr_bucket= 0;
-  }
-
-  ~Histogram_json_builder() override = default;
-
-  /*
-    Add data to the histogram. Adding Element elem which encountered elem_cnt
-    times.
-  */
-  int next(void *elem, element_count elem_cnt) override
-  {
-    counters.next(elem, elem_cnt);
-    ulonglong count= counters.get_count();
-
-    if (curr_bucket == hist_width)
-      return 0;
-    if (first_value)
-    {
-      first_value= false;
-      column->store_field_value((uchar*) elem, col_length);
-      StringBuffer<MAX_FIELD_WIDTH> val;
-      column->val_str(&val);
-      bucket_bounds.push_back(std::string(val.ptr(), val.length()));
-    }
-
-    if (count > bucket_capacity * (curr_bucket + 1))
-    {
-      column->store_field_value((uchar*) elem, col_length);
-      StringBuffer<MAX_FIELD_WIDTH> val;
-      column->val_str(&val);
-      bucket_bounds.emplace_back(val.ptr(), val.length());
-
-      curr_bucket++;
-      while (curr_bucket != hist_width &&
-             count > bucket_capacity * (curr_bucket + 1))
-      {
-        bucket_bounds.push_back(std::string(val.ptr(), val.length()));
-        curr_bucket++;
-      }
-    }
-
-    if (records == count && bucket_bounds.size() == hist_width)
-    {
-      column->store_field_value((uchar*) elem, col_length);
-      StringBuffer<MAX_FIELD_WIDTH> val;
-      column->val_str(&val);
-      bucket_bounds.push_back(std::string(val.ptr(), val.length()));
-    }
-    return 0;
-  }
-
-  /*
-    Finalize the creation of histogram
-  */
-  void finalize() override
-  {
-    Json_writer writer;
-    writer.start_object();
-    writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
-
-    for(auto& value: bucket_bounds) {
-      writer.add_str(value.c_str());
-    }
-    writer.end_array();
-    writer.end_object();
-    Binary_string *json_string = (Binary_string *) writer.output.get_string();
-    histogram->set_json_text(bucket_bounds.size()-1,
-                             (uchar *) json_string->c_ptr());
-  }
-};
-
-
-Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
-                                                     ha_rows rows)
-{
-  return new Histogram_json_builder(col, col_len, rows);
-}
 Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type,
                                  THD *owner)
 {
sql/sql_statistics.h
@@ -162,11 +162,18 @@ public:
 
   virtual uint get_width()=0;
 
-  virtual Histogram_builder *create_builder(Field *col, uint col_len,
-                                            ha_rows rows)=0;
+  /*
+    The creation-time workflow is:
+     * create a histogram
+     * init_for_collection()
+     * create_builder()
+     * feed the data to the builder
+     * serialize();
+  */
   virtual void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
                                    ulonglong size)=0;
+  virtual Histogram_builder *create_builder(Field *col, uint col_len,
+                                            ha_rows rows)=0;
 
   virtual bool is_available()=0;
sql/sql_statistics.h
@@ -177,19 +184,26 @@ public:
   virtual double range_selectivity(Field *field, key_range *min_endp,
                                    key_range *max_endp)=0;
 
-  // Legacy: return the size of the histogram on disk.
-  // This will be stored in mysql.column_stats.hist_size column.
-  // Newer, JSON-based histograms may return 0.
+  /*
+    Legacy: return the size of the histogram on disk.
+
+    This will be stored in mysql.column_stats.hist_size column.
+    The value is not really needed as one can look at
+    LENGTH(mysql.column_stats.histogram) directly.
+  */
   virtual uint get_size()=0;
   virtual ~Histogram_base()= default;
 
   Histogram_base() : owner(NULL) {}
 
+  /*
+    Memory management: a histogram may be (exclusively) "owned" by a particular
+    thread (done for histograms that are being collected). By default, a
+    histogram has owner==NULL and is not owned by any particular thread.
+  */
   THD *get_owner() { return owner; }
   void set_owner(THD *thd) { owner=thd; }
 private:
-  // Owner is a thread that *exclusively* owns this histogram (and so can
-  // delete it at any time)
   THD *owner;
 };
 
sql/sql_statistics.h
@@ -353,75 +367,72 @@ public:
 
 
-/*
-  An equi-height histogram which stores real values for bucket bounds.
-
-  Handles @@histogram_type=JSON_HB
-*/
-
-class Histogram_json_hb : public Histogram_base
-{
-  private:
-  size_t size; /* Number of elements in the histogram */
-
-  /* Collection-time only: collected histogram in the JSON form. */
-  std::string json_text;
-
-  // Array of histogram bucket endpoints in KeyTupleFormat.
-  std::vector<std::string> histogram_bounds;
-
-public:
-  static constexpr const char* JSON_NAME="histogram_hb_v1";
-
-  bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
-             const char *hist_data, size_t hist_data_len) override;
-
-  void serialize(Field *field) override;
-
-  Histogram_builder *create_builder(Field *col, uint col_len,
-                                    ha_rows rows) override;
-
-  // returns number of buckets in the histogram
-  uint get_width() override
-  {
-    return (uint)size;
-  }
-
-  Histogram_type get_type() override
-  {
-    return JSON_HB;
-  }
-
-  void set_json_text(ulonglong sz, uchar *json_text_arg)
-  {
-    size = (uint8) sz;
-    json_text.assign((const char*)json_text_arg,
-                     strlen((const char*)json_text_arg));
-  }
-
-  uint get_size() override
-  {
-    return size;
-  }
-
-  void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
-                           ulonglong size) override;
-
-  bool is_available() override {return true; }
-
-  bool is_usable(THD *thd) override
-  {
-    return thd->variables.optimizer_use_condition_selectivity > 3 &&
-           is_available();
-  }
-
-  double point_selectivity(Field *field, key_range *endpoint,
-                           double avg_selection) override;
-  double range_selectivity(Field *field, key_range *min_endp,
-                           key_range *max_endp) override;
-private:
-  int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
-};
+/*
+  This is used to collect the the basic statistics from a Unique object:
+   - count of values
+   - count of distinct values
+   - count of distinct values that have occurred only once
+*/
+
+class Basic_stats_collector
+{
+  ulonglong count;          /* number of values retrieved           */
+  ulonglong count_distinct; /* number of distinct values retrieved  */
+  /* number of distinct values that occured only once */
+  ulonglong count_distinct_single_occurence;
+
+public:
+  Basic_stats_collector()
+  {
+    count= 0;
+    count_distinct= 0;
+    count_distinct_single_occurence= 0;
+  }
+
+  ulonglong get_count_distinct() const { return count_distinct; }
+  ulonglong get_count_single_occurence() const
+  {
+    return count_distinct_single_occurence;
+  }
+  ulonglong get_count() const { return count; }
+
+  void next(void *elem, element_count elem_cnt)
+  {
+    count_distinct++;
+    if (elem_cnt == 1)
+      count_distinct_single_occurence++;
+    count+= elem_cnt;
+  }
+};
+
+/*
+  Histogram_builder is a helper class that is used to build histograms
+  for columns.
+
+  Do not create directly, call Histogram->get_builder(...);
+*/
+
+class Histogram_builder
+{
+protected:
+  Field *column;           /* table field for which the histogram is built */
+  uint col_length;         /* size of this field                           */
+  ha_rows records;         /* number of records the histogram is built for */
+
+  Histogram_builder(Field *col, uint col_len, ha_rows rows) :
+    column(col), col_length(col_len), records(rows)
+  {}
+
+public:
+  // A histogram builder will also collect the counters
+  Basic_stats_collector counters;
+
+  virtual int next(void *elem, element_count elem_cnt)=0;
+  virtual void finalize()=0;
+  virtual ~Histogram_builder(){}
+};
 
 
 class Columns_statistics;
 class Index_statistics;
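Editor's note (not part of the patch): Basic_stats_collector, moved into this header above, receives one next(elem, elem_cnt) call per distinct value and derives its three counters from that stream. A self-contained sketch with made-up counts:

// Editor's illustration only -- toy values, not MariaDB code.
#include <cstdio>

int main()
{
  unsigned long long counts[]= {3, 1, 5, 1};   // one entry per distinct value
  unsigned long long count= 0, count_distinct= 0, single= 0;

  for (unsigned long long c : counts)
  {
    count_distinct++;      // next() is called once per distinct value
    if (c == 1)
      single++;            // values that occurred exactly once
    count+= c;             // total number of rows
  }

  printf("count=%llu distinct=%llu single_occurrence=%llu\n",
         count, count_distinct, single);  // count=10 distinct=4 single_occurrence=2
  return 0;
}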