mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
Move JSON histograms code into its own files
This commit is contained in:
@@ -151,6 +151,7 @@ SET (SQL_SOURCE
|
||||
sql_analyze_stmt.cc
|
||||
sql_join_cache.cc
|
||||
create_options.cc multi_range_read.cc
|
||||
opt_histogram_json.cc
|
||||
opt_index_cond_pushdown.cc opt_subselect.cc
|
||||
opt_table_elimination.cc sql_expression_cache.cc
|
||||
gcalc_slicescan.cc gcalc_tools.cc
|
||||
|
391
sql/opt_histogram_json.cc
Normal file
391
sql/opt_histogram_json.cc
Normal file
@@ -0,0 +1,391 @@
|
||||
/*
|
||||
Copyright (c) 2021, MariaDB Corporation.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; version 2 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
|
||||
|
||||
#include "mariadb.h"
|
||||
#include "sql_base.h"
|
||||
#include "my_json_writer.h"
|
||||
#include "sql_statistics.h"
|
||||
#include "opt_histogram_json.h"
|
||||
|
||||
class Histogram_json_builder : public Histogram_builder
|
||||
{
|
||||
Histogram_json_hb *histogram;
|
||||
uint hist_width; /* the number of points in the histogram */
|
||||
double bucket_capacity; /* number of rows in a bucket of the histogram */
|
||||
uint curr_bucket; /* number of the current bucket to be built */
|
||||
|
||||
std::vector<std::string> bucket_bounds;
|
||||
bool first_value= true;
|
||||
public:
|
||||
|
||||
Histogram_json_builder(Histogram_json_hb *hist, Field *col, uint col_len,
|
||||
ha_rows rows)
|
||||
: Histogram_builder(col, col_len, rows), histogram(hist)
|
||||
{
|
||||
bucket_capacity= (double)records / histogram->get_width();
|
||||
hist_width= histogram->get_width();
|
||||
curr_bucket= 0;
|
||||
}
|
||||
|
||||
~Histogram_json_builder() override = default;
|
||||
|
||||
/*
|
||||
@brief
|
||||
Add data to the histogram. This call adds elem_cnt rows, each
|
||||
of which has value of *elem.
|
||||
|
||||
@detail
|
||||
Subsequent next() calls will add values that are greater than *elem.
|
||||
*/
|
||||
int next(void *elem, element_count elem_cnt) override
|
||||
{
|
||||
counters.next(elem, elem_cnt);
|
||||
ulonglong count= counters.get_count();
|
||||
|
||||
if (curr_bucket == hist_width)
|
||||
return 0;
|
||||
if (first_value)
|
||||
{
|
||||
first_value= false;
|
||||
column->store_field_value((uchar*) elem, col_length);
|
||||
StringBuffer<MAX_FIELD_WIDTH> val;
|
||||
column->val_str(&val);
|
||||
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
|
||||
}
|
||||
|
||||
if (count > bucket_capacity * (curr_bucket + 1))
|
||||
{
|
||||
column->store_field_value((uchar*) elem, col_length);
|
||||
StringBuffer<MAX_FIELD_WIDTH> val;
|
||||
column->val_str(&val);
|
||||
bucket_bounds.emplace_back(val.ptr(), val.length());
|
||||
|
||||
curr_bucket++;
|
||||
while (curr_bucket != hist_width &&
|
||||
count > bucket_capacity * (curr_bucket + 1))
|
||||
{
|
||||
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
|
||||
curr_bucket++;
|
||||
}
|
||||
}
|
||||
|
||||
if (records == count && bucket_bounds.size() == hist_width)
|
||||
{
|
||||
column->store_field_value((uchar*) elem, col_length);
|
||||
StringBuffer<MAX_FIELD_WIDTH> val;
|
||||
column->val_str(&val);
|
||||
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@brief
|
||||
Finalize the creation of histogram
|
||||
*/
|
||||
void finalize() override
|
||||
{
|
||||
Json_writer writer;
|
||||
writer.start_object();
|
||||
writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
|
||||
|
||||
for(auto& value: bucket_bounds) {
|
||||
writer.add_str(value.c_str());
|
||||
}
|
||||
writer.end_array();
|
||||
writer.end_object();
|
||||
Binary_string *json_string= (Binary_string *) writer.output.get_string();
|
||||
histogram->set_json_text(bucket_bounds.size()-1,
|
||||
(uchar *) json_string->c_ptr());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*
  Allocate (with new) a builder that will collect data for this histogram
  object.
*/
Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
                                                     ha_rows rows)
{
  Histogram_builder *builder=
    new Histogram_json_builder(this, col, col_len, rows);
  return builder;
}
|
||||
|
||||
|
||||
/*
  Prepare the histogram object for collecting a new histogram.

  @param mem_root   Not used by this histogram type
  @param htype_arg  Must be JSON_HB (checked in debug builds only)
  @param size_arg   Requested histogram size; get_width() reports it
*/
void Histogram_json_hb::init_for_collection(MEM_ROOT *mem_root,
                                            Histogram_type htype_arg,
                                            ulonglong size_arg)
{
  DBUG_ASSERT(htype_arg == JSON_HB);
  /* 'size' is a size_t: do not squeeze the requested value through uint8 */
  size= (size_t) size_arg;
}
|
||||
|
||||
|
||||
/*
|
||||
@brief
|
||||
Parse the histogram from its on-disk representation
|
||||
|
||||
@return
|
||||
false OK
|
||||
True Error
|
||||
*/
|
||||
|
||||
bool Histogram_json_hb::parse(MEM_ROOT *mem_root, Field *field,
|
||||
Histogram_type type_arg, const char *hist_data,
|
||||
size_t hist_data_len)
|
||||
{
|
||||
DBUG_ENTER("Histogram_json_hb::parse");
|
||||
DBUG_ASSERT(type_arg == JSON_HB);
|
||||
const char *err;
|
||||
json_engine_t je;
|
||||
json_string_t key_name;
|
||||
|
||||
json_scan_start(&je, &my_charset_utf8mb4_bin,
|
||||
(const uchar*)hist_data,
|
||||
(const uchar*)hist_data+hist_data_len);
|
||||
|
||||
if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
|
||||
{
|
||||
err= "Root JSON element must be a JSON object";
|
||||
goto error;
|
||||
}
|
||||
|
||||
json_string_set_str(&key_name, (const uchar*)JSON_NAME,
|
||||
(const uchar*)JSON_NAME + strlen(JSON_NAME));
|
||||
json_string_set_cs(&key_name, system_charset_info);
|
||||
|
||||
if (json_scan_next(&je) || je.state != JST_KEY ||
|
||||
!json_key_matches(&je, &key_name))
|
||||
{
|
||||
err= "The first key in the object must be histogram_hb_v1";
|
||||
goto error;
|
||||
}
|
||||
|
||||
// The value must be a JSON array
|
||||
if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
|
||||
{
|
||||
err= "A JSON array expected";
|
||||
goto error;
|
||||
}
|
||||
|
||||
// Read the array
|
||||
while (!json_scan_next(&je))
|
||||
{
|
||||
switch(je.state)
|
||||
{
|
||||
case JST_VALUE:
|
||||
{
|
||||
const char *val;
|
||||
int val_len;
|
||||
json_smart_read_value(&je, &val, &val_len);
|
||||
if (je.value_type != JSON_VALUE_STRING &&
|
||||
je.value_type != JSON_VALUE_NUMBER &&
|
||||
je.value_type != JSON_VALUE_TRUE &&
|
||||
je.value_type != JSON_VALUE_FALSE)
|
||||
{
|
||||
err= "Scalar value expected";
|
||||
goto error;
|
||||
}
|
||||
uchar buf[MAX_KEY_LENGTH];
|
||||
uint len_to_copy= field->key_length();
|
||||
field->store_text(val, val_len, &my_charset_bin);
|
||||
uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
|
||||
histogram_bounds.push_back(std::string((char*)buf, bytes));
|
||||
// TODO: Should we also compare this endpoint with the previous
|
||||
// to verify that the ordering is right?
|
||||
break;
|
||||
}
|
||||
case JST_ARRAY_END:
|
||||
break;
|
||||
}
|
||||
}
|
||||
// n_buckets = n_bounds - 1 :
|
||||
size= histogram_bounds.size()-1;
|
||||
DBUG_RETURN(false);
|
||||
|
||||
error:
|
||||
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
|
||||
je.s.c_str - (const uchar*)hist_data);
|
||||
DBUG_RETURN(true);
|
||||
}
|
||||
|
||||
|
||||
static
|
||||
void store_key_image_to_rec_no_null(Field *field, const uchar *ptr)
|
||||
{
|
||||
MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
|
||||
&field->table->write_set);
|
||||
field->set_key_image(ptr, field->key_length());
|
||||
dbug_tmp_restore_column_map(&field->table->write_set, old_map);
|
||||
}
|
||||
|
||||
|
||||
static
|
||||
double position_in_interval(Field *field, const uchar *key,
|
||||
const std::string& left, const std::string& right)
|
||||
{
|
||||
double res;
|
||||
if (field->pos_through_val_str())
|
||||
{
|
||||
uint32 min_len= uint2korr(left.data());
|
||||
uint32 max_len= uint2korr(right.data());
|
||||
uint32 midp_len= uint2korr(key);
|
||||
|
||||
res= pos_in_interval_for_string(field->charset(),
|
||||
key + HA_KEY_BLOB_LENGTH,
|
||||
midp_len,
|
||||
(const uchar*)left.data() + HA_KEY_BLOB_LENGTH,
|
||||
min_len,
|
||||
(const uchar*)right.data() + HA_KEY_BLOB_LENGTH,
|
||||
max_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
store_key_image_to_rec_no_null(field, (const uchar*)left.data());
|
||||
double min_val_real= field->val_real();
|
||||
|
||||
store_key_image_to_rec_no_null(field, (const uchar*)right.data());
|
||||
double max_val_real= field->val_real();
|
||||
|
||||
store_key_image_to_rec_no_null(field, key);
|
||||
double midp_val_real= field->val_real();
|
||||
|
||||
res= pos_in_interval_for_double(midp_val_real, min_val_real, max_val_real);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
/*
  Estimate the selectivity of "column = const".

  @param field     Field the histogram is for; the lookup value gets stored
                   into it
  @param endpoint  The lookup value in key format
  @param avg_sel   Average selectivity of a single value, used as an upper
                   limit when the value sits inside one bucket

  @detail
    The bucket is looked up twice, once treating bounds equal to the value as
    greater and once as less. If the two lookups disagree, the value is equal
    to one or more bounds and is assumed to fill all the buckets in between.
*/
double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
                                            double avg_sel)
{
  store_key_image_to_rec(field, (uchar *) endpoint->key,
                         field->key_length());

  /* For nullable columns the key starts with a NULL-indicator byte: skip it */
  const uchar *lookup_key= endpoint->key;
  if (field->real_maybe_null())
    lookup_key++;

  const uint first_bucket= find_bucket(field, lookup_key, false);
  const uint last_bucket= find_bucket(field, lookup_key, true);

  if (last_bucket <= first_bucket)
  {
    // the value fits within a single bucket
    return MY_MIN(avg_sel, 1.0/get_width());
  }

  // value spans multiple buckets
  const double bucket_sel= 1.0/(get_width() + 1);
  return bucket_sel * (last_bucket - first_bucket + 1);
}
|
||||
|
||||
|
||||
/*
|
||||
@param field The table field histogram is for. We don't care about the
|
||||
field's current value, we only need its virtual functions to
|
||||
perform various operations
|
||||
|
||||
@param min_endp Left endpoint, or NULL if there is none
|
||||
@param max_endp Right endpoint, or NULL if there is none
|
||||
*/
|
||||
|
||||
double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
|
||||
key_range *max_endp)
|
||||
{
|
||||
double min, max;
|
||||
double width= 1.0 / histogram_bounds.size();
|
||||
|
||||
if (min_endp && !(field->null_ptr && min_endp->key[0]))
|
||||
{
|
||||
bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY)? true: false;
|
||||
const uchar *min_key= min_endp->key;
|
||||
if (field->real_maybe_null())
|
||||
min_key++;
|
||||
|
||||
// Find the leftmost bucket that contains the lookup value.
|
||||
// (If the lookup value is to the left of all buckets, find bucket #0)
|
||||
int idx= find_bucket(field, min_key, exclusive_endp);
|
||||
double min_sel= position_in_interval(field, (const uchar*)min_key,
|
||||
histogram_bounds[idx],
|
||||
histogram_bounds[idx+1]);
|
||||
min= idx*width + min_sel*width;
|
||||
}
|
||||
else
|
||||
min= 0.0;
|
||||
|
||||
if (max_endp)
|
||||
{
|
||||
// The right endpoint cannot be NULL
|
||||
DBUG_ASSERT(!(field->null_ptr && max_endp->key[0]));
|
||||
bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY)? true: false;
|
||||
const uchar *max_key= max_endp->key;
|
||||
if (field->real_maybe_null())
|
||||
max_key++;
|
||||
|
||||
int idx= find_bucket(field, max_key, inclusive_endp);
|
||||
double max_sel= position_in_interval(field, (const uchar*)max_key,
|
||||
histogram_bounds[idx],
|
||||
histogram_bounds[idx+1]);
|
||||
max= idx*width + max_sel*width;
|
||||
}
|
||||
else
|
||||
max= 1.0;
|
||||
|
||||
double sel = max - min;
|
||||
return sel;
|
||||
}
|
||||
|
||||
|
||||
/*
  Store the collected JSON text (json_text) into the given field, using the
  binary character set.
*/
void Histogram_json_hb::serialize(Field *field)
{
  const char *text= json_text.data();
  const size_t text_len= json_text.size();
  field->store(text, text_len, &my_charset_bin);
}
|
||||
|
||||
|
||||
/*
|
||||
Find the histogram bucket that contains the value.
|
||||
|
||||
@param equal_is_less Controls what to do if a histogram bound is equal to the
|
||||
lookup_val.
|
||||
*/
|
||||
|
||||
int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
|
||||
bool equal_is_less)
|
||||
{
|
||||
int low= 0;
|
||||
int high= histogram_bounds.size() - 1;
|
||||
int middle;
|
||||
|
||||
while (low + 1 < high)
|
||||
{
|
||||
middle= (low + high) / 2;
|
||||
int res= field->key_cmp((uchar*)histogram_bounds[middle].data(), lookup_val);
|
||||
if (!res)
|
||||
res= equal_is_less? -1: 1;
|
||||
if (res < 0)
|
||||
low= middle;
|
||||
else //res > 0
|
||||
high= middle;
|
||||
}
|
||||
|
||||
return low;
|
||||
}
|
95
sql/opt_histogram_json.h
Normal file
95
sql/opt_histogram_json.h
Normal file
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
Copyright (c) 2021, MariaDB Corporation.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; version 2 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
|
||||
|
||||
#include "sql_statistics.h"
|
||||
|
||||
/*
|
||||
An equi-height histogram which stores real values for bucket bounds.
|
||||
|
||||
Handles @@histogram_type=JSON_HB
|
||||
*/
|
||||
|
||||
class Histogram_json_hb : public Histogram_base
|
||||
{
|
||||
size_t size; /* Number of elements in the histogram */
|
||||
|
||||
/* Collection-time only: collected histogram in the JSON form. */
|
||||
std::string json_text;
|
||||
|
||||
// Array of histogram bucket endpoints in KeyTupleFormat.
|
||||
std::vector<std::string> histogram_bounds;
|
||||
|
||||
public:
|
||||
static constexpr const char* JSON_NAME="histogram_hb_v1";
|
||||
|
||||
bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
|
||||
const char *hist_data, size_t hist_data_len) override;
|
||||
|
||||
void serialize(Field *field) override;
|
||||
|
||||
Histogram_builder *create_builder(Field *col, uint col_len,
|
||||
ha_rows rows) override;
|
||||
|
||||
// returns number of buckets in the histogram
|
||||
uint get_width() override
|
||||
{
|
||||
return (uint)size;
|
||||
}
|
||||
|
||||
Histogram_type get_type() override
|
||||
{
|
||||
return JSON_HB;
|
||||
}
|
||||
|
||||
/*
|
||||
@brief
|
||||
Legacy: this returns the size of the histogram on disk.
|
||||
|
||||
@detail
|
||||
This is only called at collection time when json_text is non-empty.
|
||||
*/
|
||||
uint get_size() override
|
||||
{
|
||||
return json_text.size();
|
||||
}
|
||||
|
||||
void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
|
||||
ulonglong size) override;
|
||||
|
||||
bool is_available() override {return true; }
|
||||
|
||||
bool is_usable(THD *thd) override
|
||||
{
|
||||
return thd->variables.optimizer_use_condition_selectivity > 3 &&
|
||||
is_available();
|
||||
}
|
||||
|
||||
double point_selectivity(Field *field, key_range *endpoint,
|
||||
double avg_selection) override;
|
||||
double range_selectivity(Field *field, key_range *min_endp,
|
||||
key_range *max_endp) override;
|
||||
|
||||
void set_json_text(ulonglong sz, uchar *json_text_arg)
|
||||
{
|
||||
size = (uint8) sz;
|
||||
json_text.assign((const char*)json_text_arg,
|
||||
strlen((const char*)json_text_arg));
|
||||
}
|
||||
|
||||
private:
|
||||
int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
|
||||
};
|
||||
|
@@ -28,11 +28,11 @@
|
||||
#include "sql_base.h"
|
||||
#include "key.h"
|
||||
#include "sql_statistics.h"
|
||||
#include "opt_histogram_json.h"
|
||||
#include "opt_range.h"
|
||||
#include "uniques.h"
|
||||
#include "sql_show.h"
|
||||
#include "sql_partition.h"
|
||||
#include "my_json_writer.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
@@ -1267,8 +1267,8 @@ void Histogram_binary::serialize(Field *field)
|
||||
}
|
||||
|
||||
void Histogram_binary::init_for_collection(MEM_ROOT *mem_root,
|
||||
Histogram_type htype_arg,
|
||||
ulonglong size_arg)
|
||||
Histogram_type htype_arg,
|
||||
ulonglong size_arg)
|
||||
{
|
||||
type= htype_arg;
|
||||
values = (uchar*)alloc_root(mem_root, size_arg);
|
||||
@@ -1276,273 +1276,6 @@ void Histogram_binary::init_for_collection(MEM_ROOT *mem_root,
|
||||
}
|
||||
|
||||
|
||||
void Histogram_json_hb::init_for_collection(MEM_ROOT *mem_root,
|
||||
Histogram_type htype_arg,
|
||||
ulonglong size_arg)
|
||||
{
|
||||
DBUG_ASSERT(htype_arg == JSON_HB);
|
||||
size= (uint8) size_arg;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
@brief
|
||||
Parse the histogram from its on-disk representation
|
||||
|
||||
@return
|
||||
false OK
|
||||
True Error
|
||||
*/
|
||||
|
||||
bool Histogram_json_hb::parse(MEM_ROOT *mem_root, Field *field,
|
||||
Histogram_type type_arg, const char *hist_data,
|
||||
size_t hist_data_len)
|
||||
{
|
||||
DBUG_ENTER("Histogram_json_hb::parse");
|
||||
DBUG_ASSERT(type_arg == JSON_HB);
|
||||
const char *err;
|
||||
json_engine_t je;
|
||||
json_string_t key_name;
|
||||
|
||||
json_scan_start(&je, &my_charset_utf8mb4_bin,
|
||||
(const uchar*)hist_data,
|
||||
(const uchar*)hist_data+hist_data_len);
|
||||
|
||||
if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
|
||||
{
|
||||
err= "Root JSON element must be a JSON object";
|
||||
goto error;
|
||||
}
|
||||
|
||||
json_string_set_str(&key_name, (const uchar*)JSON_NAME,
|
||||
(const uchar*)JSON_NAME + strlen(JSON_NAME));
|
||||
json_string_set_cs(&key_name, system_charset_info);
|
||||
|
||||
if (json_scan_next(&je) || je.state != JST_KEY ||
|
||||
!json_key_matches(&je, &key_name))
|
||||
{
|
||||
err= "The first key in the object must be histogram_hb_v1";
|
||||
goto error;
|
||||
}
|
||||
|
||||
// The value must be a JSON array
|
||||
if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
|
||||
{
|
||||
err= "A JSON array expected";
|
||||
goto error;
|
||||
}
|
||||
|
||||
// Read the array
|
||||
while (!json_scan_next(&je))
|
||||
{
|
||||
switch(je.state)
|
||||
{
|
||||
case JST_VALUE:
|
||||
{
|
||||
const char *val;
|
||||
int val_len;
|
||||
json_smart_read_value(&je, &val, &val_len);
|
||||
if (je.value_type != JSON_VALUE_STRING &&
|
||||
je.value_type != JSON_VALUE_NUMBER &&
|
||||
je.value_type != JSON_VALUE_TRUE &&
|
||||
je.value_type != JSON_VALUE_FALSE)
|
||||
{
|
||||
err= "Scalar value expected";
|
||||
goto error;
|
||||
}
|
||||
uchar buf[MAX_KEY_LENGTH];
|
||||
uint len_to_copy= field->key_length();
|
||||
field->store_text(val, val_len, &my_charset_bin);
|
||||
uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
|
||||
histogram_bounds.push_back(std::string((char*)buf, bytes));
|
||||
// TODO: Should we also compare this endpoint with the previous
|
||||
// to verify that the ordering is right?
|
||||
break;
|
||||
}
|
||||
case JST_ARRAY_END:
|
||||
break;
|
||||
}
|
||||
}
|
||||
// n_buckets = n_bounds - 1 :
|
||||
size= histogram_bounds.size()-1;
|
||||
DBUG_RETURN(false);
|
||||
|
||||
error:
|
||||
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
|
||||
je.s.c_str - (const uchar*)hist_data);
|
||||
DBUG_RETURN(true);
|
||||
}
|
||||
|
||||
|
||||
double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
|
||||
double avg_sel)
|
||||
{
|
||||
double sel;
|
||||
store_key_image_to_rec(field, (uchar *) endpoint->key,
|
||||
field->key_length());
|
||||
const uchar *min_key = endpoint->key;
|
||||
if (field->real_maybe_null())
|
||||
min_key++;
|
||||
uint min_idx= find_bucket(field, min_key, false);
|
||||
|
||||
uint max_idx= find_bucket(field, min_key, true);
|
||||
#if 0
|
||||
// find how many buckets this value occupies
|
||||
while ((max_idx + 1 < get_width() ) &&
|
||||
(field->key_cmp((uchar *)histogram_bounds[max_idx + 1].data(), min_key) == 0)) {
|
||||
max_idx++;
|
||||
}
|
||||
#endif
|
||||
if (max_idx > min_idx)
|
||||
{
|
||||
// value spans multiple buckets
|
||||
double bucket_sel= 1.0/(get_width() + 1);
|
||||
sel= bucket_sel * (max_idx - min_idx + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// the value fits within a single bucket
|
||||
sel = MY_MIN(avg_sel, 1.0/get_width());
|
||||
}
|
||||
return sel;
|
||||
}
|
||||
|
||||
|
||||
static
|
||||
void store_key_image_to_rec_no_null(Field *field, const uchar *ptr)
|
||||
{
|
||||
MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
|
||||
&field->table->write_set);
|
||||
field->set_key_image(ptr, field->key_length());
|
||||
dbug_tmp_restore_column_map(&field->table->write_set, old_map);
|
||||
}
|
||||
|
||||
|
||||
static
|
||||
double position_in_interval(Field *field, const uchar *key,
|
||||
const std::string& left, const std::string& right)
|
||||
{
|
||||
double res;
|
||||
if (field->pos_through_val_str())
|
||||
{
|
||||
uint32 min_len= uint2korr(left.data());
|
||||
uint32 max_len= uint2korr(right.data());
|
||||
uint32 midp_len= uint2korr(key);
|
||||
|
||||
res= pos_in_interval_for_string(field->charset(),
|
||||
key + HA_KEY_BLOB_LENGTH,
|
||||
midp_len,
|
||||
(const uchar*)left.data() + HA_KEY_BLOB_LENGTH,
|
||||
min_len,
|
||||
(const uchar*)right.data() + HA_KEY_BLOB_LENGTH,
|
||||
max_len);
|
||||
}
|
||||
else
|
||||
{
|
||||
store_key_image_to_rec_no_null(field, (const uchar*)left.data());
|
||||
double min_val_real= field->val_real();
|
||||
|
||||
store_key_image_to_rec_no_null(field, (const uchar*)right.data());
|
||||
double max_val_real= field->val_real();
|
||||
|
||||
store_key_image_to_rec_no_null(field, key);
|
||||
double midp_val_real= field->val_real();
|
||||
|
||||
res= pos_in_interval_for_double(midp_val_real, min_val_real, max_val_real);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
@param field The table field histogram is for. We don't care about the
|
||||
field's current value, we only need its virtual functions to
|
||||
perform various operations
|
||||
|
||||
@param min_endp Left endpoint, or NULL if there is none
|
||||
@param max_endp Right endpoint, or NULL if there is none
|
||||
*/
|
||||
double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
|
||||
key_range *max_endp)
|
||||
{
|
||||
double min, max;
|
||||
double width= 1.0 / histogram_bounds.size();
|
||||
|
||||
if (min_endp && !(field->null_ptr && min_endp->key[0]))
|
||||
{
|
||||
bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY)? true: false;
|
||||
const uchar *min_key= min_endp->key;
|
||||
if (field->real_maybe_null())
|
||||
min_key++;
|
||||
|
||||
// Find the leftmost bucket that contains the lookup value.
|
||||
// (If the lookup value is to the left of all buckets, find bucket #0)
|
||||
int idx= find_bucket(field, min_key, exclusive_endp);
|
||||
double min_sel= position_in_interval(field, (const uchar*)min_key,
|
||||
histogram_bounds[idx],
|
||||
histogram_bounds[idx+1]);
|
||||
min= idx*width + min_sel*width;
|
||||
}
|
||||
else
|
||||
min= 0.0;
|
||||
|
||||
if (max_endp)
|
||||
{
|
||||
// The right endpoint cannot be NULL
|
||||
DBUG_ASSERT(!(field->null_ptr && max_endp->key[0]));
|
||||
bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY)? true: false;
|
||||
const uchar *max_key= max_endp->key;
|
||||
if (field->real_maybe_null())
|
||||
max_key++;
|
||||
|
||||
int idx= find_bucket(field, max_key, inclusive_endp);
|
||||
double max_sel= position_in_interval(field, (const uchar*)max_key,
|
||||
histogram_bounds[idx],
|
||||
histogram_bounds[idx+1]);
|
||||
max= idx*width + max_sel*width;
|
||||
}
|
||||
else
|
||||
max= 1.0;
|
||||
|
||||
double sel = max - min;
|
||||
return sel;
|
||||
}
|
||||
|
||||
|
||||
void Histogram_json_hb::serialize(Field *field)
|
||||
{
|
||||
field->store(json_text.data(), json_text.size(), &my_charset_bin);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Find the histogram bucket that contains the value.
|
||||
|
||||
@param equal_is_less Controls what to do if a histogram bound is equal to the
|
||||
lookup_val.
|
||||
*/
|
||||
|
||||
int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
|
||||
bool equal_is_less)
|
||||
{
|
||||
int low= 0;
|
||||
int high= histogram_bounds.size() - 1;
|
||||
int middle;
|
||||
|
||||
while (low + 1 < high)
|
||||
{
|
||||
middle= (low + high) / 2;
|
||||
int res= field->key_cmp((uchar*)histogram_bounds[middle].data(), lookup_val);
|
||||
if (!res)
|
||||
res= equal_is_less? -1: 1;
|
||||
if (res < 0)
|
||||
low= middle;
|
||||
else //res > 0
|
||||
high= middle;
|
||||
}
|
||||
|
||||
return low;
|
||||
}
|
||||
|
||||
/*
|
||||
An object of the class Index_stat is created to read statistical
|
||||
data on tables from the statistical table table_stat, to update
|
||||
@@ -1853,73 +1586,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
This is used to collect the basic statistics from a Unique object:
|
||||
- count of values
|
||||
- count of distinct values
|
||||
- count of distinct values that have occurred only once
|
||||
*/
|
||||
|
||||
class Basic_stats_collector
|
||||
{
|
||||
ulonglong count; /* number of values retrieved */
|
||||
ulonglong count_distinct; /* number of distinct values retrieved */
|
||||
/* number of distinct values that occured only once */
|
||||
ulonglong count_distinct_single_occurence;
|
||||
|
||||
public:
|
||||
Basic_stats_collector()
|
||||
{
|
||||
count= 0;
|
||||
count_distinct= 0;
|
||||
count_distinct_single_occurence= 0;
|
||||
}
|
||||
|
||||
ulonglong get_count_distinct() const { return count_distinct; }
|
||||
ulonglong get_count_single_occurence() const
|
||||
{
|
||||
return count_distinct_single_occurence;
|
||||
}
|
||||
ulonglong get_count() const { return count; }
|
||||
|
||||
void next(void *elem, element_count elem_cnt)
|
||||
{
|
||||
count_distinct++;
|
||||
if (elem_cnt == 1)
|
||||
count_distinct_single_occurence++;
|
||||
count+= elem_cnt;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
Histogram_builder is a helper class that is used to build histograms
|
||||
for columns.
|
||||
|
||||
Do not create directly, call Histogram_base::create_builder(...);
|
||||
*/
|
||||
|
||||
class Histogram_builder
|
||||
{
|
||||
protected:
|
||||
Field *column; /* table field for which the histogram is built */
|
||||
uint col_length; /* size of this field */
|
||||
ha_rows records; /* number of records the histogram is built for */
|
||||
|
||||
Histogram_builder(Field *col, uint col_len, ha_rows rows) :
|
||||
column(col), col_length(col_len), records(rows)
|
||||
{}
|
||||
|
||||
public:
|
||||
// A histogram builder will also collect the counters
|
||||
Basic_stats_collector counters;
|
||||
|
||||
virtual int next(void *elem, element_count elem_cnt)=0;
|
||||
virtual void finalize()=0;
|
||||
virtual ~Histogram_builder(){}
|
||||
};
|
||||
|
||||
|
||||
class Histogram_binary_builder : public Histogram_builder
|
||||
{
|
||||
Field *min_value; /* pointer to the minimal value for the field */
|
||||
@@ -1974,101 +1640,6 @@ Histogram_builder *Histogram_binary::create_builder(Field *col, uint col_len,
|
||||
}
|
||||
|
||||
|
||||
class Histogram_json_builder : public Histogram_builder
|
||||
{
|
||||
Histogram_json_hb *histogram;
|
||||
uint hist_width; /* the number of points in the histogram */
|
||||
double bucket_capacity; /* number of rows in a bucket of the histogram */
|
||||
uint curr_bucket; /* number of the current bucket to be built */
|
||||
|
||||
std::vector<std::string> bucket_bounds;
|
||||
bool first_value= true;
|
||||
public:
|
||||
Histogram_json_builder(Field *col, uint col_len, ha_rows rows)
|
||||
: Histogram_builder(col, col_len, rows)
|
||||
{
|
||||
histogram= (Histogram_json_hb*)col->collected_stats->histogram;
|
||||
bucket_capacity= (double)records / histogram->get_width();
|
||||
hist_width= histogram->get_width();
|
||||
curr_bucket= 0;
|
||||
}
|
||||
|
||||
~Histogram_json_builder() override = default;
|
||||
|
||||
/*
|
||||
Add data to the histogram. Adding Element elem which encountered elem_cnt
|
||||
times.
|
||||
*/
|
||||
int next(void *elem, element_count elem_cnt) override
|
||||
{
|
||||
counters.next(elem, elem_cnt);
|
||||
ulonglong count= counters.get_count();
|
||||
|
||||
if (curr_bucket == hist_width)
|
||||
return 0;
|
||||
if (first_value)
|
||||
{
|
||||
first_value= false;
|
||||
column->store_field_value((uchar*) elem, col_length);
|
||||
StringBuffer<MAX_FIELD_WIDTH> val;
|
||||
column->val_str(&val);
|
||||
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
|
||||
}
|
||||
|
||||
if (count > bucket_capacity * (curr_bucket + 1))
|
||||
{
|
||||
column->store_field_value((uchar*) elem, col_length);
|
||||
StringBuffer<MAX_FIELD_WIDTH> val;
|
||||
column->val_str(&val);
|
||||
bucket_bounds.emplace_back(val.ptr(), val.length());
|
||||
|
||||
curr_bucket++;
|
||||
while (curr_bucket != hist_width &&
|
||||
count > bucket_capacity * (curr_bucket + 1))
|
||||
{
|
||||
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
|
||||
curr_bucket++;
|
||||
}
|
||||
}
|
||||
|
||||
if (records == count && bucket_bounds.size() == hist_width)
|
||||
{
|
||||
column->store_field_value((uchar*) elem, col_length);
|
||||
StringBuffer<MAX_FIELD_WIDTH> val;
|
||||
column->val_str(&val);
|
||||
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Finalize the creation of histogram
|
||||
*/
|
||||
void finalize() override
|
||||
{
|
||||
Json_writer writer;
|
||||
writer.start_object();
|
||||
writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
|
||||
|
||||
for(auto& value: bucket_bounds) {
|
||||
writer.add_str(value.c_str());
|
||||
}
|
||||
writer.end_array();
|
||||
writer.end_object();
|
||||
Binary_string *json_string = (Binary_string *) writer.output.get_string();
|
||||
histogram->set_json_text(bucket_bounds.size()-1,
|
||||
(uchar *) json_string->c_ptr());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
|
||||
ha_rows rows)
|
||||
{
|
||||
return new Histogram_json_builder(col, col_len, rows);
|
||||
}
|
||||
|
||||
|
||||
Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type,
|
||||
THD *owner)
|
||||
{
|
||||
|
@@ -162,11 +162,18 @@ public:
|
||||
|
||||
virtual uint get_width()=0;
|
||||
|
||||
virtual Histogram_builder *create_builder(Field *col, uint col_len,
|
||||
ha_rows rows)=0;
|
||||
|
||||
/*
|
||||
The creation-time workflow is:
|
||||
* create a histogram
|
||||
* init_for_collection()
|
||||
* create_builder()
|
||||
* feed the data to the builder
|
||||
* serialize();
|
||||
*/
|
||||
virtual void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
|
||||
ulonglong size)=0;
|
||||
virtual Histogram_builder *create_builder(Field *col, uint col_len,
|
||||
ha_rows rows)=0;
|
||||
|
||||
virtual bool is_available()=0;
|
||||
|
||||
@@ -177,19 +184,26 @@ public:
|
||||
virtual double range_selectivity(Field *field, key_range *min_endp,
|
||||
key_range *max_endp)=0;
|
||||
|
||||
// Legacy: return the size of the histogram on disk.
|
||||
// This will be stored in mysql.column_stats.hist_size column.
|
||||
// Newer, JSON-based histograms may return 0.
|
||||
/*
|
||||
Legacy: return the size of the histogram on disk.
|
||||
|
||||
This will be stored in mysql.column_stats.hist_size column.
|
||||
The value is not really needed as one can look at
|
||||
LENGTH(mysql.column_stats.histogram) directly.
|
||||
*/
|
||||
virtual uint get_size()=0;
|
||||
virtual ~Histogram_base()= default;
|
||||
|
||||
|
||||
Histogram_base() : owner(NULL) {}
|
||||
|
||||
/*
|
||||
Memory management: a histogram may be (exclusively) "owned" by a particular
|
||||
thread (done for histograms that are being collected). By default, a
|
||||
histogram has owner==NULL and is not owned by any particular thread.
|
||||
*/
|
||||
THD *get_owner() { return owner; }
|
||||
void set_owner(THD *thd) { owner=thd; }
|
||||
private:
|
||||
// Owner is a thread that *exclusively* owns this histogram (and so can
|
||||
// delete it at any time)
|
||||
THD *owner;
|
||||
};
|
||||
|
||||
@@ -353,75 +367,72 @@ public:
|
||||
|
||||
|
||||
/*
|
||||
An equi-height histogram which stores real values for bucket bounds.
|
||||
|
||||
Handles @@histogram_type=JSON_HB
|
||||
This is used to collect the basic statistics from a Unique object:
|
||||
- count of values
|
||||
- count of distinct values
|
||||
- count of distinct values that have occurred only once
|
||||
*/
|
||||
|
||||
class Histogram_json_hb : public Histogram_base
|
||||
class Basic_stats_collector
|
||||
{
|
||||
private:
|
||||
size_t size; /* Number of elements in the histogram */
|
||||
|
||||
/* Collection-time only: collected histogram in the JSON form. */
|
||||
std::string json_text;
|
||||
|
||||
// Array of histogram bucket endpoints in KeyTupleFormat.
|
||||
std::vector<std::string> histogram_bounds;
|
||||
ulonglong count; /* number of values retrieved */
|
||||
ulonglong count_distinct; /* number of distinct values retrieved */
|
||||
/* number of distinct values that occurred only once */
|
||||
ulonglong count_distinct_single_occurence;
|
||||
|
||||
public:
|
||||
static constexpr const char* JSON_NAME="histogram_hb_v1";
|
||||
|
||||
bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
|
||||
const char *hist_data, size_t hist_data_len) override;
|
||||
|
||||
void serialize(Field *field) override;
|
||||
|
||||
Histogram_builder *create_builder(Field *col, uint col_len,
|
||||
ha_rows rows) override;
|
||||
|
||||
// returns number of buckets in the histogram
|
||||
uint get_width() override
|
||||
Basic_stats_collector()
|
||||
{
|
||||
return (uint)size;
|
||||
count= 0;
|
||||
count_distinct= 0;
|
||||
count_distinct_single_occurence= 0;
|
||||
}
|
||||
|
||||
Histogram_type get_type() override
|
||||
ulonglong get_count_distinct() const { return count_distinct; }
|
||||
ulonglong get_count_single_occurence() const
|
||||
{
|
||||
return JSON_HB;
|
||||
return count_distinct_single_occurence;
|
||||
}
|
||||
ulonglong get_count() const { return count; }
|
||||
|
||||
void set_json_text(ulonglong sz, uchar *json_text_arg)
|
||||
void next(void *elem, element_count elem_cnt)
|
||||
{
|
||||
size = (uint8) sz;
|
||||
json_text.assign((const char*)json_text_arg,
|
||||
strlen((const char*)json_text_arg));
|
||||
count_distinct++;
|
||||
if (elem_cnt == 1)
|
||||
count_distinct_single_occurence++;
|
||||
count+= elem_cnt;
|
||||
}
|
||||
|
||||
uint get_size() override
|
||||
{
|
||||
return size;
|
||||
}
|
||||
|
||||
void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
|
||||
ulonglong size) override;
|
||||
|
||||
bool is_available() override {return true; }
|
||||
|
||||
bool is_usable(THD *thd) override
|
||||
{
|
||||
return thd->variables.optimizer_use_condition_selectivity > 3 &&
|
||||
is_available();
|
||||
}
|
||||
|
||||
double point_selectivity(Field *field, key_range *endpoint,
|
||||
double avg_selection) override;
|
||||
double range_selectivity(Field *field, key_range *min_endp,
|
||||
key_range *max_endp) override;
|
||||
private:
|
||||
int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
Histogram_builder is a helper class that is used to build histograms
|
||||
for columns.
|
||||
|
||||
Do not create directly, call Histogram_base::create_builder(...);
|
||||
*/
|
||||
|
||||
class Histogram_builder
|
||||
{
|
||||
protected:
|
||||
Field *column; /* table field for which the histogram is built */
|
||||
uint col_length; /* size of this field */
|
||||
ha_rows records; /* number of records the histogram is built for */
|
||||
|
||||
Histogram_builder(Field *col, uint col_len, ha_rows rows) :
|
||||
column(col), col_length(col_len), records(rows)
|
||||
{}
|
||||
|
||||
public:
|
||||
// A histogram builder will also collect the counters
|
||||
Basic_stats_collector counters;
|
||||
|
||||
virtual int next(void *elem, element_count elem_cnt)=0;
|
||||
virtual void finalize()=0;
|
||||
virtual ~Histogram_builder(){}
|
||||
};
|
||||
|
||||
|
||||
class Columns_statistics;
|
||||
class Index_statistics;
|
||||
|
||||
|
Reference in New Issue
Block a user