mirror of
https://github.com/MariaDB/server.git
synced 2025-07-30 16:24:05 +03:00
MDEV-21130: Histograms: use JSON as on-disk format
A demo of how to use an in-memory data structure for the histogram. The patch shows how to: * convert the string form of the data to binary form; * compare two values in binary form; * compute a fraction for a value in the [X, Y] range. Grep for GSOC-TODO for notes.
This commit is contained in:
@ -1857,6 +1857,7 @@ public:
|
||||
{
|
||||
return (double) 0.5;
|
||||
}
|
||||
/*
  Tells which "position in interval" computation applies to this field.
  The base implementation returns false; Histogram_json::range_selectivity_new
  then uses pos_in_interval_through_val_real(), and uses
  pos_in_interval_through_strxfrm() when an override returns true.
*/
virtual bool pos_through_val_str() { return false;}
|
||||
|
||||
/*
|
||||
Check if comparison between the field and an item unambiguously
|
||||
@ -2142,6 +2143,8 @@ public:
|
||||
{
|
||||
return pos_in_interval_val_str(min, max, length_size());
|
||||
}
|
||||
/*
  This field type computes its position within an interval through the
  string (strxfrm-based) path, matching pos_in_interval() just above which
  delegates to pos_in_interval_val_str().
*/
bool pos_through_val_str() override {return true;}
|
||||
|
||||
bool test_if_equality_guarantees_uniqueness(const Item *const_item) const
|
||||
override;
|
||||
SEL_ARG *get_mm_leaf(RANGE_OPT_PARAM *param, KEY_PART *key_part,
|
||||
|
@ -1240,7 +1240,8 @@ public:
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
if (!hist->parse(mem_root, table_field->read_stats->histogram_type_on_disk,
|
||||
if (!hist->parse(mem_root, table_field,
|
||||
table_field->read_stats->histogram_type_on_disk,
|
||||
(const uchar*)val.ptr(), val.length()))
|
||||
{
|
||||
table_field->read_stats->histogram_= hist;
|
||||
@ -1253,7 +1254,7 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
bool Histogram_binary::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg)
|
||||
bool Histogram_binary::parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg)
|
||||
{
|
||||
// Just copy the data
|
||||
size = (uint8) size_arg;
|
||||
@ -1288,21 +1289,260 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root, Histogram_type htyp
|
||||
size = (uint8) size_arg;
|
||||
}
|
||||
|
||||
bool Histogram_json::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr, uint size_arg)
|
||||
/*
  Parse the JSON text of a histogram and build the in-memory structure used
  for lookups and estimates.

  @param mem_root  Not used by this implementation.
  @param field     The table field the histogram is for. Its current value is
                   overwritten: each bucket endpoint is stored into it so that
                   the endpoint can be read back as a binary key image.
  @param type_arg  Histogram type; remembered in this->type.
  @param ptr       Start of the JSON text. It does not have to be
                   NUL-terminated: only [ptr, ptr + size_arg) is read.
  @param size_arg  Length of the JSON text in bytes.

  @return false  Success. hist_buckets_text holds the endpoints in text form
                 and histogram_bounds holds one key image per endpoint.
  @return true   The JSON array could not be read;
                 ER_JSON_HISTOGRAM_PARSE_FAILED has been reported.
*/
bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field,
                           Histogram_type type_arg, const uchar *ptr,
                           uint size_arg)
{
  DBUG_ENTER("Histogram_json::parse");
  type= type_arg;

  /*
    Use the length supplied by the caller rather than strlen(): the caller
    hands us a (pointer, length) pair and nothing guarantees a trailing NUL.
  */
  const char *json= reinterpret_cast<const char*>(ptr);
  const char *json_end= json + size_arg;

  int vt;
  if (!json_get_array_items(json, json_end, &vt, hist_buckets_text))
  {
    my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), vt);
    DBUG_RETURN(true);
  }

  /*
    NOTE(review): 'size' is declared as uint8, so the endpoint count is
    truncated modulo 256. The cast only makes the narrowing explicit.
  */
  size= (uint8) hist_buckets_text.size();

  /*
    Convert the text based array into a data structure that allows lookups and
    estimates
  */
  for (auto &s : hist_buckets_text)
  {
    field->store_text(s.data(), s.size(), &my_charset_bin);

    // Get the value in "truncated key tuple format" here:
    // NOTE(review): buf is MAX_KEY_LENGTH bytes while the requested length is
    // field->key_length(); confirm the image always fits for wide fields.
    uchar buf[MAX_KEY_LENGTH];
    uint len_to_copy= field->key_length();
    uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
    histogram_bounds.push_back(std::string((char*)buf, bytes));
  }

  DBUG_RETURN(false);
}
|
||||
|
||||
|
||||
static
|
||||
void store_key_image_to_rec_no_null(Field *field, uchar *ptr) {
|
||||
MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
|
||||
&field->table->write_set);
|
||||
field->set_key_image(ptr, field->key_length());
|
||||
dbug_tmp_restore_column_map(&field->table->write_set, old_map);
|
||||
}
|
||||
|
||||
/*
|
||||
GSOC-TODO:
|
||||
This is our replacement for Field::pos_in_interval_val_real
|
||||
|
||||
We take midpoint_val and an interval [min_val, max_val], and return
|
||||
a number between 0.0 and 1.0 which specifies how close midpoint_val is
|
||||
to one of the bounds.
|
||||
|
||||
@param field Field object. We don't care about the field's current value
|
||||
(actually, we overwrite it). We need it for its virtual
|
||||
functions.
|
||||
|
||||
*/
|
||||
double pos_in_interval_through_val_real(Field *field,
|
||||
uchar* min_val,
|
||||
uchar *max_val,
|
||||
uchar *midpoint_val)
|
||||
{
|
||||
|
||||
// For each passed value: unpack it into Field's current value. Then, we can
|
||||
// get the value as double.
|
||||
|
||||
store_key_image_to_rec_no_null(field, min_val);
|
||||
double min_val_real= field->val_real();
|
||||
|
||||
store_key_image_to_rec_no_null(field, max_val);
|
||||
double max_val_real= field->val_real();
|
||||
|
||||
store_key_image_to_rec_no_null(field, midpoint_val);
|
||||
double midpoint_val_real= field->val_real();
|
||||
|
||||
// The code below is a copy of logic from Field::pos_in_interval_val_real:
|
||||
double n, d;
|
||||
n= midpoint_val_real - min_val_real;
|
||||
if (n < 0)
|
||||
return 0.0;
|
||||
d= max_val_real - min_val_real;
|
||||
if (d <= 0)
|
||||
return 1.0;
|
||||
return MY_MIN(n/d, 1.0);
|
||||
}
|
||||
|
||||
// Copy-paste:
|
||||
static
|
||||
inline ulonglong char_prefix_to_ulonglong(uchar *src)
|
||||
{
|
||||
uint sz= sizeof(ulonglong);
|
||||
for (uint i= 0; i < sz/2; i++)
|
||||
{
|
||||
uchar tmp= src[i];
|
||||
src[i]= src[sz-1-i];
|
||||
src[sz-1-i]= tmp;
|
||||
}
|
||||
return uint8korr(src);
|
||||
}
|
||||
|
||||
// copy-paste:
|
||||
static inline double safe_substract(ulonglong a, ulonglong b)
|
||||
{
|
||||
return (a > b)? double(a - b) : -double(b - a);
|
||||
}
|
||||
|
||||
/*
|
||||
GSOC-TODO:
|
||||
This is our replacement for Field::pos_in_interval_val_str
|
||||
|
||||
We take midpoint_val and an interval [min_val, max_val], and return
|
||||
a number between 0.0 and 1.0 which specifies how close midpoint_val is
|
||||
to one of the bounds.
|
||||
|
||||
@param field Field object. We don't care about the field's current value
|
||||
(actually, we overwrite it). We need it for its virtual
|
||||
functions.
|
||||
|
||||
@TODO
|
||||
Instead of copying the pos_in_interval_val_str(), we should do better:
|
||||
if all three passed values have a common prefix, skip it.
|
||||
This will make the returned value more precise.
|
||||
|
||||
*/
|
||||
|
||||
double pos_in_interval_through_strxfrm(Field *field,
|
||||
uchar *min_val,
|
||||
uchar *max_val,
|
||||
uchar *midpoint_val)
|
||||
{
|
||||
// The code below is a copy of logic from Field::pos_in_interval_val_str
|
||||
uchar mp_prefix[sizeof(ulonglong)];
|
||||
uchar minp_prefix[sizeof(ulonglong)];
|
||||
uchar maxp_prefix[sizeof(ulonglong)];
|
||||
ulonglong mp, minp, maxp;
|
||||
|
||||
uint min_len= uint2korr(min_val);
|
||||
uint max_len= uint2korr(max_val);
|
||||
uint midpoint_len= uint2korr(midpoint_val);
|
||||
|
||||
auto cset= field->charset();
|
||||
|
||||
cset->strnxfrm(mp_prefix, sizeof(mp),
|
||||
midpoint_val + HA_KEY_BLOB_LENGTH,
|
||||
midpoint_len);
|
||||
cset->strnxfrm(minp_prefix, sizeof(minp),
|
||||
min_val + HA_KEY_BLOB_LENGTH,
|
||||
min_len);
|
||||
cset->strnxfrm(maxp_prefix, sizeof(maxp),
|
||||
max_val + HA_KEY_BLOB_LENGTH,
|
||||
max_len);
|
||||
mp= char_prefix_to_ulonglong(mp_prefix);
|
||||
minp= char_prefix_to_ulonglong(minp_prefix);
|
||||
maxp= char_prefix_to_ulonglong(maxp_prefix);
|
||||
double n, d;
|
||||
n= safe_substract(mp, minp);
|
||||
if (n < 0)
|
||||
return 0.0;
|
||||
d= safe_substract(maxp, minp);
|
||||
if (d <= 0)
|
||||
return 1.0;
|
||||
return MY_MIN(n/d, 1.0);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
GSOC-TODO:
|
||||
This is how range selectivity function should look like.
|
||||
|
||||
@param field The table field histogram is for. We don't care about the
|
||||
field's current value, we only need its virtual functions to
|
||||
perform various operations
|
||||
|
||||
@param min_endp, max_endp - this specifies the range.
|
||||
*/
|
||||
double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
|
||||
key_range *max_endp)
|
||||
{
|
||||
fprintf(stderr, "Histogram_json::range_selectivity_new\n");
|
||||
|
||||
|
||||
/*
|
||||
GSOC-TODO:
|
||||
The code below is NOT what this function have.
|
||||
|
||||
== WHAT THIS CODE DOES ==
|
||||
At the moment it does a linear walk through histogram_bounds and compares
|
||||
min_endp to each of histogram bucket's min and max.
|
||||
ATTENTION: This is a demo of how key_cmp() is used to compare the values.
|
||||
|
||||
When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END,
|
||||
it computes a position of min_endp within the bucket.
|
||||
ATTENTION: calls to pos_in_interval_.... are a demo of how to compute
|
||||
position of a value within a [min,max] range.
|
||||
|
||||
== WHAT THIS CODE SHOULD DO ==
|
||||
* Use binary search to locate the range [MIN_BUCKET; MAX_BUCKET] - the
|
||||
set of buckets that overlaps with the search interval {min_endp, max_endp}.
|
||||
|
||||
* If the search interval covers MIN_BUCKET only partially, compute a
|
||||
position of min_endp within the bucket.
|
||||
|
||||
* The same for max_endp.
|
||||
|
||||
* Compute the final selectivity and return it.
|
||||
*/
|
||||
std::string prev_s;
|
||||
bool have_prev_s=false;
|
||||
for (auto &s : histogram_bounds)
|
||||
{
|
||||
if (!have_prev_s)
|
||||
{
|
||||
prev_s = s;
|
||||
have_prev_s= true;
|
||||
continue;
|
||||
}
|
||||
|
||||
// It's a test code, so we only process min_endp.
|
||||
if (min_endp)
|
||||
{
|
||||
const uchar *min_key= min_endp->key;
|
||||
// TODO: also, properly handle SQL NULLs.
|
||||
// in this test patch, we just assume the values are not SQL NULLs.
|
||||
if (field->real_maybe_null())
|
||||
min_key++;
|
||||
|
||||
int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
|
||||
const char *str1="<";
|
||||
if (res1>0) str1=">";
|
||||
if (res1==0) str1="=";
|
||||
|
||||
int res2= field->key_cmp(min_key, (uchar*)s.data());
|
||||
const char *str2="<";
|
||||
if (res2>0) str2=">";
|
||||
if (res2==0) str2="=";
|
||||
fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
|
||||
|
||||
if (res1<0 && res2 < 0)
|
||||
{
|
||||
double sel;
|
||||
if (field->pos_through_val_str())
|
||||
sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(),
|
||||
(uchar*)s.data(), (uchar*)min_key);
|
||||
else
|
||||
sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(),
|
||||
(uchar*)s.data(), (uchar*)min_key);
|
||||
|
||||
fprintf(stderr, " pos_in_interval=%g\n", sel);
|
||||
}
|
||||
|
||||
prev_s= s;
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
|
||||
return 0.5;
|
||||
}
|
||||
|
||||
void Histogram_json::serialize(Field *field)
|
||||
{
|
||||
field->store((char*)values, strlen((char*)values),
|
||||
@ -4107,8 +4347,14 @@ double get_column_range_cardinality(Field *field,
|
||||
max_mp_pos= 1.0;
|
||||
|
||||
Histogram_base *hist = col_stats->histogram_;
|
||||
if (hist && hist->is_usable(thd))
|
||||
if (hist && hist->is_usable(thd)) {
|
||||
/*
|
||||
GSOC-TODO: for now, we just call range_selectivity_new here.
|
||||
*/
|
||||
sel= hist->range_selectivity_new(field, min_endp, max_endp);
|
||||
|
||||
sel= hist->range_selectivity(min_mp_pos, max_mp_pos);
|
||||
}
|
||||
else
|
||||
sel= (max_mp_pos - min_mp_pos);
|
||||
res= col_non_nulls * sel;
|
||||
|
@ -149,7 +149,7 @@ bool is_eits_usable(Field* field);
|
||||
class Histogram_base : public Sql_alloc
|
||||
{
|
||||
public:
|
||||
virtual bool parse(MEM_ROOT *mem_root, Histogram_type type_arg,
|
||||
virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
|
||||
const uchar *ptr, uint size)= 0;
|
||||
virtual void serialize(Field *to_field)= 0;
|
||||
|
||||
@ -173,13 +173,19 @@ public:
|
||||
|
||||
virtual double point_selectivity(double pos, double avg_selection)=0;
|
||||
|
||||
/*
  Default implementation of the key_range based selectivity estimate.
  It ignores all of its arguments and reports a selectivity of 1.0.
*/
virtual double range_selectivity_new(Field *field, key_range *min_endp,
                                     key_range *max_endp)
{
  const double whole_range= 1.0;
  return whole_range;
}
|
||||
|
||||
virtual ~Histogram_base(){}
|
||||
};
|
||||
|
||||
class Histogram_binary : public Histogram_base
|
||||
{
|
||||
public:
|
||||
bool parse(MEM_ROOT *mem_root, Histogram_type type_arg,
|
||||
bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg,
|
||||
const uchar *ptr_arg, uint size_arg) override;
|
||||
void serialize(Field *to_field) override;
|
||||
|
||||
@ -341,11 +347,28 @@ class Histogram_json : public Histogram_base
|
||||
private:
|
||||
Histogram_type type;
|
||||
uint8 size; /* Number of elements in the histogram*/
|
||||
|
||||
/*
|
||||
GSOC-TODO: This is used for storing collected JSON text. Rename it
|
||||
accordingly.
|
||||
*/
|
||||
uchar *values;
|
||||
std::vector<std::string> hist_buckets;
|
||||
|
||||
// List of values in string form.
|
||||
/*
|
||||
GSOC-TODO: We don't need to save this. It can be a local variable in
|
||||
parse().
|
||||
Eventually we should get rid of this at all, as we can convert the
|
||||
endpoints and add them to histogram_bounds as soon as we've read them.
|
||||
*/
|
||||
std::vector<std::string> hist_buckets_text;
|
||||
|
||||
// Array of histogram bucket endpoints in KeyTupleFormat.
|
||||
std::vector<std::string> histogram_bounds;
|
||||
|
||||
public:
|
||||
bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr, uint size) override;
|
||||
bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
|
||||
const uchar *ptr, uint size) override;
|
||||
|
||||
void serialize(Field *field) override;
|
||||
|
||||
@ -364,7 +387,7 @@ public:
|
||||
|
||||
void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size) override;
|
||||
|
||||
bool is_available() override {return get_width() > 0 && get_values(); }
|
||||
/*
  The histogram is reported as available whenever get_width() is positive.
  The additional get_values() check is commented out here.
*/
bool is_available() override {return get_width() > 0 /*&& get_values()*/; }
|
||||
|
||||
bool is_usable(THD *thd) override
|
||||
{
|
||||
@ -379,6 +402,13 @@ public:
|
||||
/* Returns the constant 0.1; both arguments are ignored. */
double range_selectivity(double min_pos, double max_pos) override {return 0.1;}
|
||||
|
||||
/* Returns the constant 0.5; both arguments are ignored. */
double point_selectivity(double pos, double avg_selection) override {return 0.5;}
|
||||
|
||||
/*
|
||||
GSOC-TODO: This function should eventually replace both range_selectivity()
|
||||
and point_selectivity(). See its code for more details.
|
||||
*/
|
||||
double range_selectivity_new(Field *field, key_range *min_endp,
|
||||
key_range *max_endp) override;
|
||||
};
|
||||
|
||||
class Columns_statistics;
|
||||
|
Reference in New Issue
Block a user