mirror of
https://github.com/MariaDB/server.git
synced 2025-07-27 18:02:13 +03:00
Code cleanup:
- Move [some] engine-agnostic tests from t/selectivity.test to t/selectivity_no_engine.test - Move Histogram::point_selectivity to sql_statistics.cc
This commit is contained in:
@ -3565,3 +3565,120 @@ double get_column_range_cardinality(Field *field,
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Estimate selectivity of "col=const" using a histogram
|
||||
|
||||
@param pos Position of the "const" between column's min_value and
|
||||
max_value. This is a number in [0..1] range.
|
||||
@param avg_sel Average selectivity of condition "col=const" in this table.
|
||||
It is calcuated as (#non_null_values / #distinct_values).
|
||||
|
||||
@return
|
||||
Expected condition selectivity (a number between 0 and 1)
|
||||
|
||||
@notes
|
||||
[re_zero_length_buckets] If a bucket with zero value-length is in the
|
||||
middle of the histogram, we will not have min==max. Example: suppose,
|
||||
pos_value=0x12, and the histogram is:
|
||||
|
||||
#n #n+1 #n+2
|
||||
... 0x10 0x12 0x12 0x14 ...
|
||||
|
|
||||
+------------- bucket with zero value-length
|
||||
|
||||
Here, we will get min=#n+1, max=#n+2, and use the multi-bucket formula.
|
||||
|
||||
The problem happens at the histogram ends. if pos_value=0, and the
|
||||
histogram is:
|
||||
|
||||
0x00 0x10 ...
|
||||
|
||||
then min=0, max=0. This means pos_value is contained within bucket #0,
|
||||
but on the other hand, histogram data says that the bucket has only one
|
||||
value.
|
||||
*/
|
||||
|
||||
double Histogram::point_selectivity(double pos, double avg_sel)
|
||||
{
|
||||
double sel;
|
||||
/* Find the bucket that contains the value 'pos'. */
|
||||
uint min= find_bucket(pos, TRUE);
|
||||
uint pos_value= (uint) (pos * prec_factor());
|
||||
|
||||
/* Find how many buckets this value occupies */
|
||||
uint max= min;
|
||||
while (max + 1 < get_width() && get_value(max + 1) == pos_value)
|
||||
max++;
|
||||
|
||||
/*
|
||||
A special case: we're looking at a single bucket, and that bucket has
|
||||
zero value-length. Use the multi-bucket formula (attempt to use
|
||||
single-bucket formula will cause divison by zero).
|
||||
|
||||
For more details see [re_zero_length_buckets] above.
|
||||
*/
|
||||
if (max == min && get_value(max) == ((max==0)? 0 : get_value(max-1)))
|
||||
max++;
|
||||
|
||||
if (max > min)
|
||||
{
|
||||
/*
|
||||
The value occupies multiple buckets. Use start_bucket ... end_bucket as
|
||||
selectivity.
|
||||
*/
|
||||
double bucket_sel= 1.0/(get_width() + 1);
|
||||
sel= bucket_sel * (max - min + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
The value 'pos' fits within one single histogram bucket.
|
||||
|
||||
Histogram buckets have the same numbers of rows, but they cover
|
||||
different ranges of values.
|
||||
|
||||
We assume that values are uniformly distributed across the [0..1] value
|
||||
range.
|
||||
*/
|
||||
|
||||
/*
|
||||
If all buckets covered value ranges of the same size, the width of
|
||||
value range would be:
|
||||
*/
|
||||
double avg_bucket_width= 1.0 / (get_width() + 1);
|
||||
|
||||
/*
|
||||
Let's see what is the width of value range that our bucket is covering.
|
||||
(min==max currently. they are kept in the formula just in case we
|
||||
will want to extend it to handle multi-bucket case)
|
||||
*/
|
||||
double inv_prec_factor= (double) 1.0 / prec_factor();
|
||||
double current_bucket_width=
|
||||
(max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) -
|
||||
(min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor));
|
||||
|
||||
DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */
|
||||
|
||||
/*
|
||||
So:
|
||||
- each bucket has the same #rows
|
||||
- values are unformly distributed across the [min_value,max_value] domain.
|
||||
|
||||
If a bucket has value range that's N times bigger then average, than
|
||||
each value will have to have N times fewer rows than average.
|
||||
*/
|
||||
sel= avg_sel * avg_bucket_width / current_bucket_width;
|
||||
|
||||
/*
|
||||
(Q: if we just follow this proportion we may end up in a situation
|
||||
where number of different values we expect to find in this bucket
|
||||
exceeds the number of rows that this histogram has in a bucket. Are
|
||||
we ok with this or we would want to have certain caps?)
|
||||
*/
|
||||
}
|
||||
return sel;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user