mirror of
https://github.com/postgres/postgres.git
synced 2025-04-27 22:56:53 +03:00
Change patternsel() so that instead of switching from a pure pattern-examination heuristic method to purely histogram-driven selectivity at histogram size 100, we compute both estimates and use a weighted average.
The weight put on the heuristic estimate decreases linearly with histogram size, dropping to zero for 100 or more histogram entries. Likewise in ltreeparentsel(). After a patch by Greg Stark, though I reorganized the logic a bit to give the caller of histogram_selectivity() more control.
This commit is contained in:
parent
422495d0da
commit
f4230d2937
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* op function for ltree
|
* op function for ltree
|
||||||
* Teodor Sigaev <teodor@stack.net>
|
* Teodor Sigaev <teodor@stack.net>
|
||||||
* $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.16 2007/02/28 22:44:38 tgl Exp $
|
* $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.17 2008/03/09 00:32:09 tgl Exp $
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "ltree.h"
|
#include "ltree.h"
|
||||||
@ -609,6 +609,7 @@ ltreeparentsel(PG_FUNCTION_ARGS)
|
|||||||
double mcvsum;
|
double mcvsum;
|
||||||
double mcvsel;
|
double mcvsel;
|
||||||
double nullfrac;
|
double nullfrac;
|
||||||
|
int hist_size;
|
||||||
|
|
||||||
fmgr_info(get_opcode(operator), &contproc);
|
fmgr_info(get_opcode(operator), &contproc);
|
||||||
|
|
||||||
@ -626,20 +627,30 @@ ltreeparentsel(PG_FUNCTION_ARGS)
|
|||||||
*/
|
*/
|
||||||
selec = histogram_selectivity(&vardata, &contproc,
|
selec = histogram_selectivity(&vardata, &contproc,
|
||||||
constval, varonleft,
|
constval, varonleft,
|
||||||
100, 1);
|
10, 1, &hist_size);
|
||||||
if (selec < 0)
|
if (selec < 0)
|
||||||
{
|
{
|
||||||
/* Nope, fall back on default */
|
/* Nope, fall back on default */
|
||||||
selec = DEFAULT_PARENT_SEL;
|
selec = DEFAULT_PARENT_SEL;
|
||||||
}
|
}
|
||||||
else
|
else if (hist_size < 100)
|
||||||
{
|
{
|
||||||
/* Yes, but don't believe extremely small or large estimates. */
|
/*
|
||||||
|
* For histogram sizes from 10 to 100, we combine the
|
||||||
|
* histogram and default selectivities, putting increasingly
|
||||||
|
* more trust in the histogram for larger sizes.
|
||||||
|
*/
|
||||||
|
double hist_weight = hist_size / 100.0;
|
||||||
|
|
||||||
|
selec = selec * hist_weight +
|
||||||
|
DEFAULT_PARENT_SEL * (1.0 - hist_weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* In any case, don't believe extremely small or large estimates. */
|
||||||
if (selec < 0.0001)
|
if (selec < 0.0001)
|
||||||
selec = 0.0001;
|
selec = 0.0001;
|
||||||
else if (selec > 0.9999)
|
else if (selec > 0.9999)
|
||||||
selec = 0.9999;
|
selec = 0.9999;
|
||||||
}
|
|
||||||
|
|
||||||
if (HeapTupleIsValid(vardata.statsTuple))
|
if (HeapTupleIsValid(vardata.statsTuple))
|
||||||
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
|
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.245 2008/03/09 00:32:09 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -567,17 +567,23 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
|||||||
* or not it has anything to do with the histogram sort operator. We are
|
* or not it has anything to do with the histogram sort operator. We are
|
||||||
* essentially using the histogram just as a representative sample. However,
|
* essentially using the histogram just as a representative sample. However,
|
||||||
* small histograms are unlikely to be all that representative, so the caller
|
* small histograms are unlikely to be all that representative, so the caller
|
||||||
* should specify a minimum histogram size to use, and fall back on some
|
* should be prepared to fall back on some other estimation approach when the
|
||||||
* other approach if this routine fails.
|
* histogram is missing or very small. It may also be prudent to combine this
|
||||||
|
* approach with another one when the histogram is small.
|
||||||
*
|
*
|
||||||
* The caller also specifies n_skip, which causes us to ignore the first and
|
* If the actual histogram size is not at least min_hist_size, we won't bother
|
||||||
* last n_skip histogram elements, on the grounds that they are outliers and
|
* to do the calculation at all. Also, if the n_skip parameter is > 0, we
|
||||||
* hence not very representative. If in doubt, min_hist_size = 100 and
|
* ignore the first and last n_skip histogram elements, on the grounds that
|
||||||
* n_skip = 1 are reasonable values.
|
* they are outliers and hence not very representative. Typical values for
|
||||||
|
* these parameters are 10 and 1.
|
||||||
*
|
*
|
||||||
* The function result is the selectivity, or -1 if there is no histogram
|
* The function result is the selectivity, or -1 if there is no histogram
|
||||||
* or it's smaller than min_hist_size.
|
* or it's smaller than min_hist_size.
|
||||||
*
|
*
|
||||||
|
* The output parameter *hist_size receives the actual histogram size,
|
||||||
|
* or zero if no histogram. Callers may use this number to decide how
|
||||||
|
* much faith to put in the function result.
|
||||||
|
*
|
||||||
* Note that the result disregards both the most-common-values (if any) and
|
* Note that the result disregards both the most-common-values (if any) and
|
||||||
* null entries. The caller is expected to combine this result with
|
* null entries. The caller is expected to combine this result with
|
||||||
* statistics for those portions of the column population. It may also be
|
* statistics for those portions of the column population. It may also be
|
||||||
@ -586,7 +592,8 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
|||||||
double
|
double
|
||||||
histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
||||||
Datum constval, bool varonleft,
|
Datum constval, bool varonleft,
|
||||||
int min_hist_size, int n_skip)
|
int min_hist_size, int n_skip,
|
||||||
|
int *hist_size)
|
||||||
{
|
{
|
||||||
double result;
|
double result;
|
||||||
Datum *values;
|
Datum *values;
|
||||||
@ -603,6 +610,7 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
|||||||
&values, &nvalues,
|
&values, &nvalues,
|
||||||
NULL, NULL))
|
NULL, NULL))
|
||||||
{
|
{
|
||||||
|
*hist_size = nvalues;
|
||||||
if (nvalues >= min_hist_size)
|
if (nvalues >= min_hist_size)
|
||||||
{
|
{
|
||||||
int nmatch = 0;
|
int nmatch = 0;
|
||||||
@ -626,7 +634,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
|||||||
free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
|
free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
|
*hist_size = 0;
|
||||||
result = -1;
|
result = -1;
|
||||||
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -1117,13 +1128,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
|
|||||||
* selectivity of the fixed prefix and remainder of pattern
|
* selectivity of the fixed prefix and remainder of pattern
|
||||||
* separately, then combine the two to get an estimate of the
|
* separately, then combine the two to get an estimate of the
|
||||||
* selectivity for the part of the column population represented by
|
* selectivity for the part of the column population represented by
|
||||||
* the histogram. We then add up data for any most-common-values
|
* the histogram. (For small histograms, we combine these approaches.)
|
||||||
* values; these are not in the histogram population, and we can get
|
*
|
||||||
* exact answers for them by applying the pattern operator, so there's
|
* We then add up data for any most-common values (MCVs); these are
|
||||||
* no reason to approximate. (If the MCVs cover a significant part of
|
* not in the histogram population, and we can get exact answers for
|
||||||
* the total population, this gives us a big leg up in accuracy.)
|
* them by applying the pattern operator, so there's no reason to
|
||||||
|
* approximate. (If the MCVs cover a significant part of the total
|
||||||
|
* population, this gives us a big leg up in accuracy.)
|
||||||
*/
|
*/
|
||||||
Selectivity selec;
|
Selectivity selec;
|
||||||
|
int hist_size;
|
||||||
FmgrInfo opproc;
|
FmgrInfo opproc;
|
||||||
double nullfrac,
|
double nullfrac,
|
||||||
mcv_selec,
|
mcv_selec,
|
||||||
@ -1133,10 +1147,12 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
|
|||||||
fmgr_info(get_opcode(operator), &opproc);
|
fmgr_info(get_opcode(operator), &opproc);
|
||||||
|
|
||||||
selec = histogram_selectivity(&vardata, &opproc, constval, true,
|
selec = histogram_selectivity(&vardata, &opproc, constval, true,
|
||||||
100, 1);
|
10, 1, &hist_size);
|
||||||
if (selec < 0)
|
|
||||||
|
/* If fewer than 100 entries, also compute the heuristic estimate */
|
||||||
|
if (hist_size < 100)
|
||||||
{
|
{
|
||||||
/* Nope, so fake it with the heuristic method */
|
Selectivity heursel;
|
||||||
Selectivity prefixsel;
|
Selectivity prefixsel;
|
||||||
Selectivity restsel;
|
Selectivity restsel;
|
||||||
|
|
||||||
@ -1146,16 +1162,28 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
|
|||||||
else
|
else
|
||||||
prefixsel = 1.0;
|
prefixsel = 1.0;
|
||||||
restsel = pattern_selectivity(rest, ptype);
|
restsel = pattern_selectivity(rest, ptype);
|
||||||
selec = prefixsel * restsel;
|
heursel = prefixsel * restsel;
|
||||||
}
|
|
||||||
|
if (selec < 0) /* fewer than 10 histogram entries? */
|
||||||
|
selec = heursel;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* Yes, but don't believe extremely small or large estimates. */
|
/*
|
||||||
|
* For histogram sizes from 10 to 100, we combine the
|
||||||
|
* histogram and heuristic selectivities, putting increasingly
|
||||||
|
* more trust in the histogram for larger sizes.
|
||||||
|
*/
|
||||||
|
double hist_weight = hist_size / 100.0;
|
||||||
|
|
||||||
|
selec = selec * hist_weight + heursel * (1.0 - hist_weight);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* In any case, don't believe extremely small or large estimates. */
|
||||||
if (selec < 0.0001)
|
if (selec < 0.0001)
|
||||||
selec = 0.0001;
|
selec = 0.0001;
|
||||||
else if (selec > 0.9999)
|
else if (selec > 0.9999)
|
||||||
selec = 0.9999;
|
selec = 0.9999;
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we have most-common-values info, add up the fractions of the MCV
|
* If we have most-common-values info, add up the fractions of the MCV
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.43 2008/01/01 19:45:59 momjian Exp $
|
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.44 2008/03/09 00:32:09 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -112,7 +112,8 @@ extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
|||||||
double *sumcommonp);
|
double *sumcommonp);
|
||||||
extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
|
||||||
Datum constval, bool varonleft,
|
Datum constval, bool varonleft,
|
||||||
int min_hist_size, int n_skip);
|
int min_hist_size, int n_skip,
|
||||||
|
int *hist_size);
|
||||||
|
|
||||||
extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
|
extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
|
||||||
Pattern_Type ptype,
|
Pattern_Type ptype,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user