1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-09 22:41:56 +03:00

Collect and use multi-column dependency stats

Follow on patch in the multi-variate statistics patch series.

CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t;
ANALYZE;
will collect dependency stats on (a, b) and then use the measured
dependency in subsequent query planning.

Commit 7b504eb282 added
CREATE STATISTICS with n-distinct coefficients. These are now
specified using the mutually exclusive option WITH (ndistinct).

Author: Tomas Vondra, David Rowley
Reviewed-by: Kyotaro HORIGUCHI, Álvaro Herrera, Dean Rasheed, Robert Haas
and many other comments and contributions
Discussion: https://postgr.es/m/56f40b20-c464-fad2-ff39-06b668fac47c@2ndquadrant.com
This commit is contained in:
Simon Riggs
2017-04-05 18:00:42 -04:00
parent 00b6b6feb1
commit 2686ee1b7c
31 changed files with 2035 additions and 79 deletions

View File

@ -22,6 +22,7 @@
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
#include "statistics/statistics.h"
/*
@ -60,23 +61,30 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
* subclauses. However, that's only right if the subclauses have independent
* probabilities, and in reality they are often NOT independent. So,
* we want to be smarter where we can.
* Currently, the only extra smarts we have is to recognize "range queries",
* such as "x > 34 AND x < 42". Clauses are recognized as possible range
* query components if they are restriction opclauses whose operators have
* scalarltsel() or scalargtsel() as their restriction selectivity estimator.
* We pair up clauses of this form that refer to the same variable. An
* unpairable clause of this kind is simply multiplied into the selectivity
* product in the normal way. But when we find a pair, we know that the
* selectivities represent the relative positions of the low and high bounds
* within the column's range, so instead of figuring the selectivity as
* hisel * losel, we can figure it as hisel + losel - 1. (To visualize this,
* see that hisel is the fraction of the range below the high bound, while
* losel is the fraction above the low bound; so hisel can be interpreted
* directly as a 0..1 value but we need to convert losel to 1-losel before
* interpreting it as a value. Then the available range is 1-losel to hisel.
* However, this calculation double-excludes nulls, so really we need
* hisel + losel + null_frac - 1.)
*
* When 'rel' is not null and rtekind = RTE_RELATION, we'll try to apply
* selectivity estimates using any extended statistcs on 'rel'.
*
* If we identify such extended statistics exist, we try to apply them.
* Currently we only have (soft) functional dependencies, so apply these in as
* many cases as possible, and fall back on normal estimates for remaining
* clauses.
*
* We also recognize "range queries", such as "x > 34 AND x < 42". Clauses
* are recognized as possible range query components if they are restriction
* opclauses whose operators have scalarltsel() or scalargtsel() as their
* restriction selectivity estimator. We pair up clauses of this form that
* refer to the same variable. An unpairable clause of this kind is simply
* multiplied into the selectivity product in the normal way. But when we
* find a pair, we know that the selectivities represent the relative
* positions of the low and high bounds within the column's range, so instead
* of figuring the selectivity as hisel * losel, we can figure it as hisel +
* losel - 1. (To visualize this, see that hisel is the fraction of the range
* below the high bound, while losel is the fraction above the low bound; so
* hisel can be interpreted directly as a 0..1 value but we need to convert
* losel to 1-losel before interpreting it as a value. Then the available
* range is 1-losel to hisel. However, this calculation double-excludes
* nulls, so really we need hisel + losel + null_frac - 1.)
*
* If either selectivity is exactly DEFAULT_INEQ_SEL, we forget this equation
* and instead use DEFAULT_RANGE_INEQ_SEL. The same applies if the equation
@ -93,33 +101,70 @@ clauselist_selectivity(PlannerInfo *root,
List *clauses,
int varRelid,
JoinType jointype,
SpecialJoinInfo *sjinfo)
SpecialJoinInfo *sjinfo,
RelOptInfo *rel)
{
Selectivity s1 = 1.0;
RangeQueryClause *rqlist = NULL;
ListCell *l;
Bitmapset *estimatedclauses = NULL;
int listidx;
/*
* If there's exactly one clause, then no use in trying to match up pairs,
* so just go directly to clause_selectivity().
* If there's exactly one clause, then extended statistics is futile at
* this level (we might be able to apply them later if it's AND/OR
* clause). So just go directly to clause_selectivity().
*/
if (list_length(clauses) == 1)
return clause_selectivity(root, (Node *) linitial(clauses),
varRelid, jointype, sjinfo);
varRelid, jointype, sjinfo, rel);
/*
* Initial scan over clauses. Anything that doesn't look like a potential
* rangequery clause gets multiplied into s1 and forgotten. Anything that
* does gets inserted into an rqlist entry.
* When a relation of RTE_RELATION is given as 'rel', we'll try to
* perform selectivity estimation using extended statistics.
*/
if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
{
/*
* Perform selectivity estimations on any clauses found applicable by
* dependencies_clauselist_selectivity. The 0-based list position of
* estimated clauses will be populated in 'estimatedclauses'.
*/
s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid,
jointype, sjinfo, rel, &estimatedclauses);
/*
* This would be the place to apply any other types of extended
* statistics selectivity estimations for remaining clauses.
*/
}
/*
* Apply normal selectivity estimates for remaining clauses. We'll be
* careful to skip any clauses which were already estimated above.
*
* Anything that doesn't look like a potential rangequery clause gets
* multiplied into s1 and forgotten. Anything that does gets inserted into
* an rqlist entry.
*/
listidx = -1;
foreach(l, clauses)
{
Node *clause = (Node *) lfirst(l);
RestrictInfo *rinfo;
Selectivity s2;
listidx++;
/*
* Skip this clause if it's already been estimated by some other
* statistics above.
*/
if (bms_is_member(listidx, estimatedclauses))
continue;
/* Always compute the selectivity using clause_selectivity */
s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);
s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo, rel);
/*
* Check for being passed a RestrictInfo.
@ -484,7 +529,8 @@ clause_selectivity(PlannerInfo *root,
Node *clause,
int varRelid,
JoinType jointype,
SpecialJoinInfo *sjinfo)
SpecialJoinInfo *sjinfo,
RelOptInfo *rel)
{
Selectivity s1 = 0.5; /* default for any unhandled clause type */
RestrictInfo *rinfo = NULL;
@ -604,7 +650,8 @@ clause_selectivity(PlannerInfo *root,
(Node *) get_notclausearg((Expr *) clause),
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
}
else if (and_clause(clause))
{
@ -613,7 +660,8 @@ clause_selectivity(PlannerInfo *root,
((BoolExpr *) clause)->args,
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
}
else if (or_clause(clause))
{
@ -632,7 +680,8 @@ clause_selectivity(PlannerInfo *root,
(Node *) lfirst(arg),
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
s1 = s1 + s2 - s1 * s2;
}
@ -725,7 +774,8 @@ clause_selectivity(PlannerInfo *root,
(Node *) ((RelabelType *) clause)->arg,
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
}
else if (IsA(clause, CoerceToDomain))
{
@ -734,7 +784,8 @@ clause_selectivity(PlannerInfo *root,
(Node *) ((CoerceToDomain *) clause)->arg,
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
}
else
{

View File

@ -3750,7 +3750,8 @@ compute_semi_anti_join_factors(PlannerInfo *root,
joinquals,
0,
jointype,
sjinfo);
sjinfo,
NULL);
/*
* Also get the normal inner-join selectivity of the join clauses.
@ -3773,7 +3774,8 @@ compute_semi_anti_join_factors(PlannerInfo *root,
joinquals,
0,
JOIN_INNER,
&norm_sjinfo);
&norm_sjinfo,
NULL);
/* Avoid leaking a lot of ListCells */
if (jointype == JOIN_ANTI)
@ -3940,7 +3942,7 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
Node *qual = (Node *) lfirst(l);
/* Note that clause_selectivity will be able to cache its result */
selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo, NULL);
}
/* Apply it to the input relation sizes */
@ -3976,7 +3978,8 @@ set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
rel->baserestrictinfo,
0,
JOIN_INNER,
NULL);
NULL,
rel);
rel->rows = clamp_row_est(nrows);
@ -4013,7 +4016,8 @@ get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
allclauses,
rel->relid, /* do not use 0! */
JOIN_INNER,
NULL);
NULL,
rel);
nrows = clamp_row_est(nrows);
/* For safety, make sure result is not more than the base estimate */
if (nrows > rel->rows)
@ -4179,12 +4183,14 @@ calc_joinrel_size_estimate(PlannerInfo *root,
joinquals,
0,
jointype,
sjinfo);
sjinfo,
NULL);
pselec = clauselist_selectivity(root,
pushedquals,
0,
jointype,
sjinfo);
sjinfo,
NULL);
/* Avoid leaking a lot of ListCells */
list_free(joinquals);
@ -4196,7 +4202,8 @@ calc_joinrel_size_estimate(PlannerInfo *root,
restrictlist,
0,
jointype,
sjinfo);
sjinfo,
NULL);
pselec = 0.0; /* not used, keep compiler quiet */
}
@ -4491,7 +4498,7 @@ get_foreign_key_join_selectivity(PlannerInfo *root,
Selectivity csel;
csel = clause_selectivity(root, (Node *) rinfo,
0, jointype, sjinfo);
0, jointype, sjinfo, NULL);
thisfksel = Min(thisfksel, csel);
}
fkselec *= thisfksel;

View File

@ -280,7 +280,7 @@ consider_new_or_clause(PlannerInfo *root, RelOptInfo *rel,
* saving work later.)
*/
or_selec = clause_selectivity(root, (Node *) or_rinfo,
0, JOIN_INNER, NULL);
0, JOIN_INNER, NULL, rel);
/*
* The clause is only worth adding to the query if it rejects a useful
@ -344,7 +344,7 @@ consider_new_or_clause(PlannerInfo *root, RelOptInfo *rel,
/* Compute inner-join size */
orig_selec = clause_selectivity(root, (Node *) join_or_rinfo,
0, JOIN_INNER, &sjinfo);
0, JOIN_INNER, &sjinfo, NULL);
/* And hack cached selectivity so join size remains the same */
join_or_rinfo->norm_selec = orig_selec / or_selec;

View File

@ -1308,6 +1308,18 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
stainfos = lcons(info, stainfos);
}
if (statext_is_kind_built(htup, STATS_EXT_DEPENDENCIES))
{
StatisticExtInfo *info = makeNode(StatisticExtInfo);
info->statOid = statOid;
info->rel = rel;
info->kind = STATS_EXT_DEPENDENCIES;
info->keys = bms_copy(keys);
stainfos = lcons(info, stainfos);
}
ReleaseSysCache(htup);
bms_free(keys);
}