1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-06 13:46:51 +03:00

Add support for multivariate MCV lists

Introduce a third extended statistic type, supported by the CREATE
STATISTICS command - MCV lists, a generalization of the statistic
already built and used for individual columns.

Compared to the already supported types (n-distinct coefficients and
functional dependencies), MCV lists are more complex, include column
values and allow estimation of a much wider range of common clauses
(equality and inequality conditions, IS NULL, IS NOT NULL etc.).
Similarly to the other types, a new pseudo-type (pg_mcv_list) is used.

Author: Tomas Vondra
Reviewed-by: Dean Rasheed, David Rowley, Mark Dilger, Alvaro Herrera
Discussion: https://postgr.es/m/dfdac334-9cf2-2597-fb27-f0fb3753f435@2ndquadrant.com
This commit is contained in:
Tomas Vondra
2019-03-27 18:32:18 +01:00
parent 333ed246c6
commit 7300a69950
32 changed files with 3597 additions and 134 deletions

View File

@@ -60,17 +60,67 @@ static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root,
*
* See clause_selectivity() for the meaning of the additional parameters.
*
* The basic approach is to apply extended statistics first, on as many
* clauses as possible, in order to capture cross-column dependencies etc.
* The remaining clauses are then estimated using regular statistics tracked
* for individual columns. This is done by simply passing the clauses to
* clauselist_selectivity_simple.
*/
Selectivity
clauselist_selectivity(PlannerInfo *root,
					   List *clauses,
					   int varRelid,
					   JoinType jointype,
					   SpecialJoinInfo *sjinfo)
{
	/*
	 * Set of 0-based list positions of the clauses that extended statistics
	 * have already accounted for; the simple estimator is told to skip them.
	 */
	Bitmapset  *estimatedclauses = NULL;
	Selectivity ext_sel = 1.0;
	Selectivity simple_sel;
	RelOptInfo *rel;

	/*
	 * Extended statistics are only considered when the whole clause list
	 * refers to one relation, that relation is a plain table, and it has at
	 * least one extended statistics object attached.
	 */
	rel = find_single_rel_for_clauses(root, clauses);

	if (rel != NULL &&
		rel->rtekind == RTE_RELATION &&
		rel->statlist != NIL)
	{
		/*
		 * Let the extended statistics handle as many clauses as they can.
		 * The positions of the clauses consumed here are recorded in
		 * 'estimatedclauses'.
		 */
		ext_sel *= statext_clauselist_selectivity(root, clauses, varRelid,
												  jointype, sjinfo, rel,
												  &estimatedclauses);
	}

	/*
	 * Whatever is left gets the regular per-column treatment; clauses
	 * flagged in 'estimatedclauses' are skipped there.
	 */
	simple_sel = clauselist_selectivity_simple(root, clauses, varRelid,
											   jointype, sjinfo,
											   estimatedclauses);

	return ext_sel * simple_sel;
}
/*
* clauselist_selectivity_simple -
* Compute the selectivity of an implicitly-ANDed list of boolean
* expression clauses. The list can be empty, in which case 1.0
* must be returned. List elements may be either RestrictInfos
* or bare expression clauses --- the former is preferred since
* it allows caching of results. The estimatedclauses bitmap tracks
* clauses that have already been estimated by other means.
*
* See clause_selectivity() for the meaning of the additional parameters.
*
* Our basic approach is to take the product of the selectivities of the
* subclauses. However, that's only right if the subclauses have independent
* probabilities, and in reality they are often NOT independent. So,
* we want to be smarter where we can.
*
* If the clauses taken together refer to just one relation, we'll try to
* apply selectivity estimates using any extended statistics for that rel.
* Currently we only have (soft) functional dependencies, so apply these in as
* many cases as possible, and fall back on normal estimates for remaining
* clauses.
*
* We also recognize "range queries", such as "x > 34 AND x < 42". Clauses
* are recognized as possible range query components if they are restriction
* opclauses whose operators have scalarltsel or a related function as their
@@ -98,54 +148,29 @@ static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root,
* selectivity functions; perhaps some day we can generalize the approach.
*/
Selectivity
clauselist_selectivity(PlannerInfo *root,
List *clauses,
int varRelid,
JoinType jointype,
SpecialJoinInfo *sjinfo)
clauselist_selectivity_simple(PlannerInfo *root,
List *clauses,
int varRelid,
JoinType jointype,
SpecialJoinInfo *sjinfo,
Bitmapset *estimatedclauses)
{
Selectivity s1 = 1.0;
RelOptInfo *rel;
Bitmapset *estimatedclauses = NULL;
RangeQueryClause *rqlist = NULL;
ListCell *l;
int listidx;
/*
* If there's exactly one clause, just go directly to
* clause_selectivity(). None of what we might do below is relevant.
* If there's exactly one clause (and it was not estimated yet), just
* go directly to clause_selectivity(). None of what we might do below
* is relevant.
*/
if (list_length(clauses) == 1)
if ((list_length(clauses) == 1) &&
bms_num_members(estimatedclauses) == 0)
return clause_selectivity(root, (Node *) linitial(clauses),
varRelid, jointype, sjinfo);
/*
* Determine if these clauses reference a single relation. If so, and if
* it has extended statistics, try to apply those.
*/
rel = find_single_rel_for_clauses(root, clauses);
if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
{
/*
* Perform selectivity estimations on any clauses found applicable by
* dependencies_clauselist_selectivity. 'estimatedclauses' will be
* filled with the 0-based list positions of clauses used that way, so
* that we can ignore them below.
*/
s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid,
jointype, sjinfo, rel,
&estimatedclauses);
/*
* This would be the place to apply any other types of extended
* statistics selectivity estimations for remaining clauses.
*/
}
/*
* Apply normal selectivity estimates for remaining clauses. We'll be
* careful to skip any clauses which were already estimated above.
*
* Anything that doesn't look like a potential rangequery clause gets
* multiplied into s1 and forgotten. Anything that does gets inserted into
* an rqlist entry.

View File

@@ -1363,6 +1363,18 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
stainfos = lcons(info, stainfos);
}
if (statext_is_kind_built(htup, STATS_EXT_MCV))
{
StatisticExtInfo *info = makeNode(StatisticExtInfo);
info->statOid = statOid;
info->rel = rel;
info->kind = STATS_EXT_MCV;
info->keys = bms_copy(keys);
stainfos = lcons(info, stainfos);
}
ReleaseSysCache(htup);
bms_free(keys);
}