1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-28 18:48:04 +03:00

Collect and use multi-column dependency stats

Follow on patch in the multi-variate statistics patch series.

CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t;
ANALYZE;
will collect dependency stats on (a, b) and then use the measured
dependency in subsequent query planning.

Commit 7b504eb282 added
CREATE STATISTICS with n-distinct coefficients. These are now
specified using the mutually exclusive option WITH (ndistinct).

Author: Tomas Vondra, David Rowley
Reviewed-by: Kyotaro HORIGUCHI, Álvaro Herrera, Dean Rasheed, Robert Haas
and many other comments and contributions
Discussion: https://postgr.es/m/56f40b20-c464-fad2-ff39-06b668fac47c@2ndquadrant.com
This commit is contained in:
Simon Riggs
2017-04-05 18:00:42 -04:00
parent 00b6b6feb1
commit 2686ee1b7c
31 changed files with 2035 additions and 79 deletions

View File

@@ -192,7 +192,8 @@ CREATE VIEW pg_stats_ext AS
C.relname AS tablename,
S.staname AS staname,
S.stakeys AS attnums,
length(s.standistinct) AS ndistbytes
length(s.standistinct::bytea) AS ndistbytes,
length(S.stadependencies::bytea) AS depsbytes
FROM (pg_statistic_ext S JOIN pg_class C ON (C.oid = S.starelid))
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);

View File

@@ -62,10 +62,11 @@ CreateStatistics(CreateStatsStmt *stmt)
Oid relid;
ObjectAddress parentobject,
childobject;
Datum types[1]; /* only ndistinct defined now */
Datum types[2]; /* one for each possible type of statistics */
int ntypes;
ArrayType *staenabled;
bool build_ndistinct;
bool build_dependencies;
bool requested_type = false;
Assert(IsA(stmt, CreateStatsStmt));
@@ -159,7 +160,7 @@ CreateStatistics(CreateStatsStmt *stmt)
errmsg("statistics require at least 2 columns")));
/*
* Sort the attnums, which makes detecting duplicies somewhat easier, and
* Sort the attnums, which makes detecting duplicities somewhat easier, and
* it does not hurt (it does not affect the efficiency, unlike for
* indexes, for example).
*/
@@ -182,6 +183,7 @@ CreateStatistics(CreateStatsStmt *stmt)
* recognized.
*/
build_ndistinct = false;
build_dependencies = false;
foreach(l, stmt->options)
{
DefElem *opt = (DefElem *) lfirst(l);
@@ -191,6 +193,11 @@ CreateStatistics(CreateStatsStmt *stmt)
build_ndistinct = defGetBoolean(opt);
requested_type = true;
}
else if (strcmp(opt->defname, "dependencies") == 0)
{
build_dependencies = defGetBoolean(opt);
requested_type = true;
}
else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
@@ -199,12 +206,17 @@ CreateStatistics(CreateStatsStmt *stmt)
}
/* If no statistic type was specified, build them all. */
if (!requested_type)
{
build_ndistinct = true;
build_dependencies = true;
}
/* construct the char array of enabled statistic types */
ntypes = 0;
if (build_ndistinct)
types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT);
if (build_dependencies)
types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
Assert(ntypes > 0);
staenabled = construct_array(types, ntypes, CHAROID, 1, true, 'c');
@@ -222,6 +234,7 @@ CreateStatistics(CreateStatsStmt *stmt)
/* no statistics build yet */
nulls[Anum_pg_statistic_ext_standistinct - 1] = true;
nulls[Anum_pg_statistic_ext_stadependencies - 1] = true;
/* insert it into pg_statistic_ext */
statrel = heap_open(StatisticExtRelationId, RowExclusiveLock);

View File

@@ -22,6 +22,7 @@
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
#include "statistics/statistics.h"
/*
@@ -60,23 +61,30 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
* subclauses. However, that's only right if the subclauses have independent
* probabilities, and in reality they are often NOT independent. So,
* we want to be smarter where we can.
* Currently, the only extra smarts we have is to recognize "range queries",
* such as "x > 34 AND x < 42". Clauses are recognized as possible range
* query components if they are restriction opclauses whose operators have
* scalarltsel() or scalargtsel() as their restriction selectivity estimator.
* We pair up clauses of this form that refer to the same variable. An
* unpairable clause of this kind is simply multiplied into the selectivity
* product in the normal way. But when we find a pair, we know that the
* selectivities represent the relative positions of the low and high bounds
* within the column's range, so instead of figuring the selectivity as
* hisel * losel, we can figure it as hisel + losel - 1. (To visualize this,
* see that hisel is the fraction of the range below the high bound, while
* losel is the fraction above the low bound; so hisel can be interpreted
* directly as a 0..1 value but we need to convert losel to 1-losel before
* interpreting it as a value. Then the available range is 1-losel to hisel.
* However, this calculation double-excludes nulls, so really we need
* hisel + losel + null_frac - 1.)
*
* When 'rel' is not null and rtekind = RTE_RELATION, we'll try to apply
* selectivity estimates using any extended statistcs on 'rel'.
*
* If we identify such extended statistics exist, we try to apply them.
* Currently we only have (soft) functional dependencies, so apply these in as
* many cases as possible, and fall back on normal estimates for remaining
* clauses.
*
* We also recognize "range queries", such as "x > 34 AND x < 42". Clauses
* are recognized as possible range query components if they are restriction
* opclauses whose operators have scalarltsel() or scalargtsel() as their
* restriction selectivity estimator. We pair up clauses of this form that
* refer to the same variable. An unpairable clause of this kind is simply
* multiplied into the selectivity product in the normal way. But when we
* find a pair, we know that the selectivities represent the relative
* positions of the low and high bounds within the column's range, so instead
* of figuring the selectivity as hisel * losel, we can figure it as hisel +
* losel - 1. (To visualize this, see that hisel is the fraction of the range
* below the high bound, while losel is the fraction above the low bound; so
* hisel can be interpreted directly as a 0..1 value but we need to convert
* losel to 1-losel before interpreting it as a value. Then the available
* range is 1-losel to hisel. However, this calculation double-excludes
* nulls, so really we need hisel + losel + null_frac - 1.)
*
* If either selectivity is exactly DEFAULT_INEQ_SEL, we forget this equation
* and instead use DEFAULT_RANGE_INEQ_SEL. The same applies if the equation
@@ -93,33 +101,70 @@ clauselist_selectivity(PlannerInfo *root,
List *clauses,
int varRelid,
JoinType jointype,
SpecialJoinInfo *sjinfo)
SpecialJoinInfo *sjinfo,
RelOptInfo *rel)
{
Selectivity s1 = 1.0;
RangeQueryClause *rqlist = NULL;
ListCell *l;
Bitmapset *estimatedclauses = NULL;
int listidx;
/*
* If there's exactly one clause, then no use in trying to match up pairs,
* so just go directly to clause_selectivity().
* If there's exactly one clause, then extended statistics is futile at
* this level (we might be able to apply them later if it's AND/OR
* clause). So just go directly to clause_selectivity().
*/
if (list_length(clauses) == 1)
return clause_selectivity(root, (Node *) linitial(clauses),
varRelid, jointype, sjinfo);
varRelid, jointype, sjinfo, rel);
/*
* Initial scan over clauses. Anything that doesn't look like a potential
* rangequery clause gets multiplied into s1 and forgotten. Anything that
* does gets inserted into an rqlist entry.
* When a relation of RTE_RELATION is given as 'rel', we'll try to
* perform selectivity estimation using extended statistics.
*/
if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
{
/*
* Perform selectivity estimations on any clauses found applicable by
* dependencies_clauselist_selectivity. The 0-based list position of
* estimated clauses will be populated in 'estimatedclauses'.
*/
s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid,
jointype, sjinfo, rel, &estimatedclauses);
/*
* This would be the place to apply any other types of extended
* statistics selectivity estimations for remaining clauses.
*/
}
/*
* Apply normal selectivity estimates for remaining clauses. We'll be
* careful to skip any clauses which were already estimated above.
*
* Anything that doesn't look like a potential rangequery clause gets
* multiplied into s1 and forgotten. Anything that does gets inserted into
* an rqlist entry.
*/
listidx = -1;
foreach(l, clauses)
{
Node *clause = (Node *) lfirst(l);
RestrictInfo *rinfo;
Selectivity s2;
listidx++;
/*
* Skip this clause if it's already been estimated by some other
* statistics above.
*/
if (bms_is_member(listidx, estimatedclauses))
continue;
/* Always compute the selectivity using clause_selectivity */
s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);
s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo, rel);
/*
* Check for being passed a RestrictInfo.
@@ -484,7 +529,8 @@ clause_selectivity(PlannerInfo *root,
Node *clause,
int varRelid,
JoinType jointype,
SpecialJoinInfo *sjinfo)
SpecialJoinInfo *sjinfo,
RelOptInfo *rel)
{
Selectivity s1 = 0.5; /* default for any unhandled clause type */
RestrictInfo *rinfo = NULL;
@@ -604,7 +650,8 @@ clause_selectivity(PlannerInfo *root,
(Node *) get_notclausearg((Expr *) clause),
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
}
else if (and_clause(clause))
{
@@ -613,7 +660,8 @@ clause_selectivity(PlannerInfo *root,
((BoolExpr *) clause)->args,
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
}
else if (or_clause(clause))
{
@@ -632,7 +680,8 @@ clause_selectivity(PlannerInfo *root,
(Node *) lfirst(arg),
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
s1 = s1 + s2 - s1 * s2;
}
@@ -725,7 +774,8 @@ clause_selectivity(PlannerInfo *root,
(Node *) ((RelabelType *) clause)->arg,
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
}
else if (IsA(clause, CoerceToDomain))
{
@@ -734,7 +784,8 @@ clause_selectivity(PlannerInfo *root,
(Node *) ((CoerceToDomain *) clause)->arg,
varRelid,
jointype,
sjinfo);
sjinfo,
rel);
}
else
{

View File

@@ -3750,7 +3750,8 @@ compute_semi_anti_join_factors(PlannerInfo *root,
joinquals,
0,
jointype,
sjinfo);
sjinfo,
NULL);
/*
* Also get the normal inner-join selectivity of the join clauses.
@@ -3773,7 +3774,8 @@ compute_semi_anti_join_factors(PlannerInfo *root,
joinquals,
0,
JOIN_INNER,
&norm_sjinfo);
&norm_sjinfo,
NULL);
/* Avoid leaking a lot of ListCells */
if (jointype == JOIN_ANTI)
@@ -3940,7 +3942,7 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
Node *qual = (Node *) lfirst(l);
/* Note that clause_selectivity will be able to cache its result */
selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo, NULL);
}
/* Apply it to the input relation sizes */
@@ -3976,7 +3978,8 @@ set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
rel->baserestrictinfo,
0,
JOIN_INNER,
NULL);
NULL,
rel);
rel->rows = clamp_row_est(nrows);
@@ -4013,7 +4016,8 @@ get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
allclauses,
rel->relid, /* do not use 0! */
JOIN_INNER,
NULL);
NULL,
rel);
nrows = clamp_row_est(nrows);
/* For safety, make sure result is not more than the base estimate */
if (nrows > rel->rows)
@@ -4179,12 +4183,14 @@ calc_joinrel_size_estimate(PlannerInfo *root,
joinquals,
0,
jointype,
sjinfo);
sjinfo,
NULL);
pselec = clauselist_selectivity(root,
pushedquals,
0,
jointype,
sjinfo);
sjinfo,
NULL);
/* Avoid leaking a lot of ListCells */
list_free(joinquals);
@@ -4196,7 +4202,8 @@ calc_joinrel_size_estimate(PlannerInfo *root,
restrictlist,
0,
jointype,
sjinfo);
sjinfo,
NULL);
pselec = 0.0; /* not used, keep compiler quiet */
}
@@ -4491,7 +4498,7 @@ get_foreign_key_join_selectivity(PlannerInfo *root,
Selectivity csel;
csel = clause_selectivity(root, (Node *) rinfo,
0, jointype, sjinfo);
0, jointype, sjinfo, NULL);
thisfksel = Min(thisfksel, csel);
}
fkselec *= thisfksel;

View File

@@ -280,7 +280,7 @@ consider_new_or_clause(PlannerInfo *root, RelOptInfo *rel,
* saving work later.)
*/
or_selec = clause_selectivity(root, (Node *) or_rinfo,
0, JOIN_INNER, NULL);
0, JOIN_INNER, NULL, rel);
/*
* The clause is only worth adding to the query if it rejects a useful
@@ -344,7 +344,7 @@ consider_new_or_clause(PlannerInfo *root, RelOptInfo *rel,
/* Compute inner-join size */
orig_selec = clause_selectivity(root, (Node *) join_or_rinfo,
0, JOIN_INNER, &sjinfo);
0, JOIN_INNER, &sjinfo, NULL);
/* And hack cached selectivity so join size remains the same */
join_or_rinfo->norm_selec = orig_selec / or_selec;

View File

@@ -1308,6 +1308,18 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
stainfos = lcons(info, stainfos);
}
if (statext_is_kind_built(htup, STATS_EXT_DEPENDENCIES))
{
StatisticExtInfo *info = makeNode(StatisticExtInfo);
info->statOid = statOid;
info->rel = rel;
info->kind = STATS_EXT_DEPENDENCIES;
info->keys = bms_copy(keys);
stainfos = lcons(info, stainfos);
}
ReleaseSysCache(htup);
bms_free(keys);
}

View File

@@ -12,6 +12,6 @@ subdir = src/backend/statistics
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = extended_stats.o mvdistinct.o
OBJS = extended_stats.o dependencies.o mvdistinct.o
include $(top_srcdir)/src/backend/common.mk

View File

@@ -8,10 +8,72 @@ not true, resulting in estimation errors.
Extended statistics track different types of dependencies between the columns,
hopefully improving the estimates and producing better plans.
Currently we only have one type of extended statistics - ndistinct
coefficients, and we use it to improve estimates of grouping queries. See
README.ndistinct for details.
Types of statistics
-------------------
There are two kinds of extended statistics:
(a) ndistinct coefficients
(b) soft functional dependencies (README.dependencies)
Compatible clause types
-----------------------
Each type of statistics may be used to estimate some subset of clause types.
(a) functional dependencies - equality clauses (AND), possibly IS NULL
Currently, only OpExprs in the form Var op Const, or Const op Var are
supported, however it's feasible to expand the code later to also estimate the
selectivities on clauses such as Var op Var.
Complex clauses
---------------
We also support estimating more complex clauses - essentially AND/OR clauses
with (Var op Const) as leaves, as long as all the referenced attributes are
covered by a single statistics.
For example this condition
(a=1) AND ((b=2) OR ((c=3) AND (d=4)))
may be estimated using statistics on (a,b,c,d). If we only have statistics on
(b,c,d) we may estimate the second part, and estimate (a=1) using simple stats.
If we only have statistics on (a,b,c) we can't apply it at all at this point,
but it's worth pointing out clauselist_selectivity() works recursively and when
handling the second part (the OR-clause), we'll be able to apply the statistics.
Note: The multi-statistics estimation patch also makes it possible to pass some
clauses as 'conditions' into the deeper parts of the expression tree.
Selectivity estimation
----------------------
Throughout the planner clauselist_selectivity() still remains in charge of
most selectivity estimate requests. clauselist_selectivity() can be instructed
to try to make use of any extended statistics on the given RelOptInfo, which
it will do, if:
(a) An actual valid RelOptInfo was given. Join relations are passed in as
NULL, therefore are invalid.
(b) The relation given actually has any extended statistics defined which
are actually built.
When the above conditions are met, clauselist_selectivity() first attempts to
pass the clause list off to the extended statistics selectivity estimation
function. This functions may not find any clauses which is can perform any
estimations on. In such cases these clauses are simply ignored. When actual
estimation work is performed in these functions they're expected to mark which
clauses they've performed estimations for so that any other function
performing estimations knows which clauses are to be skipped.
Size of sample in ANALYZE
-------------------------

View File

@@ -0,0 +1,119 @@
Soft functional dependencies
============================
Functional dependencies are a concept well described in relational theory,
particularly in the definition of normalization and "normal forms". Wikipedia
has a nice definition of a functional dependency [1]:
In a given table, an attribute Y is said to have a functional dependency
on a set of attributes X (written X -> Y) if and only if each X value is
associated with precisely one Y value. For example, in an "Employee"
table that includes the attributes "Employee ID" and "Employee Date of
Birth", the functional dependency
{Employee ID} -> {Employee Date of Birth}
would hold. It follows from the previous two sentences that each
{Employee ID} is associated with precisely one {Employee Date of Birth}.
[1] https://en.wikipedia.org/wiki/Functional_dependency
In practical terms, functional dependencies mean that a value in one column
determines values in some other column. Consider for example this trivial
table with two integer columns:
CREATE TABLE t (a INT, b INT)
AS SELECT i, i/10 FROM generate_series(1,100000) s(i);
Clearly, knowledge of the value in column 'a' is sufficient to determine the
value in column 'b', as it's simply (a/10). A more practical example may be
addresses, where the knowledge of a ZIP code (usually) determines city. Larger
cities may have multiple ZIP codes, so the dependency can't be reversed.
Many datasets might be normalized not to contain such dependencies, but often
it's not practical for various reasons. In some cases, it's actually a conscious
design choice to model the dataset in a denormalized way, either because of
performance or to make querying easier.
Soft dependencies
-----------------
Real-world data sets often contain data errors, either because of data entry
mistakes (user mistyping the ZIP code) or perhaps issues in generating the
data (e.g. a ZIP code mistakenly assigned to two cities in different states).
A strict implementation would either ignore dependencies in such cases,
rendering the approach mostly useless even for slightly noisy data sets, or
result in sudden changes in behavior depending on minor differences between
samples provided to ANALYZE.
For this reason, the statistics implements "soft" functional dependencies,
associating each functional dependency with a degree of validity (a number
between 0 and 1). This degree is then used to combine selectivities in a
smooth manner.
Mining dependencies (ANALYZE)
-----------------------------
The current algorithm is fairly simple - generate all possible functional
dependencies, and for each one count the number of rows consistent with it.
Then use the fraction of rows (supporting/total) as the degree.
To count the rows consistent with the dependency (a => b):
(a) Sort the data lexicographically, i.e. first by 'a' then 'b'.
(b) For each group of rows with the same 'a' value, count the number of
distinct values in 'b'.
(c) If there's a single distinct value in 'b', the rows are consistent with
the functional dependency, otherwise they contradict it.
The algorithm also requires a minimum size of the group to consider it
consistent (currently 3 rows in the sample). Small groups make it less likely
to break the consistency.
Clause reduction (planner/optimizer)
------------------------------------
Applying the functional dependencies is fairly simple - given a list of
equality clauses, we compute selectivities of each clause and then use the
degree to combine them using this formula
P(a=?,b=?) = P(a=?) * (d + (1-d) * P(b=?))
Where 'd' is the degree of functional dependence (a=>b).
With more than two equality clauses, this process happens recursively. For
example for (a,b,c) we first use (a,b=>c) to break the computation into
P(a=?,b=?,c=?) = P(a=?,b=?) * (d + (1-d)*P(b=?))
and then apply (a=>b) the same way on P(a=?,b=?).
Consistency of clauses
----------------------
Functional dependencies only express general dependencies between columns,
without referencing particular values. This assumes that the equality clauses
are in fact consistent with the functional dependency, i.e. that given a
dependency (a=>b), the value in (b=?) clause is the value determined by (a=?).
If that's not the case, the clauses are "inconsistent" with the functional
dependency and the result will be over-estimation.
This may happen, for example, when using conditions on the ZIP code and city
name with mismatching values (ZIP code for a different city), etc. In such a
case, the result set will be empty, but we'll estimate the selectivity using
the ZIP code condition.
In this case, the default estimation based on AVIA principle happens to work
better, but mostly by chance.
This issue is the price for the simplicity of functional dependencies. If the
application frequently constructs queries with clauses inconsistent with
functional dependencies present in the data, the best solution is not to
use functional dependencies, but one of the more complex types of statistics.

File diff suppressed because it is too large Load Diff

View File

@@ -47,7 +47,7 @@ static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
int natts, VacAttrStats **vacattrstats);
static void statext_store(Relation pg_stext, Oid relid,
MVNDistinct *ndistinct,
MVNDistinct *ndistinct, MVDependencies *dependencies,
VacAttrStats **stats);
@@ -74,6 +74,7 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
{
StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
MVNDistinct *ndistinct = NULL;
MVDependencies *dependencies = NULL;
VacAttrStats **stats;
ListCell *lc2;
@@ -93,10 +94,13 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
if (t == STATS_EXT_NDISTINCT)
ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
stat->columns, stats);
else if (t == STATS_EXT_DEPENDENCIES)
dependencies = statext_dependencies_build(numrows, rows,
stat->columns, stats);
}
/* store the statistics in the catalog */
statext_store(pg_stext, stat->statOid, ndistinct, stats);
statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats);
}
heap_close(pg_stext, RowExclusiveLock);
@@ -117,6 +121,10 @@ statext_is_kind_built(HeapTuple htup, char type)
attnum = Anum_pg_statistic_ext_standistinct;
break;
case STATS_EXT_DEPENDENCIES:
attnum = Anum_pg_statistic_ext_stadependencies;
break;
default:
elog(ERROR, "unexpected statistics type requested: %d", type);
}
@@ -178,7 +186,8 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
enabled = (char *) ARR_DATA_PTR(arr);
for (i = 0; i < ARR_DIMS(arr)[0]; i++)
{
Assert(enabled[i] == STATS_EXT_NDISTINCT);
Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
(enabled[i] == STATS_EXT_DEPENDENCIES));
entry->types = lappend_int(entry->types, (int) enabled[i]);
}
@@ -256,7 +265,7 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, int natts,
*/
static void
statext_store(Relation pg_stext, Oid statOid,
MVNDistinct *ndistinct,
MVNDistinct *ndistinct, MVDependencies *dependencies,
VacAttrStats **stats)
{
HeapTuple stup,
@@ -280,8 +289,17 @@ statext_store(Relation pg_stext, Oid statOid,
values[Anum_pg_statistic_ext_standistinct - 1] = PointerGetDatum(data);
}
if (dependencies != NULL)
{
bytea *data = statext_dependencies_serialize(dependencies);
nulls[Anum_pg_statistic_ext_stadependencies - 1] = (data == NULL);
values[Anum_pg_statistic_ext_stadependencies - 1] = PointerGetDatum(data);
}
/* always replace the value (either by bytea or NULL) */
replaces[Anum_pg_statistic_ext_standistinct - 1] = true;
replaces[Anum_pg_statistic_ext_stadependencies - 1] = true;
/* there should already be a pg_statistic_ext tuple */
oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
@@ -387,3 +405,82 @@ multi_sort_compare_dims(int start, int end,
return 0;
}
/*
* has_stats_of_kind
* Check that the list contains statistic of a given kind
*/
bool
has_stats_of_kind(List *stats, char requiredkind)
{
ListCell *l;
foreach(l, stats)
{
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
if (stat->kind == requiredkind)
return true;
}
return false;
}
/*
* choose_best_statistics
* Look for statistics with the specified 'requiredkind' which have keys
* that match at least two attnums.
*
* The current selection criteria is very simple - we choose the statistics
* referencing the most attributes with the least keys.
*
* XXX if multiple statistics exists of the same size matching the same number
* of keys, then the statistics which are chosen depend on the order that they
* appear in the stats list. Perhaps this needs to be more definitive.
*/
StatisticExtInfo *
choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind)
{
ListCell *lc;
StatisticExtInfo *best_match = NULL;
int best_num_matched = 2; /* goal #1: maximize */
int best_match_keys = (STATS_MAX_DIMENSIONS + 1); /* goal #2: minimize */
foreach(lc, stats)
{
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
int num_matched;
int numkeys;
Bitmapset *matched;
/* skip statistics that are not the correct type */
if (info->kind != requiredkind)
continue;
/* determine how many attributes of these stats can be matched to */
matched = bms_intersect(attnums, info->keys);
num_matched = bms_num_members(matched);
bms_free(matched);
/*
* save the actual number of keys in the stats so that we can choose
* the narrowest stats with the most matching keys.
*/
numkeys = bms_num_members(info->keys);
/*
* Use these statistics when it increases the number of matched
* clauses or when it matches the same number of attributes but these
* stats have fewer keys than any previous match.
*/
if (num_matched > best_num_matched ||
(num_matched == best_num_matched && numkeys < best_match_keys))
{
best_match = info;
best_num_matched = num_matched;
best_match_keys = numkeys;
}
}
return best_match;
}

View File

@@ -1452,6 +1452,13 @@ pg_get_statisticsext_worker(Oid statextid, bool missing_ok)
StringInfoData buf;
int colno;
char *nsp;
ArrayType *arr;
char *enabled;
Datum datum;
bool isnull;
bool ndistinct_enabled;
bool dependencies_enabled;
int i;
statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
@@ -1467,10 +1474,55 @@ pg_get_statisticsext_worker(Oid statextid, bool missing_ok)
initStringInfo(&buf);
nsp = get_namespace_name(statextrec->stanamespace);
appendStringInfo(&buf, "CREATE STATISTICS %s ON (",
appendStringInfo(&buf, "CREATE STATISTICS %s",
quote_qualified_identifier(nsp,
NameStr(statextrec->staname)));
/*
* Lookup the staenabled column so that we know how to handle the WITH
* clause.
*/
datum = SysCacheGetAttr(STATEXTOID, statexttup,
Anum_pg_statistic_ext_staenabled, &isnull);
Assert(!isnull);
arr = DatumGetArrayTypeP(datum);
if (ARR_NDIM(arr) != 1 ||
ARR_HASNULL(arr) ||
ARR_ELEMTYPE(arr) != CHAROID)
elog(ERROR, "staenabled is not a 1-D char array");
enabled = (char *) ARR_DATA_PTR(arr);
ndistinct_enabled = false;
dependencies_enabled = false;
for (i = 0; i < ARR_DIMS(arr)[0]; i++)
{
if (enabled[i] == STATS_EXT_NDISTINCT)
ndistinct_enabled = true;
if (enabled[i] == STATS_EXT_DEPENDENCIES)
dependencies_enabled = true;
}
/*
* If any option is disabled, then we'll need to append a WITH clause to
* show which options are enabled. We omit the WITH clause on purpose
* when all options are enabled, so a pg_dump/pg_restore will create all
* statistics types on a newer postgres version, if the statistics had all
* options enabled on the original version.
*/
if (!ndistinct_enabled || !dependencies_enabled)
{
appendStringInfoString(&buf, " WITH (");
if (ndistinct_enabled)
appendStringInfoString(&buf, "ndistinct");
else if (dependencies_enabled)
appendStringInfoString(&buf, "dependencies");
appendStringInfoChar(&buf, ')');
}
appendStringInfoString(&buf, " ON (");
for (colno = 0; colno < statextrec->stakeys.dim1; colno++)
{
AttrNumber attnum = statextrec->stakeys.values[colno];

View File

@@ -1633,13 +1633,17 @@ booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
case IS_NOT_FALSE:
selec = (double) clause_selectivity(root, arg,
varRelid,
jointype, sjinfo);
jointype,
sjinfo,
NULL);
break;
case IS_FALSE:
case IS_NOT_TRUE:
selec = 1.0 - (double) clause_selectivity(root, arg,
varRelid,
jointype, sjinfo);
jointype,
sjinfo,
NULL);
break;
default:
elog(ERROR, "unrecognized booltesttype: %d",
@@ -6436,7 +6440,8 @@ genericcostestimate(PlannerInfo *root,
indexSelectivity = clauselist_selectivity(root, selectivityQuals,
index->rel->relid,
JOIN_INNER,
NULL);
NULL,
index->rel);
/*
* If caller didn't give us an estimate, estimate the number of index
@@ -6757,7 +6762,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
btreeSelectivity = clauselist_selectivity(root, selectivityQuals,
index->rel->relid,
JOIN_INNER,
NULL);
NULL,
index->rel);
numIndexTuples = btreeSelectivity * index->rel->tuples;
/*
@@ -7516,7 +7522,8 @@ gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
*indexSelectivity = clauselist_selectivity(root, selectivityQuals,
index->rel->relid,
JOIN_INNER,
NULL);
NULL,
index->rel);
/* fetch estimated page cost for tablespace containing index */
get_tablespace_page_costs(index->reltablespace,
@@ -7748,7 +7755,8 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
*indexSelectivity =
clauselist_selectivity(root, indexQuals,
path->indexinfo->rel->relid,
JOIN_INNER, NULL);
JOIN_INNER, NULL,
path->indexinfo->rel);
*indexCorrelation = 1;
/*