Collect and use multi-column dependency stats

Follow on patch in the multi-variate statistics patch series. CREATE STATISTICS s1 WITH (dependencies) ON (a, b) FROM t; ANALYZE; will collect dependency stats on (a, b) and then use the measured dependency in subsequent query planning. Commit 7b504eb282 added CREATE STATISTICS with n-distinct coefficients. These are now specified using the mutually exclusive option WITH (ndistinct). Author: Tomas Vondra, David Rowley Reviewed-by: Kyotaro HORIGUCHI, Álvaro Herrera, Dean Rasheed, Robert Haas and many other comments and contributions Discussion: https://postgr.es/m/56f40b20-c464-fad2-ff39-06b668fac47c@2ndquadrant.com
2025-08-28 18:48:04 +03:00 · 2017-04-05 18:00:42 -04:00
parent 00b6b6feb1
commit 2686ee1b7c
31 changed files with 2035 additions and 79 deletions
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -192,7 +192,8 @@ CREATE VIEW pg_stats_ext AS
        C.relname AS tablename,
        S.staname AS staname,
        S.stakeys AS attnums,
-        length(s.standistinct) AS ndistbytes
+        length(s.standistinct::bytea) AS ndistbytes,
+        length(S.stadependencies::bytea) AS depsbytes
    FROM (pg_statistic_ext S JOIN pg_class C ON (C.oid = S.starelid))
        LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);

--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -62,10 +62,11 @@ CreateStatistics(CreateStatsStmt *stmt)
 	Oid			relid;
 	ObjectAddress parentobject,
 				childobject;
-	Datum		types[1];		/* only ndistinct defined now */
+	Datum		types[2];		/* one for each possible type of statistics */
 	int			ntypes;
 	ArrayType  *staenabled;
 	bool		build_ndistinct;
+	bool		build_dependencies;
 	bool		requested_type = false;

 	Assert(IsA(stmt, CreateStatsStmt));
@@ -159,7 +160,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 				 errmsg("statistics require at least 2 columns")));

 	/*
-	 * Sort the attnums, which makes detecting duplicies somewhat easier, and
+	 * Sort the attnums, which makes detecting duplicates somewhat easier, and
 	 * it does not hurt (it does not affect the efficiency, unlike for
 	 * indexes, for example).
 	 */
@@ -182,6 +183,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	 * recognized.
 	 */
 	build_ndistinct = false;
+	build_dependencies = false;
 	foreach(l, stmt->options)
 	{
 		DefElem    *opt = (DefElem *) lfirst(l);
@@ -191,6 +193,11 @@ CreateStatistics(CreateStatsStmt *stmt)
 			build_ndistinct = defGetBoolean(opt);
 			requested_type = true;
 		}
+		else if (strcmp(opt->defname, "dependencies") == 0)
+		{
+			build_dependencies = defGetBoolean(opt);
+			requested_type = true;
+		}
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -199,12 +206,17 @@ CreateStatistics(CreateStatsStmt *stmt)
 	}
 	/* If no statistic type was specified, build them all. */
 	if (!requested_type)
+	{
 		build_ndistinct = true;
+		build_dependencies = true;
+	}

 	/* construct the char array of enabled statistic types */
 	ntypes = 0;
 	if (build_ndistinct)
 		types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT);
+	if (build_dependencies)
+		types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
 	Assert(ntypes > 0);
 	staenabled = construct_array(types, ntypes, CHAROID, 1, true, 'c');

@@ -222,6 +234,7 @@ CreateStatistics(CreateStatsStmt *stmt)

 	/* no statistics build yet */
 	nulls[Anum_pg_statistic_ext_standistinct - 1] = true;
+	nulls[Anum_pg_statistic_ext_stadependencies - 1] = true;

 	/* insert it into pg_statistic_ext */
 	statrel = heap_open(StatisticExtRelationId, RowExclusiveLock);
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -22,6 +22,7 @@
 #include "utils/fmgroids.h"
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
+#include "statistics/statistics.h"


 /*
@@ -60,23 +61,30 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
 * subclauses.  However, that's only right if the subclauses have independent
 * probabilities, and in reality they are often NOT independent.  So,
 * we want to be smarter where we can.
-
- * Currently, the only extra smarts we have is to recognize "range queries",
- * such as "x > 34 AND x < 42".  Clauses are recognized as possible range
- * query components if they are restriction opclauses whose operators have
- * scalarltsel() or scalargtsel() as their restriction selectivity estimator.
- * We pair up clauses of this form that refer to the same variable.  An
- * unpairable clause of this kind is simply multiplied into the selectivity
- * product in the normal way.  But when we find a pair, we know that the
- * selectivities represent the relative positions of the low and high bounds
- * within the column's range, so instead of figuring the selectivity as
- * hisel * losel, we can figure it as hisel + losel - 1.  (To visualize this,
- * see that hisel is the fraction of the range below the high bound, while
- * losel is the fraction above the low bound; so hisel can be interpreted
- * directly as a 0..1 value but we need to convert losel to 1-losel before
- * interpreting it as a value.  Then the available range is 1-losel to hisel.
- * However, this calculation double-excludes nulls, so really we need
- * hisel + losel + null_frac - 1.)
+ *
+ * When 'rel' is not null and rtekind = RTE_RELATION, we'll try to apply
+ * selectivity estimates using any extended statistics on 'rel'.
+ *
+ * If we identify such extended statistics exist, we try to apply them.
+ * Currently we only have (soft) functional dependencies, so apply these in as
+ * many cases as possible, and fall back on normal estimates for remaining
+ * clauses.
+ *
+ * We also recognize "range queries", such as "x > 34 AND x < 42".  Clauses
+ * are recognized as possible range query components if they are restriction
+ * opclauses whose operators have scalarltsel() or scalargtsel() as their
+ * restriction selectivity estimator.  We pair up clauses of this form that
+ * refer to the same variable.  An unpairable clause of this kind is simply
+ * multiplied into the selectivity product in the normal way.  But when we
+ * find a pair, we know that the selectivities represent the relative
+ * positions of the low and high bounds within the column's range, so instead
+ * of figuring the selectivity as hisel * losel, we can figure it as hisel +
+ * losel - 1.  (To visualize this, see that hisel is the fraction of the range
+ * below the high bound, while losel is the fraction above the low bound; so
+ * hisel can be interpreted directly as a 0..1 value but we need to convert
+ * losel to 1-losel before interpreting it as a value.  Then the available
+ * range is 1-losel to hisel.  However, this calculation double-excludes
+ * nulls, so really we need hisel + losel + null_frac - 1.)
 *
 * If either selectivity is exactly DEFAULT_INEQ_SEL, we forget this equation
 * and instead use DEFAULT_RANGE_INEQ_SEL.  The same applies if the equation
@@ -93,33 +101,70 @@ clauselist_selectivity(PlannerInfo *root,
 					   List *clauses,
 					   int varRelid,
 					   JoinType jointype,
-					   SpecialJoinInfo *sjinfo)
+					   SpecialJoinInfo *sjinfo,
+					   RelOptInfo *rel)
 {
 	Selectivity s1 = 1.0;
 	RangeQueryClause *rqlist = NULL;
 	ListCell   *l;
+	Bitmapset  *estimatedclauses = NULL;
+	int			listidx;

 	/*
-	 * If there's exactly one clause, then no use in trying to match up pairs,
-	 * so just go directly to clause_selectivity().
+	 * If there's exactly one clause, then extended statistics are futile at
+	 * this level (we might be able to apply them later if it's an AND/OR
+	 * clause). So just go directly to clause_selectivity().
 	 */
 	if (list_length(clauses) == 1)
 		return clause_selectivity(root, (Node *) linitial(clauses),
-								  varRelid, jointype, sjinfo);
+								  varRelid, jointype, sjinfo, rel);

 	/*
-	 * Initial scan over clauses.  Anything that doesn't look like a potential
-	 * rangequery clause gets multiplied into s1 and forgotten. Anything that
-	 * does gets inserted into an rqlist entry.
+	 * When a relation of RTE_RELATION is given as 'rel', we'll try to
+	 * perform selectivity estimation using extended statistics.
 	 */
+	if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
+	{
+		/*
+		 * Perform selectivity estimations on any clauses found applicable by
+		 * dependencies_clauselist_selectivity. The 0-based list position of
+		 * estimated clauses will be populated in 'estimatedclauses'.
+		 */
+		s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid,
+								   jointype, sjinfo, rel, &estimatedclauses);
+
+		/*
+		 * This would be the place to apply any other types of extended
+		 * statistics selectivity estimations for remaining clauses.
+		 */
+	}
+
+	/*
+	 * Apply normal selectivity estimates for remaining clauses. We'll be
+	 * careful to skip any clauses which were already estimated above.
+	 *
+	 * Anything that doesn't look like a potential rangequery clause gets
+	 * multiplied into s1 and forgotten. Anything that does gets inserted into
+	 * an rqlist entry.
+	 */
+	listidx = -1;
 	foreach(l, clauses)
 	{
 		Node	   *clause = (Node *) lfirst(l);
 		RestrictInfo *rinfo;
 		Selectivity s2;

+		listidx++;
+
+		/*
+		 * Skip this clause if it's already been estimated by some other
+		 * statistics above.
+		 */
+		if (bms_is_member(listidx, estimatedclauses))
+			continue;
+
 		/* Always compute the selectivity using clause_selectivity */
-		s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);
+		s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo, rel);

 		/*
 		 * Check for being passed a RestrictInfo.
@@ -484,7 +529,8 @@ clause_selectivity(PlannerInfo *root,
 				   Node *clause,
 				   int varRelid,
 				   JoinType jointype,
-				   SpecialJoinInfo *sjinfo)
+				   SpecialJoinInfo *sjinfo,
+				   RelOptInfo *rel)
 {
 	Selectivity s1 = 0.5;		/* default for any unhandled clause type */
 	RestrictInfo *rinfo = NULL;
@@ -604,7 +650,8 @@ clause_selectivity(PlannerInfo *root,
 								  (Node *) get_notclausearg((Expr *) clause),
 									  varRelid,
 									  jointype,
-									  sjinfo);
+									  sjinfo,
+									  rel);
 	}
 	else if (and_clause(clause))
 	{
@@ -613,7 +660,8 @@ clause_selectivity(PlannerInfo *root,
 									((BoolExpr *) clause)->args,
 									varRelid,
 									jointype,
-									sjinfo);
+									sjinfo,
+									rel);
 	}
 	else if (or_clause(clause))
 	{
@@ -632,7 +680,8 @@ clause_selectivity(PlannerInfo *root,
 												(Node *) lfirst(arg),
 												varRelid,
 												jointype,
-												sjinfo);
+												sjinfo,
+												rel);

 			s1 = s1 + s2 - s1 * s2;
 		}
@@ -725,7 +774,8 @@ clause_selectivity(PlannerInfo *root,
 								(Node *) ((RelabelType *) clause)->arg,
 								varRelid,
 								jointype,
-								sjinfo);
+								sjinfo,
+								rel);
 	}
 	else if (IsA(clause, CoerceToDomain))
 	{
@@ -734,7 +784,8 @@ clause_selectivity(PlannerInfo *root,
 								(Node *) ((CoerceToDomain *) clause)->arg,
 								varRelid,
 								jointype,
-								sjinfo);
+								sjinfo,
+								rel);
 	}
 	else
 	{
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -3750,7 +3750,8 @@ compute_semi_anti_join_factors(PlannerInfo *root,
 									joinquals,
 									0,
 									jointype,
-									sjinfo);
+									sjinfo,
+									NULL);

 	/*
 	 * Also get the normal inner-join selectivity of the join clauses.
@@ -3773,7 +3774,8 @@ compute_semi_anti_join_factors(PlannerInfo *root,
 									joinquals,
 									0,
 									JOIN_INNER,
-									&norm_sjinfo);
+									&norm_sjinfo,
+									NULL);

 	/* Avoid leaking a lot of ListCells */
 	if (jointype == JOIN_ANTI)
@@ -3940,7 +3942,7 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
 		Node	   *qual = (Node *) lfirst(l);

 		/* Note that clause_selectivity will be able to cache its result */
-		selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
+		selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo, NULL);
 	}

 	/* Apply it to the input relation sizes */
@@ -3976,7 +3978,8 @@ set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
 							   rel->baserestrictinfo,
 							   0,
 							   JOIN_INNER,
-							   NULL);
+							   NULL,
+							   rel);

 	rel->rows = clamp_row_est(nrows);

@@ -4013,7 +4016,8 @@ get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
 							   allclauses,
 							   rel->relid,		/* do not use 0! */
 							   JOIN_INNER,
-							   NULL);
+							   NULL,
+							   rel);
 	nrows = clamp_row_est(nrows);
 	/* For safety, make sure result is not more than the base estimate */
 	if (nrows > rel->rows)
@@ -4179,12 +4183,14 @@ calc_joinrel_size_estimate(PlannerInfo *root,
 										joinquals,
 										0,
 										jointype,
-										sjinfo);
+										sjinfo,
+										NULL);
 		pselec = clauselist_selectivity(root,
 										pushedquals,
 										0,
 										jointype,
-										sjinfo);
+										sjinfo,
+										NULL);

 		/* Avoid leaking a lot of ListCells */
 		list_free(joinquals);
@@ -4196,7 +4202,8 @@ calc_joinrel_size_estimate(PlannerInfo *root,
 										restrictlist,
 										0,
 										jointype,
-										sjinfo);
+										sjinfo,
+										NULL);
 		pselec = 0.0;			/* not used, keep compiler quiet */
 	}

@@ -4491,7 +4498,7 @@ get_foreign_key_join_selectivity(PlannerInfo *root,
 				Selectivity csel;

 				csel = clause_selectivity(root, (Node *) rinfo,
-										  0, jointype, sjinfo);
+										  0, jointype, sjinfo, NULL);
 				thisfksel = Min(thisfksel, csel);
 			}
 			fkselec *= thisfksel;
--- a/src/backend/optimizer/util/orclauses.c
+++ b/src/backend/optimizer/util/orclauses.c
@@ -280,7 +280,7 @@ consider_new_or_clause(PlannerInfo *root, RelOptInfo *rel,
 	 * saving work later.)
 	 */
 	or_selec = clause_selectivity(root, (Node *) or_rinfo,
-								  0, JOIN_INNER, NULL);
+								  0, JOIN_INNER, NULL, rel);

 	/*
 	 * The clause is only worth adding to the query if it rejects a useful
@@ -344,7 +344,7 @@ consider_new_or_clause(PlannerInfo *root, RelOptInfo *rel,

 		/* Compute inner-join size */
 		orig_selec = clause_selectivity(root, (Node *) join_or_rinfo,
-										0, JOIN_INNER, &sjinfo);
+										0, JOIN_INNER, &sjinfo, NULL);

 		/* And hack cached selectivity so join size remains the same */
 		join_or_rinfo->norm_selec = orig_selec / or_selec;
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -1308,6 +1308,18 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
 			stainfos = lcons(info, stainfos);
 		}

+		if (statext_is_kind_built(htup, STATS_EXT_DEPENDENCIES))
+		{
+			StatisticExtInfo *info = makeNode(StatisticExtInfo);
+
+			info->statOid = statOid;
+			info->rel = rel;
+			info->kind = STATS_EXT_DEPENDENCIES;
+			info->keys = bms_copy(keys);
+
+			stainfos = lcons(info, stainfos);
+		}
+
 		ReleaseSysCache(htup);
 		bms_free(keys);
 	}
--- a/src/backend/statistics/Makefile
+++ b/src/backend/statistics/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/statistics
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global

-OBJS = extended_stats.o mvdistinct.o
+OBJS = extended_stats.o dependencies.o mvdistinct.o

 include $(top_srcdir)/src/backend/common.mk
--- a/src/backend/statistics/README
+++ b/src/backend/statistics/README
@@ -8,10 +8,72 @@ not true, resulting in estimation errors.
 Extended statistics track different types of dependencies between the columns,
 hopefully improving the estimates and producing better plans.

-Currently we only have one type of extended statistics - ndistinct
-coefficients, and we use it to improve estimates of grouping queries. See
-README.ndistinct for details.

+Types of statistics
+-------------------
+
+There are two kinds of extended statistics:
+
+    (a) ndistinct coefficients
+
+    (b) soft functional dependencies (README.dependencies)
+
+
+Compatible clause types
+-----------------------
+
+Each type of statistics may be used to estimate some subset of clause types.
+
+    (a) functional dependencies - equality clauses (AND), possibly IS NULL
+
+Currently, only OpExprs in the form Var op Const, or Const op Var are
+supported, however it's feasible to expand the code later to also estimate the
+selectivities on clauses such as Var op Var.
+
+
+Complex clauses
+---------------
+
+We also support estimating more complex clauses - essentially AND/OR clauses
+with (Var op Const) as leaves, as long as all the referenced attributes are
+covered by a single statistics.
+
+For example this condition
+
+    (a=1) AND ((b=2) OR ((c=3) AND (d=4)))
+
+may be estimated using statistics on (a,b,c,d). If we only have statistics on
+(b,c,d) we may estimate the second part, and estimate (a=1) using simple stats.
+
+If we only have statistics on (a,b,c) we can't apply it at all at this point,
+but it's worth pointing out clauselist_selectivity() works recursively and when
+handling the second part (the OR-clause), we'll be able to apply the statistics.
+
+Note: The multi-statistics estimation patch also makes it possible to pass some
+clauses as 'conditions' into the deeper parts of the expression tree.
+
+
+Selectivity estimation
+----------------------
+
+Throughout the planner clauselist_selectivity() still remains in charge of
+most selectivity estimate requests. clauselist_selectivity() can be instructed
+to try to make use of any extended statistics on the given RelOptInfo, which
+it will do, if:
+
+    (a) An actual valid RelOptInfo was given. Join relations are passed in as
+        NULL, therefore are invalid.
+
+    (b) The relation given actually has any extended statistics defined which
+        are actually built.
+
+When the above conditions are met, clauselist_selectivity() first attempts to
+pass the clause list off to the extended statistics selectivity estimation
+function. This function may not find any clauses which it can perform any
+estimations on. In such cases these clauses are simply ignored. When actual
+estimation work is performed in these functions they're expected to mark which
+clauses they've performed estimations for so that any other function
+performing estimations knows which clauses are to be skipped.

 Size of sample in ANALYZE
 -------------------------
--- a/src/backend/statistics/README.dependencies
+++ b/src/backend/statistics/README.dependencies
@@ -0,0 +1,119 @@
+Soft functional dependencies
+============================
+
+Functional dependencies are a concept well described in relational theory,
+particularly in the definition of normalization and "normal forms". Wikipedia
+has a nice definition of a functional dependency [1]:
+
+    In a given table, an attribute Y is said to have a functional dependency
+    on a set of attributes X (written X -> Y) if and only if each X value is
+    associated with precisely one Y value. For example, in an "Employee"
+    table that includes the attributes "Employee ID" and "Employee Date of
+    Birth", the functional dependency
+
+        {Employee ID} -> {Employee Date of Birth}
+
+    would hold. It follows from the previous two sentences that each
+    {Employee ID} is associated with precisely one {Employee Date of Birth}.
+
+    [1] https://en.wikipedia.org/wiki/Functional_dependency
+
+In practical terms, functional dependencies mean that a value in one column
+determines values in some other column. Consider for example this trivial
+table with two integer columns:
+
+    CREATE TABLE t (a INT, b INT)
+        AS SELECT i, i/10 FROM generate_series(1,100000) s(i);
+
+Clearly, knowledge of the value in column 'a' is sufficient to determine the
+value in column 'b', as it's simply (a/10). A more practical example may be
+addresses, where the knowledge of a ZIP code (usually) determines city. Larger
+cities may have multiple ZIP codes, so the dependency can't be reversed.
+
+Many datasets might be normalized not to contain such dependencies, but often
+it's not practical for various reasons. In some cases, it's actually a conscious
+design choice to model the dataset in a denormalized way, either because of
+performance or to make querying easier.
+
+
+Soft dependencies
+-----------------
+
+Real-world data sets often contain data errors, either because of data entry
+mistakes (user mistyping the ZIP code) or perhaps issues in generating the
+data (e.g. a ZIP code mistakenly assigned to two cities in different states).
+
+A strict implementation would either ignore dependencies in such cases,
+rendering the approach mostly useless even for slightly noisy data sets, or
+result in sudden changes in behavior depending on minor differences between
+samples provided to ANALYZE.
+
+For this reason, the statistics implements "soft" functional dependencies,
+associating each functional dependency with a degree of validity (a number
+between 0 and 1). This degree is then used to combine selectivities in a
+smooth manner.
+
+
+Mining dependencies (ANALYZE)
+-----------------------------
+
+The current algorithm is fairly simple - generate all possible functional
+dependencies, and for each one count the number of rows consistent with it.
+Then use the fraction of rows (supporting/total) as the degree.
+
+To count the rows consistent with the dependency (a => b):
+
+ (a) Sort the data lexicographically, i.e. first by 'a' then 'b'.
+
+ (b) For each group of rows with the same 'a' value, count the number of
+     distinct values in 'b'.
+
+ (c) If there's a single distinct value in 'b', the rows are consistent with
+     the functional dependency, otherwise they contradict it.
+
+The algorithm also requires a minimum size of the group to consider it
+consistent (currently 3 rows in the sample). Small groups make it less likely
+to break the consistency.
+
+
+Clause reduction (planner/optimizer)
+------------------------------------
+
+Applying the functional dependencies is fairly simple - given a list of
+equality clauses, we compute selectivities of each clause and then use the
+degree to combine them using this formula
+
+    P(a=?,b=?) = P(a=?) * (d + (1-d) * P(b=?))
+
+Where 'd' is the degree of functional dependence (a=>b).
+
+With more than two equality clauses, this process happens recursively. For
+example for (a,b,c) we first use (a,b=>c) to break the computation into
+
+    P(a=?,b=?,c=?) = P(a=?,b=?) * (d + (1-d)*P(c=?))
+
+and then apply (a=>b) the same way on P(a=?,b=?).
+
+
+Consistency of clauses
+----------------------
+
+Functional dependencies only express general dependencies between columns,
+without referencing particular values. This assumes that the equality clauses
+are in fact consistent with the functional dependency, i.e. that given a
+dependency (a=>b), the value in (b=?) clause is the value determined by (a=?).
+If that's not the case, the clauses are "inconsistent" with the functional
+dependency and the result will be over-estimation.
+
+This may happen, for example, when using conditions on the ZIP code and city
+name with mismatching values (ZIP code for a different city), etc. In such a
+case, the result set will be empty, but we'll estimate the selectivity using
+the ZIP code condition.
+
+In this case, the default estimation based on AVIA principle happens to work
+better, but mostly by chance.
+
+This issue is the price for the simplicity of functional dependencies. If the
+application frequently constructs queries with clauses inconsistent with
+functional dependencies present in the data, the best solution is not to
+use functional dependencies, but one of the more complex types of statistics.
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -47,7 +47,7 @@ static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
 static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
 					  int natts, VacAttrStats **vacattrstats);
 static void statext_store(Relation pg_stext, Oid relid,
-			  MVNDistinct *ndistinct,
+			  MVNDistinct *ndistinct, MVDependencies *dependencies,
 			  VacAttrStats **stats);


@@ -74,6 +74,7 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
 	{
 		StatExtEntry   *stat = (StatExtEntry *) lfirst(lc);
 		MVNDistinct	   *ndistinct = NULL;
+		MVDependencies *dependencies = NULL;
 		VacAttrStats  **stats;
 		ListCell	   *lc2;

@@ -93,10 +94,13 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
 			if (t == STATS_EXT_NDISTINCT)
 				ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
 													stat->columns, stats);
+			else if (t == STATS_EXT_DEPENDENCIES)
+				dependencies = statext_dependencies_build(numrows, rows,
+													   stat->columns, stats);
 		}

 		/* store the statistics in the catalog */
-		statext_store(pg_stext, stat->statOid, ndistinct, stats);
+		statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats);
 	}

 	heap_close(pg_stext, RowExclusiveLock);
@@ -117,6 +121,10 @@ statext_is_kind_built(HeapTuple htup, char type)
 			attnum = Anum_pg_statistic_ext_standistinct;
 			break;

+		case STATS_EXT_DEPENDENCIES:
+			attnum = Anum_pg_statistic_ext_stadependencies;
+			break;
+
 		default:
 			elog(ERROR, "unexpected statistics type requested: %d", type);
 	}
@@ -178,7 +186,8 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
 		enabled = (char *) ARR_DATA_PTR(arr);
 		for (i = 0; i < ARR_DIMS(arr)[0]; i++)
 		{
-			Assert(enabled[i] == STATS_EXT_NDISTINCT);
+			Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
+				   (enabled[i] == STATS_EXT_DEPENDENCIES));
 			entry->types = lappend_int(entry->types, (int) enabled[i]);
 		}

@@ -256,7 +265,7 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, int natts,
 */
 static void
 statext_store(Relation pg_stext, Oid statOid,
-			  MVNDistinct *ndistinct,
+			  MVNDistinct *ndistinct, MVDependencies *dependencies,
 			  VacAttrStats **stats)
 {
 	HeapTuple	stup,
@@ -280,8 +289,17 @@ statext_store(Relation pg_stext, Oid statOid,
 		values[Anum_pg_statistic_ext_standistinct - 1] = PointerGetDatum(data);
 	}

+	if (dependencies != NULL)
+	{
+		bytea	   *data = statext_dependencies_serialize(dependencies);
+
+		nulls[Anum_pg_statistic_ext_stadependencies - 1] = (data == NULL);
+		values[Anum_pg_statistic_ext_stadependencies - 1] = PointerGetDatum(data);
+	}
+
 	/* always replace the value (either by bytea or NULL) */
 	replaces[Anum_pg_statistic_ext_standistinct - 1] = true;
+	replaces[Anum_pg_statistic_ext_stadependencies - 1] = true;

 	/* there should already be a pg_statistic_ext tuple */
 	oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
@@ -387,3 +405,82 @@ multi_sort_compare_dims(int start, int end,

 	return 0;
 }
+
+/*
+ * has_stats_of_kind
+ *	Return true if any entry in 'stats' has kind equal to 'requiredkind'
+ */
+bool
+has_stats_of_kind(List *stats, char requiredkind)
+{
+	ListCell   *l;
+
+	foreach(l, stats)
+	{
+		StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
+
+		if (stat->kind == requiredkind)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * choose_best_statistics
+ *		Look for statistics with the specified 'requiredkind' which have keys
+ *		that match at least two of 'attnums'; returns NULL if none qualify.
+ *
+ * The current selection criterion is very simple - we choose the statistics
+ * matching the most attributes, breaking ties in favor of the fewest keys.
+ *
+ * XXX if multiple statistics exist of the same size matching the same number
+ * of keys, then the first such entry in the stats list is chosen, as a later
+ * tie fails the strict "<" test. Perhaps this needs to be more definitive.
+ */
+StatisticExtInfo *
+choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind)
+{
+	ListCell   *lc;
+	StatisticExtInfo *best_match = NULL;
+	int			best_num_matched = 2;	/* goal #1: maximize */
+	int			best_match_keys = (STATS_MAX_DIMENSIONS + 1);	/* goal #2: minimize */
+
+	foreach(lc, stats)
+	{
+		StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+		int			num_matched;
+		int			numkeys;
+		Bitmapset  *matched;
+
+		/* skip statistics that are not of the requested kind */
+		if (info->kind != requiredkind)
+			continue;
+
+		/* count how many of the requested attnums these statistics cover */
+		matched = bms_intersect(attnums, info->keys);
+		num_matched = bms_num_members(matched);
+		bms_free(matched);
+
+		/*
+		 * save the actual number of keys in the stats so that we can choose
+		 * the narrowest stats with the most matching keys.
+		 */
+		numkeys = bms_num_members(info->keys);
+
+		/*
+		 * Use these statistics when they match more attributes than the best
+		 * so far, or as many with fewer keys.  A two-attribute match can only
+		 * pass via the tie-break; the first needs numkeys <= STATS_MAX_DIMENSIONS.
+		 */
+		if (num_matched > best_num_matched ||
+			(num_matched == best_num_matched && numkeys < best_match_keys))
+		{
+			best_match = info;
+			best_num_matched = num_matched;
+			best_match_keys = numkeys;
+		}
+	}
+
+	return best_match;
+}
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -1452,6 +1452,13 @@ pg_get_statisticsext_worker(Oid statextid, bool missing_ok)
 	StringInfoData buf;
 	int			colno;
 	char	   *nsp;
+	ArrayType  *arr;
+	char	   *enabled;
+	Datum		datum;
+	bool		isnull;
+	bool		ndistinct_enabled;
+	bool		dependencies_enabled;
+	int			i;

 	statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));

@@ -1467,10 +1474,55 @@ pg_get_statisticsext_worker(Oid statextid, bool missing_ok)
 	initStringInfo(&buf);

 	nsp = get_namespace_name(statextrec->stanamespace);
-	appendStringInfo(&buf, "CREATE STATISTICS %s ON (",
+	appendStringInfo(&buf, "CREATE STATISTICS %s",
 					 quote_qualified_identifier(nsp,
 												NameStr(statextrec->staname)));

+	/*
+	 * Lookup the staenabled column so that we know how to handle the WITH
+	 * clause.
+	 */
+	datum = SysCacheGetAttr(STATEXTOID, statexttup,
+							Anum_pg_statistic_ext_staenabled, &isnull);
+	Assert(!isnull);
+	arr = DatumGetArrayTypeP(datum);
+	if (ARR_NDIM(arr) != 1 ||
+		ARR_HASNULL(arr) ||
+		ARR_ELEMTYPE(arr) != CHAROID)
+		elog(ERROR, "staenabled is not a 1-D char array");
+	enabled = (char *) ARR_DATA_PTR(arr);
+
+	ndistinct_enabled = false;
+	dependencies_enabled = false;
+
+	for (i = 0; i < ARR_DIMS(arr)[0]; i++)
+	{
+		if (enabled[i] == STATS_EXT_NDISTINCT)
+			ndistinct_enabled = true;
+		if (enabled[i] == STATS_EXT_DEPENDENCIES)
+			dependencies_enabled = true;
+	}
+
+	/*
+	 * If any option is disabled, then we'll need to append a WITH clause to
+	 * show which options are enabled.  We omit the WITH clause on purpose
+	 * when all options are enabled, so a pg_dump/pg_restore will create all
+	 * statistics types on a newer postgres version, if the statistics had all
+	 * options enabled on the original version.
+	 */
+	if (!ndistinct_enabled || !dependencies_enabled)
+	{
+		appendStringInfoString(&buf, " WITH (");
+		if (ndistinct_enabled)
+			appendStringInfoString(&buf, "ndistinct");
+		else if (dependencies_enabled)
+			appendStringInfoString(&buf, "dependencies");
+
+		appendStringInfoChar(&buf, ')');
+	}
+
+	appendStringInfoString(&buf, " ON (");
+
 	for (colno = 0; colno < statextrec->stakeys.dim1; colno++)
 	{
 		AttrNumber	attnum = statextrec->stakeys.values[colno];
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -1633,13 +1633,17 @@ booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
 			case IS_NOT_FALSE:
 				selec = (double) clause_selectivity(root, arg,
 													varRelid,
-													jointype, sjinfo);
+													jointype,
+													sjinfo,
+													NULL);
 				break;
 			case IS_FALSE:
 			case IS_NOT_TRUE:
 				selec = 1.0 - (double) clause_selectivity(root, arg,
 														  varRelid,
-														  jointype, sjinfo);
+														  jointype,
+														  sjinfo,
+														  NULL);
 				break;
 			default:
 				elog(ERROR, "unrecognized booltesttype: %d",
@@ -6436,7 +6440,8 @@ genericcostestimate(PlannerInfo *root,
 	indexSelectivity = clauselist_selectivity(root, selectivityQuals,
 											  index->rel->relid,
 											  JOIN_INNER,
-											  NULL);
+											  NULL,
+											  index->rel);

 	/*
 	 * If caller didn't give us an estimate, estimate the number of index
@@ -6757,7 +6762,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 		btreeSelectivity = clauselist_selectivity(root, selectivityQuals,
 												  index->rel->relid,
 												  JOIN_INNER,
-												  NULL);
+												  NULL,
+												  index->rel);
 		numIndexTuples = btreeSelectivity * index->rel->tuples;

 		/*
@@ -7516,7 +7522,8 @@ gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	*indexSelectivity = clauselist_selectivity(root, selectivityQuals,
 											   index->rel->relid,
 											   JOIN_INNER,
-											   NULL);
+											   NULL,
+											   index->rel);

 	/* fetch estimated page cost for tablespace containing index */
 	get_tablespace_page_costs(index->reltablespace,
@@ -7748,7 +7755,8 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	*indexSelectivity =
 		clauselist_selectivity(root, indexQuals,
 							   path->indexinfo->rel->relid,
-							   JOIN_INNER, NULL);
+							   JOIN_INNER, NULL,
+							   path->indexinfo->rel);
 	*indexCorrelation = 1;

 	/*