diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index aa5e705e574..b2fae027f5f 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -221,13 +221,13 @@
- pg_pltemplate
- template data for procedural languages
+ pg_partitioned_table
+ information about partition key of tables
- pg_partitioned_table
- information about partition key of tables
+ pg_pltemplate
+ template data for procedural languages
@@ -4271,108 +4271,6 @@ SCRAM-SHA-256$<iteration count>>:<salt><
-
- pg_statistic_ext
-
-
- pg_statistic_ext
-
-
-
- The catalog pg_statistic_ext
- holds extended planner statistics.
-
-
-
- pg_statistic_ext> Columns
-
-
-
-
- Name
- Type
- References
- Description
-
-
-
-
-
-
- stxrelid
- oid
- pg_class.oid
- The table that the described columns belongs to
-
-
-
- stxname
- name
-
- Name of the statistic.
-
-
-
- stxnamespace
- oid
- pg_namespace.oid
-
- The OID of the namespace that contains this statistic
-
-
-
-
- stxowner
- oid
- pg_authid.oid
- Owner of the statistic
-
-
-
- stxkeys
- int2vector
- pg_attribute.attnum
-
- This is an array of values that indicate which table columns this
- statistic covers. For example a value of 1 3 would
- mean that the first and the third table columns make up the statistic key.
-
-
-
-
- stxkind
- char[]
-
-
- An array with the modes of the enabled statistic types. Valid values
- are:
- d for ndistinct coefficients,
- f for functional dependencies.
-
-
-
-
- stxndistinct
- pg_ndistinct
-
-
- N-distinct coefficients, serialized as pg_ndistinct> type.
-
-
-
-
- stxdependencies
- pg_dependencies
-
-
- Functional dependencies, serialized as pg_dependencies> type.
-
-
-
-
-
-
- pg_namespace
@@ -4790,6 +4688,111 @@ SCRAM-SHA-256$<iteration count>>:<salt><
+
+ pg_partitioned_table
+
+
+ pg_partitioned_table
+
+
+
+ The catalog pg_partitioned_table stores
+ information about how tables are partitioned.
+
+
+
+ pg_partitioned_table> Columns
+
+
+
+
+ Name
+ Type
+ References
+ Description
+
+
+
+
+
+
+ partrelid
+ oid
+ pg_class.oid
+ The OID of the pg_class> entry for this partitioned table
+
+
+
+ partstrat
+ char
+
+
+ Partitioning strategy; l> = list partitioned table,
+ r> = range partitioned table
+
+
+
+
+ partnatts
+ int2
+
+ The number of columns in the partition key
+
+
+
+ partattrs
+ int2vector
+ pg_attribute.attnum
+
+ This is an array of partnatts values that
+ indicate which table columns are part of the partition key. For
+ example, a value of 1 3 would mean that the first
+ and the third table columns make up the partition key. A zero in this
+ array indicates that the corresponding partition key column is an
+ expression, rather than a simple column reference.
+
+
+
+
+ partclass
+ oidvector
+ pg_opclass.oid
+
+ For each column in the partition key, this contains the OID of the
+ operator class to use. See
+ pg_opclass for details.
+
+
+
+
+ partcollation
+ oidvector
+ pg_collation.oid
+
+ For each column in the partition key, this contains the OID of the
+ collation to use for partitioning.
+
+
+
+
+ partexprs
+ pg_node_tree
+
+
+ Expression trees (in nodeToString()
+ representation) for partition key columns that are not simple column
+ references. This is a list with one element for each zero
+ entry in partattrs>. Null if all partition key columns
+ are simple references.
+
+
+
+
+
+
+
+
+
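+
+ As a quick illustration (a minimal sketch; the table name
+ measurement is purely hypothetical), the entry for a
+ range-partitioned table can be inspected like this:
+
+CREATE TABLE measurement (logdate date, peaktemp int) PARTITION BY RANGE (logdate);
+
+SELECT partstrat, partnatts, partattrs
+  FROM pg_partitioned_table
+ WHERE partrelid = 'measurement'::regclass;
+
+ partstrat | partnatts | partattrs
+-----------+-----------+-----------
+ r         |         1 | 1
+(1 row)
+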
pg_pltemplate
@@ -4896,109 +4899,6 @@ SCRAM-SHA-256$<iteration count>>:<salt><
-
- pg_partitioned_table
-
-
- pg_partitioned_table
-
-
-
- The catalog pg_partitioned_table stores
- information about how tables are partitioned.
-
-
-
- pg_partitioned_table> Columns
-
-
-
-
- Name
- Type
- References
- Description
-
-
-
-
-
-
- partrelid
- oid
- pg_class.oid
- The OID of the pg_class> entry for this partitioned table
-
-
-
- partstrat
- char
-
-
- Partitioning strategy; l> = list partitioned table,
- r> = range partitioned table
-
-
-
-
- partnatts
- int2
-
- The number of columns in partition key
-
-
-
- partattrs
- int2vector
- pg_attribute.attnum
-
- This is an array of partnatts values that
- indicate which table columns are part of the partition key. For
- example, a value of 1 3 would mean that the first
- and the third table columns make up the partition key. A zero in this
- array indicates that the corresponding partition key column is an
- expression, rather than a simple column reference.
-
-
-
-
- partclass
- oidvector
- pg_opclass.oid
-
- For each column in the partition key, this contains the OID of the
- operator class to use. See
- pg_opclass for details.
-
-
-
-
- partcollation
- oidvector
- pg_opclass.oid
-
- For each column in the partition key, this contains the OID of the
- the collation to use for partitioning.
-
-
-
-
- partexprs
- pg_node_tree
-
-
- Expression trees (in nodeToString()
- representation) for partition key columns that are not simple column
- references. This is a list with one element for each zero
- entry in partattrs>. Null if all partition key columns
- are simple references.
-
-
-
-
-
-
- pg_policy
@@ -6466,6 +6366,120 @@ SCRAM-SHA-256$<iteration count>>:<salt><
+
+ pg_statistic_ext
+
+
+ pg_statistic_ext
+
+
+
+ The catalog pg_statistic_ext
+ holds extended planner statistics.
+ Each row in this catalog corresponds to a statistics object>
+ created with CREATE STATISTICS.
+
+
+
+ pg_statistic_ext> Columns
+
+
+
+
+ Name
+ Type
+ References
+ Description
+
+
+
+
+
+
+ stxrelid
+ oid
+ pg_class.oid
+ Table containing the columns described by this object
+
+
+
+ stxname
+ name
+
+ Name of the statistics object
+
+
+
+ stxnamespace
+ oid
+ pg_namespace.oid
+
+ The OID of the namespace that contains this statistics object
+
+
+
+
+ stxowner
+ oid
+ pg_authid.oid
+ Owner of the statistics object
+
+
+
+ stxkeys
+ int2vector
+ pg_attribute.attnum
+
+ An array of attribute numbers, indicating which table columns are
+ covered by this statistics object;
+ for example a value of 1 3 would
+ mean that the first and the third table columns are covered
+
+
+
+
+ stxkind
+ char[]
+
+
+ An array containing codes for the enabled statistic types;
+ valid values are:
+ d for n-distinct statistics,
+ f for functional dependency statistics
+
+
+
+
+ stxndistinct
+ pg_ndistinct
+
+
+ N-distinct counts, serialized as pg_ndistinct> type
+
+
+
+
+ stxdependencies
+ pg_dependencies
+
+
+ Functional dependency statistics, serialized
+ as pg_dependencies> type
+
+
+
+
+
+
+
+
+ The stxkind field is filled at creation of the
+ statistics object, indicating which statistic type(s) are desired.
+ The fields after it are initially NULL and are filled only when the
+ corresponding statistic has been computed by ANALYZE>.
+
+
+
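+
+ For example (a minimal sketch reusing the zipcodes table and
+ statistics object name from the examples elsewhere in this documentation),
+ the freshly created row can be inspected before ANALYZE> has run:
+
+CREATE STATISTICS stts (dependencies) ON zip, city FROM zipcodes;
+
+-- stxkind is filled immediately; stxdependencies stays NULL until ANALYZE
+SELECT stxname, stxkind, stxdependencies IS NULL AS not_analyzed
+  FROM pg_statistic_ext
+ WHERE stxname = 'stts';
+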
pg_subscription
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 32e17ee5f8e..b4b8f8dcb8f 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1071,25 +1071,41 @@ WHERE tablename = 'road';
are independent of each other,
an assumption that does not hold when column values are correlated.
Regular statistics, because of their per-individual-column nature,
- do not capture the knowledge of cross-column correlation;
- multivariate statistics can be used to instruct
- the server to obtain statistics across such a set of columns,
- which are later used by the query optimizer
- to determine cardinality and selectivity
- of clauses involving those columns.
- Multivariate statistics are currently the only use of
- extended statistics.
+ cannot capture any knowledge about cross-column correlation.
+ However, PostgreSQL> has the ability to compute
+ multivariate statistics, which can capture
+ such information.
- Extended statistics are created using
+ Because the number of possible column combinations is very large,
+ it's impractical to compute multivariate statistics automatically.
+ Instead, extended statistics objects, more often
+ called just statistics objects>, can be created to instruct
+ the server to obtain statistics across interesting sets of columns.
+
+
+
+ Statistics objects are created using CREATE STATISTICS
, which see for more details.
- Data collection is deferred until the next ANALYZE
- on the table, after which the stored values can be examined in the
+ Creation of such an object merely creates a catalog entry expressing
+ interest in the statistics. Actual data collection is performed
+ by ANALYZE (either a manual command, or background
+ auto-analyze). The collected values can be examined in the
pg_statistic_ext
catalog.
+
+ ANALYZE computes extended statistics based on the same
+ sample of table rows that it takes for computing regular single-column
+ statistics. Since the sample size is increased by increasing the
+ statistics target for the table or any of its columns (as described in
+ the previous section), a larger statistics target will normally result in
+ more accurate extended statistics, as well as more time spent calculating
+ them.
+
+
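+
+ As a sketch (column and table names taken from the examples below),
+ extended statistics on the zipcodes table can be made more
+ accurate by raising the statistics target of one of its columns and
+ re-analyzing:
+
+-- enlarging the per-column target also enlarges the row sample that
+-- ANALYZE uses to compute the table's extended statistics
+ALTER TABLE zipcodes ALTER COLUMN zip SET STATISTICS 1000;
+ANALYZE zipcodes;
+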
The following subsections describe the types of extended statistics
that are currently supported.
@@ -1099,142 +1115,162 @@ WHERE tablename = 'road';
Functional Dependencies
- The simplest type of extended statistics are functional dependencies,
- a concept used in definitions of database normal forms.
- Put simply, it is said that column b> is functionally
- dependent on column a> if knowledge of the value of
- a> is sufficient to determine the value of b>.
- In normalized databases, functional dependencies are allowed only on
- primary keys and superkeys. However, many data sets are in practice not
- fully normalized for various reasons; intentional denormalization for
- performance reasons is a common example.
+ The simplest type of extended statistics tracks functional
+ dependencies>, a concept used in definitions of database normal forms.
+ We say that column b> is functionally dependent on
+ column a> if knowledge of the value of
+ a> is sufficient to determine the value
+ of b>, that is, there are no two rows having the same value
+ of a> but different values of b>.
+ In a fully normalized database, functional dependencies should exist
+ only on primary keys and superkeys. However, in practice many data sets
+ are not fully normalized for various reasons; intentional
+ denormalization for performance reasons is a common example.
+ Even in a fully normalized database, there may be partial correlation
+ between some columns, which can be expressed as partial functional
+ dependency.
- The existance of functional dependencies directly affects the accuracy
- of estimates in certain queries.
- The reason is that conditions on the dependent columns do not
- restrict the result set, but the query planner (lacking functional
- dependency knowledge) considers them independent, resulting in
- underestimates.
- To inform the planner about the functional dependencies, we collect
- measurements of dependency during ANALYZE>. Assessing
- the degree of dependency between all sets of columns would be
- prohibitively expensive, so the search is limited to potential
- dependencies defined using the dependencies> option of
- extended statistics. It is advisable to create
- dependencies> statistics if and only if functional
- dependencies actually exist, to avoid unnecessary overhead on both
- ANALYZE> and query planning.
+ The existence of functional dependencies directly affects the accuracy
+ of estimates in certain queries. If a query contains conditions on
+ both the independent and the dependent column(s), the
+ conditions on the dependent columns do not further reduce the result
+ size; but without knowledge of the functional dependency, the query
+ planner will assume that the conditions are independent, resulting
+ in underestimating the result size.
- To inspect functional dependencies on a statistics
- stts, you may do this:
+ To inform the planner about functional dependencies, ANALYZE>
+ can collect measurements of cross-column dependency. Assessing the
+ degree of dependency between all sets of columns would be prohibitively
+ expensive, so data collection is limited to those groups of columns
+ appearing together in a statistics object defined with
+ the dependencies> option. It is advisable to create
+ dependencies> statistics only for column groups that are
+ strongly correlated, to avoid unnecessary overhead in both
+ ANALYZE> and later query planning.
+
+
+
+ Here is an example of collecting functional-dependency statistics:
-CREATE STATISTICS stts (dependencies)
- ON zip, city FROM zipcodes;
+CREATE STATISTICS stts (dependencies) ON zip, city FROM zipcodes;
+
ANALYZE zipcodes;
+
SELECT stxname, stxkeys, stxdependencies
FROM pg_statistic_ext
- WHERE stxname = 'stts';
+ WHERE stxname = 'stts';
stxname | stxkeys | stxdependencies
---------+---------+------------------------------------------
stts | 1 5 | {"1 => 5": 1.000000, "5 => 1": 0.423130}
(1 row)
- where it can be seen that column 1 (a zip code) fully determines column
+ Here it can be seen that column 1 (zip code) fully determines column
5 (city) so the coefficient is 1.0, while city only determines zip code
about 42% of the time, meaning that there are many cities (58%) that are
represented by more than a single ZIP code.
- When computing the selectivity, the planner inspects all conditions and
- attempts to identify which conditions are already implied by other
- conditions. The selectivity estimates from any redundant conditions are
- ignored from a selectivity point of view. In the example query above,
- the selectivity estimates for either of the conditions may be eliminated,
- thus improving the overall estimate.
+ When computing the selectivity for a query involving functionally
+ dependent columns, the planner adjusts the per-condition selectivity
+ estimates using the dependency coefficients so as not to produce
+ an underestimate.
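+
+ Roughly speaking (this is a sketch of the adjustment, not the exact
+ implementation), for two equality conditions on columns a>
+ and b> with a dependency a> => b> of
+ degree f, the combined selectivity is estimated as
+
+P(a = x AND b = y) = P(a = x) * (f + (1 - f) * P(b = y))
+
+ rather than as the plain product of the two selectivities; as
+ f approaches 1, the estimate approaches the selectivity of
+ the condition on a> alone.
+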
Limitations of Functional Dependencies
- Functional dependencies are a very simple type of statistics, and
- as such have several limitations. The first limitation is that they
- only work with simple equality conditions, comparing columns and constant
- values. It's not possible to use them to eliminate equality conditions
- comparing two columns or a column to an expression, range clauses,
- LIKE> or any other type of conditions.
+ Functional dependencies are currently only applied when considering
+ simple equality conditions that compare columns to constant values.
+ They are not used to improve estimates for equality conditions
+ comparing two columns or comparing a column to an expression, nor for
+ range clauses, LIKE> or any other type of condition.
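+
+ For instance (a sketch using the zipcodes example), the
+ statistics can improve the estimate for the first of these queries but
+ not for the second:
+
+-- equality comparisons against constants: dependency statistics apply
+SELECT * FROM zipcodes WHERE zip = '94105' AND city = 'San Francisco';
+
+-- range and LIKE conditions: dependency statistics are not used
+SELECT * FROM zipcodes WHERE zip > '90000' AND city LIKE 'San%';
+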
- When eliminating the implied conditions, the planner assumes that the
- conditions are compatible. Consider the following example, where
- this assumption does not hold:
-
+ When estimating with functional dependencies, the planner assumes that
+ conditions on the involved columns are compatible and hence redundant.
+ If they are incompatible, the correct estimate would be zero rows, but
+ that possibility is not considered. For example, given a query like
-EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10;
- QUERY PLAN
------------------------------------------------------------------------------
- Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual rows=0 loops=1)
- Filter: ((a = 1) AND (b = 10))
- Rows Removed by Filter: 10000
+SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '94105';
-
- While there are no rows with such combination of values, the planner
- is unable to verify whether the values match — it only knows that
- the columns are functionally dependent.
+ the planner will disregard the city> clause as not
+ changing the selectivity, which is correct. However, it will make
+ the same assumption about
+
+SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '90210';
+
+ even though there will really be zero rows satisfying this query.
+ Functional dependency statistics do not provide enough information
+ to conclude that, however.
- This assumption is related to queries executed on the database; in many
- cases, it's actually satisfied (e.g. when the GUI only allows selecting
- compatible values). But if that's not the case, functional dependencies
- may not be a viable option.
+ In practice, this assumption is usually satisfied;
+ for example, there might be a GUI in the application that only allows
+ selecting compatible city and ZIP code values to use in a query.
+ But if that's not the case, functional dependencies may not be a viable
+ option.
- Multivariate N-Distinct Coefficients
+ Multivariate N-Distinct Counts
Single-column statistics store the number of distinct values in each
- column. Estimates of the number of distinct values on more than one
- column (for example, for GROUP BY a, b) are
+ column. Estimates of the number of distinct values when combining more
+ than one column (for example, for GROUP BY a, b) are
frequently wrong when the planner only has single-column statistical
- data, however, causing it to select bad plans.
- In order to improve n-distinct estimation when multiple columns are
- grouped together, the ndistinct> option of extended statistics
- can be used, which instructs ANALYZE> to collect n-distinct
- estimates for all possible combinations of two or more columns of the set
- of columns in the statistics object (the per-column estimates are already
- available in pg_statistic>).
+ data, causing it to select bad plans.
- Continuing the above example, the n-distinct coefficients in a ZIP
- code table may look like the following:
+ To improve such estimates, ANALYZE> can collect n-distinct
+ statistics for groups of columns. As before, it's impractical to do
+ this for every possible column grouping, so data is collected only for
+ those groups of columns appearing together in a statistics object
+ defined with the ndistinct> option. Data will be collected
+ for each possible combination of two or more columns from the set of
+ listed columns.
+
+
+
+ Continuing the previous example, the n-distinct counts in a
+ table of ZIP codes might look like the following:
-CREATE STATISTICS stts2 (ndistinct)
- ON zip, state, city FROM zipcodes;
+CREATE STATISTICS stts2 (ndistinct) ON zip, state, city FROM zipcodes;
+
ANALYZE zipcodes;
+
SELECT stxkeys AS k, stxndistinct AS nd
FROM pg_statistic_ext
- WHERE stxname = 'stts2';
+ WHERE stxname = 'stts2';
-[ RECORD 1 ]--------------------------------------------------------
k | 1 2 5
nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178}
(1 row)
- which indicates that there are three combinations of columns that
+ This indicates that there are three combinations of columns that
have 33178 distinct values: ZIP code and state; ZIP code and city;
and ZIP code, city and state (the fact that they are all equal is
- expected given the nature of ZIP-code data). On the other hand,
- the combination of city and state only has 27435 distinct values.
+ expected given that ZIP code alone is unique in this table). On the
+ other hand, the combination of city and state has only 27435 distinct
+ values.
+
+
+
+ It's advisable to create ndistinct> statistics objects only
+ on combinations of columns that are actually used for grouping, and
+ for which misestimation of the number of groups is resulting in bad
+ plans. Otherwise, the ANALYZE> cycles are just wasted.
diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml
index ef847b96333..8caf297f859 100644
--- a/doc/src/sgml/planstats.sgml
+++ b/doc/src/sgml/planstats.sgml
@@ -456,10 +456,11 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
- Functional dependencies
+ Functional Dependencies
+
- Multivariate correlation can be seen with a very simple data set — a
- table with two columns, both containing the same values:
+ Multivariate correlation can be demonstrated with a very simple data set
+ — a table with two columns, both containing the same values:
CREATE TABLE t (a INT, b INT);
@@ -501,8 +502,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1;
number of rows, we see that the estimate is very accurate
(in fact exact, as the table is very small). Changing the
WHERE> to use the b> column, an identical
- plan is generated. Observe what happens if we apply the same
- condition on both columns combining them with AND>:
+ plan is generated. But observe what happens if we apply the same
+ condition on both columns, combining them with AND>:
EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
@@ -514,15 +515,16 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
The planner estimates the selectivity for each condition individually,
- arriving to the 1% estimates as above, and then multiplies them, getting
- the final 0.01% estimate. The actual figures, however,
- show that this results in a significant underestimate, as the actual
- number of rows matching the conditions (100) is two orders of magnitude
- higher than the estimated value.
+ arriving at the same 1% estimates as above. Then it assumes that the
+ conditions are independent, and so it multiplies their selectivities,
+ producing a final selectivity estimate of just 0.01%.
+ This is a significant underestimate, as the actual number of rows
+ matching the conditions (100) is two orders of magnitude higher.
- This problem can be fixed by applying functional-dependency
+ This problem can be fixed by creating a statistics object that
+ directs ANALYZE> to calculate functional-dependency
multivariate statistics on the two columns:
@@ -539,13 +541,15 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
- Multivariate N-Distinct coefficients
+ Multivariate N-Distinct Counts
+
- A similar problem occurs with estimation of the cardinality of distinct
- elements, used to determine the number of groups that would be generated
- by a GROUP BY clause. When GROUP BY
- lists a single column, the n-distinct estimate (which can be seen as the
- number of rows returned by the aggregate execution node) is very accurate:
+ A similar problem occurs with estimation of the cardinality of sets of
+ multiple columns, such as the number of groups that would be generated by
+ a GROUP BY clause. When GROUP BY
+ lists a single column, the n-distinct estimate (which is visible as the
+ estimated number of rows returned by the HashAggregate node) is very
+ accurate:
EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a;
QUERY PLAN
@@ -565,8 +569,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;
Group Key: a, b
-> Seq Scan on t (cost=0.00..145.00 rows=10000 width=8) (actual rows=10000 loops=1)
- By dropping the existing statistics and re-creating it to include n-distinct
- calculation, the estimate is much improved:
+ By redefining the statistics object to include n-distinct counts for the
+ two columns, the estimate is much improved:
DROP STATISTICS stts;
CREATE STATISTICS stts (dependencies, ndistinct) ON a, b FROM t;
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml
index 92ee4e4efa7..854746de24d 100644
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -79,11 +79,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na
statistic_type
- A statistic type to be computed in this statistics object. Currently
- supported types are ndistinct, which enables
- n-distinct coefficient tracking,
- and dependencies, which enables functional
- dependencies.
+ A statistic type to be computed in this statistics object.
+ Currently supported types are
+ ndistinct, which enables n-distinct statistics, and
+ dependencies, which enables functional
+ dependency statistics.
+ For more information, see the sections on functional dependencies and
+ multivariate n-distinct counts in the planner statistics documentation.
@@ -92,7 +94,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na
column_name
- The name of a table column to be included in the statistics object.
+ The name of a table column to be covered by the computed statistics.
+ At least two column names must be given.
@@ -114,7 +117,9 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na
Notes
- You must be the owner of a table to create or change statistics on it.
+ You must be the owner of a table to create a statistics object
+ reading it. Once created, however, the ownership of the statistics
+ object is independent of the underlying table(s).
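+
+ For example, ownership of an existing statistics object can be
+ reassigned with ALTER STATISTICS> (a sketch; the role name
+ is hypothetical):
+
+ALTER STATISTICS s1 OWNER TO stats_admin;
+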
@@ -124,8 +129,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na
Create table t1> with two functionally dependent columns, i.e.
knowledge of a value in the first column is sufficient for determining the
- value in the other column. Then functional dependencies are built on those
- columns:
+ value in the other column. Then functional dependency statistics are built
+ on those columns:
CREATE TABLE t1 (
@@ -136,21 +141,25 @@ CREATE TABLE t1 (
INSERT INTO t1 SELECT i/100, i/500
FROM generate_series(1,1000000) s(i);
+ANALYZE t1;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
+
CREATE STATISTICS s1 (dependencies) ON a, b FROM t1;
ANALYZE t1;
--- valid combination of values
+-- now the rowcount estimate is more accurate:
EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
-
--- invalid combination of values
-EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 1);
- Without functional-dependency statistics, the planner would make the
- same estimate of the number of matching rows for these two queries.
- With such statistics, it is able to tell that one case has matches
- and the other does not.
+ Without functional-dependency statistics, the planner would assume
+ that the two WHERE> conditions are independent, and would
+ multiply their selectivities together to arrive at a much-too-small
+ rowcount estimate.
+ With such statistics, the planner recognizes that the WHERE>
+ conditions are redundant and does not underestimate the rowcount.