diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 303f3a6f921..fa6aa7d3014 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -5072,9 +5072,9 @@ The number of distinct nonnull data values in the column. A value greater than zero is the actual number of distinct values. A value less than zero is the negative of a multiplier for the number - of rows in the table; for example, a column in which values appear about - twice on the average could be represented by - stadistinct = -0.5. + of rows in the table; for example, a column in which about 80% of the + values are nonnull and each nonnull value appears about twice on + average could be represented by stadistinct = -0.4. A zero value means the number of distinct values is unknown. diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 87993069830..9800d762fba 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -2044,8 +2044,11 @@ compute_minimal_stats(VacAttrStatsP stats, if (nmultiple == 0) { - /* If we found no repeated values, assume it's a unique column */ - stats->stadistinct = -1.0; + /* + * If we found no repeated non-null values, assume it's a unique + * column; but be sure to discount for any nulls we found. + */ + stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); } else if (track_cnt < track_max && toowide_cnt == 0 && nmultiple == track_cnt) @@ -2390,8 +2393,11 @@ compute_scalar_stats(VacAttrStatsP stats, if (nmultiple == 0) { - /* If we found no repeated values, assume it's a unique column */ - stats->stadistinct = -1.0; + /* + * If we found no repeated non-null values, assume it's a unique + * column; but be sure to discount for any nulls we found. 
+ */ + stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); } else if (toowide_cnt == 0 && nmultiple == ndistinct) { @@ -2695,7 +2701,7 @@ compute_scalar_stats(VacAttrStatsP stats, else stats->stawidth = stats->attrtype->typlen; /* Assume all too-wide values are distinct, so it's a unique column */ - stats->stadistinct = -1.0; + stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); } else if (null_cnt > 0) { diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index 308ae0893c8..5914d27062b 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -296,7 +296,7 @@ compute_tsvector_stats(VacAttrStats *stats, stats->stawidth = total_width / (double) nonnull_cnt; /* Assume it's a unique column (see notes above) */ - stats->stadistinct = -1.0; + stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); /* * Construct an array of the interesting hashtable items, that is, diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 2c993148271..9c7ffbf6bfb 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -4383,12 +4383,14 @@ double get_variable_numdistinct(VariableStatData *vardata) { double stadistinct; + double stanullfrac = 0.0; double ntuples; /* * Determine the stadistinct value to use. There are cases where we can * get an estimate even without a pg_statistic entry, or can get a better - * value than is in pg_statistic. + * value than is in pg_statistic. Grab stanullfrac too if we can find it + * (otherwise, assume no nulls, for lack of any better idea). 
*/ if (HeapTupleIsValid(vardata->statsTuple)) { @@ -4397,6 +4399,7 @@ get_variable_numdistinct(VariableStatData *vardata) stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); stadistinct = stats->stadistinct; + stanullfrac = stats->stanullfrac; } else if (vardata->vartype == BOOLOID) { @@ -4420,7 +4423,7 @@ get_variable_numdistinct(VariableStatData *vardata) { case ObjectIdAttributeNumber: case SelfItemPointerAttributeNumber: - stadistinct = -1.0; /* unique */ + stadistinct = -1.0; /* unique (and all non-null) */ break; case TableOidAttributeNumber: stadistinct = 1.0; /* only 1 value */ @@ -4442,10 +4445,11 @@ get_variable_numdistinct(VariableStatData *vardata) * If there is a unique index for the variable, assume it is unique no * matter what pg_statistic says; the statistics could be out of date, or * we might have found a partial unique index that proves the var is - * unique for this query. + * unique for this query. However, we'd better still believe + * the null-fraction statistic. */ if (vardata->isunique) - stadistinct = -1.0; + stadistinct = -1.0 * (1.0 - stanullfrac); /* * If we had an absolute estimate, use that. diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h index 044eb446c08..ddef0f49e6d 100644 --- a/src/include/catalog/pg_statistic.h +++ b/src/include/catalog/pg_statistic.h @@ -67,13 +67,14 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS * > 0 actual number of distinct values * < 0 negative of multiplier for number of rows * The special negative case allows us to cope with columns that are - * unique (stadistinct = -1) or nearly so (for example, a column in - * which values appear about twice on the average could be represented - * by stadistinct = -0.5). Because the number-of-rows statistic in - * pg_class may be updated more frequently than pg_statistic is, it's - * important to be able to describe such situations as a multiple of - * the number of rows, rather than a fixed number of distinct values. 
- * But in other cases a fixed number is correct (eg, a boolean column). + * unique (stadistinct = -1) or nearly so (for example, a column in which + * non-null values appear about twice on the average could be represented + * by stadistinct = -0.5 if there are no nulls, or -0.4 if 20% of the + * column is nulls). Because the number-of-rows statistic in pg_class may + * be updated more frequently than pg_statistic is, it's important to be + * able to describe such situations as a multiple of the number of rows, + * rather than a fixed number of distinct values. But in other cases a + * fixed number is correct (eg, a boolean column). * ---------------- */ float4 stadistinct;