1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-13 18:28:01 +03:00
Files
postgres/contrib/intarray/_int_selfuncs.c
Tom Lane 261f89a976 Track the maximum possible frequency of non-MCE array elements.
The lossy-counting algorithm that ANALYZE uses to identify most-common
array elements has a notion of cutoff frequency: elements with
frequency greater than that are guaranteed to be collected, elements
with smaller frequencies are not.  In cases where we find fewer MCEs
than the stats target would permit us to store, the cutoff frequency
provides valuable additional information, to wit that there are no
non-MCEs with frequency greater than that.  What the selectivity
estimation functions actually use the "minfreq" entry for is as a
ceiling on the possible frequency of non-MCEs, so using the cutoff
rather than the lowest stored MCE frequency provides a tighter bound
and more accurate estimates.

Therefore, instead of redundantly storing the minimum observed MCE
frequency, store the cutoff frequency when there are fewer tracked
values than we want.  (When there are more, then of course we cannot
assert that no non-stored elements are above the cutoff frequency,
since we're throwing away some that are; so we still use the
minimum stored frequency in that case.)

Notably, this works even when none of the values are common enough
to be called MCEs.  In such cases we previously stored nothing in
the STATISTIC_KIND_MCELEM pg_statistic slot, which resulted in the
selectivity functions falling back to default estimates.  So in that
case we want to construct a STATISTIC_KIND_MCELEM entry that contains
no "values" but does have "numbers", to wit the three extra numbers
that the MCELEM entry type defines.  A small obstacle is that
update_attstats() has traditionally stored a null, not an empty array,
when passed zero "values" for a slot.  That gives rise to an MCELEM
entry that get_attstatsslot() will spit up on.  The least risky
solution seems to be to adjust update_attstats() so that it will emit
a non-null (but possibly empty) array when the passed stavalues array
pointer isn't NULL, rather than conditioning that on numvalues > 0.
In other existing cases I don't believe that that changes anything.
For consistency, handle the stanumbers array the same way.

In passing, improve the comments in routines that use
STATISTIC_KIND_MCELEM data.  Particularly, explain why we use
minfreq / 2 not minfreq as the estimate for non-MCE values.

Thanks to Matt Long for the suggestion that we could apply this
idea even when there are more than zero MCEs.

Reported-by: Mark Frost <FROSTMAR@uk.ibm.com>
Reported-by: Matt Long <matt@mattlong.org>
Author: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/PH3PPF1C905D6E6F24A5C1A1A1D8345B593E16FA@PH3PPF1C905D6E6.namprd15.prod.outlook.com
2025-09-20 14:48:16 -04:00

336 lines
8.6 KiB
C

/*-------------------------------------------------------------------------
*
* _int_selfuncs.c
* Functions for selectivity estimation of intarray operators
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* contrib/intarray/_int_selfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "_int.h"
#include "access/htup_details.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "miscadmin.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
/*
 * Function-manager info declarations for the Datum-returning entry points
 * defined in this file: six wrappers around the built-in array selectivity
 * estimators, plus the estimator for the "@@" operator.
 */
PG_FUNCTION_INFO_V1(_int_overlap_sel);
PG_FUNCTION_INFO_V1(_int_contains_sel);
PG_FUNCTION_INFO_V1(_int_contained_sel);
PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
PG_FUNCTION_INFO_V1(_int_contains_joinsel);
PG_FUNCTION_INFO_V1(_int_contained_joinsel);
PG_FUNCTION_INFO_V1(_int_matchsel);
/*
 * Forward declarations of the file-local helpers: the recursive estimator
 * that walks a query_int expression (called from _int_matchsel), and the
 * bsearch() comparator it uses to look up values in the MCELEM array.
 */
static Selectivity int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
int nmcelems, float4 minfreq);
static int compare_val_int4(const void *a, const void *b);
/*
* Wrappers around the default array selectivity estimation functions.
*
* The default array selectivity estimators for the @>, && and <@ operators
* work fine for integer arrays. However, if we tried to just use arraycontsel
* and arraycontjoinsel directly as the cost estimator functions for our
* operators, they would not work as intended, because they look at the
* operator's OID. Our operators behave exactly like the built-in anyarray
* versions, but we must tell the cost estimator functions which built-in
* operators they correspond to. These wrappers just replace the operator
* OID with the corresponding built-in operator's OID, and call the built-in
* function.
*/
/*
 * _int_overlap_sel -- restriction selectivity wrapper for the overlap case.
 *
 * Passes arguments 0, 2 and 3 through to arraycontsel untouched, but
 * substitutes the built-in array overlap operator's OID for argument 1.
 */
Datum
_int_overlap_sel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP);
	Datum		estimate;

	estimate = DirectFunctionCall4(arraycontsel,
								   PG_GETARG_DATUM(0),
								   builtin_opr,
								   PG_GETARG_DATUM(2),
								   PG_GETARG_DATUM(3));

	PG_RETURN_DATUM(estimate);
}
/*
 * _int_contains_sel -- restriction selectivity wrapper for the contains case.
 *
 * Passes arguments 0, 2 and 3 through to arraycontsel untouched, but
 * substitutes the built-in array contains operator's OID for argument 1.
 */
Datum
_int_contains_sel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP);
	Datum		estimate;

	estimate = DirectFunctionCall4(arraycontsel,
								   PG_GETARG_DATUM(0),
								   builtin_opr,
								   PG_GETARG_DATUM(2),
								   PG_GETARG_DATUM(3));

	PG_RETURN_DATUM(estimate);
}
/*
 * _int_contained_sel -- restriction selectivity wrapper for the
 * contained-by case.
 *
 * Passes arguments 0, 2 and 3 through to arraycontsel untouched, but
 * substitutes the built-in array contained-by operator's OID for argument 1.
 */
Datum
_int_contained_sel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP);
	Datum		estimate;

	estimate = DirectFunctionCall4(arraycontsel,
								   PG_GETARG_DATUM(0),
								   builtin_opr,
								   PG_GETARG_DATUM(2),
								   PG_GETARG_DATUM(3));

	PG_RETURN_DATUM(estimate);
}
/*
 * _int_overlap_joinsel -- join selectivity wrapper for the overlap case.
 *
 * Passes arguments 0, 2, 3 and 4 through to arraycontjoinsel untouched, but
 * substitutes the built-in array overlap operator's OID for argument 1.
 */
Datum
_int_overlap_joinsel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP);
	Datum		estimate;

	estimate = DirectFunctionCall5(arraycontjoinsel,
								   PG_GETARG_DATUM(0),
								   builtin_opr,
								   PG_GETARG_DATUM(2),
								   PG_GETARG_DATUM(3),
								   PG_GETARG_DATUM(4));

	PG_RETURN_DATUM(estimate);
}
/*
 * _int_contains_joinsel -- join selectivity wrapper for the contains case.
 *
 * Passes arguments 0, 2, 3 and 4 through to arraycontjoinsel untouched, but
 * substitutes the built-in array contains operator's OID for argument 1.
 */
Datum
_int_contains_joinsel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP);
	Datum		estimate;

	estimate = DirectFunctionCall5(arraycontjoinsel,
								   PG_GETARG_DATUM(0),
								   builtin_opr,
								   PG_GETARG_DATUM(2),
								   PG_GETARG_DATUM(3),
								   PG_GETARG_DATUM(4));

	PG_RETURN_DATUM(estimate);
}
/*
 * _int_contained_joinsel -- join selectivity wrapper for the contained-by
 * case.
 *
 * Passes arguments 0, 2, 3 and 4 through to arraycontjoinsel untouched, but
 * substitutes the built-in array contained-by operator's OID for argument 1.
 */
Datum
_int_contained_joinsel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP);
	Datum		estimate;

	estimate = DirectFunctionCall5(arraycontjoinsel,
								   PG_GETARG_DATUM(0),
								   builtin_opr,
								   PG_GETARG_DATUM(2),
								   PG_GETARG_DATUM(3),
								   PG_GETARG_DATUM(4));

	PG_RETURN_DATUM(estimate);
}
/*
 * _int_matchsel -- restriction selectivity function for intarray @@ query_int
 *
 * Call arguments read here: 0 is the PlannerInfo, 2 is the operator's
 * argument list, 3 is varRelid.  Argument 1 is not examined.
 *
 * Returns a float8 selectivity.  DEFAULT_EQ_SEL is returned when the clause
 * is not of the form "variable @@ constant" with an int[] variable; 0.0 is
 * returned for a NULL or empty query.  Otherwise the query expression is
 * evaluated against the column's most-common-elements statistics (if any)
 * and scaled by the fraction of non-null rows.
 *
 * Once get_restriction_variable() has succeeded, every exit path must call
 * ReleaseVariableStats() so that the statistics tuple is not leaked.
 */
Datum
_int_matchsel(PG_FUNCTION_ARGS)
{
	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
	List	   *args = (List *) PG_GETARG_POINTER(2);
	int			varRelid = PG_GETARG_INT32(3);
	VariableStatData vardata;
	Node	   *other;
	bool		varonleft;
	Selectivity selec;
	QUERYTYPE  *query;
	Datum	   *mcelems = NULL;
	float4	   *mcefreqs = NULL;
	int			nmcelems = 0;
	float4		minfreq = 0.0;
	float4		nullfrac = 0.0;
	AttStatsSlot sslot;

	/*
	 * If expression is not "variable @@ something" or "something @@ variable"
	 * then punt and return a default estimate.  Nothing has been acquired
	 * yet, so there is nothing to release on this path.
	 */
	if (!get_restriction_variable(root, args, varRelid,
								  &vardata, &other, &varonleft))
		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);

	/*
	 * Variable should be int[]. We don't support cases where variable is
	 * query_int.  vardata is valid at this point, so release it before
	 * bailing out.
	 */
	if (vardata.vartype != INT4ARRAYOID)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
	}

	/*
	 * Can't do anything useful if the something is not a constant, either.
	 */
	if (!IsA(other, Const))
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
	}

	/*
	 * The "@@" operator is strict, so we can cope with NULL right away.
	 */
	if (((Const *) other)->constisnull)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(0.0);
	}

	/* The caller made sure the const is a query, so get it now */
	query = DatumGetQueryTypeP(((Const *) other)->constvalue);

	/* Empty query matches nothing */
	if (query->size == 0)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(0.0);
	}

	/*
	 * Start from an all-zeroes slot so that the free_attstatsslot() call at
	 * the bottom is valid no matter which of the branches below is taken.
	 */
	memset(&sslot, 0, sizeof(sslot));

	/*
	 * Get the statistics for the intarray column.
	 *
	 * We're interested in the Most-Common-Elements list, and the NULL
	 * fraction.
	 */
	if (HeapTupleIsValid(vardata.statsTuple))
	{
		Form_pg_statistic stats;

		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
		nullfrac = stats->stanullfrac;

		/*
		 * For an int4 array, the default array type analyze function will
		 * collect a Most Common Elements list, which is an array of int4s.
		 */
		if (get_attstatsslot(&sslot, vardata.statsTuple,
							 STATISTIC_KIND_MCELEM, InvalidOid,
							 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
		{
			Assert(sslot.valuetype == INT4OID);

			/*
			 * There should be three more Numbers than Values: the
			 * per-element frequencies are followed by three extra entries.
			 * Punt (leaving the MCE variables at their "no stats" defaults)
			 * if the slot doesn't have that shape.
			 */
			if (sslot.nnumbers == sslot.nvalues + 3)
			{
				/*
				 * The first extra entry is what we use as the ceiling on
				 * the frequency of any element not present in the list.
				 */
				minfreq = sslot.numbers[sslot.nvalues];
				mcelems = sslot.values;
				mcefreqs = sslot.numbers;
				nmcelems = sslot.nvalues;
			}
		}
	}

	/*
	 * Process the logical expression in the query, using the stats.  The
	 * root of the expression is the last item of the query.
	 */
	selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
								mcelems, mcefreqs, nmcelems, minfreq);

	/* MCE stats count only non-null rows, so adjust for null rows. */
	selec *= (1.0 - nullfrac);

	/* mcelems/mcefreqs point into sslot, so free it only after use */
	free_attstatsslot(&sslot);
	ReleaseVariableStats(vardata);

	CLAMP_PROBABILITY(selec);

	PG_RETURN_FLOAT8((float8) selec);
}
/*
 * Estimate the selectivity of the query_int subexpression rooted at "item".
 *
 * mcelems/mcefreqs/nmcelems describe the most-common-elements list and the
 * matching frequencies; mcelems may be NULL when no usable statistics are
 * available.  minfreq is the upper bound assumed for the frequency of any
 * value that is not in the list.
 *
 * A VAL node is looked up in the list; an OPR node combines the estimates
 * of its operand(s), treating them as independent.  Any other node type, or
 * an unknown operator, raises an error.
 */
static Selectivity
int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
					int nmcelems, float4 minfreq)
{
	Selectivity result;

	/* since this function recurses, it could be driven to stack overflow */
	check_stack_depth();

	if (item->type == OPR)
	{
		/*
		 * Operator node.  The operand stored immediately before this node
		 * is always evaluated first; binary operators additionally
		 * evaluate the operand found via the node's "left" offset.
		 */
		Selectivity s_prev;
		Selectivity s_left;

		s_prev = int_query_opr_selec(item - 1, mcelems, mcefreqs, nmcelems,
									 minfreq);

		if (item->val == (int32) '!')
		{
			result = 1.0 - s_prev;
		}
		else if (item->val == (int32) '&')
		{
			s_left = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
										 nmcelems, minfreq);
			result = s_prev * s_left;
		}
		else if (item->val == (int32) '|')
		{
			s_left = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
										 nmcelems, minfreq);
			result = s_prev + s_left - s_prev * s_left;
		}
		else
		{
			elog(ERROR, "unrecognized operator: %d", item->val);
			result = 0;			/* keep compiler quiet */
		}
	}
	else if (item->type == VAL)
	{
		Datum	   *hit;

		/* Without an MCELEM list all we can offer is the default guess */
		if (mcelems == NULL)
			return (Selectivity) DEFAULT_EQ_SEL;

		hit = (Datum *) bsearch(&item->val, mcelems, nmcelems,
								sizeof(Datum), compare_val_int4);

		if (hit == NULL)
		{
			/*
			 * Not a most-common element.  Its frequency can be at most
			 * minfreq and might be far lower, so guess half of that.  For
			 * probably-historical reasons the guess is also capped at
			 * DEFAULT_EQ_SEL.
			 */
			result = Min(DEFAULT_EQ_SEL, minfreq / 2);
		}
		else
		{
			/*
			 * Found in MCELEM: use the recorded frequency, which is as
			 * precise as ANALYZE could determine.
			 */
			result = mcefreqs[hit - mcelems];
		}
	}
	else
	{
		elog(ERROR, "unrecognized int query item type: %u", item->type);
		result = 0;				/* keep compiler quiet */
	}

	/* Clamp intermediate results to stay sane despite roundoff error */
	CLAMP_PROBABILITY(result);

	return result;
}
/*
* Comparison function for binary search in mcelem array.
*/
static int
compare_val_int4(const void *a, const void *b)
{
int32 key = *(int32 *) a;
const Datum *t = (const Datum *) b;
return key - DatumGetInt32(*t);
}