mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-24 01:29:19 +03:00 
			
		
		
		
	The lossy-counting algorithm that ANALYZE uses to identify most-common array elements has a notion of cutoff frequency: elements with frequency greater than that are guaranteed to be collected, elements with smaller frequencies are not. In cases where we find fewer MCEs than the stats target would permit us to store, the cutoff frequency provides valuable additional information, to wit that there are no non-MCEs with frequency greater than that. What the selectivity estimation functions actually use the "minfreq" entry for is as a ceiling on the possible frequency of non-MCEs, so using the cutoff rather than the lowest stored MCE frequency provides a tighter bound and more accurate estimates. Therefore, instead of redundantly storing the minimum observed MCE frequency, store the cutoff frequency when there are fewer tracked values than we want. (When there are more, then of course we cannot assert that no non-stored elements are above the cutoff frequency, since we're throwing away some that are; so we still use the minimum stored frequency in that case.) Notably, this works even when none of the values are common enough to be called MCEs. In such cases we previously stored nothing in the STATISTIC_KIND_MCELEM pg_statistic slot, which resulted in the selectivity functions falling back to default estimates. So in that case we want to construct a STATISTIC_KIND_MCELEM entry that contains no "values" but does have "numbers", to wit the three extra numbers that the MCELEM entry type defines. A small obstacle is that update_attstats() has traditionally stored a null, not an empty array, when passed zero "values" for a slot. That gives rise to an MCELEM entry that get_attstatsslot() will spit up on. The least risky solution seems to be to adjust update_attstats() so that it will emit a non-null (but possibly empty) array when the passed stavalues array pointer isn't NULL, rather than conditioning that on numvalues > 0. 
In other existing cases I don't believe that that changes anything. For consistency, handle the stanumbers array the same way. In passing, improve the comments in routines that use STATISTIC_KIND_MCELEM data. Particularly, explain why we use minfreq / 2 not minfreq as the estimate for non-MCE values. Thanks to Matt Long for the suggestion that we could apply this idea even when there are more than zero MCEs. Reported-by: Mark Frost <FROSTMAR@uk.ibm.com> Reported-by: Matt Long <matt@mattlong.org> Author: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://postgr.es/m/PH3PPF1C905D6E6F24A5C1A1A1D8345B593E16FA@PH3PPF1C905D6E6.namprd15.prod.outlook.com
		
			
				
	
	
		
			336 lines
		
	
	
		
			8.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			336 lines
		
	
	
		
			8.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*-------------------------------------------------------------------------
 | |
|  *
 | |
|  * _int_selfuncs.c
 | |
|  *	  Functions for selectivity estimation of intarray operators
 | |
|  *
 | |
|  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 | |
|  * Portions Copyright (c) 1994, Regents of the University of California
 | |
|  *
 | |
|  *
 | |
|  * IDENTIFICATION
 | |
|  *	  contrib/intarray/_int_selfuncs.c
 | |
|  *
 | |
|  *-------------------------------------------------------------------------
 | |
|  */
 | |
| #include "postgres.h"
 | |
| 
 | |
| #include "_int.h"
 | |
| #include "access/htup_details.h"
 | |
| #include "catalog/pg_operator.h"
 | |
| #include "catalog/pg_statistic.h"
 | |
| #include "catalog/pg_type.h"
 | |
| #include "miscadmin.h"
 | |
| #include "utils/fmgrprotos.h"
 | |
| #include "utils/lsyscache.h"
 | |
| #include "utils/selfuncs.h"
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(_int_overlap_sel);
 | |
| PG_FUNCTION_INFO_V1(_int_contains_sel);
 | |
| PG_FUNCTION_INFO_V1(_int_contained_sel);
 | |
| PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
 | |
| PG_FUNCTION_INFO_V1(_int_contains_joinsel);
 | |
| PG_FUNCTION_INFO_V1(_int_contained_joinsel);
 | |
| PG_FUNCTION_INFO_V1(_int_matchsel);
 | |
| 
 | |
| 
 | |
| static Selectivity int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
 | |
| 									   int nmcelems, float4 minfreq);
 | |
| static int	compare_val_int4(const void *a, const void *b);
 | |
| 
 | |
| /*
 | |
|  * Wrappers around the default array selectivity estimation functions.
 | |
|  *
 | |
|  * The default array selectivity estimators for the @>, && and <@ operators
 | |
|  * work fine for integer arrays. However, if we tried to just use arraycontsel
 | |
|  * and arraycontjoinsel directly as the cost estimator functions for our
 | |
|  * operators, they would not work as intended, because they look at the
 | |
|  * operator's OID. Our operators behave exactly like the built-in anyarray
 | |
|  * versions, but we must tell the cost estimator functions which built-in
 | |
|  * operators they correspond to. These wrappers just replace the operator
 | |
|  * OID with the corresponding built-in operator's OID, and call the built-in
 | |
|  * function.
 | |
|  */
 | |
| 
 | |
| Datum
 | |
| _int_overlap_sel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
 | |
| 										PG_GETARG_DATUM(0),
 | |
| 										ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
 | |
| 										PG_GETARG_DATUM(2),
 | |
| 										PG_GETARG_DATUM(3)));
 | |
| }
 | |
| 
 | |
| Datum
 | |
| _int_contains_sel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
 | |
| 										PG_GETARG_DATUM(0),
 | |
| 										ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
 | |
| 										PG_GETARG_DATUM(2),
 | |
| 										PG_GETARG_DATUM(3)));
 | |
| }
 | |
| 
 | |
| Datum
 | |
| _int_contained_sel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
 | |
| 										PG_GETARG_DATUM(0),
 | |
| 										ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
 | |
| 										PG_GETARG_DATUM(2),
 | |
| 										PG_GETARG_DATUM(3)));
 | |
| }
 | |
| 
 | |
| Datum
 | |
| _int_overlap_joinsel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
 | |
| 										PG_GETARG_DATUM(0),
 | |
| 										ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
 | |
| 										PG_GETARG_DATUM(2),
 | |
| 										PG_GETARG_DATUM(3),
 | |
| 										PG_GETARG_DATUM(4)));
 | |
| }
 | |
| 
 | |
| Datum
 | |
| _int_contains_joinsel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
 | |
| 										PG_GETARG_DATUM(0),
 | |
| 										ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
 | |
| 										PG_GETARG_DATUM(2),
 | |
| 										PG_GETARG_DATUM(3),
 | |
| 										PG_GETARG_DATUM(4)));
 | |
| }
 | |
| 
 | |
| Datum
 | |
| _int_contained_joinsel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
 | |
| 										PG_GETARG_DATUM(0),
 | |
| 										ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
 | |
| 										PG_GETARG_DATUM(2),
 | |
| 										PG_GETARG_DATUM(3),
 | |
| 										PG_GETARG_DATUM(4)));
 | |
| }
 | |
| 
 | |
| 
 | |
| /*
 | |
|  * _int_matchsel -- restriction selectivity function for intarray @@ query_int
 | |
|  */
 | |
| Datum
 | |
| _int_matchsel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 | |
| 
 | |
| 	List	   *args = (List *) PG_GETARG_POINTER(2);
 | |
| 	int			varRelid = PG_GETARG_INT32(3);
 | |
| 	VariableStatData vardata;
 | |
| 	Node	   *other;
 | |
| 	bool		varonleft;
 | |
| 	Selectivity selec;
 | |
| 	QUERYTYPE  *query;
 | |
| 	Datum	   *mcelems = NULL;
 | |
| 	float4	   *mcefreqs = NULL;
 | |
| 	int			nmcelems = 0;
 | |
| 	float4		minfreq = 0.0;
 | |
| 	float4		nullfrac = 0.0;
 | |
| 	AttStatsSlot sslot;
 | |
| 
 | |
| 	/*
 | |
| 	 * If expression is not "variable @@ something" or "something @@ variable"
 | |
| 	 * then punt and return a default estimate.
 | |
| 	 */
 | |
| 	if (!get_restriction_variable(root, args, varRelid,
 | |
| 								  &vardata, &other, &varonleft))
 | |
| 		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 | |
| 
 | |
| 	/*
 | |
| 	 * Variable should be int[]. We don't support cases where variable is
 | |
| 	 * query_int.
 | |
| 	 */
 | |
| 	if (vardata.vartype != INT4ARRAYOID)
 | |
| 		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 | |
| 
 | |
| 	/*
 | |
| 	 * Can't do anything useful if the something is not a constant, either.
 | |
| 	 */
 | |
| 	if (!IsA(other, Const))
 | |
| 	{
 | |
| 		ReleaseVariableStats(vardata);
 | |
| 		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * The "@@" operator is strict, so we can cope with NULL right away.
 | |
| 	 */
 | |
| 	if (((Const *) other)->constisnull)
 | |
| 	{
 | |
| 		ReleaseVariableStats(vardata);
 | |
| 		PG_RETURN_FLOAT8(0.0);
 | |
| 	}
 | |
| 
 | |
| 	/* The caller made sure the const is a query, so get it now */
 | |
| 	query = DatumGetQueryTypeP(((Const *) other)->constvalue);
 | |
| 
 | |
| 	/* Empty query matches nothing */
 | |
| 	if (query->size == 0)
 | |
| 	{
 | |
| 		ReleaseVariableStats(vardata);
 | |
| 		PG_RETURN_FLOAT8(0.0);
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * Get the statistics for the intarray column.
 | |
| 	 *
 | |
| 	 * We're interested in the Most-Common-Elements list, and the NULL
 | |
| 	 * fraction.
 | |
| 	 */
 | |
| 	if (HeapTupleIsValid(vardata.statsTuple))
 | |
| 	{
 | |
| 		Form_pg_statistic stats;
 | |
| 
 | |
| 		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 | |
| 		nullfrac = stats->stanullfrac;
 | |
| 
 | |
| 		/*
 | |
| 		 * For an int4 array, the default array type analyze function will
 | |
| 		 * collect a Most Common Elements list, which is an array of int4s.
 | |
| 		 */
 | |
| 		if (get_attstatsslot(&sslot, vardata.statsTuple,
 | |
| 							 STATISTIC_KIND_MCELEM, InvalidOid,
 | |
| 							 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
 | |
| 		{
 | |
| 			Assert(sslot.valuetype == INT4OID);
 | |
| 
 | |
| 			/*
 | |
| 			 * There should be three more Numbers than Values, because the
 | |
| 			 * last three (for intarray) cells are taken for minimal, maximal
 | |
| 			 * and nulls frequency. Punt if not.
 | |
| 			 */
 | |
| 			if (sslot.nnumbers == sslot.nvalues + 3)
 | |
| 			{
 | |
| 				/* Grab the minimal MCE frequency. */
 | |
| 				minfreq = sslot.numbers[sslot.nvalues];
 | |
| 
 | |
| 				mcelems = sslot.values;
 | |
| 				mcefreqs = sslot.numbers;
 | |
| 				nmcelems = sslot.nvalues;
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	else
 | |
| 		memset(&sslot, 0, sizeof(sslot));
 | |
| 
 | |
| 	/* Process the logical expression in the query, using the stats */
 | |
| 	selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
 | |
| 								mcelems, mcefreqs, nmcelems, minfreq);
 | |
| 
 | |
| 	/* MCE stats count only non-null rows, so adjust for null rows. */
 | |
| 	selec *= (1.0 - nullfrac);
 | |
| 
 | |
| 	free_attstatsslot(&sslot);
 | |
| 	ReleaseVariableStats(vardata);
 | |
| 
 | |
| 	CLAMP_PROBABILITY(selec);
 | |
| 
 | |
| 	PG_RETURN_FLOAT8((float8) selec);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Estimate selectivity of single intquery operator
 | |
|  */
 | |
| static Selectivity
 | |
| int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
 | |
| 					int nmcelems, float4 minfreq)
 | |
| {
 | |
| 	Selectivity selec;
 | |
| 
 | |
| 	/* since this function recurses, it could be driven to stack overflow */
 | |
| 	check_stack_depth();
 | |
| 
 | |
| 	if (item->type == VAL)
 | |
| 	{
 | |
| 		Datum	   *searchres;
 | |
| 
 | |
| 		if (mcelems == NULL)
 | |
| 			return (Selectivity) DEFAULT_EQ_SEL;
 | |
| 
 | |
| 		searchres = (Datum *) bsearch(&item->val, mcelems, nmcelems,
 | |
| 									  sizeof(Datum), compare_val_int4);
 | |
| 		if (searchres)
 | |
| 		{
 | |
| 			/*
 | |
| 			 * The element is in MCELEM.  Return precise selectivity (or at
 | |
| 			 * least as precise as ANALYZE could find out).
 | |
| 			 */
 | |
| 			selec = mcefreqs[searchres - mcelems];
 | |
| 		}
 | |
| 		else
 | |
| 		{
 | |
| 			/*
 | |
| 			 * The element is not in MCELEM.  Estimate its frequency as half
 | |
| 			 * that of the least-frequent MCE.  (We know it cannot be more
 | |
| 			 * than minfreq, and it could be a great deal less.  Half seems
 | |
| 			 * like a good compromise.)  For probably-historical reasons,
 | |
| 			 * clamp to not more than DEFAULT_EQ_SEL.
 | |
| 			 */
 | |
| 			selec = Min(DEFAULT_EQ_SEL, minfreq / 2);
 | |
| 		}
 | |
| 	}
 | |
| 	else if (item->type == OPR)
 | |
| 	{
 | |
| 		/* Current query node is an operator */
 | |
| 		Selectivity s1,
 | |
| 					s2;
 | |
| 
 | |
| 		s1 = int_query_opr_selec(item - 1, mcelems, mcefreqs, nmcelems,
 | |
| 								 minfreq);
 | |
| 		switch (item->val)
 | |
| 		{
 | |
| 			case (int32) '!':
 | |
| 				selec = 1.0 - s1;
 | |
| 				break;
 | |
| 
 | |
| 			case (int32) '&':
 | |
| 				s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
 | |
| 										 nmcelems, minfreq);
 | |
| 				selec = s1 * s2;
 | |
| 				break;
 | |
| 
 | |
| 			case (int32) '|':
 | |
| 				s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
 | |
| 										 nmcelems, minfreq);
 | |
| 				selec = s1 + s2 - s1 * s2;
 | |
| 				break;
 | |
| 
 | |
| 			default:
 | |
| 				elog(ERROR, "unrecognized operator: %d", item->val);
 | |
| 				selec = 0;		/* keep compiler quiet */
 | |
| 				break;
 | |
| 		}
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 		elog(ERROR, "unrecognized int query item type: %u", item->type);
 | |
| 		selec = 0;				/* keep compiler quiet */
 | |
| 	}
 | |
| 
 | |
| 	/* Clamp intermediate results to stay sane despite roundoff error */
 | |
| 	CLAMP_PROBABILITY(selec);
 | |
| 
 | |
| 	return selec;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Comparison function for binary search in mcelem array.
 | |
|  */
 | |
| static int
 | |
| compare_val_int4(const void *a, const void *b)
 | |
| {
 | |
| 	int32		key = *(int32 *) a;
 | |
| 	const Datum *t = (const Datum *) b;
 | |
| 
 | |
| 	return key - DatumGetInt32(*t);
 | |
| }
 |