First cut at making useful selectivity estimates for range queries

(ie, WHERE x > lowbound AND x < highbound). It's not very bright yet but it does something useful. Also, rename intltsel/intgtsel to scalarltsel/scalargtsel to reflect usage better. Extend convert_to_scalar to do something a little bit useful with string data types. Still need to make it do something with date/time datatypes, but I'll wait for Thomas's datetime unification dust to settle first. Eventually the routine ought not have any type-specific knowledge at all; it ought to be calling a type-dependent routine found via a pg_type column; but that's a task for another day.
2025-11-12 05:01:15 +03:00 · 2000-01-24 07:16:52 +00:00
parent 8bcac56086
commit 0dbffa704a
9 changed files with 529 additions and 248 deletions
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/clausesel.c,v 1.28 2000/01/23 02:06:58 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/clausesel.c,v 1.29 2000/01/24 07:16:46 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -23,6 +23,23 @@
 #include "utils/lsyscache.h"


+/*
+ * Data structure for accumulating info about possible range-query
+ * clause pairs in clauselist_selectivity.
+ */
+typedef struct RangeQueryClause {
+	struct RangeQueryClause *next; /* next in linked list */
+	Node	   *var;			/* The common variable of the clauses */
+	bool		have_lobound;	/* found a low-bound clause yet? */
+	bool		have_hibound;	/* found a high-bound clause yet? */
+	Selectivity	lobound;		/* Selectivity of a var > something clause */
+	Selectivity	hibound;		/* Selectivity of a var < something clause */
+} RangeQueryClause;
+
+static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
+						   int flag, bool isLTsel, Selectivity s2);
+
+
 /****************************************************************************
 *		ROUTINES TO COMPUTE SELECTIVITIES
 ****************************************************************************/
@@ -55,29 +72,237 @@ restrictlist_selectivity(Query *root,
 *	  must be returned.
 *
 * See clause_selectivity() for the meaning of the varRelid parameter.
+ *
+ * Our basic approach is to take the product of the selectivities of the
+ * subclauses.  However, that's only right if the subclauses have independent
+ * probabilities, and in reality they are often NOT independent.  So,
+ * we want to be smarter where we can.
+
+ * Currently, the only extra smarts we have is to recognize "range queries",
+ * such as "x > 34 AND x < 42".  Clauses are recognized as possible range
+ * query components if they are restriction opclauses whose operators have
+ * scalarltsel() or scalargtsel() as their restriction selectivity estimator.
+ * We pair up clauses of this form that refer to the same variable.  An
+ * unpairable clause of this kind is simply multiplied into the selectivity
+ * product in the normal way.  But when we find a pair, we know that the
+ * selectivities represent the relative positions of the low and high bounds
+ * within the column's range, so instead of figuring the selectivity as
+ * hisel * losel, we can figure it as hisel + losel - 1.  (To visualize this,
+ * see that hisel is the fraction of the range below the high bound, while
+ * losel is the fraction above the low bound; so hisel can be interpreted
+ * directly as a 0..1 value but we need to convert losel to 1-losel before
+ * interpreting it as a value.  Then the available range is 1-losel to hisel.)
+ * If the calculation yields zero or negative, however, we chicken out and
+ * use the default interpretation; that probably means that one or both
+ * selectivities is a default estimate rather than an actual range value.
+ * Of course this is all very dependent on the behavior of
+ * scalarltsel/scalargtsel; perhaps some day we can generalize the approach.
 */
 Selectivity
 clauselist_selectivity(Query *root,
 					   List *clauses,
 					   int varRelid)
 {
-	Selectivity		s1 = 1.0;
-	List		   *clause;
+	Selectivity			s1 = 1.0;
+	RangeQueryClause   *rqlist = NULL;
+	List			   *clist;

-	/* Use the product of the selectivities of the subclauses.
-	 * XXX this is too optimistic, since the subclauses
-	 * are very likely not independent...
+	/*
+	 * Initial scan over clauses.  Anything that doesn't look like a
+	 * potential rangequery clause gets multiplied into s1 and forgotten.
+	 * Anything that does gets inserted into an rqlist entry.
 	 */
-	foreach(clause, clauses)
+	foreach(clist, clauses)
 	{
-		Selectivity	s2 = clause_selectivity(root,
-											(Node *) lfirst(clause),
-											varRelid);
+		Node	   *clause = (Node *) lfirst(clist);
+		Selectivity	s2;
+
+		/*
+		 * See if it looks like a restriction clause with a constant.
+		 * (If it's not a constant we can't really trust the selectivity!)
+		 * NB: for consistency of results, this fragment of code had
+		 * better match what clause_selectivity() would do.
+		 */
+		if (varRelid != 0 || NumRelids(clause) == 1)
+		{
+			int			relidx;
+			AttrNumber	attno;
+			Datum		constval;
+			int			flag;
+
+			get_relattval(clause, varRelid,
+						  &relidx, &attno, &constval, &flag);
+			if (relidx != 0 && (flag & SEL_CONSTANT))
+			{
+				/* if get_relattval succeeded, it must be an opclause */
+				Oid			opno = ((Oper *) ((Expr *) clause)->oper)->opno;
+				RegProcedure oprrest = get_oprrest(opno);
+
+				if (!oprrest)
+					s2 = (Selectivity) 0.5;
+				else
+					s2 = restriction_selectivity(oprrest, opno,
+												 getrelid(relidx,
+														  root->rtable),
+												 attno,
+												 constval, flag);
+				/*
+				 * If we reach here, we have computed the same result
+				 * that clause_selectivity would, so we can just use s2
+				 * if it's the wrong oprrest.  But if it's the right
+				 * oprrest, add the clause to rqlist for later processing.
+				 */
+				switch (oprrest)
+				{
+					case F_SCALARLTSEL:
+						addRangeClause(&rqlist, clause, flag, true, s2);
+						break;
+					case F_SCALARGTSEL:
+						addRangeClause(&rqlist, clause, flag, false, s2);
+						break;
+					default:
+						/* Just merge the selectivity in generically */
+						s1 = s1 * s2;
+						break;
+				}
+				continue; /* drop to loop bottom */
+			}
+		}
+		/* Not the right form, so treat it generically. */
+		s2 = clause_selectivity(root, clause, varRelid);
 		s1 = s1 * s2;
 	}
+
+	/*
+	 * Now scan the rangequery pair list.
+	 */
+	while (rqlist != NULL)
+	{
+		RangeQueryClause   *rqnext;
+
+		if (rqlist->have_lobound && rqlist->have_hibound)
+		{
+			/* Successfully matched a pair of range clauses */
+			Selectivity	s2 = rqlist->hibound + rqlist->lobound - 1.0;
+
+			if (s2 > 0.0)
+			{
+				/* All our hard work has paid off! */
+				s1 *= s2;
+			}
+			else
+			{
+				/* One or both is probably a default estimate,
+				 * so punt and just merge them in generically.
+				 */
+				s1 *= rqlist->hibound * rqlist->lobound;
+			}
+		}
+		else
+		{
+			/* Only found one of a pair, merge it in generically */
+			if (rqlist->have_lobound)
+				s1 *= rqlist->lobound;
+			else
+				s1 *= rqlist->hibound;
+		}
+		/* release storage and advance */
+		rqnext = rqlist->next;
+		pfree(rqlist);
+		rqlist = rqnext;
+	}
+
 	return s1;
 }

+/*
+ * addRangeClause --- add a new range clause for clauselist_selectivity
+ *
+ * Here is where we try to match up pairs of range-query clauses
+ */
+static void
+addRangeClause(RangeQueryClause **rqlist, Node *clause,
+			   int flag, bool isLTsel, Selectivity s2)
+{
+	RangeQueryClause   *rqelem;
+	Node			   *var;
+	bool				is_lobound;
+
+	/* get_relattval sets flag&SEL_RIGHT if the var is on the LEFT. */
+	if (flag & SEL_RIGHT)
+	{
+		var = (Node *) get_leftop((Expr *) clause);
+		is_lobound = ! isLTsel;	/* x < something is high bound */
+	}
+	else
+	{
+		var = (Node *) get_rightop((Expr *) clause);
+		is_lobound = isLTsel;	/* something < x is low bound */
+	}
+
+	for (rqelem = *rqlist; rqelem; rqelem = rqelem->next)
+	{
+		/* We use full equal() here because the "var" might be a function
+		 * of one or more attributes of the same relation...
+		 */
+		if (! equal(var, rqelem->var))
+			continue;
+		/* Found the right group to put this clause in */
+		if (is_lobound)
+		{
+			if (! rqelem->have_lobound)
+			{
+				rqelem->have_lobound = true;
+				rqelem->lobound = s2;
+			}
+			else
+			{
+				/* We have found two similar clauses, such as
+				 * x < y AND x < z.  Keep only the more restrictive one.
+				 */
+				if (rqelem->lobound > s2)
+					rqelem->lobound = s2;
+			}
+		}
+		else
+		{
+			if (! rqelem->have_hibound)
+			{
+				rqelem->have_hibound = true;
+				rqelem->hibound = s2;
+			}
+			else
+			{
+				/* We have found two similar clauses, such as
+				 * x > y AND x > z.  Keep only the more restrictive one.
+				 */
+				if (rqelem->hibound > s2)
+					rqelem->hibound = s2;
+			}
+		}
+		return;
+	}
+
+	/* No matching var found, so make a new clause-pair data structure */
+	rqelem = (RangeQueryClause *) palloc(sizeof(RangeQueryClause));
+	rqelem->var = var;
+	if (is_lobound)
+	{
+		rqelem->have_lobound = true;
+		rqelem->have_hibound = false;
+		rqelem->lobound = s2;
+	}
+	else
+	{
+		rqelem->have_lobound = false;
+		rqelem->have_hibound = true;
+		rqelem->hibound = s2;
+	}
+	rqelem->next = *rqlist;
+	*rqlist = rqelem;
+}
+
+
 /*
 * clause_selectivity -
 *	  Compute the selectivity of a general boolean expression clause.
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -14,7 +14,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.52 2000/01/24 02:12:55 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.53 2000/01/24 07:16:46 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -47,15 +47,13 @@
 /* default selectivity estimate for inequalities such as "A < b" */
 #define DEFAULT_INEQ_SEL  (1.0 / 3.0)

-static bool convert_to_scale(Datum value, Oid typid,
-							 double *scaleval);
 static void getattproperties(Oid relid, AttrNumber attnum,
 							 Oid *typid,
 							 int *typlen,
 							 bool *typbyval,
 							 int32 *typmod);
 static bool getattstatistics(Oid relid, AttrNumber attnum,
-							 Oid opid, Oid typid, int32 typmod,
+							 Oid typid, int32 typmod,
 							 double *nullfrac,
 							 double *commonfrac,
 							 Datum *commonval,
@@ -100,7 +98,7 @@ eqsel(Oid opid,
 						 &typid, &typlen, &typbyval, &typmod);

 		/* get stats for the attribute, if available */
-		if (getattstatistics(relid, attno, opid, typid, typmod,
+		if (getattstatistics(relid, attno, typid, typmod,
 							 &nullfrac, &commonfrac, &commonval,
 							 NULL, NULL))
 		{
@@ -212,19 +210,18 @@ neqsel(Oid opid,
 }

 /*
- *		intltsel		- Selectivity of "<" (also "<=") for integers.
+ *		scalarltsel		- Selectivity of "<" (also "<=") for scalars.
 *
- * Actually, this works and is used for all numeric types, so it should
- * be renamed.  In fact, it is also currently called for all manner of
- * non-numeric types, for which it is NOT very helpful.  That needs
- * to be fixed.
+ * This routine works for any datatype (or pair of datatypes) known to
+ * convert_to_scalar().  If it is applied to some other datatype,
+ * it will return a default estimate.
 */
 float64
-intltsel(Oid opid,
-		 Oid relid,
-		 AttrNumber attno,
-		 Datum value,
-		 int32 flag)
+scalarltsel(Oid opid,
+			Oid relid,
+			AttrNumber attno,
+			Datum value,
+			int32 flag)
 {
 	float64		result;

@@ -253,19 +250,19 @@ intltsel(Oid opid,
 		 */
 		oprtuple = get_operator_tuple(opid);
 		if (! HeapTupleIsValid(oprtuple))
-			elog(ERROR, "intltsel: no tuple for operator %u", opid);
+			elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
 		ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
 		rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;

 		/* Convert the constant to a uniform comparison scale. */
-		if (! convert_to_scale(value,
-							   ((flag & SEL_RIGHT) ? rtype : ltype),
-							   &val))
+		if (! convert_to_scalar(value,
+								((flag & SEL_RIGHT) ? rtype : ltype),
+								&val))
 		{
-			/* Ideally we'd produce an error here, on the grounds that
-			 * the given operator shouldn't have intltsel registered as its
+			/* Ideally we'd produce an error here, on the grounds that the
+			 * given operator shouldn't have scalarltsel registered as its
 			 * selectivity func unless we can deal with its operand types.
-			 * But currently, all manner of stuff is invoking intltsel,
+			 * But currently, all manner of stuff is invoking scalarltsel,
 			 * so give a default estimate until that can be fixed.
 			 */
 			*result = DEFAULT_INEQ_SEL;
@@ -276,7 +273,7 @@ intltsel(Oid opid,
 		getattproperties(relid, attno,
 						 &typid, &typlen, &typbyval, &typmod);

-		if (! getattstatistics(relid, attno, opid, typid, typmod,
+		if (! getattstatistics(relid, attno, typid, typmod,
 							   NULL, NULL, NULL,
 							   &loval, &hival))
 		{
@@ -286,8 +283,8 @@ intltsel(Oid opid,
 		}

 		/* Convert the attribute's loval/hival to common scale. */
-		if (! convert_to_scale(loval, typid, &low) ||
-			! convert_to_scale(hival, typid, &high))
+		if (! convert_to_scalar(loval, typid, &low) ||
+			! convert_to_scalar(hival, typid, &high))
 		{
 			/* See above comments... */
 			if (! typbyval)
@@ -341,23 +338,23 @@ intltsel(Oid opid,
 }

 /*
- *		intgtsel		- Selectivity of ">" (also ">=") for integers.
+ *		scalargtsel		- Selectivity of ">" (also ">=") for integers.
 *
- * See above comments for intltsel.
+ * See above comments for scalarltsel.
 */
 float64
-intgtsel(Oid opid,
-		 Oid relid,
-		 AttrNumber attno,
-		 Datum value,
-		 int32 flag)
+scalargtsel(Oid opid,
+			Oid relid,
+			AttrNumber attno,
+			Datum value,
+			int32 flag)
 {
 	float64		result;

 	/* Compute selectivity of "<", then invert --- but only if we
 	 * were able to produce a non-default estimate.
 	 */
-	result = intltsel(opid, relid, attno, value, flag);
+	result = scalarltsel(opid, relid, attno, value, flag);
 	if (*result != DEFAULT_INEQ_SEL)
 		*result = 1.0 - *result;
 	return result;
@@ -429,14 +426,14 @@ neqjoinsel(Oid opid,
 }

 /*
- *		intltjoinsel	- Join selectivity of "<" and "<="
+ *		scalarltjoinsel	- Join selectivity of "<" and "<=" for scalars
 */
 float64
-intltjoinsel(Oid opid,
-			 Oid relid1,
-			 AttrNumber attno1,
-			 Oid relid2,
-			 AttrNumber attno2)
+scalarltjoinsel(Oid opid,
+				Oid relid1,
+				AttrNumber attno1,
+				Oid relid2,
+				AttrNumber attno2)
 {
 	float64		result;

@@ -446,14 +443,14 @@ intltjoinsel(Oid opid,
 }

 /*
- *		intgtjoinsel	- Join selectivity of ">" and ">="
+ *		scalargtjoinsel	- Join selectivity of ">" and ">=" for scalars
 */
 float64
-intgtjoinsel(Oid opid,
-			 Oid relid1,
-			 AttrNumber attno1,
-			 Oid relid2,
-			 AttrNumber attno2)
+scalargtjoinsel(Oid opid,
+				Oid relid1,
+				AttrNumber attno1,
+				Oid relid2,
+				AttrNumber attno2)
 {
 	float64		result;

@@ -463,21 +460,25 @@ intgtjoinsel(Oid opid,
 }

 /*
- * convert_to_scale
- *	  Convert a given value of the indicated type to the comparison
- *	  scale needed by intltsel().  Returns "true" if successful.
+ * convert_to_scalar
+ *	  Convert a non-NULL value of the indicated type to the comparison
+ *	  scale needed by scalarltsel()/scalargtsel().
+ *	  Returns "true" if successful.
 *
 * All numeric datatypes are simply converted to their equivalent
- * "double" values.
- * Future extension: convert string-like types to some suitable scale.
+ * "double" values.  String datatypes are converted to a crude scale
+ * using their first character (only if it is in the ASCII range,
+ * to try to avoid problems with non-ASCII collating sequences).
 */
-static bool
-convert_to_scale(Datum value, Oid typid,
-				 double *scaleval)
+bool
+convert_to_scalar(Datum value, Oid typid,
+				  double *scaleval)
 {
-	/* Fast-path conversions for some built-in types */
 	switch (typid)
 	{
+		/*
+		 * Built-in numeric types
+		 */
 		case BOOLOID:
 			*scaleval = (double) DatumGetUInt8(value);
 			return true;
@@ -504,18 +505,54 @@ convert_to_scale(Datum value, Oid typid,
 			/* we can treat OIDs as integers... */
 			*scaleval = (double) DatumGetObjectId(value);
 			return true;
-		case TEXTOID:
-			/*
-			 * Eventually this should get handled by somehow scaling as a
-			 * string value.  For now, we need to call it out to avoid
-			 * falling into the default case, because there is a float8(text)
-			 * function declared in pg_proc that will do the wrong thing :-(
-			 */
+
+		/*
+		 * Built-in string types
+		 */
+		case CHAROID:
+		{
+			char		ch = DatumGetChar(value);
+
+			if (ch >= 0 && ch < 127)
+			{
+				*scaleval = (double) ch;
+				return true;
+			}
 			break;
+		}
+		case BPCHAROID:
+		case VARCHAROID:
+		case TEXTOID:
+			if (VARSIZE(DatumGetPointer(value)) > VARHDRSZ)
+			{
+				char	ch = * (char *) VARDATA(DatumGetPointer(value));
+
+				if (ch >= 0 && ch < 127)
+				{
+					*scaleval = (double) ch;
+					return true;
+				}
+			}
+			break;
+		case NAMEOID:
+		{
+			NameData   *nm = (NameData *) DatumGetPointer(value);
+			char		ch = NameStr(*nm)[0];
+
+			if (ch >= 0 && ch < 127)
+			{
+				*scaleval = (double) ch;
+				return true;
+			}
+			break;
+		}
+
 		default:
 		{
-			/* See whether there is a registered type-conversion function,
+			/*
+			 * See whether there is a registered type-conversion function,
 			 * namely a procedure named "float8" with the right signature.
+			 * If so, assume we can convert the value to the numeric scale.
 			 */
 			Oid			oid_array[FUNC_MAX_ARGS];
 			HeapTuple	ftup;
@@ -589,7 +626,9 @@ getattproperties(Oid relid, AttrNumber attnum,
 * after use if the data type is not by-value.)
 */
 static bool
-getattstatistics(Oid relid, AttrNumber attnum, Oid opid, Oid typid,
+getattstatistics(Oid relid,
+				 AttrNumber attnum,
+				 Oid typid,
 				 int32 typmod,
 				 double *nullfrac,
 				 double *commonfrac,
@@ -603,8 +642,15 @@ getattstatistics(Oid relid, AttrNumber attnum, Oid opid, Oid typid,
 	Oid			typelem;
 	bool		isnull;

-	/* We assume that there will only be one entry in pg_statistic
-	 * for the given rel/att.  Someday, VACUUM might store more than one...
+	/*
+	 * We assume that there will only be one entry in pg_statistic for
+	 * the given rel/att, so we search WITHOUT considering the staop
+	 * column.  Someday, VACUUM might store more than one entry per rel/att,
+	 * corresponding to more than one possible sort ordering defined for
+	 * the column type.  However, to make that work we will need to figure
+	 * out which staop to search for --- it's not necessarily the one we
+	 * have at hand!  (For example, we might have a '>' operator rather than
+	 * the '<' operator that will appear in staop.)
 	 */
 	tuple = SearchSysCacheTuple(STATRELID,
 								ObjectIdGetDatum(relid),