Make pg_statistic and related code account more honestly for collations.

When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2025-07-02 09:02:37 +03:00 · 2018-12-14 12:52:49 -05:00
parent 8fb569e978
commit 5e09280057
18 changed files with 189 additions and 90 deletions
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@ -87,11 +87,12 @@
 * For both oprrest and oprjoin functions, the operator's input collation OID
 * (if any) is passed using the standard fmgr mechanism, so that the estimator
 * function can fetch it with PG_GET_COLLATION().  Note, however, that all
- * statistics in pg_statistic are currently built using the database's default
+ * statistics in pg_statistic are currently built using the relevant column's
 * collation.  Thus, in most cases where we are looking at statistics, we
- * should ignore the actual operator collation and use DEFAULT_COLLATION_OID.
+ * should ignore the operator collation and use the stats entry's collation.
 * We expect that the error induced by doing this is usually not large enough
- * to justify complicating matters.
+ * to justify complicating matters.  In any case, doing otherwise would yield
+ * entirely garbage results for ordered stats data such as histograms.
 *----------
 */

@ -181,7 +182,8 @@ static double eqjoinsel_semi(Oid opfuncoid,
 			   RelOptInfo *inner_rel);
 static bool estimate_multivariate_ndistinct(PlannerInfo *root,
 								RelOptInfo *rel, List **varinfos, double *ndistinct);
-static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
+static bool convert_to_scalar(Datum value, Oid valuetypid, Oid collid,
+				  double *scaledvalue,
 				  Datum lobound, Datum hibound, Oid boundstypid,
 				  double *scaledlobound, double *scaledhibound);
 static double convert_numeric_to_scalar(Datum value, Oid typid, bool *failure);
@ -201,7 +203,8 @@ static double convert_one_string_to_scalar(char *value,
 							 int rangelo, int rangehi);
 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
 							int rangelo, int rangehi);
-static char *convert_string_datum(Datum value, Oid typid, bool *failure);
+static char *convert_string_datum(Datum value, Oid typid, Oid collid,
+					 bool *failure);
 static double convert_timevalue_to_scalar(Datum value, Oid typid,
 							bool *failure);
 static void examine_simple_variable(PlannerInfo *root, Var *var,
@ -370,12 +373,12 @@ var_eq_const(VariableStatData *vardata, Oid operator,
 				/* be careful to apply operator right way 'round */
 				if (varonleft)
 					match = DatumGetBool(FunctionCall2Coll(&eqproc,
-														   DEFAULT_COLLATION_OID,
+														   sslot.stacoll,
 														   sslot.values[i],
 														   constval));
 				else
 					match = DatumGetBool(FunctionCall2Coll(&eqproc,
-														   DEFAULT_COLLATION_OID,
+														   sslot.stacoll,
 														   constval,
 														   sslot.values[i]));
 				if (match)
@ -666,11 +669,11 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 		{
 			if (varonleft ?
 				DatumGetBool(FunctionCall2Coll(opproc,
-											   DEFAULT_COLLATION_OID,
+											   sslot.stacoll,
 											   sslot.values[i],
 											   constval)) :
 				DatumGetBool(FunctionCall2Coll(opproc,
-											   DEFAULT_COLLATION_OID,
+											   sslot.stacoll,
 											   constval,
 											   sslot.values[i])))
 				mcv_selec += sslot.numbers[i];
@ -744,11 +747,11 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 			{
 				if (varonleft ?
 					DatumGetBool(FunctionCall2Coll(opproc,
-												   DEFAULT_COLLATION_OID,
+												   sslot.stacoll,
 												   sslot.values[i],
 												   constval)) :
 					DatumGetBool(FunctionCall2Coll(opproc,
-												   DEFAULT_COLLATION_OID,
+												   sslot.stacoll,
 												   constval,
 												   sslot.values[i])))
 					nmatch++;
@ -873,7 +876,7 @@ ineq_histogram_selectivity(PlannerInfo *root,
 														 &sslot.values[probe]);

 				ltcmp = DatumGetBool(FunctionCall2Coll(opproc,
-													   DEFAULT_COLLATION_OID,
+													   sslot.stacoll,
 													   sslot.values[probe],
 													   constval));
 				if (isgt)
@ -958,7 +961,8 @@ ineq_histogram_selectivity(PlannerInfo *root,
 				 * values to a uniform comparison scale, and do a linear
 				 * interpolation within this bin.
 				 */
-				if (convert_to_scalar(constval, consttype, &val,
+				if (convert_to_scalar(constval, consttype, sslot.stacoll,
+									  &val,
 									  sslot.values[i - 1], sslot.values[i],
 									  vardata->vartype,
 									  &low, &high))
@ -2499,7 +2503,7 @@ eqjoinsel_inner(Oid opfuncoid,
 				if (hasmatch2[j])
 					continue;
 				if (DatumGetBool(FunctionCall2Coll(&eqproc,
-												   DEFAULT_COLLATION_OID,
+												   sslot1->stacoll,
 												   sslot1->values[i],
 												   sslot2->values[j])))
 				{
@ -2711,7 +2715,7 @@ eqjoinsel_semi(Oid opfuncoid,
 				if (hasmatch2[j])
 					continue;
 				if (DatumGetBool(FunctionCall2Coll(&eqproc,
-												   DEFAULT_COLLATION_OID,
+												   sslot1->stacoll,
 												   sslot1->values[i],
 												   sslot2->values[j])))
 				{
@ -4066,7 +4070,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
 * converted to measurements expressed in seconds.
 */
 static bool
-convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
+convert_to_scalar(Datum value, Oid valuetypid, Oid collid, double *scaledvalue,
 				  Datum lobound, Datum hibound, Oid boundstypid,
 				  double *scaledlobound, double *scaledhibound)
 {
@ -4131,11 +4135,11 @@ convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 		case NAMEOID:
 			{
 				char	   *valstr = convert_string_datum(value, valuetypid,
-														  &failure);
+														  collid, &failure);
 				char	   *lostr = convert_string_datum(lobound, boundstypid,
-														 &failure);
+														 collid, &failure);
 				char	   *histr = convert_string_datum(hibound, boundstypid,
-														 &failure);
+														 collid, &failure);

 				/*
 				 * Bail out if any of the values is not of string type.  We
@ -4404,7 +4408,7 @@ convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
 * before continuing, so as to generate correct locale-specific results.
 */
 static char *
-convert_string_datum(Datum value, Oid typid, bool *failure)
+convert_string_datum(Datum value, Oid typid, Oid collid, bool *failure)
 {
 	char	   *val;

@ -4432,7 +4436,7 @@ convert_string_datum(Datum value, Oid typid, bool *failure)
 			return NULL;
 	}

-	if (!lc_collate_is_c(DEFAULT_COLLATION_OID))
+	if (!lc_collate_is_c(collid))
 	{
 		char	   *xfrmstr;
 		size_t		xfrmlen;
@ -5407,14 +5411,14 @@ get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop,
 				continue;
 			}
 			if (DatumGetBool(FunctionCall2Coll(&opproc,
-											   DEFAULT_COLLATION_OID,
+											   sslot.stacoll,
 											   sslot.values[i], tmin)))
 			{
 				tmin = sslot.values[i];
 				tmin_is_mcv = true;
 			}
 			if (DatumGetBool(FunctionCall2Coll(&opproc,
-											   DEFAULT_COLLATION_OID,
+											   sslot.stacoll,
 											   tmax, sslot.values[i])))
 			{
 				tmax = sslot.values[i];
@ -6014,6 +6018,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
 	Selectivity prefixsel;
 	Oid			cmpopr;
 	FmgrInfo	opproc;
+	AttStatsSlot sslot;
 	Const	   *greaterstrcon;
 	Selectivity eq_sel;

@ -6036,16 +6041,23 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,

 	/*-------
 	 * If we can create a string larger than the prefix, say
-	 *	"x < greaterstr".
+	 * "x < greaterstr".  We try to generate the string referencing the
+	 * collation of the var's statistics, but if that's not available,
+	 * use DEFAULT_COLLATION_OID.
 	 *-------
 	 */
+	if (HeapTupleIsValid(vardata->statsTuple) &&
+		get_attstatsslot(&sslot, vardata->statsTuple,
+						 STATISTIC_KIND_HISTOGRAM, InvalidOid, 0))
+		 /* sslot.stacoll is set up */ ;
+	else
+		sslot.stacoll = DEFAULT_COLLATION_OID;
 	cmpopr = get_opfamily_member(opfamily, vartype, vartype,
 								 BTLessStrategyNumber);
 	if (cmpopr == InvalidOid)
 		elog(ERROR, "no < operator for opfamily %u", opfamily);
 	fmgr_info(get_opcode(cmpopr), &opproc);
-	greaterstrcon = make_greater_string(prefixcon, &opproc,
-										DEFAULT_COLLATION_OID);
+	greaterstrcon = make_greater_string(prefixcon, &opproc, sslot.stacoll);
 	if (greaterstrcon)
 	{
 		Selectivity topsel;