Teach pattern_fixed_prefix() about collations.

This is necessary, not optional, now that ILIKE and regexes are collation aware --- else we might derive a wrong comparison constant for index optimized pattern matches.
2025-07-30 11:03:19 +03:00 · 2011-04-11 12:28:28 -04:00
parent dad1f46382
commit 3c381a55b0
3 changed files with 117 additions and 55 deletions
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@ -1181,9 +1181,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
 			return result;
 	}

-	/* divide pattern into fixed prefix and remainder */
+	/*
+	 * Divide pattern into fixed prefix and remainder.  XXX we have to assume
+	 * default collation here, because we don't have access to the actual
+	 * input collation for the operator.  FIXME ...
+	 */
 	patt = (Const *) other;
-	pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
+	pstatus = pattern_fixed_prefix(patt, ptype, DEFAULT_COLLATION_OID,
+								   &prefix, &rest);

 	/*
 	 * If necessary, coerce the prefix constant to the right type. (The "rest"
@ -4755,6 +4760,29 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
 *-------------------------------------------------------------------------
 */

+/*
+ * Check whether char is a letter (and, hence, subject to case-folding)
+ *
+ * In multibyte character sets, we can't use isalpha, and it does not seem
+ * worth trying to convert to wchar_t to use iswalpha.  Instead, just assume
+ * any multibyte char is potentially case-varying.
+ */
+static int
+pattern_char_isalpha(char c, bool is_multibyte,
+					 pg_locale_t locale, bool locale_is_c)
+{
+	if (locale_is_c)
+		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+	else if (is_multibyte && IS_HIGHBIT_SET(c))
+		return true;
+#ifdef HAVE_LOCALE_T
+	else if (locale)
+		return isalpha_l((unsigned char) c, locale);
+#endif
+	else
+		return isalpha((unsigned char) c);
+}
+
 /*
 * Extract the fixed prefix, if any, for a pattern.
 *
@ -4769,7 +4797,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
 */

 static Pattern_Prefix_Status
-like_fixed_prefix(Const *patt_const, bool case_insensitive,
+like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 				  Const **prefix_const, Const **rest_const)
 {
 	char	   *match;
@ -4780,15 +4808,39 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 	int			pos,
 				match_pos;
 	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
+	pg_locale_t	locale = 0;
+	bool		locale_is_c = false;

 	/* the right-hand const is type text or bytea */
 	Assert(typeid == BYTEAOID || typeid == TEXTOID);

-	if (typeid == BYTEAOID && case_insensitive)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+	if (case_insensitive)
+	{
+		if (typeid == BYTEAOID)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 		   errmsg("case insensitive matching not supported on type bytea")));

+		/* If case-insensitive, we need locale info */
+		if (lc_ctype_is_c(collation))
+			locale_is_c = true;
+		else if (collation != DEFAULT_COLLATION_OID)
+		{
+			if (!OidIsValid(collation))
+			{
+				/*
+				 * This typically means that the parser could not resolve a
+				 * conflict of implicit collations, so report it that way.
+				 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INDETERMINATE_COLLATION),
+						 errmsg("could not determine which collation to use for ILIKE"),
+						 errhint("Use the COLLATE clause to set the collation explicitly.")));
+			}
+			locale = pg_newlocale_from_collation(collation);
+		}
+	}
+
 	if (typeid != BYTEAOID)
 	{
 		patt = TextDatumGetCString(patt_const->constvalue);
@ -4822,23 +4874,11 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 				break;
 		}

-		/*
-		 * XXX In multibyte character sets, we can't trust isalpha, so assume
-		 * any multibyte char is potentially case-varying.
-		 */
-		if (case_insensitive)
-		{
-			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
-				break;
-			if (isalpha((unsigned char) patt[pos]))
-				break;
-		}
+		/* Stop if case-varying character (it's sort of a wildcard) */
+		if (case_insensitive &&
+			pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
+			break;

-		/*
-		 * NOTE: this code used to think that %% meant a literal %, but
-		 * textlike() itself does not think that, and the SQL92 spec doesn't
-		 * say any such thing either.
-		 */
 		match[match_pos++] = patt[pos];
 	}

@ -4870,7 +4910,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 }

 static Pattern_Prefix_Status
-regex_fixed_prefix(Const *patt_const, bool case_insensitive,
+regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 				   Const **prefix_const, Const **rest_const)
 {
 	char	   *match;
@ -4883,6 +4923,8 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 	char	   *rest;
 	Oid			typeid = patt_const->consttype;
 	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
+	pg_locale_t	locale = 0;
+	bool		locale_is_c = false;

 	/*
 	 * Should be unnecessary, there are no bytea regex operators defined. As
@ -4894,6 +4936,28 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 		 errmsg("regular-expression matching not supported on type bytea")));

+	if (case_insensitive)
+	{
+		/* If case-insensitive, we need locale info */
+		if (lc_ctype_is_c(collation))
+			locale_is_c = true;
+		else if (collation != DEFAULT_COLLATION_OID)
+		{
+			if (!OidIsValid(collation))
+			{
+				/*
+				 * This typically means that the parser could not resolve a
+				 * conflict of implicit collations, so report it that way.
+				 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INDETERMINATE_COLLATION),
+						 errmsg("could not determine which collation to use for regular expression"),
+						 errhint("Use the COLLATE clause to set the collation explicitly.")));
+			}
+			locale = pg_newlocale_from_collation(collation);
+		}
+	}
+
 	/* the right-hand const is type text for all of these */
 	patt = TextDatumGetCString(patt_const->constvalue);

@ -4969,17 +5033,10 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 			patt[pos] == '$')
 			break;

-		/*
-		 * XXX In multibyte character sets, we can't trust isalpha, so assume
-		 * any multibyte char is potentially case-varying.
-		 */
-		if (case_insensitive)
-		{
-			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
-				break;
-			if (isalpha((unsigned char) patt[pos]))
-				break;
-		}
+		/* Stop if case-varying character (it's sort of a wildcard) */
+		if (case_insensitive &&
+			pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
+			break;

 		/*
 		 * Check for quantifiers.  Except for +, this means the preceding
@ -5004,7 +5061,7 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 		 * backslash followed by alphanumeric is an escape, not a quoted
 		 * character.  Must treat it as having multiple possible matches.
 		 * Note: since only ASCII alphanumerics are escapes, we don't have to
-		 * be paranoid about multibyte here.
+		 * be paranoid about multibyte or collations here.
 		 */
 		if (patt[pos] == '\\')
 		{
@ -5056,7 +5113,7 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 }

 Pattern_Prefix_Status
-pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
+pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
 					 Const **prefix, Const **rest)
 {
 	Pattern_Prefix_Status result;
@ -5064,16 +5121,16 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
 	switch (ptype)
 	{
 		case Pattern_Type_Like:
-			result = like_fixed_prefix(patt, false, prefix, rest);
+			result = like_fixed_prefix(patt, false, collation, prefix, rest);
 			break;
 		case Pattern_Type_Like_IC:
-			result = like_fixed_prefix(patt, true, prefix, rest);
+			result = like_fixed_prefix(patt, true, collation, prefix, rest);
 			break;
 		case Pattern_Type_Regex:
-			result = regex_fixed_prefix(patt, false, prefix, rest);
+			result = regex_fixed_prefix(patt, false, collation, prefix, rest);
 			break;
 		case Pattern_Type_Regex_IC:
-			result = regex_fixed_prefix(patt, true, prefix, rest);
+			result = regex_fixed_prefix(patt, true, collation, prefix, rest);
 			break;
 		default:
 			elog(ERROR, "unrecognized ptype: %d", (int) ptype);