Support LIKE with nondeterministic collations

This allows, for example, using LIKE with case-insensitive collations. There was previously no internal implementation of this, so it was met with a not-supported error. This adds the internal implementation and removes the error. The implementation follows the specification of the SQL standard for this. Unlike with deterministic collations, the LIKE matching cannot go character by character but has to go substring by substring. For example, if we are matching against LIKE 'foo%bar', we can't start by looking for an 'f', then an 'o'; instead we have to find something that matches 'foo'. This is because the collation could consider substrings of different lengths to be equal. This is all internal to MatchText() in like_match.c. The changes in GenericMatchText() in like.c just pass through the locale information to MatchText(), which was previously not needed. This matches exactly Generic_Text_IC_like() below. ILIKE is not affected. (It's unclear whether ILIKE makes sense under nondeterministic collations.) This also updates match_pattern_prefix() in like_support.c to support optimizing the case of an exact pattern with nondeterministic collations. This was already alluded to in the previous code. (includes documentation examples from Daniel Vérité and test cases from Paul A Jungwirth) Reviewed-by: Jian He <jian.universality@gmail.com> Discussion: https://www.postgresql.org/message-id/flat/700d2e86-bf75-4607-9cf2-f5b7802f6e88@eisentraut.org
2025-11-19 13:42:17 +03:00 · 2024-11-27 08:18:35 +01:00
parent 8fcd80258b
commit 85b7efa1cd
7 changed files with 458 additions and 44 deletions
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -147,22 +147,28 @@ SB_lower_char(unsigned char c, pg_locale_t locale)
 static inline int
 GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation)
 {
-	if (collation)
-	{
-		pg_locale_t locale = pg_newlocale_from_collation(collation);
+	pg_locale_t locale;

-		if (!locale->deterministic)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("nondeterministic collations are not supported for LIKE")));
+	if (!OidIsValid(collation))
+	{
+		/*
+		 * This typically means that the parser could not resolve a conflict
+		 * of implicit collations, so report it that way.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for LIKE"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
 	}

+	locale = pg_newlocale_from_collation(collation);
+
 	if (pg_database_encoding_max_length() == 1)
-		return SB_MatchText(s, slen, p, plen, 0);
+		return SB_MatchText(s, slen, p, plen, locale);
 	else if (GetDatabaseEncoding() == PG_UTF8)
-		return UTF8_MatchText(s, slen, p, plen, 0);
+		return UTF8_MatchText(s, slen, p, plen, locale);
 	else
-		return MB_MatchText(s, slen, p, plen, 0);
+		return MB_MatchText(s, slen, p, plen, locale);
 }

 static inline int
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -157,7 +157,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			 * the first pattern byte to each text byte to avoid recursing
 			 * more than we have to.  This fact also guarantees that we don't
 			 * have to consider a match to the zero-length substring at the
-			 * end of the text.
+			 * end of the text.  With a nondeterministic collation, we can't
+			 * rely on the first bytes being equal, so we have to recurse in
+			 * any case.
 			 */
 			if (*p == '\\')
 			{
@@ -172,7 +174,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)

 			while (tlen > 0)
 			{
-				if (GETCHAR(*t, locale) == firstpat)
+				if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
 				{
 					int			matched = MatchText(t, tlen, p, plen, locale);

@@ -196,6 +198,149 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			NextByte(p, plen);
 			continue;
 		}
+		else if (locale && !locale->deterministic)
+		{
+			/*
+			 * For nondeterministic locales, we find the next substring of the
+			 * pattern that does not contain wildcards and try to find a
+			 * matching substring in the text.  Crucially, we cannot do this
+			 * character by character, as in the normal case, but must do it
+			 * substring by substring, partitioned by the wildcard characters.
+			 * (This is per SQL standard.)
+			 */
+			const char *p1;
+			size_t		p1len;
+			const char *t1;
+			size_t		t1len;
+			bool		found_escape;
+			const char *subpat;
+			size_t		subpatlen;
+			char	   *buf = NULL;
+
+			/*
+			 * Determine next substring of pattern without wildcards.  p is
+			 * the start of the subpattern, p1 is one past the last byte. Also
+			 * track if we found an escape character.
+			 */
+			p1 = p;
+			p1len = plen;
+			found_escape = false;
+			while (p1len > 0)
+			{
+				if (*p1 == '\\')
+				{
+					found_escape = true;
+					NextByte(p1, p1len);
+					if (p1len == 0)
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+								 errmsg("LIKE pattern must not end with escape character")));
+				}
+				else if (*p1 == '_' || *p1 == '%')
+					break;
+				NextByte(p1, p1len);
+			}
+
+			/*
+			 * If we found an escape character, then make an unescaped copy of
+			 * the subpattern.
+			 */
+			if (found_escape)
+			{
+				char	   *b;
+
+				b = buf = palloc(p1 - p);
+				for (const char *c = p; c < p1; c++)
+				{
+					if (*c == '\\')
+						;
+					else
+						*(b++) = *c;
+				}
+
+				subpat = buf;
+				subpatlen = b - buf;
+			}
+			else
+			{
+				subpat = p;
+				subpatlen = p1 - p;
+			}
+
+			/*
+			 * Shortcut: If this is the end of the pattern, then the rest of
+			 * the text has to match the rest of the pattern.
+			 */
+			if (p1len == 0)
+			{
+				int			cmp;
+
+				cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
+
+				if (buf)
+					pfree(buf);
+				if (cmp == 0)
+					return LIKE_TRUE;
+				else
+					return LIKE_FALSE;
+			}
+
+			/*
+			 * Now build a substring of the text and try to match it against
+			 * the subpattern.  t is the start of the text, t1 is one past the
+			 * last byte.  We start with a zero-length string.
+			 */
+			t1 = t;
+			t1len = tlen;
+			for (;;)
+			{
+				int			cmp;
+
+				CHECK_FOR_INTERRUPTS();
+
+				cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
+
+				/*
+				 * If we found a match, we have to test if the rest of the
+				 * pattern can match against the rest of the string.  Otherwise
+				 * we have to continue here and try matching with a longer
+				 * substring.  (This is similar to the recursion for the '%'
+				 * wildcard above.)
+				 *
+				 * Note that we can't just wind forward p and t and continue
+				 * with the main loop.  This would fail for example with
+				 *
+				 * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
+				 *
+				 * You'd find that t=\0061 matches p=\00E4, but then the rest
+				 * won't match; but t=\0061\0308 also matches p=\00E4, and
+				 * then the rest will match.
+				 */
+				if (cmp == 0)
+				{
+					int			matched = MatchText(t1, t1len, p1, p1len, locale);
+
+					if (matched == LIKE_TRUE)
+					{
+						if (buf)
+							pfree(buf);
+						return matched;
+					}
+				}
+
+				/*
+				 * Didn't match.  If we used up the whole text, then the match
+				 * fails.  Otherwise, try again with a longer substring.
+				 */
+				if (t1len == 0)
+					return LIKE_FALSE;
+				else
+					NextChar(t1, t1len);
+			}
+			if (buf)
+				pfree(buf);
+			continue;
+		}
 		else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
 		{
 			/* non-wildcard pattern char fails to match text char */
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -272,22 +272,6 @@ match_pattern_prefix(Node *leftop,
 		return NIL;
 	patt = (Const *) rightop;

-	/*
-	 * Not supported if the expression collation is nondeterministic.  The
-	 * optimized equality or prefix tests use bytewise comparisons, which is
-	 * not consistent with nondeterministic collations.  The actual
-	 * pattern-matching implementation functions will later error out that
-	 * pattern-matching is not supported with nondeterministic collations. (We
-	 * could also error out here, but by doing it later we get more precise
-	 * error messages.)  (It should be possible to support at least
-	 * Pattern_Prefix_Exact, but no point as long as the actual
-	 * pattern-matching implementations don't support it.)
-	 *
-	 * expr_coll is not set for a non-collation-aware data type such as bytea.
-	 */
-	if (expr_coll && !get_collation_isdeterministic(expr_coll))
-		return NIL;
-
 	/*
 	 * Try to extract a fixed prefix from the pattern.
 	 */
@@ -404,6 +388,8 @@ match_pattern_prefix(Node *leftop,
 	{
 		if (!op_in_opfamily(eqopr, opfamily))
 			return NIL;
+		if (indexcollation != expr_coll)
+			return NIL;
 		expr = make_opclause(eqopr, BOOLOID, false,
 							 (Expr *) leftop, (Expr *) prefix,
 							 InvalidOid, indexcollation);
@@ -411,6 +397,17 @@ match_pattern_prefix(Node *leftop,
 		return result;
 	}

+	/*
+	 * Anything other than Pattern_Prefix_Exact is not supported if the
+	 * expression collation is nondeterministic.  The optimized equality or
+	 * prefix tests use bytewise comparisons, which is not consistent with
+	 * nondeterministic collations.
+	 *
+	 * expr_coll is not set for a non-collation-aware data type such as bytea.
+	 */
+	if (expr_coll && !get_collation_isdeterministic(expr_coll))
+		return NIL;
+
 	/*
 	 * Otherwise, we have a nonempty required prefix of the values.  Some
 	 * opclasses support prefix checks directly, otherwise we'll try to