Rewrite LIKE's %-followed-by-_ optimization so it really works (this time for sure ;-)).

It now also optimizes more cases, such as %_%_. Improve comments too. Per bug #5478. In passing, also rename the TCHAR macro to GETCHAR, because pgindent is messing with the formatting of the former (apparently it now thinks TCHAR is a typedef name). Back-patch to 8.3, where the bug was introduced.
2025-08-18 12:22:09 +03:00 · 2010-05-28 17:35:36 +00:00
parent caea7b7aca
commit 63c0780dba
3 changed files with 84 additions and 78 deletions
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -1,25 +1,25 @@
 /*-------------------------------------------------------------------------
 *
 * like_match.c
- *	  like expression handling internal code.
+ *	  LIKE pattern matching internal code.
 *
- * This file is included by like.c four times, to provide natching code for
+ * This file is included by like.c four times, to provide matching code for
- * single-byte encodings, UTF8, and for other multi-byte encodings,
+ * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
- * and case insensitive matches for single byte encodings.
+ * and (4) case insensitive matches in single byte encodings.
- * UTF8 is a special case because we can use a much more efficient version
+ * (UTF8 is a special case because we can use a much more efficient version
- * of NextChar than can be used for other multi-byte encodings.
+ * of NextChar than can be used for general multi-byte encodings.)
 *
 * Before the inclusion, we need to define the following macros:
 *
 * NextChar
 * MatchText - to name of function wanted
 * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
- * MATCH_LOWER - define iff using to_lower on text chars
+ * MATCH_LOWER - define for case (4), using to_lower on single-byte chars
 *
 * Copyright (c) 1996-2008, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.20.2.3 2009/05/24 18:10:47 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.20.2.4 2010/05/28 17:35:36 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -70,9 +70,9 @@
 */
 #ifdef MATCH_LOWER
-#define TCHAR(t) ((char) tolower((unsigned char) (t)))
+#define GETCHAR(t) ((char) tolower((unsigned char) (t)))
 #else
-#define TCHAR(t) (t)
+#define GETCHAR(t) (t)
 #endif
 static int
@@ -94,78 +94,74 @@ MatchText(char *t, int tlen, char *p, int plen)
 	{
 		if (*p == '\\')
 		{
-			/* Next byte must match literally, whatever it is */
+			/* Next pattern byte must match literally, whatever it is */
 			NextByte(p, plen);
-			if ((plen <= 0) || TCHAR(*p) != TCHAR(*t))
+			if (plen <= 0 || GETCHAR(*p) != GETCHAR(*t))
 				return LIKE_FALSE;
 		}
 		else if (*p == '%')
 		{
-			/*
+			char		firstpat;
 			 * % processing is essentially a search for a match for what
 			 * follows the %, plus a recursive match of the remainder. We
 			 * succeed if and only if both conditions are met.
 			 */
-			/* %% is the same as % according to the SQL standard */
+			/*
-			/* Advance past all %'s */
+			 * % processing is essentially a search for a text position at
-			while (plen > 0 && *p == '%')
+			 * which the remainder of the text matches the remainder of the
 			 * pattern, using a recursive call to check each potential match.
 			 *
 			 * If there are wildcards immediately following the %, we can skip
 			 * over them first, using the idea that any sequence of N _'s and
 			 * one or more %'s is equivalent to N _'s and one % (ie, it will
 			 * match any sequence of at least N text characters).  In this
 			 * way we will always run the recursive search loop using a
 			 * pattern fragment that begins with a literal character-to-match,
 			 * thereby not recursing more than we have to.
 			 */
 			NextByte(p, plen);
-			/* Trailing percent matches everything. */
+
 			while (plen > 0)
 			{
 				if (*p == '%')
 					NextByte(p, plen);
 				else if (*p == '_')
 				{
 					/* If not enough text left to match the pattern, ABORT */
 					if (tlen <= 0)
 						return LIKE_ABORT;
 					NextChar(t, tlen);
 					NextByte(p, plen);
 				}
 				else
 					break;		/* Reached a non-wildcard pattern char */
 			}
 			/*
 			 * If we're at end of pattern, match: we have a trailing % which
 			 * matches any remaining text string.
 			 */
 			if (plen <= 0)
 				return LIKE_TRUE;
 			/*
 			 * Otherwise, scan for a text position at which we can match the
-			 * rest of the pattern.
+			 * rest of the pattern.  The first remaining pattern char is known
 			 * to be a regular or escaped literal character, so we can compare
 			 * the first pattern byte to each text byte to avoid recursing
 			 * more than we have to.  This fact also guarantees that we don't
 			 * have to consider a match to the zero-length substring at the
 			 * end of the text.
 			 */
 			if (*p == '_')
 			{
 				/* %_ is the same as _% - avoid matching _ repeatedly */
 				do
 				{
 					NextChar(t, tlen);
 					NextByte(p, plen);
 				} while (tlen > 0 && plen > 0 && *p == '_');
 				/*
 				 * If we are at the end of the pattern, succeed: % followed
 				 * by n _'s matches any string of at least n characters, and
 				 * we have now found there are at least n characters.
 				 */
 				if (plen <= 0)
 					return LIKE_TRUE;
 				/* Look for a place that matches the rest of the pattern */
 				while (tlen > 0)
 				{
 					int			matched = MatchText(t, tlen, p, plen);
 					if (matched != LIKE_FALSE)
 						return matched; /* TRUE or ABORT */
 					NextChar(t, tlen);
 				}
 			}
 			else
 			{
 				char		firstpat = TCHAR(*p);
 			if (*p == '\\')
 			{
 				if (plen < 2)
-						return LIKE_FALSE;
+					return LIKE_FALSE; /* XXX should throw error */
-					firstpat = TCHAR(p[1]);
+				firstpat = GETCHAR(p[1]);
 			}
 			else
 				firstpat = GETCHAR(*p);
 			while (tlen > 0)
 			{
-					/*
+				if (GETCHAR(*t) == firstpat)
 					 * Optimization to prevent most recursion: don't recurse
 					 * unless first pattern byte matches first text byte.
 					 */
 					if (TCHAR(*t) == firstpat)
 				{
 					int			matched = MatchText(t, tlen, p, plen);
@@ -175,7 +171,6 @@ MatchText(char *t, int tlen, char *p, int plen)
 				NextChar(t, tlen);
 			}
 			}
 			/*
 			 * End of text with no match, so no point in trying later places
@@ -190,7 +185,7 @@ MatchText(char *t, int tlen, char *p, int plen)
 			NextByte(p, plen);
 			continue;
 		}
-		else if (TCHAR(*p) != TCHAR(*t))
+		else if (GETCHAR(*p) != GETCHAR(*t))
 		{
 			/* non-wildcard pattern char fails to match text char */
 			return LIKE_FALSE;
@@ -215,10 +210,12 @@ MatchText(char *t, int tlen, char *p, int plen)
 	if (tlen > 0)
 		return LIKE_FALSE;		/* end of pattern, but not of text */
-	/* End of text string.  Do we have matching pattern remaining? */
+	/*
-	while (plen > 0 && *p == '%')	/* allow multiple %'s at end of pattern */
+	 * End of text, but perhaps not of pattern.  Match iff the remaining
 	 * pattern can match a zero-length string, ie, it's zero or more %'s.
 	 */
 	while (plen > 0 && *p == '%')
 		NextByte(p, plen);
 	if (plen <= 0)
 		return LIKE_TRUE;
@@ -342,8 +339,9 @@ do_like_escape(text *pat, text *esc)
 #undef do_like_escape
 #endif
-#undef TCHAR
+#undef GETCHAR
 #ifdef MATCH_LOWER
 #undef MATCH_LOWER
 #endif
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -827,7 +827,7 @@ SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
 (1 row)
 --
-- test %/_ combination cases, cf bug #4821
+-- test %/_ combination cases, cf bugs #4821 and #5478
 --
 SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
 t | t | f 
@@ -853,6 +853,12 @@ SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
 t | t | f
 (1 row)
 SELECT 'jack' LIKE '%____%' AS t;
 t 
 ---
 t
 (1 row)
 --
 -- test implicit type conversion
 --
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -253,7 +253,7 @@ SELECT 'Hawkeye' ILIKE 'h%' AS "true";
 SELECT 'Hawkeye' NOT ILIKE 'h%' AS "false";
 --
-- test %/_ combination cases, cf bug #4821
+-- test %/_ combination cases, cf bugs #4821 and #5478
 --
 SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f;
@@ -262,6 +262,8 @@ SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f;
 SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f;
 SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f;
 SELECT 'jack' LIKE '%____%' AS t;
 --
 -- test implicit type conversion