Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the form '^(foo)$'. Before, these could never be optimized into indexscans. The recent changes to make psql and pg_dump generate such patterns (for \d commands and -t and related switches, respectively) therefore represented a big performance hit for people with large pg_class catalogs, as seen in a recent gripe from Erik Jones. While at it, be more paranoid about case-sensitivity checking in multibyte encodings, and fix some other corner cases in which a regex might be interpreted too liberally.
2025-11-26 23:43:30 +03:00 · 2007-01-03 22:40:04 +00:00
parent d3db2bd80c
commit 0b29676aa6
1 changed file with 94 additions and 56 deletions
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.119.2.9 2006/05/21 20:07:11 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.119.2.10 2007/01/03 22:40:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -2735,7 +2735,10 @@ get_join_vars(List *args, Var **var1, Var **var2)
 * These routines support analysis of LIKE and regular-expression patterns
 * by the planner/optimizer.  It's important that they agree with the
 * regular-expression code in backend/regex/ and the LIKE code in
- * backend/utils/adt/like.c.
+ * backend/utils/adt/like.c.  Also, the computation of the fixed prefix
 * must be conservative: if we report a string longer than the true fixed
 * prefix, the query may produce actually wrong answers, rather than just
 * getting a bad selectivity estimate!
 *
 * Note that the prefix-analysis functions are called from
 * backend/optimizer/path/indxpath.c as well as from routines in this file.
@@ -2764,6 +2767,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 	Oid			typeid = patt_const->consttype;
 	int			pos,
 				match_pos;
 	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
 	/* the right-hand const is type text or bytea */
 	Assert(typeid == BYTEAOID || typeid == TEXTOID);
@@ -2811,11 +2815,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
 		}
 		/*
-		 * XXX I suspect isalpha() is not an adequately locale-sensitive
+		 * XXX In multibyte character sets, we can't trust isalpha, so assume
-		 * test for characters that can vary under case folding?
+		 * any multibyte char is potentially case-varying.
 		 */
-		if (case_insensitive && isalpha((unsigned char) patt[pos]))
+		if (case_insensitive)
-			break;
+		{
 			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
 				break;
 			if (isalpha((unsigned char) patt[pos]))
 				break;
 		}
 		/*
 		 * NOTE: this code used to think that %% meant a literal %, but
@@ -2861,11 +2870,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 	char	   *match;
 	int			pos,
 				match_pos,
-				paren_depth;
+				prev_pos,
 				prev_match_pos;
 	bool		have_leading_paren;
 	char	   *patt;
 	char	   *prefix;
 	char	   *rest;
 	Oid			typeid = patt_const->consttype;
 	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
 	/*
 	 * Should be unnecessary, there are no bytea regex operators defined.
@@ -2879,7 +2890,25 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 	patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
 	/* Pattern must be anchored left */
-	if (patt[0] != '^')
+	pos = 0;
 	if (patt[pos] != '^')
 	{
 		rest = patt;
 		*prefix_const = NULL;
 		*rest_const = string_to_const(rest, typeid);
 		return Pattern_Prefix_None;
 	}
 	pos++;
 	/*
 	 * If '|' is present in pattern, then there may be multiple alternatives
 	 * for the start of the string.  (There are cases where this isn't so,
 	 * for instance if the '|' is inside parens, but detecting that reliably
 	 * is too hard.)
 	 */
 	if (strchr(patt + pos, '|') != NULL)
 	{
 		rest = patt;
@@ -2889,103 +2918,112 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
 		return Pattern_Prefix_None;
 	}
 	/* OK, allocate space for pattern */
 	match = palloc(strlen(patt) + 1);
 	prev_match_pos = match_pos = 0;
 	/*
-	 * If unquoted | is present at paren level 0 in pattern, then there
+	 * We special-case the syntax '^(...)$' because psql uses it.  But beware:
-	 * are multiple alternatives for the start of the string.
+	 * sequences beginning "(?" are not what they seem.
 	 */
-	paren_depth = 0;
+	have_leading_paren = false;
-	for (pos = 1; patt[pos]; pos++)
+	if (patt[pos] == '(' && patt[pos + 1] != '?')
 	{
-		if (patt[pos] == '|' && paren_depth == 0)
+		have_leading_paren = true;
-		{
+		pos++;
 			rest = patt;
 			*prefix_const = NULL;
 			*rest_const = string_to_const(rest, typeid);
 			return Pattern_Prefix_None;
 		}
 		else if (patt[pos] == '(')
 			paren_depth++;
 		else if (patt[pos] == ')' && paren_depth > 0)
 			paren_depth--;
 		else if (patt[pos] == '\\')
 		{
 			/* backslash quotes the next character */
 			pos++;
 			if (patt[pos] == '\0')
 				break;
 		}
 	}
-	/* OK, allocate space for pattern */
+	/* Scan remainder of pattern */
-	prefix = match = palloc(strlen(patt) + 1);
+	prev_pos = pos;
-	match_pos = 0;
+	while (patt[pos])
 	/* note start at pos 1 to skip leading ^ */
 	for (pos = 1; patt[pos]; pos++)
 	{
 		int			len;
 		/*
-		 * Check for characters that indicate multiple possible matches
+		 * Check for characters that indicate multiple possible matches here.
-		 * here. XXX I suspect isalpha() is not an adequately
+		 * Also, drop out at ')' or '$' so the termination test works right.
 		 * locale-sensitive test for characters that can vary under case
 		 * folding?
 		 */
 		if (patt[pos] == '.' ||
 			patt[pos] == '(' ||
 			patt[pos] == ')' ||
 			patt[pos] == '[' ||
-			patt[pos] == '$' ||
+			patt[pos] == '^' ||
-			(case_insensitive && isalpha((unsigned char) patt[pos])))
+			patt[pos] == '$')
 			break;
 		/*
 		 * XXX In multibyte character sets, we can't trust isalpha, so assume
 		 * any multibyte char is potentially case-varying.
 		 */
 		if (case_insensitive)
 		{
 			if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
 				break;
 			if (isalpha((unsigned char) patt[pos]))
 				break;
 		}
 		/*
 		 * Check for quantifiers.  Except for +, this means the preceding
-		 * character is optional, so we must remove it from the prefix
+		 * character is optional, so we must remove it from the prefix too!
 		 * too!
 		 */
 		if (patt[pos] == '*' ||
 			patt[pos] == '?' ||
 			patt[pos] == '{')
 		{
-			if (match_pos > 0)
+			match_pos = prev_match_pos;
-				match_pos--;
+			pos = prev_pos;
 			pos--;
 			break;
 		}
 		if (patt[pos] == '+')
 		{
-			pos--;
+			pos = prev_pos;
 			break;
 		}
 		/*
 		 * backslash quotes the next character.
 		 */
 		if (patt[pos] == '\\')
 		{
 			/* backslash quotes the next character */
 			pos++;
 			if (patt[pos] == '\0')
 				break;
 		}
-		match[match_pos++] = patt[pos];
+		/* save position in case we need to back up on next loop cycle */
 		prev_match_pos = match_pos;
 		prev_pos = pos;
 		/* must use encoding-aware processing here */
 		len = pg_mblen(&patt[pos]);
 		memcpy(&match[match_pos], &patt[pos], len);
 		match_pos += len;
 		pos += len;
 	}
 	match[match_pos] = '\0';
 	rest = &patt[pos];
 	if (have_leading_paren && patt[pos] == ')')
 		pos++;
 	if (patt[pos] == '$' && patt[pos + 1] == '\0')
 	{
 		rest = &patt[pos + 1];
-		*prefix_const = string_to_const(prefix, typeid);
+		*prefix_const = string_to_const(match, typeid);
 		*rest_const = string_to_const(rest, typeid);
 		pfree(patt);
 		pfree(match);
 		return Pattern_Prefix_Exact;	/* pattern specifies exact match */
 	}
-	*prefix_const = string_to_const(prefix, typeid);
+	*prefix_const = string_to_const(match, typeid);
 	*rest_const = string_to_const(rest, typeid);
 	pfree(patt);
 	pfree(match);
 	prefix = NULL;
 	if (match_pos > 0)
 		return Pattern_Prefix_Partial;