mirror of
https://github.com/postgres/postgres.git
synced 2025-08-12 15:23:02 +03:00
Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the
form '^(foo)$'. Before, these could never be optimized into index scans. The recent changes to make psql and pg_dump generate such patterns (for \d commands and -t and related switches, respectively) therefore represented a big performance hit for people with large pg_class catalogs, as seen in a recent gripe from Erik Jones. While at it, be more paranoid about case-sensitivity checking in multibyte encodings, and fix some other corner cases in which a regex might be interpreted too liberally.
This commit is contained in:
@@ -15,7 +15,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.119.2.9 2006/05/21 20:07:11 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.119.2.10 2007/01/03 22:40:04 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@@ -2735,7 +2735,10 @@ get_join_vars(List *args, Var **var1, Var **var2)
|
|||||||
* These routines support analysis of LIKE and regular-expression patterns
|
* These routines support analysis of LIKE and regular-expression patterns
|
||||||
* by the planner/optimizer. It's important that they agree with the
|
* by the planner/optimizer. It's important that they agree with the
|
||||||
* regular-expression code in backend/regex/ and the LIKE code in
|
* regular-expression code in backend/regex/ and the LIKE code in
|
||||||
* backend/utils/adt/like.c.
|
* backend/utils/adt/like.c. Also, the computation of the fixed prefix
|
||||||
|
* must be conservative: if we report a string longer than the true fixed
|
||||||
|
* prefix, the query may produce actually wrong answers, rather than just
|
||||||
|
* getting a bad selectivity estimate!
|
||||||
*
|
*
|
||||||
* Note that the prefix-analysis functions are called from
|
* Note that the prefix-analysis functions are called from
|
||||||
* backend/optimizer/path/indxpath.c as well as from routines in this file.
|
* backend/optimizer/path/indxpath.c as well as from routines in this file.
|
||||||
@@ -2764,6 +2767,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
|
|||||||
Oid typeid = patt_const->consttype;
|
Oid typeid = patt_const->consttype;
|
||||||
int pos,
|
int pos,
|
||||||
match_pos;
|
match_pos;
|
||||||
|
bool is_multibyte = (pg_database_encoding_max_length() > 1);
|
||||||
|
|
||||||
/* the right-hand const is type text or bytea */
|
/* the right-hand const is type text or bytea */
|
||||||
Assert(typeid == BYTEAOID || typeid == TEXTOID);
|
Assert(typeid == BYTEAOID || typeid == TEXTOID);
|
||||||
@@ -2811,11 +2815,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XXX I suspect isalpha() is not an adequately locale-sensitive
|
* XXX In multibyte character sets, we can't trust isalpha, so assume
|
||||||
* test for characters that can vary under case folding?
|
* any multibyte char is potentially case-varying.
|
||||||
*/
|
*/
|
||||||
if (case_insensitive && isalpha((unsigned char) patt[pos]))
|
if (case_insensitive)
|
||||||
break;
|
{
|
||||||
|
if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
|
||||||
|
break;
|
||||||
|
if (isalpha((unsigned char) patt[pos]))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* NOTE: this code used to think that %% meant a literal %, but
|
* NOTE: this code used to think that %% meant a literal %, but
|
||||||
@@ -2861,11 +2870,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
|
|||||||
char *match;
|
char *match;
|
||||||
int pos,
|
int pos,
|
||||||
match_pos,
|
match_pos,
|
||||||
paren_depth;
|
prev_pos,
|
||||||
|
prev_match_pos;
|
||||||
|
bool have_leading_paren;
|
||||||
char *patt;
|
char *patt;
|
||||||
char *prefix;
|
|
||||||
char *rest;
|
char *rest;
|
||||||
Oid typeid = patt_const->consttype;
|
Oid typeid = patt_const->consttype;
|
||||||
|
bool is_multibyte = (pg_database_encoding_max_length() > 1);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Should be unnecessary, there are no bytea regex operators defined.
|
* Should be unnecessary, there are no bytea regex operators defined.
|
||||||
@@ -2879,7 +2890,25 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
|
|||||||
patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
|
patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
|
||||||
|
|
||||||
/* Pattern must be anchored left */
|
/* Pattern must be anchored left */
|
||||||
if (patt[0] != '^')
|
pos = 0;
|
||||||
|
if (patt[pos] != '^')
|
||||||
|
{
|
||||||
|
rest = patt;
|
||||||
|
|
||||||
|
*prefix_const = NULL;
|
||||||
|
*rest_const = string_to_const(rest, typeid);
|
||||||
|
|
||||||
|
return Pattern_Prefix_None;
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If '|' is present in pattern, then there may be multiple alternatives
|
||||||
|
* for the start of the string. (There are cases where this isn't so,
|
||||||
|
* for instance if the '|' is inside parens, but detecting that reliably
|
||||||
|
* is too hard.)
|
||||||
|
*/
|
||||||
|
if (strchr(patt + pos, '|') != NULL)
|
||||||
{
|
{
|
||||||
rest = patt;
|
rest = patt;
|
||||||
|
|
||||||
@@ -2889,103 +2918,112 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
|
|||||||
return Pattern_Prefix_None;
|
return Pattern_Prefix_None;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* OK, allocate space for pattern */
|
||||||
|
match = palloc(strlen(patt) + 1);
|
||||||
|
prev_match_pos = match_pos = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If unquoted | is present at paren level 0 in pattern, then there
|
* We special-case the syntax '^(...)$' because psql uses it. But beware:
|
||||||
* are multiple alternatives for the start of the string.
|
* sequences beginning "(?" are not what they seem.
|
||||||
*/
|
*/
|
||||||
paren_depth = 0;
|
have_leading_paren = false;
|
||||||
for (pos = 1; patt[pos]; pos++)
|
if (patt[pos] == '(' && patt[pos + 1] != '?')
|
||||||
{
|
{
|
||||||
if (patt[pos] == '|' && paren_depth == 0)
|
have_leading_paren = true;
|
||||||
{
|
pos++;
|
||||||
rest = patt;
|
|
||||||
|
|
||||||
*prefix_const = NULL;
|
|
||||||
*rest_const = string_to_const(rest, typeid);
|
|
||||||
|
|
||||||
return Pattern_Prefix_None;
|
|
||||||
}
|
|
||||||
else if (patt[pos] == '(')
|
|
||||||
paren_depth++;
|
|
||||||
else if (patt[pos] == ')' && paren_depth > 0)
|
|
||||||
paren_depth--;
|
|
||||||
else if (patt[pos] == '\\')
|
|
||||||
{
|
|
||||||
/* backslash quotes the next character */
|
|
||||||
pos++;
|
|
||||||
if (patt[pos] == '\0')
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* OK, allocate space for pattern */
|
/* Scan remainder of pattern */
|
||||||
prefix = match = palloc(strlen(patt) + 1);
|
prev_pos = pos;
|
||||||
match_pos = 0;
|
while (patt[pos])
|
||||||
|
|
||||||
/* note start at pos 1 to skip leading ^ */
|
|
||||||
for (pos = 1; patt[pos]; pos++)
|
|
||||||
{
|
{
|
||||||
|
int len;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check for characters that indicate multiple possible matches
|
* Check for characters that indicate multiple possible matches here.
|
||||||
* here. XXX I suspect isalpha() is not an adequately
|
* Also, drop out at ')' or '$' so the termination test works right.
|
||||||
* locale-sensitive test for characters that can vary under case
|
|
||||||
* folding?
|
|
||||||
*/
|
*/
|
||||||
if (patt[pos] == '.' ||
|
if (patt[pos] == '.' ||
|
||||||
patt[pos] == '(' ||
|
patt[pos] == '(' ||
|
||||||
|
patt[pos] == ')' ||
|
||||||
patt[pos] == '[' ||
|
patt[pos] == '[' ||
|
||||||
patt[pos] == '$' ||
|
patt[pos] == '^' ||
|
||||||
(case_insensitive && isalpha((unsigned char) patt[pos])))
|
patt[pos] == '$')
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* XXX In multibyte character sets, we can't trust isalpha, so assume
|
||||||
|
* any multibyte char is potentially case-varying.
|
||||||
|
*/
|
||||||
|
if (case_insensitive)
|
||||||
|
{
|
||||||
|
if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
|
||||||
|
break;
|
||||||
|
if (isalpha((unsigned char) patt[pos]))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check for quantifiers. Except for +, this means the preceding
|
* Check for quantifiers. Except for +, this means the preceding
|
||||||
* character is optional, so we must remove it from the prefix
|
* character is optional, so we must remove it from the prefix too!
|
||||||
* too!
|
|
||||||
*/
|
*/
|
||||||
if (patt[pos] == '*' ||
|
if (patt[pos] == '*' ||
|
||||||
patt[pos] == '?' ||
|
patt[pos] == '?' ||
|
||||||
patt[pos] == '{')
|
patt[pos] == '{')
|
||||||
{
|
{
|
||||||
if (match_pos > 0)
|
match_pos = prev_match_pos;
|
||||||
match_pos--;
|
pos = prev_pos;
|
||||||
pos--;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (patt[pos] == '+')
|
if (patt[pos] == '+')
|
||||||
{
|
{
|
||||||
pos--;
|
pos = prev_pos;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* backslash quotes the next character.
|
||||||
|
*/
|
||||||
if (patt[pos] == '\\')
|
if (patt[pos] == '\\')
|
||||||
{
|
{
|
||||||
/* backslash quotes the next character */
|
|
||||||
pos++;
|
pos++;
|
||||||
if (patt[pos] == '\0')
|
if (patt[pos] == '\0')
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
match[match_pos++] = patt[pos];
|
/* save position in case we need to back up on next loop cycle */
|
||||||
|
prev_match_pos = match_pos;
|
||||||
|
prev_pos = pos;
|
||||||
|
/* must use encoding-aware processing here */
|
||||||
|
len = pg_mblen(&patt[pos]);
|
||||||
|
memcpy(&match[match_pos], &patt[pos], len);
|
||||||
|
match_pos += len;
|
||||||
|
pos += len;
|
||||||
}
|
}
|
||||||
|
|
||||||
match[match_pos] = '\0';
|
match[match_pos] = '\0';
|
||||||
rest = &patt[pos];
|
rest = &patt[pos];
|
||||||
|
|
||||||
|
if (have_leading_paren && patt[pos] == ')')
|
||||||
|
pos++;
|
||||||
|
|
||||||
if (patt[pos] == '$' && patt[pos + 1] == '\0')
|
if (patt[pos] == '$' && patt[pos + 1] == '\0')
|
||||||
{
|
{
|
||||||
rest = &patt[pos + 1];
|
rest = &patt[pos + 1];
|
||||||
|
|
||||||
*prefix_const = string_to_const(prefix, typeid);
|
*prefix_const = string_to_const(match, typeid);
|
||||||
*rest_const = string_to_const(rest, typeid);
|
*rest_const = string_to_const(rest, typeid);
|
||||||
|
|
||||||
|
pfree(patt);
|
||||||
|
pfree(match);
|
||||||
|
|
||||||
return Pattern_Prefix_Exact; /* pattern specifies exact match */
|
return Pattern_Prefix_Exact; /* pattern specifies exact match */
|
||||||
}
|
}
|
||||||
|
|
||||||
*prefix_const = string_to_const(prefix, typeid);
|
*prefix_const = string_to_const(match, typeid);
|
||||||
*rest_const = string_to_const(rest, typeid);
|
*rest_const = string_to_const(rest, typeid);
|
||||||
|
|
||||||
pfree(patt);
|
pfree(patt);
|
||||||
pfree(match);
|
pfree(match);
|
||||||
prefix = NULL;
|
|
||||||
|
|
||||||
if (match_pos > 0)
|
if (match_pos > 0)
|
||||||
return Pattern_Prefix_Partial;
|
return Pattern_Prefix_Partial;
|
||||||
|
Reference in New Issue
Block a user