mirror of
https://github.com/postgres/postgres.git
synced 2025-07-02 09:02:37 +03:00
Fix regexp substring matching (substring(string from pattern)) for the corner
case where there is a match to the pattern overall but the user has specified a parenthesized subexpression and that subexpression hasn't got a match. An example is substring('foo' from 'foo(bar)?'). This should return NULL, since (bar) isn't matched, but it was mistakenly returning the whole-pattern match instead (ie, 'foo'). Per bug #4044 from Rui Martins. This has been broken since the beginning; patch in all supported versions. The old behavior was sufficiently inconsistent that it's impossible to believe anyone is depending on it.
This commit is contained in:
@ -8,7 +8,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.49.4.4 2007/01/03 22:39:56 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.49.4.5 2008/03/19 02:41:15 tgl Exp $
|
||||||
*
|
*
|
||||||
* Alistair Crooks added the code for the regex caching
|
* Alistair Crooks added the code for the regex caching
|
||||||
* agc - cached the regular expressions used - there's a good chance
|
* agc - cached the regular expressions used - there's a good chance
|
||||||
@ -80,42 +80,31 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* RE_compile_and_execute - compile and execute a RE, caching if possible
|
* RE_compile_and_cache - compile a RE, caching if possible
|
||||||
*
|
*
|
||||||
* Returns TRUE on match, FALSE on no match
|
* Returns regex_t *
|
||||||
*
|
*
|
||||||
* text_re --- the pattern, expressed as an *untoasted* TEXT object
|
* text_re --- the pattern, expressed as an *untoasted* TEXT object
|
||||||
* dat --- the data to match against (need not be null-terminated)
|
|
||||||
* dat_len --- the length of the data string
|
|
||||||
* cflags --- compile options for the pattern
|
* cflags --- compile options for the pattern
|
||||||
* nmatch, pmatch --- optional return area for match details
|
|
||||||
*
|
*
|
||||||
* Both pattern and data are given in the database encoding. We internally
|
* Pattern is given in the database encoding. We internally convert to
|
||||||
* convert to array of pg_wchar which is what Spencer's regex package wants.
|
* array of pg_wchar which is what Spencer's regex package wants.
|
||||||
*/
|
*/
|
||||||
static bool
|
static regex_t *
|
||||||
RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
RE_compile_and_cache(text *text_re, int cflags)
|
||||||
int cflags, int nmatch, regmatch_t *pmatch)
|
|
||||||
{
|
{
|
||||||
int text_re_len = VARSIZE(text_re);
|
int text_re_len = VARSIZE(text_re);
|
||||||
pg_wchar *data;
|
|
||||||
size_t data_len;
|
|
||||||
pg_wchar *pattern;
|
pg_wchar *pattern;
|
||||||
size_t pattern_len;
|
size_t pattern_len;
|
||||||
int i;
|
int i;
|
||||||
int regcomp_result;
|
int regcomp_result;
|
||||||
int regexec_result;
|
|
||||||
cached_re_str re_temp;
|
cached_re_str re_temp;
|
||||||
char errMsg[100];
|
char errMsg[100];
|
||||||
|
|
||||||
/* Convert data string to wide characters */
|
|
||||||
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
|
|
||||||
data_len = pg_mb2wchar_with_len(dat, data, dat_len);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Look for a match among previously compiled REs. Since the data
|
* Look for a match among previously compiled REs. Since the data
|
||||||
* structure is self-organizing with most-used entries at the front,
|
* structure is self-organizing with most-used entries at the front, our
|
||||||
* our search strategy can just be to scan from the front.
|
* search strategy can just be to scan from the front.
|
||||||
*/
|
*/
|
||||||
for (i = 0; i < num_res; i++)
|
for (i = 0; i < num_res; i++)
|
||||||
{
|
{
|
||||||
@ -133,28 +122,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
|||||||
re_array[0] = re_temp;
|
re_array[0] = re_temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Perform RE match and return result */
|
return &re_array[0].cre_re;
|
||||||
regexec_result = pg_regexec(&re_array[0].cre_re,
|
|
||||||
data,
|
|
||||||
data_len,
|
|
||||||
NULL, /* no details */
|
|
||||||
nmatch,
|
|
||||||
pmatch,
|
|
||||||
0);
|
|
||||||
|
|
||||||
pfree(data);
|
|
||||||
|
|
||||||
if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
|
|
||||||
{
|
|
||||||
/* re failed??? */
|
|
||||||
pg_regerror(regexec_result, &re_array[0].cre_re,
|
|
||||||
errMsg, sizeof(errMsg));
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
|
|
||||||
errmsg("regular expression failed: %s", errMsg)));
|
|
||||||
}
|
|
||||||
|
|
||||||
return (regexec_result == REG_OKAY);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,7 +133,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
|||||||
|
|
||||||
/* Convert pattern string to wide characters */
|
/* Convert pattern string to wide characters */
|
||||||
pattern = (pg_wchar *) palloc((text_re_len - VARHDRSZ + 1) * sizeof(pg_wchar));
|
pattern = (pg_wchar *) palloc((text_re_len - VARHDRSZ + 1) * sizeof(pg_wchar));
|
||||||
pattern_len = pg_mb2wchar_with_len((unsigned char *) VARDATA(text_re),
|
pattern_len = pg_mb2wchar_with_len(VARDATA(text_re),
|
||||||
pattern,
|
pattern,
|
||||||
text_re_len - VARHDRSZ);
|
text_re_len - VARHDRSZ);
|
||||||
|
|
||||||
@ -202,8 +170,8 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
|||||||
re_temp.cre_flags = cflags;
|
re_temp.cre_flags = cflags;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Okay, we have a valid new item in re_temp; insert it into the
|
* Okay, we have a valid new item in re_temp; insert it into the storage
|
||||||
* storage array. Discard last entry if needed.
|
* array. Discard last entry if needed.
|
||||||
*/
|
*/
|
||||||
if (num_res >= MAX_CACHED_RES)
|
if (num_res >= MAX_CACHED_RES)
|
||||||
{
|
{
|
||||||
@ -219,8 +187,37 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
|||||||
re_array[0] = re_temp;
|
re_array[0] = re_temp;
|
||||||
num_res++;
|
num_res++;
|
||||||
|
|
||||||
|
return &re_array[0].cre_re;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* RE_execute - execute a RE
|
||||||
|
*
|
||||||
|
* Returns TRUE on match, FALSE on no match
|
||||||
|
*
|
||||||
|
* re --- the compiled pattern as returned by RE_compile_and_cache
|
||||||
|
* dat --- the data to match against (need not be null-terminated)
|
||||||
|
* dat_len --- the length of the data string
|
||||||
|
* nmatch, pmatch --- optional return area for match details
|
||||||
|
*
|
||||||
|
* Data is given in the database encoding. We internally
|
||||||
|
* convert to array of pg_wchar which is what Spencer's regex package wants.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
RE_execute(regex_t *re, char *dat, int dat_len,
|
||||||
|
int nmatch, regmatch_t *pmatch)
|
||||||
|
{
|
||||||
|
pg_wchar *data;
|
||||||
|
size_t data_len;
|
||||||
|
int regexec_result;
|
||||||
|
char errMsg[100];
|
||||||
|
|
||||||
|
/* Convert data string to wide characters */
|
||||||
|
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
|
||||||
|
data_len = pg_mb2wchar_with_len(dat, data, dat_len);
|
||||||
|
|
||||||
/* Perform RE match and return result */
|
/* Perform RE match and return result */
|
||||||
regexec_result = pg_regexec(&re_array[0].cre_re,
|
regexec_result = pg_regexec(re,
|
||||||
data,
|
data,
|
||||||
data_len,
|
data_len,
|
||||||
NULL, /* no details */
|
NULL, /* no details */
|
||||||
@ -233,8 +230,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
|||||||
if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
|
if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
|
||||||
{
|
{
|
||||||
/* re failed??? */
|
/* re failed??? */
|
||||||
pg_regerror(regexec_result, &re_array[0].cre_re,
|
pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
|
||||||
errMsg, sizeof(errMsg));
|
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
|
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
|
||||||
errmsg("regular expression failed: %s", errMsg)));
|
errmsg("regular expression failed: %s", errMsg)));
|
||||||
@ -243,6 +239,33 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
|||||||
return (regexec_result == REG_OKAY);
|
return (regexec_result == REG_OKAY);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* RE_compile_and_execute - compile and execute a RE
|
||||||
|
*
|
||||||
|
* Returns TRUE on match, FALSE on no match
|
||||||
|
*
|
||||||
|
* text_re --- the pattern, expressed as an *untoasted* TEXT object
|
||||||
|
* dat --- the data to match against (need not be null-terminated)
|
||||||
|
* dat_len --- the length of the data string
|
||||||
|
* cflags --- compile options for the pattern
|
||||||
|
* nmatch, pmatch --- optional return area for match details
|
||||||
|
*
|
||||||
|
* Both pattern and data are given in the database encoding. We internally
|
||||||
|
* convert to array of pg_wchar which is what Spencer's regex package wants.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
RE_compile_and_execute(text *text_re, char *dat, int dat_len,
|
||||||
|
int cflags, int nmatch, regmatch_t *pmatch)
|
||||||
|
{
|
||||||
|
regex_t *re;
|
||||||
|
|
||||||
|
/* Compile RE */
|
||||||
|
re = RE_compile_and_cache(text_re, cflags);
|
||||||
|
|
||||||
|
/* Perform RE match and return result */
|
||||||
|
return RE_execute(re, dat, dat_len, nmatch, pmatch);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR
|
* assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR
|
||||||
@ -283,7 +306,7 @@ nameregexeq(PG_FUNCTION_ARGS)
|
|||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
|
|
||||||
PG_RETURN_BOOL(RE_compile_and_execute(p,
|
PG_RETURN_BOOL(RE_compile_and_execute(p,
|
||||||
(unsigned char *) NameStr(*n),
|
NameStr(*n),
|
||||||
strlen(NameStr(*n)),
|
strlen(NameStr(*n)),
|
||||||
regex_flavor,
|
regex_flavor,
|
||||||
0, NULL));
|
0, NULL));
|
||||||
@ -296,7 +319,7 @@ nameregexne(PG_FUNCTION_ARGS)
|
|||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
|
|
||||||
PG_RETURN_BOOL(!RE_compile_and_execute(p,
|
PG_RETURN_BOOL(!RE_compile_and_execute(p,
|
||||||
(unsigned char *) NameStr(*n),
|
NameStr(*n),
|
||||||
strlen(NameStr(*n)),
|
strlen(NameStr(*n)),
|
||||||
regex_flavor,
|
regex_flavor,
|
||||||
0, NULL));
|
0, NULL));
|
||||||
@ -309,7 +332,7 @@ textregexeq(PG_FUNCTION_ARGS)
|
|||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
|
|
||||||
PG_RETURN_BOOL(RE_compile_and_execute(p,
|
PG_RETURN_BOOL(RE_compile_and_execute(p,
|
||||||
(unsigned char *) VARDATA(s),
|
VARDATA(s),
|
||||||
VARSIZE(s) - VARHDRSZ,
|
VARSIZE(s) - VARHDRSZ,
|
||||||
regex_flavor,
|
regex_flavor,
|
||||||
0, NULL));
|
0, NULL));
|
||||||
@ -322,7 +345,7 @@ textregexne(PG_FUNCTION_ARGS)
|
|||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
|
|
||||||
PG_RETURN_BOOL(!RE_compile_and_execute(p,
|
PG_RETURN_BOOL(!RE_compile_and_execute(p,
|
||||||
(unsigned char *) VARDATA(s),
|
VARDATA(s),
|
||||||
VARSIZE(s) - VARHDRSZ,
|
VARSIZE(s) - VARHDRSZ,
|
||||||
regex_flavor,
|
regex_flavor,
|
||||||
0, NULL));
|
0, NULL));
|
||||||
@ -342,7 +365,7 @@ nameicregexeq(PG_FUNCTION_ARGS)
|
|||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
|
|
||||||
PG_RETURN_BOOL(RE_compile_and_execute(p,
|
PG_RETURN_BOOL(RE_compile_and_execute(p,
|
||||||
(unsigned char *) NameStr(*n),
|
NameStr(*n),
|
||||||
strlen(NameStr(*n)),
|
strlen(NameStr(*n)),
|
||||||
regex_flavor | REG_ICASE,
|
regex_flavor | REG_ICASE,
|
||||||
0, NULL));
|
0, NULL));
|
||||||
@ -355,7 +378,7 @@ nameicregexne(PG_FUNCTION_ARGS)
|
|||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
|
|
||||||
PG_RETURN_BOOL(!RE_compile_and_execute(p,
|
PG_RETURN_BOOL(!RE_compile_and_execute(p,
|
||||||
(unsigned char *) NameStr(*n),
|
NameStr(*n),
|
||||||
strlen(NameStr(*n)),
|
strlen(NameStr(*n)),
|
||||||
regex_flavor | REG_ICASE,
|
regex_flavor | REG_ICASE,
|
||||||
0, NULL));
|
0, NULL));
|
||||||
@ -368,7 +391,7 @@ texticregexeq(PG_FUNCTION_ARGS)
|
|||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
|
|
||||||
PG_RETURN_BOOL(RE_compile_and_execute(p,
|
PG_RETURN_BOOL(RE_compile_and_execute(p,
|
||||||
(unsigned char *) VARDATA(s),
|
VARDATA(s),
|
||||||
VARSIZE(s) - VARHDRSZ,
|
VARSIZE(s) - VARHDRSZ,
|
||||||
regex_flavor | REG_ICASE,
|
regex_flavor | REG_ICASE,
|
||||||
0, NULL));
|
0, NULL));
|
||||||
@ -381,7 +404,7 @@ texticregexne(PG_FUNCTION_ARGS)
|
|||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
|
|
||||||
PG_RETURN_BOOL(!RE_compile_and_execute(p,
|
PG_RETURN_BOOL(!RE_compile_and_execute(p,
|
||||||
(unsigned char *) VARDATA(s),
|
VARDATA(s),
|
||||||
VARSIZE(s) - VARHDRSZ,
|
VARSIZE(s) - VARHDRSZ,
|
||||||
regex_flavor | REG_ICASE,
|
regex_flavor | REG_ICASE,
|
||||||
0, NULL));
|
0, NULL));
|
||||||
@ -397,43 +420,51 @@ textregexsubstr(PG_FUNCTION_ARGS)
|
|||||||
{
|
{
|
||||||
text *s = PG_GETARG_TEXT_P(0);
|
text *s = PG_GETARG_TEXT_P(0);
|
||||||
text *p = PG_GETARG_TEXT_P(1);
|
text *p = PG_GETARG_TEXT_P(1);
|
||||||
bool match;
|
regex_t *re;
|
||||||
regmatch_t pmatch[2];
|
regmatch_t pmatch[2];
|
||||||
|
int so,
|
||||||
|
eo;
|
||||||
|
|
||||||
|
/* Compile RE */
|
||||||
|
re = RE_compile_and_cache(p, regex_flavor);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We pass two regmatch_t structs to get info about the overall match
|
* We pass two regmatch_t structs to get info about the overall match and
|
||||||
* and the match for the first parenthesized subexpression (if any).
|
* the match for the first parenthesized subexpression (if any). If there
|
||||||
* If there is a parenthesized subexpression, we return what it
|
* is a parenthesized subexpression, we return what it matched; else
|
||||||
* matched; else return what the whole regexp matched.
|
* return what the whole regexp matched.
|
||||||
*/
|
*/
|
||||||
match = RE_compile_and_execute(p,
|
if (!RE_execute(re,
|
||||||
(unsigned char *) VARDATA(s),
|
VARDATA(s), VARSIZE(s) - VARHDRSZ,
|
||||||
VARSIZE(s) - VARHDRSZ,
|
2, pmatch))
|
||||||
regex_flavor,
|
PG_RETURN_NULL(); /* definitely no match */
|
||||||
2, pmatch);
|
|
||||||
|
|
||||||
/* match? then return the substring matching the pattern */
|
if (re->re_nsub > 0)
|
||||||
if (match)
|
|
||||||
{
|
{
|
||||||
int so,
|
/* has parenthesized subexpressions, use the first one */
|
||||||
eo;
|
|
||||||
|
|
||||||
so = pmatch[1].rm_so;
|
so = pmatch[1].rm_so;
|
||||||
eo = pmatch[1].rm_eo;
|
eo = pmatch[1].rm_eo;
|
||||||
if (so < 0 || eo < 0)
|
}
|
||||||
{
|
else
|
||||||
/* no parenthesized subexpression */
|
{
|
||||||
so = pmatch[0].rm_so;
|
/* no parenthesized subexpression, use whole match */
|
||||||
eo = pmatch[0].rm_eo;
|
so = pmatch[0].rm_so;
|
||||||
}
|
eo = pmatch[0].rm_eo;
|
||||||
|
|
||||||
return (DirectFunctionCall3(text_substr,
|
|
||||||
PointerGetDatum(s),
|
|
||||||
Int32GetDatum(so + 1),
|
|
||||||
Int32GetDatum(eo - so)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PG_RETURN_NULL();
|
/*
|
||||||
|
* It is possible to have a match to the whole pattern but no match
|
||||||
|
* for a subexpression; for example 'foo(bar)?' is considered to match
|
||||||
|
* 'foo' but there is no subexpression match. So this extra test for
|
||||||
|
* match failure is not redundant.
|
||||||
|
*/
|
||||||
|
if (so < 0 || eo < 0)
|
||||||
|
PG_RETURN_NULL();
|
||||||
|
|
||||||
|
return DirectFunctionCall3(text_substr,
|
||||||
|
PointerGetDatum(s),
|
||||||
|
Int32GetDatum(so + 1),
|
||||||
|
Int32GetDatum(eo - so));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* similar_escape()
|
/* similar_escape()
|
||||||
@ -446,7 +477,7 @@ similar_escape(PG_FUNCTION_ARGS)
|
|||||||
text *pat_text;
|
text *pat_text;
|
||||||
text *esc_text;
|
text *esc_text;
|
||||||
text *result;
|
text *result;
|
||||||
unsigned char *p,
|
char *p,
|
||||||
*e,
|
*e,
|
||||||
*r;
|
*r;
|
||||||
int plen,
|
int plen,
|
||||||
@ -477,7 +508,7 @@ similar_escape(PG_FUNCTION_ARGS)
|
|||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
|
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
|
||||||
errmsg("invalid escape string"),
|
errmsg("invalid escape string"),
|
||||||
errhint("Escape string must be empty or one character.")));
|
errhint("Escape string must be empty or one character.")));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------
|
/*----------
|
||||||
@ -513,7 +544,7 @@ similar_escape(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
while (plen > 0)
|
while (plen > 0)
|
||||||
{
|
{
|
||||||
unsigned char pchar = *p;
|
char pchar = *p;
|
||||||
|
|
||||||
if (afterescape)
|
if (afterescape)
|
||||||
{
|
{
|
||||||
@ -552,7 +583,7 @@ similar_escape(PG_FUNCTION_ARGS)
|
|||||||
*r++ = ')';
|
*r++ = ')';
|
||||||
*r++ = '$';
|
*r++ = '$';
|
||||||
|
|
||||||
VARATT_SIZEP(result) = r - ((unsigned char *) result);
|
VARATT_SIZEP(result) = r - ((char *) result);
|
||||||
|
|
||||||
PG_RETURN_TEXT_P(result);
|
PG_RETURN_TEXT_P(result);
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user