1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-24 01:29:19 +03:00

Fix regexp substring matching (substring(string from pattern)) for the corner

case where there is a match to the pattern overall but the user has specified
a parenthesized subexpression and that subexpression hasn't got a match.
An example is substring('foo' from 'foo(bar)?').  This should return NULL,
since (bar) isn't matched, but it was mistakenly returning the whole-pattern
match instead (ie, 'foo').  Per bug #4044 from Rui Martins.

This has been broken since the beginning; patch in all supported versions.
The old behavior was sufficiently inconsistent that it's impossible to believe
anyone is depending on it.
This commit is contained in:
Tom Lane
2008-03-19 02:40:53 +00:00
parent 2fec466326
commit 2d19019d9d

View File

@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.66.2.1 2007/01/03 22:39:35 tgl Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.66.2.2 2008/03/19 02:40:53 tgl Exp $
* *
* Alistair Crooks added the code for the regex caching * Alistair Crooks added the code for the regex caching
* agc - cached the regular expressions used - there's a good chance * agc - cached the regular expressions used - there's a good chance
@@ -191,36 +191,31 @@ RE_compile_and_cache(text *text_re, int cflags)
} }
/* /*
* RE_compile_and_execute - compile and execute a RE * RE_execute - execute a RE
* *
* Returns TRUE on match, FALSE on no match * Returns TRUE on match, FALSE on no match
* *
* text_re --- the pattern, expressed as an *untoasted* TEXT object * re --- the compiled pattern as returned by RE_compile_and_cache
* dat --- the data to match against (need not be null-terminated) * dat --- the data to match against (need not be null-terminated)
* dat_len --- the length of the data string * dat_len --- the length of the data string
* cflags --- compile options for the pattern
* nmatch, pmatch --- optional return area for match details * nmatch, pmatch --- optional return area for match details
* *
* Both pattern and data are given in the database encoding. We internally * Data is given in the database encoding. We internally
* convert to array of pg_wchar which is what Spencer's regex package wants. * convert to array of pg_wchar which is what Spencer's regex package wants.
*/ */
static bool static bool
RE_compile_and_execute(text *text_re, char *dat, int dat_len, RE_execute(regex_t *re, char *dat, int dat_len,
int cflags, int nmatch, regmatch_t *pmatch) int nmatch, regmatch_t *pmatch)
{ {
pg_wchar *data; pg_wchar *data;
size_t data_len; size_t data_len;
int regexec_result; int regexec_result;
regex_t *re;
char errMsg[100]; char errMsg[100];
/* Convert data string to wide characters */ /* Convert data string to wide characters */
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(dat, data, dat_len); data_len = pg_mb2wchar_with_len(dat, data, dat_len);
/* Compile RE */
re = RE_compile_and_cache(text_re, cflags);
/* Perform RE match and return result */ /* Perform RE match and return result */
regexec_result = pg_regexec(re, regexec_result = pg_regexec(re,
data, data,
@@ -245,6 +240,33 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
return (regexec_result == REG_OKAY); return (regexec_result == REG_OKAY);
} }
/*
* RE_compile_and_execute - compile and execute a RE
*
* Returns TRUE on match, FALSE on no match
*
* text_re --- the pattern, expressed as an *untoasted* TEXT object
* dat --- the data to match against (need not be null-terminated)
* dat_len --- the length of the data string
* cflags --- compile options for the pattern
* nmatch, pmatch --- optional return area for match details
*
* Both pattern and data are given in the database encoding. We internally
* convert to array of pg_wchar which is what Spencer's regex package wants.
*/
static bool
RE_compile_and_execute(text *text_re, char *dat, int dat_len,
int cflags, int nmatch, regmatch_t *pmatch)
{
regex_t *re;
/* Compile RE */
re = RE_compile_and_cache(text_re, cflags);
/* Perform RE match and return result */
return RE_execute(re, dat, dat_len, nmatch, pmatch);
}
/* /*
* assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR * assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR
@@ -399,8 +421,13 @@ textregexsubstr(PG_FUNCTION_ARGS)
{ {
text *s = PG_GETARG_TEXT_P(0); text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1); text *p = PG_GETARG_TEXT_P(1);
bool match; regex_t *re;
regmatch_t pmatch[2]; regmatch_t pmatch[2];
int so,
eo;
/* Compile RE */
re = RE_compile_and_cache(p, regex_flavor);
/* /*
* We pass two regmatch_t structs to get info about the overall match and * We pass two regmatch_t structs to get info about the overall match and
@@ -408,36 +435,39 @@ textregexsubstr(PG_FUNCTION_ARGS)
* is a parenthesized subexpression, we return what it matched; else * is a parenthesized subexpression, we return what it matched; else
* return what the whole regexp matched. * return what the whole regexp matched.
*/ */
match = RE_compile_and_execute(p, if (!RE_execute(re,
VARDATA(s), VARDATA(s), VARSIZE(s) - VARHDRSZ,
VARSIZE(s) - VARHDRSZ, 2, pmatch))
regex_flavor, PG_RETURN_NULL(); /* definitely no match */
2, pmatch);
/* match? then return the substring matching the pattern */ if (re->re_nsub > 0)
if (match)
{ {
int so, /* has parenthesized subexpressions, use the first one */
eo;
so = pmatch[1].rm_so; so = pmatch[1].rm_so;
eo = pmatch[1].rm_eo; eo = pmatch[1].rm_eo;
if (so < 0 || eo < 0) }
else
{ {
/* no parenthesized subexpression */ /* no parenthesized subexpression, use whole match */
so = pmatch[0].rm_so; so = pmatch[0].rm_so;
eo = pmatch[0].rm_eo; eo = pmatch[0].rm_eo;
} }
/*
* It is possible to have a match to the whole pattern but no match
* for a subexpression; for example 'foo(bar)?' is considered to match
* 'foo' but there is no subexpression match. So this extra test for
* match failure is not redundant.
*/
if (so < 0 || eo < 0)
PG_RETURN_NULL();
return DirectFunctionCall3(text_substr, return DirectFunctionCall3(text_substr,
PointerGetDatum(s), PointerGetDatum(s),
Int32GetDatum(so + 1), Int32GetDatum(so + 1),
Int32GetDatum(eo - so)); Int32GetDatum(eo - so));
} }
PG_RETURN_NULL();
}
/* /*
* textregexreplace_noopt() * textregexreplace_noopt()
* Return a string matched by a regular expression, with replacement. * Return a string matched by a regular expression, with replacement.