mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	Fix regexp substring matching (substring(string from pattern)) for the corner
case where there is a match to the pattern overall but the user has specified
a parenthesized subexpression and that subexpression hasn't got a match.
An example is substring('foo' from 'foo(bar)?').  This should return NULL,
since (bar) isn't matched, but it was mistakenly returning the whole-pattern
match instead (ie, 'foo').  Per bug #4044 from Rui Martins.
This has been broken since the beginning; patch in all supported versions.
The old behavior was sufficiently inconsistent that it's impossible to believe
anyone is depending on it.
			
			
This commit is contained in:
		@@ -8,7 +8,7 @@
 | 
			
		||||
 *
 | 
			
		||||
 *
 | 
			
		||||
 * IDENTIFICATION
 | 
			
		||||
 *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.60.2.3 2007/01/03 22:39:42 tgl Exp $
 | 
			
		||||
 *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.60.2.4 2008/03/19 02:41:00 tgl Exp $
 | 
			
		||||
 *
 | 
			
		||||
 *		Alistair Crooks added the code for the regex caching
 | 
			
		||||
 *		agc - cached the regular expressions used - there's a good chance
 | 
			
		||||
@@ -192,36 +192,31 @@ RE_compile_and_cache(text *text_re, int cflags)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * RE_compile_and_execute - compile and execute a RE
 | 
			
		||||
 * RE_execute - execute a RE
 | 
			
		||||
 *
 | 
			
		||||
 * Returns TRUE on match, FALSE on no match
 | 
			
		||||
 *
 | 
			
		||||
 *	text_re --- the pattern, expressed as an *untoasted* TEXT object
 | 
			
		||||
 *	re --- the compiled pattern as returned by RE_compile_and_cache
 | 
			
		||||
 *	dat --- the data to match against (need not be null-terminated)
 | 
			
		||||
 *	dat_len --- the length of the data string
 | 
			
		||||
 *	cflags --- compile options for the pattern
 | 
			
		||||
 *	nmatch, pmatch	--- optional return area for match details
 | 
			
		||||
 *
 | 
			
		||||
 * Both pattern and data are given in the database encoding.  We internally
 | 
			
		||||
 * Data is given in the database encoding.	We internally
 | 
			
		||||
 * convert to array of pg_wchar which is what Spencer's regex package wants.
 | 
			
		||||
 */
 | 
			
		||||
static bool
 | 
			
		||||
RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 | 
			
		||||
					   int cflags, int nmatch, regmatch_t *pmatch)
 | 
			
		||||
RE_execute(regex_t *re, char *dat, int dat_len,
 | 
			
		||||
		   int nmatch, regmatch_t *pmatch)
 | 
			
		||||
{
 | 
			
		||||
	pg_wchar   *data;
 | 
			
		||||
	size_t		data_len;
 | 
			
		||||
	int			regexec_result;
 | 
			
		||||
	regex_t    *re;
 | 
			
		||||
	char		errMsg[100];
 | 
			
		||||
 | 
			
		||||
	/* Convert data string to wide characters */
 | 
			
		||||
	data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
 | 
			
		||||
	data_len = pg_mb2wchar_with_len(dat, data, dat_len);
 | 
			
		||||
 | 
			
		||||
	/* Compile RE */
 | 
			
		||||
	re = RE_compile_and_cache(text_re, cflags);
 | 
			
		||||
 | 
			
		||||
	/* Perform RE match and return result */
 | 
			
		||||
	regexec_result = pg_regexec(re,
 | 
			
		||||
								data,
 | 
			
		||||
@@ -246,6 +241,33 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 | 
			
		||||
	return (regexec_result == REG_OKAY);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * RE_compile_and_execute - compile and execute a RE
 | 
			
		||||
 *
 | 
			
		||||
 * Returns TRUE on match, FALSE on no match
 | 
			
		||||
 *
 | 
			
		||||
 *	text_re --- the pattern, expressed as an *untoasted* TEXT object
 | 
			
		||||
 *	dat --- the data to match against (need not be null-terminated)
 | 
			
		||||
 *	dat_len --- the length of the data string
 | 
			
		||||
 *	cflags --- compile options for the pattern
 | 
			
		||||
 *	nmatch, pmatch	--- optional return area for match details
 | 
			
		||||
 *
 | 
			
		||||
 * Both pattern and data are given in the database encoding.  We internally
 | 
			
		||||
 * convert to array of pg_wchar which is what Spencer's regex package wants.
 | 
			
		||||
 */
 | 
			
		||||
static bool
 | 
			
		||||
RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 | 
			
		||||
					   int cflags, int nmatch, regmatch_t *pmatch)
 | 
			
		||||
{
 | 
			
		||||
	regex_t    *re;
 | 
			
		||||
 | 
			
		||||
	/* Compile RE */
 | 
			
		||||
	re = RE_compile_and_cache(text_re, cflags);
 | 
			
		||||
 | 
			
		||||
	/* Perform RE match and return result */
 | 
			
		||||
	return RE_execute(re, dat, dat_len, nmatch, pmatch);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR
 | 
			
		||||
@@ -400,8 +422,13 @@ textregexsubstr(PG_FUNCTION_ARGS)
 | 
			
		||||
{
 | 
			
		||||
	text	   *s = PG_GETARG_TEXT_P(0);
 | 
			
		||||
	text	   *p = PG_GETARG_TEXT_P(1);
 | 
			
		||||
	bool		match;
 | 
			
		||||
	regex_t    *re;
 | 
			
		||||
	regmatch_t	pmatch[2];
 | 
			
		||||
	int			so,
 | 
			
		||||
				eo;
 | 
			
		||||
 | 
			
		||||
	/* Compile RE */
 | 
			
		||||
	re = RE_compile_and_cache(p, regex_flavor);
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * We pass two regmatch_t structs to get info about the overall match and
 | 
			
		||||
@@ -409,34 +436,37 @@ textregexsubstr(PG_FUNCTION_ARGS)
 | 
			
		||||
	 * is a parenthesized subexpression, we return what it matched; else
 | 
			
		||||
	 * return what the whole regexp matched.
 | 
			
		||||
	 */
 | 
			
		||||
	match = RE_compile_and_execute(p,
 | 
			
		||||
								   VARDATA(s),
 | 
			
		||||
								   VARSIZE(s) - VARHDRSZ,
 | 
			
		||||
								   regex_flavor,
 | 
			
		||||
								   2, pmatch);
 | 
			
		||||
	if (!RE_execute(re,
 | 
			
		||||
					VARDATA(s), VARSIZE(s) - VARHDRSZ,
 | 
			
		||||
					2, pmatch))
 | 
			
		||||
		PG_RETURN_NULL();		/* definitely no match */
 | 
			
		||||
 | 
			
		||||
	/* match? then return the substring matching the pattern */
 | 
			
		||||
	if (match)
 | 
			
		||||
	if (re->re_nsub > 0)
 | 
			
		||||
	{
 | 
			
		||||
		int			so,
 | 
			
		||||
					eo;
 | 
			
		||||
 | 
			
		||||
		/* has parenthesized subexpressions, use the first one */
 | 
			
		||||
		so = pmatch[1].rm_so;
 | 
			
		||||
		eo = pmatch[1].rm_eo;
 | 
			
		||||
		if (so < 0 || eo < 0)
 | 
			
		||||
		{
 | 
			
		||||
			/* no parenthesized subexpression */
 | 
			
		||||
			so = pmatch[0].rm_so;
 | 
			
		||||
			eo = pmatch[0].rm_eo;
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		return DirectFunctionCall3(text_substr,
 | 
			
		||||
								   PointerGetDatum(s),
 | 
			
		||||
								   Int32GetDatum(so + 1),
 | 
			
		||||
								   Int32GetDatum(eo - so));
 | 
			
		||||
	}
 | 
			
		||||
	else
 | 
			
		||||
	{
 | 
			
		||||
		/* no parenthesized subexpression, use whole match */
 | 
			
		||||
		so = pmatch[0].rm_so;
 | 
			
		||||
		eo = pmatch[0].rm_eo;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	PG_RETURN_NULL();
 | 
			
		||||
	/*
 | 
			
		||||
	 * It is possible to have a match to the whole pattern but no match
 | 
			
		||||
	 * for a subexpression; for example 'foo(bar)?' is considered to match
 | 
			
		||||
	 * 'foo' but there is no subexpression match.  So this extra test for
 | 
			
		||||
	 * match failure is not redundant.
 | 
			
		||||
	 */
 | 
			
		||||
	if (so < 0 || eo < 0)
 | 
			
		||||
		PG_RETURN_NULL();
 | 
			
		||||
 | 
			
		||||
	return DirectFunctionCall3(text_substr,
 | 
			
		||||
							   PointerGetDatum(s),
 | 
			
		||||
							   Int32GetDatum(so + 1),
 | 
			
		||||
							   Int32GetDatum(eo - so));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user