mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to
POSIX-style regular expressions did not consider that square brackets
can be nested.  For example, in an expression like [[:alpha:]%_], the
logic replaced the placeholders '_' and '%', but it should not have, since they are literal characters inside a character class.
This commit fixes the conversion logic by tracking the nesting level of
square brackets marking character class areas, while considering that
in expressions like []] or [^]] the first closing square bracket is a
regular character.  Multiple tests are added to show how the conversions
should or should not be applied while in a character class area, with
specific cases added for all the characters converted outside character
classes like an opening parenthesis '(', dollar sign '$', etc.
Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at
Backpatch-through: 13
			
			
This commit is contained in:
		@@ -774,8 +774,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
 | 
				
			|||||||
	int			plen,
 | 
						int			plen,
 | 
				
			||||||
				elen;
 | 
									elen;
 | 
				
			||||||
	bool		afterescape = false;
 | 
						bool		afterescape = false;
 | 
				
			||||||
	bool		incharclass = false;
 | 
					 | 
				
			||||||
	int			nquotes = 0;
 | 
						int			nquotes = 0;
 | 
				
			||||||
 | 
						int			charclass_depth = 0;	/* Nesting level of character classes,
 | 
				
			||||||
 | 
															 * encompassed by square brackets */
 | 
				
			||||||
 | 
						int			charclass_start = 0;	/* State of the character class start,
 | 
				
			||||||
 | 
															 * for carets */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	p = VARDATA_ANY(pat_text);
 | 
						p = VARDATA_ANY(pat_text);
 | 
				
			||||||
	plen = VARSIZE_ANY_EXHDR(pat_text);
 | 
						plen = VARSIZE_ANY_EXHDR(pat_text);
 | 
				
			||||||
@@ -905,7 +908,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
 | 
				
			|||||||
		/* fast path */
 | 
							/* fast path */
 | 
				
			||||||
		if (afterescape)
 | 
							if (afterescape)
 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
			if (pchar == '"' && !incharclass)	/* escape-double-quote? */
 | 
								if (pchar == '"' && charclass_depth < 1)	/* escape-double-quote? */
 | 
				
			||||||
			{
 | 
								{
 | 
				
			||||||
				/* emit appropriate part separator, per notes above */
 | 
									/* emit appropriate part separator, per notes above */
 | 
				
			||||||
				if (nquotes == 0)
 | 
									if (nquotes == 0)
 | 
				
			||||||
@@ -954,18 +957,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
 | 
				
			|||||||
			/* SQL escape character; do not send to output */
 | 
								/* SQL escape character; do not send to output */
 | 
				
			||||||
			afterescape = true;
 | 
								afterescape = true;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		else if (incharclass)
 | 
							else if (charclass_depth > 0)
 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
			if (pchar == '\\')
 | 
								if (pchar == '\\')
 | 
				
			||||||
				*r++ = '\\';
 | 
									*r++ = '\\';
 | 
				
			||||||
			*r++ = pchar;
 | 
								*r++ = pchar;
 | 
				
			||||||
			if (pchar == ']')
 | 
					
 | 
				
			||||||
				incharclass = false;
 | 
								/*
 | 
				
			||||||
 | 
								 * Ignore a closing bracket at the start of a character class.
 | 
				
			||||||
 | 
								 * Such a bracket is taken literally rather than closing the
 | 
				
			||||||
 | 
								 * class.  "charclass_start" is 1 right at the beginning of a
 | 
				
			||||||
 | 
								 * class and 2 after an initial caret.
 | 
				
			||||||
 | 
								 */
 | 
				
			||||||
 | 
								if (pchar == ']' && charclass_start > 2)
 | 
				
			||||||
 | 
									charclass_depth--;
 | 
				
			||||||
 | 
								else if (pchar == '[')
 | 
				
			||||||
 | 
									charclass_depth++;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								/*
 | 
				
			||||||
 | 
								 * If there is a caret right after the opening bracket, it negates
 | 
				
			||||||
 | 
								 * the character class, but a following closing bracket should
 | 
				
			||||||
 | 
								 * still be treated as a normal character.  That holds only for
 | 
				
			||||||
 | 
								 * the first caret, so only the values 1 and 2 mean that closing
 | 
				
			||||||
 | 
								 * brackets should be taken literally.
 | 
				
			||||||
 | 
								 */
 | 
				
			||||||
 | 
								if (pchar == '^')
 | 
				
			||||||
 | 
									charclass_start++;
 | 
				
			||||||
 | 
								else
 | 
				
			||||||
 | 
									charclass_start = 3;	/* definitely past the start */
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		else if (pchar == '[')
 | 
							else if (pchar == '[')
 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
 | 
								/* start of a character class */
 | 
				
			||||||
			*r++ = pchar;
 | 
								*r++ = pchar;
 | 
				
			||||||
			incharclass = true;
 | 
								charclass_depth++;
 | 
				
			||||||
 | 
								charclass_start = 1;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		else if (pchar == '%')
 | 
							else if (pchar == '%')
 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -596,6 +596,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 | 
				
			|||||||
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 | 
					SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 | 
				
			||||||
ERROR:  invalid escape string
 | 
					ERROR:  invalid escape string
 | 
				
			||||||
HINT:  Escape string must be empty or one character.
 | 
					HINT:  Escape string must be empty or one character.
 | 
				
			||||||
 | 
					-- Characters that should be left alone in character classes when a
 | 
				
			||||||
 | 
					-- SIMILAR TO regexp pattern is converted to POSIX style.
 | 
				
			||||||
 | 
					-- Underscore "_"
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
 | 
				
			||||||
 | 
					                   QUERY PLAN                   
 | 
				
			||||||
 | 
					------------------------------------------------
 | 
				
			||||||
 | 
					 Seq Scan on text_tbl
 | 
				
			||||||
 | 
					   Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
 | 
				
			||||||
 | 
					(2 rows)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					-- Percentage "%"
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
 | 
				
			||||||
 | 
					                    QUERY PLAN                    
 | 
				
			||||||
 | 
					--------------------------------------------------
 | 
				
			||||||
 | 
					 Seq Scan on text_tbl
 | 
				
			||||||
 | 
					   Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
 | 
				
			||||||
 | 
					(2 rows)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					-- Dot "."
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
 | 
				
			||||||
 | 
					                    QUERY PLAN                    
 | 
				
			||||||
 | 
					--------------------------------------------------
 | 
				
			||||||
 | 
					 Seq Scan on text_tbl
 | 
				
			||||||
 | 
					   Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
 | 
				
			||||||
 | 
					(2 rows)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					-- Dollar "$"
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
 | 
				
			||||||
 | 
					                    QUERY PLAN                    
 | 
				
			||||||
 | 
					--------------------------------------------------
 | 
				
			||||||
 | 
					 Seq Scan on text_tbl
 | 
				
			||||||
 | 
					   Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
 | 
				
			||||||
 | 
					(2 rows)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					-- Opening parenthesis "("
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
 | 
				
			||||||
 | 
					ERROR:  invalid regular expression: parentheses () not balanced
 | 
				
			||||||
 | 
					-- Caret "^"
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
 | 
				
			||||||
 | 
					                               QUERY PLAN                               
 | 
				
			||||||
 | 
					------------------------------------------------------------------------
 | 
				
			||||||
 | 
					 Seq Scan on text_tbl
 | 
				
			||||||
 | 
					   Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
 | 
				
			||||||
 | 
					(2 rows)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					-- Closing square bracket "]" at the beginning of character class
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
 | 
				
			||||||
 | 
					                   QUERY PLAN                   
 | 
				
			||||||
 | 
					------------------------------------------------
 | 
				
			||||||
 | 
					 Seq Scan on text_tbl
 | 
				
			||||||
 | 
					   Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
 | 
				
			||||||
 | 
					(2 rows)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					-- Closing square bracket effective after two carets at the beginning
 | 
				
			||||||
 | 
					-- of character class.
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
 | 
				
			||||||
 | 
					              QUERY PLAN               
 | 
				
			||||||
 | 
					---------------------------------------
 | 
				
			||||||
 | 
					 Seq Scan on text_tbl
 | 
				
			||||||
 | 
					   Filter: (f1 ~ '^(?:[^^]\^)$'::text)
 | 
				
			||||||
 | 
					(2 rows)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
-- Test backslash escapes in regexp_replace's replacement string
 | 
					-- Test backslash escapes in regexp_replace's replacement string
 | 
				
			||||||
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
 | 
					SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
 | 
				
			||||||
 regexp_replace 
 | 
					 regexp_replace 
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -193,6 +193,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
 | 
				
			|||||||
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 | 
					SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 | 
				
			||||||
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 | 
					SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					-- Characters that should be left alone in character classes when a
 | 
				
			||||||
 | 
					-- SIMILAR TO regexp pattern is converted to POSIX style.
 | 
				
			||||||
 | 
					-- Underscore "_"
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
 | 
				
			||||||
 | 
					-- Percentage "%"
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
 | 
				
			||||||
 | 
					-- Dot "."
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
 | 
				
			||||||
 | 
					-- Dollar "$"
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
 | 
				
			||||||
 | 
					-- Opening parenthesis "("
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
 | 
				
			||||||
 | 
					-- Caret "^"
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
 | 
				
			||||||
 | 
					-- Closing square bracket "]" at the beginning of character class
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
 | 
				
			||||||
 | 
					-- Closing square bracket effective after two carets at the beginning
 | 
				
			||||||
 | 
					-- of character class.
 | 
				
			||||||
 | 
					EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
 | 
				
			||||||
 | 
					
 | 
				
			||||||
-- Test backslash escapes in regexp_replace's replacement string
 | 
					-- Test backslash escapes in regexp_replace's replacement string
 | 
				
			||||||
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
 | 
					SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
 | 
				
			||||||
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
 | 
					SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user