mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 10:30:33 +03:00 
			
		
		
		
	Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to
POSIX-style regular expressions did not consider that square brackets
can be nested.  For example, in an expression like [[:alpha:]%_], the
logic replaced the placeholders '_' and '%', although it should not have.
This commit fixes the conversion logic by tracking the nesting level of
square brackets marking character class areas, while considering that
in expressions like []] or [^]] the first closing square bracket is a
regular character.  Multiple tests are added to show how the conversions
should or should not be applied while in a character class area, with
specific cases added for all the characters converted outside character
classes like an opening parenthesis '(', dollar sign '$', etc.
Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at
Backpatch-through: 13
			
			
This commit is contained in:
		| @@ -773,8 +773,11 @@ similar_escape_internal(text *pat_text, text *esc_text) | ||||
| 	int			plen, | ||||
| 				elen; | ||||
| 	bool		afterescape = false; | ||||
| 	bool		incharclass = false; | ||||
| 	int			nquotes = 0; | ||||
| 	int			charclass_depth = 0;	/* Nesting level of character classes, | ||||
| 										 * encompassed by square brackets */ | ||||
| 	int			charclass_start = 0;	/* State of the character class start, | ||||
| 										 * for carets */ | ||||
|  | ||||
| 	p = VARDATA_ANY(pat_text); | ||||
| 	plen = VARSIZE_ANY_EXHDR(pat_text); | ||||
| @@ -904,7 +907,7 @@ similar_escape_internal(text *pat_text, text *esc_text) | ||||
| 		/* fast path */ | ||||
| 		if (afterescape) | ||||
| 		{ | ||||
| 			if (pchar == '"' && !incharclass)	/* escape-double-quote? */ | ||||
| 			if (pchar == '"' && charclass_depth < 1)	/* escape-double-quote? */ | ||||
| 			{ | ||||
| 				/* emit appropriate part separator, per notes above */ | ||||
| 				if (nquotes == 0) | ||||
| @@ -953,18 +956,41 @@ similar_escape_internal(text *pat_text, text *esc_text) | ||||
| 			/* SQL escape character; do not send to output */ | ||||
| 			afterescape = true; | ||||
| 		} | ||||
| 		else if (incharclass) | ||||
| 		else if (charclass_depth > 0) | ||||
| 		{ | ||||
| 			if (pchar == '\\') | ||||
| 				*r++ = '\\'; | ||||
| 			*r++ = pchar; | ||||
| 			if (pchar == ']') | ||||
| 				incharclass = false; | ||||
|  | ||||
| 			/* | ||||
| 			 * Ignore a closing bracket at the start of a character class. | ||||
| 			 * Such a bracket is taken literally rather than closing the | ||||
| 			 * class.  "charclass_start" is 1 right at the beginning of a | ||||
| 			 * class and 2 after an initial caret. | ||||
| 			 */ | ||||
| 			if (pchar == ']' && charclass_start > 2) | ||||
| 				charclass_depth--; | ||||
| 			else if (pchar == '[') | ||||
| 				charclass_depth++; | ||||
|  | ||||
| 			/* | ||||
| 			 * If there is a caret right after the opening bracket, it negates | ||||
| 			 * the character class, but a following closing bracket should | ||||
| 			 * still be treated as a normal character.  That holds only for | ||||
| 			 * the first caret, so only the values 1 and 2 mean that closing | ||||
| 			 * brackets should be taken literally. | ||||
| 			 */ | ||||
| 			if (pchar == '^') | ||||
| 				charclass_start++; | ||||
| 			else | ||||
| 				charclass_start = 3;	/* definitely past the start */ | ||||
| 		} | ||||
| 		else if (pchar == '[') | ||||
| 		{ | ||||
| 			/* start of a character class */ | ||||
| 			*r++ = pchar; | ||||
| 			incharclass = true; | ||||
| 			charclass_depth++; | ||||
| 			charclass_start = 1; | ||||
| 		} | ||||
| 		else if (pchar == '%') | ||||
| 		{ | ||||
|   | ||||
| @@ -596,6 +596,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; | ||||
| SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; | ||||
| ERROR:  invalid escape string | ||||
| HINT:  Escape string must be empty or one character. | ||||
| -- Characters that should be left alone in character classes when a | ||||
| -- SIMILAR TO regexp pattern is converted to POSIX style. | ||||
| -- Underscore "_" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_'; | ||||
|                    QUERY PLAN                    | ||||
| ------------------------------------------------ | ||||
|  Seq Scan on text_tbl | ||||
|    Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text) | ||||
| (2 rows) | ||||
|  | ||||
| -- Percentage "%" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%'; | ||||
|                     QUERY PLAN                     | ||||
| -------------------------------------------------- | ||||
|  Seq Scan on text_tbl | ||||
|    Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text) | ||||
| (2 rows) | ||||
|  | ||||
| -- Dot "." | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].'; | ||||
|                     QUERY PLAN                     | ||||
| -------------------------------------------------- | ||||
|  Seq Scan on text_tbl | ||||
|    Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text) | ||||
| (2 rows) | ||||
|  | ||||
| -- Dollar "$" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$'; | ||||
|                     QUERY PLAN                     | ||||
| -------------------------------------------------- | ||||
|  Seq Scan on text_tbl | ||||
|    Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text) | ||||
| (2 rows) | ||||
|  | ||||
| -- Opening parenthesis "(" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](]('; | ||||
| ERROR:  invalid regular expression: parentheses () not balanced | ||||
| -- Caret "^" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^'; | ||||
|                                QUERY PLAN                                | ||||
| ------------------------------------------------------------------------ | ||||
|  Seq Scan on text_tbl | ||||
|    Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text) | ||||
| (2 rows) | ||||
|  | ||||
| -- Closing square bracket "]" at the beginning of character class | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%'; | ||||
|                    QUERY PLAN                    | ||||
| ------------------------------------------------ | ||||
|  Seq Scan on text_tbl | ||||
|    Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text) | ||||
| (2 rows) | ||||
|  | ||||
| -- Closing square bracket effective after two carets at the beginning | ||||
| -- of character class. | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^'; | ||||
|               QUERY PLAN                | ||||
| --------------------------------------- | ||||
|  Seq Scan on text_tbl | ||||
|    Filter: (f1 ~ '^(?:[^^]\^)$'::text) | ||||
| (2 rows) | ||||
|  | ||||
| -- Test backslash escapes in regexp_replace's replacement string | ||||
| SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); | ||||
|  regexp_replace  | ||||
|   | ||||
| @@ -193,6 +193,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true; | ||||
| SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; | ||||
| SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; | ||||
|  | ||||
| -- Characters that should be left alone in character classes when a | ||||
| -- SIMILAR TO regexp pattern is converted to POSIX style. | ||||
| -- Underscore "_" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_'; | ||||
| -- Percentage "%" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%'; | ||||
| -- Dot "." | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].'; | ||||
| -- Dollar "$" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$'; | ||||
| -- Opening parenthesis "(" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](]('; | ||||
| -- Caret "^" | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^'; | ||||
| -- Closing square bracket "]" at the beginning of character class | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%'; | ||||
| -- Closing square bracket effective after two carets at the beginning | ||||
| -- of character class. | ||||
| EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^'; | ||||
|  | ||||
| -- Test backslash escapes in regexp_replace's replacement string | ||||
| SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); | ||||
| SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g'); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user