mirror of
https://github.com/postgres/postgres.git
synced 2025-06-13 07:41:39 +03:00
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to POSIX-style regular expressions did not consider that square brackets can be nested. For example, in an expression like [[:alpha:]%_], the logic replaced the placeholders '_' and '%' but it should not. This commit fixes the conversion logic by tracking the nesting level of square brackets marking character class areas, while considering that in expressions like []] or [^]] the first closing square bracket is a regular character. Multiple tests are added to show how the conversions should or should not be applied while in a character class area, with specific cases added for all the characters converted outside character classes like an opening parenthesis '(', dollar sign '$', etc. Author: Laurenz Albe <laurenz.albe@cybertec.at> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at Backpatch-through: 13
This commit is contained in:
@ -773,8 +773,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
|
|||||||
int plen,
|
int plen,
|
||||||
elen;
|
elen;
|
||||||
bool afterescape = false;
|
bool afterescape = false;
|
||||||
bool incharclass = false;
|
|
||||||
int nquotes = 0;
|
int nquotes = 0;
|
||||||
|
int charclass_depth = 0; /* Nesting level of character classes,
|
||||||
|
* encompassed by square brackets */
|
||||||
|
int charclass_start = 0; /* State of the character class start,
|
||||||
|
* for carets */
|
||||||
|
|
||||||
p = VARDATA_ANY(pat_text);
|
p = VARDATA_ANY(pat_text);
|
||||||
plen = VARSIZE_ANY_EXHDR(pat_text);
|
plen = VARSIZE_ANY_EXHDR(pat_text);
|
||||||
@ -904,7 +907,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
|
|||||||
/* fast path */
|
/* fast path */
|
||||||
if (afterescape)
|
if (afterescape)
|
||||||
{
|
{
|
||||||
if (pchar == '"' && !incharclass) /* escape-double-quote? */
|
if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
|
||||||
{
|
{
|
||||||
/* emit appropriate part separator, per notes above */
|
/* emit appropriate part separator, per notes above */
|
||||||
if (nquotes == 0)
|
if (nquotes == 0)
|
||||||
@ -953,18 +956,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
|
|||||||
/* SQL escape character; do not send to output */
|
/* SQL escape character; do not send to output */
|
||||||
afterescape = true;
|
afterescape = true;
|
||||||
}
|
}
|
||||||
else if (incharclass)
|
else if (charclass_depth > 0)
|
||||||
{
|
{
|
||||||
if (pchar == '\\')
|
if (pchar == '\\')
|
||||||
*r++ = '\\';
|
*r++ = '\\';
|
||||||
*r++ = pchar;
|
*r++ = pchar;
|
||||||
if (pchar == ']')
|
|
||||||
incharclass = false;
|
/*
|
||||||
|
* Ignore a closing bracket at the start of a character class.
|
||||||
|
* Such a bracket is taken literally rather than closing the
|
||||||
|
* class. "charclass_start" is 1 right at the beginning of a
|
||||||
|
* class and 2 after an initial caret.
|
||||||
|
*/
|
||||||
|
if (pchar == ']' && charclass_start > 2)
|
||||||
|
charclass_depth--;
|
||||||
|
else if (pchar == '[')
|
||||||
|
charclass_depth++;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If there is a caret right after the opening bracket, it negates
|
||||||
|
* the character class, but a following closing bracket should
|
||||||
|
* still be treated as a normal character. That holds only for
|
||||||
|
* the first caret, so only the values 1 and 2 mean that closing
|
||||||
|
* brackets should be taken literally.
|
||||||
|
*/
|
||||||
|
if (pchar == '^')
|
||||||
|
charclass_start++;
|
||||||
|
else
|
||||||
|
charclass_start = 3; /* definitely past the start */
|
||||||
}
|
}
|
||||||
else if (pchar == '[')
|
else if (pchar == '[')
|
||||||
{
|
{
|
||||||
|
/* start of a character class */
|
||||||
*r++ = pchar;
|
*r++ = pchar;
|
||||||
incharclass = true;
|
charclass_depth++;
|
||||||
|
charclass_start = 1;
|
||||||
}
|
}
|
||||||
else if (pchar == '%')
|
else if (pchar == '%')
|
||||||
{
|
{
|
||||||
|
@ -614,6 +614,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
|
|||||||
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
|
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
|
||||||
ERROR: invalid escape string
|
ERROR: invalid escape string
|
||||||
HINT: Escape string must be empty or one character.
|
HINT: Escape string must be empty or one character.
|
||||||
|
-- Characters that should be left alone in character classes when a
|
||||||
|
-- SIMILAR TO regexp pattern is converted to POSIX style.
|
||||||
|
-- Underscore "_"
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
|
||||||
|
QUERY PLAN
|
||||||
|
------------------------------------------------
|
||||||
|
Seq Scan on text_tbl
|
||||||
|
Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
|
-- Percentage "%"
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
|
||||||
|
QUERY PLAN
|
||||||
|
--------------------------------------------------
|
||||||
|
Seq Scan on text_tbl
|
||||||
|
Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
|
-- Dot "."
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
|
||||||
|
QUERY PLAN
|
||||||
|
--------------------------------------------------
|
||||||
|
Seq Scan on text_tbl
|
||||||
|
Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
|
-- Dollar "$"
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
|
||||||
|
QUERY PLAN
|
||||||
|
--------------------------------------------------
|
||||||
|
Seq Scan on text_tbl
|
||||||
|
Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
|
-- Opening parenthesis "("
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
|
||||||
|
ERROR: invalid regular expression: parentheses () not balanced
|
||||||
|
-- Caret "^"
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
|
||||||
|
QUERY PLAN
|
||||||
|
------------------------------------------------------------------------
|
||||||
|
Seq Scan on text_tbl
|
||||||
|
Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
|
-- Closing square bracket "]" at the beginning of character class
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
|
||||||
|
QUERY PLAN
|
||||||
|
------------------------------------------------
|
||||||
|
Seq Scan on text_tbl
|
||||||
|
Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
|
-- Closing square bracket effective after two carets at the beginning
|
||||||
|
-- of character class.
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
|
||||||
|
QUERY PLAN
|
||||||
|
---------------------------------------
|
||||||
|
Seq Scan on text_tbl
|
||||||
|
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
-- Test backslash escapes in regexp_replace's replacement string
|
-- Test backslash escapes in regexp_replace's replacement string
|
||||||
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
|
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
|
||||||
regexp_replace
|
regexp_replace
|
||||||
|
@ -197,6 +197,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
|
|||||||
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
|
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
|
||||||
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
|
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
|
||||||
|
|
||||||
|
-- Characters that should be left alone in character classes when a
|
||||||
|
-- SIMILAR TO regexp pattern is converted to POSIX style.
|
||||||
|
-- Underscore "_"
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
|
||||||
|
-- Percentage "%"
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
|
||||||
|
-- Dot "."
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
|
||||||
|
-- Dollar "$"
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
|
||||||
|
-- Opening parenthesis "("
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
|
||||||
|
-- Caret "^"
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
|
||||||
|
-- Closing square bracket "]" at the beginning of character class
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
|
||||||
|
-- Closing square bracket effective after two carets at the beginning
|
||||||
|
-- of character class.
|
||||||
|
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
|
||||||
|
|
||||||
-- Test backslash escapes in regexp_replace's replacement string
|
-- Test backslash escapes in regexp_replace's replacement string
|
||||||
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
|
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
|
||||||
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
|
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
|
||||||
|
Reference in New Issue
Block a user