1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-15 19:21:59 +03:00

Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within
bracket expressions.  There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.

Also, invent "word" as an allowed character class name, so that "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets.  POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs()
to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors
involved.

* Get rid of useless-as-far-as-I-can-see calls of element()
on single-character character element names in brackpart().
element() always maps these to the character itself, and things
would be quite broken if it didn't --- should "[a]" match something
different than "a" does?  Besides, the shortcut path in brackpart()
wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
This commit is contained in:
Tom Lane
2021-02-25 13:00:40 -05:00
parent 6b40d9bdbd
commit 2a0af7fe46
10 changed files with 677 additions and 276 deletions

View File

@@ -1970,6 +1970,256 @@ select * from test_regex('a[\w]b', 'axb', 'LPE');
{axb}
(2 rows)
-- these should be invalid
select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
ERROR: invalid regular expression: invalid character range
select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
ERROR: invalid regular expression: invalid character range
select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
ERROR: invalid regular expression: invalid character range
select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
ERROR: invalid regular expression: invalid character range
-- test complemented char classes within brackets
select * from test_regex('[\D]', '0123456789abc*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{a}
(2 rows)
select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{0}
(2 rows)
select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{1}
(2 rows)
select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{1}
(2 rows)
select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{2}
(2 rows)
select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{2}
(2 rows)
select * from test_regex('\W', '0123456789abc_*', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{*}
(2 rows)
select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{*}
(2 rows)
select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
test_regex
--------------------------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UEMPTYMATCH}
{"012 3456789abc_*"}
(2 rows)
-- check char classes' handling of newlines
select * from test_regex('\s+', E'abc \n def', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{" +
"}
(2 rows)
select * from test_regex('\s+', E'abc \n def', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{" +
"}
(2 rows)
select * from test_regex('[\s]+', E'abc \n def', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{" +
"}
(2 rows)
select * from test_regex('[\s]+', E'abc \n def', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{" +
"}
(2 rows)
select * from test_regex('\S+', E'abc\ndef', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('\S+', E'abc\ndef', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('\d+', E'012\n345', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{012}
(2 rows)
select * from test_regex('\d+', E'012\n345', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{012}
(2 rows)
select * from test_regex('[\d]+', E'012\n345', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{012}
(2 rows)
select * from test_regex('[\d]+', E'012\n345', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{012}
(2 rows)
select * from test_regex('\D+', E'abc\ndef345', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{"abc +
def"}
(2 rows)
select * from test_regex('\D+', E'abc\ndef345', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{"abc +
def"}
(2 rows)
select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('\w+', E'abc_012\ndef', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc_012}
(2 rows)
select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc_012}
(2 rows)
select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc_012}
(2 rows)
select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc_012}
(2 rows)
select * from test_regex('\W+', E'***\n@@@___', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{"*** +
@@@"}
(2 rows)
select * from test_regex('\W+', E'***\n@@@___', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{***}
(2 rows)
select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{"*** +
@@@"}
(2 rows)
select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{***}
(2 rows)
-- doing 13 "escapes"
-- expectError 13.1 & "a\\" EESCAPE
select * from test_regex('a\', '', '');