mirror of
https://github.com/postgres/postgres.git
synced 2025-06-20 15:22:23 +03:00
Regexps like "(.){0}...\1" drew an "invalid backreference number". That's not unreasonable on its face, since the capture group will never be matched if it's iterated zero times. However, other engines such as Perl's don't complain about this, nor do we throw an error for related cases such as "(.)|\1", even though that backref can never succeed either. Also, if the zero-iterations case happens at runtime rather than compile time --- say, "(x)*...\1" when there's no "x" to be found --- that's not an error, we just deem the backref to not match. Making this even less defensible, no error was thrown for nested cases such as "((.)){0}...\2"; and to add insult to injury, those cases could result in assertion failures instead. (It seems that nothing especially bad happened in non-assert builds, though.) Let's just fix it so that no error is thrown and instead the backref is deemed to never match, so that compile-time detection of no iterations behaves the same as run-time detection. Per report from Mark Dilger. This appears to be an aboriginal error in Spencer's library, so back-patch to all supported versions. Pre-v14, it turns out to also be necessary to back-patch one aspect of commits cb76fbd7e/00116dee5, namely to create capture-node subREs with the begin/end states of their subexpressions, not the current lp/rp of the outer parseqatom invocation. Otherwise delsub complains that we're trying to disconnect a state from itself. This is a bit scary but code examination shows that it's safe: in the pre-v14 code, if we want to wrap iteration around the subexpression, the first thing we do is overwrite the atom's begin/end fields with new states. So the bogus values didn't survive long enough to be used for anything, except if no iteration is required, in which case it doesn't matter. Discussion: https://postgr.es/m/A099E4A8-4377-4C64-A98C-3DEDDC075502@enterprisedb.com
151 lines
5.6 KiB
SQL
151 lines
5.6 KiB
SQL
--
-- Regular expression tests
--

-- Don't want to have to double backslashes in regexes.
-- With standard_conforming_strings on, a backslash inside an ordinary
-- '...' literal is an ordinary character, so escapes such as \1 or \w
-- reach the regex engine exactly as written below.
set standard_conforming_strings = on;

-- Test simple quantified backrefs
-- The pattern accepts only a string consisting of one character from
-- [bc] followed by zero or more copies of that same character.  The
-- column alias (t or f) records the expected result of each match.
select 'bbbbb' ~ '^([bc])\1*$' as t;
select 'ccc' ~ '^([bc])\1*$' as t;
select 'xxx' ~ '^([bc])\1*$' as f;
select 'bbc' ~ '^([bc])\1*$' as f;
select 'b' ~ '^([bc])\1*$' as t;

-- Test quantified backref within a larger expression
-- The backref sits inside a quantified group: the captured text must be
-- repeated, separated by single spaces, one or more times.  The column
-- alias (t or f) records the expected result.
-- First with a capture restricted to word characters ...
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
-- ... then with a capture that may contain any characters.
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;

-- Test some cases that crashed in 9.2beta1 due to pmatch[] array overrun
-- Each pattern nests capture groups inside a quantified group.
-- substring(... from pattern) returns the text matched by the first
-- parenthesized subexpression.
select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');
select substring('a' from '((a))+');
select substring('a' from '((a)+)');

-- Test regexp_match()
-- Empty pattern
select regexp_match('abc', '');
-- Pattern without capture groups
select regexp_match('abc', 'bc');
-- No match: the result is checked with "is null"
select regexp_match('abc', 'd') is null;
-- Two capture groups, matched case-insensitively via the 'i' flag
select regexp_match('abc', '(B)(c)', 'i');
-- The 'g' (global) flag is not accepted by regexp_match()
select regexp_match('abc', 'Bd', 'ig'); -- error

-- Test lookahead constraints
-- (?=...) is a positive lookahead, (?!...) a negative lookahead.
-- Positive lookahead directly after the first atom
select regexp_matches('ab', 'a(?=b)b*');
select regexp_matches('a', 'a(?=b)b*');
-- Two positive lookaheads in sequence
select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
-- Negative lookahead
select regexp_matches('ab', 'a(?!b)b*');
select regexp_matches('a', 'a(?!b)b*');
-- Lookahead at the very start of the pattern
select regexp_matches('b', '(?=b)b');
select regexp_matches('a', '(?=b)b');

-- Test lookbehind constraints
-- (?<=...) is a positive lookbehind, (?<!...) a negative lookbehind.
-- Positive lookbehind
select regexp_matches('abb', '(?<=a)b*');
select regexp_matches('a', 'a(?<=a)b*');
select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*');
select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*');
-- Negative lookbehind
select regexp_matches('ab', 'a*(?<!a)b*');
select regexp_matches('ab', 'a*(?<!a)b+');
select regexp_matches('b', 'a*(?<!a)b+');
select regexp_matches('a', 'a(?<!a)b*');
-- Lookbehind at the very start of the pattern
select regexp_matches('b', '(?<=b)b');
-- Lookbehind text of different lengths before the same 'b'
select regexp_matches('foobar', '(?<=f)b+');
select regexp_matches('foobar', '(?<=foo)b+');
select regexp_matches('foobar', '(?<=oo)b+');

-- Test optimization of single-chr-or-bracket-expression lookaround constraints
-- Every constraint below contains just the bracket expression [xy].
-- Positive lookahead
select 'xz' ~ 'x(?=[xy])';
select 'xy' ~ 'x(?=[xy])';
-- Negative lookahead, including the case where 'x' is the last character
select 'xz' ~ 'x(?![xy])';
select 'xy' ~ 'x(?![xy])';
select 'x' ~ 'x(?![xy])';
-- Positive lookbehind
select 'xyy' ~ '(?<=[xy])yy+';
select 'zyy' ~ '(?<=[xy])yy+';
-- Negative lookbehind
select 'xyy' ~ '(?<![xy])yy+';
select 'zyy' ~ '(?<![xy])yy+';

-- Test conversion of regex patterns to indexable conditions
-- Only the plan shape matters here, so cost estimates are suppressed
-- with (costs off).
-- Unanchored pattern
explain (costs off) select * from pg_proc where proname ~ 'abc';
-- Left-anchored patterns with a fixed prefix
explain (costs off) select * from pg_proc where proname ~ '^abc';
explain (costs off) select * from pg_proc where proname ~ '^abc$';
-- Fixed prefix followed by a quantified character
explain (costs off) select * from pg_proc where proname ~ '^abcd*e';
explain (costs off) select * from pg_proc where proname ~ '^abc+d';
-- Prefix wrapped in capture groups
explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
-- Prefix followed by an alternation containing a lookahead constraint
explain (costs off) select * from pg_proc where proname ~ '^abcd(x|(?=\w\w)q)';

-- Test for infinite loop in pullback() (CVE-2007-4772)
-- A quantified group whose alternatives are only the ^ and $ constraints
select 'a' ~ '($|^)*';

-- These cases expose a bug in the original fix for CVE-2007-4772
select 'a' ~ '(^)+^';
select 'a' ~ '$($$)+';

-- More cases of infinite loop in pullback(), not fixed by CVE-2007-4772 fix
-- Quantified groups that contain only constraints: ^ and $ together ...
select 'a' ~ '($^)+';
select 'a' ~ '(^$)*';
-- ... and ^ combined with one or more negative lookaheads.
select 'aa bb cc' ~ '(^(?!aa))+';
select 'aa x' ~ '(^(?!aa)(?!bb)(?!cc))+';
select 'bb x' ~ '(^(?!aa)(?!bb)(?!cc))+';
select 'cc x' ~ '(^(?!aa)(?!bb)(?!cc))+';
select 'dd x' ~ '(^(?!aa)(?!bb)(?!cc))+';

-- Test for infinite loop in fixempties() (Tcl bugs 3604074, 3606683)
-- Six levels of nested quantified groups, each of which can match the
-- empty string: first via *, then via an empty alternative.
select 'a' ~ '((((((a)*)*)*)*)*)*';
select 'a' ~ '((((((a+|)+|)+|)+|)+|)+|)';

-- These cases used to give too-many-states failures
-- The patterns quantify constraint escapes (\m, \M, \Y) or nest groups
-- with empty alternatives around ^ and $; each must now compile and run
-- to completion.
select 'x' ~ 'abcd(\m)+xyz';
select 'a' ~ '^abcd*(((((^(a c(e?d)a+|)+|)+|)+|)+|a)+|)';
select 'x' ~ 'a^(^)bcd*xy(((((($a+|)+|)+|)+$|)+|)+|)^$';
select 'x' ~ 'xyz(\Y\Y)+';
select 'x' ~ 'x|(?:\M)+';

-- This generates O(N) states but O(N^2) arcs, so it causes problems
-- if arc count is not constrained
-- The pattern is 'x*y*z*' repeated 1000 times, i.e. 3000 consecutive
-- optional atoms.
select 'x' ~ repeat('x*y*z*', 1000);

-- Test backref in combination with non-greedy quantifier
-- https://core.tcl.tk/tcl/tktview/6585b21ca8fa6f3678d442b97241fdd43dba2ec0
-- A word character, a non-greedy run of anything, then the same
-- character again; expected to match (alias t).
select 'Programmer' ~ '(\w).*?\1' as t;
-- Same idea with the non-greedy part captured, returning every match ('g')
select regexp_matches('Programmer', '(\w)(.*?\1)', 'g');

-- Test for proper matching of non-greedy iteration (bug #11478)
-- Three non-greedy slash-free components; the third, together with its
-- leading slash, is optional.
select regexp_matches('foo/bar/baz',
                      '^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', '');

-- Test that greediness can be overridden by outer quantifier
-- {1,1} and {1,1}? both mean "exactly once", so wrapping a group in one
-- of them changes only whether that group is treated as greedy or
-- non-greedy.
-- Inner quantifier greedy: (l*)
select regexp_matches('llmmmfff', '^(l*)(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*){1,1}(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*){1,1}?(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*){1,1}?(.*){1,1}?(f*)$');
-- Inner quantifier non-greedy: (l*?)
select regexp_matches('llmmmfff', '^(l*?)(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*?){1,1}(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*)(f*)$');
select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*){1,1}?(f*)$');

-- Test for infinite loop in cfindloop with zero-length possible match
-- but no actual match (can only happen in the presence of backrefs)
-- Backref in a different alternation branch from the group it refers to
select 'a' ~ '$()|^\1';
select 'a' ~ '.. ()|\1';
-- Backref to an empty capture group that is itself quantified
select 'a' ~ '()*\1';
select 'a' ~ '()+\1';

-- Test incorrect removal of capture groups within {0}
-- A backref to a group quantified with {0} must not raise an error; per
-- the column aliases it simply fails to match (f).  This holds for a
-- nested group as well.
select 'xxx' ~ '(.){0}(\1)' as f;
select 'xxx' ~ '((.)){0}(\2)' as f;
-- Here the backref is itself quantified with {0}, so the pattern can
-- match (t).
select 'xyz' ~ '((.)){0}(\2){0}' as t;

-- Test ancient oversight in when to apply zaptreesubs
-- Both are expected not to match (alias f).
-- The second branch uses \1, but group 1 belongs to the first branch
select 'abcdef' ~ '^(.)\1|\1.' as f;
-- The trailing \2 refers to a group that exists in only one alternative
-- of the preceding alternation
select 'abadef' ~ '^((.)\2|..)\2' as f;

-- Error conditions
-- Each statement below is expected to fail at regex compile time.
select 'xyz' ~ 'x(\w)(?=\1)'; -- no backrefs in LACONs
-- Same, with the backref wrapped in a capture group inside the lookahead
select 'xyz' ~ 'x(\w)(?=(\1))';
select 'a' ~ '\x7fffffff'; -- invalid chr code