mirror of
https://github.com/postgres/postgres.git
synced 2025-12-10 14:22:35 +03:00
Fix SQL-style substring() to have spec-compliant greediness behavior.
SQL's regular-expression substring() function is defined to have a pattern argument that's separated into three subpatterns by escape- double-quote markers; the function result is the part of the input matching the second subpattern. The standard makes it clear that if there is ambiguity about how to match the input to the subpatterns, the first and third subpatterns should be taken to match the smallest possible amount of text (i.e., they're "non greedy", in the terms of our regex code). We were not doing it that way: the first subpattern would eat the largest possible amount of text, causing the function result to be shorter than what the spec requires. Fix that by attaching explicit greediness quantifiers to the subpatterns. (This depends on the regex fix in commit 8a29ed053; before that, this didn't reliably change the regex engine's behavior.) Also, by adding parentheses around each subpattern, we ensure that "|" (OR) in the subpatterns behave sanely. Previously, "|" in the first or third subpatterns didn't work. This patch also makes the function throw error if you write more than two escape-double-quote markers, and do something sane if you write just one, and document that behavior. Previously, an odd number of markers led to a confusing complaint about unbalanced parentheses, while extra pairs of markers were just ignored. (Note that the spec requires exactly two markers, but we've historically allowed there to be none, and this patch preserves the old behavior for that case.) In passing, adjust some substring() test cases that didn't really prove what they said they were testing for: they used patterns that didn't match the data string, so that the output would be NULL whether or not the function was really strict. Although this is certainly a bug fix, changing the behavior in back branches seems undesirable: applications could perhaps be depending on the old behavior, since it's not obviously wrong unless you read the spec very closely. Hence, no back-patch. Discussion: https://postgr.es/m/5bb27a41-350d-37bf-901e-9d26f5592dd0@charter.net
This commit is contained in:
@@ -313,7 +313,7 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- T581 regular expression substring (with SQL99's bizarre regexp syntax)
|
||||
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
|
||||
bcd
|
||||
-----
|
||||
@@ -328,13 +328,13 @@ SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
|
||||
(1 row)
|
||||
|
||||
-- Null inputs should return NULL
|
||||
SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True";
|
||||
SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
|
||||
True
|
||||
------
|
||||
t
|
||||
(1 row)
|
||||
|
||||
SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True";
|
||||
SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
|
||||
True
|
||||
------
|
||||
t
|
||||
@@ -346,8 +346,57 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- PostgreSQL extension to allow omitting the escape character;
|
||||
-- here the regexp is taken as Posix syntax
|
||||
-- The first and last parts should act non-greedy
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
|
||||
bcdef
|
||||
-------
|
||||
bcdef
|
||||
(1 row)
|
||||
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
|
||||
abcdefg
|
||||
---------
|
||||
abcdefg
|
||||
(1 row)
|
||||
|
||||
-- Vertical bar in any part affects only that part
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
|
||||
bcdef
|
||||
-------
|
||||
bcdef
|
||||
(1 row)
|
||||
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
|
||||
bcdef
|
||||
-------
|
||||
bcdef
|
||||
(1 row)
|
||||
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
|
||||
bcdef
|
||||
-------
|
||||
bcdef
|
||||
(1 row)
|
||||
|
||||
-- Can't have more than two part separators
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
|
||||
ERROR: SQL regular expression may not contain more than two escape-double-quote separators
|
||||
CONTEXT: SQL function "substring" statement 1
|
||||
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
|
||||
bcdefg
|
||||
--------
|
||||
bcdefg
|
||||
(1 row)
|
||||
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
|
||||
abcdefg
|
||||
---------
|
||||
abcdefg
|
||||
(1 row)
|
||||
|
||||
-- substring() with just two arguments is not allowed by SQL spec;
|
||||
-- we accept it, but we interpret the pattern as a POSIX regexp not SQL
|
||||
SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
|
||||
cde
|
||||
-----
|
||||
|
||||
@@ -110,19 +110,35 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890";
|
||||
|
||||
SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
|
||||
|
||||
-- T581 regular expression substring (with SQL99's bizarre regexp syntax)
|
||||
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
|
||||
|
||||
-- No match should return NULL
|
||||
SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
|
||||
|
||||
-- Null inputs should return NULL
|
||||
SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True";
|
||||
SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True";
|
||||
SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
|
||||
SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
|
||||
SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
|
||||
|
||||
-- PostgreSQL extension to allow omitting the escape character;
|
||||
-- here the regexp is taken as Posix syntax
|
||||
-- The first and last parts should act non-greedy
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
|
||||
|
||||
-- Vertical bar in any part affects only that part
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
|
||||
|
||||
-- Can't have more than two part separators
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
|
||||
|
||||
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
|
||||
SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
|
||||
|
||||
-- substring() with just two arguments is not allowed by SQL spec;
|
||||
-- we accept it, but we interpret the pattern as a POSIX regexp not SQL
|
||||
SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
|
||||
|
||||
-- With a parenthesized subexpression, return only what matches the subexpr
|
||||
|
||||
Reference in New Issue
Block a user