diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index f5e115e8d6e..00e1986849a 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -1197,7 +1197,7 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr to a performance penalty. Note, in particular, that B-tree cannot use deduplication with indexes that use a nondeterministic collation. Also, certain operations are not possible with nondeterministic collations, - such as pattern matching operations. Therefore, they should be used + such as some pattern matching operations. Therefore, they should be used only in cases where they are specifically wanted. diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 13ccbe7d78c..8b81106fa23 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -5414,9 +5414,10 @@ cast(-44 as bit(12)) 111111010100 - The pattern matching operators of all three kinds do not support - nondeterministic collations. If required, apply a different collation to - the expression to work around this limitation. + SIMILAR TO and POSIX-style regular + expressions do not support nondeterministic collations. If required, use + LIKE or apply a different collation to the expression + to work around this limitation. @@ -5462,6 +5463,46 @@ cast(-44 as bit(12)) 111111010100 + + LIKE pattern matching supports nondeterministic + collations (see ), such as + case-insensitive collations or collations that, say, ignore punctuation. + So with a case-insensitive collation, one could have: + +'AbC' LIKE 'abc' COLLATE case_insensitive true +'AbC' LIKE 'a%' COLLATE case_insensitive true + + With collations that ignore certain characters or in general that consider + strings of different lengths equal, the semantics can become a bit more + complicated. Consider these examples: + +'.foo.' LIKE 'foo' COLLATE ign_punct true +'.foo.' LIKE 'f_o' COLLATE ign_punct true +'.foo.' LIKE '_oo' COLLATE ign_punct false + + The way the matching works is that the pattern is partitioned into + sequences of wildcards and non-wildcard strings (wildcards being + _ and %). For example, the pattern + f_o is partitioned into f, _, o, the + pattern _oo is partitioned into _, + oo. The input string matches the pattern if it can be + partitioned in such a way that the wildcards match one character or any + number of characters respectively and the non-wildcard partitions are + equal under the applicable collation. So for example, '.foo.' + LIKE 'f_o' COLLATE ign_punct is true because one can partition + .foo. into .f, o, o., and then + '.f' = 'f' COLLATE ign_punct, 'o' + matches the _ wildcard, and 'o.' = 'o' COLLATE + ign_punct. But '.foo.' LIKE '_oo' COLLATE + ign_punct is false because .foo. cannot be + partitioned in a way that the first character is any character and the + rest of the string compares equal to oo. (Note that + the single-character wildcard always matches exactly one character, + independent of the collation. So in this example, the + _ would match ., but then the rest + of the input string won't match the rest of the pattern.) + + LIKE pattern matching always covers the entire string. Therefore, if it's desired to match a sequence anywhere within @@ -5503,8 +5544,9 @@ cast(-44 as bit(12)) 111111010100 The key word ILIKE can be used instead of - LIKE to make the match case-insensitive according - to the active locale. This is not in the SQL standard but is a + LIKE to make the match case-insensitive according to the + active locale. (But this does not support nondeterministic collations.) + This is not in the SQL standard but is a PostgreSQL extension. diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 0152723b2a6..7b3d1b5be71 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -147,22 +147,28 @@ SB_lower_char(unsigned char c, pg_locale_t locale) static inline int GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation) { - if (collation) - { - pg_locale_t locale = pg_newlocale_from_collation(collation); + pg_locale_t locale; - if (!locale->deterministic) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("nondeterministic collations are not supported for LIKE"))); + if (!OidIsValid(collation)) + { + /* + * This typically means that the parser could not resolve a conflict + * of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for LIKE"), + errhint("Use the COLLATE clause to set the collation explicitly."))); } + locale = pg_newlocale_from_collation(collation); + if (pg_database_encoding_max_length() == 1) - return SB_MatchText(s, slen, p, plen, 0); + return SB_MatchText(s, slen, p, plen, locale); else if (GetDatabaseEncoding() == PG_UTF8) - return UTF8_MatchText(s, slen, p, plen, 0); + return UTF8_MatchText(s, slen, p, plen, locale); else - return MB_MatchText(s, slen, p, plen, 0); + return MB_MatchText(s, slen, p, plen, locale); } static inline int diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c index f561cc15e4c..afe5406cf40 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -157,7 +157,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) * the first pattern byte to each text byte to avoid recursing * more than we have to. This fact also guarantees that we don't * have to consider a match to the zero-length substring at the - * end of the text. + * end of the text. With a nondeterministic collation, we can't + * rely on the first bytes being equal, so we have to recurse in + * any case. */ if (*p == '\\') { @@ -172,7 +174,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) while (tlen > 0) { - if (GETCHAR(*t, locale) == firstpat) + if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic)) { int matched = MatchText(t, tlen, p, plen, locale); @@ -196,6 +198,149 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) NextByte(p, plen); continue; } + else if (locale && !locale->deterministic) + { + /* + * For nondeterministic locales, we find the next substring of the + * pattern that does not contain wildcards and try to find a + * matching substring in the text. Crucially, we cannot do this + * character by character, as in the normal case, but must do it + * substring by substring, partitioned by the wildcard characters. + * (This is per SQL standard.) + */ + const char *p1; + size_t p1len; + const char *t1; + size_t t1len; + bool found_escape; + const char *subpat; + size_t subpatlen; + char *buf = NULL; + + /* + * Determine next substring of pattern without wildcards. p is + * the start of the subpattern, p1 is one past the last byte. Also + * track if we found an escape character. + */ + p1 = p; + p1len = plen; + found_escape = false; + while (p1len > 0) + { + if (*p1 == '\\') + { + found_escape = true; + NextByte(p1, p1len); + if (p1len == 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), + errmsg("LIKE pattern must not end with escape character"))); + } + else if (*p1 == '_' || *p1 == '%') + break; + NextByte(p1, p1len); + } + + /* + * If we found an escape character, then make an unescaped copy of + * the subpattern. + */ + if (found_escape) + { + char *b; + + b = buf = palloc(p1 - p); + for (const char *c = p; c < p1; c++) + { + if (*c == '\\') + ; + else + *(b++) = *c; + } + + subpat = buf; + subpatlen = b - buf; + } + else + { + subpat = p; + subpatlen = p1 - p; + } + + /* + * Shortcut: If this is the end of the pattern, then the rest of + * the text has to match the rest of the pattern. + */ + if (p1len == 0) + { + int cmp; + + cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale); + + if (buf) + pfree(buf); + if (cmp == 0) + return LIKE_TRUE; + else + return LIKE_FALSE; + } + + /* + * Now build a substring of the text and try to match it against + * the subpattern. t is the start of the text, t1 is one past the + * last byte. We start with a zero-length string. + */ + t1 = t; + t1len = tlen; + for (;;) + { + int cmp; + + CHECK_FOR_INTERRUPTS(); + + cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale); + + /* + * If we found a match, we have to test if the rest of pattern + * can match against the rest of the string. Otherwise we + * have to continue here try matching with a longer substring. + * (This is similar to the recursion for the '%' wildcard + * above.) + * + * Note that we can't just wind forward p and t and continue + * with the main loop. This would fail for example with + * + * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents + * + * You'd find that t=\0061 matches p=\00E4, but then the rest + * won't match; but t=\0061\0308 also matches p=\00E4, and + * then the rest will match. + */ + if (cmp == 0) + { + int matched = MatchText(t1, t1len, p1, p1len, locale); + + if (matched == LIKE_TRUE) + { + if (buf) + pfree(buf); + return matched; + } + } + + /* + * Didn't match. If we used up the whole text, then the match + * fails. Otherwise, try again with a longer substring. + */ + if (t1len == 0) + return LIKE_FALSE; + else + NextChar(t1, t1len); + } + if (buf) + pfree(buf); + continue; + } else if (GETCHAR(*p, locale) != GETCHAR(*t, locale)) { /* non-wildcard pattern char fails to match text char */ diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 8b15509a3bf..ee71ca89ffd 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -272,22 +272,6 @@ match_pattern_prefix(Node *leftop, return NIL; patt = (Const *) rightop; - /* - * Not supported if the expression collation is nondeterministic. The - * optimized equality or prefix tests use bytewise comparisons, which is - * not consistent with nondeterministic collations. The actual - * pattern-matching implementation functions will later error out that - * pattern-matching is not supported with nondeterministic collations. (We - * could also error out here, but by doing it later we get more precise - * error messages.) (It should be possible to support at least - * Pattern_Prefix_Exact, but no point as long as the actual - * pattern-matching implementations don't support it.) - * - * expr_coll is not set for a non-collation-aware data type such as bytea. - */ - if (expr_coll && !get_collation_isdeterministic(expr_coll)) - return NIL; - /* * Try to extract a fixed prefix from the pattern. */ @@ -404,6 +388,8 @@ match_pattern_prefix(Node *leftop, { if (!op_in_opfamily(eqopr, opfamily)) return NIL; + if (indexcollation != expr_coll) + return NIL; expr = make_opclause(eqopr, BOOLOID, false, (Expr *) leftop, (Expr *) prefix, InvalidOid, indexcollation); @@ -411,6 +397,17 @@ match_pattern_prefix(Node *leftop, return result; } + /* + * Anything other than Pattern_Prefix_Exact is not supported if the + * expression collation is nondeterministic. The optimized equality or + * prefix tests use bytewise comparisons, which is not consistent with + * nondeterministic collations. + * + * expr_coll is not set for a non-collation-aware data type such as bytea. + */ + if (expr_coll && !get_collation_isdeterministic(expr_coll)) + return NIL; + /* * Otherwise, we have a nonempty required prefix of the values. Some * opclasses support prefix checks directly, otherwise we'll try to diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index de17f7db6ce..79e76d4b850 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1274,6 +1274,30 @@ CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true); NOTICE: using standard form "und" for ICU locale "" CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false); NOTICE: using standard form "und" for ICU locale "" +SELECT 'abc' LIKE 'abc' COLLATE ctest_det; + ?column? +---------- + t +(1 row) + +SELECT 'abc' LIKE 'a\bc' COLLATE ctest_det; + ?column? +---------- + t +(1 row) + +SELECT 'abc' LIKE 'abc' COLLATE ctest_nondet; + ?column? +---------- + t +(1 row) + +SELECT 'abc' LIKE 'a\bc' COLLATE ctest_nondet; + ?column? +---------- + t +(1 row) + CREATE TABLE test6 (a int, b text); -- same string in different normal forms INSERT INTO test6 VALUES (1, U&'\00E4bc'); @@ -1298,6 +1322,19 @@ SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet; 2 | äbc (2 rows) +SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_det; + a | b +---+----- + 1 | äbc +(1 row) + +SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_nondet; + a | b +---+----- + 1 | äbc + 2 | äbc +(2 rows) + -- same with arrays CREATE TABLE test6a (a int, b text[]); INSERT INTO test6a VALUES (1, ARRAY[U&'\00E4bc']); @@ -1514,7 +1551,12 @@ SELECT x FROM test3ci WHERE x <> 'abc'; (2 rows) SELECT x FROM test3ci WHERE x LIKE 'a%'; -ERROR: nondeterministic collations are not supported for LIKE + x +----- + abc + ABC +(2 rows) + SELECT x FROM test3ci WHERE x ILIKE 'a%'; ERROR: nondeterministic collations are not supported for ILIKE SELECT x FROM test3ci WHERE x SIMILAR TO 'a%'; @@ -1632,7 +1674,12 @@ SELECT x FROM test3bpci WHERE x <> 'abc'; (2 rows) SELECT x FROM test3bpci WHERE x LIKE 'a%'; -ERROR: nondeterministic collations are not supported for LIKE + x +----- + abc + ABC +(2 rows) + SELECT x FROM test3bpci WHERE x ILIKE 'a%'; ERROR: nondeterministic collations are not supported for ILIKE SELECT x FROM test3bpci WHERE x SIMILAR TO 'a%'; @@ -1729,7 +1776,7 @@ SELECT string_to_array('ABCDEFGHI'::char(9) COLLATE case_insensitive, NULL, 'b') -- This tests the issue described in match_pattern_prefix(). In the -- absence of that check, the case_insensitive tests below would -- return no rows where they should logically return one. -CREATE TABLE test4c (x text COLLATE "C"); +CREATE TABLE test4c (x text COLLATE case_insensitive); INSERT INTO test4c VALUES ('abc'); CREATE INDEX ON test4c (x); SET enable_seqscan = off; @@ -1743,10 +1790,18 @@ SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_sensitive; -- ok, no rows --- (0 rows) -SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive; -- error -ERROR: nondeterministic collations are not supported for LIKE -SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive; -- error -ERROR: nondeterministic collations are not supported for LIKE +SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive; -- ok + x +----- + abc +(1 row) + +SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive; -- ok + x +----- + abc +(1 row) + RESET enable_seqscan; -- Unicode special case: different variants of Greek lower case sigma. -- A naive implementation like citext that just does lower(x) = @@ -1840,6 +1895,126 @@ SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive; 1 | cote (1 row) +-- This is a tricky one. A naive implementation would first test +-- \00E4 matches \0061, which is true under ignore_accents, but then +-- the rest of the string won't match anymore. Therefore, the +-- algorithm has to test whether the rest of the string matches, and +-- if not try matching \00E4 against a longer substring like +-- \0061\0308, which will then work out. +SELECT U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- and in reverse: +SELECT U&'\00E4bc' LIKE U&'\0061\0308_c' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- inner % matches b: +SELECT U&'\0061\0308bc' LIKE U&'\00E4%c' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- inner %% matches b then zero: +SELECT U&'\0061\0308bc' LIKE U&'\00E4%%c' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- inner %% matches b then zero: +SELECT U&'cb\0061\0308' LIKE U&'c%%\00E4' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- trailing _ matches two codepoints that form one grapheme: +SELECT U&'cb\0061\0308' LIKE U&'cb_' COLLATE ignore_accents; + ?column? +---------- + f +(1 row) + +-- trailing __ matches two codepoints that form one grapheme: +SELECT U&'cb\0061\0308' LIKE U&'cb__' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- leading % matches zero: +SELECT U&'\0061\0308bc' LIKE U&'%\00E4bc' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- leading % matches zero (with later %): +SELECT U&'\0061\0308bc' LIKE U&'%\00E4%c' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- trailing % matches zero: +SELECT U&'\0061\0308bc' LIKE U&'\00E4bc%' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- trailing % matches zero (with previous %): +SELECT U&'\0061\0308bc' LIKE U&'\00E4%c%' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- _ versus two codepoints that form one grapheme: +SELECT U&'\0061\0308bc' LIKE U&'_bc' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- (actually this matches because) +SELECT U&'\0308bc' = 'bc' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- __ matches two codepoints that form one grapheme: +SELECT U&'\0061\0308bc' LIKE U&'__bc' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- _ matches one codepoint that forms half a grapheme: +SELECT U&'\0061\0308bc' LIKE U&'_\0308bc' COLLATE ignore_accents; + ?column? +---------- + t +(1 row) + +-- doesn't match because \00e4 doesn't match only \0308 +SELECT U&'\0061\0308bc' LIKE U&'_\00e4bc' COLLATE ignore_accents; + ?column? +---------- + f +(1 row) + +-- escape character at end of pattern +SELECT 'foox' LIKE 'foo\' COLLATE ignore_accents; +ERROR: LIKE pattern must not end with escape character -- foreign keys (mixing different nondeterministic collations not allowed) CREATE TABLE test10pk (x text COLLATE case_sensitive PRIMARY KEY); CREATE TABLE test10fk (x text COLLATE case_insensitive REFERENCES test10pk (x) ON UPDATE CASCADE ON DELETE CASCADE); -- error diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 0c9491c260e..797e93ac714 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -514,6 +514,12 @@ CREATE COLLATION testcoll_rulesx (provider = icu, locale = '', rules = '!!wrong! CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true); CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false); +SELECT 'abc' LIKE 'abc' COLLATE ctest_det; +SELECT 'abc' LIKE 'a\bc' COLLATE ctest_det; + +SELECT 'abc' LIKE 'abc' COLLATE ctest_nondet; +SELECT 'abc' LIKE 'a\bc' COLLATE ctest_nondet; + CREATE TABLE test6 (a int, b text); -- same string in different normal forms INSERT INTO test6 VALUES (1, U&'\00E4bc'); @@ -522,6 +528,9 @@ SELECT * FROM test6; SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_det; SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet; +SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_det; +SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_nondet; + -- same with arrays CREATE TABLE test6a (a int, b text[]); INSERT INTO test6a VALUES (1, ARRAY[U&'\00E4bc']); @@ -637,14 +646,14 @@ SELECT string_to_array('ABCDEFGHI'::char(9) COLLATE case_insensitive, NULL, 'b') -- This tests the issue described in match_pattern_prefix(). In the -- absence of that check, the case_insensitive tests below would -- return no rows where they should logically return one. -CREATE TABLE test4c (x text COLLATE "C"); +CREATE TABLE test4c (x text COLLATE case_insensitive); INSERT INTO test4c VALUES ('abc'); CREATE INDEX ON test4c (x); SET enable_seqscan = off; SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_sensitive; -- ok, no rows SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_sensitive; -- ok, no rows -SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive; -- error -SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive; -- error +SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive; -- ok +SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive; -- ok RESET enable_seqscan; -- Unicode special case: different variants of Greek lower case sigma. @@ -687,6 +696,46 @@ SELECT * FROM test4 WHERE b = 'cote' COLLATE ignore_accents; SELECT * FROM test4 WHERE b = 'Cote' COLLATE ignore_accents; -- still case-sensitive SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive; +-- This is a tricky one. A naive implementation would first test +-- \00E4 matches \0061, which is true under ignore_accents, but then +-- the rest of the string won't match anymore. Therefore, the +-- algorithm has to test whether the rest of the string matches, and +-- if not try matching \00E4 against a longer substring like +-- \0061\0308, which will then work out. +SELECT U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents; +-- and in reverse: +SELECT U&'\00E4bc' LIKE U&'\0061\0308_c' COLLATE ignore_accents; +-- inner % matches b: +SELECT U&'\0061\0308bc' LIKE U&'\00E4%c' COLLATE ignore_accents; +-- inner %% matches b then zero: +SELECT U&'\0061\0308bc' LIKE U&'\00E4%%c' COLLATE ignore_accents; +-- inner %% matches b then zero: +SELECT U&'cb\0061\0308' LIKE U&'c%%\00E4' COLLATE ignore_accents; +-- trailing _ matches two codepoints that form one grapheme: +SELECT U&'cb\0061\0308' LIKE U&'cb_' COLLATE ignore_accents; +-- trailing __ matches two codepoints that form one grapheme: +SELECT U&'cb\0061\0308' LIKE U&'cb__' COLLATE ignore_accents; +-- leading % matches zero: +SELECT U&'\0061\0308bc' LIKE U&'%\00E4bc' COLLATE ignore_accents; +-- leading % matches zero (with later %): +SELECT U&'\0061\0308bc' LIKE U&'%\00E4%c' COLLATE ignore_accents; +-- trailing % matches zero: +SELECT U&'\0061\0308bc' LIKE U&'\00E4bc%' COLLATE ignore_accents; +-- trailing % matches zero (with previous %): +SELECT U&'\0061\0308bc' LIKE U&'\00E4%c%' COLLATE ignore_accents; +-- _ versus two codepoints that form one grapheme: +SELECT U&'\0061\0308bc' LIKE U&'_bc' COLLATE ignore_accents; +-- (actually this matches because) +SELECT U&'\0308bc' = 'bc' COLLATE ignore_accents; +-- __ matches two codepoints that form one grapheme: +SELECT U&'\0061\0308bc' LIKE U&'__bc' COLLATE ignore_accents; +-- _ matches one codepoint that forms half a grapheme: +SELECT U&'\0061\0308bc' LIKE U&'_\0308bc' COLLATE ignore_accents; +-- doesn't match because \00e4 doesn't match only \0308 +SELECT U&'\0061\0308bc' LIKE U&'_\00e4bc' COLLATE ignore_accents; +-- escape character at end of pattern +SELECT 'foox' LIKE 'foo\' COLLATE ignore_accents; + -- foreign keys (mixing different nondeterministic collations not allowed) CREATE TABLE test10pk (x text COLLATE case_sensitive PRIMARY KEY); CREATE TABLE test10fk (x text COLLATE case_insensitive REFERENCES test10pk (x) ON UPDATE CASCADE ON DELETE CASCADE); -- error