1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-31 22:04:40 +03:00

Support PG_UNICODE_FAST locale in the builtin collation provider.

The PG_UNICODE_FAST locale uses code point sort order (fast,
memcmp-based) combined with Unicode character semantics. The character
semantics are based on Unicode full case mapping.

Full case mapping can map a single codepoint to multiple codepoints,
such as "ß" uppercasing to "SS". Additionally, it handles
context-sensitive mappings like the "final sigma", and it uses
titlecase mappings such as "Dž" when titlecasing (rather than plain
uppercase mappings).

Importantly, the uppercasing of "ß" as "SS" is specifically mentioned
by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics
for case mapping and pattern matching, so if we changed it to use the
PG_UNICODE_FAST locale, it would offer better compliance with the
standard. For now, though, do not change the behavior of UCS_BASIC.

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
This commit is contained in:
Jeff Davis
2025-01-17 15:56:30 -08:00
parent 286a365b9c
commit d3d0983169
13 changed files with 283 additions and 16 deletions

View File

@ -377,8 +377,9 @@ initdb --locale-provider=icu --icu-locale=en
<listitem>
<para>
The <literal>builtin</literal> provider uses built-in operations. Only
the <literal>C</literal> and <literal>C.UTF-8</literal> locales are
supported for this provider.
the <literal>C</literal>, <literal>C.UTF-8</literal>, and
<literal>PG_UNICODE_FAST</literal> locales are supported for this
provider.
</para>
<para>
The <literal>C</literal> locale behavior is identical to the
@ -392,6 +393,13 @@ initdb --locale-provider=icu --icu-locale=en
regular expression character classes are based on the "POSIX
Compatible" semantics, and the case mapping is the "simple" variant.
</para>
<para>
The <literal>PG_UNICODE_FAST</literal> locale is available only when
the database encoding is <literal>UTF-8</literal>, and the behavior is
based on Unicode. The collation uses the code point values only. The
regular expression character classes are based on the "Standard"
semantics, and the case mapping is the "full" variant.
</para>
</listitem>
</varlistentry>
@ -886,6 +894,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
</listitem>
</varlistentry>
<varlistentry>
<term><literal>pg_unicode_fast</literal></term>
<listitem>
<para>
This collation sorts by Unicode code point values rather than natural
language order. For the functions <function>lower</function>,
<function>initcap</function>, and <function>upper</function> it uses
Unicode full case mapping. For pattern matching (including regular
expressions), it uses the Standard variant of Unicode <ulink
url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility
Properties</ulink>. Behavior is efficient and stable within a
<productname>PostgreSQL</productname> major version. It is only
available for encoding <literal>UTF8</literal>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>pg_c_utf8</literal></term>
<listitem>

View File

@ -99,7 +99,8 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace
<para>
If <replaceable>provider</replaceable> is <literal>builtin</literal>,
then <replaceable>locale</replaceable> must be specified and set to
either <literal>C</literal> or <literal>C.UTF-8</literal>.
one of <literal>C</literal>, <literal>C.UTF-8</literal>, or
<literal>PG_UNICODE_FAST</literal>.
</para>
</listitem>
</varlistentry>

View File

@ -168,7 +168,8 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
If <xref linkend="create-database-locale-provider"/> is
<literal>builtin</literal>, then <replaceable>locale</replaceable> or
<replaceable>builtin_locale</replaceable> must be specified and set to
either <literal>C</literal> or <literal>C.UTF-8</literal>.
one of <literal>C</literal>, <literal>C.UTF-8</literal>, or
<literal>PG_UNICODE_FAST</literal>.
</para>
<tip>
<para>
@ -233,7 +234,8 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
</para>
<para>
The locales available for the <literal>builtin</literal> provider are
<literal>C</literal> and <literal>C.UTF-8</literal>.
<literal>C</literal>, <literal>C.UTF-8</literal> and
<literal>PG_UNICODE_FAST</literal>.
</para>
</listitem>
</varlistentry>

View File

@ -295,8 +295,8 @@ PostgreSQL documentation
<para>
If <option>--locale-provider</option> is <literal>builtin</literal>,
<option>--locale</option> or <option>--builtin-locale</option> must be
specified and set to <literal>C</literal> or
<literal>C.UTF-8</literal>.
specified and set to <literal>C</literal>, <literal>C.UTF-8</literal>
or <literal>PG_UNICODE_FAST</literal>.
</para>
</listitem>
</varlistentry>

View File

@ -307,7 +307,7 @@ pg_wc_isdigit(pg_wchar c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISDIGIT));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_isdigit(c, true);
return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
@ -361,7 +361,7 @@ pg_wc_isalnum(pg_wchar c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALNUM));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_isalnum(c, true);
return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
@ -505,7 +505,7 @@ pg_wc_ispunct(pg_wchar c)
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPUNCT));
case PG_REGEX_STRATEGY_BUILTIN:
return pg_u_ispunct(c, true);
return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);

View File

@ -1590,8 +1590,11 @@ builtin_locale_encoding(const char *locale)
{
if (strcmp(locale, "C") == 0)
return -1;
if (strcmp(locale, "C.UTF-8") == 0)
else if (strcmp(locale, "C.UTF-8") == 0)
return PG_UTF8;
else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
return PG_UTF8;
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@ -1616,6 +1619,8 @@ builtin_validate_locale(int encoding, const char *locale)
canonical_name = "C";
else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
canonical_name = "C.UTF-8";
else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
canonical_name = "PG_UNICODE_FAST";
if (!canonical_name)
ereport(ERROR,

View File

@ -78,7 +78,8 @@ size_t
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
return unicode_strlower(dest, destsize, src, srclen, false);
return unicode_strlower(dest, destsize, src, srclen,
locale->info.builtin.casemap_full);
}
size_t
@ -93,7 +94,8 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
.prev_alnum = false,
};
return unicode_strtitle(dest, destsize, src, srclen, false,
return unicode_strtitle(dest, destsize, src, srclen,
locale->info.builtin.casemap_full,
initcap_wbnext, &wbstate);
}
@ -101,7 +103,8 @@ size_t
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
return unicode_strupper(dest, destsize, src, srclen, false);
return unicode_strupper(dest, destsize, src, srclen,
locale->info.builtin.casemap_full);
}
pg_locale_t
@ -142,6 +145,7 @@ create_pg_locale_builtin(Oid collid, MemoryContext context)
result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
result->info.builtin.locale = MemoryContextStrdup(context, locstr);
result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
result->provider = COLLPROVIDER_BUILTIN;
result->deterministic = true;
result->collate_is_c = true;
@ -164,6 +168,8 @@ get_collation_actual_version_builtin(const char *collcollate)
return "1";
else if (strcmp(collcollate, "C.UTF-8") == 0)
return "1";
else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
return "1";
else
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),

View File

@ -2489,6 +2489,8 @@ setlocales(void)
else if (strcmp(datlocale, "C.UTF-8") == 0 ||
strcmp(datlocale, "C.UTF8") == 0)
canonname = "C.UTF-8";
else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0)
canonname = "PG_UNICODE_FAST";
else
pg_fatal("invalid locale name \"%s\" for builtin provider",
datlocale);
@ -2782,7 +2784,9 @@ setup_locale_encoding(void)
if (locale_provider == COLLPROVIDER_BUILTIN)
{
if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
if ((strcmp(datlocale, "C.UTF-8") == 0 ||
strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
encodingid != PG_UTF8)
pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
datlocale, "UTF-8");
}

View File

@ -57,6 +57,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202501162
#define CATALOG_VERSION_NO 202501171
#endif

View File

@ -33,5 +33,8 @@
descr => 'sorts by Unicode code point; Unicode and POSIX character semantics',
collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6',
colllocale => 'C.UTF-8', collversion => '1' },
{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics',
collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6',
colllocale => 'PG_UNICODE_FAST', collversion => '1' },
]

View File

@ -108,6 +108,7 @@ struct pg_locale_struct
struct
{
const char *locale;
bool casemap_full;
} builtin;
locale_t lt;
#ifdef USE_ICU

View File

@ -160,3 +160,163 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
t
(1 row)
--
-- Test PG_UNICODE_FAST
--
CREATE COLLATION regress_pg_unicode_fast (
provider = builtin, locale = 'unicode'); -- fails
ERROR: invalid locale name "unicode" for builtin provider
CREATE COLLATION regress_pg_unicode_fast (
provider = builtin, locale = 'PG_UNICODE_FAST');
CREATE TABLE test_pg_unicode_fast (
t TEXT COLLATE PG_UNICODE_FAST
);
INSERT INTO test_pg_unicode_fast VALUES
('abc DEF 123abc'),
('ábc sßs ßss DÉF'),
('DŽxxDŽ džxxDž Džxxdž'),
('ȺȺȺ'),
('ⱥⱥⱥ'),
('ⱥȺ');
SELECT
t, lower(t), initcap(t), upper(t),
length(convert_to(t, 'UTF8')) AS t_bytes,
length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
FROM test_pg_unicode_fast;
t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes
-----------------+-----------------+------------------+-------------------+---------+---------------+-----------------+---------------
abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19
DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20
ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
(6 rows)
DROP TABLE test_pg_unicode_fast;
-- test Final_Sigma
SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
lower
-------
ας
(1 row)
SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
lower
-------
ας0
(1 row)
SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
lower
-------
ἀς̓
(1 row)
SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
lower
-------
ᾳςͅ
(1 row)
-- test !Final_Sigma
SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
lower
-------
σ
(1 row)
SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
lower
-------
0σ
(1 row)
SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
lower
-------
ασα
(1 row)
SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
lower
-------
ἀσ̓α
(1 row)
SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
lower
-------
ᾳσͅα
(1 row)
-- properties
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
?column?
----------
t
(1 row)
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
-- case mapping
SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
?column?
----------
t
(1 row)
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed
?column?
----------
t
(1 row)

View File

@ -80,3 +80,63 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
--
-- Test PG_UNICODE_FAST
--
CREATE COLLATION regress_pg_unicode_fast (
provider = builtin, locale = 'unicode'); -- fails
CREATE COLLATION regress_pg_unicode_fast (
provider = builtin, locale = 'PG_UNICODE_FAST');
CREATE TABLE test_pg_unicode_fast (
t TEXT COLLATE PG_UNICODE_FAST
);
INSERT INTO test_pg_unicode_fast VALUES
('abc DEF 123abc'),
('ábc sßs ßss DÉF'),
('DŽxxDŽ džxxDž Džxxdž'),
('ȺȺȺ'),
('ⱥⱥⱥ'),
('ⱥȺ');
SELECT
t, lower(t), initcap(t), upper(t),
length(convert_to(t, 'UTF8')) AS t_bytes,
length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
FROM test_pg_unicode_fast;
DROP TABLE test_pg_unicode_fast;
-- test Final_Sigma
SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
-- test !Final_Sigma
SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
-- properties
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
-- case mapping
SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed