mirror of
https://github.com/postgres/postgres.git
synced 2025-05-31 03:21:24 +03:00
Fix INITCAP() word boundaries for PG_UNICODE_FAST.
Word boundaries are based on whether a character is alphanumeric or not. For the PG_UNICODE_FAST collation, alphanumeric includes non-ASCII digits, whereas for the PG_C_UTF8 collation it only includes the digits 0-9. Pass down the right information from the pg_locale_t into initcap_wbnext to differentiate the behavior.

Reported-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Noah Misch <noah@leadboat.com>
Discussion: https://postgr.es/m/20250417135841.33.nmisch@google.com
This commit is contained in:
parent
80b727eb9d
commit
90260e2ec6
@ -40,6 +40,7 @@ struct WordBoundaryState
|
||||
const char *str;
|
||||
size_t len;
|
||||
size_t offset;
|
||||
bool posix;
|
||||
bool init;
|
||||
bool prev_alnum;
|
||||
};
|
||||
@ -58,7 +59,7 @@ initcap_wbnext(void *state)
|
||||
{
|
||||
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
|
||||
wbstate->offset);
|
||||
-bool curr_alnum = pg_u_isalnum(u, true);
|
||||
+bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
|
||||
|
||||
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
|
||||
{
|
||||
@ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
.str = src,
|
||||
.len = srclen,
|
||||
.offset = 0,
|
||||
.posix = !locale->info.builtin.casemap_full,
|
||||
.init = false,
|
||||
.prev_alnum = false,
|
||||
};
|
||||
|
@ -41,6 +41,7 @@ struct WordBoundaryState
|
||||
const char *str;
|
||||
size_t len;
|
||||
size_t offset;
|
||||
bool posix;
|
||||
bool init;
|
||||
bool prev_alnum;
|
||||
};
|
||||
@ -55,7 +56,7 @@ initcap_wbnext(void *state)
|
||||
{
|
||||
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
|
||||
wbstate->offset);
|
||||
-bool curr_alnum = pg_u_isalnum(u, true);
|
||||
+bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
|
||||
|
||||
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
|
||||
{
|
||||
@ -112,10 +113,13 @@ icu_test_full(char *str)
|
||||
char icu_upper[BUFSZ];
|
||||
char icu_fold[BUFSZ];
|
||||
UErrorCode status;
|
||||
|
||||
/* full case mapping doesn't use posix semantics */
|
||||
struct WordBoundaryState wbstate = {
|
||||
.str = str,
|
||||
.len = strlen(str),
|
||||
.offset = 0,
|
||||
.posix = false,
|
||||
.init = false,
|
||||
.prev_alnum = false,
|
||||
};
|
||||
@ -344,6 +348,12 @@ test_convert_case()
|
||||
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
|
||||
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
|
||||
test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
|
||||
/* test that alphanumerics are word characters */
|
||||
test_convert(tfunc_title, "λλ", "Λλ");
|
||||
test_convert(tfunc_title, "1a", "1a");
|
||||
/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
|
||||
test_convert(tfunc_title, "\uFF11a", "\uFF11a");
|
||||
|
||||
|
||||
#ifdef USE_ICU
|
||||
icu_test_full("");
|
||||
@ -354,6 +364,7 @@ test_convert_case()
|
||||
icu_test_full("abc 123xyz");
|
||||
icu_test_full("σςΣ ΣΣΣ");
|
||||
icu_test_full("ıiIİ");
|
||||
icu_test_full("\uFF11a");
|
||||
/* test <alpha><iota_subscript><acute> */
|
||||
icu_test_full("\u0391\u0345\u0301");
|
||||
#endif
|
||||
|
@ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES
|
||||
('abc DEF 123abc'),
|
||||
('ábc sßs ßss DÉF'),
|
||||
('DŽxxDŽ džxxDž Džxxdž'),
|
||||
(U&'Λλ 1a \FF11a'),
|
||||
('ȺȺȺ'),
|
||||
('ⱥⱥⱥ'),
|
||||
('ⱥȺ');
|
||||
@ -67,10 +68,11 @@ SELECT
|
||||
abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
|
||||
ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19
|
||||
DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20
|
||||
Λλ 1a 1a | λλ 1a 1a | Λλ 1a 1A | ΛΛ 1A 1A | 12 | 12 | 12 | 12
|
||||
ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
|
||||
ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
|
||||
ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
|
||||
-(6 rows)
|
||||
+(7 rows)
|
||||
|
||||
DROP TABLE test_pg_c_utf8;
|
||||
-- negative test: Final_Sigma not used for builtin locale C.UTF-8
|
||||
@ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES
|
||||
('abc DEF 123abc'),
|
||||
('ábc sßs ßss DÉF'),
|
||||
('DŽxxDŽ džxxDž Džxxdž'),
|
||||
(U&'Λλ 1a \FF11a'),
|
||||
('ȺȺȺ'),
|
||||
('ⱥⱥⱥ'),
|
||||
('ⱥȺ');
|
||||
@ -197,10 +200,11 @@ SELECT
|
||||
abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
|
||||
ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19
|
||||
DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20
|
||||
Λλ 1a 1a | λλ 1a 1a | Λλ 1a 1a | ΛΛ 1A 1A | 12 | 12 | 12 | 12
|
||||
ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
|
||||
ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
|
||||
ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
|
||||
-(6 rows)
|
||||
+(7 rows)
|
||||
|
||||
DROP TABLE test_pg_unicode_fast;
|
||||
-- test Final_Sigma
|
||||
|
@ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES
|
||||
('abc DEF 123abc'),
|
||||
('ábc sßs ßss DÉF'),
|
||||
('DŽxxDŽ džxxDž Džxxdž'),
|
||||
(U&'Λλ 1a \FF11a'),
|
||||
('ȺȺȺ'),
|
||||
('ⱥⱥⱥ'),
|
||||
('ⱥȺ');
|
||||
@ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES
|
||||
('abc DEF 123abc'),
|
||||
('ábc sßs ßss DÉF'),
|
||||
('DŽxxDŽ džxxDž Džxxdž'),
|
||||
(U&'Λλ 1a \FF11a'),
|
||||
('ȺȺȺ'),
|
||||
('ⱥⱥⱥ'),
|
||||
('ⱥȺ');
|
||||
|
Loading…
x
Reference in New Issue
Block a user