Fix INITCAP() word boundaries for PG_UNICODE_FAST.

Word boundaries are based on whether a character is alphanumeric or not. For the PG_UNICODE_FAST collation, alphanumeric includes non-ASCII digits; whereas for the PG_C_UTF8 collation, it only includes digits 0-9. Pass down the right information from the pg_locale_t into initcap_wbnext to differentiate the behavior. Reported-by: Noah Misch <noah@leadboat.com> Reviewed-by: Noah Misch <noah@leadboat.com> Discussion: https://postgr.es/m/20250417135841.33.nmisch@google.com
2025-10-21 02:52:47 +03:00 · 2025-04-21 12:34:58 -07:00
parent 80b727eb9d
commit 90260e2ec6
4 changed files with 23 additions and 4 deletions
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -40,6 +40,7 @@ struct WordBoundaryState
 	const char *str;
 	size_t		len;
 	size_t		offset;
+	bool		posix;
 	bool		init;
 	bool		prev_alnum;
 };
@@ -58,7 +59,7 @@ initcap_wbnext(void *state)
 	{
 		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
 										wbstate->offset);
-		bool		curr_alnum = pg_u_isalnum(u, true);
+		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);

 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
 		{
@@ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		.str = src,
 		.len = srclen,
 		.offset = 0,
+		.posix = !locale->info.builtin.casemap_full,
 		.init = false,
 		.prev_alnum = false,
 	};
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -41,6 +41,7 @@ struct WordBoundaryState
 	const char *str;
 	size_t		len;
 	size_t		offset;
+	bool		posix;
 	bool		init;
 	bool		prev_alnum;
 };
@@ -55,7 +56,7 @@ initcap_wbnext(void *state)
 	{
 		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
 										wbstate->offset);
-		bool		curr_alnum = pg_u_isalnum(u, true);
+		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);

 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
 		{
@@ -112,10 +113,13 @@ icu_test_full(char *str)
 	char		icu_upper[BUFSZ];
 	char		icu_fold[BUFSZ];
 	UErrorCode	status;
+
+	/* full case mapping doesn't use posix semantics */
 	struct WordBoundaryState wbstate = {
 		.str = str,
 		.len = strlen(str),
 		.offset = 0,
+		.posix = false,
 		.init = false,
 		.prev_alnum = false,
 	};
@@ -344,6 +348,12 @@ test_convert_case()
 	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
 	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
 	test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
+	/* test that alphanumerics are word characters */
+	test_convert(tfunc_title, "λλ", "Λλ");
+	test_convert(tfunc_title, "1a", "1a");
+	/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
+	test_convert(tfunc_title, "\uFF11a", "\uFF11a");
+

 #ifdef USE_ICU
 	icu_test_full("");
@@ -354,6 +364,7 @@ test_convert_case()
 	icu_test_full("abc 123xyz");
 	icu_test_full("σςΣ ΣΣΣ");
 	icu_test_full("ıiIİ");
+	icu_test_full("\uFF11a");
 	/* test <alpha><iota_subscript><acute> */
 	icu_test_full("\u0391\u0345\u0301");
 #endif
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES
  ('abc DEF 123abc'),
  ('ábc sßs ßss DÉF'),
  ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
  ('ȺȺȺ'),
  ('ⱥⱥⱥ'),
  ('ⱥȺ');
@@ -67,10 +68,11 @@ SELECT
 abc DEF 123abc  | abc def 123abc  | Abc Def 123abc  | ABC DEF 123ABC  |      14 |            14 |              14 |            14
 ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF |      19 |            19 |              19 |            19
 ǄxxǄ ǆxxǅ ǅxxǆ  | ǆxxǆ ǆxxǆ ǆxxǆ  | Ǆxxǆ Ǆxxǆ Ǆxxǆ  | ǄXXǄ ǄXXǄ ǄXXǄ  |      20 |            20 |              20 |            20
+ Λλ 1a １a       | λλ 1a １a       | Λλ 1a １A       | ΛΛ 1A １A       |      12 |            12 |              12 |            12
 ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       6 |             9 |               8 |             6
 ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       9 |             9 |               8 |             6
 ⱥȺ              | ⱥⱥ              | Ⱥⱥ              | ȺȺ              |       5 |             6 |               5 |             4
-(6 rows)
+(7 rows)

 DROP TABLE test_pg_c_utf8;
 -- negative test: Final_Sigma not used for builtin locale C.UTF-8
@@ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES
  ('abc DEF 123abc'),
  ('ábc sßs ßss DÉF'),
  ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
  ('ȺȺȺ'),
  ('ⱥⱥⱥ'),
  ('ⱥȺ');
@@ -197,10 +200,11 @@ SELECT
 abc DEF 123abc  | abc def 123abc  | Abc Def 123abc   | ABC DEF 123ABC    |      14 |            14 |              14 |            14
 ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF |      19 |            19 |              19 |            19
 ǄxxǄ ǆxxǅ ǅxxǆ  | ǆxxǆ ǆxxǆ ǆxxǆ  | ǅxxǆ ǅxxǆ ǅxxǆ   | ǄXXǄ ǄXXǄ ǄXXǄ    |      20 |            20 |              20 |            20
+ Λλ 1a １a       | λλ 1a １a       | Λλ 1a １a        | ΛΛ 1A １A         |      12 |            12 |              12 |            12
 ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       6 |             9 |               8 |             6
 ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       9 |             9 |               8 |             6
 ⱥȺ              | ⱥⱥ              | Ⱥⱥ               | ȺȺ                |       5 |             6 |               5 |             4
-(6 rows)
+(7 rows)

 DROP TABLE test_pg_unicode_fast;
 -- test Final_Sigma
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES
  ('abc DEF 123abc'),
  ('ábc sßs ßss DÉF'),
  ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
  ('ȺȺȺ'),
  ('ⱥⱥⱥ'),
  ('ⱥȺ');
@@ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES
  ('abc DEF 123abc'),
  ('ábc sßs ßss DÉF'),
  ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  (U&'Λλ 1a \FF11a'),
  ('ȺȺȺ'),
  ('ⱥⱥⱥ'),
  ('ⱥȺ');