Support Unicode full case mapping and conversion.

Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "ǆ" uppercasing to "Ǆ" but titlecasing to "ǅ"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
2025-10-25 13:17:41 +03:00 · 2025-01-17 15:56:20 -08:00
parent 6a9b2a631a
commit 286a365b9c
9 changed files with 3645 additions and 2993 deletions
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -17,12 +17,15 @@

 #include "common/unicode_case.h"
 #include "common/unicode_case_table.h"
+#include "common/unicode_category.h"
 #include "mb/pg_wchar.h"

 static const pg_case_map *find_case_map(pg_wchar ucs);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-						   CaseKind str_casekind, WordBoundaryNext wbnext,
+						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 						   void *wbstate);
+static bool check_special_conditions(int conditions, const char *str,
+									 size_t len, size_t offset);

 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
+ *
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied.
 */
 size_t
-unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				 bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+	return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
+						NULL);
 }

 /*
@@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 *
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied. Otherwise, use only simple mappings and use
+ * uppercase instead of titlecase.
+ *
 * Titlecasing requires knowledge about word boundaries, which is provided by
 * the callback wbnext. A word boundary is the offset of the start of a word
 * or the offset of the character immediately following a word.
@@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 */
 size_t
 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-				 WordBoundaryNext wbnext, void *wbstate)
+				 bool full, WordBoundaryNext wbnext, void *wbstate)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+	return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
 						wbstate);
 }

@@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
+ *
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied.
 */
 size_t
-unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				 bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
+	return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
+						NULL);
 }

 /*
+ * Implement Unicode Default Case Conversion algorithm.
+ *
 * If str_casekind is CaseLower or CaseUpper, map each character in the string
 * for which a mapping is available.
 *
 * If str_casekind is CaseTitle, maps characters found on a word boundary to
- * uppercase and other characters to lowercase.
+ * titlecase (or uppercase if full is false) and other characters to
+ * lowercase. NB: does not currently implement the Unicode behavior in which
+ * the word boundary is adjusted to the next Cased character. That behavior
+ * could be implemented as an option, but it doesn't match the default
+ * behavior of ICU, nor does it match the documented behavior of INITCAP().
+ *
+ * If full is true, use special mappings for relevant characters, which can
+ * map a single codepoint to multiple codepoints, or depend on conditions.
 */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-			 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
+			 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
+			 void *wbstate)
 {
 	/* character CaseKind varies while titlecasing */
 	CaseKind	chr_casekind = str_casekind;
@@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 		int			u1len = unicode_utf8len(u1);
 		const pg_case_map *casemap = find_case_map(u1);
+		const pg_special_case *special = NULL;

 		if (str_casekind == CaseTitle)
 		{
 			if (srcoff == boundary)
 			{
-				chr_casekind = CaseUpper;
+				chr_casekind = full ? CaseTitle : CaseUpper;
 				boundary = wbnext(wbstate);
 			}
 			else
 				chr_casekind = CaseLower;
 		}

+		/*
+		 * Find special case that matches the conditions, if any.
+		 *
+		 * Note: only a single special mapping per codepoint is currently
+		 * supported, though Unicode allows for multiple special mappings for
+		 * a single codepoint.
+		 */
+		if (full && casemap && casemap->special_case)
+		{
+			int16		conditions = casemap->special_case->conditions;
+
+			Assert(casemap->special_case->codepoint == u1);
+			if (check_special_conditions(conditions, src, srclen, srcoff))
+				special = casemap->special_case;
+		}
+
 		/* perform mapping, update result_len, and write to dst */
-		if (casemap)
+		if (special)
+		{
+			for (int i = 0; i < MAX_CASE_EXPANSION; i++)
+			{
+				pg_wchar	u2 = special->map[chr_casekind][i];
+				size_t		u2len = unicode_utf8len(u2);
+
+				if (u2 == '\0')
+					break;
+
+				if (result_len + u2len <= dstsize)
+					unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+				result_len += u2len;
+			}
+		}
+		else if (casemap)
 		{
 			pg_wchar	u2 = casemap->simplemap[chr_casekind];
 			pg_wchar	u2len = unicode_utf8len(u2);
@@ -197,6 +257,82 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	return result_len;
 }

+/*
+ * Check that the condition matches Final_Sigma, described in Unicode Table
+ * 3-17. The character at the given offset must be preceded by a Cased
+ * character and must not be followed by one, skipping over Case_Ignorable
+ * characters in both directions. If only Case_Ignorable characters (or
+ * nothing) precede it, the condition fails. NB: some characters may be both
+ * Cased and Case_Ignorable, in which case they are ignored.
+ */
+static bool
+check_final_sigma(const unsigned char *str, size_t len, size_t offset)
+{
+	/* iterate backwards; stop at the nearest non-ignorable character */
+	for (int i = (int) offset - 1;; i--)
+	{
+		/* start of string reached without finding a Cased character */
+		if (i < 0)
+			return false;
+
+		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
+		{
+			pg_wchar	curr = utf8_to_unicode(str + i);
+
+			if (pg_u_prop_case_ignorable(curr))
+				continue;
+			else if (pg_u_prop_cased(curr))
+				break;
+			else
+				return false;
+		}
+		else if ((str[i] & 0xC0) == 0x80)
+			continue;
+
+		Assert(false);			/* invalid UTF-8 */
+	}
+
+	/* end of string is not followed by a Cased character */
+	if (offset == len)
+		return true;
+
+	/* iterate forwards, looking for Cased character */
+	for (int i = offset + 1; i < len && str[i] != '\0'; i++)
+	{
+		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
+		{
+			pg_wchar	curr = utf8_to_unicode(str + i);
+
+			if (pg_u_prop_case_ignorable(curr))
+				continue;
+			else if (pg_u_prop_cased(curr))
+				return false;
+			else
+				break;
+		}
+		else if ((str[i] & 0xC0) == 0x80)
+			continue;
+
+		Assert(false);			/* invalid UTF-8 */
+	}
+
+	return true;
+}
+
+/* Test whether a special mapping's conditions hold at str[offset]. */
+static bool
+check_special_conditions(int conditions, const char *str, size_t len,
+						 size_t offset)
+{
+	if (conditions == 0)
+		return true;			/* unconditional mapping */
+	if (conditions == PG_U_FINAL_SIGMA)
+		return check_final_sigma((const unsigned char *) str, len, offset);
+
+	Assert(false);				/* no other conditions supported */
+	return false;
+}
+
 /* find entry in simple case map, if any */
 static const pg_case_map *
 find_case_map(pg_wchar ucs)