Add unicode_strtitle() for Unicode Default Case Conversion.

This moves the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, where it sits alongside unicode_strlower() and unicode_strupper(). unicode_strtitle() accepts an arbitrary word boundary callback. The implementation is simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping. Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com Reviewed-by: Peter Eisentraut
2025-10-25 13:17:41 +03:00 · 2024-03-29 17:35:07 -07:00
parent a96a8b15fa
commit 46e5441fa5
3 changed files with 140 additions and 48 deletions
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -21,8 +21,9 @@
 #include "mb/pg_wchar.h"

 static const pg_case_map *find_case_map(pg_wchar ucs);
-static size_t convert_case(char *dst, size_t dstsize, const char *src,
-						   ssize_t srclen, CaseKind casekind);
+static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+						   CaseKind str_casekind, WordBoundaryNext wbnext,
+						   void *wbstate);

 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower);
+	return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+}
+
+/*
+ * unicode_strtitle()
+ *
+ * Convert src to titlecase, and return the result length (not including
+ * terminating NUL).
+ *
+ * String src must be encoded in UTF-8. If srclen < 0, src must be
+ * NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ *
+ * Titlecasing requires knowledge about word boundaries, which is provided by
+ * the callback wbnext. A word boundary is the offset of the start of a word
+ * or the offset of the character immediately following a word.
+ *
+ * The caller is expected to initialize and free the callback state
+ * wbstate. The callback should first return offset 0 for the first boundary;
+ * then the offset of each subsequent word boundary; then the total length of
+ * the string to indicate the final boundary.
+ */
+size_t
+unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				 WordBoundaryNext wbnext, void *wbstate)
+{
+	return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+						wbstate);
 }

 /*
@@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper);
+	return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
 }

 /*
- * Implement Unicode Default Case Conversion algorithm.
+ * If str_casekind is CaseLower or CaseUpper, map each character in the string
+ * for which a mapping is available.
 *
- * Map each character in the string for which a mapping is available.
+ * If str_casekind is CaseTitle, map characters found on a word boundary to
+ * uppercase and all other characters to lowercase.
 */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-			 CaseKind casekind)
+			 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
 {
+	/* character CaseKind varies while titlecasing */
+	CaseKind	chr_casekind = str_casekind;
 	size_t		srcoff = 0;
 	size_t		result_len = 0;
+	size_t		boundary = 0;
+
+	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
+		   (str_casekind != CaseTitle && !wbnext && !wbstate));
+
+	if (str_casekind == CaseTitle)
+	{
+		boundary = wbnext(wbstate);
+		Assert(boundary == 0);	/* start of text is always a boundary */
+	}

 	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 	{
@@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		int			u1len = unicode_utf8len(u1);
 		const		pg_case_map *casemap = find_case_map(u1);

+		if (str_casekind == CaseTitle)
+		{
+			if (srcoff == boundary)
+			{
+				chr_casekind = CaseUpper;
+				boundary = wbnext(wbstate);
+			}
+			else
+				chr_casekind = CaseLower;
+		}
+
+		/* perform mapping, update result_len, and write to dst */
 		if (casemap)
 		{
-			pg_wchar	u2 = casemap->simplemap[casekind];
+			pg_wchar	u2 = casemap->simplemap[chr_casekind];
 			pg_wchar	u2len = unicode_utf8len(u2);

 			if (result_len + u2len <= dstsize)