Add unicode_strtitle() for Unicode Default Case Conversion.

This brings the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, along with unicode_strlower() and unicode_strupper(). Accepts an arbitrary word boundary callback. Simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping. Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com Reviewed-by: Peter Eisentraut
2025-06-30 21:42:05 +03:00 · 2024-03-29 17:35:07 -07:00
parent a96a8b15fa
commit 46e5441fa5
3 changed files with 140 additions and 48 deletions
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 	return result;
 }

+struct WordBoundaryState
+{
+	const char *str;
+	size_t		len;
+	size_t		offset;
+	bool		init;
+	bool		prev_alnum;
+};
+
+/*
+ * Simple word boundary iterator that draws boundaries each time the result of
+ * pg_u_isalnum() changes.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+
+	while (wbstate->offset < wbstate->len &&
+		   wbstate->str[wbstate->offset] != '\0')
+	{
+		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
+										wbstate->offset);
+		bool		curr_alnum = pg_u_isalnum(u, true);
+
+		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		{
+			size_t		prev_offset = wbstate->offset;
+
+			wbstate->init = true;
+			wbstate->offset += unicode_utf8len(u);
+			wbstate->prev_alnum = curr_alnum;
+			return prev_offset;
+		}
+
+		wbstate->offset += unicode_utf8len(u);
+	}
+
+	return wbstate->len;
+}
+
 /*
 * collation-aware, wide-character-aware initcap function
 *
@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 #endif
 		if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
 		{
-			const unsigned char *src = (unsigned char *) buff;
+			const char *src = buff;
 			size_t		srclen = nbytes;
-			unsigned char *dst;
 			size_t		dstsize;
-			int			srcoff = 0;
-			int			dstoff = 0;
+			char	   *dst;
+			size_t		needed;
+			struct WordBoundaryState wbstate = {
+				.str = src,
+				.len = srclen,
+				.offset = 0,
+				.init = false,
+				.prev_alnum = false,
+			};

 			Assert(GetDatabaseEncoding() == PG_UTF8);

-			/* overflow paranoia */
-			if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
-				ereport(ERROR,
-						(errcode(ERRCODE_OUT_OF_MEMORY),
-						 errmsg("out of memory")));
+			/* first try buffer of equal size plus terminating NUL */
+			dstsize = srclen + 1;
+			dst = palloc(dstsize);

-			/* result is at most srclen codepoints plus terminating NUL */
-			dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
-			dst = (unsigned char *) palloc(dstsize);
-
-			while (srcoff < nbytes)
+			needed = unicode_strtitle(dst, dstsize, src, srclen,
+									  initcap_wbnext, &wbstate);
+			if (needed + 1 > dstsize)
 			{
-				pg_wchar	u1 = utf8_to_unicode(src + srcoff);
-				pg_wchar	u2;
-				int			u1len = unicode_utf8len(u1);
-				int			u2len;
+				/* reset iterator */
+				wbstate.offset = 0;
+				wbstate.init = false;

-				if (wasalnum)
-					u2 = unicode_lowercase_simple(u1);
-				else
-					u2 = unicode_uppercase_simple(u1);
-
-				u2len = unicode_utf8len(u2);
-
-				Assert(dstoff + u2len + 1 <= dstsize);
-
-				wasalnum = pg_u_isalnum(u2, true);
-
-				unicode_to_utf8(u2, dst + dstoff);
-				srcoff += u1len;
-				dstoff += u2len;
+				/* grow buffer if needed and retry */
+				dstsize = needed + 1;
+				dst = repalloc(dst, dstsize);
+				needed = unicode_strtitle(dst, dstsize, src, srclen,
+										  initcap_wbnext, &wbstate);
+				Assert(needed + 1 == dstsize);
 			}

-			Assert(dstoff + 1 <= dstsize);
-			*(dst + dstoff) = '\0';
-			dstoff++;
-
-			/* allocate result buffer of the right size and free workspace */
-			result = palloc(dstoff);
-			memcpy(result, dst, dstoff);
-			pfree(dst);
+			result = dst;
 		}
 		else
 		{