Refactor convert_case() to prepare for optimizations.

Upcoming optimizations will add complexity to convert_case(). This patch reorganizes the function slightly so that the complexity can be contained within the logic to convert the case of a single character, rather than mixing it in with the logic to iterate through the string.

Reviewed-by: Alexander Borisov <lex.borisov@gmail.com>
Discussion: https://postgr.es/m/44005c3d-88f4-4a26-981f-fd82dfa8e313@gmail.com
2025-12-21 05:21:08 +03:00 · 2025-03-12 21:51:52 -07:00
parent 3abe9dc188
commit d3b2e5e1ab
1 changed files with 97 additions and 48 deletions
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -20,12 +20,20 @@
 #include "common/unicode_category.h"
 #include "mb/pg_wchar.h"

+enum CaseMapResult
+{
+	CASEMAP_SELF,				/* no mapping found; caller copies the source bytes */
+	CASEMAP_SIMPLE,				/* mapped to a single codepoint */
+	CASEMAP_SPECIAL,			/* mapped to up to MAX_CASE_EXPANSION codepoints */
+};
+
 static const pg_case_map *find_case_map(pg_wchar ucs);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 						   void *wbstate);
-static bool check_special_conditions(int conditions, const char *str,
-									 size_t len, size_t offset);
+static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+								  const char *src, size_t srclen, size_t srcoff,
+								  pg_wchar *simple, const pg_wchar **special);

 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -214,8 +222,9 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	{
 		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 		int			u1len = unicode_utf8len(u1);
-		const pg_case_map *casemap = find_case_map(u1);
-		const pg_special_case *special = NULL;
+		pg_wchar	simple = 0;
+		const pg_wchar *special = NULL;
+		enum CaseMapResult casemap_result;

 		if (str_casekind == CaseTitle)
 		{
@@ -228,56 +237,47 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 				chr_casekind = CaseLower;
 		}

-		/*
-		 * Find special case that matches the conditions, if any.
-		 *
-		 * Note: only a single special mapping per codepoint is currently
-		 * supported, though Unicode allows for multiple special mappings for
-		 * a single codepoint.
-		 */
-		if (full && casemap && casemap->special_case)
+		casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
+								 &simple, &special);
+
+		switch (casemap_result)
 		{
-			int16		conditions = casemap->special_case->conditions;
+			case CASEMAP_SELF:
+				/* no mapping; copy bytes from src */
+				Assert(simple == 0);
+				Assert(special == NULL);
+				if (result_len + u1len <= dstsize)
+					memcpy(dst + result_len, src + srcoff, u1len);

-			Assert(casemap->special_case->codepoint == u1);
-			if (check_special_conditions(conditions, src, srclen, srcoff))
-				special = casemap->special_case;
-		}
+				result_len += u1len;
+				break;
+			case CASEMAP_SIMPLE:
+				{
+					/* replace with single character */
+					pg_wchar	u2 = simple;
+					pg_wchar	u2len = unicode_utf8len(u2);

-		/* perform mapping, update result_len, and write to dst */
-		if (special)
-		{
-			for (int i = 0; i < MAX_CASE_EXPANSION; i++)
-			{
-				pg_wchar	u2 = special->map[chr_casekind][i];
-				size_t		u2len = unicode_utf8len(u2);
+					Assert(special == NULL);
+					if (result_len + u2len <= dstsize)
+						unicode_to_utf8(u2, (unsigned char *) dst + result_len);

-				if (u2 == '\0')
-					break;
+					result_len += u2len;
+				}
+				break;
+			case CASEMAP_SPECIAL:
+				/* replace with up to MAX_CASE_EXPANSION characters */
+				Assert(simple == 0);
+				for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
+				{
+					pg_wchar	u2 = special[i];
+					size_t		u2len = unicode_utf8len(u2);

-				if (result_len + u2len <= dstsize)
-					unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+					if (result_len + u2len <= dstsize)
+						unicode_to_utf8(u2, (unsigned char *) dst + result_len);

-				result_len += u2len;
-			}
-		}
-		else if (casemap)
-		{
-			pg_wchar	u2 = casemap->simplemap[chr_casekind];
-			pg_wchar	u2len = unicode_utf8len(u2);
-
-			if (result_len + u2len <= dstsize)
-				unicode_to_utf8(u2, (unsigned char *) dst + result_len);
-
-			result_len += u2len;
-		}
-		else
-		{
-			/* no mapping; copy bytes from src */
-			if (result_len + u1len <= dstsize)
-				memcpy(dst + result_len, src + srcoff, u1len);
-
-			result_len += u1len;
+					result_len += u2len;
+				}
+				break;
 		}

 		srcoff += u1len;
@@ -351,6 +351,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	return true;
 }

+/*
+ * Unicode allows for special casing to be applied only under certain
+ * circumstances. The only currently-supported condition is Final_Sigma.
+ */
 static bool
 check_special_conditions(int conditions, const char *str, size_t len,
 						 size_t offset)
@@ -365,6 +369,51 @@ check_special_conditions(int conditions, const char *str, size_t len,
 	return false;
 }

+/*
+ * Map the given character to the requested case.
+ *
+ * If full is true, and a special case mapping is found and the conditions are
+ * met, 'special' is set to the mapping result (which is an array of up to
+ * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
+ *
+ * Otherwise, search for a simple mapping, and if found, set 'simple' to the
+ * result and return CASEMAP_SIMPLE.
+ *
+ * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
+ * character without modification.
+ */
+static enum CaseMapResult
+casemap(pg_wchar u1, CaseKind casekind, bool full,
+		const char *src, size_t srclen, size_t srcoff,
+		pg_wchar *simple, const pg_wchar **special)
+{
+	const pg_case_map *map;
+
+	if (u1 < 0x80)				/* index case_map directly; no find_case_map() */
+	{
+		*simple = case_map[u1].simplemap[casekind]; /* NOTE(review): presumes case_map[] is ordered by codepoint below 0x80 -- confirm */
+
+		return CASEMAP_SIMPLE;	/* 'full' and special cases are not consulted on this path */
+	}
+
+	map = find_case_map(u1);
+
+	if (map == NULL)
+		return CASEMAP_SELF;	/* neither *simple nor *special is written */
+
+	if (full && map->special_case != NULL &&	/* special mapping wins over simple when its conditions hold */
+		check_special_conditions(map->special_case->conditions,
+								 src, srclen, srcoff))
+	{
+		*special = map->special_case->map[casekind];
+		return CASEMAP_SPECIAL; /* *simple is not written on this path */
+	}
+
+	*simple = map->simplemap[casekind]; /* no applicable special mapping; use the simple one */
+
+	return CASEMAP_SIMPLE;
+}
+
 /* find entry in simple case map, if any */
 static const pg_case_map *
 find_case_map(pg_wchar ucs)