Support Unicode full case mapping and conversion.

Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "ǆ" uppercasing to "Ǆ" but titlecasing to "ǅ"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
2025-12-21 05:21:08 +03:00 · 2025-01-17 15:56:20 -08:00
parent 6a9b2a631a
commit 286a365b9c
9 changed files with 3645 additions and 2993 deletions
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -78,7 +78,7 @@ size_t
 strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strlower(dest, destsize, src, srclen);
+	return unicode_strlower(dest, destsize, src, srclen, false);
 }

 size_t
@@ -93,7 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		.prev_alnum = false,
 	};

-	return unicode_strtitle(dest, destsize, src, srclen,
+	return unicode_strtitle(dest, destsize, src, srclen, false,
 							initcap_wbnext, &wbstate);
 }

@@ -101,7 +101,7 @@ size_t
 strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strupper(dest, destsize, src, srclen);
+	return unicode_strupper(dest, destsize, src, srclen, false);
 }

 pg_locale_t
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
 # These files are part of the Unicode Character Database. Download
 # them on demand.  The dependency on Makefile.global is for
 # UNICODE_VERSION.
-CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
+CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)

 unicode_version.h: generate-unicode_version.pl
@@ -91,4 +91,4 @@ clean:
 	rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o

 distclean: clean
-	rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
+	rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -18,12 +18,61 @@
 #include <wctype.h>

 #ifdef USE_ICU
+#include <unicode/ucasemap.h>
 #include <unicode/uchar.h>
 #endif
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"

+/* enough to hold largest source or result string, including NUL */
+#define BUFSZ 256
+
+#ifdef USE_ICU
+static UCaseMap * casemap = NULL;
+#endif
+
+typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
+							ssize_t srclen);
+
+/* simple boundary iterator copied from pg_locale_builtin.c */
+struct WordBoundaryState
+{
+	const char *str;			/* text being scanned */
+	size_t		len;			/* scan limit, in bytes */
+	size_t		offset;			/* byte offset of the next character to examine */
+	bool		init;			/* true once the first boundary was returned */
+	bool		prev_alnum;		/* alnum-ness of character at last boundary */
+};
+
+static size_t
+initcap_wbnext(void *state)
+{
+	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;
+
+	/* walk one codepoint at a time until the limit or a NUL byte */
+	while (wb->offset < wb->len && wb->str[wb->offset] != '\0')
+	{
+		size_t		start = wb->offset;
+		pg_wchar	cp = utf8_to_unicode((unsigned char *) wb->str + start);
+		bool		is_alnum = pg_u_isalnum(cp, true);
+		bool		at_boundary = !wb->init || is_alnum != wb->prev_alnum;
+
+		/* always step past the current codepoint */
+		wb->offset = start + unicode_utf8len(cp);
+		if (!at_boundary)
+			continue;
+
+		/* first character, or alnum/non-alnum transition: report its start */
+		wb->init = true;
+		wb->prev_alnum = is_alnum;
+		return start;
+	}
+
+	/* no further boundaries: report the scan limit */
+	return wb->len;
+}
+
 #ifdef USE_ICU

 static void
@@ -48,6 +97,81 @@ icu_test_simple(pg_wchar code)
 	}
 }

+/*
+ * Compare full case conversion of str (lower, title and upper) with the
+ * results from ICU's UCaseMap, and exit with a diagnostic on any difference.
+ */
+static void
+icu_test_full(char *str)
+{
+	char		lower[BUFSZ];
+	char		title[BUFSZ];
+	char		upper[BUFSZ];
+	char		icu_lower[BUFSZ];
+	char		icu_title[BUFSZ];
+	char		icu_upper[BUFSZ];
+	UErrorCode	status;
+	struct WordBoundaryState wbstate = {
+		.str = str,
+		.len = strlen(str),
+		.offset = 0,
+		.init = false,
+		.prev_alnum = false,
+	};
+
+	unicode_strlower(lower, BUFSZ, str, -1, true);
+	unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
+	unicode_strupper(upper, BUFSZ, str, -1, true);
+
+	/*
+	 * Check the status after each ICU call: if a call fails, its output
+	 * buffer is not meaningful and must not be passed to strcmp().
+	 */
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
+	if (U_FAILURE(status))
+	{
+		printf("case_test: str='%s' ICU lower failed: %s\n", str,
+			   u_errorName(status));
+		exit(1);
+	}
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
+	if (U_FAILURE(status))
+	{
+		printf("case_test: str='%s' ICU title failed: %s\n", str,
+			   u_errorName(status));
+		exit(1);
+	}
+	status = U_ZERO_ERROR;
+	ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
+	if (U_FAILURE(status))
+	{
+		printf("case_test: str='%s' ICU upper failed: %s\n", str,
+			   u_errorName(status));
+		exit(1);
+	}
+
+	if (strcmp(lower, icu_lower) != 0)
+	{
+		printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
+			   icu_lower);
+		exit(1);
+	}
+	if (strcmp(title, icu_title) != 0)
+	{
+		printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
+			   icu_title);
+		exit(1);
+	}
+	if (strcmp(upper, icu_upper) != 0)
+	{
+		printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
+			   icu_upper);
+		exit(1);
+	}
+}
+
 /*
 * Exhaustively compare case mappings with the results from ICU.
 */
@@ -64,6 +161,7 @@ test_icu(void)
 		if (category != PG_U_UNASSIGNED)
 		{
 			uint8_t		icu_category = u_charType(code);
+			char		code_str[5] = {0};

 			if (icu_category == PG_U_UNASSIGNED)
 			{
@@ -72,6 +170,9 @@ test_icu(void)
 			}

 			icu_test_simple(code);
+			unicode_to_utf8(code, (unsigned char *) code_str);
+			icu_test_full(code_str);
+
 			successful++;
 		}
 	}
@@ -86,7 +187,7 @@ test_icu(void)
 #endif

 static void
-test_strlower(const char *test_string, const char *expected)
+test_convert(TestFunc tfunc, const char *test_string, const char *expected)
 {
 	size_t		src1len = strlen(test_string);
 	size_t		src2len = -1;	/* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)

 	/* neither source nor destination are NUL-terminated */
 	memset(dst1, 0x7F, dst1len);
-	needed = unicode_strlower(dst1, dst1len, src1, src1len);
+	needed = tfunc(dst1, dst1len, src1, src1len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (memcmp(dst1, expected, dst1len) != 0)
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)

 	/* destination is NUL-terminated and source is not */
 	memset(dst2, 0x7F, dst2len);
-	needed = unicode_strlower(dst2, dst2len, src1, src1len);
+	needed = tfunc(dst2, dst2len, src1, src1len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (strcmp(dst2, expected) != 0)
@@ -132,9 +235,10 @@ test_strlower(const char *test_string, const char *expected)

 	/* source is NUL-terminated and destination is not */
 	memset(dst1, 0x7F, dst1len);
-	needed = unicode_strlower(dst1, dst1len, src2, src2len);
+	needed = tfunc(dst1, dst1len, src2, src2len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)

 	/* both source and destination are NUL-terminated */
 	memset(dst2, 0x7F, dst2len);
-	needed = unicode_strlower(dst2, dst2len, src2, src2len);
+	needed = tfunc(dst2, dst2len, src2, src2len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
+			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (strcmp(dst2, expected) != 0)
@@ -166,15 +272,69 @@ test_strlower(const char *test_string, const char *expected)
 	free(dst2);
 }

+/* TestFunc adapter: lowercase with full (special) case mappings enabled */
+static size_t
+tfunc_lower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+{
+	return unicode_strlower(dst, dstsize, src, srclen, true);
+}
+
+/*
+ * TestFunc adapter for full titlecasing; word boundaries come from
+ * initcap_wbnext. Members of wb not named below start out as 0 / false.
+ */
+static size_t
+tfunc_title(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+{
+	struct WordBoundaryState wb = {
+		.str = src,
+		.len = srclen,
+	};
+
+	return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
+							&wb);
+}
+
+/* TestFunc adapter: uppercase with full (special) case mappings enabled */
+static size_t
+tfunc_upper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+{
+	return unicode_strupper(dst, dstsize, src, srclen, true);
+}
+
+
 static void
 test_convert_case()
 {
 	/* test string with no case changes */
-	test_strlower("√∞", "√∞");
+	test_convert(tfunc_lower, "√∞", "√∞");
+	/* test adjust-to-cased behavior */
+	test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
 	/* test string with case changes */
-	test_strlower("ABC", "abc");
+	test_convert(tfunc_upper, "abc", "ABC");
 	/* test string with case changes and byte length changes */
-	test_strlower("ȺȺȺ", "ⱥⱥⱥ");
+	test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
+	/* test special case conversions */
+	test_convert(tfunc_upper, "ß", "SS");
+	test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
+	test_convert(tfunc_upper, "ıiIİ", "IIIİ");
+	/* test final sigma */
+	test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
+	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
+	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
+
+#ifdef USE_ICU
+	icu_test_full("");
+	icu_test_full("ȺȺȺ");
+	icu_test_full("ßßß");
+	icu_test_full("√∞");
+	icu_test_full("a b");
+	icu_test_full("abc 123xyz");
+	icu_test_full("σςΣ ΣΣΣ");
+	icu_test_full("ıiIİ");
+	/* test <alpha><iota_subscript><acute> */
+	icu_test_full("\u0391\u0345\u0301");
+#endif

 	printf("case_test: convert_case: success\n");
 }
@@ -182,6 +342,22 @@ test_convert_case()
 int
 main(int argc, char **argv)
 {
+#ifdef USE_ICU
+	UErrorCode	status = U_ZERO_ERROR;
+
+	/*
+	 * Disable ICU's word break adjustment for titlecase to match the expected
+	 * behavior of unicode_strtitle().
+	 */
+	casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
+	if (U_FAILURE(status))
+	{
+		printf("case_test: failure opening UCaseMap: %s\n",
+			   u_errorName(status));
+		exit(1);
+	}
+#endif
+
 	printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
 #ifdef USE_ICU
 	printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
@@ -191,5 +367,9 @@ main(int argc, char **argv)
 #endif

 	test_convert_case();
+
+#ifdef USE_ICU
+	ucasemap_close(casemap);
+#endif
 	exit(0);
 }
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -3,7 +3,7 @@
 # Generate Unicode character case mappings. Does not include tailoring
 # or locale-specific mappings.
 #
-# Input: UnicodeData.txt
+# Input: SpecialCasing.txt UnicodeData.txt
 # Output: unicode_case_table.h
 #
 # Copyright (c) 2000-2025, PostgreSQL Global Development Group
@@ -21,6 +21,10 @@ GetOptions('outdir:s' => \$output_path);

 my $output_table_file = "$output_path/unicode_case_table.h";

+# The maximum number of codepoints that can result from case mapping
+# of a single character. See Unicode section 5.18 "Case Mappings".
+my $MAX_CASE_EXPANSION = 3;
+
 my $FH;

 my %simple = ();
@@ -51,6 +55,98 @@ while (my $line = <$FH>)
 }
 close $FH;

+# Map for special casing rules that aren't represented in the simple
+# mapping. Language-sensitive mappings are not supported.
+#
+# See https://www.unicode.org/reports/tr44/#SpecialCasing.txt, or the
+# SpecialCasing.txt file itself for details.
+
+# for now, only Final_Sigma is supported
+my %condition_map = (Final_Sigma => 'PG_U_FINAL_SIGMA');
+
+my %special = ();
+open($FH, '<', "$output_path/SpecialCasing.txt")
+  or die "Could not open $output_path/SpecialCasing.txt: $!.";
+while (my $line = <$FH>)
+{
+	# language-sensitive mappings not supported
+	last if $line =~ /\# Language-Sensitive Mappings/;
+
+	# remove comments
+	$line =~ s/^(.*?)#.*$/$1/s;
+
+	# ignore empty lines
+	next unless $line =~ /;/;
+
+	my @elts = split /;/, $line;
+	my $code = hex($elts[0]);
+
+	# Codepoint may map to multiple characters when converting
+	# case. Split each mapping on whitespace and extract the
+	# hexadecimal into an array of codepoints.
+	my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
+	my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
+	my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
+	my @conditions = map {
+		# supporting negated conditions may require storing a
+		# mask of relevant conditions for a given rule to differentiate
+		# between lack of a condition and a negated condition
+		die "negated conditions not supported" if /^Not_/;
+		$condition_map{$_} || die "unrecognized condition: $_"
+	} (grep /\w+/, (split /\s+/, $elts[4]));
+
+	my $cond_str = (join '|', @conditions) || '0';
+
+	# if empty, create a self-mapping
+	push @lower, $code if (scalar @lower == 0);
+	push @title, $code if (scalar @title == 0);
+	push @upper, $code if (scalar @upper == 0);
+
+	# none should map to more than 3 codepoints
+	die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
+	  if (scalar @lower) > $MAX_CASE_EXPANSION;
+	die "titlecase expansion for 0x$elts[0] exceeds maximum: '$elts[2]'"
+	  if (scalar @title) > $MAX_CASE_EXPANSION;
+	die "uppercase expansion for 0x$elts[0] exceeds maximum: '$elts[3]'"
+	  if (scalar @upper) > $MAX_CASE_EXPANSION;
+
+	# pad arrays to a fixed length of 3
+	while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
+	while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
+	while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
+
+	# Characters with special mappings may not have simple mappings;
+	# ensure that an entry exists.
+	$simple{$code} ||= {
+		Simple_Lowercase => $code,
+		Simple_Titlecase => $code,
+		Simple_Uppercase => $code
+	};
+
+	# Multiple special case rules for a single codepoint could be
+	# supported by making several entries for each codepoint, and have
+	# the simple mapping point to the first entry. The caller could
+	# scan forward looking for an entry that matches the conditions,
+	# or fall back to the normal behavior.
+	die "multiple special case mappings not supported"
+	  if defined $special{$code};
+
+	$special{$code} = {
+		Lowercase => \@lower,
+		Titlecase => \@title,
+		Uppercase => \@upper,
+		Conditions => $cond_str
+	};
+}
+close $FH;
+
+# assign sequential array indexes to the special mappings
+my $special_idx = 0;
+foreach my $code (sort { $a <=> $b } (keys %special))
+{
+	$special{$code}{Index} = $special_idx++;
+}
+
 # Start writing out the output files
 open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";
@@ -63,6 +159,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
 	$num_simple++ unless $code < 0x80;
 }

+my $num_special = scalar(keys %special) + 1;
+
 print $OT <<"EOS";
 /*-------------------------------------------------------------------------
 *
@@ -86,6 +184,19 @@ print $OT <<"EOS";
 #include "common/unicode_case.h"
 #include "mb/pg_wchar.h"

+/*
+ * The maximum number of codepoints that can result from case mapping
+ * of a single character. See Unicode section 5.18 "Case Mappings".
+ */
+#define MAX_CASE_EXPANSION 3
+
+/*
+ * Case mapping condition flags. For now, only Final_Sigma is supported.
+ *
+ * See Unicode Context Specification for Casing.
+ */
+#define PG_U_FINAL_SIGMA		(1 << 0)
+
 typedef enum
 {
 	CaseLower = 0,
@@ -94,12 +205,47 @@ typedef enum
 	NCaseKind
 } CaseKind;

+typedef struct
+{
+	pg_wchar	codepoint;		/* Unicode codepoint */
+	int16		conditions;
+	pg_wchar	map[NCaseKind][MAX_CASE_EXPANSION];
+} pg_special_case;
+
 typedef struct
 {
 	pg_wchar	codepoint;		/* Unicode codepoint */
 	pg_wchar	simplemap[NCaseKind];
+	const pg_special_case *special_case;
 } pg_case_map;

+/*
+ * Special case mappings that aren't representable in the simple map.
+ * Entries are referenced from simple_case_map.
+ */
+static const pg_special_case special_case[$num_special] =
+{
+EOS
+
+foreach my $code (sort { $a <=> $b } (keys %special))
+{
+	die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
+	die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
+	die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
+	my $lower = join ", ",
+	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
+	my $title = join ", ",
+	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
+	my $upper = join ", ",
+	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
+	printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
+	printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
+}
+
+print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
+print $OT <<"EOS";
+};
+
 /*
 * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
 * sparse for higher codepoints (requiring scan or binary search).
@@ -114,8 +260,10 @@ for (my $code = 0; $code < 0x80; $code++)
 	my $lc = ($simple{$code}{Simple_Lowercase} || $code);
 	my $tc = ($simple{$code}{Simple_Titlecase} || $code);
 	my $uc = ($simple{$code}{Simple_Uppercase} || $code);
+	die "unexpected special case for code $code"
+	  if defined $special{$code};
 	printf $OT
-	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
+	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
 	  $code, $lc, $tc, $uc;
 }
 printf $OT "\n";
@@ -126,9 +274,14 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
 	next unless $code >= 0x80;    # already output above

 	my $map = $simple{$code};
+	my $special_case = "NULL";
+	if (exists $special{$code})
+	{
+		$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
+	}
 	printf $OT
-	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
+	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
 	  $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
-	  $map->{Simple_Uppercase};
+	  $map->{Simple_Uppercase}, $special_case;
 }
 print $OT "};\n";
--- a/src/common/unicode/meson.build
+++ b/src/common/unicode/meson.build
@@ -11,7 +11,7 @@ endif

 # These files are part of the Unicode Character Database. Download them on
 # demand.
-foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'UnicodeData.txt']
+foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
  url = unicode_baseurl.format(UNICODE_VERSION, f)
  target = custom_target(f,
    output: f,
@@ -26,7 +26,7 @@ update_unicode_targets = []

 update_unicode_targets += \
  custom_target('unicode_case_table.h',
-    input: [unicode_data['UnicodeData.txt']],
+    input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
    output: ['unicode_case_table.h'],
    command: [
      perl, files('generate-unicode_case_table.pl'),
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -17,12 +17,15 @@

 #include "common/unicode_case.h"
 #include "common/unicode_case_table.h"
+#include "common/unicode_category.h"
 #include "mb/pg_wchar.h"

 static const pg_case_map *find_case_map(pg_wchar ucs);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-						   CaseKind str_casekind, WordBoundaryNext wbnext,
+						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 						   void *wbstate);
+static bool check_special_conditions(int conditions, const char *str,
+									 size_t len, size_t offset);

 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
+ *
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied.
 */
 size_t
-unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				 bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+	return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
+						NULL);
 }

 /*
@@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 *
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied. Otherwise, use only simple mappings and use
+ * uppercase instead of titlecase.
+ *
 * Titlecasing requires knowledge about word boundaries, which is provided by
 * the callback wbnext. A word boundary is the offset of the start of a word
 * or the offset of the character immediately following a word.
@@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 */
 size_t
 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-				 WordBoundaryNext wbnext, void *wbstate)
+				 bool full, WordBoundaryNext wbnext, void *wbstate)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+	return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
 						wbstate);
 }

@@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
+ *
+ * If full is true, use special case mappings if available and if the
+ * conditions are satisfied.
 */
 size_t
-unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				 bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
+	return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
+						NULL);
 }

 /*
+ * Implement Unicode Default Case Conversion algorithm.
+ *
 * If str_casekind is CaseLower or CaseUpper, map each character in the string
 * for which a mapping is available.
 *
 * If str_casekind is CaseTitle, maps characters found on a word boundary to
- * uppercase and other characters to lowercase.
+ * titlecase (or uppercase if full is false) and other characters to
+ * lowercase. NB: does not currently implement the Unicode behavior in which
+ * the word boundary is adjusted to the next Cased character. That behavior
+ * could be implemented as an option, but it doesn't match the default
+ * behavior of ICU, nor does it match the documented behavior of INITCAP().
+ *
+ * If full is true, use special mappings for relevant characters, which can
+ * map a single codepoint to multiple codepoints, or depend on conditions.
 */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-			 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
+			 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
+			 void *wbstate)
 {
 	/* character CaseKind varies while titlecasing */
 	CaseKind	chr_casekind = str_casekind;
@@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 		int			u1len = unicode_utf8len(u1);
 		const pg_case_map *casemap = find_case_map(u1);
+		const pg_special_case *special = NULL;

 		if (str_casekind == CaseTitle)
 		{
 			if (srcoff == boundary)
 			{
-				chr_casekind = CaseUpper;
+				chr_casekind = full ? CaseTitle : CaseUpper;
 				boundary = wbnext(wbstate);
 			}
 			else
 				chr_casekind = CaseLower;
 		}

+		/*
+		 * Find special case that matches the conditions, if any.
+		 *
+		 * Note: only a single special mapping per codepoint is currently
+		 * supported, though Unicode allows for multiple special mappings for
+		 * a single codepoint.
+		 */
+		if (full && casemap && casemap->special_case)
+		{
+			int16		conditions = casemap->special_case->conditions;
+
+			Assert(casemap->special_case->codepoint == u1);
+			if (check_special_conditions(conditions, src, srclen, srcoff))
+				special = casemap->special_case;
+		}
+
 		/* perform mapping, update result_len, and write to dst */
-		if (casemap)
+		if (special)
+		{
+			for (int i = 0; i < MAX_CASE_EXPANSION; i++)
+			{
+				pg_wchar	u2 = special->map[chr_casekind][i];
+				size_t		u2len = unicode_utf8len(u2);
+
+				if (u2 == '\0')
+					break;
+
+				if (result_len + u2len <= dstsize)
+					unicode_to_utf8(u2, (unsigned char *) dst + result_len);
+
+				result_len += u2len;
+			}
+		}
+		else if (casemap)
 		{
 			pg_wchar	u2 = casemap->simplemap[chr_casekind];
 			pg_wchar	u2len = unicode_utf8len(u2);
@@ -197,6 +257,97 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	return result_len;
 }

+/*
+ * Check that the condition matches Final_Sigma, described in Unicode Table
+ * 3-17. The character at the given offset must be directly preceded by a
+ * Cased character, and must not be directly followed by a Cased character.
+ *
+ * Case_Ignorable characters are ignored. NB: some characters may be both
+ * Cased and Case_Ignorable, in which case they are ignored.
+ *
+ * str is UTF-8 text and offset is the byte offset of the character being
+ * mapped. len bounds the forward scan, which also stops at a NUL byte.
+ */
+static bool
+check_final_sigma(const unsigned char *str, size_t len, size_t offset)
+{
+	bool		cased_before = false;
+
+	/* the start of the string is not preceded by a Cased character */
+	if (offset == 0)
+		return false;
+
+	/* iterate backwards, looking for Cased character */
+	for (size_t i = offset; i-- > 0;)
+	{
+		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
+		{
+			pg_wchar	curr = utf8_to_unicode(str + i);
+
+			if (pg_u_prop_case_ignorable(curr))
+				continue;
+			else if (pg_u_prop_cased(curr))
+			{
+				cased_before = true;
+				break;
+			}
+			else
+				return false;
+		}
+		else if ((str[i] & 0xC0) == 0x80)
+			continue;
+
+		Assert(false);			/* invalid UTF-8 */
+	}
+
+	/*
+	 * If the scan ran off the start of the string, only Case_Ignorable
+	 * characters precede the offset, which does not satisfy the condition.
+	 */
+	if (!cased_before)
+		return false;
+
+	/* end of string is not followed by a Cased character */
+	if (offset == len)
+		return true;
+
+	/* iterate forwards, looking for Cased character */
+	for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
+	{
+		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
+		{
+			pg_wchar	curr = utf8_to_unicode(str + i);
+
+			if (pg_u_prop_case_ignorable(curr))
+				continue;
+			else if (pg_u_prop_cased(curr))
+				return false;
+			else
+				break;
+		}
+		else if ((str[i] & 0xC0) == 0x80)
+			continue;
+
+		Assert(false);			/* invalid UTF-8 */
+	}
+
+	return true;
+}
+
+static bool
+check_special_conditions(int conditions, const char *str, size_t len,
+						 size_t offset)
+{
+	/* Final_Sigma is the only context-dependent condition supported */
+	if (conditions == PG_U_FINAL_SIGMA)
+		return check_final_sigma((unsigned char *) str, len, offset);
+
+	/* anything other than "no conditions" is unexpected */
+	if (conditions != 0)
+		Assert(false);
+	return conditions == 0;
+}
+
 /* find entry in simple case map, if any */
 static const pg_case_map *
 find_case_map(pg_wchar ucs)
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -22,11 +22,11 @@ pg_wchar	unicode_lowercase_simple(pg_wchar code);
 pg_wchar	unicode_titlecase_simple(pg_wchar code);
 pg_wchar	unicode_uppercase_simple(pg_wchar code);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
-							 ssize_t srclen);
+							 ssize_t srclen, bool full);
 size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
-							 ssize_t srclen, WordBoundaryNext wbnext,
-							 void *wbstate);
+							 ssize_t srclen, bool full,
+							 WordBoundaryNext wbnext, void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
-							 ssize_t srclen);
+							 ssize_t srclen, bool full);

 #endif							/* UNICODE_CASE_H */
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3754,6 +3754,7 @@ pg_sha256_ctx
 pg_sha384_ctx
 pg_sha512_ctx
 pg_snapshot
+pg_special_case
 pg_stack_base_t
 pg_time_t
 pg_time_usec_t