Support Unicode full case mapping and conversion.

Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "ǆ" uppercasing to "Ǆ" but titlecasing to "ǅ"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
2025-06-29 10:41:53 +03:00 · 2025-01-17 15:56:20 -08:00
parent 6a9b2a631a
commit 286a365b9c
9 changed files with 3645 additions and 2993 deletions
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@ -78,7 +78,7 @@ size_t
 strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strlower(dest, destsize, src, srclen);
+	return unicode_strlower(dest, destsize, src, srclen, false);
 }
 size_t
@ -93,7 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		.prev_alnum = false,
 	};
-	return unicode_strtitle(dest, destsize, src, srclen,
+	return unicode_strtitle(dest, destsize, src, srclen, false,
 							initcap_wbnext, &wbstate);
 }
@ -101,7 +101,7 @@ size_t
 strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strupper(dest, destsize, src, srclen);
+	return unicode_strupper(dest, destsize, src, srclen, false);
 }
 pg_locale_t
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
 # These files are part of the Unicode Character Database. Download
 # them on demand.  The dependency on Makefile.global is for
 # UNICODE_VERSION.
-CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
+CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 unicode_version.h: generate-unicode_version.pl
@ -91,4 +91,4 @@ clean:
 	rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
 distclean: clean
-	rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
+	rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@ -18,12 +18,61 @@
 #include <wctype.h>
 #ifdef USE_ICU
 #include <unicode/ucasemap.h>
 #include <unicode/uchar.h>
 #endif
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "common/unicode_version.h"
 /* enough to hold largest source or result string, including NUL */
 #define BUFSZ 256
 #ifdef USE_ICU
 static UCaseMap * casemap = NULL;
 #endif
 typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
 							ssize_t srclen);
 /* simple boundary iterator copied from pg_locale_builtin.c */
 struct WordBoundaryState
 {
 	const char *str;			/* UTF-8 text being scanned */
 	size_t		len;			/* upper bound on bytes to scan; the iterator
 								 * also stops at a NUL byte */
 	size_t		offset;			/* byte offset of the next character to examine */
 	bool		init;			/* false until the first boundary is returned */
 	bool		prev_alnum;		/* alphanumeric class recorded at the most
 								 * recently returned boundary */
 };
 /*
  * Word boundary callback: return the byte offset of the next boundary in the
  * string described by 'state' (a struct WordBoundaryState), advancing the
  * iterator past the character found there. The first character examined is
  * always a boundary; after that, a boundary is any character whose
  * alphanumeric class differs from the one recorded at the previous boundary.
  * Returns the state's 'len' once the text is exhausted.
  */
 static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;
 	/* scan until 'len' bytes are consumed or a NUL byte is reached */
 	for (;;)
 	{
 		size_t		start = wb->offset;
 		pg_wchar	cp;
 		bool		is_alnum;
 		bool		at_boundary;
 		if (start >= wb->len || wb->str[start] == '\0')
 			break;
 		cp = utf8_to_unicode((unsigned char *) wb->str + start);
 		is_alnum = pg_u_isalnum(cp, true);
 		/* decide before touching the state that the decision reads */
 		at_boundary = !wb->init || is_alnum != wb->prev_alnum;
 		/* the character is consumed whether or not it is a boundary */
 		wb->offset = start + unicode_utf8len(cp);
 		if (at_boundary)
 		{
 			wb->init = true;
 			wb->prev_alnum = is_alnum;
 			return start;
 		}
 	}
 	return wb->len;
 }
 #ifdef USE_ICU
 static void
@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
 	}
 }
 /*
  * Exit with a diagnostic if a unicode_str*() conversion needed BUFSZ bytes or
  * more: that leaves no room for the terminating NUL which the strcmp() calls
  * in icu_test_full() depend on.
  */
 static void
 icu_test_check_fit(const char *what, const char *str, size_t needed)
 {
 	if (needed >= BUFSZ)
 	{
 		printf("case_test: str='%s' %s: result needs %zu bytes, buffer is %d\n",
 			   str, what, needed, BUFSZ);
 		exit(1);
 	}
 }
 /*
  * Exit with a diagnostic if an ICU conversion failed, or succeeded but filled
  * the buffer exactly and therefore left the result without a NUL terminator.
  */
 static void
 icu_test_check_status(const char *what, const char *str, UErrorCode status)
 {
 	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
 	{
 		printf("case_test: str='%s' %s: ICU error: %s\n", str, what,
 			   u_errorName(status));
 		exit(1);
 	}
 }
 /*
  * Convert 'str' (NUL-terminated UTF-8) to lower, title and upper case using
  * the full (special-casing) mappings, do the same with ICU via the global
  * 'casemap', and exit(1) with a diagnostic on the first mismatch.
  *
  * Conversion results are validated before being compared, so that a failed
  * or truncated conversion is reported as such instead of being handed to
  * strcmp() as an unterminated buffer.
  */
 static void
 icu_test_full(char *str)
 {
 	char		lower[BUFSZ];
 	char		title[BUFSZ];
 	char		upper[BUFSZ];
 	char		icu_lower[BUFSZ];
 	char		icu_title[BUFSZ];
 	char		icu_upper[BUFSZ];
 	UErrorCode	status;
 	size_t		needed;
 	struct WordBoundaryState wbstate = {
 		.str = str,
 		.len = strlen(str),
 		.offset = 0,
 		.init = false,
 		.prev_alnum = false,
 	};
 	needed = unicode_strlower(lower, BUFSZ, str, -1, true);
 	icu_test_check_fit("lower", str, needed);
 	needed = unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext,
 							  &wbstate);
 	icu_test_check_fit("title", str, needed);
 	needed = unicode_strupper(upper, BUFSZ, str, -1, true);
 	icu_test_check_fit("upper", str, needed);
 	/* ICU requires the status to be reset before every call */
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
 	icu_test_check_status("icu_lower", str, status);
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
 	icu_test_check_status("icu_title", str, status);
 	status = U_ZERO_ERROR;
 	ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
 	icu_test_check_status("icu_upper", str, status);
 	if (strcmp(lower, icu_lower) != 0)
 	{
 		printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
 			   icu_lower);
 		exit(1);
 	}
 	if (strcmp(title, icu_title) != 0)
 	{
 		printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
 			   icu_title);
 		exit(1);
 	}
 	if (strcmp(upper, icu_upper) != 0)
 	{
 		printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
 			   icu_upper);
 		exit(1);
 	}
 }
 /*
 * Exhaustively compare case mappings with the results from ICU.
 */
@ -64,6 +161,7 @@ test_icu(void)
 		if (category != PG_U_UNASSIGNED)
 		{
 			uint8_t		icu_category = u_charType(code);
 			char		code_str[5] = {0};
 			if (icu_category == PG_U_UNASSIGNED)
 			{
@ -72,6 +170,9 @@ test_icu(void)
 			}
 			icu_test_simple(code);
 			unicode_to_utf8(code, (unsigned char *) code_str);
 			icu_test_full(code_str);
 			successful++;
 		}
 	}
@ -86,7 +187,7 @@ test_icu(void)
 #endif
 static void
-test_strlower(const char *test_string, const char *expected)
+test_convert(TestFunc tfunc, const char *test_string, const char *expected)
 {
 	size_t		src1len = strlen(test_string);
 	size_t		src2len = -1;	/* NUL-terminated */
@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
 	/* neither source nor destination are NUL-terminated */
 	memset(dst1, 0x7F, dst1len);
-	needed = unicode_strlower(dst1, dst1len, src1, src1len);
+	needed = tfunc(dst1, dst1len, src1, src1len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test1 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
 			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (memcmp(dst1, expected, dst1len) != 0)
@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
 	/* destination is NUL-terminated and source is not */
 	memset(dst2, 0x7F, dst2len);
-	needed = unicode_strlower(dst2, dst2len, src1, src1len);
+	needed = tfunc(dst2, dst2len, src1, src1len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test2 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
 			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (strcmp(dst2, expected) != 0)
@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
 	/* source is NUL-terminated and destination is not */
 	memset(dst1, 0x7F, dst1len);
-	needed = unicode_strlower(dst1, dst1len, src2, src2len);
+	needed = tfunc(dst1, dst1len, src2, src2len);
 	if (needed != strlen(expected))
 	{
 		printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
 			   test_string, needed, strlen(expected));
 		printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
 		exit(1);
 	}
@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
 	/* both source and destination are NUL-terminated */
 	memset(dst2, 0x7F, dst2len);
-	needed = unicode_strlower(dst2, dst2len, src2, src2len);
+	needed = tfunc(dst2, dst2len, src2, src2len);
 	if (needed != strlen(expected))
 	{
-		printf("case_test: convert_case test4 FAILURE: needed %zu\n", needed);
+		printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
 			   test_string, needed, strlen(expected));
 		exit(1);
 	}
 	if (strcmp(dst2, expected) != 0)
@ -166,15 +272,69 @@ test_strlower(const char *test_string, const char *expected)
 	free(dst2);
 }
 /* TestFunc adapter: lowercase conversion with full (special) mappings on */
 static size_t
 tfunc_lower(char *dst, size_t dstsize, const char *src,
 			ssize_t srclen)
 {
 	size_t		needed = unicode_strlower(dst, dstsize, src, srclen, true);
 	return needed;
 }
 static size_t
 tfunc_title(char *dst, size_t dstsize, const char *src,
 			ssize_t srclen)
 {
 	struct WordBoundaryState wbstate = {
 		.str = src,
 		.len = srclen,
 		.offset = 0,
 		.init = false,
 		.prev_alnum = false,
 	};
 	return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
 							&wbstate);
 }
 /* TestFunc adapter: uppercase conversion with full (special) mappings on */
 static size_t
 tfunc_upper(char *dst, size_t dstsize, const char *src,
 			ssize_t srclen)
 {
 	size_t		needed = unicode_strupper(dst, dstsize, src, srclen, true);
 	return needed;
 }
 static void
 test_convert_case()
 {
 	/* test string with no case changes */
-	test_strlower("√∞", "√∞");
+	test_convert(tfunc_lower, "√∞", "√∞");
 	/* test adjust-to-cased behavior */
 	test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
 	/* test string with case changes */
-	test_strlower("ABC", "abc");
+	test_convert(tfunc_upper, "abc", "ABC");
 	/* test string with case changes and byte length changes */
-	test_strlower("ȺȺȺ", "ⱥⱥⱥ");
+	test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
 	/* test special case conversions */
 	test_convert(tfunc_upper, "ß", "SS");
 	test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
 	test_convert(tfunc_upper, "ıiIİ", "IIIİ");
 	/* test final sigma */
 	test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
 	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
 	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
 #ifdef USE_ICU
 	icu_test_full("");
 	icu_test_full("ȺȺȺ");
 	icu_test_full("ßßß");
 	icu_test_full("√∞");
 	icu_test_full("a b");
 	icu_test_full("abc 123xyz");
 	icu_test_full("σςΣ ΣΣΣ");
 	icu_test_full("ıiIİ");
 	/* test <alpha><iota_subscript><acute> */
 	icu_test_full("\u0391\u0345\u0301");
 #endif
 	printf("case_test: convert_case: success\n");
 }
@ -182,6 +342,22 @@ test_convert_case()
 int
 main(int argc, char **argv)
 {
 #ifdef USE_ICU
 	UErrorCode	status = U_ZERO_ERROR;
 	/*
 	 * Disable ICU's word break adjustment for titlecase to match the expected
 	 * behavior of unicode_strtitle().
 	 */
 	casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT, &status);
 	if (U_FAILURE(status))
 	{
 		printf("case_test: failure opening UCaseMap: %s\n",
 			   u_errorName(status));
 		exit(1);
 	}
 #endif
 	printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
 #ifdef USE_ICU
 	printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
@ -191,5 +367,9 @@ main(int argc, char **argv)
 #endif
 	test_convert_case();
 #ifdef USE_ICU
 	ucasemap_close(casemap);
 #endif
 	exit(0);
 }
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@ -3,7 +3,7 @@
 # Generate Unicode character case mappings. Does not include tailoring
 # or locale-specific mappings.
 #
-# Input: UnicodeData.txt
+# Input: SpecialCasing.txt UnicodeData.txt
 # Output: unicode_case_table.h
 #
 # Copyright (c) 2000-2025, PostgreSQL Global Development Group
@ -21,6 +21,10 @@ GetOptions('outdir:s' => \$output_path);
 my $output_table_file = "$output_path/unicode_case_table.h";
 # The maximum number of codepoints that can result from case mapping
 # of a single character. See Unicode section 5.18 "Case Mappings".
 my $MAX_CASE_EXPANSION = 3;
 my $FH;
 my %simple = ();
@ -51,6 +55,98 @@ while (my $line = <$FH>)
 }
 close $FH;
 # Map for special casing rules that aren't represented in the simple
 # mapping. Language-sensitive mappings are not supported.
 #
 # See https://www.unicode.org/reports/tr44/#SpecialCasing.txt, or the
 # SpecialCasing.txt file itself for details.
 # for now, only Final_Sigma is supported
 my %condition_map = (Final_Sigma => 'PG_U_FINAL_SIGMA');
 my %special = ();
 open($FH, '<', "$output_path/SpecialCasing.txt")
  or die "Could not open $output_path/SpecialCasing.txt: $!.";
 # Read the unconditional and context-sensitive rules. After comments are
 # stripped, each data line is split on ';' into
 #   code; lower; title; upper; [conditions]
 # where the three mapping fields are whitespace-separated hexadecimal
 # codepoints and the optional fifth field lists condition names.
 while (my $line = <$FH>)
 {
 	# language-sensitive mappings not supported
 	last if $line =~ /\# Language-Sensitive Mappings/;
 	# remove comments
 	$line =~ s/^(.*?)#.*$/$1/s;
 	# ignore empty lines
 	next unless $line =~ /;/;
 	my @elts = split /;/, $line;
 	my $code = hex($elts[0]);
 	# Codepoint may map to multiple characters when converting
 	# case. Split each mapping on whitespace and extract the
 	# hexadecimal into an array of codepoints.
 	my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
 	my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
 	my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
 	my @conditions = map {
 		# supporting negated conditions may require storing a
 		# mask of relevant conditions for a given rule to differentiate
 		# between lack of a condition and a negated condition
 		die "negated conditions not supported" if /^Not_/;
 		$condition_map{$_} || die "unrecognized condition: $_"
 	} (grep /\w+/, (split /\s+/, $elts[4]));
 	# OR the condition flag names together; '0' when the rule has no
 	# conditions
 	my $cond_str = (join '|', @conditions) || '0';
 	# if empty, create a self-mapping
 	push @lower, $code if (scalar @lower == 0);
 	push @title, $code if (scalar @title == 0);
 	push @upper, $code if (scalar @upper == 0);
 	# none should map to more than 3 codepoints
 	die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
 	  if (scalar @lower) > $MAX_CASE_EXPANSION;
 	die "titlecase expansion for 0x$elts[0] exceeds maximum: '$elts[2]'"
 	  if (scalar @title) > $MAX_CASE_EXPANSION;
 	die "uppercase expansion for 0x$elts[0] exceeds maximum: '$elts[3]'"
 	  if (scalar @upper) > $MAX_CASE_EXPANSION;
 	# pad arrays to a fixed length of 3
 	# (zero entries mark the unused trailing slots)
 	while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
 	while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
 	while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
 	# Characters with special mappings may not have simple mappings;
 	# ensure that an entry exists.
 	$simple{$code} ||= {
 		Simple_Lowercase => $code,
 		Simple_Titlecase => $code,
 		Simple_Uppercase => $code
 	};
 	# Multiple special case rules for a single codepoint could be
 	# supported by making several entries for each codepoint, and have
 	# the simple mapping point to the first entry. The caller could
 	# scan forward looking for an entry that matches the conditions,
 	# or fall back to the normal behavior.
 	die "multiple special case mappings not supported"
 	  if defined $special{$code};
 	# record the rule; array references keep the three padded mappings
 	$special{$code} = {
 		Lowercase => \@lower,
 		Titlecase => \@title,
 		Uppercase => \@upper,
 		Conditions => $cond_str
 	};
 }
 close $FH;
 # assign sequential array indexes to the special mappings
 my $special_idx = 0;
 foreach my $code (sort { $a <=> $b } (keys %special))
 {
 	$special{$code}{Index} = $special_idx++;
 }
 # Start writing out the output files
 open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";
@ -63,6 +159,8 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
 	$num_simple++ unless $code < 0x80;
 }
 my $num_special = scalar(keys %special) + 1;
 print $OT <<"EOS";
 /*-------------------------------------------------------------------------
 *
@ -86,6 +184,19 @@ print $OT <<"EOS";
 #include "common/unicode_case.h"
 #include "mb/pg_wchar.h"
 /*
 * The maximum number of codepoints that can result from case mapping
 * of a single character. See Unicode section 5.18 "Case Mappings".
 */
 #define MAX_CASE_EXPANSION 3
 /*
 * Case mapping condition flags. For now, only Final_Sigma is supported.
 *
 * See Unicode Context Specification for Casing.
 */
 #define PG_U_FINAL_SIGMA		(1 << 0)
 typedef enum
 {
 	CaseLower = 0,
@ -94,12 +205,47 @@ typedef enum
 	NCaseKind
 } CaseKind;
 typedef struct
 {
 	pg_wchar	codepoint;		/* Unicode codepoint */
 	int16		conditions;
 	pg_wchar	map[NCaseKind][MAX_CASE_EXPANSION];
 } pg_special_case;
 typedef struct
 {
 	pg_wchar	codepoint;		/* Unicode codepoint */
 	pg_wchar	simplemap[NCaseKind];
 	const pg_special_case *special_case;
 } pg_case_map;
 /*
 * Special case mappings that aren't representable in the simple map.
 * Entries are referenced from simple_case_map.
 */
 static const pg_special_case special_case[$num_special] =
 {
 EOS
 foreach my $code (sort { $a <=> $b } (keys %special))
 {
 	die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
 	die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
 	die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
 	my $lower = join ", ",
 	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
 	my $title = join ", ",
 	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
 	my $upper = join ", ",
 	  (map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
 	printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
 	printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
 }
 print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
 print $OT <<"EOS";
 };
 /*
 * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
 * sparse for higher codepoints (requiring scan or binary search).
@ -114,8 +260,10 @@ for (my $code = 0; $code < 0x80; $code++)
 	my $lc = ($simple{$code}{Simple_Lowercase} || $code);
 	my $tc = ($simple{$code}{Simple_Titlecase} || $code);
 	my $uc = ($simple{$code}{Simple_Uppercase} || $code);
 	die "unexpected special case for code $code"
 	  if defined $special{$code};
 	printf $OT
-	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
+	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
 	  $code, $lc, $tc, $uc;
 }
 printf $OT "\n";
@ -126,9 +274,14 @@ foreach my $code (sort { $a <=> $b } (keys %simple))
 	next unless $code >= 0x80;    # already output above
 	my $map = $simple{$code};
 	my $special_case = "NULL";
 	if (exists $special{$code})
 	{
 		$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
 	}
 	printf $OT
-	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
+	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
 	  $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
-	  $map->{Simple_Uppercase};
+	  $map->{Simple_Uppercase}, $special_case;
 }
 print $OT "};\n";
--- a/src/common/unicode/meson.build
+++ b/src/common/unicode/meson.build
@ -11,7 +11,7 @@ endif
 # These files are part of the Unicode Character Database. Download them on
 # demand.
-foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'UnicodeData.txt']
+foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
  url = unicode_baseurl.format(UNICODE_VERSION, f)
  target = custom_target(f,
    output: f,
@ -26,7 +26,7 @@ update_unicode_targets = []
 update_unicode_targets += \
  custom_target('unicode_case_table.h',
-    input: [unicode_data['UnicodeData.txt']],
+    input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
    output: ['unicode_case_table.h'],
    command: [
      perl, files('generate-unicode_case_table.pl'),
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@ -17,12 +17,15 @@
 #include "common/unicode_case.h"
 #include "common/unicode_case_table.h"
 #include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 static const pg_case_map *find_case_map(pg_wchar ucs);
 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-						   CaseKind str_casekind, WordBoundaryNext wbnext,
+						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 						   void *wbstate);
 static bool check_special_conditions(int conditions, const char *str,
 									 size_t len, size_t offset);
 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 *
 * If full is true, use special case mappings if available and if the
 * conditions are satisfied.
 */
 size_t
-unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 				 bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+	return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
 						NULL);
 }
 /*
@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 *
 * If full is true, use special case mappings if available and if the
 * conditions are satisfied. Otherwise, use only simple mappings and use
 * uppercase instead of titlecase.
 *
 * Titlecasing requires knowledge about word boundaries, which is provided by
 * the callback wbnext. A word boundary is the offset of the start of a word
 * or the offset of the character immediately following a word.
@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 */
 size_t
 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-				 WordBoundaryNext wbnext, void *wbstate)
+				 bool full, WordBoundaryNext wbnext, void *wbstate)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+	return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
 						wbstate);
 }
@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 *
 * If full is true, use special case mappings if available and if the
 * conditions are satisfied.
 */
 size_t
-unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 				 bool full)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
+	return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
 						NULL);
 }
 /*
 * Implement Unicode Default Case Conversion algorithm.
 *
 * If str_casekind is CaseLower or CaseUpper, map each character in the string
 * for which a mapping is available.
 *
 * If str_casekind is CaseTitle, maps characters found on a word boundary to
- * uppercase and other characters to lowercase.
+ * titlecase (or uppercase if full is false) and other characters to
 * lowercase. NB: does not currently implement the Unicode behavior in which
 * the word boundary is adjusted to the next Cased character. That behavior
 * could be implemented as an option, but it doesn't match the default
 * behavior of ICU, nor does it match the documented behavior of INITCAP().
 *
 * If full is true, use special mappings for relevant characters, which can
 * map a single codepoint to multiple codepoints, or depend on conditions.
 */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-			 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
+			 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
 			 void *wbstate)
 {
 	/* character CaseKind varies while titlecasing */
 	CaseKind	chr_casekind = str_casekind;
@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 		int			u1len = unicode_utf8len(u1);
 		const pg_case_map *casemap = find_case_map(u1);
 		const pg_special_case *special = NULL;
 		if (str_casekind == CaseTitle)
 		{
 			if (srcoff == boundary)
 			{
-				chr_casekind = CaseUpper;
+				chr_casekind = full ? CaseTitle : CaseUpper;
 				boundary = wbnext(wbstate);
 			}
 			else
 				chr_casekind = CaseLower;
 		}
 		/*
 		 * Find special case that matches the conditions, if any.
 		 *
 		 * Note: only a single special mapping per codepoint is currently
 		 * supported, though Unicode allows for multiple special mappings for
 		 * a single codepoint.
 		 */
 		if (full && casemap && casemap->special_case)
 		{
 			int16		conditions = casemap->special_case->conditions;
 			Assert(casemap->special_case->codepoint == u1);
 			if (check_special_conditions(conditions, src, srclen, srcoff))
 				special = casemap->special_case;
 		}
 		/* perform mapping, update result_len, and write to dst */
-		if (casemap)
+		if (special)
 		{
 			for (int i = 0; i < MAX_CASE_EXPANSION; i++)
 			{
 				pg_wchar	u2 = special->map[chr_casekind][i];
 				size_t		u2len = unicode_utf8len(u2);
 				if (u2 == '\0')
 					break;
 				if (result_len + u2len <= dstsize)
 					unicode_to_utf8(u2, (unsigned char *) dst + result_len);
 				result_len += u2len;
 			}
 		}
 		else if (casemap)
 		{
 			pg_wchar	u2 = casemap->simplemap[chr_casekind];
 			pg_wchar	u2len = unicode_utf8len(u2);
@ -197,6 +257,82 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	return result_len;
 }
 /*
  * Check that the condition matches Final_Sigma, described in Unicode Table
  * 3-17. The character at the given offset must be preceded by a Cased
  * character, and must not be followed by a Cased character; in both
  * directions any intervening Case_Ignorable characters are skipped.
  *
  * NB: some characters may be both Cased and Case_Ignorable, in which case
  * they are ignored.
  *
  * str is the UTF-8 string, len the number of bytes available (scanning also
  * stops at a NUL byte), and offset the byte offset of the character being
  * tested. Returns true if the Final_Sigma context is satisfied.
  */
 static bool
 check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 {
 	bool		preceded_by_cased = false;
 	/*
 	 * Iterate backwards, looking for a Cased character. When offset is zero
 	 * the loop body never runs: the start of the string is not preceded by
 	 * anything.
 	 */
 	for (size_t i = offset; i-- > 0;)
 	{
 		pg_wchar	curr;
 		/* skip continuation bytes; decode only at the start of a sequence */
 		if ((str[i] & 0xC0) == 0x80)
 			continue;
 		curr = utf8_to_unicode(str + i);
 		if (pg_u_prop_case_ignorable(curr))
 			continue;
 		if (!pg_u_prop_cased(curr))
 			return false;
 		preceded_by_cased = true;
 		break;
 	}
 	/*
 	 * Reaching the start of the string after seeing only Case_Ignorable
 	 * characters does not satisfy the condition: an actual Cased character
 	 * is required.
 	 */
 	if (!preceded_by_cased)
 		return false;
 	/* end of string is not followed by a Cased character */
 	if (offset == len)
 		return true;
 	/*
 	 * Iterate forwards, looking for a Cased character. Starting one byte
 	 * past offset is enough, because any remaining bytes of the current
 	 * character are continuation bytes and get skipped.
 	 */
 	for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
 	{
 		pg_wchar	curr;
 		if ((str[i] & 0xC0) == 0x80)
 			continue;
 		curr = utf8_to_unicode(str + i);
 		if (pg_u_prop_case_ignorable(curr))
 			continue;
 		/* first significant character decides the outcome */
 		return !pg_u_prop_cased(curr);
 	}
 	return true;
 }
 /*
  * Decide whether a special case mapping with the given condition flags
  * applies to the character at 'offset' in 'str'. A value of zero means the
  * mapping is unconditional; PG_U_FINAL_SIGMA is the only condition handled.
  */
 static bool
 check_special_conditions(int conditions, const char *str, size_t len,
 						 size_t offset)
 {
 	switch (conditions)
 	{
 		case 0:
 			return true;
 		case PG_U_FINAL_SIGMA:
 			return check_final_sigma((unsigned char *) str, len, offset);
 		default:
 			break;
 	}
 	/* no other conditions supported */
 	Assert(false);
 	return false;
 }
 /* find entry in simple case map, if any */
 static const pg_case_map *
 find_case_map(pg_wchar ucs)
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@ -22,11 +22,11 @@ pg_wchar	unicode_lowercase_simple(pg_wchar code);
 pg_wchar	unicode_titlecase_simple(pg_wchar code);
 pg_wchar	unicode_uppercase_simple(pg_wchar code);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
-							 ssize_t srclen);
+							 ssize_t srclen, bool full);
 size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
-							 ssize_t srclen, WordBoundaryNext wbnext,
+							 ssize_t srclen, bool full,
-							 void *wbstate);
+							 WordBoundaryNext wbnext, void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
-							 ssize_t srclen);
+							 ssize_t srclen, bool full);
 #endif							/* UNICODE_CASE_H */
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@ -3754,6 +3754,7 @@ pg_sha256_ctx
 pg_sha384_ctx
 pg_sha512_ctx
 pg_snapshot
 pg_special_case
 pg_stack_base_t
 pg_time_t
 pg_time_usec_t