Add support for other normal forms to Unicode normalization API

Previously, the API supported only NFKC, for use by SASLprep. This change expands the API to offer the choice of all four normalization forms (NFC, NFD, NFKC, and NFKD). Right now, there are no internal users of the forms other than NFKC.

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
2025-09-03 15:22:11 +03:00 · 2020-03-24 08:49:52 +01:00
parent cedffbdb8b
commit d40d564c5a
7 changed files with 3728 additions and 3703 deletions
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -48,7 +48,7 @@ typedef struct
 {
 	int			linenum;
 	pg_wchar	input[50];
-	pg_wchar	output[50];
+	pg_wchar	output[4][50];
 } pg_unicode_test;

 /* test table */
@@ -89,13 +89,16 @@ while (my $line = <$INPUT>)
 	my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);

 	my $source_utf8 = codepoint_string_to_hex($source);
+	my $nfc_utf8    = codepoint_string_to_hex($nfc);
+	my $nfd_utf8    = codepoint_string_to_hex($nfd);
 	my $nfkc_utf8   = codepoint_string_to_hex($nfkc);
+	my $nfkd_utf8   = codepoint_string_to_hex($nfkd);

-	print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n";
+	print $OUTPUT "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
 }

 # Output terminator entry
-print $OUTPUT "\t{ 0, { 0 }, { 0 } }";
+print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
 print $OUTPUT "\n};\n";

 close $OUTPUT;
--- a/src/common/unicode/generate-unicode_norm_table.pl
+++ b/src/common/unicode/generate-unicode_norm_table.pl
@@ -99,10 +99,12 @@ typedef struct
 #define DECOMP_NO_COMPOSE	0x80	/* don't use for re-composition */
 #define DECOMP_INLINE		0x40	/* decomposition is stored inline in
 									 * dec_index */
+#define DECOMP_COMPAT		0x20	/* compatibility mapping */

-#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
-#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
+#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
+#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
 #define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
+#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)

 /* Table of Unicode codepoints and their decompositions */
 static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
@@ -136,22 +138,22 @@ foreach my $char (@characters)
 	# Decomposition size
 	# Print size of decomposition
 	my $decomp_size = scalar(@decomp_elts);
+	die if $decomp_size > 0x1F;		# to not overrun bitmask

 	my $first_decomp = shift @decomp_elts;

 	my $flags   = "";
 	my $comment = "";

+	if ($compat)
+	{
+		$flags .= " | DECOMP_COMPAT";
+	}
+
 	if ($decomp_size == 2)
 	{
-
 		# Should this be used for recomposition?
-		if ($compat)
-		{
-			$flags .= " | DECOMP_NO_COMPOSE";
-			$comment = "compatibility mapping";
-		}
-		elsif ($character_hash{$first_decomp}
+		if ($character_hash{$first_decomp}
 			&& $character_hash{$first_decomp}->{class} != 0)
 		{
 			$flags .= " | DECOMP_NO_COMPOSE";
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -63,18 +63,21 @@ main(int argc, char **argv)

 	for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
 	{
-		pg_wchar   *result;
-
-		result = unicode_normalize_kc(test->input);
-
-		if (pg_wcscmp(test->output, result) != 0)
+		for (int form = 0; form < 4; form++)
 		{
-			printf("FAILURE (NormalizationTest.txt line %d):\n", test->linenum);
-			printf("input:    %s\n", print_wchar_str(test->input));
-			printf("expected: %s\n", print_wchar_str(test->output));
-			printf("got:      %s\n", print_wchar_str(result));
-			printf("\n");
-			exit(1);
+			pg_wchar   *result;
+
+			result = unicode_normalize(form, test->input);
+
+			if (pg_wcscmp(test->output[form], result) != 0)
+			{
+				printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
+				printf("input:    %s\n", print_wchar_str(test->input));
+				printf("expected: %s\n", print_wchar_str(test->output[form]));
+				printf("got:      %s\n", print_wchar_str(result));
+				printf("\n");
+				exit(1);
+			}
 		}
 	}