1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-03 15:22:11 +03:00

Add support for other normal forms to Unicode normalization API

The Unicode normalization API previously supported only NFKC, for use by
SASLprep.  This commit expands the API to offer the choice of all four
normalization forms (NFC, NFD, NFKC, and NFKD).  Right now, there are no
internal users of the forms other than NFKC.

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
This commit is contained in:
Peter Eisentraut
2020-03-24 08:49:52 +01:00
parent cedffbdb8b
commit d40d564c5a
7 changed files with 3728 additions and 3703 deletions

View File

@@ -48,7 +48,7 @@ typedef struct
{
int linenum;
pg_wchar input[50];
pg_wchar output[50];
pg_wchar output[4][50];
} pg_unicode_test;
/* test table */
@@ -89,13 +89,16 @@ while (my $line = <$INPUT>)
my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
my $source_utf8 = codepoint_string_to_hex($source);
my $nfc_utf8 = codepoint_string_to_hex($nfc);
my $nfd_utf8 = codepoint_string_to_hex($nfd);
my $nfkc_utf8 = codepoint_string_to_hex($nfkc);
my $nfkd_utf8 = codepoint_string_to_hex($nfkd);
print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n";
print $OUTPUT "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
}
# Output terminator entry
print $OUTPUT "\t{ 0, { 0 }, { 0 } }";
print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
print $OUTPUT "\n};\n";
close $OUTPUT;

View File

@@ -99,10 +99,12 @@ typedef struct
#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */
#define DECOMP_INLINE 0x40 /* decomposition is stored inline in
* dec_index */
#define DECOMP_COMPAT 0x20 /* compatibility mapping */
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
/* Table of Unicode codepoints and their decompositions */
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
@@ -136,22 +138,22 @@ foreach my $char (@characters)
# Decomposition size
# Print size of decomposition
my $decomp_size = scalar(@decomp_elts);
die if $decomp_size > 0x1F; # to not overrun bitmask
my $first_decomp = shift @decomp_elts;
my $flags = "";
my $comment = "";
if ($compat)
{
$flags .= " | DECOMP_COMPAT";
}
if ($decomp_size == 2)
{
# Should this be used for recomposition?
if ($compat)
{
$flags .= " | DECOMP_NO_COMPOSE";
$comment = "compatibility mapping";
}
elsif ($character_hash{$first_decomp}
if ($character_hash{$first_decomp}
&& $character_hash{$first_decomp}->{class} != 0)
{
$flags .= " | DECOMP_NO_COMPOSE";

View File

@@ -63,18 +63,21 @@ main(int argc, char **argv)
for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
{
pg_wchar *result;
result = unicode_normalize_kc(test->input);
if (pg_wcscmp(test->output, result) != 0)
for (int form = 0; form < 4; form++)
{
printf("FAILURE (NormalizationTest.txt line %d):\n", test->linenum);
printf("input: %s\n", print_wchar_str(test->input));
printf("expected: %s\n", print_wchar_str(test->output));
printf("got: %s\n", print_wchar_str(result));
printf("\n");
exit(1);
pg_wchar *result;
result = unicode_normalize(form, test->input);
if (pg_wcscmp(test->output[form], result) != 0)
{
printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
printf("input: %s\n", print_wchar_str(test->input));
printf("expected: %s\n", print_wchar_str(test->output[form]));
printf("got: %s\n", print_wchar_str(result));
printf("\n");
exit(1);
}
}
}