mirror of
https://github.com/postgres/postgres.git
synced 2025-09-03 15:22:11 +03:00
Add support for other normal forms to Unicode normalization API
The Unicode normalization API previously supported only NFKC, for use by SASLprep. This commit expands the API to offer a choice of all four normalization forms (NFC, NFD, NFKC, and NFKD). Right now, there are no internal users of the forms other than NFKC.

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
This commit is contained in:
@@ -48,7 +48,7 @@ typedef struct
|
||||
{
|
||||
int linenum;
|
||||
pg_wchar input[50];
|
||||
pg_wchar output[50];
|
||||
pg_wchar output[4][50];
|
||||
} pg_unicode_test;
|
||||
|
||||
/* test table */
|
||||
@@ -89,13 +89,16 @@ while (my $line = <$INPUT>)
|
||||
my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
|
||||
|
||||
my $source_utf8 = codepoint_string_to_hex($source);
|
||||
my $nfc_utf8 = codepoint_string_to_hex($nfc);
|
||||
my $nfd_utf8 = codepoint_string_to_hex($nfd);
|
||||
my $nfkc_utf8 = codepoint_string_to_hex($nfkc);
|
||||
my $nfkd_utf8 = codepoint_string_to_hex($nfkd);
|
||||
|
||||
print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n";
|
||||
print $OUTPUT "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
|
||||
}
|
||||
|
||||
# Output terminator entry
|
||||
print $OUTPUT "\t{ 0, { 0 }, { 0 } }";
|
||||
print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
|
||||
print $OUTPUT "\n};\n";
|
||||
|
||||
close $OUTPUT;
|
||||
|
@@ -99,10 +99,12 @@ typedef struct
|
||||
#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */
|
||||
#define DECOMP_INLINE 0x40 /* decomposition is stored inline in
|
||||
* dec_index */
|
||||
#define DECOMP_COMPAT 0x20 /* compatibility mapping */
|
||||
|
||||
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
|
||||
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
|
||||
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
|
||||
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
|
||||
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
|
||||
#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
|
||||
|
||||
/* Table of Unicode codepoints and their decompositions */
|
||||
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
|
||||
@@ -136,22 +138,22 @@ foreach my $char (@characters)
|
||||
# Decomposition size
|
||||
# Print size of decomposition
|
||||
my $decomp_size = scalar(@decomp_elts);
|
||||
die if $decomp_size > 0x1F; # to not overrun bitmask
|
||||
|
||||
my $first_decomp = shift @decomp_elts;
|
||||
|
||||
my $flags = "";
|
||||
my $comment = "";
|
||||
|
||||
if ($compat)
|
||||
{
|
||||
$flags .= " | DECOMP_COMPAT";
|
||||
}
|
||||
|
||||
if ($decomp_size == 2)
|
||||
{
|
||||
|
||||
# Should this be used for recomposition?
|
||||
if ($compat)
|
||||
{
|
||||
$flags .= " | DECOMP_NO_COMPOSE";
|
||||
$comment = "compatibility mapping";
|
||||
}
|
||||
elsif ($character_hash{$first_decomp}
|
||||
if ($character_hash{$first_decomp}
|
||||
&& $character_hash{$first_decomp}->{class} != 0)
|
||||
{
|
||||
$flags .= " | DECOMP_NO_COMPOSE";
|
||||
|
@@ -63,18 +63,21 @@ main(int argc, char **argv)
|
||||
|
||||
for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
|
||||
{
|
||||
pg_wchar *result;
|
||||
|
||||
result = unicode_normalize_kc(test->input);
|
||||
|
||||
if (pg_wcscmp(test->output, result) != 0)
|
||||
for (int form = 0; form < 4; form++)
|
||||
{
|
||||
printf("FAILURE (NormalizationTest.txt line %d):\n", test->linenum);
|
||||
printf("input: %s\n", print_wchar_str(test->input));
|
||||
printf("expected: %s\n", print_wchar_str(test->output));
|
||||
printf("got: %s\n", print_wchar_str(result));
|
||||
printf("\n");
|
||||
exit(1);
|
||||
pg_wchar *result;
|
||||
|
||||
result = unicode_normalize(form, test->input);
|
||||
|
||||
if (pg_wcscmp(test->output[form], result) != 0)
|
||||
{
|
||||
printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
|
||||
printf("input: %s\n", print_wchar_str(test->input));
|
||||
printf("expected: %s\n", print_wchar_str(test->output[form]));
|
||||
printf("got: %s\n", print_wchar_str(result));
|
||||
printf("\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user