mirror of
https://github.com/postgres/postgres.git
synced 2025-06-16 06:01:02 +03:00
An important step of SASLprep normalization is to convert the string to Unicode normalization form NFKC. Unicode normalization requires a fairly large table of character decompositions, which is generated from data published by the Unicode consortium. The script to generate the table is put in src/common/unicode, as well as test code for the normalization. A pre-generated version of the tables is included in src/include/common, so you don't need the code in src/common/unicode to build PostgreSQL, only if you wish to modify the normalization tables. The SASLprep implementation depends on the UTF-8 functions from src/backend/utils/mb/wchar.c. So to use it, you must also compile and link that. That doesn't change anything for the current users of these functions, the backend and libpq, as they both already link with wchar.o. It would be good to move those functions into a separate file in src/common, but I'll leave that for another day. No documentation changes included, because there are no details on the SCRAM mechanism in the docs anyway. An overview on that in the protocol specification would probably be good, even though SCRAM is documented in detail in RFC5802. I'll write that as a separate patch. An important thing to mention there is that we apply SASLprep even on invalid UTF-8 strings, to support other encodings. Patch by Michael Paquier and me. Discussion: https://www.postgresql.org/message-id/CAB7nPqSByyEmAVLtEf1KxTRh=PWNKiWKEKQR=e1yGehz=wbymQ@mail.gmail.com
103 lines
2.5 KiB
Perl
103 lines
2.5 KiB
Perl
#!/usr/bin/perl
#
# Read Unicode consortium's normalization test suite, NormalizationTest.txt,
# and generate a C array from it, for norm_test.c.
#
# NormalizationTest.txt is part of the Unicode Character Database.
#
# Copyright (c) 2000-2017, PostgreSQL Global Development Group

use strict;
use warnings;

use File::Basename;

die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
my $input_file  = $ARGV[0];
my $output_file = $ARGV[1];
my $output_base = basename($output_file);

# Open the input and output files.  Use three-argument open so that a
# filename beginning with e.g. '>' or '|' cannot change the open mode.
open my $INPUT, '<', $input_file
  or die "Could not open input file $input_file: $!\n";
open my $OUTPUT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

# Print header of output file.
print $OUTPUT <<HEADER;
/*-------------------------------------------------------------------------
 *
 * norm_test_table.h
 *	  Test strings for Unicode normalization.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/common/unicode/norm_test_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-norm_test_table.pl, do
 * not edit. There is deliberately not an #ifndef PG_NORM_TEST_TABLE_H
 * here.
 */
typedef struct
{
	int			linenum;
	pg_wchar	input[50];
	pg_wchar	output[50];
} pg_unicode_test;

/* test table */
HEADER
print $OUTPUT
  "static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n";
# Helper routine to convert a space-separated list of Unicode code points
# to a hexadecimal list format, suitable for outputting in a C array.
#
# For example, "0044 0307" becomes "0x0044, 0x0307, 0".  A trailing zero
# is always appended, so the emitted C array is null-terminated.
sub codepoint_string_to_hex
{
	my $codepoint_string = shift;

	# Start from an empty string, so the concatenation below never
	# operates on undef.
	my $result = '';

	foreach my $cp (split(' ', $codepoint_string))
	{
		$result .= "0x$cp, ";
	}
	$result .= '0';    # null-terminate the array

	return $result;
}
# Process the input file line by line.  Each data line in
# NormalizationTest.txt holds one test case; we emit one C struct
# initializer per case, keyed by the input file's line number so test
# failures can be traced back to the Unicode data file.
my $linenum = 0;
while (my $line = <$INPUT>)
{
	$linenum++;
	if ($line =~ /^\s*#/) { next; }    # ignore comments

	if ($line =~ /^@/) { next; }       # ignore @Part0 like headers

	# Split the line wanted and get the fields needed:
	#
	# source; NFC; NFD; NFKC; NFKD
	my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);

	# Only the source string and its NFKC form are used by norm_test.c.
	my $source_utf8 = codepoint_string_to_hex($source);
	my $nfkc_utf8   = codepoint_string_to_hex($nfkc);

	print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n";
}

# Output terminator entry
print $OUTPUT "\t{ 0, { 0 }, { 0 } }";
print $OUTPUT "\n};\n";

# Check the result of closing the output handle: buffered write errors
# (e.g. disk full) are only reported at close time.
close $OUTPUT
  or die "Could not write output file $output_file: $!\n";
close $INPUT;