mirror of
https://github.com/postgres/postgres.git
synced 2025-09-02 04:21:28 +03:00
Add SQL functions for Unicode normalization
This adds SQL expressions NORMALIZE() and IS NORMALIZED to convert and check Unicode normal forms, per SQL standard. To support fast IS NORMALIZED tests, we pull in a new data file DerivedNormalizationProps.txt from Unicode and build a lookup table from that, using techniques similar to ones already used for other Unicode data. make update-unicode will keep it up to date. We only build and use these tables for the NFC and NFKC forms, because they are too big for NFD and NFKD and the improvement is not significant enough there. Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Andreas Karlsson <andreas@proxel.se> Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
This commit is contained in:
86
src/common/unicode/generate-unicode_normprops_table.pl
Normal file
86
src/common/unicode/generate-unicode_normprops_table.pl
Normal file
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/perl
|
||||
#
|
||||
# Generate table of Unicode normalization "quick check" properties
|
||||
# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
|
||||
# output is on stdout.
|
||||
#
|
||||
# Copyright (c) 2020, PostgreSQL Global Development Group
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
# Quick-check values collected from the input file, keyed first by
# property name and then by numeric code point.
my %data;

# Start the generated C file with the "do not edit" banner.
print
  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

# Emit the include and the row type shared by all lookup tables.  The
# text contains nothing to interpolate, so the heredoc terminator is
# quoted to keep it literal.
print <<'END_OF_HEADER';
#include "common/unicode_norm.h"

/*
 * We use a bit field here to save space.
 */
typedef struct
{
	unsigned int codepoint:21;
	signed int quickcheck:4; /* really UnicodeNormalizationQC */
} pg_unicode_normprops;
END_OF_HEADER
|
||||
|
||||
# Read the input file(s) named on the command line and record every
# quick-check entry as $data{property}{code point} = value.
#
# Each data line is split on ";" into a code point field (a single hex
# code point or a "first..last" range), a property name and a value; a
# trailing "#" comment is ignored.  Only properties whose name contains
# "_QC" are kept, and ranges are expanded to individual code points.
#
# The input is consumed one line at a time rather than being read into a
# list first, which also lets die() report the offending input line.
while (my $line = <ARGV>)
{
	chomp $line;

	# Drop comments and surrounding whitespace; skip lines left empty.
	$line =~ s/\s*#.*$//;
	$line =~ s/^\s+//;
	$line =~ s/\s+$//;
	next if $line eq '';

	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;

	# A line without any ";" has no property field; skip it like any
	# other non-quick-check line instead of matching against undef.
	next if !defined $prop || $prop !~ /_QC/;

	# Fail loudly on a malformed first field rather than letting hex()
	# turn it into a bogus code point.
	$codepoint =~ /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?$/
	  or die "unrecognized code point field \"$codepoint\"";
	my $first = hex($1);
	my $last = defined $2 ? hex($2) : $first;

	foreach my $cp ($first .. $last)
	{
		$data{$prop}{$cp} = $value;
	}
}
|
||||
|
||||
# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.
#
# Each array is printed as a C initializer with one {code point, value}
# row per recorded code point, in ascending code point order.

# Quick-check values accepted in the input, mapped to the C constant
# written for them.  Any other value is an error.
my %qc_constant_for = (
	'N' => 'UNICODE_NORM_QC_NO',
	'M' => 'UNICODE_NORM_QC_MAYBE');

foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big.  See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	print "\n";
	print "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	# Work through the reference; there is no need to copy the hash.
	my $value_of = $data{$prop};

	foreach my $cp (sort { $a <=> $b } keys %$value_of)
	{
		my $value = $value_of->{$cp};
		my $qc = defined $value ? $qc_constant_for{$value} : undef;

		# Name the offending entry so that a bad input file can be
		# diagnosed, instead of dying without any message.
		if (!defined $qc)
		{
			my $shown = defined $value ? "\"$value\"" : "(missing)";
			die sprintf("unexpected %s value %s for code point U+%04X\n",
				$prop, $shown, $cp);
		}

		printf "\t{0x%04X, %s},\n", $cp, $qc;
	}

	print "};\n";
}
|
Reference in New Issue
Block a user