mirror of
https://github.com/postgres/postgres.git
synced 2025-08-27 07:42:10 +03:00
This makes the normalization quick check about 30% faster for NFC and 50% faster for NFKC than the binary search used previously. The hash lookup reuses the existing array of bit fields used for the binary search to get the quick check property and is generated as part of "make update-unicode" in src/common/unicode/. Author: John Naylor Reviewed-by: Mark Dilger, Michael Paquier Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
126 lines
3.0 KiB
Perl
126 lines
3.0 KiB
Perl
#!/usr/bin/perl
|
|
#
|
|
# Generate table of Unicode normalization "quick check" properties
|
|
# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
|
|
# output is on stdout.
|
|
#
|
|
# Copyright (c) 2020, PostgreSQL Global Development Group
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use FindBin;
|
|
use lib "$FindBin::RealBin/../../tools/";
|
|
use PerfectHash;
|
|
|
|
my %data;
|
|
|
|
print
|
|
"/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";
|
|
|
|
print <<EOS;
|
|
#include "common/unicode_norm.h"
|
|
|
|
/*
|
|
* Normalization quick check entry for codepoint. We use a bit field
|
|
* here to save space.
|
|
*/
|
|
typedef struct
|
|
{
|
|
unsigned int codepoint:21;
|
|
signed int quickcheck:4; /* really UnicodeNormalizationQC */
|
|
} pg_unicode_normprops;
|
|
|
|
/* Typedef for hash function on quick check table */
|
|
typedef int (*qc_hash_func) (const void *key);
|
|
|
|
/* Information for quick check lookup with perfect hash function */
|
|
typedef struct
|
|
{
|
|
const pg_unicode_normprops *normprops;
|
|
qc_hash_func hash;
|
|
int num_normprops;
|
|
} pg_unicode_norminfo;
|
|
EOS
|
|
|
|
foreach my $line (<ARGV>)
|
|
{
|
|
chomp $line;
|
|
$line =~ s/\s*#.*$//;
|
|
next if $line eq '';
|
|
my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
|
|
next if $prop !~ /_QC/;
|
|
|
|
my ($first, $last);
|
|
if ($codepoint =~ /\.\./)
|
|
{
|
|
($first, $last) = split /\.\./, $codepoint;
|
|
}
|
|
else
|
|
{
|
|
$first = $last = $codepoint;
|
|
}
|
|
|
|
foreach my $cp (hex($first) .. hex($last))
|
|
{
|
|
$data{$prop}{$cp} = $value;
|
|
}
|
|
}
|
|
|
|
# We create a separate array for each normalization form rather than,
|
|
# say, a two-dimensional array, because that array would be very
|
|
# sparse and would create unnecessary overhead especially for the NFC
|
|
# lookup.
|
|
foreach my $prop (sort keys %data)
|
|
{
|
|
# Don't build the tables for the "D" forms because they are too
|
|
# big. See also unicode_is_normalized_quickcheck().
|
|
next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";
|
|
|
|
print "\n";
|
|
print
|
|
"static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
|
|
|
|
my %subdata = %{ $data{$prop} };
|
|
my @cp_packed;
|
|
foreach my $cp (sort { $a <=> $b } keys %subdata)
|
|
{
|
|
my $qc;
|
|
if ($subdata{$cp} eq 'N')
|
|
{
|
|
$qc = 'UNICODE_NORM_QC_NO';
|
|
}
|
|
elsif ($subdata{$cp} eq 'M')
|
|
{
|
|
$qc = 'UNICODE_NORM_QC_MAYBE';
|
|
}
|
|
else
|
|
{
|
|
die;
|
|
}
|
|
printf "\t{0x%04X, %s},\n", $cp, $qc;
|
|
|
|
# Save the bytes as a string in network order.
|
|
push @cp_packed, pack('N', $cp);
|
|
}
|
|
|
|
print "};\n";
|
|
|
|
# Emit the definition of the perfect hash function.
|
|
my $funcname = $prop . '_hash_func';
|
|
my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
|
|
fixed_key_length => 4);
|
|
printf "\n/* Perfect hash function for %s */", $prop;
|
|
print "\nstatic $f\n";
|
|
|
|
# Emit the structure that wraps the hash lookup information into
|
|
# one variable.
|
|
printf "/* Hash lookup information for %s */", $prop;
|
|
printf "\nstatic const pg_unicode_norminfo ";
|
|
printf "UnicodeNormInfo_%s = {\n", $prop;
|
|
printf "\tUnicodeNormProps_%s,\n", $prop;
|
|
printf "\t%s,\n", $funcname;
|
|
printf "\t%d\n", scalar @cp_packed;
|
|
printf "};\n";
|
|
}
|