1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-27 07:42:10 +03:00
Files
postgres/src/common/unicode/generate-unicode_normprops_table.pl
Michael Paquier 80f8eb79e2 Use perfect hash for NFC and NFKC Unicode Normalization quick check
This makes the normalization quick check about 30% faster for NFC and
50% faster for NFKC than the binary search used previously.  The hash
lookup reuses the existing array of bit fields used for the binary
search to get the quick check property and is generated as part of "make
update-unicode" in src/common/unicode/.

Author: John Naylor
Reviewed-by: Mark Dilger, Michael Paquier
Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
2020-10-11 19:09:01 +09:00

126 lines
3.0 KiB
Perl

#!/usr/bin/perl
#
# Generate table of Unicode normalization "quick check" properties
# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
# output is on stdout.
#
# Copyright (c) 2020, PostgreSQL Global Development Group
use strict;
use warnings;
use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;
# Quick check values collected from the input file, indexed first by
# property name and then by numeric codepoint.
my %data;

# Banner marking the output as machine-generated.
my $banner =
  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

# C declarations emitted ahead of the generated tables.  The text is
# purely literal, hence the non-interpolating here-document.
my $c_declarations = <<'EOS';
#include "common/unicode_norm.h"
/*
* Normalization quick check entry for codepoint. We use a bit field
* here to save space.
*/
typedef struct
{
unsigned int codepoint:21;
signed int quickcheck:4; /* really UnicodeNormalizationQC */
} pg_unicode_normprops;
/* Typedef for hash function on quick check table */
typedef int (*qc_hash_func) (const void *key);
/* Information for quick check lookup with perfect hash function */
typedef struct
{
const pg_unicode_normprops *normprops;
qc_hash_func hash;
int num_normprops;
} pg_unicode_norminfo;
EOS

print $banner, $c_declarations;
# Read the input named on the command line and record every quick check
# property in %data as $data{property}{codepoint} = value.
#
# Each data line has the form "codepoint-or-range ; property ; value",
# optionally followed by a "#" comment.  Lines are read one at a time
# rather than slurping the whole file into a list first.
while (my $line = <ARGV>)
{
	chomp $line;

	# Strip the trailing comment, together with the whitespace before it.
	$line =~ s/\s*#.*$//;
	next if $line eq '';

	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;

	# Only the *_QC properties are of interest.  A line without any
	# ";" separator leaves $prop undefined and is skipped as well.
	next if !defined $prop || $prop !~ /_QC/;

	# The first field is either a single codepoint or a "first..last"
	# range, both in hexadecimal.
	my ($first, $last);
	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	# Expand ranges so that every codepoint gets its own entry.
	foreach my $cp (hex($first) .. hex($last))
	{
		$data{$prop}{$cp} = $value;
	}
}
# Map the quick check values that may appear in the input to the names
# of the C constants written to the generated tables.
my %qc_symbol_for = (
	N => 'UNICODE_NORM_QC_NO',
	M => 'UNICODE_NORM_QC_MAYBE');

# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.
#
# For every property that is kept, three things are printed: the sorted
# table of (codepoint, quick check value) entries, a perfect hash
# function over the codepoints, and a struct tying the two together.
foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big. See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	print "\n";
	print
	  "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	# Work through the reference; there is no need to copy the hash.
	my $subdata = $data{$prop};
	my @cp_packed;

	foreach my $cp (sort { $a <=> $b } keys %$subdata)
	{
		my $value = $subdata->{$cp};
		$value = '' unless defined $value;

		# Anything other than the known values means the input is not
		# what this script expects; stop and say what was found.
		my $qc = $qc_symbol_for{$value};
		die sprintf("unexpected %s value \"%s\" for codepoint U+%04X",
			$prop, $value, $cp)
		  unless defined $qc;

		printf "\t{0x%04X, %s},\n", $cp, $qc;

		# Save the bytes as a string in network order.
		push @cp_packed, pack('N', $cp);
	}
	print "};\n";

	# Emit the definition of the perfect hash function.
	my $funcname = $prop . '_hash_func';
	my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
		fixed_key_length => 4);
	printf "\n/* Perfect hash function for %s */", $prop;
	print "\nstatic $f\n";

	# Emit the structure that wraps the hash lookup information into
	# one variable.
	printf "/* Hash lookup information for %s */", $prop;
	printf "\nstatic const pg_unicode_norminfo ";
	printf "UnicodeNormInfo_%s = {\n", $prop;
	printf "\tUnicodeNormProps_%s,\n", $prop;
	printf "\t%s,\n", $funcname;
	printf "\t%d\n", scalar @cp_packed;
	printf "};\n";
}