1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-19 23:22:23 +03:00

Add SQL functions for Unicode normalization

This adds SQL expressions NORMALIZE() and IS NORMALIZED to convert and
check Unicode normal forms, per SQL standard.

To support fast IS NORMALIZED tests, we pull in a new data file
DerivedNormalizationProps.txt from Unicode and build a lookup table
from that, using techniques similar to ones already used for other
Unicode data.  make update-unicode will keep it up to date.  We only
build and use these tables for the NFC and NFKC forms, because they
are too big for NFD and NFKD and the improvement is not significant
enough there.

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
This commit is contained in:
Peter Eisentraut
2020-03-26 08:14:00 +01:00
parent 070c3d3937
commit 2991ac5fc9
20 changed files with 6764 additions and 7 deletions

View File

@@ -3,5 +3,6 @@
# Downloaded files
/CompositionExclusions.txt
/DerivedNormalizationProps.txt
/NormalizationTest.txt
/UnicodeData.txt

View File

@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
# By default, do nothing.
all:
update-unicode: unicode_norm_table.h unicode_combining_table.h
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h
$(MAKE) normalization-check
mv unicode_norm_table.h unicode_combining_table.h ../../../src/include/common/
mv $^ ../../../src/include/common/
# These files are part of the Unicode Character Database. Download
# them on demand. The dependency on Makefile.global is for
# UNICODE_VERSION.
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
# Generation of conversion tables used for string normalization with
@@ -36,6 +36,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
$(PERL) $^ >$@
unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
$(PERL) $^ >$@
# Test suite
normalization-check: norm_test
./norm_test

View File

@@ -0,0 +1,86 @@
#!/usr/bin/perl
#
# Generate table of Unicode normalization "quick check" properties
# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
# output is on stdout.
#
# Copyright (c) 2020, PostgreSQL Global Development Group

use strict;
use warnings;

my %data;

print "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

print <<EOS;
#include "common/unicode_norm.h"

/*
 * We use a bit field here to save space.
 */
typedef struct
{
	unsigned int codepoint:21;
	signed int quickcheck:4;	/* really UnicodeNormalizationQC */
} pg_unicode_normprops;
EOS

# Read the input line by line (rather than slurping the whole file with
# "foreach (<ARGV>)") and collect the quick-check property value for each
# codepoint covered by a *_QC entry.
while (my $line = <ARGV>)
{
	chomp $line;
	$line =~ s/\s*#.*$//;    # strip trailing comments
	next if $line eq '';
	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
	next if $prop !~ /_QC/;  # only the quick-check properties matter here

	# The codepoint field is either a single value or a "first..last" range.
	my ($first, $last);
	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	foreach my $cp (hex($first) .. hex($last))
	{
		$data{$prop}{$cp} = $value;
	}
}

# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.
foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big.  See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	print "\n";
	print "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	my %subdata = %{ $data{$prop} };
	foreach my $cp (sort { $a <=> $b } keys %subdata)
	{
		my $qc;
		if ($subdata{$cp} eq 'N')
		{
			$qc = 'UNICODE_NORM_QC_NO';
		}
		elsif ($subdata{$cp} eq 'M')
		{
			$qc = 'UNICODE_NORM_QC_MAYBE';
		}
		else
		{
			# "Y" entries are omitted from the data file, so anything
			# other than N or M is unexpected; say which value it was.
			die sprintf "unrecognized quickcheck value '%s' for U+%04X\n",
			  $subdata{$cp}, $cp;
		}
		printf "\t{0x%04X, %s},\n", $cp, $qc;
	}

	print "};\n";
}

View File

@@ -20,6 +20,9 @@
#include "common/unicode_norm.h"
#include "common/unicode_norm_table.h"
#ifndef FRONTEND
#include "common/unicode_normprops_table.h"
#endif
#ifndef FRONTEND
#define ALLOC(size) palloc(size)
@@ -442,3 +445,110 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
return recomp_chars;
}
/*
* Normalization "quick check" algorithm; see
* <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
*/
/* We only need this in the backend. */
#ifndef FRONTEND
/*
 * Return the canonical combining class of the given codepoint, or 0 if
 * the codepoint has no entry in the decomposition table (class 0 is the
 * default per the Unicode Character Database).
 */
static uint8
get_canonical_class(pg_wchar ch)
{
	pg_unicode_decomposition *entry = get_code_entry(ch);

	return entry ? entry->comb_class : 0;
}
/*
 * bsearch() comparator for pg_unicode_normprops entries, ordered by
 * codepoint.
 *
 * Compare explicitly instead of "return (v1 - v2)": the operands are
 * unsigned, so for v1 < v2 the subtraction would wrap around and the
 * conversion of the out-of-range result to int is implementation-defined.
 */
static int
qc_compare(const void *p1, const void *p2)
{
	uint32		v1,
				v2;

	v1 = ((const pg_unicode_normprops *) p1)->codepoint;
	v2 = ((const pg_unicode_normprops *) p2)->codepoint;

	if (v1 < v2)
		return -1;
	if (v1 > v2)
		return 1;
	return 0;
}
/*
 * Look up the normalization quick check character property
 *
 * Binary-searches the per-form table generated from
 * DerivedNormalizationProps.txt.  Codepoints with no entry have the
 * default property value "yes".
 */
static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
{
	pg_unicode_normprops key;
	const pg_unicode_normprops *table = NULL;
	size_t		nentries = 0;
	pg_unicode_normprops *found;

	key.codepoint = ch;

	/* Select the lookup table for the requested form. */
	switch (form)
	{
		case UNICODE_NFC:
			table = UnicodeNormProps_NFC_QC;
			nentries = lengthof(UnicodeNormProps_NFC_QC);
			break;
		case UNICODE_NFKC:
			table = UnicodeNormProps_NFKC_QC;
			nentries = lengthof(UnicodeNormProps_NFKC_QC);
			break;
		default:
			/* only the "C" forms have tables; see caller */
			Assert(false);
			break;
	}

	if (table == NULL)
		return UNICODE_NORM_QC_YES;

	found = bsearch(&key, table, nentries,
					sizeof(pg_unicode_normprops), qc_compare);

	return found ? found->quickcheck : UNICODE_NORM_QC_YES;
}
/*
 * Run the UAX #15 quick check over "input", a zero-terminated codepoint
 * string.  Returns YES/NO when the check is conclusive, MAYBE when the
 * caller must fall back to a full normalization.
 */
UnicodeNormalizationQC
unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
{
	uint8		prevClass = 0;
	UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;

	/*
	 * For the "D" forms, we don't run the quickcheck.  We don't include the
	 * lookup tables for those because they are huge, checking for these
	 * particular forms is less common, and running the slow path is faster
	 * for the "D" forms than the "C" forms because you don't need to
	 * recompose, which is slow.
	 */
	if (form == UNICODE_NFD || form == UNICODE_NFKD)
		return UNICODE_NORM_QC_MAYBE;

	for (const pg_wchar *p = input; *p; p++)
	{
		uint8		curClass = get_canonical_class(*p);
		UnicodeNormalizationQC allowed;

		/* A combining-class ordering violation means "not normalized". */
		if (curClass != 0 && prevClass > curClass)
			return UNICODE_NORM_QC_NO;

		allowed = qc_is_allowed(form, *p);
		if (allowed == UNICODE_NORM_QC_NO)
			return UNICODE_NORM_QC_NO;
		if (allowed == UNICODE_NORM_QC_MAYBE)
			result = UNICODE_NORM_QC_MAYBE;

		prevClass = curClass;
	}

	return result;
}
#endif /* !FRONTEND */