mirror of
https://github.com/postgres/postgres.git
synced 2025-11-16 15:02:33 +03:00
Additional unicode primitive functions.
Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The latter requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
This commit is contained in:
@@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global
|
||||
override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
|
||||
LIBS += $(PTHREAD_LIBS)
|
||||
|
||||
LDFLAGS_INTERNAL += $(ICU_LIBS)
|
||||
CPPFLAGS += $(ICU_CFLAGS)
|
||||
|
||||
# By default, do nothing.
|
||||
all:
|
||||
|
||||
update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
|
||||
update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h
|
||||
mv $^ $(top_srcdir)/src/include/common/
|
||||
$(MAKE) category-check
|
||||
$(MAKE) normalization-check
|
||||
|
||||
# These files are part of the Unicode Character Database. Download
|
||||
@@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi
|
||||
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
|
||||
|
||||
unicode_version.h: generate-unicode_version.pl
|
||||
$(PERL) $< --version $(UNICODE_VERSION)
|
||||
|
||||
unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
|
||||
$(PERL) $<
|
||||
|
||||
# Generation of conversion tables used for string normalization with
|
||||
# UTF-8 strings.
|
||||
unicode_norm_hashfunc.h: unicode_norm_table.h
|
||||
@@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
|
||||
$(PERL) $^ >$@
|
||||
|
||||
# Test suite
|
||||
category-check: category_test
|
||||
./category_test
|
||||
|
||||
normalization-check: norm_test
|
||||
./norm_test
|
||||
|
||||
category_test: category_test.o ../unicode_category.o | submake-common
|
||||
|
||||
norm_test: norm_test.o ../unicode_norm.o | submake-common
|
||||
|
||||
norm_test.o: norm_test_table.h
|
||||
@@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
|
||||
|
||||
|
||||
clean:
|
||||
rm -f $(OBJS) norm_test norm_test.o
|
||||
rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
|
||||
|
||||
distclean: clean
|
||||
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
|
||||
|
||||
108
src/common/unicode/category_test.c
Normal file
108
src/common/unicode/category_test.c
Normal file
@@ -0,0 +1,108 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
* category_test.c
|
||||
* Program to test Unicode general category functions.
|
||||
*
|
||||
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/common/unicode/category_test.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres_fe.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef USE_ICU
|
||||
#include <unicode/uchar.h>
|
||||
#endif
|
||||
#include "common/unicode_category.h"
|
||||
#include "common/unicode_version.h"
|
||||
|
||||
/*
|
||||
* Parse version into integer for easy comparison.
|
||||
*/
|
||||
#ifdef USE_ICU
|
||||
/*
 * Convert a Unicode version string of the form "major.minor" (any trailing
 * text, such as a ".update" component, is ignored) into the integer
 * major * 100 + minor, so that two versions can be compared numerically.
 *
 * The string is validated at run time rather than with Assert(): assertions
 * may be compiled out, in which case a failed sscanf() would leave major
 * and minor uninitialized and the function would return garbage.  On a
 * malformed string an error is printed to stderr and the program exits
 * with status 1.
 */
static int
parse_unicode_version(const char *version)
{
	int			major = 0;
	int			minor = 0;

	if (version == NULL ||
		sscanf(version, "%d.%d", &major, &minor) != 2 ||
		major < 0 || minor < 0 ||
		minor >= 100)			/* minor must fit in two decimal digits */
	{
		fprintf(stderr, "could not parse Unicode version \"%s\"\n",
				version ? version : "(null)");
		exit(1);
	}

	return major * 100 + minor;
}
|
||||
#endif
|
||||
|
||||
/*
 * Walk every codepoint from 0 through 0x10ffff and compare the general
 * category reported by unicode_category() with the one reported by ICU's
 * u_charType().  The first unexplained difference is printed and the
 * program exits with status 1; otherwise it exits with status 0.
 *
 * When built without USE_ICU there is nothing to compare against, so the
 * test only prints a notice and exits successfully.
 */
int
main(int argc, char **argv)
{
#ifdef USE_ICU
	int			pg_version = parse_unicode_version(PG_UNICODE_VERSION);
	int			icu_version = parse_unicode_version(U_UNICODE_VERSION);
	int			skipped_in_pg = 0;
	int			skipped_in_icu = 0;
	UChar32		cp;

	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);

	for (cp = 0; cp <= 0x10ffff; cp++)
	{
		uint8_t		from_pg = unicode_category(cp);
		uint8_t		from_icu = u_charType(cp);

		/* The common case: both sides agree. */
		if (from_pg == from_icu)
			continue;

		/*
		 * If the two Unicode versions differ, a codepoint assigned in the
		 * newer version may still be unassigned in the older one.  Such a
		 * difference is tolerated and merely counted, at the price that the
		 * test is no longer exhaustive for those codepoints.
		 */
		if (from_pg == PG_U_UNASSIGNED && pg_version < icu_version)
		{
			skipped_in_pg++;
			continue;
		}

		if (from_icu == PG_U_UNASSIGNED && icu_version < pg_version)
		{
			skipped_in_icu++;
			continue;
		}

		/* Any other disagreement is a genuine failure. */
		printf("FAILURE for codepoint %06x\n", cp);
		printf("Postgres category: %02d %s %s\n", from_pg,
			   unicode_category_abbrev(from_pg),
			   unicode_category_string(from_pg));
		printf("ICU category: %02d %s %s\n", from_icu,
			   unicode_category_abbrev(from_icu),
			   unicode_category_string(from_icu));
		printf("\n");
		exit(1);
	}

	if (skipped_in_pg > 0)
		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
			   skipped_in_pg);

	if (skipped_in_icu > 0)
		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
			   skipped_in_icu);

	printf("category_test: All tests successful!\n");
	exit(0);
#else
	printf("ICU support required for test; skipping.\n");
	exit(0);
#endif
}
|
||||
204
src/common/unicode/generate-unicode_category_table.pl
Normal file
204
src/common/unicode/generate-unicode_category_table.pl
Normal file
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/perl
#
# Generate a code point category table and its lookup utilities, using
# Unicode data files as input.
#
# Input: UnicodeData.txt (read from the --outdir directory, default '.')
# Output: unicode_category_table.h (written to the same directory)
#
# Copyright (c) 2000-2023, PostgreSQL Global Development Group

use strict;
use warnings;
use Getopt::Long;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";

my $CATEGORY_UNASSIGNED = 'Cn';

my $output_path = '.';

GetOptions('outdir:s' => \$output_path);

my $output_table_file = "$output_path/unicode_category_table.h";

my $FH;

# Read entries from UnicodeData.txt into a list of codepoint ranges
# and their general category.
my @category_ranges = ();
my $range_start = undef;
my $range_end = undef;
my $range_category = undef;

# If between a "<..., First>" entry and a "<..., Last>" entry, the gap in
# codepoints represents a range, and $gap_category is equal to the
# category for both (which must match). Otherwise, the gap represents
# unassigned code points.
my $gap_category = undef;

open($FH, '<', "$output_path/UnicodeData.txt")
  or die "Could not open $output_path/UnicodeData.txt: $!.";
while (my $line = <$FH>)
{
	# Fields are semicolon-separated: codepoint (hex), name, general
	# category, ...; only the first three are used here.
	my @elts = split(';', $line);
	my $code = hex($elts[0]);
	my $name = $elts[1];
	my $category = $elts[2];

	die "codepoint out of range" if $code > 0x10FFFF;
	die "unassigned codepoint in UnicodeData.txt" if $category eq $CATEGORY_UNASSIGNED;

	if (!defined($range_start)) {
		my $code_str = sprintf "0x%06x", $code;
		die "inconsistent parser state before first entry"
		  if defined($range_end) || defined($range_category) || defined($gap_category);
		die "unexpected first entry <..., Last>" if ($name =~ /Last>/);
		die "expected 0x000000 for first entry, got $code_str" if $code != 0x000000;

		# initialize
		$range_start = $code;
		$range_end = $code;
		$range_category = $category;
		if ($name =~ /<.*, First>$/) {
			$gap_category = $category;
		} else {
			$gap_category = $CATEGORY_UNASSIGNED;
		}
		next;
	}

	# Gap in codepoints detected. If it's a different category than
	# the current range, emit the current range and initialize a new
	# range representing the gap.
	if ($range_end + 1 != $code && $range_category ne $gap_category) {
		push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
		$range_start = $range_end + 1;
		$range_end = $code - 1;
		$range_category = $gap_category;
	}

	# different category; new range
	if ($range_category ne $category) {
		push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
		$range_start = $code;
		$range_end = $code;
		$range_category = $category;
	}

	# Track whether we are inside a <..., First> .. <..., Last> pair, and
	# reject any malformed pairing.
	if ($name =~ /<.*, First>$/) {
		die "<..., First> entry unexpectedly follows another <..., First> entry"
		  if $gap_category ne $CATEGORY_UNASSIGNED;
		$gap_category = $category;
	}
	elsif ($name =~ /<.*, Last>$/) {
		die "<..., First> and <..., Last> entries have mismatching general category"
		  if $gap_category ne $category;
		$gap_category = $CATEGORY_UNASSIGNED;
	}
	else {
		die "unexpected entry found between <..., First> and <..., Last>"
		  if $gap_category ne $CATEGORY_UNASSIGNED;
	}

	# Extend the current range up to and including this codepoint.
	$range_end = $code;
}
close $FH;

die "<..., First> entry with no corresponding <..., Last> entry"
  if $gap_category ne $CATEGORY_UNASSIGNED;

# emit final range
push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});

# emit range for any unassigned code points after last entry
if ($range_end < 0x10FFFF) {
	$range_start = $range_end + 1;
	$range_end = 0x10FFFF;
	$range_category = $CATEGORY_UNASSIGNED;
	push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
}

my $num_ranges = scalar @category_ranges;

# Map from the two-letter General Category abbreviation to the C constant
# written into the generated table.
# See: https://www.unicode.org/reports/tr44/#General_Category_Values
my $categories = {
	Cn => 'PG_U_UNASSIGNED',
	Lu => 'PG_U_UPPERCASE_LETTER',
	Ll => 'PG_U_LOWERCASE_LETTER',
	Lt => 'PG_U_TITLECASE_LETTER',
	Lm => 'PG_U_MODIFIER_LETTER',
	Lo => 'PG_U_OTHER_LETTER',
	Mn => 'PG_U_NONSPACING_MARK',
	Me => 'PG_U_ENCLOSING_MARK',
	Mc => 'PG_U_SPACING_MARK',
	Nd => 'PG_U_DECIMAL_NUMBER',
	Nl => 'PG_U_LETTER_NUMBER',
	No => 'PG_U_OTHER_NUMBER',
	Zs => 'PG_U_SPACE_SEPARATOR',
	Zl => 'PG_U_LINE_SEPARATOR',
	Zp => 'PG_U_PARAGRAPH_SEPARATOR',
	Cc => 'PG_U_CONTROL',
	Cf => 'PG_U_FORMAT',
	Co => 'PG_U_PRIVATE_USE',
	Cs => 'PG_U_SURROGATE',
	Pd => 'PG_U_DASH_PUNCTUATION',
	Ps => 'PG_U_OPEN_PUNCTUATION',
	Pe => 'PG_U_CLOSE_PUNCTUATION',
	Pc => 'PG_U_CONNECTOR_PUNCTUATION',
	Po => 'PG_U_OTHER_PUNCTUATION',
	Sm => 'PG_U_MATH_SYMBOL',
	Sc => 'PG_U_CURRENCY_SYMBOL',
	Sk => 'PG_U_MODIFIER_SYMBOL',
	So => 'PG_U_OTHER_SYMBOL',
	Pi => 'PG_U_INITIAL_PUNCTUATION',
	Pf => 'PG_U_FINAL_PUNCTUATION'
};

# Start writing out the output files
open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";

print $OT <<HEADER;
/*-------------------------------------------------------------------------
*
* unicode_category_table.h
* Category table for Unicode character classification.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/common/unicode_category_table.h
*
*-------------------------------------------------------------------------
*/

#include "common/unicode_category.h"

/*
* File auto-generated by src/common/unicode/generate-unicode_category_table.pl,
* do not edit. There is deliberately not an #ifndef PG_UNICODE_CATEGORY_TABLE_H
* here.
*/
typedef struct
{
uint32 first; /* Unicode codepoint */
uint32 last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;

/* table of Unicode codepoint ranges and their categories */
static const pg_category_range unicode_categories[$num_ranges] =
{
HEADER

# One initializer per range, separated by ",\n" (no trailing comma).
my $firsttime = 1;
foreach my $range (@category_ranges) {
	printf $OT ",\n" unless $firsttime;
	$firsttime = 0;

	my $category = $categories->{$range->{category}};
	die "category missing: $range->{category}" unless $category;
	printf $OT "\t{0x%06x, 0x%06x, %s}", $range->{start}, $range->{end}, $category;
}
print $OT "\n};\n";

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so check it rather than leaving a silently truncated header.
close($OT)
  or die "Could not close output file $output_table_file: $!\n";
|
||||
46
src/common/unicode/generate-unicode_version.pl
Normal file
46
src/common/unicode/generate-unicode_version.pl
Normal file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/perl
#
# Generate header file with Unicode version used by Postgres.
#
# Usage: generate-unicode_version.pl [--outdir DIR] --version MAJOR.MINOR[.UPDATE]
#
# Output: unicode_version.h (written to the --outdir directory, default '.')
#
# Copyright (c) 2000-2023, PostgreSQL Global Development Group

use strict;
use warnings;
use Getopt::Long;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";

my $output_path = '.';
my $version_str = undef;

GetOptions('outdir:s' => \$output_path, 'version:s' => \$version_str);

# Require at least "major.minor"; any further dotted components are
# accepted but ignored.  Without this check a missing or malformed
# --version would only produce warnings and a bogus version in the header.
my ($major, $minor) =
  defined($version_str)
  ? $version_str =~ /^(\d+)\.(\d+)(?:\.\d+)*$/
  : ();
die "usage: $0 [--outdir DIR] --version MAJOR.MINOR[.UPDATE]\n"
  unless defined($major) && defined($minor);

# Only major and minor are recorded in the generated header.
my $unicode_version_str = sprintf "%d.%d", $major, $minor;

my $output_file = "$output_path/unicode_version.h";

# Start writing out the output files
open my $OT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

print $OT <<HEADER;
/*-------------------------------------------------------------------------
*
* unicode_version.h
* Unicode version used by Postgres.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/common/unicode_version.h
*
*-------------------------------------------------------------------------
*/

#define PG_UNICODE_VERSION "$unicode_version_str"
HEADER

# Buffered write errors only surface at close time; fail loudly rather
# than leave a truncated header behind.
close($OT)
  or die "Could not close output file $output_file: $!\n";
|
||||
@@ -24,6 +24,25 @@ endforeach
|
||||
|
||||
update_unicode_targets = []
|
||||
|
||||
update_unicode_targets += \
|
||||
custom_target('unicode_version.h',
|
||||
output: ['unicode_version.h'],
|
||||
command: [
|
||||
perl, files('generate-unicode_version.pl'),
|
||||
'--outdir', '@OUTDIR@', '--version', UNICODE_VERSION],
|
||||
build_by_default: false,
|
||||
)
|
||||
|
||||
update_unicode_targets += \
|
||||
custom_target('unicode_category_table.h',
|
||||
input: [unicode_data['UnicodeData.txt']],
|
||||
output: ['unicode_category_table.h'],
|
||||
command: [
|
||||
perl, files('generate-unicode_category_table.pl'),
|
||||
'--outdir', '@OUTDIR@', '@INPUT@'],
|
||||
build_by_default: false,
|
||||
)
|
||||
|
||||
update_unicode_targets += \
|
||||
custom_target('unicode_norm_table.h',
|
||||
input: [unicode_data['UnicodeData.txt'], unicode_data['CompositionExclusions.txt']],
|
||||
@@ -73,6 +92,17 @@ norm_test_table = custom_target('norm_test_table.h',
|
||||
|
||||
inc = include_directories('.')
|
||||
|
||||
category_test = executable('category_test',
|
||||
['category_test.c'],
|
||||
dependencies: [frontend_port_code, icu],
|
||||
include_directories: inc,
|
||||
link_with: [common_static, pgport_static],
|
||||
build_by_default: false,
|
||||
kwargs: default_bin_args + {
|
||||
'install': false,
|
||||
}
|
||||
)
|
||||
|
||||
norm_test = executable('norm_test',
|
||||
['norm_test.c', norm_test_table],
|
||||
dependencies: [frontend_port_code],
|
||||
@@ -86,6 +116,16 @@ norm_test = executable('norm_test',
|
||||
|
||||
update_unicode_dep = []
|
||||
|
||||
if not meson.is_cross_build()
|
||||
update_unicode_dep += custom_target('category_test.run',
|
||||
output: 'category_test.run',
|
||||
input: update_unicode_targets,
|
||||
command: [category_test, UNICODE_VERSION],
|
||||
build_by_default: false,
|
||||
build_always_stale: true,
|
||||
)
|
||||
endif
|
||||
|
||||
if not meson.is_cross_build()
|
||||
update_unicode_dep += custom_target('norm_test.run',
|
||||
output: 'norm_test.run',
|
||||
|
||||
@@ -81,6 +81,6 @@ main(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
printf("All tests successful!\n");
|
||||
printf("norm_test: All tests successful!\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user