1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-21 10:42:50 +03:00

Add support for automatically updating Unicode derived files

We currently have several sets of files generated from data provided
by Unicode.  These all have ad hoc rules and instructions for updating
when new Unicode versions appear, and it's not done consistently.

This patch centralizes and automates the process and makes it part of
the release checklist.  The Unicode and CLDR versions are specified in
Makefile.global.in.  There is a new make target "update-unicode" that
downloads all the relevant files and runs the generation script.

There is also a new script for generating the table of combining
characters for ucs_wcwidth().  That table is now in a separate include
file rather than hardcoded into the middle of other code.  This is
based on the script that was used to generate that table in the
first place (commit d8594d123c), but the script itself wasn't
committed at that time.

Reviewed-by: John Naylor <john.naylor@2ndquadrant.com>
Discussion: https://www.postgresql.org/message-id/flat/c8d05f42-443e-6c23-819b-05b31759a37c@2ndquadrant.com
This commit is contained in:
Peter Eisentraut
2020-01-09 09:54:47 +01:00
parent f5fd995a1a
commit f85a485f89
13 changed files with 313 additions and 94 deletions

View File

@@ -1,7 +1,7 @@
/norm_test
/norm_test_table.h
# Files downloaded from the Unicode Character Database
# Downloaded files
/CompositionExclusions.txt
/NormalizationTest.txt
/UnicodeData.txt

View File

@@ -18,18 +18,24 @@ LIBS += $(PTHREAD_LIBS)
# NOTE(review): this span is a unified-diff hunk whose +/- markers were
# stripped; lines from the old and the new version of the Makefile appear
# interleaved below (e.g. two rule headers for the UnicodeData.txt target
# group and two download URLs). Do not read it as a single coherent file.
# By default, do nothing.
all:
# Download macro: fetch the URL given on the command line into the current
# target ($@), without the server timestamp so mtime reflects download time.
DOWNLOAD = wget -O $@ --no-use-server-timestamps
# Regenerate the Unicode-derived headers, run the normalization test
# suite, and install the headers into src/include/common/.
update-unicode: unicode_norm_table.h unicode_combining_table.h
$(MAKE) normalization-check
mv unicode_norm_table.h unicode_combining_table.h ../../../src/include/common/
# These files are part of the Unicode Character Database. Download
# them on demand.
# NOTE(review): old version of the rule (fixed UNIDATA URL) — removed by
# this commit in favor of the version-parameterized rule below.
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
$(DOWNLOAD) https://www.unicode.org/Public/UNIDATA/$(@F)
# them on demand. The dependency on Makefile.global is for
# UNICODE_VERSION.
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
# Generation of conversion tables used for string normalization with
# UTF-8 strings.
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
$(PERL) generate-unicode_norm_table.pl
# $^ = script plus its data file; output goes to the target header.
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
$(PERL) $^ >$@
# Test suite
normalization-check: norm_test
./norm_test

View File

@@ -8,20 +8,11 @@ of Unicode.
Generating unicode_norm_table.h
-------------------------------
1. Download the Unicode data file, UnicodeData.txt, from the Unicode
consortium and place it in the current directory. Run the perl script
"generate-unicode_norm_table.pl" to process it and generate the
"unicode_norm_table.h" file. The Makefile contains a rule to download the
data files if they don't exist.
make unicode_norm_table.h
2. Inspect the resulting header file. Once you're happy with it, copy it to
the right location.
cp unicode_norm_table.h ../../../src/include/common/
Run
make update-unicode
from the top level of the source tree and commit the result.
Tests
-----
@@ -33,3 +24,5 @@ normalization code with all the test strings in NormalizationTest.txt.
To download NormalizationTest.txt and run the tests:
make normalization-check
This is also run as part of the update-unicode target.

View File

@@ -0,0 +1,52 @@
#!/usr/bin/perl
#
# Generate sorted list of non-overlapping intervals of non-spacing
# characters, using Unicode data files as input. Pass UnicodeData.txt
# as argument. The output is on stdout.
#
# Copyright (c) 2019, PostgreSQL Global Development Group
use strict;
use warnings;

# Start codepoint of the combining-character range currently being
# accumulated; undef while we are not inside a range.
my $range_start = undef;

# Codepoint parsed from the current input line, and the one from the
# previous line; $prev_codepoint becomes the inclusive end of a range
# when the range is closed. (Fix: the unused "$count" variable from the
# original draft has been removed.)
my $codepoint;
my $prev_codepoint;

print "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";
print "static const struct mbinterval combining[] = {\n";

# Each UnicodeData.txt line is semicolon-separated:
# field 0 = codepoint (hex), field 2 = general category.
foreach my $line (<ARGV>)
{
	chomp $line;
	my @fields = split ';', $line;
	$codepoint = hex $fields[0];

	# The table only covers the BMP; skip higher codepoints. Note that
	# the "continue" block still records a skipped codepoint as
	# $prev_codepoint.
	next if $codepoint > 0xFFFF;

	if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
	{
		# combining character, save for start of range
		if (!defined($range_start))
		{
			$range_start = $codepoint;
		}
	}
	else
	{
		# not a combining character, print out previous range if any
		if (defined($range_start))
		{
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;
			$range_start = undef;
		}
	}
}
continue
{
	$prev_codepoint = $codepoint;
}
print "};\n";