1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-16 15:02:33 +03:00

Update display widths as part of updating Unicode

The hardcoded "wide character" set in ucs_wcwidth() was last updated
around the Unicode 5.0 era.  This led to misalignment when printing
emojis and other codepoints that have since been designated
wide or full-width.

To fix and keep up to date, extend update-unicode to download the list
of wide and full-width codepoints from the official sources.

In passing, remove some comments about non-spacing characters that
haven't been accurate since we removed the former hardcoded logic.

Jacob Champion

Reported and reviewed by Pavel Stehule
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRCeX21O69YHxmykYySYyprZAqrKWWg0KoGKdjgqcGyygg@mail.gmail.com
This commit is contained in:
John Naylor
2021-08-26 10:53:56 -04:00
parent 1563ecbc1b
commit bab982161e
5 changed files with 220 additions and 27 deletions

View File

@@ -4,5 +4,6 @@
# Downloaded files
/CompositionExclusions.txt
/DerivedNormalizationProps.txt
/EastAsianWidth.txt
/NormalizationTest.txt
/UnicodeData.txt

View File

@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
# By default, do nothing.
all:
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
mv $^ ../../../src/include/common/
$(MAKE) normalization-check
# These files are part of the Unicode Character Database. Download
# them on demand. The dependency on Makefile.global is for
# UNICODE_VERSION.
UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
# Generation of conversion tables used for string normalization with
@@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
$(PERL) $^ >$@
unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
$(PERL) $^ >$@
unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
$(PERL) $^ >$@
@@ -64,6 +67,6 @@ clean:
rm -f $(OBJS) norm_test norm_test.o
distclean: clean
rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
maintainer-clean: distclean

View File

@@ -0,0 +1,76 @@
#!/usr/bin/perl
#
# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
# and East Asian Fullwidth (F) characters, using Unicode data files as input.
# Pass EastAsianWidth.txt as argument. The output is on stdout.
#
# Copyright (c) 2019-2021, PostgreSQL Global Development Group
use strict;
use warnings;
# State carried from one input line to the next:
#   $range_start   first codepoint of the wide/fullwidth run currently being
#                  accumulated, or undef when no run is open
#   $first, $last  bounds of the codepoint (or codepoint range) on the
#                  current data line
#   $prev_last     upper bound of the previous data line; used to detect
#                  adjacency and as the end of the run when it is emitted
my $range_start = undef;
my ($first, $last);
my $prev_last;

print
  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";

print "static const struct mbinterval east_asian_fw[] = {\n";

# Read the input one line at a time instead of slurping it into a list.
while (my $line = <ARGV>)
{
	chomp $line;

	# Strip a trailing comment together with any trailing whitespace
	# (including a stray carriage return), then any leading whitespace.
	$line =~ s/\s*(?:#.*)?$//;
	$line =~ s/^\s+//;
	next if $line eq '';

	# Data lines look like "XXXX;W" or "XXXX..YYYY;W".  Allow optional
	# whitespace around the field separator so that a cosmetic change in the
	# layout of the data file cannot silently produce an empty table.
	my ($codepoint, $width) = split /\s*;\s*/, $line;

	# Refuse to guess at lines we cannot parse: a wrong table is worse than
	# a failed build.
	die "unrecognized line in $ARGV: \"$line\""
	  unless defined($width)
	  && $codepoint =~ /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?$/;

	# A single codepoint is treated as a one-element range.
	$first = hex($1);
	$last = defined($2) ? hex($2) : $first;

	if ($width eq 'F' || $width eq 'W')
	{
		# fullwidth/wide characters
		if (!defined($range_start))
		{
			# no run is open yet, so this line starts one
			$range_start = $first;
		}
		elsif ($first != $prev_last + 1)
		{
			# there is a gap between the previous line and this one; emit
			# the finished run and start a new one here
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = $first;
		}
	}
	else
	{
		# not a wide character: close and emit the open run, if any
		if (defined($range_start))
		{
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = undef;
		}
	}
}
continue
{
	# Also executed for skipped (blank/comment) lines, where $last still
	# holds the value from the most recent data line.
	$prev_last = $last;
}

# don't forget any ranges at the very end of the database (though there are none
# as of Unicode 13.0)
if (defined($range_start))
{
	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
}

print "};\n";