mirror of
https://github.com/postgres/postgres.git
synced 2025-06-13 07:41:39 +03:00
Update display widths as part of updating Unicode
The hardcoded "wide character" set in ucs_wcwidth() was last updated around the Unicode 5.0 era. This led to misalignment when printing emojis and other codepoints that have since been designated wide or full-width. To fix and keep up to date, extend update-unicode to download the list of wide and full-width codepoints from the official sources. In passing, remove some comments about non-spacing characters that haven't been accurate since we removed the former hardcoded logic. Jacob Champion Reported and reviewed by Pavel Stehule Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRCeX21O69YHxmykYySYyprZAqrKWWg0KoGKdjgqcGyygg@mail.gmail.com
This commit is contained in:
1
src/common/unicode/.gitignore
vendored
1
src/common/unicode/.gitignore
vendored
@ -4,5 +4,6 @@
|
||||
# Downloaded files
|
||||
/CompositionExclusions.txt
|
||||
/DerivedNormalizationProps.txt
|
||||
/EastAsianWidth.txt
|
||||
/NormalizationTest.txt
|
||||
/UnicodeData.txt
|
||||
|
@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
|
||||
# By default, do nothing.
|
||||
all:
|
||||
|
||||
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
|
||||
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
|
||||
mv $^ ../../../src/include/common/
|
||||
$(MAKE) normalization-check
|
||||
|
||||
# These files are part of the Unicode Character Database. Download
|
||||
# them on demand. The dependency on Makefile.global is for
|
||||
# UNICODE_VERSION.
|
||||
UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
|
||||
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
|
||||
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
|
||||
|
||||
# Generation of conversion tables used for string normalization with
|
||||
@ -38,6 +38,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
|
||||
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
|
||||
$(PERL) $^ >$@
|
||||
|
||||
unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
|
||||
$(PERL) $^ >$@
|
||||
|
||||
unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
|
||||
$(PERL) $^ >$@
|
||||
|
||||
@ -64,6 +67,6 @@ clean:
|
||||
rm -f $(OBJS) norm_test norm_test.o
|
||||
|
||||
distclean: clean
|
||||
rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
|
||||
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
|
||||
|
||||
maintainer-clean: distclean
|
||||
|
76
src/common/unicode/generate-unicode_east_asian_fw_table.pl
Normal file
76
src/common/unicode/generate-unicode_east_asian_fw_table.pl
Normal file
@ -0,0 +1,76 @@
|
||||
#!/usr/bin/perl
#
# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
# and East Asian Fullwidth (F) characters, using Unicode data files as input.
# Pass EastAsianWidth.txt as argument.  The output is on stdout.
#
# The input is expected to list codepoints in ascending order, one entry per
# line, in the form "XXXX;W" or "XXXX..YYYY;W", optionally followed by a
# '#' comment.  Adjacent wide/fullwidth entries are merged into one interval.
#
# Copyright (c) 2019-2021, PostgreSQL Global Development Group

use strict;
use warnings;

# First codepoint of the wide/fullwidth interval currently being accumulated,
# or undef when no interval is open.
my $range_start = undef;

# Codepoint bounds of the entry on the current input line.
my ($first, $last);

# Upper bound of the most recently parsed entry (wide or not); used both to
# detect gaps between consecutive wide entries and as the end of the interval
# being emitted.
my $prev_last;

print
  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";

print "static const struct mbinterval east_asian_fw[] = {\n";

# Read one line at a time rather than slurping the whole file into a list.
while (my $line = <ARGV>)
{
	chomp $line;

	# Strip trailing comments, then any remaining surrounding whitespace
	# (including a stray carriage return), so that whitespace-only lines
	# are skipped just like empty ones.
	$line =~ s/\s*#.*$//;
	$line =~ s/^\s+|\s+$//g;
	next if $line eq '';

	# Tolerate optional whitespace around the ';' separator: some releases
	# of the data file pad the fields for alignment.
	my ($codepoint, $width) = split /\s*;\s*/, $line;

	# Refuse to guess on malformed input; silently mis-parsing a line
	# would produce a wrong width table.
	die "$ARGV line $.: unrecognized entry \"$line\"\n"
	  unless defined($width)
	  && $codepoint =~ /^[0-9A-Fa-f]+(?:\.\.[0-9A-Fa-f]+)?$/;

	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	($first, $last) = map(hex, ($first, $last));

	if ($width eq 'F' || $width eq 'W')
	{
		# fullwidth/wide characters
		if (!defined($range_start))
		{
			# save for start of range if one hasn't been started yet
			$range_start = $first;
		}
		elsif ($first != $prev_last + 1)
		{
			# ranges aren't contiguous; emit the last and start a new one
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = $first;
		}
	}
	else
	{
		# not wide characters, print out previous range if any
		if (defined($range_start))
		{
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = undef;
		}
	}
}
continue
{
	# Runs after every iteration, including ones ended by "next"; on a
	# skipped line $last still holds the previous entry's upper bound.
	$prev_last = $last;
}

# don't forget any ranges at the very end of the database (though there are none
# as of Unicode 13.0)
if (defined($range_start))
{
	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
}

print "};\n";
|
@ -583,8 +583,8 @@ pg_utf_mblen(const unsigned char *s)
|
||||
|
||||
struct mbinterval
|
||||
{
|
||||
unsigned short first;
|
||||
unsigned short last;
|
||||
unsigned int first;
|
||||
unsigned int last;
|
||||
};
|
||||
|
||||
/* auxiliary function for binary search in interval table */
|
||||
@ -623,12 +623,6 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
|
||||
* category code Mn or Me in the Unicode database) have a
|
||||
* column width of 0.
|
||||
*
|
||||
* - Other format characters (general category code Cf in the Unicode
|
||||
* database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
|
||||
*
|
||||
* - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
|
||||
* have a column width of 0.
|
||||
*
|
||||
* - Spacing characters in the East Asian Wide (W) or East Asian
|
||||
* FullWidth (F) category as defined in Unicode Technical
|
||||
* Report #11 have a column width of 2.
|
||||
@ -645,6 +639,7 @@ static int
|
||||
ucs_wcwidth(pg_wchar ucs)
|
||||
{
|
||||
#include "common/unicode_combining_table.h"
|
||||
#include "common/unicode_east_asian_fw_table.h"
|
||||
|
||||
/* test for 8-bit control characters */
|
||||
if (ucs == 0)
|
||||
@ -653,27 +648,25 @@ ucs_wcwidth(pg_wchar ucs)
|
||||
if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
|
||||
return -1;
|
||||
|
||||
/* binary search in table of non-spacing characters */
|
||||
/*
|
||||
* binary search in table of non-spacing characters
|
||||
*
|
||||
* XXX: In the official Unicode sources, it is possible for a character to
|
||||
* be described as both non-spacing and wide at the same time. As of
|
||||
* Unicode 13.0, treating the non-spacing property as the determining
|
||||
* factor for display width leads to the correct behavior, so do that
|
||||
* search first.
|
||||
*/
|
||||
if (mbbisearch(ucs, combining,
|
||||
sizeof(combining) / sizeof(struct mbinterval) - 1))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* if we arrive here, ucs is not a combining or C0/C1 control character
|
||||
*/
|
||||
/* binary search in table of wide characters */
|
||||
if (mbbisearch(ucs, east_asian_fw,
|
||||
sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
|
||||
return 2;
|
||||
|
||||
return 1 +
|
||||
(ucs >= 0x1100 &&
|
||||
(ucs <= 0x115f || /* Hangul Jamo init. consonants */
|
||||
(ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
|
||||
ucs != 0x303f) || /* CJK ... Yi */
|
||||
(ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
|
||||
(ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
|
||||
* Ideographs */
|
||||
(ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
|
||||
(ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
|
||||
(ucs >= 0xffe0 && ucs <= 0xffe6) ||
|
||||
(ucs >= 0x20000 && ucs <= 0x2ffff)));
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
|
Reference in New Issue
Block a user