1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-16 15:02:33 +03:00

Treat Unicode codepoints of category "Format" as non-spacing

Commit d8594d123 updated the list of non-spacing codepoints used
for calculating display width, but in doing so inadvertently removed
some, since the script used for that commit only considered combining
characters.

For complete coverage of zero-width characters, also include codepoints in
the category Cf (Format). To reflect the wider purpose, also rename files
and update comments that referred specifically to combining characters.

Some of these ranges have been missing since v12, but due to the lack of
field complaints it was determined not important enough to justify adding
special-case logic to the backbranches.

Kyotaro Horiguchi

Report by Pavel Stehule
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRBE8yvpQ0FSkPCoe0Ny1jAAsAQ6j3qMgVwWvkqAoaaNmQ%40mail.gmail.com
This commit is contained in:
John Naylor
2022-09-13 16:13:33 +07:00
parent bb629c294b
commit 0bd9c62973
4 changed files with 34 additions and 23 deletions

View File

@@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)
# By default, do nothing.
all:
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
mv $^ $(top_srcdir)/src/include/common/
$(MAKE) normalization-check
@@ -35,7 +35,7 @@ unicode_norm_hashfunc.h: unicode_norm_table.h
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
$(PERL) $<
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
unicode_nonspacing_table.h: generate-unicode_nonspacing_table.pl UnicodeData.txt
$(PERL) $^ >$@
unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt

View File

@@ -15,9 +15,9 @@ my $prev_codepoint;
my $count = 0;
print
"/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";
"/* generated by src/common/unicode/generate-unicode_nonspacing_table.pl, do not edit */\n\n";
print "static const struct mbinterval combining[] = {\n";
print "static const struct mbinterval nonspacing[] = {\n";
foreach my $line (<ARGV>)
{
@@ -25,9 +25,11 @@ foreach my $line (<ARGV>)
my @fields = split ';', $line;
$codepoint = hex $fields[0];
if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
# Me and Mn refer to combining characters
# Cf refers to format characters
if ($fields[2] eq 'Me' || $fields[2] eq 'Mn' || $fields[2] eq 'Cf')
{
# combining character, save for start of range
# non-spacing character, save for start of range
if (!defined($range_start))
{
$range_start = $codepoint;
@@ -35,7 +37,7 @@ foreach my $line (<ARGV>)
}
else
{
# not a combining character, print out previous range if any
# not a non-spacing character, print out previous range if any
if (defined($range_start))
{
printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;