1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-18 02:02:55 +03:00

Generate EUC_CN mappings from gb18030-2022.ucm

In the wake of cfa6cd292, EUC_CN was the only encoding that still used
gb-18030-2000.xml to generate its .map files. Since EUC_CN is a subset
of GB18030, its mappings can easily be generated from the same UCM file
that GB18030 uses (gb18030-2022.ucm). This allows deleting the XML file
from our repository.

Author: Chao Li <lic@highgo.com>
Discussion: https://postgr.es/m/CANWCAZaNRXZ-5NuXmsaMA2mKvMZnCGHZqQusLkpE%2B8YX%2Bi5OYg%40mail.gmail.com
This commit is contained in:
John Naylor
2025-10-02 12:36:24 +07:00
parent 684a745f55
commit 48566180ef
3 changed files with 23 additions and 30929 deletions

View File

@@ -50,7 +50,7 @@ $(eval $(call map_rule,gbk,UCS_to_most.pl,CP936.TXT,GBK))
$(eval $(call map_rule,johab,UCS_to_JOHAB.pl,JOHAB.TXT)) $(eval $(call map_rule,johab,UCS_to_JOHAB.pl,JOHAB.TXT))
$(eval $(call map_rule,uhc,UCS_to_UHC.pl,windows-949-2000.xml)) $(eval $(call map_rule,uhc,UCS_to_UHC.pl,windows-949-2000.xml))
$(eval $(call map_rule,euc_jp,UCS_to_EUC_JP.pl,CP932.TXT JIS0212.TXT)) $(eval $(call map_rule,euc_jp,UCS_to_EUC_JP.pl,CP932.TXT JIS0212.TXT))
$(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml)) $(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb18030-2022.ucm))
$(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT)) $(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT))
$(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT)) $(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT))
$(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT)) $(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT))
@@ -75,7 +75,7 @@ BIG5.TXT CNS11643.TXT:
euc-jis-2004-std.txt sjis-0213-2004-std.txt: euc-jis-2004-std.txt sjis-0213-2004-std.txt:
$(DOWNLOAD) http://x0213.org/codetable/$(@F) $(DOWNLOAD) http://x0213.org/codetable/$(@F)
gb-18030-2000.xml windows-949-2000.xml: windows-949-2000.xml:
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F) $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
gb18030-2022.ucm: gb18030-2022.ucm:

View File

@@ -2,16 +2,17 @@
# #
# Copyright (c) 2007-2025, PostgreSQL Global Development Group # Copyright (c) 2007-2025, PostgreSQL Global Development Group
# #
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl # src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
# #
# Generate UTF-8 <--> GB18030 code conversion tables from # Generate UTF-8 <--> EUC_CN code conversion tables from
# "gb-18030-2000.xml", obtained from # "gb18030-2022.ucm", obtained from
# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ # https://github.com/unicode-org/icu/blob/main/icu4c/source/data/mappings/
# #
# The lines we care about in the source file look like # The lines we care about in the source file look like
# <a u="009A" b="81 30 83 36"/> # <UXXXX> \xYY[\xYY...] |n
# where the "u" field is the Unicode code point in hex, # where XXXX is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for GB18030 # and the \xYY... is the hex byte sequence for GB18030,
# and n is a flag indicating the type of mapping.
use strict; use strict;
use warnings FATAL => 'all'; use warnings FATAL => 'all';
@@ -22,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl';
# Read the input # Read the input
my $in_file = "gb-18030-2000.xml"; my $in_file = "gb18030-2022.ucm";
open(my $in, '<', $in_file) || die("cannot open $in_file"); open(my $in, '<', $in_file) || die("cannot open $in_file");
@@ -30,9 +31,18 @@ my @mapping;
while (<$in>) while (<$in>)
{ {
next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/); # Mappings may have been removed by commenting out
my ($u, $c) = ($1, $2); next if /^#/;
$c =~ s/ //g;
next if !/^<U([0-9A-Fa-f]+)>\s+
((?:\\x[0-9A-Fa-f]{2})+)\s+
\|(\d+)/x;
my ($u, $c, $flag) = ($1, $2, $3);
$c =~ s/\\x//g;
# We only want round-trip mappings
next if ($flag ne '0');
my $ucs = hex($u); my $ucs = hex($u);
my $code = hex($c); my $code = hex($c);

File diff suppressed because it is too large Load Diff