mirror of
https://github.com/postgres/postgres.git
synced 2025-10-16 17:07:43 +03:00
Generate GB18030 mappings from the Unicode Consortium's UCM file
Previously we built the .map files for GB18030 (version 2000) from an XML file. The 2022 version of this encoding is only available as a Unicode Character Mapping (UCM) file, so as preparatory refactoring, switch to this format as the source for building version 2000.

As we do with most input files for the conversion mappings, download the file on demand. In order to generate the same mappings we have now, we must download from a previous upstream commit rather than the head, since the latter contains a correction not present in our current .map files.

The XML file is still used by EUC_CN, so we cannot delete it from our repository. GB18030 is a superset of EUC_CN, so it may be possible to build EUC_CN from the same UCM file, but that is left for future work.

Author: Chao Li <lic@highgo.com>
Discussion: https://postgr.es/m/966d9fc.169.198741fe60b.Coremail.jiaoshuntian%40highgo.com
This commit is contained in:
@@ -54,7 +54,7 @@ $(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml))
|
|||||||
$(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT))
|
$(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT))
|
||||||
$(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT))
|
$(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT))
|
||||||
$(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT))
|
$(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT))
|
||||||
$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.xml))
|
$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.ucm))
|
||||||
$(eval $(call map_rule,big5,UCS_to_BIG5.pl,CP950.TXT BIG5.TXT CP950.TXT))
|
$(eval $(call map_rule,big5,UCS_to_BIG5.pl,CP950.TXT BIG5.TXT CP950.TXT))
|
||||||
$(eval $(call map_rule,euc_jis_2004,UCS_to_EUC_JIS_2004.pl,euc-jis-2004-std.txt))
|
$(eval $(call map_rule,euc_jis_2004,UCS_to_EUC_JIS_2004.pl,euc-jis-2004-std.txt))
|
||||||
$(eval $(call map_rule,shift_jis_2004,UCS_to_SHIFT_JIS_2004.pl,sjis-0213-2004-std.txt))
|
$(eval $(call map_rule,shift_jis_2004,UCS_to_SHIFT_JIS_2004.pl,sjis-0213-2004-std.txt))
|
||||||
@@ -78,6 +78,9 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
|
|||||||
gb-18030-2000.xml windows-949-2000.xml:
|
gb-18030-2000.xml windows-949-2000.xml:
|
||||||
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
|
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
|
||||||
|
|
||||||
|
gb-18030-2000.ucm:
|
||||||
|
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/$(@F)
|
||||||
|
|
||||||
GB2312.TXT:
|
GB2312.TXT:
|
||||||
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
|
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
|
||||||
|
|
||||||
|
@@ -5,13 +5,14 @@
|
|||||||
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
|
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
|
||||||
#
|
#
|
||||||
# Generate UTF-8 <--> GB18030 code conversion tables from
|
# Generate UTF-8 <--> GB18030 code conversion tables from
|
||||||
# "gb-18030-2000.xml", obtained from
|
# "gb-18030-2000.ucm", obtained from
|
||||||
# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
|
# https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm
|
||||||
#
|
#
|
||||||
# The lines we care about in the source file look like
|
# The lines we care about in the source file look like
|
||||||
# <a u="009A" b="81 30 83 36"/>
|
# <UXXXX> \xYY[\xYY...] |n
|
||||||
# where the "u" field is the Unicode code point in hex,
|
# where XXXX is the Unicode code point in hex,
|
||||||
# and the "b" field is the hex byte sequence for GB18030
|
# and the \xYY... is the hex byte sequence for GB18030,
|
||||||
|
# and n is a flag indicating the type of mapping.
|
||||||
|
|
||||||
use strict;
|
use strict;
|
||||||
use warnings FATAL => 'all';
|
use warnings FATAL => 'all';
|
||||||
@@ -22,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_GB18030.pl';
|
|||||||
|
|
||||||
# Read the input
|
# Read the input
|
||||||
|
|
||||||
my $in_file = "gb-18030-2000.xml";
|
my $in_file = "gb-18030-2000.ucm";
|
||||||
|
|
||||||
open(my $in, '<', $in_file) || die("cannot open $in_file");
|
open(my $in, '<', $in_file) || die("cannot open $in_file");
|
||||||
|
|
||||||
@@ -30,9 +31,18 @@ my @mapping;
|
|||||||
|
|
||||||
while (<$in>)
|
while (<$in>)
|
||||||
{
|
{
|
||||||
next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
|
# Mappings may have been removed by commenting out
|
||||||
my ($u, $c) = ($1, $2);
|
next if /^#/;
|
||||||
$c =~ s/ //g;
|
|
||||||
|
next if !/^<U([0-9A-Fa-f]+)>\s+
|
||||||
|
((?:\\x[0-9A-Fa-f]{2})+)\s+
|
||||||
|
\|(\d+)/x;
|
||||||
|
my ($u, $c, $flag) = ($1, $2, $3);
|
||||||
|
$c =~ s/\\x//g;
|
||||||
|
|
||||||
|
# We only want round-trip mappings
|
||||||
|
next if ($flag ne '0');
|
||||||
|
|
||||||
my $ucs = hex($u);
|
my $ucs = hex($u);
|
||||||
my $code = hex($c);
|
my $code = hex($c);
|
||||||
if ($code >= 0x80 && $ucs >= 0x0080)
|
if ($code >= 0x80 && $ucs >= 0x0080)
|
||||||
|
@@ -124,7 +124,12 @@ utf8word_to_unicode(uint32 c)
|
|||||||
/*
|
/*
|
||||||
* Perform mapping of GB18030 ranges to UTF8
|
* Perform mapping of GB18030 ranges to UTF8
|
||||||
*
|
*
|
||||||
* The ranges we need to convert are specified in gb-18030-2000.xml.
|
* General description, and the range we need to convert for U+10000 and up:
|
||||||
|
* https://htmlpreview.github.io/?https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/gb18030.html
|
||||||
|
*
|
||||||
|
* Ranges up to U+FFFF:
|
||||||
|
* https://github.com/unicode-org/icu-data/blob/main/charset/source/gb18030/ranges.txt
|
||||||
|
*
|
||||||
* All are ranges of 4-byte GB18030 codes.
|
* All are ranges of 4-byte GB18030 codes.
|
||||||
*/
|
*/
|
||||||
static uint32
|
static uint32
|
||||||
|
Reference in New Issue
Block a user