1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-16 17:07:43 +03:00

Update GB18030 encoding from version 2000 to 2022

Mappings for 18 characters have changed, affecting 36 code points. This
is a break in compatibility, but these characters are rarely used.

U+E5E5 (Private Use Area) was previously mapped to \xA3A0. This code
point now maps to \x65356535. Attempting to convert \xA3A0 will now
raise an error.

Separate from the 2022 update, the following mappings were previously
swapped, and subsequently corrected in 2005 and later versions:
 * U+E7C7 (Private Use Area) now maps to \x8135F437
 * U+1E3F (Latin Small Letter M with Acute) now maps to \xA8BC

The 2022 standard mentions the following policy changes, but they
have no effect in our implementation:

66 new ideographs are now required, but these are mapped
algorithmically, so they were already handled by utf8_and_gb18030.c.

Nine CJK compatibility ideographs are no longer required, but
implementations may retain them, as does the source we use from
the Unicode Consortium.

Release notes: Compatibility section

For further details, see:
https://www.unicode.org/L2/L2022/22274-disruptive-changes.pdf
https://ken-lunde.medium.com/the-gb-18030-2022-standard-3d0ebaeb4132

Author: Chao Li <lic@highgo.com>
Author: Zheng Tao <taoz@highgo.com>
Discussion: https://postgr.es/m/966d9fc.169.198741fe60b.Coremail.jiaoshuntian%40highgo.com
This commit is contained in:
John Naylor
2025-09-24 13:26:05 +07:00
parent e41d954da6
commit 5334620eef
7 changed files with 1671 additions and 1592 deletions

View File

@@ -1876,7 +1876,7 @@ ORDER BY c COLLATE ebcdic;
</row>
<row>
<entry><literal>GB18030</literal></entry>
<entry>National Standard</entry>
<entry>National Standard, version 2022</entry>
<entry>Chinese</entry>
<entry>No</entry>
<entry>No</entry>

View File

@@ -54,7 +54,7 @@ $(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml))
$(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT))
$(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT))
$(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT))
$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.ucm))
$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb18030-2022.ucm))
$(eval $(call map_rule,big5,UCS_to_BIG5.pl,CP950.TXT BIG5.TXT CP950.TXT))
$(eval $(call map_rule,euc_jis_2004,UCS_to_EUC_JIS_2004.pl,euc-jis-2004-std.txt))
$(eval $(call map_rule,shift_jis_2004,UCS_to_SHIFT_JIS_2004.pl,sjis-0213-2004-std.txt))
@@ -78,8 +78,8 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
gb-18030-2000.xml windows-949-2000.xml:
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
gb-18030-2000.ucm:
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/$(@F)
gb18030-2022.ucm:
$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu/refs/heads/main/icu4c/source/data/mappings/$(@F)
GB2312.TXT:
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'

View File

@@ -5,8 +5,8 @@
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
#
# Generate UTF-8 <--> GB18030 code conversion tables from
# "gb-18030-2000.ucm", obtained from
# https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm
# "gb18030-2022.ucm", obtained from
# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/mappings/
#
# The lines we care about in the source file look like
# <UXXXX> \xYY[\xYY...] |n
@@ -23,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_GB18030.pl';
# Read the input
my $in_file = "gb-18030-2000.ucm";
my $in_file = "gb18030-2022.ucm";
open(my $in, '<', $in_file) || die("cannot open $in_file");

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -506,6 +506,7 @@ insert into gb18030_inputs values
('\x666f6fcff3', 'valid'),
('\x666f6f8431a530', 'valid, no translation to UTF-8'),
('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'),
('\xa6d9', 'valid, changed from version 2000 to 2022'),
('\x666f6f84309c', 'incomplete char '),
('\x666f6f84309c0a', 'incomplete char, followed by newline '),
('\x666f6f84', 'incomplete char at end'),
@@ -521,12 +522,13 @@ select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb1803
valid | \x666f6fcff3 | \x666f6fcff3 | |
valid, no translation to UTF-8 | \x666f6f8431a530 | \x666f6f8431a530 | |
valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6f84309c38 | |
valid, changed from version 2000 to 2022 | \xa6d9 | \xa6d9 | |
incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
incomplete char at end | \x666f6f84 | \x666f6f | \x84 | invalid byte sequence for encoding "GB18030": 0x84
invalid, NUL byte | \x666f6f84309c3800 | \x666f6f84309c38 | \x00 | invalid byte sequence for encoding "GB18030": 0x00
invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
(9 rows)
(10 rows)
-- Test conversions from GB18030
select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;
@@ -536,12 +538,13 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18
valid | \x666f6fcff3 | \x666f6fe8b1a1 | |
valid, no translation to UTF-8 | \x666f6f8431a530 | \x666f6f | \x8431a530 | character with byte sequence 0x84 0x31 0xa5 0x30 in encoding "GB18030" has no equivalent in encoding "UTF8"
valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6fefa8aa | |
valid, changed from version 2000 to 2022 | \xa6d9 | \xefb890 | |
incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
incomplete char at end | \x666f6f84 | \x666f6f | \x84 | invalid byte sequence for encoding "GB18030": 0x84
invalid, NUL byte | \x666f6f84309c3800 | \x666f6fefa8aa | \x00 | invalid byte sequence for encoding "GB18030": 0x00
invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
(9 rows)
(10 rows)
--
-- ISO-8859-5

View File

@@ -298,6 +298,7 @@ insert into gb18030_inputs values
('\x666f6fcff3', 'valid'),
('\x666f6f8431a530', 'valid, no translation to UTF-8'),
('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'),
('\xa6d9', 'valid, changed from version 2000 to 2022'),
('\x666f6f84309c', 'incomplete char '),
('\x666f6f84309c0a', 'incomplete char, followed by newline '),
('\x666f6f84', 'incomplete char at end'),