Update GB18030 encoding from version 2000 to 2022

Mappings for 18 characters have changed, affecting 36 code points. This is a break in compatibility, but these characters are rarely used. U+E5E5 (Private Use Area) was previously mapped to \xA3A0. This code point now maps to \x65356535. Attempting to convert \xA3A0 will now raise an error. Separate from the 2022 update, the following mappings were previously swapped, and subsequently corrected in 2000 and later versions: * U+E7C7 (Private Use Area) now maps to \x8135F437 * U+1E3F (Latin Small Letter M with Acute) now maps to \xA8BC The 2022 standard mentions the following policy changes, but they have no effect in our implementation: 66 new ideographs are now required, but these are mapped algorithmically so were already handled by utf8_and_gb18030.c. Nine CJK compatibility ideographs are no longer required, but implementations may retain them, as does the source we use from the Unicode Consortium. Release notes: Compatibility section For further details, see: https://www.unicode.org/L2/L2022/22274-disruptive-changes.pdf https://ken-lunde.medium.com/the-gb-18030-2022-standard-3d0ebaeb4132 Author: Chao Li <lic@highgo.com> Author: Zheng Tao <taoz@highgo.com> Discussion: https://postgr.es/m/966d9fc.169.198741fe60b.Coremail.jiaoshuntian%40highgo.com
2025-11-18 02:02:55 +03:00 · 2025-09-24 13:26:05 +07:00
parent e41d954da6
commit 5334620eef
7 changed files with 1671 additions and 1592 deletions
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -54,7 +54,7 @@ $(eval $(call map_rule,euc_cn,UCS_to_EUC_CN.pl,gb-18030-2000.xml))
 $(eval $(call map_rule,euc_kr,UCS_to_EUC_KR.pl,KSX1001.TXT))
 $(eval $(call map_rule,euc_tw,UCS_to_EUC_TW.pl,CNS11643.TXT))
 $(eval $(call map_rule,sjis,UCS_to_SJIS.pl,CP932.TXT))
-$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb-18030-2000.ucm))
+$(eval $(call map_rule,gb18030,UCS_to_GB18030.pl,gb18030-2022.ucm))
 $(eval $(call map_rule,big5,UCS_to_BIG5.pl,CP950.TXT BIG5.TXT CP950.TXT))
 $(eval $(call map_rule,euc_jis_2004,UCS_to_EUC_JIS_2004.pl,euc-jis-2004-std.txt))
 $(eval $(call map_rule,shift_jis_2004,UCS_to_SHIFT_JIS_2004.pl,sjis-0213-2004-std.txt))
@@ -78,8 +78,8 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
 gb-18030-2000.xml windows-949-2000.xml:
 	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)

-gb-18030-2000.ucm:
-	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/d9d3a6ed27bb98a7106763e940258f0be8cd995b/charset/data/ucm/$(@F)
+gb18030-2022.ucm:
+	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu/refs/heads/main/icu4c/source/data/mappings/$(@F)

 GB2312.TXT:
 	$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
--- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
@@ -5,8 +5,8 @@
 # src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
 #
 # Generate UTF-8 <--> GB18030 code conversion tables from
-# "gb-18030-2000.ucm", obtained from
-# https://github.com/unicode-org/icu-data/tree/main/charset/data/ucm
+# "gb18030-2022.ucm", obtained from
+# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/mappings/
 #
 # The lines we care about in the source file look like
 #   <UXXXX> \xYY[\xYY...] |n
@@ -23,7 +23,7 @@ my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_GB18030.pl';

 # Read the input

-my $in_file = "gb-18030-2000.ucm";
+my $in_file = "gb18030-2022.ucm";

 open(my $in, '<', $in_file) || die("cannot open $in_file");

--- a/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
--- a/src/backend/utils/mb/Unicode/utf8_to_gb18030.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_gb18030.map