mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	Add GB18030 support. Contributed by Bill Huang <bill_huanghb@ybb.ne.jp>
(ODBC support has not been committed yet. left for Hiroshi...)
This commit is contained in:
		
							
								
								
									
										63489
									
								
								src/backend/utils/mb/Unicode/ISO10646-GB18030.TXT
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63489
									
								
								src/backend/utils/mb/Unicode/ISO10646-GB18030.TXT
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										95
									
								
								src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										95
									
								
								src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,95 @@
 | 
			
		||||
#! /usr/bin/perl
#
# Copyright 2002 by Bill Huang
#
# $Id: UCS_to_GB18030.pl,v 1.1 2002/06/13 08:28:55 ishii Exp $
#
# Generate UTF-8 <--> GB18030 code conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain ISO10646-GB18030.TXT from
# the organization's ftp site.
#
# ISO10646-GB18030.TXT format:
#		 GB18030 code in hex
#		 UCS-2 code in hex
#		 # and Unicode name (not used in this script)
#
# Output files (written to the current directory):
#		 utf8_to_gb18030.map	static pg_utf_to_local ULmapGB18030[]
#		 gb18030_to_utf8.map	static pg_local_to_utf LUmapGB18030[]
#
# NOTE(review): the parsing below treats the FIRST column as the Unicode
# side and the SECOND column as the GB18030 side, which is the opposite of
# the column order listed above -- confirm against the actual map file.
# NOTE(review): ucs2utf.pl is loaded but nothing in this script calls it,
# so the "UTF" values written to the tables are the raw hex values read
# from the file -- confirm whether a UTF-8 encoding step was intended.

require "ucs2utf.pl";

$in_file = "ISO10646-GB18030.TXT";

#
# Read the map file once and fill both lookup directions.  When a key
# appears more than once the last line wins.
#
%utf_to_code = ();
%code_to_utf = ();

open( FILE, $in_file ) || die( "cannot open $in_file" );

while( <FILE> ){
	# chomp, not chop: only a line terminator is stripped, so a final
	# line without a newline does not lose its last character
	chomp;
	if( /^#/ ){
		next;
	}
	( $u, $c, $rest ) = split;
	# skip blank or incomplete lines instead of recording a bogus 0 entry
	if( !defined( $u ) || !defined( $c ) ){
		next;
	}
	$utf = hex($u);
	$code = hex($c);
	$utf_to_code{ $utf } = $code;
	$code_to_utf{ $code } = $utf;
}
close( FILE );

#
# Write one C table.
#	$file	name of the file to create
#	$decl	C element type and array name, e.g. "pg_utf_to_local ULmapGB18030"
#	$map	reference to a hash of numeric key => numeric value
#
# The array size is the number of distinct keys, so it always matches the
# number of initializers, and only the last entry is written without a
# trailing comma.
#
sub write_map {
	my( $file, $decl, $map ) = @_;
	my @keys = sort {$a <=> $b} keys( %$map );
	my $remaining = scalar( @keys );
	my $index;

	open( FILE, "> $file" ) || die( "cannot open $file" );
	print FILE "static " . $decl . "[ " . $remaining . " ] = {\n";

	for $index ( @keys ){
		$remaining--;
		if( $remaining == 0 ){
			printf FILE "  {0x%04x, 0x%04x}\n", $index, $map->{ $index };
		} else {
			printf FILE "  {0x%04x, 0x%04x},\n", $index, $map->{ $index };
		}
	}

	print FILE "};\n";
	close( FILE ) || die( "cannot write $file" );
}

#
# first, generate UTF8 --> GB18030 table
#
write_map( "utf8_to_gb18030.map", "pg_utf_to_local ULmapGB18030", \%utf_to_code );

#
# then generate GB18030 --> UTF8 table
#
write_map( "gb18030_to_utf8.map", "pg_local_to_utf LUmapGB18030", \%code_to_utf );
 | 
			
		||||
							
								
								
									
										63490
									
								
								src/backend/utils/mb/Unicode/gb18030_to_utf8.map
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63490
									
								
								src/backend/utils/mb/Unicode/gb18030_to_utf8.map
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										63490
									
								
								src/backend/utils/mb/Unicode/utf8_to_gb18030.map
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63490
									
								
								src/backend/utils/mb/Unicode/utf8_to_gb18030.map
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -6,7 +6,7 @@
 | 
			
		||||
 * WIN1250 client encoding support contributed by Pavel Behal
 | 
			
		||||
 * SJIS UDC (NEC selection IBM kanji) support contributed by Eiji Tokuya
 | 
			
		||||
 *
 | 
			
		||||
 * $Id: conv.c,v 1.37 2002/03/06 06:10:26 momjian Exp $
 | 
			
		||||
 * $Id: conv.c,v 1.38 2002/06/13 08:28:54 ishii Exp $
 | 
			
		||||
 *
 | 
			
		||||
 *
 | 
			
		||||
 */
 | 
			
		||||
@@ -48,6 +48,8 @@
 | 
			
		||||
#include "Unicode/euc_jp_to_utf8.map"
 | 
			
		||||
#include "Unicode/utf8_to_euc_cn.map"
 | 
			
		||||
#include "Unicode/euc_cn_to_utf8.map"
 | 
			
		||||
#include "Unicode/utf8_to_gb18030.map"
 | 
			
		||||
#include "Unicode/gb18030_to_utf8.map"
 | 
			
		||||
#include "Unicode/utf8_to_euc_kr.map"
 | 
			
		||||
#include "Unicode/euc_kr_to_utf8.map"
 | 
			
		||||
#include "Unicode/utf8_to_euc_tw.map"
 | 
			
		||||
@@ -515,6 +517,96 @@ mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
 | 
			
		||||
	*p = '\0';
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 * GB18030 ---> MIC
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Copies up to len bytes of the NUL-terminated string gb18030 to p and
 * NUL-terminates the result.  Character bytes are copied unchanged:
 *
 *	 byte 1 < 0x80								1-byte (ASCII)
 *	 byte 1 0x81-0xfe, byte 2 0x30-0x39			4-byte character
 *	 byte 1 0x81-0xfe, byte 2 0x40-0x7e/0x80-0xfe	2-byte character
 *
 * A lead byte followed by any other second byte is dropped together with
 * that second byte; a stray 0x80 or 0xff byte is dropped on its own.
 * Conversion stops early at a character that is cut short by len or by
 * the terminating NUL, so the input is never read past its end.
 *
 * p must have room for len + 1 bytes.
 */
static void
gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
{
	int			c1;
	int			c2;

	while (len > 0 && (c1 = *gb18030) != 0)
	{
		if (c1 < 0x80)
		{						/* should be ASCII */
			*p++ = c1;
			gb18030++;
			len--;
		}
		else if (c1 >= 0x81 && c1 <= 0xfe)
		{
			/* a lead byte needs at least one more byte inside the input */
			if (len < 2 || (c2 = gb18030[1]) == 0)
				break;

			if (c2 >= 0x30 && c2 <= 0x39)
			{
				/* four-byte form: all four bytes must be present */
				if (len < 4 || gb18030[2] == 0 || gb18030[3] == 0)
					break;
				*p++ = c1;
				*p++ = c2;
				*p++ = gb18030[2];
				*p++ = gb18030[3];
				gb18030 += 4;
				len -= 4;
			}
			else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
			{
				/* two-byte form */
				*p++ = c1;
				*p++ = c2;
				gb18030 += 2;
				len -= 2;
			}
			else
			{
				/* throw the strange code: both bytes are discarded */
				gb18030 += 2;
				len -= 2;
			}
		}
		else
		{
			/* 0x80 and 0xff are not valid lead bytes: discard the byte */
			gb18030++;
			len--;
		}
	}
	*p = '\0';
}
 | 
			
		||||
 | 
			
		||||
/*
 * MIC ---> GB18030
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Walks the NUL-terminated string mic while len is positive, reducing
 * len by pg_mic_mblen() of each character start, and writes the result
 * to p followed by a terminating NUL.
 *
 * ASCII bytes and well-formed two- or four-byte sequences are copied
 * unchanged.  Anything else is handed to printBogusChar(), which is
 * given the addresses of both cursors (presumably so it can advance
 * them -- verify against its definition).
 */
static void
mic2gb18030(unsigned char *mic, unsigned char *p, int len)
{
	int			lead;
	int			trail;

	for (;;)
	{
		if (len <= 0)
			break;
		lead = *mic;
		if (lead == 0)
			break;

		/* account for the whole character, then step over its first byte */
		len -= pg_mic_mblen(mic);
		mic++;

		if (lead <= 0x7f)
		{
			/* ASCII */
			*p++ = lead;
			continue;
		}

		if (lead < 0x81 || lead > 0xfe)
		{
			/* not a usable lead byte: back up and report it */
			mic--;
			printBogusChar(&mic, &p);
			continue;
		}

		trail = *mic++;

		if (trail >= 0x30 && trail <= 0x39)
		{
			/* four-byte form: two more bytes follow */
			*p++ = lead;
			*p++ = trail;
			*p++ = *mic++;
			*p++ = *mic++;
		}
		else if ((trail >= 0x40 && trail <= 0x7e) ||
				 (trail >= 0x80 && trail <= 0xfe))
		{
			/* two-byte form */
			*p++ = lead;
			*p++ = trail;
		}
		else
		{
			/* unexpected second byte: back up one byte and report, twice */
			mic--;
			printBogusChar(&mic, &p);
			mic--;
			printBogusChar(&mic, &p);
		}
	}
	*p = '\0';
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * EUC_TW ---> MIC
 | 
			
		||||
 */
 | 
			
		||||
@@ -1596,6 +1688,26 @@ euc_cn_to_utf(unsigned char *euc, unsigned char *utf, int len)
 | 
			
		||||
		  sizeof(LUmapEUC_CN) / sizeof(pg_local_to_utf), PG_EUC_CN, len);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * UTF-8 ---> GB18030
 | 
			
		||||
 */
 | 
			
		||||
static void
 | 
			
		||||
utf_to_gb18030(unsigned char *utf, unsigned char *euc, int len)
 | 
			
		||||
 | 
			
		||||
{
 | 
			
		||||
	utf_to_local(utf, euc, ULmapGB18030,
 | 
			
		||||
				 sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), len);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * GB18030 ---> UTF-8
 | 
			
		||||
 */
 | 
			
		||||
static void
 | 
			
		||||
gb18030_to_utf(unsigned char *euc, unsigned char *utf, int len)
 | 
			
		||||
{
 | 
			
		||||
	local_to_utf(euc, utf, LUmapGB18030,
 | 
			
		||||
		  sizeof(LUmapGB18030) / sizeof(pg_local_to_utf), PG_GB18030, len);
 | 
			
		||||
}
 | 
			
		||||
/*
 | 
			
		||||
 * UTF-8 ---> EUC_KR
 | 
			
		||||
 */
 | 
			
		||||
@@ -1935,6 +2047,9 @@ pg_enconv	pg_enconv_tbl[] =
 | 
			
		||||
	{
 | 
			
		||||
		PG_WIN1250, win12502mic, mic2win1250, win1250_to_utf, utf_to_win1250
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		PG_GB18030, gb180302mic, mic2gb18030, gb18030_to_utf, utf_to_gb18030
 | 
			
		||||
	},
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
@@ -2019,9 +2134,18 @@ pg_enconv	pg_enconv_tbl[] =
 | 
			
		||||
	{
 | 
			
		||||
		PG_BIG5, big52mic, mic2big5, 0, 0
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		PG_GBK, 0, 0, 0, 0
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		PG_UHC, 0, 0, 0, 0
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		PG_WIN1250, win12502mic, mic2win1250, 0, 0
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		PG_GB18030, gb180302mic, mic2gb18030, 0, 0
 | 
			
		||||
	},
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#endif   /* UNICODE_CONVERSION */
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,7 @@
 | 
			
		||||
 * Encoding names and routines for work with it. All
 | 
			
		||||
 * in this file is shared between FE and BE.
 | 
			
		||||
 *
 | 
			
		||||
 * $Id: encnames.c,v 1.7 2002/03/05 05:52:44 momjian Exp $
 | 
			
		||||
 * $Id: encnames.c,v 1.8 2002/06/13 08:28:54 ishii Exp $
 | 
			
		||||
 */
 | 
			
		||||
#ifdef FRONTEND
 | 
			
		||||
#include "postgres_fe.h"
 | 
			
		||||
@@ -60,7 +60,11 @@ pg_encname	pg_encname_tbl[] =
 | 
			
		||||
	{
 | 
			
		||||
		"euctw", PG_EUC_TW
 | 
			
		||||
	},							/* EUC-TW; Extended Unix Code for
 | 
			
		||||
 | 
			
		||||
								 * traditional Chinese */
 | 
			
		||||
	{
 | 
			
		||||
		"gb18030", PG_GB18030
 | 
			
		||||
	},							/* GB18030;GB18030 */
 | 
			
		||||
	{
 | 
			
		||||
		"gbk", PG_GBK
 | 
			
		||||
	},							/* GBK; Chinese Windows CodePage 936
 | 
			
		||||
@@ -239,7 +243,6 @@ pg_encname	pg_encname_tbl[] =
 | 
			
		||||
	{
 | 
			
		||||
		"windows950", PG_BIG5
 | 
			
		||||
	},							/* alias for BIG5 */
 | 
			
		||||
 | 
			
		||||
	{
 | 
			
		||||
		NULL, 0
 | 
			
		||||
	}							/* last */
 | 
			
		||||
@@ -353,6 +356,9 @@ pg_enc2name pg_enc2name_tbl[] =
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		"WIN1250", PG_WIN1250
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		"GB18030", PG_GB18030
 | 
			
		||||
	}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +1,7 @@
 | 
			
		||||
/*
 | 
			
		||||
 * conversion functions between pg_wchar and multi-byte streams.
 | 
			
		||||
 * Tatsuo Ishii
 | 
			
		||||
 * $Id: wchar.c,v 1.27 2002/03/05 05:52:44 momjian Exp $
 | 
			
		||||
 * $Id: wchar.c,v 1.28 2002/06/13 08:28:54 ishii Exp $
 | 
			
		||||
 *
 | 
			
		||||
 * WIN1250 client encoding updated by Pavel Behal
 | 
			
		||||
 *
 | 
			
		||||
@@ -510,6 +510,31 @@ pg_uhc_mblen(const unsigned char *s)
 | 
			
		||||
	return (len);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 * GB18030
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Returns the byte length of the character starting at s:
 *	 1 when the first byte is 0x7f or below (ASCII),
 *	 4 when the second byte is in 0x30-0x39,
 *	 2 otherwise.
 * The second byte is only examined for a non-ASCII first byte.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	unsigned char second;

	if (*s < 0x80)
		return 1;				/* ASCII */

	second = s[1];
	return (second >= 0x30 && second <= 0x39) ? 4 : 2;
}
 | 
			
		||||
 | 
			
		||||
		
 | 
			
		||||
pg_wchar_tbl pg_wchar_table[] = {
 | 
			
		||||
	{pg_ascii2wchar_with_len, pg_ascii_mblen, 1},		/* 0; PG_SQL_ASCII	*/
 | 
			
		||||
	{pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3},		/* 1; PG_EUC_JP */
 | 
			
		||||
@@ -544,6 +569,7 @@ pg_wchar_tbl pg_wchar_table[] = {
 | 
			
		||||
	{0, pg_gbk_mblen, 2},		/* 30; PG_GBK */
 | 
			
		||||
	{0, pg_uhc_mblen, 2},		/* 31; PG_UHC */
 | 
			
		||||
	{pg_latin12wchar_with_len, pg_latin1_mblen, 1},		/* 32; PG_WIN1250 */
 | 
			
		||||
	{0, pg_gb18030_mblen, 2}       /* 33; PG_GB18030 */
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/* returns the byte length of a word for mule internal code */
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
/* $Id: pg_wchar.h,v 1.38 2002/03/05 05:52:50 momjian Exp $ */
 | 
			
		||||
/* $Id: pg_wchar.h,v 1.39 2002/06/13 08:30:22 ishii Exp $ */
 | 
			
		||||
 | 
			
		||||
#ifndef PG_WCHAR_H
 | 
			
		||||
#define PG_WCHAR_H
 | 
			
		||||
@@ -189,7 +189,7 @@ typedef enum pg_enc
 | 
			
		||||
	PG_GBK,					/* GBK (Windows-936) */
 | 
			
		||||
	PG_UHC,					/* UHC (Windows-949) */
 | 
			
		||||
	PG_WIN1250,					/* windows-1250 */
 | 
			
		||||
 | 
			
		||||
	PG_GB18030,					/* GB18030 */
 | 
			
		||||
	_PG_LAST_ENCODING_			/* mark only */
 | 
			
		||||
 | 
			
		||||
} pg_enc;
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user