mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	Fix localization support for multibyte encoding and C locale.
Slightly reworked patch from Tatsuo Ishii
This commit is contained in:
		| @@ -12,13 +12,13 @@ | |||||||
| size_t | size_t | ||||||
| wchar2char(char *to, const wchar_t *from, size_t len) | wchar2char(char *to, const wchar_t *from, size_t len) | ||||||
| { | { | ||||||
|  | 	if (len == 0) | ||||||
|  | 		return 0; | ||||||
|  |  | ||||||
| 	if (GetDatabaseEncoding() == PG_UTF8) | 	if (GetDatabaseEncoding() == PG_UTF8) | ||||||
| 	{ | 	{ | ||||||
| 		int			r; | 		int			r; | ||||||
|  |  | ||||||
| 		if (len == 0) |  | ||||||
| 			return 0; |  | ||||||
|  |  | ||||||
| 		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, | 		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, | ||||||
| 								NULL, NULL); | 								NULL, NULL); | ||||||
|  |  | ||||||
| @@ -34,17 +34,19 @@ wchar2char(char *to, const wchar_t *from, size_t len) | |||||||
|  |  | ||||||
| 	return wcstombs(to, from, len); | 	return wcstombs(to, from, len); | ||||||
| } | } | ||||||
|  | #endif   /* WIN32 */ | ||||||
|  |  | ||||||
| size_t | size_t | ||||||
| char2wchar(wchar_t *to, const char *from, size_t len) | char2wchar(wchar_t *to, const char *from, size_t len) | ||||||
| { | { | ||||||
|  | 	if (len == 0) | ||||||
|  | 		return 0; | ||||||
|  |  | ||||||
|  | #ifdef WIN32 | ||||||
| 	if (GetDatabaseEncoding() == PG_UTF8) | 	if (GetDatabaseEncoding() == PG_UTF8) | ||||||
| 	{ | 	{ | ||||||
| 		int			r; | 		int			r; | ||||||
|  |  | ||||||
| 		if (len == 0) |  | ||||||
| 			return 0; |  | ||||||
|  |  | ||||||
| 		r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); | 		r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); | ||||||
|  |  | ||||||
| 		if (!r) | 		if (!r) | ||||||
| @@ -60,29 +62,44 @@ char2wchar(wchar_t *to, const char *from, size_t len) | |||||||
|  |  | ||||||
| 		return r; | 		return r; | ||||||
| 	} | 	} | ||||||
|  | 	else  | ||||||
|  | #endif /* WIN32 */ | ||||||
|  | 	if ( lc_ctype_is_c() ) | ||||||
|  | 	{ | ||||||
|  | 		/* | ||||||
|  | 		 * pg_mb2wchar_with_len always adds trailing '\0', so  | ||||||
|  | 		 * 'to' should be allocated with sufficient space  | ||||||
|  | 		 */ | ||||||
|  | 		return pg_mb2wchar_with_len(from, (pg_wchar *)to, len); | ||||||
|  | 	} | ||||||
|  |  | ||||||
| 	return mbstowcs(to, from, len); | 	return mbstowcs(to, from, len); | ||||||
| } | } | ||||||
| #endif   /* WIN32 */ |  | ||||||
|  |  | ||||||
| int | int | ||||||
| _t_isalpha(const char *ptr) | _t_isalpha(const char *ptr) | ||||||
| { | { | ||||||
| 	wchar_t		character; | 	wchar_t		character[2]; | ||||||
|  |  | ||||||
| 	char2wchar(&character, ptr, 1); | 	if (lc_ctype_is_c()) | ||||||
|  | 		return isalpha(TOUCHAR(ptr)); | ||||||
|  |  | ||||||
| 	return iswalpha((wint_t) character); | 	char2wchar(character, ptr, 1); | ||||||
|  |  | ||||||
|  | 	return iswalpha((wint_t) *character); | ||||||
| } | } | ||||||
|  |  | ||||||
| int | int | ||||||
| _t_isprint(const char *ptr) | _t_isprint(const char *ptr) | ||||||
| { | { | ||||||
| 	wchar_t		character; | 	wchar_t		character[2]; | ||||||
|  |  | ||||||
| 	char2wchar(&character, ptr, 1); | 	if (lc_ctype_is_c()) | ||||||
|  | 		return isprint(TOUCHAR(ptr)); | ||||||
|  |  | ||||||
| 	return iswprint((wint_t) character); | 	char2wchar(character, ptr, 1); | ||||||
|  |  | ||||||
|  | 	return iswprint((wint_t) *character); | ||||||
| } | } | ||||||
| #endif   /* TS_USE_WIDE */ | #endif   /* TS_USE_WIDE */ | ||||||
|  |  | ||||||
| @@ -126,7 +143,7 @@ lowerstr(char *str) | |||||||
| 		if ( wlen < 0 ) | 		if ( wlen < 0 ) | ||||||
| 			ereport(ERROR, | 			ereport(ERROR, | ||||||
| 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), | 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), | ||||||
| 					 errmsg("transalation failed from server encoding to wchar_t"))); | 					 errmsg("translation failed from server encoding to wchar_t"))); | ||||||
|  |  | ||||||
| 		Assert(wlen<=len); | 		Assert(wlen<=len); | ||||||
| 		wstr[wlen] = 0; | 		wstr[wlen] = 0; | ||||||
| @@ -152,7 +169,7 @@ lowerstr(char *str) | |||||||
| 		if ( wlen < 0 ) | 		if ( wlen < 0 ) | ||||||
| 			ereport(ERROR, | 			ereport(ERROR, | ||||||
| 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), | 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), | ||||||
| 					 errmsg("transalation failed from wchar_t to server encoding %d", errno))); | 					 errmsg("translation failed from wchar_t to server encoding %d", errno))); | ||||||
| 		Assert(wlen<=len); | 		Assert(wlen<=len); | ||||||
| 		out[wlen]='\0'; | 		out[wlen]='\0'; | ||||||
| 	} | 	} | ||||||
|   | |||||||
| @@ -30,16 +30,17 @@ | |||||||
| #define TOUCHAR(x)	(*((unsigned char*)(x))) | #define TOUCHAR(x)	(*((unsigned char*)(x))) | ||||||
|  |  | ||||||
| #ifdef TS_USE_WIDE | #ifdef TS_USE_WIDE | ||||||
|  | size_t		char2wchar(wchar_t *to, const char *from, size_t len); | ||||||
|  |  | ||||||
| #ifdef WIN32 | #ifdef WIN32 | ||||||
|  |  | ||||||
| size_t		wchar2char(char *to, const wchar_t *from, size_t len); | size_t		wchar2char(char *to, const wchar_t *from, size_t len); | ||||||
| size_t		char2wchar(wchar_t *to, const char *from, size_t len); |  | ||||||
| #else							/* WIN32 */ | #else							/* WIN32 */ | ||||||
|  |  | ||||||
| /* correct mbstowcs */ | /* correct wcstombs */ | ||||||
| #define char2wchar mbstowcs |  | ||||||
| #define wchar2char wcstombs | #define wchar2char wcstombs | ||||||
|  |  | ||||||
| #endif   /* WIN32 */ | #endif   /* WIN32 */ | ||||||
|  |  | ||||||
| #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) | #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) | ||||||
| @@ -55,10 +56,10 @@ extern int	_t_isprint(const char *ptr); | |||||||
|  */ |  */ | ||||||
| #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) | #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) | ||||||
|  |  | ||||||
| #define COPYCHAR(d,s)	do {				\ | #define COPYCHAR(d,s)	do {					\ | ||||||
| 	int lll = pg_mblen( s );			\ | 	int lll = pg_mblen( s );					\ | ||||||
| 							\ | 												\ | ||||||
| 	while( lll-- )					\ | 	while( lll-- )								\ | ||||||
| 		TOUCHAR((d)+lll) = TOUCHAR((s)+lll);	\ | 		TOUCHAR((d)+lll) = TOUCHAR((s)+lll);	\ | ||||||
| } while(0) | } while(0) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| /* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11 2006/10/04 00:29:47 momjian Exp $ */ | /* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.12 2007/01/15 15:16:28 teodor Exp $ */ | ||||||
|  |  | ||||||
| #include "postgres.h" | #include "postgres.h" | ||||||
|  |  | ||||||
| @@ -40,16 +40,13 @@ TParserInit(char *str, int len) | |||||||
| #ifdef TS_USE_WIDE | #ifdef TS_USE_WIDE | ||||||
|  |  | ||||||
| 	/* | 	/* | ||||||
| 	 * Use wide char code only when max encoding length > 1 and ctype != C. | 	 * Use wide char code only when max encoding length > 1. | ||||||
| 	 * Some operating systems fail with multi-byte encodings and a C locale. |  | ||||||
| 	 * Also, for a C locale there is no need to process as multibyte. From |  | ||||||
| 	 * backend/utils/adt/oracle_compat.c Teodor |  | ||||||
| 	 */ | 	 */ | ||||||
|  |  | ||||||
| 	if (prs->charmaxlen > 1 && !lc_ctype_is_c()) | 	if (prs->charmaxlen > 1) | ||||||
| 	{ | 	{ | ||||||
| 		prs->usewide = true; | 		prs->usewide = true; | ||||||
| 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); | 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1)); | ||||||
| 		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr); | 		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr); | ||||||
| 	} | 	} | ||||||
| 	else | 	else | ||||||
| @@ -83,25 +80,99 @@ TParserClose(TParser * prs) | |||||||
|  |  | ||||||
| /* | /* | ||||||
|  * defining support function, equvalent is* macroses, but |  * defining support function, equvalent is* macroses, but | ||||||
|  * working with any possible encodings and locales |  * working with any possible encodings and locales. Note, | ||||||
|  |  * that with multibyte encoding and C-locale isw* function may fail | ||||||
|  |  * or give wrong result. Note 2: multibyte encoding and C-locale  | ||||||
|  |  * often are used for Asian languages. | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| #ifdef TS_USE_WIDE | #ifdef TS_USE_WIDE | ||||||
|  |  | ||||||
| #define p_iswhat(type)										\ | #define p_iswhat(type)														\ | ||||||
| static int											\ | static int																	\ | ||||||
| p_is##type(TParser *prs) {									\ | p_is##type(TParser *prs) {													\ | ||||||
| 	Assert( prs->state );									\ | 	Assert( prs->state );													\ | ||||||
| 	return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ | 	if ( prs->usewide )														\ | ||||||
| 		is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\ | 	{																		\ | ||||||
| }	\ | 		if ( lc_ctype_is_c() )												\ | ||||||
| 												\ | 			return is##type( 0xff & *( prs->wstr + prs->state->poschar) );	\ | ||||||
| static int											\ | 																			\ | ||||||
| p_isnot##type(TParser *prs) {									\ | 		return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );	\ | ||||||
| 	return !p_is##type(prs);								\ | 	}																		\ | ||||||
|  | 																			\ | ||||||
|  | 	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) );	\ | ||||||
|  | }																			\ | ||||||
|  | 																			\ | ||||||
|  | static int																	\ | ||||||
|  | p_isnot##type(TParser *prs) {												\ | ||||||
|  | 	return !p_is##type(prs);												\ | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static int  | ||||||
|  | p_isalnum(TParser *prs) | ||||||
|  | { | ||||||
|  | 	Assert( prs->state ); | ||||||
|  |  | ||||||
|  | 	if (prs->usewide) | ||||||
|  | 	{ | ||||||
|  | 		if (lc_ctype_is_c()) | ||||||
|  | 		{ | ||||||
|  | 			unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar); | ||||||
|  |  | ||||||
|  | 			/* | ||||||
|  | 			 * any non-ascii symbol with multibyte encoding | ||||||
|  | 			 * with C-locale is an alpha character | ||||||
|  | 			 */ | ||||||
|  | 			if ( c > 0x7f ) | ||||||
|  | 				return 1; | ||||||
|  |  | ||||||
|  | 			return isalnum(0xff & c); | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar)); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte )); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static int | ||||||
|  | p_isnotalnum(TParser *prs) | ||||||
|  | { | ||||||
|  | 	return !p_isalnum(prs); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static int  | ||||||
|  | p_isalpha(TParser *prs) | ||||||
|  | { | ||||||
|  | 	Assert( prs->state ); | ||||||
|  |  | ||||||
|  | 	if (prs->usewide) | ||||||
|  | 	{ | ||||||
|  | 		if (lc_ctype_is_c()) | ||||||
|  | 		{ | ||||||
|  | 			unsigned int c = *(prs->wstr + prs->state->poschar); | ||||||
|  |  | ||||||
|  | 			/* | ||||||
|  | 			 * any non-ascii symbol with multibyte encoding | ||||||
|  | 			 * with C-locale is an alpha character | ||||||
|  | 			 */ | ||||||
|  | 			if ( c > 0x7f ) | ||||||
|  | 				return 1; | ||||||
|  |  | ||||||
|  | 			return isalpha(0xff & c); | ||||||
|  | 		} | ||||||
|  |  | ||||||
|  | 		return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar)); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte )); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static int | ||||||
|  | p_isnotalpha(TParser *prs) | ||||||
|  | { | ||||||
|  | 	return !p_isalpha(prs); | ||||||
|  | } | ||||||
|  |  | ||||||
| /* p_iseq should be used only for ascii symbols */ | /* p_iseq should be used only for ascii symbols */ | ||||||
|  |  | ||||||
| @@ -111,18 +182,19 @@ p_iseq(TParser * prs, char c) | |||||||
| 	Assert(prs->state); | 	Assert(prs->state); | ||||||
| 	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; | 	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; | ||||||
| } | } | ||||||
|  |  | ||||||
| #else							/* TS_USE_WIDE */ | #else							/* TS_USE_WIDE */ | ||||||
|  |  | ||||||
| #define p_iswhat(type)										\ | #define p_iswhat(type)														\ | ||||||
| static int											\ | static int																	\ | ||||||
| p_is##type(TParser *prs) {									\ | p_is##type(TParser *prs) {													\ | ||||||
| 	Assert( prs->state );									\ | 	Assert( prs->state );													\ | ||||||
| 	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );			\ | 	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );	\ | ||||||
| }	\ | }																			\ | ||||||
| 												\ | 																			\ | ||||||
| static int											\ | static int																	\ | ||||||
| p_isnot##type(TParser *prs) {									\ | p_isnot##type(TParser *prs) {												\ | ||||||
| 	return !p_is##type(prs);								\ | 	return !p_is##type(prs);												\ | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -132,10 +204,12 @@ p_iseq(TParser * prs, char c) | |||||||
| 	Assert(prs->state); | 	Assert(prs->state); | ||||||
| 	return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; | 	return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; | ||||||
| } | } | ||||||
| #endif   /* TS_USE_WIDE */ |  | ||||||
|  |  | ||||||
| p_iswhat(alnum) | p_iswhat(alnum) | ||||||
| p_iswhat(alpha) | p_iswhat(alpha) | ||||||
|  |  | ||||||
|  | #endif   /* TS_USE_WIDE */ | ||||||
|  |  | ||||||
| p_iswhat(digit) | p_iswhat(digit) | ||||||
| p_iswhat(lower) | p_iswhat(lower) | ||||||
| p_iswhat(print) | p_iswhat(print) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user