mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	tsearch: use database default collation for parsing.
Previously, tsearch used the database's CTYPE setting, which matches the database default collation only if the locale provider is libc.

Note that tsearch types (tsvector and tsquery) are not collatable types. The locale affects parsing the original text, which is a lossy process, so a COLLATE clause on the already-parsed value would not make sense.

Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/0151ad01239e2cc7b3139644358cf8f7b9622ff7.camel@j-davis.com
This commit is contained in:
		| @@ -20,45 +20,33 @@ | |||||||
| static void tsearch_readline_callback(void *arg); | static void tsearch_readline_callback(void *arg); | ||||||
|  |  | ||||||
|  |  | ||||||
| /* | /* space for a single character plus a trailing NUL */ | ||||||
|  * The reason these functions use a 3-wchar_t output buffer, not 2 as you | #define WC_BUF_LEN  2 | ||||||
|  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be |  | ||||||
|  * getting from char2wchar() is UTF16 not UTF32.  A single input character |  | ||||||
|  * may therefore produce a surrogate pair rather than just one wchar_t; |  | ||||||
|  * we also need room for a trailing null.  When we do get a surrogate pair, |  | ||||||
|  * we pass just the first code to iswdigit() etc, so that these functions will |  | ||||||
|  * always return false for characters outside the Basic Multilingual Plane. |  | ||||||
|  */ |  | ||||||
| #define WC_BUF_LEN  3 |  | ||||||
|  |  | ||||||
| int | int | ||||||
| t_isalpha(const char *ptr) | t_isalpha(const char *ptr) | ||||||
| { | { | ||||||
| 	int			clen = pg_mblen(ptr); | 	pg_wchar	wstr[WC_BUF_LEN]; | ||||||
| 	wchar_t		character[WC_BUF_LEN]; | 	int			wlen pg_attribute_unused(); | ||||||
| 	locale_t	mylocale = 0;	/* TODO */ |  | ||||||
|  |  | ||||||
| 	if (clen == 1 || database_ctype_is_c) | 	wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr)); | ||||||
| 		return isalpha(TOUCHAR(ptr)); | 	Assert(wlen <= 1); | ||||||
|  |  | ||||||
| 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); | 	/* pass single character, or NUL if empty */ | ||||||
|  | 	return pg_iswalpha(wstr[0], pg_database_locale()); | ||||||
| 	return iswalpha((wint_t) character[0]); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| int | int | ||||||
| t_isalnum(const char *ptr) | t_isalnum(const char *ptr) | ||||||
| { | { | ||||||
| 	int			clen = pg_mblen(ptr); | 	pg_wchar	wstr[WC_BUF_LEN]; | ||||||
| 	wchar_t		character[WC_BUF_LEN]; | 	int			wlen pg_attribute_unused(); | ||||||
| 	locale_t	mylocale = 0;	/* TODO */ |  | ||||||
|  |  | ||||||
| 	if (clen == 1 || database_ctype_is_c) | 	wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr)); | ||||||
| 		return isalnum(TOUCHAR(ptr)); | 	Assert(wlen <= 1); | ||||||
|  |  | ||||||
| 	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); | 	/* pass single character, or NUL if empty */ | ||||||
|  | 	return pg_iswalnum(wstr[0], pg_database_locale()); | ||||||
| 	return iswalnum((wint_t) character[0]); |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -243,9 +243,7 @@ typedef struct TParser | |||||||
| 	/* string and position information */ | 	/* string and position information */ | ||||||
| 	char	   *str;			/* multibyte string */ | 	char	   *str;			/* multibyte string */ | ||||||
| 	int			lenstr;			/* length of mbstring */ | 	int			lenstr;			/* length of mbstring */ | ||||||
| 	wchar_t    *wstr;			/* wide character string */ |  | ||||||
| 	pg_wchar   *pgwstr;			/* wide character string for C-locale */ | 	pg_wchar   *pgwstr;			/* wide character string for C-locale */ | ||||||
| 	bool		usewide; |  | ||||||
|  |  | ||||||
| 	/* State of parse */ | 	/* State of parse */ | ||||||
| 	int			charmaxlen; | 	int			charmaxlen; | ||||||
| @@ -293,33 +291,8 @@ TParserInit(char *str, int len) | |||||||
| 	prs->charmaxlen = pg_database_encoding_max_length(); | 	prs->charmaxlen = pg_database_encoding_max_length(); | ||||||
| 	prs->str = str; | 	prs->str = str; | ||||||
| 	prs->lenstr = len; | 	prs->lenstr = len; | ||||||
|  | 	prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1)); | ||||||
| 	/* | 	pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr); | ||||||
| 	 * Use wide char code only when max encoding length > 1. |  | ||||||
| 	 */ |  | ||||||
| 	if (prs->charmaxlen > 1) |  | ||||||
| 	{ |  | ||||||
| 		locale_t	mylocale = 0;	/* TODO */ |  | ||||||
|  |  | ||||||
| 		prs->usewide = true; |  | ||||||
| 		if (database_ctype_is_c) |  | ||||||
| 		{ |  | ||||||
| 			/* |  | ||||||
| 			 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could |  | ||||||
| 			 * be different from sizeof(wchar_t) |  | ||||||
| 			 */ |  | ||||||
| 			prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1)); |  | ||||||
| 			pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr); |  | ||||||
| 		} |  | ||||||
| 		else |  | ||||||
| 		{ |  | ||||||
| 			prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1)); |  | ||||||
| 			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, |  | ||||||
| 					   mylocale); |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 	else |  | ||||||
| 		prs->usewide = false; |  | ||||||
|  |  | ||||||
| 	prs->state = newTParserPosition(NULL); | 	prs->state = newTParserPosition(NULL); | ||||||
| 	prs->state->state = TPS_Base; | 	prs->state->state = TPS_Base; | ||||||
| @@ -350,12 +323,9 @@ TParserCopyInit(const TParser *orig) | |||||||
| 	prs->charmaxlen = orig->charmaxlen; | 	prs->charmaxlen = orig->charmaxlen; | ||||||
| 	prs->str = orig->str + orig->state->posbyte; | 	prs->str = orig->str + orig->state->posbyte; | ||||||
| 	prs->lenstr = orig->lenstr - orig->state->posbyte; | 	prs->lenstr = orig->lenstr - orig->state->posbyte; | ||||||
| 	prs->usewide = orig->usewide; |  | ||||||
|  |  | ||||||
| 	if (orig->pgwstr) | 	if (orig->pgwstr) | ||||||
| 		prs->pgwstr = orig->pgwstr + orig->state->poschar; | 		prs->pgwstr = orig->pgwstr + orig->state->poschar; | ||||||
| 	if (orig->wstr) |  | ||||||
| 		prs->wstr = orig->wstr + orig->state->poschar; |  | ||||||
|  |  | ||||||
| 	prs->state = newTParserPosition(NULL); | 	prs->state = newTParserPosition(NULL); | ||||||
| 	prs->state->state = TPS_Base; | 	prs->state->state = TPS_Base; | ||||||
| @@ -379,8 +349,6 @@ TParserClose(TParser *prs) | |||||||
| 		prs->state = ptr; | 		prs->state = ptr; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if (prs->wstr) |  | ||||||
| 		pfree(prs->wstr); |  | ||||||
| 	if (prs->pgwstr) | 	if (prs->pgwstr) | ||||||
| 		pfree(prs->pgwstr); | 		pfree(prs->pgwstr); | ||||||
|  |  | ||||||
| @@ -412,13 +380,9 @@ TParserCopyClose(TParser *prs) | |||||||
|  |  | ||||||
|  |  | ||||||
| /* | /* | ||||||
|  * Character-type support functions, equivalent to is* macros, but |  * Character-type support functions using the database default locale. If the | ||||||
|  * working with any possible encodings and locales. Notes: |  * locale is C, and the input character is non-ascii, the value to be returned | ||||||
|  *	- with multibyte encoding and C-locale isw* function may fail |  * is determined by the 'nonascii' macro argument. | ||||||
|  *	  or give wrong result. |  | ||||||
|  *	- multibyte encoding and C-locale often are used for |  | ||||||
|  *	  Asian languages. |  | ||||||
|  *	- if locale is C then we use pgwstr instead of wstr. |  | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| #define p_iswhat(type, nonascii)											\ | #define p_iswhat(type, nonascii)											\ | ||||||
| @@ -426,19 +390,13 @@ TParserCopyClose(TParser *prs) | |||||||
| static int																	\ | static int																	\ | ||||||
| p_is##type(TParser *prs)													\ | p_is##type(TParser *prs)													\ | ||||||
| {																			\ | {																			\ | ||||||
|  | 	pg_locale_t locale = pg_database_locale();								\ | ||||||
|  | 	pg_wchar	wc;															\ | ||||||
| 	Assert(prs->state);														\ | 	Assert(prs->state);														\ | ||||||
| 	if (prs->usewide)														\ | 	wc = prs->pgwstr[prs->state->poschar];									\ | ||||||
| 	{																		\ | 	if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f)				\ | ||||||
| 		if (prs->pgwstr)													\ | 		return nonascii;													\ | ||||||
| 		{																	\ | 	return pg_isw##type(wc, pg_database_locale());						\ | ||||||
| 			unsigned int c = *(prs->pgwstr + prs->state->poschar);			\ |  | ||||||
| 			if (c > 0x7f)													\ |  | ||||||
| 				return nonascii;											\ |  | ||||||
| 			return is##type(c);												\ |  | ||||||
| 		}																	\ |  | ||||||
| 		return isw##type(*(prs->wstr + prs->state->poschar));				\ |  | ||||||
| 	}																		\ |  | ||||||
| 	return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));	\ |  | ||||||
| }																			\ | }																			\ | ||||||
| 																			\ | 																			\ | ||||||
| static int																	\ | static int																	\ | ||||||
| @@ -703,7 +661,7 @@ p_isspecial(TParser *prs) | |||||||
| 	 * Check that only in utf encoding, because other encodings aren't | 	 * Check that only in utf encoding, because other encodings aren't | ||||||
| 	 * supported by postgres or even exists. | 	 * supported by postgres or even exists. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide) | 	if (GetDatabaseEncoding() == PG_UTF8) | ||||||
| 	{ | 	{ | ||||||
| 		static const pg_wchar strange_letter[] = { | 		static const pg_wchar strange_letter[] = { | ||||||
| 			/* | 			/* | ||||||
| @@ -944,10 +902,7 @@ p_isspecial(TParser *prs) | |||||||
| 				   *StopMiddle; | 				   *StopMiddle; | ||||||
| 		pg_wchar	c; | 		pg_wchar	c; | ||||||
|  |  | ||||||
| 		if (prs->pgwstr) | 		c = *(prs->pgwstr + prs->state->poschar); | ||||||
| 			c = *(prs->pgwstr + prs->state->poschar); |  | ||||||
| 		else |  | ||||||
| 			c = (pg_wchar) *(prs->wstr + prs->state->poschar); |  | ||||||
|  |  | ||||||
| 		while (StopLow < StopHigh) | 		while (StopLow < StopHigh) | ||||||
| 		{ | 		{ | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user