1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-16 06:01:02 +03:00

Fix up handling of C/POSIX collations.

Install just one instance of the "C" and "POSIX" collations into
pg_collation, rather than one per encoding.  Make these instances exist
and do something useful even in machines without locale_t support: to wit,
it's now possible to force comparisons and case-folding functions to use C
locale in an otherwise non-C database, whether or not the platform has
support for using any additional collations.

Fix up severely broken upper/lower/initcap functions, too: the C/POSIX
fastpath now does what it is supposed to, and non-default collations are
handled correctly in single-byte database encodings.

Merge the two separate collation hashtables that were being maintained in
pg_locale.c, and be more wary of the possibility that we fail partway
through filling a cache entry.
This commit is contained in:
Tom Lane
2011-03-20 12:43:39 -04:00
parent c2f4ea469b
commit 176d5bae1d
8 changed files with 586 additions and 296 deletions

View File

@ -1462,10 +1462,16 @@ str_numth(char *dest, char *num, int type)
* in multibyte character sets. Note that in either case we are effectively
* assuming that the database character encoding matches the encoding implied
* by LC_CTYPE.
*
* If the system provides locale_t and associated functions (which are
* standardized by Open Group's XBD), we can support collations that are
* neither default nor C. The code is written to handle both combinations
* of have-wide-characters and have-locale_t, though it's rather unlikely
* a platform would have the latter without the former.
*/
/*
* wide-character-aware lower function
* collation-aware, wide-character-aware lower function
*
* We pass the number of bytes so we can pass varlena and char*
* to this function. The result is a palloc'd, null-terminated string.
@ -1474,21 +1480,31 @@ char *
str_tolower(const char *buff, size_t nbytes, Oid collid)
{
char *result;
pg_locale_t mylocale = 0;
if (!buff)
return NULL;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
#ifdef USE_WIDE_UPPER_LOWER
if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
/* C/POSIX collations use this path regardless of database encoding */
if (lc_ctype_is_c(collid))
{
char *p;
result = pnstrdup(buff, nbytes);
for (p = result; *p; p++)
*p = pg_ascii_tolower((unsigned char) *p);
}
#ifdef USE_WIDE_UPPER_LOWER
else if (pg_database_encoding_max_length() > 1)
{
pg_locale_t mylocale = 0;
wchar_t *workspace;
size_t curr_char;
size_t result_size;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
/* Overflow paranoia */
if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
ereport(ERROR,
@ -1501,12 +1517,14 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
{
#ifdef HAVE_LOCALE_T
if (mylocale)
workspace[curr_char] = towlower_l(workspace[curr_char], mylocale);
else
#endif
workspace[curr_char] = towlower(workspace[curr_char]);
workspace[curr_char] = towlower(workspace[curr_char]);
}
/* Make result large enough; case change might change number of bytes */
result_size = curr_char * pg_database_encoding_max_length() + 1;
@ -1515,22 +1533,40 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
wchar2char(result, workspace, result_size, collid);
pfree(workspace);
}
else
#endif /* USE_WIDE_UPPER_LOWER */
else
{
pg_locale_t mylocale = 0;
char *p;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
result = pnstrdup(buff, nbytes);
/*
* Note: we assume that tolower_l() will not be so broken as to need
* an isupper_l() guard test. When using the default collation, we
* apply the traditional Postgres behavior that forces ASCII-style
* treatment of I/i, but in non-default collations you get exactly
* what the collation says.
*/
for (p = result; *p; p++)
*p = pg_tolower((unsigned char) *p);
{
#ifdef HAVE_LOCALE_T
if (mylocale)
*p = tolower_l((unsigned char) *p, mylocale);
else
#endif
*p = pg_tolower((unsigned char) *p);
}
}
return result;
}
/*
* wide-character-aware upper function
* collation-aware, wide-character-aware upper function
*
* We pass the number of bytes so we can pass varlena and char*
* to this function. The result is a palloc'd, null-terminated string.
@ -1539,21 +1575,31 @@ char *
str_toupper(const char *buff, size_t nbytes, Oid collid)
{
char *result;
pg_locale_t mylocale = 0;
if (!buff)
return NULL;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
#ifdef USE_WIDE_UPPER_LOWER
if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
/* C/POSIX collations use this path regardless of database encoding */
if (lc_ctype_is_c(collid))
{
char *p;
result = pnstrdup(buff, nbytes);
for (p = result; *p; p++)
*p = pg_ascii_toupper((unsigned char) *p);
}
#ifdef USE_WIDE_UPPER_LOWER
else if (pg_database_encoding_max_length() > 1)
{
pg_locale_t mylocale = 0;
wchar_t *workspace;
size_t curr_char;
size_t result_size;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
/* Overflow paranoia */
if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
ereport(ERROR,
@ -1566,12 +1612,14 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
{
#ifdef HAVE_LOCALE_T
if (mylocale)
workspace[curr_char] = towupper_l(workspace[curr_char], mylocale);
else
#endif
workspace[curr_char] = towupper(workspace[curr_char]);
workspace[curr_char] = towupper(workspace[curr_char]);
}
/* Make result large enough; case change might change number of bytes */
result_size = curr_char * pg_database_encoding_max_length() + 1;
@ -1580,22 +1628,40 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
wchar2char(result, workspace, result_size, collid);
pfree(workspace);
}
else
#endif /* USE_WIDE_UPPER_LOWER */
else
{
pg_locale_t mylocale = 0;
char *p;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
result = pnstrdup(buff, nbytes);
/*
* Note: we assume that toupper_l() will not be so broken as to need
* an islower_l() guard test. When using the default collation, we
* apply the traditional Postgres behavior that forces ASCII-style
* treatment of I/i, but in non-default collations you get exactly
* what the collation says.
*/
for (p = result; *p; p++)
*p = pg_toupper((unsigned char) *p);
{
#ifdef HAVE_LOCALE_T
if (mylocale)
*p = toupper_l((unsigned char) *p, mylocale);
else
#endif
*p = pg_toupper((unsigned char) *p);
}
}
return result;
}
/*
* wide-character-aware initcap function
* collation-aware, wide-character-aware initcap function
*
* We pass the number of bytes so we can pass varlena and char*
* to this function. The result is a palloc'd, null-terminated string.
@ -1605,21 +1671,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
{
char *result;
int wasalnum = false;
pg_locale_t mylocale = 0;
if (!buff)
return NULL;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
#ifdef USE_WIDE_UPPER_LOWER
if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
/* C/POSIX collations use this path regardless of database encoding */
if (lc_ctype_is_c(collid))
{
char *p;
result = pnstrdup(buff, nbytes);
for (p = result; *p; p++)
{
char c;
if (wasalnum)
*p = c = pg_ascii_tolower((unsigned char) *p);
else
*p = c = pg_ascii_toupper((unsigned char) *p);
/* we don't trust isalnum() here */
wasalnum = ((c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9'));
}
}
#ifdef USE_WIDE_UPPER_LOWER
else if (pg_database_encoding_max_length() > 1)
{
pg_locale_t mylocale = 0;
wchar_t *workspace;
size_t curr_char;
size_t result_size;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
/* Overflow paranoia */
if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
ereport(ERROR,
@ -1660,20 +1747,44 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
wchar2char(result, workspace, result_size, collid);
pfree(workspace);
}
else
#endif /* USE_WIDE_UPPER_LOWER */
else
{
pg_locale_t mylocale = 0;
char *p;
if (collid != DEFAULT_COLLATION_OID)
mylocale = pg_newlocale_from_collation(collid);
result = pnstrdup(buff, nbytes);
/*
* Note: we assume that toupper_l()/tolower_l() will not be so broken
* as to need guard tests. When using the default collation, we apply
* the traditional Postgres behavior that forces ASCII-style treatment
* of I/i, but in non-default collations you get exactly what the
* collation says.
*/
for (p = result; *p; p++)
{
if (wasalnum)
*p = pg_tolower((unsigned char) *p);
#ifdef HAVE_LOCALE_T
if (mylocale)
{
if (wasalnum)
*p = tolower_l((unsigned char) *p, mylocale);
else
*p = toupper_l((unsigned char) *p, mylocale);
wasalnum = isalnum_l((unsigned char) *p, mylocale);
}
else
*p = pg_toupper((unsigned char) *p);
wasalnum = isalnum((unsigned char) *p);
#endif
{
if (wasalnum)
*p = pg_tolower((unsigned char) *p);
else
*p = pg_toupper((unsigned char) *p);
wasalnum = isalnum((unsigned char) *p);
}
}
}