Fix up handling of C/POSIX collations.

Install just one instance of the "C" and "POSIX" collations into pg_collation, rather than one per encoding. Make these instances exist and do something useful even in machines without locale_t support: to wit, it's now possible to force comparisons and case-folding functions to use C locale in an otherwise non-C database, whether or not the platform has support for using any additional collations. Fix up severely broken upper/lower/initcap functions, too: the C/POSIX fastpath now does what it is supposed to, and non-default collations are handled correctly in single-byte database encodings. Merge the two separate collation hashtables that were being maintained in pg_locale.c, and be more wary of the possibility that we fail partway through filling a cache entry.
2025-12-22 17:42:17 +03:00 · 2011-03-20 12:43:39 -04:00
parent c2f4ea469b
commit 176d5bae1d
8 changed files with 586 additions and 296 deletions
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1462,10 +1462,16 @@ str_numth(char *dest, char *num, int type)
 * in multibyte character sets.  Note that in either case we are effectively
 * assuming that the database character encoding matches the encoding implied
 * by LC_CTYPE.
+ *
+ * If the system provides locale_t and associated functions (which are
+ * standardized by Open Group's XBD), we can support collations that are
+ * neither default nor C.  The code is written to handle both combinations
+ * of have-wide-characters and have-locale_t, though it's rather unlikely
+ * a platform would have the latter without the former.
 */

 /*
- * wide-character-aware lower function
+ * collation-aware, wide-character-aware lower function
 *
 * We pass the number of bytes so we can pass varlena and char*
 * to this function.  The result is a palloc'd, null-terminated string.
@@ -1474,21 +1480,31 @@ char *
 str_tolower(const char *buff, size_t nbytes, Oid collid)
 {
 	char	   *result;
-	pg_locale_t	mylocale = 0;

 	if (!buff)
 		return NULL;

-	if (collid != DEFAULT_COLLATION_OID)
-		mylocale = pg_newlocale_from_collation(collid);
-
-#ifdef USE_WIDE_UPPER_LOWER
-	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
+	/* C/POSIX collations use this path regardless of database encoding */
+	if (lc_ctype_is_c(collid))
 	{
+		char	   *p;
+
+		result = pnstrdup(buff, nbytes);
+
+		for (p = result; *p; p++)
+			*p = pg_ascii_tolower((unsigned char) *p);
+	}
+#ifdef USE_WIDE_UPPER_LOWER
+	else if (pg_database_encoding_max_length() > 1)
+	{
+		pg_locale_t	mylocale = 0;
 		wchar_t    *workspace;
 		size_t		curr_char;
 		size_t		result_size;

+		if (collid != DEFAULT_COLLATION_OID)
+			mylocale = pg_newlocale_from_collation(collid);
+
 		/* Overflow paranoia */
 		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
 			ereport(ERROR,
@@ -1501,12 +1517,14 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 		char2wchar(workspace, nbytes + 1, buff, nbytes, collid);

 		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+		{
 #ifdef HAVE_LOCALE_T
 			if (mylocale)
 				workspace[curr_char] = towlower_l(workspace[curr_char], mylocale);
 			else
 #endif
-			workspace[curr_char] = towlower(workspace[curr_char]);
+				workspace[curr_char] = towlower(workspace[curr_char]);
+		}

 		/* Make result large enough; case change might change number of bytes */
 		result_size = curr_char * pg_database_encoding_max_length() + 1;
@@ -1515,22 +1533,40 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 		wchar2char(result, workspace, result_size, collid);
 		pfree(workspace);
 	}
-	else
 #endif   /* USE_WIDE_UPPER_LOWER */
+	else
 	{
+		pg_locale_t	mylocale = 0;
 		char	   *p;

+		if (collid != DEFAULT_COLLATION_OID)
+			mylocale = pg_newlocale_from_collation(collid);
+
 		result = pnstrdup(buff, nbytes);

+		/*
+		 * Note: we assume that tolower_l() will not be so broken as to need
+		 * an isupper_l() guard test.  When using the default collation, we
+		 * apply the traditional Postgres behavior that forces ASCII-style
+		 * treatment of I/i, but in non-default collations you get exactly
+		 * what the collation says.
+		 */
 		for (p = result; *p; p++)
-			*p = pg_tolower((unsigned char) *p);
+		{
+#ifdef HAVE_LOCALE_T
+			if (mylocale)
+				*p = tolower_l((unsigned char) *p, mylocale);
+			else
+#endif
+				*p = pg_tolower((unsigned char) *p);
+		}
 	}

 	return result;
 }

 /*
- * wide-character-aware upper function
+ * collation-aware, wide-character-aware upper function
 *
 * We pass the number of bytes so we can pass varlena and char*
 * to this function.  The result is a palloc'd, null-terminated string.
@@ -1539,21 +1575,31 @@ char *
 str_toupper(const char *buff, size_t nbytes, Oid collid)
 {
 	char	   *result;
-	pg_locale_t	mylocale = 0;

 	if (!buff)
 		return NULL;

-	if (collid != DEFAULT_COLLATION_OID)
-		mylocale = pg_newlocale_from_collation(collid);
-
-#ifdef USE_WIDE_UPPER_LOWER
-	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
+	/* C/POSIX collations use this path regardless of database encoding */
+	if (lc_ctype_is_c(collid))
 	{
+		char	   *p;
+
+		result = pnstrdup(buff, nbytes);
+
+		for (p = result; *p; p++)
+			*p = pg_ascii_toupper((unsigned char) *p);
+	}
+#ifdef USE_WIDE_UPPER_LOWER
+	else if (pg_database_encoding_max_length() > 1)
+	{
+		pg_locale_t	mylocale = 0;
 		wchar_t    *workspace;
 		size_t		curr_char;
 		size_t		result_size;

+		if (collid != DEFAULT_COLLATION_OID)
+			mylocale = pg_newlocale_from_collation(collid);
+
 		/* Overflow paranoia */
 		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
 			ereport(ERROR,
@@ -1566,12 +1612,14 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 		char2wchar(workspace, nbytes + 1, buff, nbytes, collid);

 		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+		{
 #ifdef HAVE_LOCALE_T
 			if (mylocale)
 				workspace[curr_char] = towupper_l(workspace[curr_char], mylocale);
 			else
 #endif
-			workspace[curr_char] = towupper(workspace[curr_char]);
+				workspace[curr_char] = towupper(workspace[curr_char]);
+		}

 		/* Make result large enough; case change might change number of bytes */
 		result_size = curr_char * pg_database_encoding_max_length() + 1;
@@ -1580,22 +1628,40 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 		wchar2char(result, workspace, result_size, collid);
 		pfree(workspace);
 	}
-	else
 #endif   /* USE_WIDE_UPPER_LOWER */
+	else
 	{
+		pg_locale_t	mylocale = 0;
 		char	   *p;

+		if (collid != DEFAULT_COLLATION_OID)
+			mylocale = pg_newlocale_from_collation(collid);
+
 		result = pnstrdup(buff, nbytes);

+		/*
+		 * Note: we assume that toupper_l() will not be so broken as to need
+		 * an islower_l() guard test.  When using the default collation, we
+		 * apply the traditional Postgres behavior that forces ASCII-style
+		 * treatment of I/i, but in non-default collations you get exactly
+		 * what the collation says.
+		 */
 		for (p = result; *p; p++)
-			*p = pg_toupper((unsigned char) *p);
+		{
+#ifdef HAVE_LOCALE_T
+			if (mylocale)
+				*p = toupper_l((unsigned char) *p, mylocale);
+			else
+#endif
+				*p = pg_toupper((unsigned char) *p);
+		}
 	}

 	return result;
 }

 /*
- * wide-character-aware initcap function
+ * collation-aware, wide-character-aware initcap function
 *
 * We pass the number of bytes so we can pass varlena and char*
 * to this function.  The result is a palloc'd, null-terminated string.
@@ -1605,21 +1671,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 {
 	char	   *result;
 	int			wasalnum = false;
-	pg_locale_t	mylocale = 0;

 	if (!buff)
 		return NULL;

-	if (collid != DEFAULT_COLLATION_OID)
-		mylocale = pg_newlocale_from_collation(collid);
-
-#ifdef USE_WIDE_UPPER_LOWER
-	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collid))
+	/* C/POSIX collations use this path regardless of database encoding */
+	if (lc_ctype_is_c(collid))
 	{
+		char	   *p;
+
+		result = pnstrdup(buff, nbytes);
+
+		for (p = result; *p; p++)
+		{
+			char	c;
+
+			if (wasalnum)
+				*p = c = pg_ascii_tolower((unsigned char) *p);
+			else
+				*p = c = pg_ascii_toupper((unsigned char) *p);
+			/* we don't trust isalnum() here */
+			wasalnum = ((c >= 'A' && c <= 'Z') ||
+						(c >= 'a' && c <= 'z') ||
+						(c >= '0' && c <= '9'));
+		}
+	}
+#ifdef USE_WIDE_UPPER_LOWER
+	else if (pg_database_encoding_max_length() > 1)
+	{
+		pg_locale_t	mylocale = 0;
 		wchar_t    *workspace;
 		size_t		curr_char;
 		size_t		result_size;

+		if (collid != DEFAULT_COLLATION_OID)
+			mylocale = pg_newlocale_from_collation(collid);
+
 		/* Overflow paranoia */
 		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
 			ereport(ERROR,
@@ -1660,20 +1747,44 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 		wchar2char(result, workspace, result_size, collid);
 		pfree(workspace);
 	}
-	else
 #endif   /* USE_WIDE_UPPER_LOWER */
+	else
 	{
+		pg_locale_t	mylocale = 0;
 		char	   *p;

+		if (collid != DEFAULT_COLLATION_OID)
+			mylocale = pg_newlocale_from_collation(collid);
+
 		result = pnstrdup(buff, nbytes);

+		/*
+		 * Note: we assume that toupper_l()/tolower_l() will not be so broken
+		 * as to need guard tests.  When using the default collation, we apply
+		 * the traditional Postgres behavior that forces ASCII-style treatment
+		 * of I/i, but in non-default collations you get exactly what the
+		 * collation says.
+		 */
 		for (p = result; *p; p++)
 		{
-			if (wasalnum)
-				*p = pg_tolower((unsigned char) *p);
+#ifdef HAVE_LOCALE_T
+			if (mylocale)
+			{
+				if (wasalnum)
+					*p = tolower_l((unsigned char) *p, mylocale);
+				else
+					*p = toupper_l((unsigned char) *p, mylocale);
+				wasalnum = isalnum_l((unsigned char) *p, mylocale);
+			}
 			else
-				*p = pg_toupper((unsigned char) *p);
-			wasalnum = isalnum((unsigned char) *p);
+#endif
+			{
+				if (wasalnum)
+					*p = pg_tolower((unsigned char) *p);
+				else
+					*p = pg_toupper((unsigned char) *p);
+				wasalnum = isalnum((unsigned char) *p);
+			}
 		}
 	}