From 72fe6d24a38c88e112d5e63a8e907c3e96ae46ad Mon Sep 17 00:00:00 2001
From: Jeff Davis <jdavis@postgresql.org>
Date: Tue, 30 Jul 2024 00:58:06 -0700
Subject: [PATCH] Make collation not depend on setlocale().

Now that the result of pg_newlocale_from_collation() is always
non-NULL, then we can move the collate_is_c and ctype_is_c flags into
pg_locale_t. That simplifies the logic in lc_collate_is_c() and
lc_ctype_is_c(), removing the dependence on setlocale().

This commit also eliminates the multi-stage initialization of the
collation cache.

As long as we have catalog access, then it's now safe to call
pg_newlocale_from_collation() without checking lc_collate_is_c()
first.

Discussion: https://postgr.es/m/cfd9eb85-c52a-4ec9-a90e-a5e4de56e57d@eisentraut.org
Reviewed-by: Peter Eisentraut, Andreas Karlsson
---
 src/backend/utils/adt/pg_locale.c          | 180 +++------------------
 src/include/utils/pg_locale.h              |  14 ++
 src/test/regress/expected/collate.utf8.out |  26 +++
 src/test/regress/sql/collate.utf8.sql      |  15 ++
 4 files changed, 81 insertions(+), 154 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 49f333b9b68..627ab89d7cc 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -128,9 +128,6 @@ static bool CurrentLCTimeValid = false;
 typedef struct
 {
 	Oid			collid;			/* hash key: pg_collation OID */
-	bool		collate_is_c;	/* is collation's LC_COLLATE C? */
-	bool		ctype_is_c;		/* is collation's LC_CTYPE C? */
-	bool		flags_valid;	/* true if above flags are valid */
 	pg_locale_t locale;			/* locale_t struct, or 0 if not valid */
 
 	/* needed for simplehash */
@@ -1225,29 +1222,13 @@ IsoLocaleName(const char *winlocname)
 /*
  * Cache mechanism for collation information.
  *
- * We cache two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
- * (or POSIX), so we can optimize a few code paths in various places.
- * For the built-in C and POSIX collations, we can know that without even
- * doing a cache lookup, but we want to support aliases for C/POSIX too.
- * For the "default" collation, there are separate static cache variables,
- * since consulting the pg_collation catalog doesn't tell us what we need.
- *
- * Also, if a pg_locale_t has been requested for a collation, we cache that
- * for the life of a backend.
- *
- * Note that some code relies on the flags not reporting false negatives
- * (that is, saying it's not C when it is).  For example, char2wchar()
- * could fail if the locale is C, so str_tolower() shouldn't call it
- * in that case.
- *
  * Note that we currently lack any way to flush the cache.  Since we don't
  * support ALTER COLLATION, this is OK.  The worst case is that someone
  * drops a collation, and a useless cache entry hangs around in existing
  * backends.
  */
-
 static collation_cache_entry *
-lookup_collation_cache(Oid collation, bool set_flags)
+lookup_collation_cache(Oid collation)
 {
 	collation_cache_entry *cache_entry;
 	bool		found;
@@ -1271,59 +1252,9 @@ lookup_collation_cache(Oid collation, bool set_flags)
 		 * Make sure cache entry is marked invalid, in case we fail before
 		 * setting things.
 		 */
-		cache_entry->flags_valid = false;
 		cache_entry->locale = 0;
 	}
 
-	if (set_flags && !cache_entry->flags_valid)
-	{
-		/* Attempt to set the flags */
-		HeapTuple	tp;
-		Form_pg_collation collform;
-
-		tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation));
-		if (!HeapTupleIsValid(tp))
-			elog(ERROR, "cache lookup failed for collation %u", collation);
-		collform = (Form_pg_collation) GETSTRUCT(tp);
-
-		if (collform->collprovider == COLLPROVIDER_BUILTIN)
-		{
-			Datum		datum;
-			const char *colllocale;
-
-			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
-			colllocale = TextDatumGetCString(datum);
-
-			cache_entry->collate_is_c = true;
-			cache_entry->ctype_is_c = (strcmp(colllocale, "C") == 0);
-		}
-		else if (collform->collprovider == COLLPROVIDER_LIBC)
-		{
-			Datum		datum;
-			const char *collcollate;
-			const char *collctype;
-
-			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
-			collcollate = TextDatumGetCString(datum);
-			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
-			collctype = TextDatumGetCString(datum);
-
-			cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) ||
-										 (strcmp(collcollate, "POSIX") == 0));
-			cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) ||
-									   (strcmp(collctype, "POSIX") == 0));
-		}
-		else
-		{
-			cache_entry->collate_is_c = false;
-			cache_entry->ctype_is_c = false;
-		}
-
-		cache_entry->flags_valid = true;
-
-		ReleaseSysCache(tp);
-	}
-
 	return cache_entry;
 }
 
@@ -1341,47 +1272,6 @@ lc_collate_is_c(Oid collation)
 	if (!OidIsValid(collation))
 		return false;
 
-	/*
-	 * If we're asked about the default collation, we have to inquire of the C
-	 * library.  Cache the result so we only have to compute it once.
-	 */
-	if (collation == DEFAULT_COLLATION_OID)
-	{
-		static int	result = -1;
-		const char *localeptr;
-
-		if (result >= 0)
-			return (bool) result;
-
-		if (default_locale.provider == COLLPROVIDER_BUILTIN)
-		{
-			result = true;
-			return (bool) result;
-		}
-		else if (default_locale.provider == COLLPROVIDER_ICU)
-		{
-			result = false;
-			return (bool) result;
-		}
-		else if (default_locale.provider == COLLPROVIDER_LIBC)
-		{
-			localeptr = setlocale(LC_CTYPE, NULL);
-			if (!localeptr)
-				elog(ERROR, "invalid LC_CTYPE setting");
-		}
-		else
-			elog(ERROR, "unexpected collation provider '%c'",
-				 default_locale.provider);
-
-		if (strcmp(localeptr, "C") == 0)
-			result = true;
-		else if (strcmp(localeptr, "POSIX") == 0)
-			result = true;
-		else
-			result = false;
-		return (bool) result;
-	}
-
 	/*
 	 * If we're asked about the built-in C/POSIX collations, we know that.
 	 */
@@ -1392,7 +1282,7 @@ lc_collate_is_c(Oid collation)
 	/*
 	 * Otherwise, we have to consult pg_collation, but we cache that.
 	 */
-	return (lookup_collation_cache(collation, true))->collate_is_c;
+	return pg_newlocale_from_collation(collation)->collate_is_c;
 }
 
 /*
@@ -1408,46 +1298,6 @@ lc_ctype_is_c(Oid collation)
 	if (!OidIsValid(collation))
 		return false;
 
-	/*
-	 * If we're asked about the default collation, we have to inquire of the C
-	 * library.  Cache the result so we only have to compute it once.
-	 */
-	if (collation == DEFAULT_COLLATION_OID)
-	{
-		static int	result = -1;
-		const char *localeptr;
-
-		if (result >= 0)
-			return (bool) result;
-
-		if (default_locale.provider == COLLPROVIDER_BUILTIN)
-		{
-			localeptr = default_locale.info.builtin.locale;
-		}
-		else if (default_locale.provider == COLLPROVIDER_ICU)
-		{
-			result = false;
-			return (bool) result;
-		}
-		else if (default_locale.provider == COLLPROVIDER_LIBC)
-		{
-			localeptr = setlocale(LC_CTYPE, NULL);
-			if (!localeptr)
-				elog(ERROR, "invalid LC_CTYPE setting");
-		}
-		else
-			elog(ERROR, "unexpected collation provider '%c'",
-				 default_locale.provider);
-
-		if (strcmp(localeptr, "C") == 0)
-			result = true;
-		else if (strcmp(localeptr, "POSIX") == 0)
-			result = true;
-		else
-			result = false;
-		return (bool) result;
-	}
-
 	/*
 	 * If we're asked about the built-in C/POSIX collations, we know that.
 	 */
@@ -1458,7 +1308,7 @@ lc_ctype_is_c(Oid collation)
 	/*
 	 * Otherwise, we have to consult pg_collation, but we cache that.
 	 */
-	return (lookup_collation_cache(collation, true))->ctype_is_c;
+	return pg_newlocale_from_collation(collation)->ctype_is_c;
 }
 
 /* simple subroutine for reporting errors from newlocale() */
@@ -1647,6 +1497,9 @@ init_database_collation(void)
 
 		builtin_validate_locale(dbform->encoding, datlocale);
 
+		default_locale.collate_is_c = true;
+		default_locale.ctype_is_c = (strcmp(datlocale, "C") == 0);
+
 		default_locale.info.builtin.locale = MemoryContextStrdup(
 																 TopMemoryContext, datlocale);
 	}
@@ -1658,6 +1511,9 @@ init_database_collation(void)
 		datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale);
 		datlocale = TextDatumGetCString(datum);
 
+		default_locale.collate_is_c = false;
+		default_locale.ctype_is_c = false;
+
 		datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull);
 		if (!isnull)
 			icurules = TextDatumGetCString(datum);
@@ -1678,6 +1534,11 @@ init_database_collation(void)
 		datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype);
 		datctype = TextDatumGetCString(datum);
 
+		default_locale.collate_is_c = (strcmp(datcollate, "C") == 0) ||
+			(strcmp(datcollate, "POSIX") == 0);
+		default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) ||
+			(strcmp(datctype, "POSIX") == 0);
+
 		make_libc_collator(datcollate, datctype, &default_locale);
 	}
 
@@ -1712,7 +1573,7 @@ pg_newlocale_from_collation(Oid collid)
 	if (collid == DEFAULT_COLLATION_OID)
 		return &default_locale;
 
-	cache_entry = lookup_collation_cache(collid, false);
+	cache_entry = lookup_collation_cache(collid);
 
 	if (cache_entry->locale == 0)
 	{
@@ -1741,6 +1602,9 @@ pg_newlocale_from_collation(Oid collid)
 			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
 			locstr = TextDatumGetCString(datum);
 
+			result.collate_is_c = true;
+			result.ctype_is_c = (strcmp(locstr, "C") == 0);
+
 			builtin_validate_locale(GetDatabaseEncoding(), locstr);
 
 			result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext,
@@ -1756,6 +1620,11 @@ pg_newlocale_from_collation(Oid collid)
 			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
 			collctype = TextDatumGetCString(datum);
 
+			result.collate_is_c = (strcmp(collcollate, "C") == 0) ||
+				(strcmp(collcollate, "POSIX") == 0);
+			result.ctype_is_c = (strcmp(collctype, "C") == 0) ||
+				(strcmp(collctype, "POSIX") == 0);
+
 			make_libc_collator(collcollate, collctype, &result);
 		}
 		else if (collform->collprovider == COLLPROVIDER_ICU)
@@ -1766,6 +1635,9 @@ pg_newlocale_from_collation(Oid collid)
 			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
 			iculocstr = TextDatumGetCString(datum);
 
+			result.collate_is_c = false;
+			result.ctype_is_c = false;
+
 			datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
 			if (!isnull)
 				icurules = TextDatumGetCString(datum);
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 3e14a261b16..f41d33975be 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -69,11 +69,25 @@ extern void cache_locale_time(void);
 /*
  * We use a discriminated union to hold either a locale_t or an ICU collator.
  * pg_locale_t is occasionally checked for truth, so make it a pointer.
+ *
+ * Also, hold two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
+ * (or POSIX), so we can optimize a few code paths in various places.  For the
+ * built-in C and POSIX collations, we can know that without even doing a
+ * cache lookup, but we want to support aliases for C/POSIX too.  For the
+ * "default" collation, there are separate static cache variables, since
+ * consulting the pg_collation catalog doesn't tell us what we need.
+ *
+ * Note that some code relies on the flags not reporting false negatives
+ * (that is, saying it's not C when it is).  For example, char2wchar()
+ * could fail if the locale is C, so str_tolower() shouldn't call it
+ * in that case.
  */
 struct pg_locale_struct
 {
 	char		provider;
 	bool		deterministic;
+	bool		collate_is_c;
+	bool		ctype_is_c;
 	union
 	{
 		struct
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index eff0ef21ac5..4558d2521a2 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -9,6 +9,32 @@ SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
 \endif
 SET client_encoding TO UTF8;
 --
+-- Test builtin "C"
+--
+CREATE COLLATION regress_builtin_c (
+  provider = builtin, locale = 'C');
+-- non-ASCII characters are unchanged
+SELECT LOWER(U&'\00C1' COLLATE regress_builtin_c) = U&'\00C1';
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT UPPER(U&'\00E1' COLLATE regress_builtin_c) = U&'\00E1';
+ ?column? 
+----------
+ t
+(1 row)
+
+-- non-ASCII characters are not alphabetic
+SELECT U&'\00C1\00E1' !~ '[[:alpha:]]' COLLATE regress_builtin_c;
+ ?column? 
+----------
+ t
+(1 row)
+
+DROP COLLATION regress_builtin_c;
+--
 -- Test PG_C_UTF8
 --
 CREATE COLLATION regress_pg_c_utf8 (
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index 1f5f9ef491d..87fe06ddf1b 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -11,6 +11,21 @@ SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
 
 SET client_encoding TO UTF8;
 
+--
+-- Test builtin "C"
+--
+CREATE COLLATION regress_builtin_c (
+  provider = builtin, locale = 'C');
+
+-- non-ASCII characters are unchanged
+SELECT LOWER(U&'\00C1' COLLATE regress_builtin_c) = U&'\00C1';
+SELECT UPPER(U&'\00E1' COLLATE regress_builtin_c) = U&'\00E1';
+
+-- non-ASCII characters are not alphabetic
+SELECT U&'\00C1\00E1' !~ '[[:alpha:]]' COLLATE regress_builtin_c;
+
+DROP COLLATION regress_builtin_c;
+
 --
 -- Test PG_C_UTF8
 --