From 72fe6d24a38c88e112d5e63a8e907c3e96ae46ad Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Tue, 30 Jul 2024 00:58:06 -0700 Subject: [PATCH] Make collation not depend on setlocale(). Now that the result of pg_newlocale_from_collation() is always non-NULL, then we can move the collate_is_c and ctype_is_c flags into pg_locale_t. That simplifies the logic in lc_collate_is_c() and lc_ctype_is_c(), removing the dependence on setlocale(). This commit also eliminates the multi-stage initialization of the collation cache. As long as we have catalog access, then it's now safe to call pg_newlocale_from_collation() without checking lc_collate_is_c() first. Discussion: https://postgr.es/m/cfd9eb85-c52a-4ec9-a90e-a5e4de56e57d@eisentraut.org Reviewed-by: Peter Eisentraut, Andreas Karlsson --- src/backend/utils/adt/pg_locale.c | 180 +++------------------ src/include/utils/pg_locale.h | 14 ++ src/test/regress/expected/collate.utf8.out | 26 +++ src/test/regress/sql/collate.utf8.sql | 15 ++ 4 files changed, 81 insertions(+), 154 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 49f333b9b68..627ab89d7cc 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -128,9 +128,6 @@ static bool CurrentLCTimeValid = false; typedef struct { Oid collid; /* hash key: pg_collation OID */ - bool collate_is_c; /* is collation's LC_COLLATE C? */ - bool ctype_is_c; /* is collation's LC_CTYPE C? */ - bool flags_valid; /* true if above flags are valid */ pg_locale_t locale; /* locale_t struct, or 0 if not valid */ /* needed for simplehash */ @@ -1225,29 +1222,13 @@ IsoLocaleName(const char *winlocname) /* * Cache mechanism for collation information. * - * We cache two flags: whether the collation's LC_COLLATE or LC_CTYPE is C - * (or POSIX), so we can optimize a few code paths in various places. 
- * For the built-in C and POSIX collations, we can know that without even - * doing a cache lookup, but we want to support aliases for C/POSIX too. - * For the "default" collation, there are separate static cache variables, - * since consulting the pg_collation catalog doesn't tell us what we need. - * - * Also, if a pg_locale_t has been requested for a collation, we cache that - * for the life of a backend. - * - * Note that some code relies on the flags not reporting false negatives - * (that is, saying it's not C when it is). For example, char2wchar() - * could fail if the locale is C, so str_tolower() shouldn't call it - * in that case. - * * Note that we currently lack any way to flush the cache. Since we don't * support ALTER COLLATION, this is OK. The worst case is that someone * drops a collation, and a useless cache entry hangs around in existing * backends. */ - static collation_cache_entry * -lookup_collation_cache(Oid collation, bool set_flags) +lookup_collation_cache(Oid collation) { collation_cache_entry *cache_entry; bool found; @@ -1271,59 +1252,9 @@ lookup_collation_cache(Oid collation, bool set_flags) * Make sure cache entry is marked invalid, in case we fail before * setting things. 
*/ - cache_entry->flags_valid = false; cache_entry->locale = 0; } - if (set_flags && !cache_entry->flags_valid) - { - /* Attempt to set the flags */ - HeapTuple tp; - Form_pg_collation collform; - - tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for collation %u", collation); - collform = (Form_pg_collation) GETSTRUCT(tp); - - if (collform->collprovider == COLLPROVIDER_BUILTIN) - { - Datum datum; - const char *colllocale; - - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); - colllocale = TextDatumGetCString(datum); - - cache_entry->collate_is_c = true; - cache_entry->ctype_is_c = (strcmp(colllocale, "C") == 0); - } - else if (collform->collprovider == COLLPROVIDER_LIBC) - { - Datum datum; - const char *collcollate; - const char *collctype; - - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate); - collcollate = TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype); - collctype = TextDatumGetCString(datum); - - cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) || - (strcmp(collcollate, "POSIX") == 0)); - cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) || - (strcmp(collctype, "POSIX") == 0)); - } - else - { - cache_entry->collate_is_c = false; - cache_entry->ctype_is_c = false; - } - - cache_entry->flags_valid = true; - - ReleaseSysCache(tp); - } - return cache_entry; } @@ -1341,47 +1272,6 @@ lc_collate_is_c(Oid collation) if (!OidIsValid(collation)) return false; - /* - * If we're asked about the default collation, we have to inquire of the C - * library. Cache the result so we only have to compute it once. 
- */ - if (collation == DEFAULT_COLLATION_OID) - { - static int result = -1; - const char *localeptr; - - if (result >= 0) - return (bool) result; - - if (default_locale.provider == COLLPROVIDER_BUILTIN) - { - result = true; - return (bool) result; - } - else if (default_locale.provider == COLLPROVIDER_ICU) - { - result = false; - return (bool) result; - } - else if (default_locale.provider == COLLPROVIDER_LIBC) - { - localeptr = setlocale(LC_CTYPE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_CTYPE setting"); - } - else - elog(ERROR, "unexpected collation provider '%c'", - default_locale.provider); - - if (strcmp(localeptr, "C") == 0) - result = true; - else if (strcmp(localeptr, "POSIX") == 0) - result = true; - else - result = false; - return (bool) result; - } - /* * If we're asked about the built-in C/POSIX collations, we know that. */ @@ -1392,7 +1282,7 @@ lc_collate_is_c(Oid collation) /* * Otherwise, we have to consult pg_collation, but we cache that. */ - return (lookup_collation_cache(collation, true))->collate_is_c; + return pg_newlocale_from_collation(collation)->collate_is_c; } /* @@ -1408,46 +1298,6 @@ lc_ctype_is_c(Oid collation) if (!OidIsValid(collation)) return false; - /* - * If we're asked about the default collation, we have to inquire of the C - * library. Cache the result so we only have to compute it once. 
- */ - if (collation == DEFAULT_COLLATION_OID) - { - static int result = -1; - const char *localeptr; - - if (result >= 0) - return (bool) result; - - if (default_locale.provider == COLLPROVIDER_BUILTIN) - { - localeptr = default_locale.info.builtin.locale; - } - else if (default_locale.provider == COLLPROVIDER_ICU) - { - result = false; - return (bool) result; - } - else if (default_locale.provider == COLLPROVIDER_LIBC) - { - localeptr = setlocale(LC_CTYPE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_CTYPE setting"); - } - else - elog(ERROR, "unexpected collation provider '%c'", - default_locale.provider); - - if (strcmp(localeptr, "C") == 0) - result = true; - else if (strcmp(localeptr, "POSIX") == 0) - result = true; - else - result = false; - return (bool) result; - } - /* * If we're asked about the built-in C/POSIX collations, we know that. */ @@ -1458,7 +1308,7 @@ lc_ctype_is_c(Oid collation) /* * Otherwise, we have to consult pg_collation, but we cache that. */ - return (lookup_collation_cache(collation, true))->ctype_is_c; + return pg_newlocale_from_collation(collation)->ctype_is_c; } /* simple subroutine for reporting errors from newlocale() */ @@ -1647,6 +1497,9 @@ init_database_collation(void) builtin_validate_locale(dbform->encoding, datlocale); + default_locale.collate_is_c = true; + default_locale.ctype_is_c = (strcmp(datlocale, "C") == 0); + default_locale.info.builtin.locale = MemoryContextStrdup( TopMemoryContext, datlocale); } @@ -1658,6 +1511,9 @@ init_database_collation(void) datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); datlocale = TextDatumGetCString(datum); + default_locale.collate_is_c = false; + default_locale.ctype_is_c = false; + datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull); if (!isnull) icurules = TextDatumGetCString(datum); @@ -1678,6 +1534,11 @@ init_database_collation(void) datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype); datctype = 
TextDatumGetCString(datum); + default_locale.collate_is_c = (strcmp(datcollate, "C") == 0) || + (strcmp(datcollate, "POSIX") == 0); + default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) || + (strcmp(datctype, "POSIX") == 0); + make_libc_collator(datcollate, datctype, &default_locale); } @@ -1712,7 +1573,7 @@ pg_newlocale_from_collation(Oid collid) if (collid == DEFAULT_COLLATION_OID) return &default_locale; - cache_entry = lookup_collation_cache(collid, false); + cache_entry = lookup_collation_cache(collid); if (cache_entry->locale == 0) { @@ -1741,6 +1602,9 @@ pg_newlocale_from_collation(Oid collid) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); locstr = TextDatumGetCString(datum); + result.collate_is_c = true; + result.ctype_is_c = (strcmp(locstr, "C") == 0); + builtin_validate_locale(GetDatabaseEncoding(), locstr); result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext, @@ -1756,6 +1620,11 @@ pg_newlocale_from_collation(Oid collid) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype); collctype = TextDatumGetCString(datum); + result.collate_is_c = (strcmp(collcollate, "C") == 0) || + (strcmp(collcollate, "POSIX") == 0); + result.ctype_is_c = (strcmp(collctype, "C") == 0) || + (strcmp(collctype, "POSIX") == 0); + make_libc_collator(collcollate, collctype, &result); } else if (collform->collprovider == COLLPROVIDER_ICU) @@ -1766,6 +1635,9 @@ pg_newlocale_from_collation(Oid collid) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); iculocstr = TextDatumGetCString(datum); + result.collate_is_c = false; + result.ctype_is_c = false; + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull); if (!isnull) icurules = TextDatumGetCString(datum); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 3e14a261b16..f41d33975be 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -69,11 +69,25 @@ extern void 
cache_locale_time(void); /* * We use a discriminated union to hold either a locale_t or an ICU collator. * pg_locale_t is occasionally checked for truth, so make it a pointer. + * + * Also, hold two flags: whether the collation's LC_COLLATE or LC_CTYPE is C + * (or POSIX), so we can optimize a few code paths in various places. For the + * built-in C and POSIX collations, we can know that without even doing a + * cache lookup, but we want to support aliases for C/POSIX too. For the + * "default" collation, the flags are set by init_database_collation() from + * pg_database, since the pg_collation catalog doesn't tell us what we need. + * + * Note that some code relies on the flags not reporting false negatives + * (that is, saying it's not C when it is). For example, char2wchar() + * could fail if the locale is C, so str_tolower() shouldn't call it + * in that case. */ struct pg_locale_struct { char provider; bool deterministic; + bool collate_is_c; + bool ctype_is_c; union { struct diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index eff0ef21ac5..4558d2521a2 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -9,6 +9,32 @@ SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset \endif SET client_encoding TO UTF8; -- +-- Test builtin "C" +-- +CREATE COLLATION regress_builtin_c ( + provider = builtin, locale = 'C'); +-- non-ASCII characters are unchanged +SELECT LOWER(U&'\00C1' COLLATE regress_builtin_c) = U&'\00C1'; + ?column? +---------- + t +(1 row) + +SELECT UPPER(U&'\00E1' COLLATE regress_builtin_c) = U&'\00E1'; + ?column? +---------- + t +(1 row) + +-- non-ASCII characters are not alphabetic +SELECT U&'\00C1\00E1' !~ '[[:alpha:]]' COLLATE regress_builtin_c; + ?column? 
+---------- + t +(1 row) + +DROP COLLATION regress_builtin_c; +-- -- Test PG_C_UTF8 -- CREATE COLLATION regress_pg_c_utf8 ( diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index 1f5f9ef491d..87fe06ddf1b 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -11,6 +11,21 @@ SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset SET client_encoding TO UTF8; +-- +-- Test builtin "C" +-- +CREATE COLLATION regress_builtin_c ( + provider = builtin, locale = 'C'); + +-- non-ASCII characters are unchanged +SELECT LOWER(U&'\00C1' COLLATE regress_builtin_c) = U&'\00C1'; +SELECT UPPER(U&'\00E1' COLLATE regress_builtin_c) = U&'\00E1'; + +-- non-ASCII characters are not alphabetic +SELECT U&'\00C1\00E1' !~ '[[:alpha:]]' COLLATE regress_builtin_c; + +DROP COLLATION regress_builtin_c; + -- -- Test PG_C_UTF8 --