From a2f17f004d229f69a32cfa80904b95edcbc68f95 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Wed, 8 Jan 2025 14:26:33 -0800 Subject: [PATCH] Control collation behavior with a method table. Previously, behavior branched based on the provider. A method table is less error-prone and more flexible. The ctype behavior will be addressed in an upcoming commit. Reviewed-by: Andreas Karlsson Discussion: https://postgr.es/m/2830211e1b6e6a2e26d845780b03e125281ea17b.camel%40j-davis.com --- src/backend/utils/adt/pg_locale.c | 123 +++------------------ src/backend/utils/adt/pg_locale_icu.c | 147 +++++++++++++++---------- src/backend/utils/adt/pg_locale_libc.c | 53 +++++++-- src/include/utils/pg_locale.h | 33 ++++++ 4 files changed, 180 insertions(+), 176 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index dc8248fb269..875cca6efc8 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -92,27 +92,12 @@ extern char *get_collation_actual_version_builtin(const char *collcollate); /* pg_locale_icu.c */ #ifdef USE_ICU extern UCollator *pg_ucol_open(const char *loc_str); -extern int strncoll_icu(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, - pg_locale_t locale); -extern size_t strnxfrm_icu(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale); -extern size_t strnxfrm_prefix_icu(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale); extern char *get_collation_actual_version_icu(const char *collcollate); #endif extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); /* pg_locale_libc.c */ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); -extern int strncoll_libc(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, - pg_locale_t locale); -extern size_t strnxfrm_libc(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale); extern char *get_collation_actual_version_libc(const char *collcollate); extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, @@ -1244,6 +1229,9 @@ create_pg_locale(Oid collid, MemoryContext context) result->is_default = false; + Assert((result->collate_is_c && result->collate == NULL) || + (!result->collate_is_c && result->collate != NULL)); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, &isnull); if (!isnull) @@ -1467,19 +1455,7 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) { - int result; - - if (locale->provider == COLLPROVIDER_LIBC) - result = strncoll_libc(arg1, -1, arg2, -1, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - result = strncoll_icu(arg1, -1, arg2, -1, locale); -#endif - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return locale->collate->strncoll(arg1, -1, arg2, -1, locale); } /* @@ -1500,51 +1476,25 @@ int pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale) { - int result; - - if (locale->provider == COLLPROVIDER_LIBC) - result = strncoll_libc(arg1, len1, arg2, len2, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - result = strncoll_icu(arg1, len1, arg2, len2, locale); -#endif - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return locale->collate->strncoll(arg1, len1, arg2, len2, locale); } /* * Return true if the collation provider supports pg_strxfrm() and * pg_strnxfrm(); otherwise false. * - * Unfortunately, it seems that strxfrm() for non-C collations is broken on - * many common platforms; testing of multiple versions of glibc reveals that, - * for many locales, strcoll() and strxfrm() do not return consistent - * results. While no other libc other than Cygwin has so far been shown to - * have a problem, we take the conservative course of action for right now and - * disable this categorically. (Users who are certain this isn't a problem on - * their system can define TRUST_STRXFRM.) * * No similar problem is known for the ICU provider. */ bool pg_strxfrm_enabled(pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_LIBC) -#ifdef TRUST_STRXFRM - return true; -#else - return false; -#endif - else if (locale->provider == COLLPROVIDER_ICU) - return true; - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return false; /* keep compiler quiet */ + /* + * locale->collate->strnxfrm is still a required method, even if it may + * have the wrong behavior, because the planner uses it for estimates in + * some cases. + */ + return locale->collate->strxfrm_is_safe; } /* @@ -1555,19 +1505,7 @@ pg_strxfrm_enabled(pg_locale_t locale) size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) { - size_t result = 0; /* keep compiler quiet */ - - if (locale->provider == COLLPROVIDER_LIBC) - result = strnxfrm_libc(dest, destsize, src, -1, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - result = strnxfrm_icu(dest, destsize, src, -1, locale); -#endif - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return locale->collate->strnxfrm(dest, destsize, src, -1, locale); } /* @@ -1593,19 +1531,7 @@ size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - size_t result = 0; /* keep compiler quiet */ - - if (locale->provider == COLLPROVIDER_LIBC) - result = strnxfrm_libc(dest, destsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - result = strnxfrm_icu(dest, destsize, src, srclen, locale); -#endif - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return locale->collate->strnxfrm(dest, destsize, src, srclen, locale); } /* @@ -1615,15 +1541,7 @@ pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen, bool pg_strxfrm_prefix_enabled(pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_LIBC) - return false; - else if (locale->provider == COLLPROVIDER_ICU) - return true; - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return false; /* keep compiler quiet */ + return (locale->collate->strnxfrm_prefix != NULL); } /* @@ -1635,7 +1553,7 @@ size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale) { - return pg_strnxfrm_prefix(dest, destsize, src, -1, locale); + return locale->collate->strnxfrm_prefix(dest, destsize, src, -1, locale); } /* @@ -1660,16 +1578,7 @@ size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - size_t result = 0; /* keep compiler quiet */ - -#ifdef USE_ICU - if (locale->provider == COLLPROVIDER_ICU) - result = strnxfrm_prefix_icu(dest, destsize, src, -1, locale); - else -#endif - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return locale->collate->strnxfrm_prefix(dest, destsize, src, srclen, locale); } /* diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 6e1fb78bbf3..5185b0f7289 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -58,13 +58,14 @@ extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, #ifdef USE_ICU extern UCollator *pg_ucol_open(const char *loc_str); -extern int strncoll_icu(const char *arg1, ssize_t len1, + +static int strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); -extern size_t strnxfrm_icu(char *dest, size_t destsize, +static size_t strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); -extern size_t strnxfrm_prefix_icu(char *dest, size_t destsize, +static size_t strnxfrm_prefix_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); extern char *get_collation_actual_version_icu(const char *collcollate); @@ -83,12 +84,20 @@ static UConverter *icu_converter = NULL; static UCollator *make_icu_collator(const char *iculocstr, const char *icurules); -static int strncoll_icu_no_utf8(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, - pg_locale_t locale); -static size_t strnxfrm_prefix_icu_no_utf8(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale); +static int strncoll_icu(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); +static size_t strnxfrm_prefix_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +#ifdef HAVE_UCOL_STRCOLLUTF8 +static int strncoll_icu_utf8(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); +#endif +static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); static void init_icu_converter(void); static size_t uchar_length(UConverter *converter, const char *str, int32_t len); @@ -108,6 +117,25 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); + +static const struct collate_methods collate_methods_icu = { + .strncoll = strncoll_icu, + .strnxfrm = strnxfrm_icu, + .strnxfrm_prefix = strnxfrm_prefix_icu, + .strxfrm_is_safe = true, +}; + +static const struct collate_methods collate_methods_icu_utf8 = { +#ifdef HAVE_UCOL_STRCOLLUTF8 + .strncoll = strncoll_icu_utf8, +#else + .strncoll = strncoll_icu, +#endif + .strnxfrm = strnxfrm_icu, + .strnxfrm_prefix = strnxfrm_prefix_icu_utf8, + .strxfrm_is_safe = true, +}; + #endif pg_locale_t @@ -174,6 +202,10 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; + if (GetDatabaseEncoding() == PG_UTF8) + result->collate = &collate_methods_icu_utf8; + else + result->collate = &collate_methods_icu; return result; #else @@ -408,42 +440,36 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, } /* - * strncoll_icu + * strncoll_icu_utf8 * * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given * database encoding. An argument length of -1 means the string is * NUL-terminated. */ +#ifdef HAVE_UCOL_STRCOLLUTF8 int -strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, - pg_locale_t locale) +strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, + pg_locale_t locale) { int result; + UErrorCode status; Assert(locale->provider == COLLPROVIDER_ICU); -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; + Assert(GetDatabaseEncoding() == PG_UTF8); - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(locale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); - } + status = U_ZERO_ERROR; + result = ucol_strcollUTF8(locale->info.icu.ucol, + arg1, len1, + arg2, len2, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("collation failed: %s", u_errorName(status)))); return result; } +#endif /* 'srclen' of -1 means the strings are NUL-terminated */ size_t @@ -494,37 +520,32 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, /* 'srclen' of -1 means the strings are NUL-terminated */ size_t -strnxfrm_prefix_icu(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale) +strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) { size_t result; + UCharIterator iter; + uint32_t state[2]; + UErrorCode status; Assert(locale->provider == COLLPROVIDER_ICU); - if (GetDatabaseEncoding() == PG_UTF8) - { - UCharIterator iter; - uint32_t state[2]; - UErrorCode status; + Assert(GetDatabaseEncoding() == PG_UTF8); - uiter_setUTF8(&iter, src, srclen); - state[0] = state[1] = 0; /* won't need that again */ - status = U_ZERO_ERROR; - result = ucol_nextSortKeyPart(locale->info.icu.ucol, - &iter, - state, - (uint8_t *) dest, - destsize, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("sort key generation failed: %s", - u_errorName(status)))); - } - else - result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen, - locale); + uiter_setUTF8(&iter, src, srclen); + state[0] = state[1] = 0; /* won't need that again */ + status = U_ZERO_ERROR; + result = ucol_nextSortKeyPart(locale->info.icu.ucol, + &iter, + state, + (uint8_t *) dest, + destsize, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("sort key generation failed: %s", + u_errorName(status)))); return result; } @@ -653,7 +674,7 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, } /* - * strncoll_icu_no_utf8 + * strncoll_icu * * Convert the arguments from the database encoding to UChar strings, then * call ucol_strcoll(). An argument length of -1 means that the string is @@ -663,8 +684,8 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, * caller should call that instead. */ static int -strncoll_icu_no_utf8(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, pg_locale_t locale) +strncoll_icu(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -677,6 +698,8 @@ strncoll_icu_no_utf8(const char *arg1, ssize_t len1, int result; Assert(locale->provider == COLLPROVIDER_ICU); + + /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */ #ifdef HAVE_UCOL_STRCOLLUTF8 Assert(GetDatabaseEncoding() != PG_UTF8); #endif @@ -710,9 +733,9 @@ strncoll_icu_no_utf8(const char *arg1, ssize_t len1, /* 'srclen' of -1 means the strings are NUL-terminated */ static size_t -strnxfrm_prefix_icu_no_utf8(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale) +strnxfrm_prefix_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -725,6 +748,8 @@ strnxfrm_prefix_icu_no_utf8(char *dest, size_t destsize, Size result_bsize; Assert(locale->provider == COLLPROVIDER_ICU); + + /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */ Assert(GetDatabaseEncoding() != PG_UTF8); init_icu_converter(); diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 81120061b50..8f9a8637897 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -50,10 +50,10 @@ extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); -extern int strncoll_libc(const char *arg1, ssize_t len1, +static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); -extern size_t strnxfrm_libc(char *dest, size_t destsize, +static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); extern char *get_collation_actual_version_libc(const char *collcollate); @@ -86,6 +86,40 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +static const struct collate_methods collate_methods_libc = { + .strncoll = strncoll_libc, + .strnxfrm = strnxfrm_libc, + .strnxfrm_prefix = NULL, + + /* + * Unfortunately, it seems that strxfrm() for non-C collations is broken + * on many common platforms; testing of multiple versions of glibc reveals + * that, for many locales, strcoll() and strxfrm() do not return + * consistent results. While no other libc other than Cygwin has so far + * been shown to have a problem, we take the conservative course of action + * for right now and disable this categorically. (Users who are certain + * this isn't a problem on their system can define TRUST_STRXFRM.) + */ +#ifdef TRUST_STRXFRM + .strxfrm_is_safe = true, +#else + .strxfrm_is_safe = false, +#endif +}; + +#ifdef WIN32 +static const struct collate_methods collate_methods_libc_win32_utf8 = { + .strncoll = strncoll_libc_win32_utf8, + .strnxfrm = strnxfrm_libc, + .strnxfrm_prefix = NULL, +#ifdef TRUST_STRXFRM + .strxfrm_is_safe = true, +#else + .strxfrm_is_safe = false, +#endif +}; +#endif + size_t strlower_libc(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) @@ -439,6 +473,15 @@ create_pg_locale_libc(Oid collid, MemoryContext context) result->ctype_is_c = (strcmp(ctype, "C") == 0) || (strcmp(ctype, "POSIX") == 0); result->info.lt = loc; + if (!result->collate_is_c) + { +#ifdef WIN32 + if (GetDatabaseEncoding() == PG_UTF8) + result->collate = &collate_methods_libc_win32_utf8; + else +#endif + result->collate = &collate_methods_libc; + } return result; } @@ -536,12 +579,6 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, Assert(locale->provider == COLLPROVIDER_LIBC); -#ifdef WIN32 - /* check for this case before doing the work for nul-termination */ - if (GetDatabaseEncoding() == PG_UTF8) - return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); -#endif /* WIN32 */ - if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 7a70ec1173a..97b866b3444 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -47,6 +47,36 @@ extern struct lconv *PGLC_localeconv(void); extern void cache_locale_time(void); +struct pg_locale_struct; +typedef struct pg_locale_struct *pg_locale_t; + +/* methods that define collation behavior */ +struct collate_methods +{ + /* required */ + int (*strncoll) (const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); + + /* required */ + size_t (*strnxfrm) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + + /* optional */ + size_t (*strnxfrm_prefix) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + + /* + * If the strnxfrm method is not trusted to return the correct results, + * set strxfrm_is_safe to false. It set to false, the method will not be + * used in most cases, but the planner still expects it to be there for + * estimation purposes (where incorrect results are acceptable). + */ + bool strxfrm_is_safe; +}; + /* * We use a discriminated union to hold either a locale_t or an ICU collator. * pg_locale_t is occasionally checked for truth, so make it a pointer. @@ -70,6 +100,9 @@ struct pg_locale_struct bool collate_is_c; bool ctype_is_c; bool is_default; + + const struct collate_methods *collate; /* NULL if collate_is_c */ + union { struct