From 806555e3000d0b0e0c536c1dc65548128d457d86 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Tue, 16 Dec 2025 11:13:17 -0800 Subject: [PATCH] ltree: fix case-insensitive matching. Previously, ltree_prefix_eq_ci() used lowercasing with the default collation; while ltree_crc32_sz() used tolower() directly. These were equivalent only if the default collation provider was libc and the encoding was single-byte. Change both to use casefolding with the default collation. Backpatch through 18, where the casefolding APIs were introduced. The bug exists in earlier versions, but would require some adaptation. A REINDEX is required for ltree indexes where the database default collation is not libc. Reviewed-by: Chao Li Reviewed-by: Peter Eisentraut Backpatch-through: 18 Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com Discussion: https://postgr.es/m/01fc00fd66f641b9693d4f9f1af0ccf44cbdfbdf.camel@j-davis.com --- contrib/ltree/crc32.c | 51 ++++++++++++++++++++++++++++------- contrib/ltree/lquery_op.c | 39 ++++++++++++++++++++++++--- src/include/utils/pg_locale.h | 14 ++++++++++ 3 files changed, 91 insertions(+), 13 deletions(-) diff --git a/contrib/ltree/crc32.c b/contrib/ltree/crc32.c index 134f46a805e..ce1b0f28e21 100644 --- a/contrib/ltree/crc32.c +++ b/contrib/ltree/crc32.c @@ -10,15 +10,46 @@ #include "postgres.h" #include "ltree.h" -#ifdef LOWER_NODE -#include <ctype.h> -#define TOLOWER(x) tolower((unsigned char) (x)) -#else -#define TOLOWER(x) (x) -#endif - #include "crc32.h" #include "utils/pg_crc.h" +#ifdef LOWER_NODE +#include "catalog/pg_collation.h" +#include "utils/pg_locale.h" +#endif + +#ifdef LOWER_NODE + +unsigned int +ltree_crc32_sz(const char *buf, int size) +{ + pg_crc32 crc; + const char *p = buf; + static pg_locale_t locale = NULL; + + if (!locale) + locale = pg_newlocale_from_collation(DEFAULT_COLLATION_OID); + + INIT_TRADITIONAL_CRC32(crc); + while (size > 0) + { + char foldstr[UNICODE_CASEMAP_BUFSZ]; + int srclen = 
pg_mblen(p); + size_t foldlen; + + /* fold one codepoint at a time */ + foldlen = pg_strfold(foldstr, UNICODE_CASEMAP_BUFSZ, p, srclen, + locale); + + COMP_TRADITIONAL_CRC32(crc, foldstr, foldlen); + + size -= srclen; + p += srclen; + } + FIN_TRADITIONAL_CRC32(crc); + return (unsigned int) crc; +} + +#else unsigned int ltree_crc32_sz(const char *buf, int size) @@ -29,12 +60,12 @@ ltree_crc32_sz(const char *buf, int size) INIT_TRADITIONAL_CRC32(crc); while (size > 0) { - char c = (char) TOLOWER(*p); - - COMP_TRADITIONAL_CRC32(crc, &c, 1); + COMP_TRADITIONAL_CRC32(crc, p, 1); size--; p++; } FIN_TRADITIONAL_CRC32(crc); return (unsigned int) crc; } + +#endif /* !LOWER_NODE */ diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c index 0b39d64a839..9b1de101213 100644 --- a/contrib/ltree/lquery_op.c +++ b/contrib/ltree/lquery_op.c @@ -93,11 +93,44 @@ ltree_prefix_eq(const char *a, size_t a_sz, const char *b, size_t b_sz) bool ltree_prefix_eq_ci(const char *a, size_t a_sz, const char *b, size_t b_sz) { - char *al = str_tolower(a, a_sz, DEFAULT_COLLATION_OID); - char *bl = str_tolower(b, b_sz, DEFAULT_COLLATION_OID); + static pg_locale_t locale = NULL; + size_t al_sz = a_sz + 1; + size_t al_len; + char *al = palloc(al_sz); + size_t bl_sz = b_sz + 1; + size_t bl_len; + char *bl = palloc(bl_sz); bool res; - res = (strncmp(al, bl, a_sz) == 0); + if (!locale) + locale = pg_newlocale_from_collation(DEFAULT_COLLATION_OID); + + /* casefold both a and b */ + + al_len = pg_strfold(al, al_sz, a, a_sz, locale); + if (al_len + 1 > al_sz) + { + /* grow buffer if needed and retry */ + al_sz = al_len + 1; + al = repalloc(al, al_sz); + al_len = pg_strfold(al, al_sz, a, a_sz, locale); + Assert(al_len + 1 <= al_sz); + } + + bl_len = pg_strfold(bl, bl_sz, b, b_sz, locale); + if (bl_len + 1 > bl_sz) + { + /* grow buffer if needed and retry */ + bl_sz = bl_len + 1; + bl = repalloc(bl, bl_sz); + bl_len = pg_strfold(bl, bl_sz, b, b_sz, locale); + Assert(bl_len + 1 <= bl_sz); + } + + 
if (al_len > bl_len) + res = false; + else + res = (strncmp(al, bl, al_len) == 0); pfree(al); pfree(bl); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 953e185f92d..3a758256591 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -24,6 +24,20 @@ /* use for libc locale names */ #define LOCALE_NAME_BUFLEN 128 +/* + * Maximum number of bytes needed to map a single codepoint. Useful for + * mapping and processing a single input codepoint at a time with a + * statically-allocated buffer. + * + * With full case mapping, an input codepoint may be mapped to as many as + * three output codepoints. See Unicode 16.0.0, section 5.18.2, "Change in + * Length": + * + * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G29675 + */ +#define UNICODE_CASEMAP_LEN 3 +#define UNICODE_CASEMAP_BUFSZ (UNICODE_CASEMAP_LEN * MAX_MULTIBYTE_CHAR_LEN) + /* GUC settings */ extern PGDLLIMPORT char *locale_messages; extern PGDLLIMPORT char *locale_monetary;