1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-19 13:42:17 +03:00

Add SQL function CASEFOLD().

Useful for caseless matching. Similar to LOWER(), but avoids edge-case
problems with using LOWER() for caseless matching.

For collations that support it, CASEFOLD() handles characters with
more than two case variations or multi-character case variations. Some
characters may fold to uppercase. The results of case folding are also
more stable across Unicode versions than LOWER() or UPPER().

Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
Reviewed-by: Ian Lawrence Barwick
This commit is contained in:
Jeff Davis
2025-01-24 14:56:22 -08:00
parent f15538cd27
commit bfc5992069
14 changed files with 278 additions and 3 deletions

View File

@@ -1819,6 +1819,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
return result;
}
/*
 * collation-aware, wide-character-aware case folding
 *
 * We pass the number of bytes so we can pass varlena and char*
 * to this function.  The result is a palloc'd, null-terminated string.
 *
 * Returns NULL if buff is NULL.  Raises an error if collid is not a valid
 * OID, or if the database encoding is not UTF8.
 */
char *
str_casefold(const char *buff, size_t nbytes, Oid collid)
{
	char	   *result;
	pg_locale_t mylocale;

	if (!buff)
		return NULL;

	if (!OidIsValid(collid))
	{
		/*
		 * This typically means that the parser could not resolve a conflict
		 * of implicit collations, so report it that way.
		 */
		ereport(ERROR,
				(errcode(ERRCODE_INDETERMINATE_COLLATION),
				 errmsg("could not determine which collation to use for %s function",
						"casefold()"),
				 errhint("Use the COLLATE clause to set the collation explicitly.")));
	}

	/* Checked before looking at the collation, so it applies to all of them. */
	if (GetDatabaseEncoding() != PG_UTF8)
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("Unicode case folding can only be performed if server encoding is UTF8")));

	mylocale = pg_newlocale_from_collation(collid);

	/* C/POSIX ctype behavior: plain ASCII lowercasing instead of pg_strfold() */
	if (mylocale->ctype_is_c)
	{
		result = asc_tolower(buff, nbytes);
	}
	else
	{
		const char *src = buff;
		size_t		srclen = nbytes;
		size_t		dstsize;
		char	   *dst;
		size_t		needed;

		/* first try buffer of equal size plus terminating NUL */
		dstsize = srclen + 1;
		dst = palloc(dstsize);

		needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
		if (needed + 1 > dstsize)
		{
			/*
			 * The folded string (plus its NUL) did not fit; pg_strfold() told
			 * us how much it needs, so grow the buffer to that size and
			 * retry.
			 */
			dstsize = needed + 1;
			dst = repalloc(dst, dstsize);
			needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
			Assert(needed + 1 <= dstsize);
		}

		Assert(dst[needed] == '\0');
		result = dst;
	}

	return result;
}
/*
* ASCII-only lower function
*

View File

@@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS)
PG_RETURN_TEXT_P(result);
}
/*
 * casefold
 *
 * SQL-callable wrapper: case-folds the text argument with str_casefold()
 * under the call's collation and returns the result as a new text value.
 */
Datum
casefold(PG_FUNCTION_ARGS)
{
	text	   *arg = PG_GETARG_TEXT_PP(0);
	char	   *folded;
	text	   *folded_text;

	folded = str_casefold(VARDATA_ANY(arg),
						  VARSIZE_ANY_EXHDR(arg),
						  PG_GET_COLLATION());

	/* copy into a text datum, then release the intermediate C string */
	folded_text = cstring_to_text(folded);
	pfree(folded);

	PG_RETURN_TEXT_P(folded_text);
}
/********************************************************************
*

View File

@@ -106,6 +106,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
@@ -113,6 +115,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
@@ -1447,6 +1451,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
return 0; /* keep compiler quiet */
}
/*
 * pg_strfold
 *
 * Dispatch case folding to the implementation that matches the locale's
 * provider, passing all arguments through unchanged and returning whatever
 * the chosen implementation returns.
 */
size_t
pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
		   pg_locale_t locale)
{
	if (locale->provider == COLLPROVIDER_BUILTIN)
		return strfold_builtin(dst, dstsize, src, srclen, locale);

#ifdef USE_ICU
	if (locale->provider == COLLPROVIDER_ICU)
		return strfold_icu(dst, dstsize, src, srclen, locale);
#endif

	/* for libc, just use strlower */
	if (locale->provider == COLLPROVIDER_LIBC)
		return strlower_libc(dst, dstsize, src, srclen, locale);

	/* shouldn't happen */
	PGLOCALE_SUPPORT_ERROR(locale->provider);

	return 0;					/* keep compiler quiet */
}
/*
* pg_strcoll
*

View File

@@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
struct WordBoundaryState
@@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
locale->info.builtin.casemap_full);
}
/*
 * Case-fold src into dest for the builtin provider by calling
 * unicode_strfold() with the locale's casemap_full setting.
 */
size_t
strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
				pg_locale_t locale)
{
	size_t		folded_len;

	folded_len = unicode_strfold(dest, destsize, src, srclen,
								 locale->info.builtin.casemap_full);

	return folded_len;
}
pg_locale_t
create_pg_locale_builtin(Oid collid, MemoryContext context)
{

View File

@@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
#ifdef USE_ICU
@@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode);
static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode);
static const struct collate_methods collate_methods_icu = {
.strncoll = strncoll_icu,
@@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
return result_len;
}
/*
 * Case-fold src into dest for the ICU provider.
 *
 * The input is converted to UChar with icu_to_uchar(), folded through
 * icu_convert_case() using u_strFoldCase_default, and converted back into
 * dest with icu_from_uchar(), whose return value is passed on to the caller.
 */
size_t
strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
			pg_locale_t locale)
{
	UChar	   *in_uchar;
	UChar	   *out_uchar;
	int32_t		in_len;
	int32_t		out_len;
	size_t		nbytes;

	in_len = icu_to_uchar(&in_uchar, src, srclen);
	out_len = icu_convert_case(u_strFoldCase_default, locale,
							   &out_uchar, in_uchar, in_len);
	nbytes = icu_from_uchar(dest, destsize, out_uchar, out_len);

	/* both intermediate UChar buffers are ours to release */
	pfree(out_uchar);
	pfree(in_uchar);

	return nbytes;
}
/*
* strncoll_icu_utf8
*
@@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
NULL, locale, pErrorCode);
}
/*
 * Case folding callback: folds src into dest with u_strFoldCase(), choosing
 * the folding options from the language of the given ICU locale name.
 *
 * The return value and *pErrorCode are those of u_strFoldCase().  A failure
 * to extract the language is not reported; default folding is used instead.
 */
static int32_t
u_strFoldCase_default(UChar *dest, int32_t destCapacity,
					  const UChar *src, int32_t srcLength,
					  const char *locale,
					  UErrorCode *pErrorCode)
{
	uint32		options = U_FOLD_CASE_DEFAULT;
	char		lang[3];
	int32_t		langlen;
	UErrorCode	status;

	/*
	 * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
	 * folding does not accept a locale. Instead it just supports a single
	 * option relevant to Turkic languages 'az' and 'tr'; check for those
	 * languages to enable the option.
	 */
	status = U_ZERO_ERROR;
	langlen = uloc_getLanguage(locale, lang, sizeof(lang), &status);

	/*
	 * A language code that exactly fills the buffer is returned without a
	 * terminating NUL and only a warning status, which U_SUCCESS() accepts.
	 * Require the reported length to be shorter than the buffer so that lang
	 * is known to be a proper C string before handing it to strcmp(). Longer
	 * codes can't be 'tr' or 'az' anyway.
	 */
	if (U_SUCCESS(status) && langlen < (int32_t) sizeof(lang))
	{
		/*
		 * The option name is confusing, but it causes u_strFoldCase to use
		 * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
		 */
		if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
			options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
	}

	return u_strFoldCase(dest, destCapacity, src, srcLength,
						 options, pErrorCode);
}
/*
* strncoll_icu
*