mirror of
https://github.com/postgres/postgres.git
synced 2025-05-31 03:21:24 +03:00
Add SQL function CASEFOLD().
Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching.

For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER().

Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
Reviewed-by: Ian Lawrence Barwick
This commit is contained in:
parent
f15538cd27
commit
bfc5992069
@ -2596,7 +2596,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
|
||||
|
||||
<row>
|
||||
<entry role="func_table_entry"><para role="func_signature">
|
||||
<indexterm>
|
||||
<indexterm id="function-lower">
|
||||
<primary>lower</primary>
|
||||
</indexterm>
|
||||
<function>lower</function> ( <type>text</type> )
|
||||
@ -2657,7 +2657,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
|
||||
|
||||
<row>
|
||||
<entry role="func_table_entry"><para role="func_signature">
|
||||
<indexterm>
|
||||
<indexterm id="function-normalize">
|
||||
<primary>normalize</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
@ -3109,6 +3109,48 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
|
||||
</para></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry role="func_table_entry"><para role="func_signature">
|
||||
<indexterm>
|
||||
<primary>casefold</primary>
|
||||
</indexterm>
|
||||
<function>casefold</function> ( <type>text</type> )
|
||||
<returnvalue>text</returnvalue>
|
||||
</para>
|
||||
<para>
|
||||
Performs case folding of the input string according to the collation.
|
||||
Case folding is similar to case conversion, but the purpose of case
|
||||
folding is to facilitate case-insensitive comparison of strings,
|
||||
whereas the purpose of case conversion is to convert to a particular
|
||||
cased form. This function can only be used when the server encoding
|
||||
is <literal>UTF8</literal>.
|
||||
</para>
|
||||
<para>
|
||||
Ordinarily, case folding simply converts to lowercase, but there are a
|
||||
few notable exceptions depending on the collation. For instance, the
|
||||
character <literal>Σ</literal> (U+03A3) has two lowercase forms:
|
||||
<literal>σ</literal> (U+03C3) and <literal>ς</literal> (U+03C2); case
|
||||
folding in the <literal>PG_C_UTF8</literal> collation maps all three
|
||||
forms to <literal>σ</literal>. Additionally, the result is not
|
||||
necessarily lowercase; some characters may be folded to uppercase.
|
||||
</para>
|
||||
<para>
|
||||
Case folding may change the length of the string. For instance, in
|
||||
the <literal>PG_UNICODE_FAST</literal> collation, <literal>ß</literal>
|
||||
(U+00DF) folds to <literal>ss</literal>.
|
||||
</para>
|
||||
<para>
|
||||
<function>casefold</function> can be used for Unicode Default Caseless
|
||||
Matching. It does not always preserve the normalized form of the
|
||||
input string (see <xref linkend="function-normalize"/>).
|
||||
</para>
|
||||
<para>
|
||||
The <literal>libc</literal> provider doesn't support case folding, so
with that provider <function>casefold</function> behaves identically to
<xref linkend="function-lower"/>.
|
||||
</para></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry role="func_table_entry"><para role="func_signature">
|
||||
<indexterm>
|
||||
|
@ -1819,6 +1819,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* collation-aware, wide-character-aware case folding
|
||||
*
|
||||
* We pass the number of bytes so we can pass varlena and char*
|
||||
* to this function. The result is a palloc'd, null-terminated string.
|
||||
*/
|
||||
char *
|
||||
str_casefold(const char *buff, size_t nbytes, Oid collid)
|
||||
{
|
||||
char *result;
|
||||
pg_locale_t mylocale;
|
||||
|
||||
if (!buff)
|
||||
return NULL;
|
||||
|
||||
if (!OidIsValid(collid))
|
||||
{
|
||||
/*
|
||||
* This typically means that the parser could not resolve a conflict
|
||||
* of implicit collations, so report it that way.
|
||||
*/
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDETERMINATE_COLLATION),
|
||||
errmsg("could not determine which collation to use for %s function",
|
||||
"lower()"),
|
||||
errhint("Use the COLLATE clause to set the collation explicitly.")));
|
||||
}
|
||||
|
||||
if (GetDatabaseEncoding() != PG_UTF8)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("Unicode case folding can only be performed if server encoding is UTF8")));
|
||||
|
||||
mylocale = pg_newlocale_from_collation(collid);
|
||||
|
||||
/* C/POSIX collations use this path regardless of database encoding */
|
||||
if (mylocale->ctype_is_c)
|
||||
{
|
||||
result = asc_tolower(buff, nbytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
const char *src = buff;
|
||||
size_t srclen = nbytes;
|
||||
size_t dstsize;
|
||||
char *dst;
|
||||
size_t needed;
|
||||
|
||||
/* first try buffer of equal size plus terminating NUL */
|
||||
dstsize = srclen + 1;
|
||||
dst = palloc(dstsize);
|
||||
|
||||
needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
|
||||
if (needed + 1 > dstsize)
|
||||
{
|
||||
/* grow buffer if needed and retry */
|
||||
dstsize = needed + 1;
|
||||
dst = repalloc(dst, dstsize);
|
||||
needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
|
||||
Assert(needed + 1 <= dstsize);
|
||||
}
|
||||
|
||||
Assert(dst[needed] == '\0');
|
||||
result = dst;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* ASCII-only lower function
|
||||
*
|
||||
|
@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_TEXT_P(result);
|
||||
}
|
||||
|
||||
Datum
|
||||
casefold(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in_string = PG_GETARG_TEXT_PP(0);
|
||||
char *out_string;
|
||||
text *result;
|
||||
|
||||
out_string = str_casefold(VARDATA_ANY(in_string),
|
||||
VARSIZE_ANY_EXHDR(in_string),
|
||||
PG_GET_COLLATION());
|
||||
result = cstring_to_text(out_string);
|
||||
pfree(out_string);
|
||||
|
||||
PG_RETURN_TEXT_P(result);
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************
|
||||
*
|
||||
|
@ -106,6 +106,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
|
||||
extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
@ -113,6 +115,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
|
||||
extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
@ -1447,6 +1451,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
return 0; /* keep compiler quiet */
|
||||
}
|
||||
|
||||
size_t
|
||||
pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
if (locale->provider == COLLPROVIDER_BUILTIN)
|
||||
return strfold_builtin(dst, dstsize, src, srclen, locale);
|
||||
#ifdef USE_ICU
|
||||
else if (locale->provider == COLLPROVIDER_ICU)
|
||||
return strfold_icu(dst, dstsize, src, srclen, locale);
|
||||
#endif
|
||||
/* for libc, just use strlower */
|
||||
else if (locale->provider == COLLPROVIDER_LIBC)
|
||||
return strlower_libc(dst, dstsize, src, srclen, locale);
|
||||
else
|
||||
/* shouldn't happen */
|
||||
PGLOCALE_SUPPORT_ERROR(locale->provider);
|
||||
|
||||
return 0; /* keep compiler quiet */
|
||||
}
|
||||
|
||||
/*
|
||||
* pg_strcoll
|
||||
*
|
||||
|
@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
|
||||
|
||||
struct WordBoundaryState
|
||||
@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
locale->info.builtin.casemap_full);
|
||||
}
|
||||
|
||||
size_t
|
||||
strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
return unicode_strfold(dest, destsize, src, srclen,
|
||||
locale->info.builtin.casemap_full);
|
||||
}
|
||||
|
||||
pg_locale_t
|
||||
create_pg_locale_builtin(Oid collid, MemoryContext context)
|
||||
{
|
||||
|
@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
|
||||
ssize_t srclen, pg_locale_t locale);
|
||||
|
||||
#ifdef USE_ICU
|
||||
|
||||
@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode);
|
||||
static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
static const struct collate_methods collate_methods_icu = {
|
||||
.strncoll = strncoll_icu,
|
||||
@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
return result_len;
|
||||
}
|
||||
|
||||
size_t
|
||||
strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
pg_locale_t locale)
|
||||
{
|
||||
int32_t len_uchar;
|
||||
int32_t len_conv;
|
||||
UChar *buff_uchar;
|
||||
UChar *buff_conv;
|
||||
size_t result_len;
|
||||
|
||||
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
|
||||
len_conv = icu_convert_case(u_strFoldCase_default, locale,
|
||||
&buff_conv, buff_uchar, len_uchar);
|
||||
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
|
||||
pfree(buff_uchar);
|
||||
pfree(buff_conv);
|
||||
|
||||
return result_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* strncoll_icu_utf8
|
||||
*
|
||||
@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
|
||||
NULL, locale, pErrorCode);
|
||||
}
|
||||
|
||||
static int32_t
|
||||
u_strFoldCase_default(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode)
|
||||
{
|
||||
uint32 options = U_FOLD_CASE_DEFAULT;
|
||||
char lang[3];
|
||||
UErrorCode status;
|
||||
|
||||
/*
|
||||
* Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
|
||||
* folding does not accept a locale. Instead it just supports a single
|
||||
* option relevant to Turkic languages 'az' and 'tr'; check for those
|
||||
* languages to enable the option.
|
||||
*/
|
||||
status = U_ZERO_ERROR;
|
||||
uloc_getLanguage(locale, lang, 3, &status);
|
||||
if (U_SUCCESS(status))
|
||||
{
|
||||
/*
|
||||
* The option name is confusing, but it causes u_strFoldCase to use
|
||||
* the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
|
||||
*/
|
||||
if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
|
||||
options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
|
||||
}
|
||||
|
||||
return u_strFoldCase(dest, destCapacity, src, srcLength,
|
||||
options, pErrorCode);
|
||||
}
|
||||
|
||||
/*
|
||||
* strncoll_icu
|
||||
*
|
||||
|
@ -57,6 +57,6 @@
|
||||
*/
|
||||
|
||||
/* yyyymmddN */
|
||||
#define CATALOG_VERSION_NO 202501231
|
||||
#define CATALOG_VERSION_NO 202501232
|
||||
|
||||
#endif
|
||||
|
@ -3623,6 +3623,9 @@
|
||||
{ oid => '872', descr => 'capitalize each word',
|
||||
proname => 'initcap', prorettype => 'text', proargtypes => 'text',
|
||||
prosrc => 'initcap' },
|
||||
{ oid => '9569', descr => 'fold case',
|
||||
proname => 'casefold', prorettype => 'text', proargtypes => 'text',
|
||||
prosrc => 'casefold' },
|
||||
{ oid => '873', descr => 'left-pad string to length',
|
||||
proname => 'lpad', prorettype => 'text', proargtypes => 'text int4 text',
|
||||
prosrc => 'lpad' },
|
||||
|
@ -21,6 +21,7 @@
|
||||
extern char *str_tolower(const char *buff, size_t nbytes, Oid collid);
|
||||
extern char *str_toupper(const char *buff, size_t nbytes, Oid collid);
|
||||
extern char *str_initcap(const char *buff, size_t nbytes, Oid collid);
|
||||
extern char *str_casefold(const char *buff, size_t nbytes, Oid collid);
|
||||
|
||||
extern char *asc_tolower(const char *buff, size_t nbytes);
|
||||
extern char *asc_toupper(const char *buff, size_t nbytes);
|
||||
|
@ -134,6 +134,9 @@ extern size_t pg_strtitle(char *dest, size_t destsize,
|
||||
extern size_t pg_strupper(char *dest, size_t destsize,
|
||||
const char *src, ssize_t srclen,
|
||||
pg_locale_t locale);
|
||||
extern size_t pg_strfold(char *dest, size_t destsize,
|
||||
const char *src, ssize_t srclen,
|
||||
pg_locale_t locale);
|
||||
extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
|
||||
extern int pg_strncoll(const char *arg1, ssize_t len1,
|
||||
const char *arg2, ssize_t len2, pg_locale_t locale);
|
||||
|
@ -255,6 +255,30 @@ SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
|
||||
1 | hij | hij
|
||||
(2 rows)
|
||||
|
||||
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
|
||||
lower
|
||||
-------------------------------
|
||||
abcd 123 #$% ıiii̇ ß ß dždždž σσς
|
||||
(1 row)
|
||||
|
||||
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
|
||||
casefold
|
||||
---------------------------------
|
||||
abcd 123 #$% ıiii̇ ss ss dždždž σσσ
|
||||
(1 row)
|
||||
|
||||
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
|
||||
lower
|
||||
-------------------------------
|
||||
abcd 123 #$% ıiıi ß ß dždždž σσς
|
||||
(1 row)
|
||||
|
||||
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
|
||||
casefold
|
||||
---------------------------------
|
||||
abcd 123 #$% ıiıi ss ss dždždž σσσ
|
||||
(1 row)
|
||||
|
||||
-- LIKE/ILIKE
|
||||
SELECT * FROM collate_test1 WHERE b LIKE 'abc';
|
||||
a | b
|
||||
|
@ -160,6 +160,13 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- case folding
|
||||
select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8);
|
||||
casefold
|
||||
-------------------------------
|
||||
abcd 123 #$% ıiiİ ß ß dždždž σσσ
|
||||
(1 row)
|
||||
|
||||
--
|
||||
-- Test PG_UNICODE_FAST
|
||||
--
|
||||
@ -320,3 +327,10 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re
|
||||
t
|
||||
(1 row)
|
||||
|
||||
-- case folding
|
||||
select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST);
|
||||
casefold
|
||||
---------------------------------
|
||||
abcd 123 #$% ıiii̇ ss ss dždždž σσσ
|
||||
(1 row)
|
||||
|
||||
|
@ -116,6 +116,11 @@ SELECT a, lower(x COLLATE "C"), lower(y COLLATE "C") FROM collate_test10;
|
||||
|
||||
SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
|
||||
|
||||
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
|
||||
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
|
||||
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
|
||||
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
|
||||
|
||||
-- LIKE/ILIKE
|
||||
|
||||
SELECT * FROM collate_test1 WHERE b LIKE 'abc';
|
||||
|
@ -81,6 +81,9 @@ SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
|
||||
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
|
||||
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
|
||||
|
||||
-- case folding
|
||||
select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8);
|
||||
|
||||
--
|
||||
-- Test PG_UNICODE_FAST
|
||||
--
|
||||
@ -140,3 +143,6 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
|
||||
SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
|
||||
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
|
||||
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed
|
||||
|
||||
-- case folding
|
||||
select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST);
|
||||
|
Loading…
x
Reference in New Issue
Block a user