1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-25 13:17:41 +03:00

Support Unicode full case mapping and conversion.

Generate tables from Unicode SpecialCasing.txt to support more
sophisticated case mapping behavior:

 * support case mappings to multiple codepoints, such as "ß"
   uppercasing to "SS"
 * support conditional case mappings, such as the "final sigma"
 * support titlecase variants, such as "dž" uppercasing to "DŽ" but
   titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
This commit is contained in:
Jeff Davis
2025-01-17 15:56:20 -08:00
parent 6a9b2a631a
commit 286a365b9c
9 changed files with 3645 additions and 2993 deletions

View File

@@ -17,12 +17,15 @@
#include "common/unicode_case.h"
#include "common/unicode_case_table.h"
#include "common/unicode_category.h"
#include "mb/pg_wchar.h"
static const pg_case_map *find_case_map(pg_wchar ucs);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, WordBoundaryNext wbnext,
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate);
static bool check_special_conditions(int conditions, const char *str,
size_t len, size_t offset);
pg_wchar
unicode_lowercase_simple(pg_wchar code)
@@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
*
* If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating.
*
* If full is true, use special case mappings if available and if the
* conditions are satisfied.
*/
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
bool full)
{
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
NULL);
}
/*
@@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
* If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating.
*
* If full is true, use special case mappings if available and if the
* conditions are satisfied. Otherwise, use only simple mappings and use
* uppercase instead of titlecase.
*
* Titlecasing requires knowledge about word boundaries, which is provided by
* the callback wbnext. A word boundary is the offset of the start of a word
* or the offset of the character immediately following a word.
@@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
*/
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
WordBoundaryNext wbnext, void *wbstate)
bool full, WordBoundaryNext wbnext, void *wbstate)
{
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
wbstate);
}
@@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
*
* If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating.
*
* If full is true, use special case mappings if available and if the
* conditions are satisfied.
*/
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
bool full)
{
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
NULL);
}
/*
* Implement Unicode Default Case Conversion algorithm.
*
* If str_casekind is CaseLower or CaseUpper, map each character in the string
* for which a mapping is available.
*
* If str_casekind is CaseTitle, maps characters found on a word boundary to
* uppercase and other characters to lowercase.
* titlecase (or uppercase if full is false) and other characters to
* lowercase. NB: does not currently implement the Unicode behavior in which
* the word boundary is adjusted to the next Cased character. That behavior
* could be implemented as an option, but it doesn't match the default
* behavior of ICU, nor does it match the documented behavior of INITCAP().
*
* If full is true, use special mappings for relevant characters, which can
* map a single codepoint to multiple codepoints, or depend on conditions.
*/
static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate)
{
/* character CaseKind varies while titlecasing */
CaseKind chr_casekind = str_casekind;
@@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
int u1len = unicode_utf8len(u1);
const pg_case_map *casemap = find_case_map(u1);
const pg_special_case *special = NULL;
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
{
chr_casekind = CaseUpper;
chr_casekind = full ? CaseTitle : CaseUpper;
boundary = wbnext(wbstate);
}
else
chr_casekind = CaseLower;
}
/*
* Find special case that matches the conditions, if any.
*
* Note: only a single special mapping per codepoint is currently
* supported, though Unicode allows for multiple special mappings for
* a single codepoint.
*/
if (full && casemap && casemap->special_case)
{
int16 conditions = casemap->special_case->conditions;
Assert(casemap->special_case->codepoint == u1);
if (check_special_conditions(conditions, src, srclen, srcoff))
special = casemap->special_case;
}
/* perform mapping, update result_len, and write to dst */
if (casemap)
if (special)
{
for (int i = 0; i < MAX_CASE_EXPANSION; i++)
{
pg_wchar u2 = special->map[chr_casekind][i];
size_t u2len = unicode_utf8len(u2);
if (u2 == '\0')
break;
if (result_len + u2len <= dstsize)
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
result_len += u2len;
}
}
else if (casemap)
{
pg_wchar u2 = casemap->simplemap[chr_casekind];
pg_wchar u2len = unicode_utf8len(u2);
@@ -197,6 +257,82 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
return result_len;
}
/*
 * Check that the condition matches Final_Sigma, described in Unicode Table
 * 3-17. The character at the given offset must be preceded by a Cased
 * character, and must not be followed by a Cased character, where any number
 * of Case_Ignorable characters may intervene on either side.
 *
 * Case_Ignorable characters are skipped. NB: some characters may be both
 * Cased and Case_Ignorable, in which case they are skipped.
 *
 * str is UTF-8 text, offset is the byte offset of the character being
 * tested, and len bounds the forward scan. The forward scan also stops at a
 * NUL byte, so it terminates on NUL-terminated input even when len is larger
 * than the string.
 *
 * Returns true if the Final_Sigma condition holds at offset.
 */
static bool
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
	bool		preceded_by_cased = false;

	/*
	 * Iterate backwards, looking for a Cased character. The loop body looks
	 * at byte i - 1 so that the index can stay unsigned; at offset 0 the
	 * loop does not run at all.
	 */
	for (size_t i = offset; i > 0; i--)
	{
		const unsigned char *p = str + i - 1;
		pg_wchar	curr;

		/* continuation byte: keep going until the lead byte is reached */
		if ((*p & 0xC0) == 0x80)
			continue;

		/* ASCII or multibyte lead byte: decode the whole character */
		curr = utf8_to_unicode(p);

		if (pg_u_prop_case_ignorable(curr))
			continue;

		/* first non-ignorable character decides the backward condition */
		if (!pg_u_prop_cased(curr))
			return false;

		preceded_by_cased = true;
		break;
	}

	/*
	 * Reaching the start of the string without finding a Cased character
	 * (either offset is 0, or only Case_Ignorable characters precede it)
	 * means the sigma is not preceded by a Cased character.
	 */
	if (!preceded_by_cased)
		return false;

	/* end of string is not followed by a Cased character */
	if (offset == len)
		return true;

	/*
	 * Iterate forwards, looking for a Cased character. Starting at
	 * offset + 1 lands on the trailing bytes of the character at offset (if
	 * it is multibyte), which are skipped as continuation bytes.
	 */
	for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
	{
		pg_wchar	curr;

		if ((str[i] & 0xC0) == 0x80)
			continue;

		curr = utf8_to_unicode(str + i);

		if (pg_u_prop_case_ignorable(curr))
			continue;

		/* first non-ignorable character decides the forward condition */
		return !pg_u_prop_cased(curr);
	}

	/* only Case_Ignorable characters (or nothing) follow */
	return true;
}
/*
 * Decide whether the special-case mapping guarded by "conditions" applies to
 * the character at byte position "offset" in "str".
 *
 * A value of zero means the mapping is unconditional. PG_U_FINAL_SIGMA is
 * delegated to check_final_sigma(). Any other value is unexpected: it trips
 * an assertion and is reported as "does not apply".
 */
static bool
check_special_conditions(int conditions, const char *str, size_t len,
						 size_t offset)
{
	/* unconditional mapping */
	if (conditions == 0)
		return true;

	if (conditions != PG_U_FINAL_SIGMA)
	{
		/* no other conditions supported */
		Assert(false);
		return false;
	}

	return check_final_sigma((const unsigned char *) str, len, offset);
}
/* find entry in simple case map, if any */
static const pg_case_map *
find_case_map(pg_wchar ucs)