mirror of
https://github.com/postgres/postgres.git
synced 2025-10-25 13:17:41 +03:00
Support Unicode full case mapping and conversion.
Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

* support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
* support conditional case mappings, such as the "final sigma"
* support titlecase variants, such as "dž" uppercasing to "DŽ" but titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
This commit is contained in:
@@ -17,12 +17,15 @@
|
||||
|
||||
#include "common/unicode_case.h"
|
||||
#include "common/unicode_case_table.h"
|
||||
#include "common/unicode_category.h"
|
||||
#include "mb/pg_wchar.h"
|
||||
|
||||
static const pg_case_map *find_case_map(pg_wchar ucs);
|
||||
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
CaseKind str_casekind, WordBoundaryNext wbnext,
|
||||
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
|
||||
void *wbstate);
|
||||
static bool check_special_conditions(int conditions, const char *str,
|
||||
size_t len, size_t offset);
|
||||
|
||||
pg_wchar
|
||||
unicode_lowercase_simple(pg_wchar code)
|
||||
@@ -63,11 +66,16 @@ unicode_uppercase_simple(pg_wchar code)
|
||||
*
|
||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||
* required buffer size before allocating.
|
||||
*
|
||||
* If full is true, use special case mappings if available and if the
|
||||
* conditions are satisfied.
|
||||
*/
|
||||
size_t
|
||||
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
bool full)
|
||||
{
|
||||
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
|
||||
return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
|
||||
NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -86,6 +94,10 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||
* required buffer size before allocating.
|
||||
*
|
||||
* If full is true, use special case mappings if available and if the
|
||||
* conditions are satisfied. Otherwise, use only simple mappings and use
|
||||
* uppercase instead of titlecase.
|
||||
*
|
||||
* Titlecasing requires knowledge about word boundaries, which is provided by
|
||||
* the callback wbnext. A word boundary is the offset of the start of a word
|
||||
* or the offset of the character immediately following a word.
|
||||
@@ -97,9 +109,9 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||
*/
|
||||
size_t
|
||||
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
WordBoundaryNext wbnext, void *wbstate)
|
||||
bool full, WordBoundaryNext wbnext, void *wbstate)
|
||||
{
|
||||
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
|
||||
return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
|
||||
wbstate);
|
||||
}
|
||||
|
||||
@@ -118,23 +130,38 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
*
|
||||
* If dstsize is zero, dst may be NULL. This is useful for calculating the
|
||||
* required buffer size before allocating.
|
||||
*
|
||||
* If full is true, use special case mappings if available and if the
|
||||
* conditions are satisfied.
|
||||
*/
|
||||
size_t
|
||||
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
|
||||
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
bool full)
|
||||
{
|
||||
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
|
||||
return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
|
||||
NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Implement Unicode Default Case Conversion algorithm.
|
||||
*
|
||||
* If str_casekind is CaseLower or CaseUpper, map each character in the string
|
||||
* for which a mapping is available.
|
||||
*
|
||||
* If str_casekind is CaseTitle, maps characters found on a word boundary to
|
||||
* uppercase and other characters to lowercase.
|
||||
* titlecase (or uppercase if full is false) and other characters to
|
||||
* lowercase. NB: does not currently implement the Unicode behavior in which
|
||||
* the word boundary is adjusted to the next Cased character. That behavior
|
||||
* could be implemented as an option, but it doesn't match the default
|
||||
* behavior of ICU, nor does it match the documented behavior of INITCAP().
|
||||
*
|
||||
* If full is true, use special mappings for relevant characters, which can
|
||||
* map a single codepoint to multiple codepoints, or depend on conditions.
|
||||
*/
|
||||
static size_t
|
||||
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
|
||||
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
|
||||
void *wbstate)
|
||||
{
|
||||
/* character CaseKind varies while titlecasing */
|
||||
CaseKind chr_casekind = str_casekind;
|
||||
@@ -156,20 +183,53 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
|
||||
int u1len = unicode_utf8len(u1);
|
||||
const pg_case_map *casemap = find_case_map(u1);
|
||||
const pg_special_case *special = NULL;
|
||||
|
||||
if (str_casekind == CaseTitle)
|
||||
{
|
||||
if (srcoff == boundary)
|
||||
{
|
||||
chr_casekind = CaseUpper;
|
||||
chr_casekind = full ? CaseTitle : CaseUpper;
|
||||
boundary = wbnext(wbstate);
|
||||
}
|
||||
else
|
||||
chr_casekind = CaseLower;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find special case that matches the conditions, if any.
|
||||
*
|
||||
* Note: only a single special mapping per codepoint is currently
|
||||
* supported, though Unicode allows for multiple special mappings for
|
||||
* a single codepoint.
|
||||
*/
|
||||
if (full && casemap && casemap->special_case)
|
||||
{
|
||||
int16 conditions = casemap->special_case->conditions;
|
||||
|
||||
Assert(casemap->special_case->codepoint == u1);
|
||||
if (check_special_conditions(conditions, src, srclen, srcoff))
|
||||
special = casemap->special_case;
|
||||
}
|
||||
|
||||
/* perform mapping, update result_len, and write to dst */
|
||||
if (casemap)
|
||||
if (special)
|
||||
{
|
||||
for (int i = 0; i < MAX_CASE_EXPANSION; i++)
|
||||
{
|
||||
pg_wchar u2 = special->map[chr_casekind][i];
|
||||
size_t u2len = unicode_utf8len(u2);
|
||||
|
||||
if (u2 == '\0')
|
||||
break;
|
||||
|
||||
if (result_len + u2len <= dstsize)
|
||||
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
|
||||
|
||||
result_len += u2len;
|
||||
}
|
||||
}
|
||||
else if (casemap)
|
||||
{
|
||||
pg_wchar u2 = casemap->simplemap[chr_casekind];
|
||||
pg_wchar u2len = unicode_utf8len(u2);
|
||||
@@ -197,6 +257,82 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
|
||||
return result_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that the condition matches Final_Sigma, described in Unicode Table
|
||||
* 3-17. The character at the given offset must be directly preceded by a
|
||||
* Cased character, and must not be directly followed by a Cased character.
|
||||
*
|
||||
* Case_Ignorable characters are ignored. NB: some characters may be both
|
||||
* Cased and Case_Ignorable, in which case they are ignored.
|
||||
*/
|
||||
static bool
|
||||
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
|
||||
{
|
||||
/* the start of the string is not preceded by a Cased character */
|
||||
if (offset == 0)
|
||||
return false;
|
||||
|
||||
/* iterate backwards, looking for Cased character */
|
||||
for (int i = offset - 1; i >= 0; i--)
|
||||
{
|
||||
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
|
||||
{
|
||||
pg_wchar curr = utf8_to_unicode(str + i);
|
||||
|
||||
if (pg_u_prop_case_ignorable(curr))
|
||||
continue;
|
||||
else if (pg_u_prop_cased(curr))
|
||||
break;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
else if ((str[i] & 0xC0) == 0x80)
|
||||
continue;
|
||||
|
||||
Assert(false); /* invalid UTF-8 */
|
||||
}
|
||||
|
||||
/* end of string is not followed by a Cased character */
|
||||
if (offset == len)
|
||||
return true;
|
||||
|
||||
/* iterate forwards, looking for Cased character */
|
||||
for (int i = offset + 1; i < len && str[i] != '\0'; i++)
|
||||
{
|
||||
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
|
||||
{
|
||||
pg_wchar curr = utf8_to_unicode(str + i);
|
||||
|
||||
if (pg_u_prop_case_ignorable(curr))
|
||||
continue;
|
||||
else if (pg_u_prop_cased(curr))
|
||||
return false;
|
||||
else
|
||||
break;
|
||||
}
|
||||
else if ((str[i] & 0xC0) == 0x80)
|
||||
continue;
|
||||
|
||||
Assert(false); /* invalid UTF-8 */
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
check_special_conditions(int conditions, const char *str, size_t len,
|
||||
size_t offset)
|
||||
{
|
||||
if (conditions == 0)
|
||||
return true;
|
||||
else if (conditions == PG_U_FINAL_SIGMA)
|
||||
return check_final_sigma((unsigned char *) str, len, offset);
|
||||
|
||||
/* no other conditions supported */
|
||||
Assert(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* find entry in simple case map, if any */
|
||||
static const pg_case_map *
|
||||
find_case_map(pg_wchar ucs)
|
||||
|
||||
Reference in New Issue
Block a user