1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-25 13:17:41 +03:00

Add unicode_strtitle() for Unicode Default Case Conversion.

This brings the titlecasing implementation for the builtin provider
out of formatting.c and into unicode_case.c, along with
unicode_strlower() and unicode_strupper(). Accepts an arbitrary word
boundary callback.

Simple for now, but can be extended to support the Unicode Default
Case Conversion algorithm with full case mapping.

Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
Jeff Davis
2024-03-29 17:35:07 -07:00
parent a96a8b15fa
commit 46e5441fa5
3 changed files with 140 additions and 48 deletions

View File

@@ -21,8 +21,9 @@
#include "mb/pg_wchar.h"
static const pg_case_map *find_case_map(pg_wchar ucs);
static size_t convert_case(char *dst, size_t dstsize, const char *src,
ssize_t srclen, CaseKind casekind);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, WordBoundaryNext wbnext,
void *wbstate);
pg_wchar
unicode_lowercase_simple(pg_wchar code)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{
/*
 * Thin wrapper: hands the caller's buffers to convert_case() with
 * CaseLower and returns whatever convert_case() returns.
 *
 * NOTE(review): the two return statements below look like the removed and
 * the added line of this diff hunk shown back to back. Only the
 * seven-argument form matches the updated convert_case() prototype; it
 * passes NULL for both wbnext and wbstate because no word boundary callback
 * is used when the string casekind is not CaseTitle.
 */
return convert_case(dst, dstsize, src, srclen, CaseLower);
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
}
/*
* unicode_strtitle()
*
* Convert src to titlecase, and return the result length (not including
* terminating NUL).
*
* String src must be encoded in UTF-8. If srclen < 0, src must be
* NUL-terminated.
*
* Result string is stored in dst, truncating if larger than dstsize. If
* dstsize is greater than the result length, dst will be NUL-terminated;
* otherwise not.
*
* If dstsize is zero, dst may be NULL. This is useful for calculating the
* required buffer size before allocating.
*
* Titlecasing requires knowledge about word boundaries, which is provided by
* the callback wbnext. A word boundary is the offset of the start of a word
* or the offset of the character immediately following a word.
*
* The caller is expected to initialize and free the callback state
* wbstate. The callback should first return offset 0 for the first boundary;
* then the offset of each subsequent word boundary; then the total length of
* the string to indicate the final boundary.
*/
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				 WordBoundaryNext wbnext, void *wbstate)
{
	size_t		title_len;

	/*
	 * All of the work happens in the shared conversion routine. CaseTitle is
	 * the one string casekind for which it consults the word boundary
	 * callback, so the caller's wbnext and wbstate are forwarded untouched.
	 */
	title_len = convert_case(dst, dstsize, src, srclen,
							 CaseTitle, wbnext, wbstate);

	return title_len;
}
/*
@@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{
/*
 * Thin wrapper: hands the caller's buffers to convert_case() with
 * CaseUpper and returns whatever convert_case() returns.
 *
 * NOTE(review): the two return statements below look like the removed and
 * the added line of this diff hunk shown back to back. Only the
 * seven-argument form matches the updated convert_case() prototype; it
 * passes NULL for both wbnext and wbstate because no word boundary callback
 * is used when the string casekind is not CaseTitle.
 */
return convert_case(dst, dstsize, src, srclen, CaseUpper);
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
}
/*
* Implement Unicode Default Case Conversion algorithm.
* If str_casekind is CaseLower or CaseUpper, map each character in the string
* for which a mapping is available.
*
* Map each character in the string for which a mapping is available.
* If str_casekind is CaseTitle, map characters found on a word boundary to
* uppercase and all other characters to lowercase.
*/
static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind casekind)
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
{
/* character CaseKind varies while titlecasing */
CaseKind chr_casekind = str_casekind;
size_t srcoff = 0;
size_t result_len = 0;
size_t boundary = 0;
Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
(str_casekind != CaseTitle && !wbnext && !wbstate));
if (str_casekind == CaseTitle)
{
boundary = wbnext(wbstate);
Assert(boundary == 0); /* start of text is always a boundary */
}
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
{
@@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
int u1len = unicode_utf8len(u1);
const pg_case_map *casemap = find_case_map(u1);
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
{
chr_casekind = CaseUpper;
boundary = wbnext(wbstate);
}
else
chr_casekind = CaseLower;
}
/* perform mapping, update result_len, and write to dst */
if (casemap)
{
pg_wchar u2 = casemap->simplemap[casekind];
pg_wchar u2 = casemap->simplemap[chr_casekind];
pg_wchar u2len = unicode_utf8len(u2);
if (result_len + u2len <= dstsize)