1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-30 21:42:05 +03:00

Add unicode_strtitle() for Unicode Default Case Conversion.

This brings the titlecasing implementation for the builtin provider
out of formatting.c and into unicode_case.c, along with
unicode_strlower() and unicode_strupper(). Accepts an arbitrary word
boundary callback.

Simple for now, but can be extended to support the Unicode Default
Case Conversion algorithm with full case mapping.

Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
Jeff Davis
2024-03-29 17:35:07 -07:00
parent a96a8b15fa
commit 46e5441fa5
3 changed files with 140 additions and 48 deletions

View File

@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
return result;
}
struct WordBoundaryState
{
const char *str;
size_t len;
size_t offset;
bool init;
bool prev_alnum;
};
/*
* Simple word boundary iterator that draws boundaries each time the result of
* pg_u_isalnum() changes.
*/
static size_t
initcap_wbnext(void *state)
{
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, true);
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
{
size_t prev_offset = wbstate->offset;
wbstate->init = true;
wbstate->offset += unicode_utf8len(u);
wbstate->prev_alnum = curr_alnum;
return prev_offset;
}
wbstate->offset += unicode_utf8len(u);
}
return wbstate->len;
}
/*
* collation-aware, wide-character-aware initcap function
*
@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
#endif
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
{
const unsigned char *src = (unsigned char *) buff;
const char *src = buff;
size_t srclen = nbytes;
unsigned char *dst;
size_t dstsize;
int srcoff = 0;
int dstoff = 0;
char *dst;
size_t needed;
struct WordBoundaryState wbstate = {
.str = src,
.len = srclen,
.offset = 0,
.init = false,
.prev_alnum = false,
};
Assert(GetDatabaseEncoding() == PG_UTF8);
/* overflow paranoia */
if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
/* first try buffer of equal size plus terminating NUL */
dstsize = srclen + 1;
dst = palloc(dstsize);
/* result is at most srclen codepoints plus terminating NUL */
dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
dst = (unsigned char *) palloc(dstsize);
while (srcoff < nbytes)
needed = unicode_strtitle(dst, dstsize, src, srclen,
initcap_wbnext, &wbstate);
if (needed + 1 > dstsize)
{
pg_wchar u1 = utf8_to_unicode(src + srcoff);
pg_wchar u2;
int u1len = unicode_utf8len(u1);
int u2len;
/* reset iterator */
wbstate.offset = 0;
wbstate.init = false;
if (wasalnum)
u2 = unicode_lowercase_simple(u1);
else
u2 = unicode_uppercase_simple(u1);
u2len = unicode_utf8len(u2);
Assert(dstoff + u2len + 1 <= dstsize);
wasalnum = pg_u_isalnum(u2, true);
unicode_to_utf8(u2, dst + dstoff);
srcoff += u1len;
dstoff += u2len;
/* grow buffer if needed and retry */
dstsize = needed + 1;
dst = repalloc(dst, dstsize);
needed = unicode_strtitle(dst, dstsize, src, srclen,
initcap_wbnext, &wbstate);
Assert(needed + 1 == dstsize);
}
Assert(dstoff + 1 <= dstsize);
*(dst + dstoff) = '\0';
dstoff++;
/* allocate result buffer of the right size and free workspace */
result = palloc(dstoff);
memcpy(result, dst, dstoff);
pfree(dst);
result = dst;
}
else
{