mirror of
https://github.com/postgres/postgres.git
synced 2025-11-06 07:49:08 +03:00
Use C11 char16_t and char32_t for Unicode code points.
Reviewed-by: Tatsuo Ishii <ishii@postgresql.org>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/bedcc93d06203dfd89815b10f815ca2de8626e85.camel%40j-davis.com
This commit is contained in:
@@ -574,7 +574,7 @@ hexval(char c, int *result, struct Node *escontext, yyscan_t yyscanner)
|
||||
|
||||
/* Add given unicode character to scanstring */
|
||||
static bool
|
||||
addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
|
||||
addUnicodeChar(char32_t ch, struct Node *escontext, yyscan_t yyscanner)
|
||||
{
|
||||
if (ch == 0)
|
||||
{
|
||||
@@ -607,7 +607,7 @@ addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
|
||||
|
||||
/* Add unicode character, processing any surrogate pairs */
|
||||
static bool
|
||||
addUnicode(int ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
|
||||
addUnicode(char32_t ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
|
||||
{
|
||||
if (is_utf16_surrogate_first(ch))
|
||||
{
|
||||
@@ -655,7 +655,7 @@ parseUnicode(char *s, int l, struct Node *escontext, yyscan_t yyscanner)
|
||||
|
||||
for (i = 2; i < l; i += 2) /* skip '\u' */
|
||||
{
|
||||
int ch = 0;
|
||||
char32_t ch = 0;
|
||||
int j,
|
||||
si;
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
#include "catalog/pg_collation.h"
|
||||
#include "common/unicode_case.h"
|
||||
#include "common/unicode_category.h"
|
||||
#include "mb/pg_wchar.h"
|
||||
#include "miscadmin.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/pg_locale.h"
|
||||
@@ -35,6 +34,23 @@ struct WordBoundaryState
|
||||
bool prev_alnum;
|
||||
};
|
||||
|
||||
/*
|
||||
* In UTF-8, pg_wchar is guaranteed to be the code point value.
|
||||
*/
|
||||
/*
 * to_char32: reinterpret a pg_wchar as a char32_t by a plain cast.
 * Asserts that the database encoding is PG_UTF8 before converting; no other
 * validation of the value is performed here.
 */
static inline char32_t
|
||||
to_char32(pg_wchar wc)
|
||||
{
|
||||
Assert(GetDatabaseEncoding() == PG_UTF8);
|
||||
return (char32_t) wc;
|
||||
}
|
||||
|
||||
/*
 * to_pg_wchar: reinterpret a char32_t as a pg_wchar by a plain cast.
 * Asserts that the database encoding is PG_UTF8 before converting; no other
 * validation of the value is performed here.
 */
static inline pg_wchar
|
||||
to_pg_wchar(char32_t c32)
|
||||
{
|
||||
Assert(GetDatabaseEncoding() == PG_UTF8);
|
||||
return (pg_wchar) c32;
|
||||
}
|
||||
|
||||
/*
|
||||
* Simple word boundary iterator that draws boundaries each time the result of
|
||||
* pg_u_isalnum() changes.
|
||||
@@ -47,7 +63,7 @@ initcap_wbnext(void *state)
|
||||
while (wbstate->offset < wbstate->len &&
|
||||
wbstate->str[wbstate->offset] != '\0')
|
||||
{
|
||||
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
|
||||
char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
|
||||
wbstate->offset);
|
||||
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
|
||||
|
||||
@@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
||||
/*
 * wc_isdigit_builtin: forwards wc to pg_u_isdigit() and returns its result.
 * The second argument is the negation of locale->builtin.casemap_full.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_isdigit(wc, !locale->builtin.casemap_full);
|
||||
return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
|
||||
}
|
||||
|
||||
/*
 * wc_isalpha_builtin: forwards wc to pg_u_isalpha() and returns its result.
 * The locale argument is not used in the body.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_isalpha(wc);
|
||||
return pg_u_isalpha(to_char32(wc));
|
||||
}
|
||||
|
||||
/*
 * wc_isalnum_builtin: forwards wc to pg_u_isalnum() and returns its result.
 * The second argument is the negation of locale->builtin.casemap_full.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_isalnum(wc, !locale->builtin.casemap_full);
|
||||
return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
|
||||
}
|
||||
|
||||
/*
 * wc_isupper_builtin: forwards wc to pg_u_isupper() and returns its result.
 * The locale argument is not used in the body.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_isupper(wc);
|
||||
return pg_u_isupper(to_char32(wc));
|
||||
}
|
||||
|
||||
/*
 * wc_islower_builtin: forwards wc to pg_u_islower() and returns its result.
 * The locale argument is not used in the body.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_islower(wc);
|
||||
return pg_u_islower(to_char32(wc));
|
||||
}
|
||||
|
||||
/*
 * wc_isgraph_builtin: forwards wc to pg_u_isgraph() and returns its result.
 * The locale argument is not used in the body.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_isgraph(wc);
|
||||
return pg_u_isgraph(to_char32(wc));
|
||||
}
|
||||
|
||||
/*
 * wc_isprint_builtin: forwards wc to pg_u_isprint() and returns its result.
 * The locale argument is not used in the body.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_isprint(wc);
|
||||
return pg_u_isprint(to_char32(wc));
|
||||
}
|
||||
|
||||
/*
 * wc_ispunct_builtin: forwards wc to pg_u_ispunct() and returns its result.
 * The second argument is the negation of locale->builtin.casemap_full.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_ispunct(wc, !locale->builtin.casemap_full);
|
||||
return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
|
||||
}
|
||||
|
||||
/*
 * wc_isspace_builtin: forwards wc to pg_u_isspace() and returns its result.
 * The locale argument is not used in the body.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_isspace(wc);
|
||||
return pg_u_isspace(to_char32(wc));
|
||||
}
|
||||
|
||||
/*
 * wc_isxdigit_builtin: forwards wc to pg_u_isxdigit() and returns its result.
 * The second argument is the negation of locale->builtin.casemap_full.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement (which adds to_char32()).
 */
static bool
|
||||
wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
|
||||
return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale)
|
||||
/*
 * wc_toupper_builtin: returns the result of unicode_uppercase_simple() for wc.
 * The locale argument is not used in the body.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement, which converts the argument with
 * to_char32() and the result back with to_pg_wchar().
 */
static pg_wchar
|
||||
wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return unicode_uppercase_simple(wc);
|
||||
return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
|
||||
}
|
||||
|
||||
/*
 * wc_tolower_builtin: returns the result of unicode_lowercase_simple() for wc.
 * The locale argument is not used in the body.
 * NOTE(review): this is a diff view; the two return lines appear to be the
 * removed line followed by its replacement, which converts the argument with
 * to_char32() and the result back with to_pg_wchar().
 */
static pg_wchar
|
||||
wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
|
||||
{
|
||||
return unicode_lowercase_simple(wc);
|
||||
return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
|
||||
}
|
||||
|
||||
static const struct ctype_methods ctype_methods_builtin = {
|
||||
|
||||
@@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS)
|
||||
ereport(ERROR,
|
||||
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
|
||||
|
||||
/* convert to pg_wchar */
|
||||
/* convert to char32_t */
|
||||
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
|
||||
p = (unsigned char *) VARDATA_ANY(input);
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
pg_wchar uchar = utf8_to_unicode(p);
|
||||
char32_t uchar = utf8_to_unicode(p);
|
||||
int category = unicode_category(uchar);
|
||||
|
||||
if (category == PG_U_UNASSIGNED)
|
||||
@@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
|
||||
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
||||
UnicodeNormalizationForm form;
|
||||
int size;
|
||||
pg_wchar *input_chars;
|
||||
pg_wchar *output_chars;
|
||||
char32_t *input_chars;
|
||||
char32_t *output_chars;
|
||||
unsigned char *p;
|
||||
text *result;
|
||||
int i;
|
||||
|
||||
form = unicode_norm_form_from_string(formstr);
|
||||
|
||||
/* convert to pg_wchar */
|
||||
/* convert to char32_t */
|
||||
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
|
||||
input_chars = palloc((size + 1) * sizeof(pg_wchar));
|
||||
input_chars = palloc((size + 1) * sizeof(char32_t));
|
||||
p = (unsigned char *) VARDATA_ANY(input);
|
||||
for (i = 0; i < size; i++)
|
||||
{
|
||||
input_chars[i] = utf8_to_unicode(p);
|
||||
p += pg_utf_mblen(p);
|
||||
}
|
||||
input_chars[i] = (pg_wchar) '\0';
|
||||
input_chars[i] = (char32_t) '\0';
|
||||
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
|
||||
|
||||
/* action */
|
||||
@@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
|
||||
|
||||
/* convert back to UTF-8 string */
|
||||
size = 0;
|
||||
for (pg_wchar *wp = output_chars; *wp; wp++)
|
||||
for (char32_t *wp = output_chars; *wp; wp++)
|
||||
{
|
||||
unsigned char buf[4];
|
||||
|
||||
@@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
|
||||
SET_VARSIZE(result, size + VARHDRSZ);
|
||||
|
||||
p = (unsigned char *) VARDATA_ANY(result);
|
||||
for (pg_wchar *wp = output_chars; *wp; wp++)
|
||||
for (char32_t *wp = output_chars; *wp; wp++)
|
||||
{
|
||||
unicode_to_utf8(*wp, p);
|
||||
p += pg_utf_mblen(p);
|
||||
@@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
|
||||
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
||||
UnicodeNormalizationForm form;
|
||||
int size;
|
||||
pg_wchar *input_chars;
|
||||
pg_wchar *output_chars;
|
||||
char32_t *input_chars;
|
||||
char32_t *output_chars;
|
||||
unsigned char *p;
|
||||
int i;
|
||||
UnicodeNormalizationQC quickcheck;
|
||||
@@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
|
||||
|
||||
form = unicode_norm_form_from_string(formstr);
|
||||
|
||||
/* convert to pg_wchar */
|
||||
/* convert to char32_t */
|
||||
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
|
||||
input_chars = palloc((size + 1) * sizeof(pg_wchar));
|
||||
input_chars = palloc((size + 1) * sizeof(char32_t));
|
||||
p = (unsigned char *) VARDATA_ANY(input);
|
||||
for (i = 0; i < size; i++)
|
||||
{
|
||||
input_chars[i] = utf8_to_unicode(p);
|
||||
p += pg_utf_mblen(p);
|
||||
}
|
||||
input_chars[i] = (pg_wchar) '\0';
|
||||
input_chars[i] = (char32_t) '\0';
|
||||
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
|
||||
|
||||
/* quick check (see UAX #15) */
|
||||
@@ -5542,11 +5542,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
|
||||
output_chars = unicode_normalize(form, input_chars);
|
||||
|
||||
output_size = 0;
|
||||
for (pg_wchar *wp = output_chars; *wp; wp++)
|
||||
for (char32_t *wp = output_chars; *wp; wp++)
|
||||
output_size++;
|
||||
|
||||
result = (size == output_size) &&
|
||||
(memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
|
||||
(memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
|
||||
|
||||
PG_RETURN_BOOL(result);
|
||||
}
|
||||
@@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS)
|
||||
int len;
|
||||
StringInfoData str;
|
||||
text *result;
|
||||
pg_wchar pair_first = 0;
|
||||
char16_t pair_first = 0;
|
||||
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
|
||||
|
||||
instr = VARDATA_ANY(input_text);
|
||||
@@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS)
|
||||
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
|
||||
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
|
||||
{
|
||||
pg_wchar unicode;
|
||||
char32_t unicode;
|
||||
int offset = instr[1] == 'u' ? 2 : 1;
|
||||
|
||||
unicode = hexval_n(instr + offset, 4);
|
||||
@@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS)
|
||||
}
|
||||
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
|
||||
{
|
||||
pg_wchar unicode;
|
||||
char32_t unicode;
|
||||
|
||||
unicode = hexval_n(instr + 2, 6);
|
||||
|
||||
@@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS)
|
||||
}
|
||||
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
|
||||
{
|
||||
pg_wchar unicode;
|
||||
char32_t unicode;
|
||||
|
||||
unicode = hexval_n(instr + 2, 8);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user