1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-19 13:42:17 +03:00

Add unistr function

This allows decoding a string with Unicode escape sequences.  It is
similar to Unicode escape strings, but offers some more flexibility.

Author: Pavel Stehule <pavel.stehule@gmail.com>
Reviewed-by: Asif Rehman <asifr.rehman@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com
This commit is contained in:
Peter Eisentraut
2021-03-28 08:16:15 +02:00
parent ebedd0c78f
commit f37fec837c
6 changed files with 310 additions and 1 deletions

View File

@@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(result);
}
/*
 * isxdigits_n
 *		Report whether every one of the first n bytes at instr is a
 *		hexadecimal digit according to isxdigit().
 *
 * Scanning stops at the first byte that fails the check.  With n == 0
 * nothing is inspected and the answer is true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	while (instr < stop)
	{
		/* cast keeps high-bit bytes from reaching isxdigit() as negatives */
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
/*
 * hexval
 *		Map one hexadecimal digit character (0-9, a-f, A-F) to its numeric
 *		value, 0 through 15.
 *
 * Any other character is reported through elog(ERROR).
 */
static unsigned int
hexval(unsigned char c)
{
	unsigned int digit;

	if (c >= '0' && c <= '9')
		digit = c - '0';
	else if (c >= 'a' && c <= 'f')
		digit = 0xA + (c - 'a');
	else if (c >= 'A' && c <= 'F')
		digit = 0xA + (c - 'A');
	else
	{
		elog(ERROR, "invalid hexadecimal digit");
		digit = 0;				/* not reached */
	}

	return digit;
}
/*
 * hexval_n
 *		Convert the first n characters at instr, read as hexadecimal digits
 *		with the most significant digit first, into a number.
 *
 * Each character is converted by hexval().  The callers visible in this
 * file pass n of 4, 6 or 8, so the value fits in an unsigned int.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	size_t		shift = 4 * n;	/* bit position just above the next digit */
	size_t		pos = 0;

	while (pos < n)
	{
		shift -= 4;
		value += hexval(instr[pos]) << shift;
		pos++;
	}

	return value;
}
/*
 * unistr
 *		Replaces Unicode escape sequences by Unicode characters.
 *
 * The single text argument is scanned left to right.  Bytes other than a
 * backslash are copied to the result unchanged.  A backslash must start one
 * of the following forms (X is a hexadecimal digit):
 *
 *		\\				a literal backslash
 *		\XXXX			code point given by 4 hex digits
 *		\uXXXX			code point given by 4 hex digits
 *		\+XXXXXX		code point given by 6 hex digits
 *		\UXXXXXXXX		code point given by 8 hex digits
 *
 * Every decoded code point is passed through pg_unicode_to_server() and the
 * resulting string is appended to the output.  A UTF-16 first surrogate is
 * not emitted by itself: it is remembered in pair_first and must be
 * followed immediately by an escape yielding a second surrogate, at which
 * point the two are combined into one code point.
 *
 * Errors raised:
 *	- ERRCODE_SYNTAX_ERROR "invalid Unicode escape" when a backslash does not
 *	  start any of the forms above;
 *	- ERRCODE_INVALID_PARAMETER_VALUE when is_valid_unicode_codepoint()
 *	  rejects the decoded value;
 *	- ERRCODE_SYNTAX_ERROR "invalid Unicode surrogate pair" when a first
 *	  surrogate is not directly followed by a second one, or a second
 *	  surrogate appears without a preceding first one.
 */
Datum
unistr(PG_FUNCTION_ARGS)
{
	text	   *input_text = PG_GETARG_TEXT_PP(0);
	char	   *instr;
	int			len;
	StringInfoData str;
	text	   *result;
	pg_wchar	pair_first = 0; /* pending first surrogate, or 0 if none */
	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];

	instr = VARDATA_ANY(input_text);
	len = VARSIZE_ANY_EXHDR(input_text);

	initStringInfo(&str);

	while (len > 0)
	{
		pg_wchar	unicode;
		int			prefix_len = 0; /* bytes before the hex digits */
		int			ndigits = 0;	/* number of hex digits in the escape */

		/* Anything but a backslash is copied through as-is. */
		if (instr[0] != '\\')
		{
			if (pair_first)
				goto invalid_pair;

			appendStringInfoChar(&str, *instr++);
			len--;
			continue;
		}

		/* A doubled backslash stands for one literal backslash. */
		if (len >= 2 && instr[1] == '\\')
		{
			if (pair_first)
				goto invalid_pair;

			appendStringInfoChar(&str, '\\');
			instr += 2;
			len -= 2;
			continue;
		}

		/*
		 * Work out which escape form this is.  Each test checks the
		 * remaining length first, so no byte past the end of the input is
		 * examined.
		 */
		if (len >= 5 && isxdigits_n(instr + 1, 4))
		{
			prefix_len = 1;
			ndigits = 4;
		}
		else if (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4))
		{
			prefix_len = 2;
			ndigits = 4;
		}
		else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
		{
			prefix_len = 2;
			ndigits = 6;
		}
		else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
		{
			prefix_len = 2;
			ndigits = 8;
		}
		else
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("invalid Unicode escape"),
					 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));

		/* From here on all escape forms are handled the same way. */
		unicode = hexval_n(instr + prefix_len, ndigits);

		if (!is_valid_unicode_codepoint(unicode))
			ereport(ERROR,
					errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					errmsg("invalid Unicode code point: %04X", unicode));

		if (pair_first)
		{
			/* A pending first surrogate must be completed right now. */
			if (!is_utf16_surrogate_second(unicode))
				goto invalid_pair;

			unicode = surrogate_pair_to_codepoint(pair_first, unicode);
			pair_first = 0;
		}
		else if (is_utf16_surrogate_second(unicode))
			goto invalid_pair;	/* second surrogate without a first */

		if (is_utf16_surrogate_first(unicode))
			pair_first = unicode;	/* hold until its partner arrives */
		else
		{
			pg_unicode_to_server(unicode, (unsigned char *) cbuf);
			appendStringInfoString(&str, cbuf);
		}

		instr += prefix_len + ndigits;
		len -= prefix_len + ndigits;
	}

	/* unfinished surrogate pair? */
	if (pair_first)
		goto invalid_pair;

	result = cstring_to_text_with_len(str.data, str.len);
	pfree(str.data);

	PG_RETURN_TEXT_P(result);

invalid_pair:
	ereport(ERROR,
			(errcode(ERRCODE_SYNTAX_ERROR),
			 errmsg("invalid Unicode surrogate pair")));
}