1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-19 13:42:17 +03:00

Additional unicode primitive functions.

Introduce unicode_version(), icu_unicode_version(), and
unicode_assigned().

The latter requires introducing a new lookup table for the Unicode
General Category, which is generated along with the other Unicode
lookup tables.

Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
Jeff Davis
2023-11-01 22:47:06 -07:00
parent 7021d3b176
commit a02b37fc08
18 changed files with 4924 additions and 22 deletions

View File

@@ -23,7 +23,9 @@
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "common/int.h"
#include "common/unicode_category.h"
#include "common/unicode_norm.h"
#include "common/unicode_version.h"
#include "funcapi.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
@@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
return form;
}
/*
 * unicode_version
 *
 * SQL-callable function that returns PG_UNICODE_VERSION, the version of
 * Unicode used by Postgres, as a text value.  The string is in "major.minor"
 * format, which is the same format ICU uses to report its Unicode version.
 * The third component ("update version") is left out: it never involves
 * additions to the character repertoire and is unimportant for most
 * purposes.
 *
 * See: https://unicode.org/versions/
 */
Datum
unicode_version(PG_FUNCTION_ARGS)
{
	const char *version = PG_UNICODE_VERSION;

	PG_RETURN_TEXT_P(cstring_to_text(version));
}
/*
 * icu_unicode_version
 *
 * SQL-callable function reporting the Unicode version used by ICU.  When the
 * server is built without ICU support (USE_ICU undefined) there is nothing
 * to report, so the result is SQL NULL; otherwise the result is
 * U_UNICODE_VERSION converted to text.
 */
Datum
icu_unicode_version(PG_FUNCTION_ARGS)
{
#ifndef USE_ICU
	PG_RETURN_NULL();
#else
	const char *icu_version = U_UNICODE_VERSION;

	PG_RETURN_TEXT_P(cstring_to_text(icu_version));
#endif
}
/*
 * unicode_assigned
 *
 * SQL-callable function that tests whether a text value consists solely of
 * assigned Unicode code points.  Returns false as soon as a code point whose
 * category is PG_U_UNASSIGNED is seen, and true if the scan finishes without
 * finding one.
 *
 * Raises an error unless the database encoding is UTF-8.
 */
Datum
unicode_assigned(PG_FUNCTION_ARGS)
{
	text	   *input = PG_GETARG_TEXT_PP(0);
	unsigned char *cursor;
	int			remaining;

	if (GetDatabaseEncoding() != PG_UTF8)
		ereport(ERROR,
				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));

	/* count the characters up front; the loop below consumes one per pass */
	remaining = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
	cursor = (unsigned char *) VARDATA_ANY(input);

	while (remaining-- > 0)
	{
		/* decode the character at the cursor and look up its category */
		if (unicode_category(utf8_to_unicode(cursor)) == PG_U_UNASSIGNED)
			PG_RETURN_BOOL(false);

		/* step over this character's bytes to reach the next one */
		cursor += pg_utf_mblen(cursor);
	}

	PG_RETURN_BOOL(true);
}
Datum
unicode_normalize_func(PG_FUNCTION_ARGS)
{