mirror of
https://github.com/postgres/postgres.git
synced 2025-11-19 13:42:17 +03:00
Additional unicode primitive functions.
Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The latter requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
This commit is contained in:
@@ -23,7 +23,9 @@
|
||||
#include "catalog/pg_type.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "common/int.h"
|
||||
#include "common/unicode_category.h"
|
||||
#include "common/unicode_norm.h"
|
||||
#include "common/unicode_version.h"
|
||||
#include "funcapi.h"
|
||||
#include "lib/hyperloglog.h"
|
||||
#include "libpq/pqformat.h"
|
||||
@@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
|
||||
return form;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns version of Unicode used by Postgres in "major.minor" format (the
|
||||
* same format as the Unicode version reported by ICU). The third component
|
||||
* ("update version") never involves additions to the character repertoire and
|
||||
* is unimportant for most purposes.
|
||||
*
|
||||
* See: https://unicode.org/versions/
|
||||
*/
|
||||
Datum
|
||||
unicode_version(PG_FUNCTION_ARGS)
|
||||
{
|
||||
PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns version of Unicode used by ICU, if enabled; otherwise NULL.
|
||||
*/
|
||||
Datum
|
||||
icu_unicode_version(PG_FUNCTION_ARGS)
|
||||
{
|
||||
#ifdef USE_ICU
|
||||
PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
|
||||
#else
|
||||
PG_RETURN_NULL();
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Check whether the string contains only assigned Unicode code
|
||||
* points. Requires that the database encoding is UTF-8.
|
||||
*/
|
||||
Datum
|
||||
unicode_assigned(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *input = PG_GETARG_TEXT_PP(0);
|
||||
unsigned char *p;
|
||||
int size;
|
||||
|
||||
if (GetDatabaseEncoding() != PG_UTF8)
|
||||
ereport(ERROR,
|
||||
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
|
||||
|
||||
/* convert to pg_wchar */
|
||||
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
|
||||
p = (unsigned char *) VARDATA_ANY(input);
|
||||
for (int i = 0; i < size; i++)
|
||||
{
|
||||
pg_wchar uchar = utf8_to_unicode(p);
|
||||
int category = unicode_category(uchar);
|
||||
|
||||
if (category == PG_U_UNASSIGNED)
|
||||
PG_RETURN_BOOL(false);
|
||||
|
||||
p += pg_utf_mblen(p);
|
||||
}
|
||||
|
||||
PG_RETURN_BOOL(true);
|
||||
}
|
||||
|
||||
Datum
|
||||
unicode_normalize_func(PG_FUNCTION_ARGS)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user