1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-03 20:02:46 +03:00

Add SQL functions for Unicode normalization

This adds SQL expressions NORMALIZE() and IS NORMALIZED to convert and
check Unicode normal forms, per SQL standard.

To support fast IS NORMALIZED tests, we pull in a new data file
DerivedNormalizationProps.txt from Unicode and build a lookup table
from that, using techniques similar to ones already used for other
Unicode data.  make update-unicode will keep it up to date.  We only
build and use these tables for the NFC and NFKC forms, because the
tables would be too big for NFD and NFKD and the improvement is not
significant enough there.

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
This commit is contained in:
Peter Eisentraut
2020-03-26 08:14:00 +01:00
parent 070c3d3937
commit 2991ac5fc9
20 changed files with 6764 additions and 7 deletions

View File

@ -444,6 +444,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
%type <list> substr_list trim_list
%type <list> opt_interval interval_second
%type <node> overlay_placing substr_from substr_for
%type <str> unicode_normal_form
%type <boolean> opt_instead
%type <boolean> opt_unique opt_concurrently opt_verbose opt_full
@ -664,7 +665,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE
NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE
NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NFC NFD NFKC NFKD NO NONE
NORMALIZE NORMALIZED
NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
NULLS_P NUMERIC
@ -13491,6 +13493,22 @@ a_expr: c_expr { $$ = $1; }
list_make1($1), @2),
@2);
}
| a_expr IS NORMALIZED %prec IS
	{
		/* No normal form in the syntax: call with the expression only. */
		List	   *fn_args = list_make1($1);

		$$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), fn_args, @2);
	}
| a_expr IS unicode_normal_form NORMALIZED %prec IS
	{
		/* The form keyword's string is passed on as a string constant. */
		List	   *fn_args = list_make2($1, makeStringConst($3, @3));

		$$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), fn_args, @2);
	}
| a_expr IS NOT NORMALIZED %prec IS
	{
		/* Negated variant: the same call, handed to makeNotExpr(). */
		List	   *fn_args = list_make1($1);
		Node	   *test = (Node *) makeFuncCall(SystemFuncName("is_normalized"), fn_args, @2);

		$$ = makeNotExpr(test, @2);
	}
| a_expr IS NOT unicode_normal_form NORMALIZED %prec IS
	{
		/* Negated variant with an explicit form; the form is symbol 4 here. */
		List	   *fn_args = list_make2($1, makeStringConst($4, @4));
		Node	   *test = (Node *) makeFuncCall(SystemFuncName("is_normalized"), fn_args, @2);

		$$ = makeNotExpr(test, @2);
	}
| DEFAULT
{
/*
@ -13934,6 +13952,14 @@ func_expr_common_subexpr:
{
$$ = (Node *) makeFuncCall(SystemFuncName("date_part"), $3, @1);
}
| NORMALIZE '(' a_expr ')'
	{
		/* Single-argument form: only the expression is passed. */
		List	   *fn_args = list_make1($3);

		$$ = (Node *) makeFuncCall(SystemFuncName("normalize"), fn_args, @1);
	}
| NORMALIZE '(' a_expr ',' unicode_normal_form ')'
	{
		/* Two-argument form: the form keyword's string becomes a string constant. */
		List	   *fn_args = list_make2($3, makeStringConst($5, @5));

		$$ = (Node *) makeFuncCall(SystemFuncName("normalize"), fn_args, @1);
	}
| OVERLAY '(' overlay_list ')'
{
/* overlay(A PLACING B FROM C FOR D) is converted to
@ -14569,6 +14595,13 @@ extract_arg:
| Sconst { $$ = $1; }
;
/*
 * Unicode normalization form keywords.
 *
 * Each keyword yields its lower-case spelling as a C string.  The
 * NORMALIZE() and IS [NOT] ... NORMALIZED productions wrap that string
 * with makeStringConst() and pass it as the second function argument.
 */
unicode_normal_form:
NFC { $$ = "nfc"; }
| NFD { $$ = "nfd"; }
| NFKC { $$ = "nfkc"; }
| NFKD { $$ = "nfkd"; }
;
/* OVERLAY() arguments
* SQL99 defines the OVERLAY() function:
* o overlay(text placing text from int for int)
@ -15315,7 +15348,12 @@ unreserved_keyword:
| NAMES
| NEW
| NEXT
| NFC
| NFD
| NFKC
| NFKD
| NO
| NORMALIZED
| NOTHING
| NOTIFY
| NOWAIT
@ -15494,6 +15532,7 @@ col_name_keyword:
| NATIONAL
| NCHAR
| NONE
| NORMALIZE
| NULLIF
| NUMERIC
| OUT_P