mirror of
https://github.com/postgres/postgres.git
synced 2025-07-28 23:42:10 +03:00
Previously we searched for code points where the Unicode data file listed an equivalent combining character sequence that added accents. Some code points redirect to a single other code point, instead of doing any combining. We can follow those references recursively to get the answer. Per bug report #18362, which reported missing Ancient Greek characters. Specifically, precomposed characters with oxia (from the polytonic accent system used for old Greek) just point to precomposed characters with tonos (from the monotonic accent system for modern Greek), and we have to follow the extra hop to find out that they are composed with an acute accent. Besides those, the new rule also: * pulls in a lot of 'Mathematical Alphanumeric Symbols', which are copies of the Latin and Greek alphabets and numbers rendered in different typefaces, and * corrects a single mathematical letter that previously came from the CLDR transliteration file but that the new rule now extracts from the main Unicode database file; clearly the latter is right and the former is wrong (reported to CLDR). Reported-by: Cees van Zeeland <cees.van.zeeland@freedom.nl> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Peter Eisentraut <peter@eisentraut.org> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/18362-be6d0cfe122b6354%40postgresql.org
182 lines
2.5 KiB
Plaintext
182 lines
2.5 KiB
Plaintext
/*
|
||
* This test must be run in a database with UTF-8 encoding,
|
||
* because other encodings don't support all the characters used.
|
||
*/
|
||
SELECT getdatabaseencoding() <> 'UTF8'
|
||
AS skip_test \gset
|
||
\if :skip_test
|
||
\quit
|
||
\endif
|
||
CREATE EXTENSION unaccent;
|
||
SET client_encoding TO 'UTF8';
|
||
SELECT unaccent('foobar');
|
||
unaccent
|
||
----------
|
||
foobar
|
||
(1 row)
|
||
|
||
SELECT unaccent('ёлка');
|
||
unaccent
|
||
----------
|
||
елка
|
||
(1 row)
|
||
|
||
SELECT unaccent('ЁЖИК');
|
||
unaccent
|
||
----------
|
||
ЕЖИК
|
||
(1 row)
|
||
|
||
SELECT unaccent('˃˖˗˜');
|
||
unaccent
|
||
----------
|
||
>+-~
|
||
(1 row)
|
||
|
||
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
|
||
unaccent
|
||
----------
|
||
A
|
||
(1 row)
|
||
|
||
SELECT unaccent('℃℉'); -- degree signs
|
||
unaccent
|
||
----------
|
||
°C°F
|
||
(1 row)
|
||
|
||
SELECT unaccent('℗'); -- sound recording copyright
|
||
unaccent
|
||
----------
|
||
(P)
|
||
(1 row)
|
||
|
||
SELECT unaccent('1½'); -- math expression with whitespace
|
||
unaccent
|
||
----------
|
||
1 1/2
|
||
(1 row)
|
||
|
||
SELECT unaccent('〝'); -- quote
|
||
unaccent
|
||
----------
|
||
"
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', 'foobar');
|
||
unaccent
|
||
----------
|
||
foobar
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', 'ёлка');
|
||
unaccent
|
||
----------
|
||
елка
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', 'ЁЖИК');
|
||
unaccent
|
||
----------
|
||
ЕЖИК
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', '˃˖˗˜');
|
||
unaccent
|
||
----------
|
||
>+-~
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', 'À');
|
||
unaccent
|
||
----------
|
||
A
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', '℃℉');
|
||
unaccent
|
||
----------
|
||
°C°F
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', '℗');
|
||
unaccent
|
||
----------
|
||
(P)
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', '1½');
|
||
unaccent
|
||
----------
|
||
1 1/2
|
||
(1 row)
|
||
|
||
SELECT unaccent('unaccent', '〝');
|
||
unaccent
|
||
----------
|
||
"
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', 'foobar');
|
||
ts_lexize
|
||
-----------
|
||
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', 'ёлка');
|
||
ts_lexize
|
||
-----------
|
||
{елка}
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', 'ЁЖИК');
|
||
ts_lexize
|
||
-----------
|
||
{ЕЖИК}
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', '˃˖˗˜');
|
||
ts_lexize
|
||
-----------
|
||
{>+-~}
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', 'À');
|
||
ts_lexize
|
||
-----------
|
||
{A}
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', '℃℉');
|
||
ts_lexize
|
||
-----------
|
||
{°C°F}
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', '℗');
|
||
ts_lexize
|
||
-----------
|
||
{(P)}
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', '1½');
|
||
ts_lexize
|
||
-----------
|
||
{"1 1/2"}
|
||
(1 row)
|
||
|
||
SELECT ts_lexize('unaccent', '〝');
|
||
ts_lexize
|
||
-----------
|
||
{"\""}
|
||
(1 row)
|
||
|
||
-- Controversial case. Black-Letter Capital H (U+210C) is translated by
|
||
-- Latin-ASCII.xml as 'x', but it should be 'H'.
|
||
SELECT unaccent('ℌ');
|
||
unaccent
|
||
----------
|
||
H
|
||
(1 row)
|
||
|