mirror of
https://github.com/postgres/postgres.git
synced 2025-04-21 12:05:57 +03:00
As reported in bug #18057, the unaccent extension strips from its rule file whitespace characters that are intentionally specified when building unaccent.rules from UnicodeData.txt, causing an incorrect translation for some characters such as numeric symbols. This happens because all whitespace before and after the origin and target characters is discarded (this limitation is documented). This commit makes it possible to use quotes around target characters, so that whitespace can be considered part of the target characters. Some target characters contain a double quote; these require an extra double quote. The documentation is updated to show how to use quoted areas, generate_unaccent_rules.py is updated to generate unaccent.rules, and a couple of tests are added for numeric symbols. While working on this patch, I implemented a fake rule file to test the parsing logic, which is not included here as it would just consume extra cycles in the tests, and it requires manipulating an installation tree to work correctly. As this requires a change of format in unaccent.rules, this cannot be backpatched, unfortunately. The idea to use double quotes as escape characters comes from Tom Lane. Reported-by: Martin Schlossarek Author: Michael Paquier Discussion: https://postgr.es/m/18057-62712cad01bd202c@postgresql.org
49 lines
1.5 KiB
SQL
49 lines
1.5 KiB
SQL
/*
|
||
* This test must be run in a database with UTF-8 encoding,
|
||
* because other encodings don't support all the characters used.
|
||
*/
|
||
|
||
-- Encoding guard: compute whether the database encoding differs from 'UTF8'
-- and capture the boolean result in the psql variable skip_test via \gset.
SELECT getdatabaseencoding() <> 'UTF8'
|
||
AS skip_test \gset
|
||
-- When skip_test is true, quit the script here so that none of the
-- statements below are executed.
\if :skip_test
|
||
\quit
|
||
\endif
|
||
|
||
-- Install the unaccent extension exercised by the statements that follow.
CREATE EXTENSION unaccent;
|
||
|
||
-- Use UTF8 as the client encoding for the rest of the script; the string
-- literals below contain non-ASCII characters.
SET client_encoding TO 'UTF8';
|
||
|
||
-- Group 1: single-argument form, unaccent(text).
-- Inputs: plain ASCII, Cyrillic in lower and upper case, a run of spacing
-- modifier symbols, then the individually annotated cases.
SELECT unaccent('foobar');
|
||
SELECT unaccent('ёлка');
|
||
SELECT unaccent('ЁЖИК');
|
||
SELECT unaccent('˃˖˗˜');
|
||
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
|
||
SELECT unaccent('℃℉'); -- degree signs
|
||
SELECT unaccent('℗'); -- sound recording copyright
|
||
SELECT unaccent('1½'); -- math expression with whitespace
|
||
SELECT unaccent('〝'); -- quote
|
||
|
||
-- Group 2: two-argument form. Same nine inputs as the single-argument group,
-- this time with 'unaccent' passed explicitly as the first argument (the
-- dictionary to use, per the unaccent documentation).
SELECT unaccent('unaccent', 'foobar');
|
||
SELECT unaccent('unaccent', 'ёлка');
|
||
SELECT unaccent('unaccent', 'ЁЖИК');
|
||
SELECT unaccent('unaccent', '˃˖˗˜');
|
||
SELECT unaccent('unaccent', 'À');
|
||
SELECT unaccent('unaccent', '℃℉');
|
||
SELECT unaccent('unaccent', '℗');
|
||
SELECT unaccent('unaccent', '1½');
|
||
SELECT unaccent('unaccent', '〝');
|
||
|
||
-- Group 3: the same nine inputs again, run through ts_lexize() with
-- 'unaccent' as its first argument.
SELECT ts_lexize('unaccent', 'foobar');
|
||
SELECT ts_lexize('unaccent', 'ёлка');
|
||
SELECT ts_lexize('unaccent', 'ЁЖИК');
|
||
SELECT ts_lexize('unaccent', '˃˖˗˜');
|
||
SELECT ts_lexize('unaccent', 'À');
|
||
SELECT ts_lexize('unaccent', '℃℉');
|
||
SELECT ts_lexize('unaccent', '℗');
|
||
SELECT ts_lexize('unaccent', '1½');
|
||
SELECT ts_lexize('unaccent', '〝');
|
||
|
||
-- Controversial case. Black-Letter Capital H (U+210C) is translated by
|
||
-- Latin-ASCII.xml as 'x', but it should be 'H'.
|
||
-- Exercised with the single-argument form of unaccent() only.
SELECT unaccent('ℌ');
|