mirror of
https://github.com/postgres/postgres.git
synced 2025-04-21 12:05:57 +03:00
Update unaccent rules with release 34 of CLDR for Latin-ASCII.xml
This required an update of the Python script generating the rules, as the format of Latin-ASCII.xml changed in release 29. This release has also added new punctuation and symbols, and a new set of rules has been generated to include them. The way to find the newest versions of Latin-ASCII.xml is also documented more clearly. Author: Hugh Ranalli, Michael Paquier Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f@postgresql.org
This commit is contained in:
parent
c64d0cd5ce
commit
e1c1d5444e
@ -25,6 +25,12 @@ SELECT unaccent('ЁЖИК');
|
|||||||
ЕЖИК
|
ЕЖИК
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
SELECT unaccent('˃˖˗˜');
|
||||||
|
unaccent
|
||||||
|
----------
|
||||||
|
>+-~
|
||||||
|
(1 row)
|
||||||
|
|
||||||
SELECT unaccent('unaccent', 'foobar');
|
SELECT unaccent('unaccent', 'foobar');
|
||||||
unaccent
|
unaccent
|
||||||
----------
|
----------
|
||||||
@ -43,6 +49,12 @@ SELECT unaccent('unaccent', 'ЁЖИК');
|
|||||||
ЕЖИК
|
ЕЖИК
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
SELECT unaccent('unaccent', '˃˖˗˜');
|
||||||
|
unaccent
|
||||||
|
----------
|
||||||
|
>+-~
|
||||||
|
(1 row)
|
||||||
|
|
||||||
SELECT ts_lexize('unaccent', 'foobar');
|
SELECT ts_lexize('unaccent', 'foobar');
|
||||||
ts_lexize
|
ts_lexize
|
||||||
-----------
|
-----------
|
||||||
@ -61,3 +73,9 @@ SELECT ts_lexize('unaccent', 'ЁЖИК');
|
|||||||
{ЕЖИК}
|
{ЕЖИК}
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
SELECT ts_lexize('unaccent', '˃˖˗˜');
|
||||||
|
ts_lexize
|
||||||
|
-----------
|
||||||
|
{>+-~}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
@ -20,8 +20,13 @@
|
|||||||
# option is enabled, the XML file of this transliterator [2] -- given as a
|
# option is enabled, the XML file of this transliterator [2] -- given as a
|
||||||
# command line argument -- will be parsed and used.
|
# command line argument -- will be parsed and used.
|
||||||
#
|
#
|
||||||
|
# Ideally you should use the latest release for each data set. For
|
||||||
|
# Latin-ASCII.xml, the latest data sets released can be browsed directly
|
||||||
|
# via [3]. Note that this script is compatible with at least release 29.
|
||||||
|
#
|
||||||
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
|
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
|
||||||
# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
|
# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
|
||||||
|
# [3] https://unicode.org/cldr/trac/browser/tags
|
||||||
|
|
||||||
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
|
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
|
||||||
# The approach is to be Python3 compatible with Python2 "backports".
|
# The approach is to be Python3 compatible with Python2 "backports".
|
||||||
@ -140,8 +145,18 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
|
|||||||
transliterationTree = ET.parse(latinAsciiFilePath)
|
transliterationTree = ET.parse(latinAsciiFilePath)
|
||||||
transliterationTreeRoot = transliterationTree.getroot()
|
transliterationTreeRoot = transliterationTree.getroot()
|
||||||
|
|
||||||
for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
|
# Fetch all the transliteration rules. Since release 29 of Latin-ASCII.xml
|
||||||
matches = rulePattern.search(rule.text)
|
# all the transliteration rules are located in a single tRule block with
|
||||||
|
# all rules separated into separate lines.
|
||||||
|
blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule")
|
||||||
|
assert(len(blockRules) == 1)
|
||||||
|
|
||||||
|
# Split the block of rules into one element per line.
|
||||||
|
rules = blockRules[0].text.splitlines()
|
||||||
|
|
||||||
|
# And finish the processing of each individual rule.
|
||||||
|
for rule in rules:
|
||||||
|
matches = rulePattern.search(rule)
|
||||||
|
|
||||||
# The regular expression capture four groups corresponding
|
# The regular expression capture four groups corresponding
|
||||||
# to the characters.
|
# to the characters.
|
||||||
|
@ -8,11 +8,14 @@ SET client_encoding TO 'UTF8';
|
|||||||
SELECT unaccent('foobar');
|
SELECT unaccent('foobar');
|
||||||
SELECT unaccent('ёлка');
|
SELECT unaccent('ёлка');
|
||||||
SELECT unaccent('ЁЖИК');
|
SELECT unaccent('ЁЖИК');
|
||||||
|
SELECT unaccent('˃˖˗˜');
|
||||||
|
|
||||||
SELECT unaccent('unaccent', 'foobar');
|
SELECT unaccent('unaccent', 'foobar');
|
||||||
SELECT unaccent('unaccent', 'ёлка');
|
SELECT unaccent('unaccent', 'ёлка');
|
||||||
SELECT unaccent('unaccent', 'ЁЖИК');
|
SELECT unaccent('unaccent', 'ЁЖИК');
|
||||||
|
SELECT unaccent('unaccent', '˃˖˗˜');
|
||||||
|
|
||||||
SELECT ts_lexize('unaccent', 'foobar');
|
SELECT ts_lexize('unaccent', 'foobar');
|
||||||
SELECT ts_lexize('unaccent', 'ёлка');
|
SELECT ts_lexize('unaccent', 'ёлка');
|
||||||
SELECT ts_lexize('unaccent', 'ЁЖИК');
|
SELECT ts_lexize('unaccent', 'ЁЖИК');
|
||||||
|
SELECT ts_lexize('unaccent', '˃˖˗˜');
|
||||||
|
@ -399,6 +399,21 @@
|
|||||||
ʦ ts
|
ʦ ts
|
||||||
ʪ ls
|
ʪ ls
|
||||||
ʫ lz
|
ʫ lz
|
||||||
|
ʹ '
|
||||||
|
ʺ "
|
||||||
|
ʻ '
|
||||||
|
ʼ '
|
||||||
|
ʽ '
|
||||||
|
˂ <
|
||||||
|
˃ >
|
||||||
|
˄ ^
|
||||||
|
ˆ ^
|
||||||
|
ˈ '
|
||||||
|
ˋ `
|
||||||
|
ː :
|
||||||
|
˖ +
|
||||||
|
˗ -
|
||||||
|
˜ ~
|
||||||
Ά Α
|
Ά Α
|
||||||
Έ Ε
|
Έ Ε
|
||||||
Ή Η
|
Ή Η
|
||||||
|
Loading…
x
Reference in New Issue
Block a user