diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out index 0835e141afb..69c2cf9bd7a 100644 --- a/contrib/unaccent/expected/unaccent.out +++ b/contrib/unaccent/expected/unaccent.out @@ -25,6 +25,12 @@ SELECT unaccent('ЁЖИК'); ЕЖИК (1 row) +SELECT unaccent('˃˖˗˜'); + unaccent +---------- + >+-~ +(1 row) + SELECT unaccent('unaccent', 'foobar'); unaccent ---------- @@ -43,6 +49,12 @@ SELECT unaccent('unaccent', 'ЁЖИК'); ЕЖИК (1 row) +SELECT unaccent('unaccent', '˃˖˗˜'); + unaccent +---------- + >+-~ +(1 row) + SELECT ts_lexize('unaccent', 'foobar'); ts_lexize ----------- @@ -61,3 +73,9 @@ SELECT ts_lexize('unaccent', 'ЁЖИК'); {ЕЖИК} (1 row) +SELECT ts_lexize('unaccent', '˃˖˗˜'); + ts_lexize +----------- + {>+-~} +(1 row) + diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index c9aef490aef..4419a771edf 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -20,8 +20,13 @@ # option is enabled, the XML file of this transliterator [2] -- given as a # command line argument -- will be parsed and used. # +# Ideally you should use the latest release for each data set. For +# Latin-ASCII.xml, the latest data sets released can be browsed directly +# via [3]. Note that this script is compatible with at least release 29. +# # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt -# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml +# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml +# [3] https://unicode.org/cldr/trac/browser/tags # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped # The approach is to be Python3 compatible with Python2 "backports". @@ -140,8 +145,18 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): transliterationTree = ET.parse(latinAsciiFilePath) transliterationTreeRoot = transliterationTree.getroot() - for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"): - matches = rulePattern.search(rule.text) + # Fetch all the transliteration rules. Since release 29 of Latin-ASCII.xml + # all the transliteration rules are located in a single tRule block with + # all rules separated into separate lines. + blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule") + assert(len(blockRules) == 1) + + # Split the block of rules into one element per line. + rules = blockRules[0].text.splitlines() + + # And finish the processing of each individual rule. + for rule in rules: + matches = rulePattern.search(rule) # The regular expression capture four groups corresponding # to the characters. diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql index ba72ab6261c..c671827caa5 100644 --- a/contrib/unaccent/sql/unaccent.sql +++ b/contrib/unaccent/sql/unaccent.sql @@ -8,11 +8,14 @@ SET client_encoding TO 'UTF8'; SELECT unaccent('foobar'); SELECT unaccent('ёлка'); SELECT unaccent('ЁЖИК'); +SELECT unaccent('˃˖˗˜'); SELECT unaccent('unaccent', 'foobar'); SELECT unaccent('unaccent', 'ёлка'); SELECT unaccent('unaccent', 'ЁЖИК'); +SELECT unaccent('unaccent', '˃˖˗˜'); SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', 'ёлка'); SELECT ts_lexize('unaccent', 'ЁЖИК'); +SELECT ts_lexize('unaccent', '˃˖˗˜'); diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 76e4e69bebb..7ce25eef03d 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -399,6 +399,21 @@ ʦ ts ʪ ls ʫ lz +ʹ ' +ʺ " +ʻ ' +ʼ ' +ʽ ' +˂ < +˃ > +˄ ^ +ˆ ^ +ˈ ' +ˋ ` +ː : +˖ + +˗ - +˜ ~ Ά Α Έ Ε Ή Η