diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index 859cac40fa1..c9aef490aef 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -1,4 +1,4 @@ -#!/usr/bin/python2 +#!/usr/bin/python # -*- coding: utf-8 -*- # # This script builds unaccent.rules on standard output when given the @@ -23,6 +23,24 @@ # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt # [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml +# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped +# The approach is to be Python3 compatible with Python2 "backports". +from __future__ import print_function +from __future__ import unicode_literals +import codecs +import sys + +if sys.version_info[0] <= 2: + # Encode stdout as UTF-8, so we can just print to it + sys.stdout = codecs.getwriter('utf8')(sys.stdout) + + # Map Python 2's chr to unichr + chr = unichr + + # Python 2 and 3 compatible bytes call + def bytes(source, encoding='ascii', errors='strict'): + return source.encode(encoding=encoding, errors=errors) +# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped import re import argparse @@ -39,7 +57,7 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA def print_record(codepoint, letter): - print (unichr(codepoint) + "\t" + letter).encode("UTF-8") + print (chr(codepoint) + "\t" + letter) class Codepoint: def __init__(self, id, general_category, combining_ids): @@ -116,7 +134,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): charactersSet = set() # RegEx to parse rules - rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;') + rulePattern = re.compile(r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;') # construct tree from XML transliterationTree = ET.parse(latinAsciiFilePath) @@ -134,7 +152,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): # Group 3: plain "trg" char. Empty if group 4 is not. # Group 4: plain "trg" char between quotes. Empty if group 3 is not. if matches is not None: - src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape') + src = matches.group(1) if matches.group(1) is not None else bytes(matches.group(2), 'UTF-8').decode('unicode-escape') trg = matches.group(3) if matches.group(3) is not None else matches.group(4) # "'" and """ are escaped @@ -195,10 +213,10 @@ def main(args): len(codepoint.combining_ids) > 1: if is_letter_with_marks(codepoint, table): charactersSet.add((codepoint.id, - unichr(get_plain_letter(codepoint, table).id))) + chr(get_plain_letter(codepoint, table).id))) elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): charactersSet.add((codepoint.id, - "".join(unichr(combining_codepoint.id) + "".join(chr(combining_codepoint.id) for combining_codepoint \ in get_plain_letters(codepoint, table))))