1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-11 20:28:21 +03:00

Extend the default rules file for contrib/unaccent with Vietnamese letters.

Improve generate_unaccent_rules.py to handle composed characters whose base
is another composed character rather than a plain letter.  The net effect
of this is to add a bunch of multi-accented Vietnamese characters to
unaccent.rules.

Original complaint from Kha Nguyen, diagnosis of the script's shortcoming
by Thomas Munro.

Dang Minh Huong and Michael Paquier

Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com
This commit is contained in:
Tom Lane
2017-08-16 16:51:56 -04:00
parent 2b74303637
commit ec0a69e49b
2 changed files with 145 additions and 8 deletions

View File

@ -48,24 +48,47 @@ def is_mark(codepoint):
return codepoint.general_category in ("Mn", "Me", "Mc")
def is_letter_with_marks(codepoint, table):
    """Returns true for letters combined with one or more marks.

    The base (the first entry of ``codepoint.combining_ids``) may itself be
    a letter with marks, so letters carrying several stacked diacritics are
    accepted through the recursive check below.

    codepoint -- object exposing ``combining_ids``, a sequence of keys
                 usable to index ``table``
    table     -- lookup from those keys to codepoint objects
    """
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values

    # With fewer than two combining ids there is nothing after the base,
    # hence no marks.
    if len(codepoint.combining_ids) < 2:
        return False

    # A letter without diacritical marks has none of them.
    if not any(is_mark(table[i]) for i in codepoint.combining_ids[1:]):
        return False

    # The base must be a plain letter or, recursively, a letter with marks.
    base = table[codepoint.combining_ids[0]]
    return bool(is_plain_letter(base) or is_letter_with_marks(base, table))
def is_letter(codepoint, table):
    """Return true for a letter, whether or not it carries diacritical marks.

    codepoint -- the codepoint object to classify
    table     -- lookup passed through to is_letter_with_marks()
    """
    # Cheap check first; only consult the table when it is not a plain letter.
    plain = is_plain_letter(codepoint)
    if plain:
        return plain
    return is_letter_with_marks(codepoint, table)
def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.

    If the base of this codepoint itself has more than one combining
    character, do a recursive lookup on the table to find out its plain
    base letter.

    codepoint -- object exposing ``combining_ids``, a sequence of keys
                 usable to index ``table``
    table     -- lookup from those keys to codepoint objects

    Raises AssertionError when no plain base letter can be determined,
    i.e. the codepoint is neither a plain letter nor a letter with marks,
    or its base is neither decomposable further nor a plain letter.
    """
    if is_letter_with_marks(codepoint, table):
        base = table[codepoint.combining_ids[0]]
        if len(base.combining_ids) > 1:
            # The base is itself composed: keep descending.
            return get_plain_letter(base, table)
        if is_plain_letter(base):
            return base

        # Should not come here.  Raised explicitly (rather than via an
        # assert statement) so the check survives "python -O".
        raise AssertionError(
            "base of %r is neither composed nor a plain letter" % (codepoint,))

    if is_plain_letter(codepoint):
        return codepoint

    # Should not come here.
    raise AssertionError("%r is not a letter" % (codepoint,))
def is_ligature(codepoint, table):
"""Return true for letters combined with letters."""