mirror of
https://github.com/postgres/postgres.git
synced 2025-06-11 20:28:21 +03:00
Add combining characters to unaccent.rules.
Strip certain classes of combining characters, so that accents encoded this way are removed. Author: Hugh Ranalli Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org
This commit is contained in:
@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
|
||||
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
|
||||
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
|
||||
|
||||
# Combining marks follow a "base" character, and result in a composite
|
||||
# character. Example: "U&'A\0300'" produces "À". There are three types of
|
||||
# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
|
||||
# combining (Mc). We identify the ranges of marks we feel safe removing.
|
||||
# References:
|
||||
# https://en.wikipedia.org/wiki/Combining_character
|
||||
# https://www.unicode.org/charts/PDF/U0300.pdf
|
||||
# https://www.unicode.org/charts/PDF/U20D0.pdf
|
||||
# Inclusive (first, last) codepoint pairs of the combining marks that are
# considered safe to strip.
COMBINING_MARK_RANGES = (
    (0x0300, 0x0362),  # Mn: Accents, IPA
    (0x20dd, 0x20e0),  # Me: Symbols
    (0x20e2, 0x20e4),  # Me: Screen, keycap, triangle
)
|
||||
|
||||
def print_record(codepoint, letter):
    """Print one output line for a codepoint.

    codepoint -- integer code of the source character.
    letter    -- replacement text; may be empty or None, in which case the
                 character is printed alone with no tab-separated
                 replacement.
    """
    if letter:
        output = chr(codepoint) + "\t" + letter
    else:
        # No replacement text: print the bare character.  Concatenating
        # None here would raise TypeError, so this case is kept separate.
        output = chr(codepoint)

    print(output)
|
||||
|
||||
class Codepoint:
|
||||
def __init__(self, id, general_category, combining_ids):
|
||||
@ -70,6 +87,16 @@ class Codepoint:
|
||||
self.general_category = general_category
|
||||
self.combining_ids = combining_ids
|
||||
|
||||
def is_mark_to_remove(codepoint):
    """Return true if this is a combining mark to remove.

    A codepoint qualifies when is_mark() accepts it and its id lies inside
    one of the inclusive ranges listed in COMBINING_MARK_RANGES.
    """
    # Anything that is not a mark at all is never removed.
    if not is_mark(codepoint):
        return False

    return any(first <= codepoint.id <= last
               for first, last in COMBINING_MARK_RANGES)
|
||||
|
||||
def is_plain_letter(codepoint):
|
||||
"""Return true if codepoint represents a "plain letter"."""
|
||||
for begin, end in PLAIN_LETTER_RANGES:
|
||||
@ -234,6 +261,8 @@ def main(args):
|
||||
"".join(chr(combining_codepoint.id)
|
||||
for combining_codepoint \
|
||||
in get_plain_letters(codepoint, table))))
|
||||
elif is_mark_to_remove(codepoint):
|
||||
charactersSet.add((codepoint.id, None))
|
||||
|
||||
# add CLDR Latin-ASCII characters
|
||||
if not args.noLigaturesExpansion:
|
||||
|
Reference in New Issue
Block a user