Add combining characters to unaccent.rules.

Strip certain classes of combining characters, so that accents encoded this way are removed.

Author: Hugh Ranalli
Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org
2025-12-15 02:22:24 +03:00 · 2019-02-01 15:23:01 +01:00
parent 80579f9bb1
commit 456e3718e7
4 changed files with 157 additions and 1 deletions
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
                       (0x03b1, 0x03c9),     # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
                       (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA

+# Combining marks follow a "base" character, and result in a composite
+# character. Example: "U&'A\0300'" produces "À". There are three types of
+# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
+# combining (Mc). We identify the ranges of marks we feel safe removing.
+# References:
+#   https://en.wikipedia.org/wiki/Combining_character
+#   https://www.unicode.org/charts/PDF/U0300.pdf
+#   https://www.unicode.org/charts/PDF/U20D0.pdf
+COMBINING_MARK_RANGES = ((0x0300, 0x0362),  # Mn: Accents, IPA
+                         (0x20dd, 0x20E0),  # Me: Symbols
+                         (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle
+
 def print_record(codepoint, letter):
-    print (chr(codepoint) + "\t" + letter)
+    if letter:
+        output = chr(codepoint) + "\t" + letter
+    else:
+        output = chr(codepoint)
+
+    print(output)

 class Codepoint:
    def __init__(self, id, general_category, combining_ids):
@@ -70,6 +87,16 @@ class Codepoint:
        self.general_category = general_category
        self.combining_ids = combining_ids

+def is_mark_to_remove(codepoint):
+    """Return true if this is a combining mark to remove."""
+    if not is_mark(codepoint):
+        return False
+
+    for begin, end in COMBINING_MARK_RANGES:
+        if codepoint.id >= begin and codepoint.id <= end:
+            return True
+    return False
+
 def is_plain_letter(codepoint):
    """Return true if codepoint represents a "plain letter"."""
    for begin, end in PLAIN_LETTER_RANGES:
@@ -234,6 +261,8 @@ def main(args):
                             "".join(chr(combining_codepoint.id)
                                     for combining_codepoint \
                                     in get_plain_letters(codepoint, table))))
+        elif is_mark_to_remove(codepoint):
+            charactersSet.add((codepoint.id, None))

    # add CLDR Latin-ASCII characters
    if not args.noLigaturesExpansion: