diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out index c1bd7cd897d..ee0ac71a1cc 100644 --- a/contrib/unaccent/expected/unaccent.out +++ b/contrib/unaccent/expected/unaccent.out @@ -37,6 +37,18 @@ SELECT unaccent('À'); -- Remove combining diacritical 0x0300 A (1 row) +SELECT unaccent('℃℉'); -- degree signs + unaccent +---------- + °C°F +(1 row) + +SELECT unaccent('℗'); -- sound recording copyright + unaccent +---------- + (P) +(1 row) + SELECT unaccent('unaccent', 'foobar'); unaccent ---------- @@ -67,6 +79,18 @@ SELECT unaccent('unaccent', 'À'); A (1 row) +SELECT unaccent('unaccent', '℃℉'); + unaccent +---------- + °C°F +(1 row) + +SELECT unaccent('unaccent', '℗'); + unaccent +---------- + (P) +(1 row) + SELECT ts_lexize('unaccent', 'foobar'); ts_lexize ----------- @@ -97,3 +121,23 @@ SELECT ts_lexize('unaccent', 'À'); {A} (1 row) +SELECT ts_lexize('unaccent', '℃℉'); + ts_lexize +----------- + {°C°F} +(1 row) + +SELECT ts_lexize('unaccent', '℗'); + ts_lexize +----------- + {(P)} +(1 row) + +-- Controversial case. Black-Letter Capital H (U+210C) is translated by +-- Latin-ASCII.xml as 'x', but it should be 'H'. +SELECT unaccent('ℌ'); + unaccent +---------- + x +(1 row) + diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index c405e231b39..b4b4c38bebe 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -134,12 +134,12 @@ def get_plain_letter(codepoint, table): return table[codepoint.combining_ids[0]] # Should not come here - assert(False) + assert False, 'Codepoint U+%0.2X' % codepoint.id elif is_plain_letter(codepoint): return codepoint # Should not come here - assert(False) + assert False, 'Codepoint U+%0.2X' % codepoint.id def is_ligature(codepoint, table): @@ -212,7 +212,6 @@ def special_cases(): # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F) charactersSet.add((0x2103, "\xb0C")) # DEGREE CELSIUS charactersSet.add((0x2109, "\xb0F")) # DEGREE FAHRENHEIT - charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT return charactersSet diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql index 2ae097ff2b8..3fc0c706be3 100644 --- a/contrib/unaccent/sql/unaccent.sql +++ b/contrib/unaccent/sql/unaccent.sql @@ -10,15 +10,25 @@ SELECT unaccent('ёлка'); SELECT unaccent('ЁЖИК'); SELECT unaccent('˃˖˗˜'); SELECT unaccent('À'); -- Remove combining diacritical 0x0300 +SELECT unaccent('℃℉'); -- degree signs +SELECT unaccent('℗'); -- sound recording copyright SELECT unaccent('unaccent', 'foobar'); SELECT unaccent('unaccent', 'ёлка'); SELECT unaccent('unaccent', 'ЁЖИК'); SELECT unaccent('unaccent', '˃˖˗˜'); SELECT unaccent('unaccent', 'À'); +SELECT unaccent('unaccent', '℃℉'); +SELECT unaccent('unaccent', '℗'); SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', 'ёлка'); SELECT ts_lexize('unaccent', 'ЁЖИК'); SELECT ts_lexize('unaccent', '˃˖˗˜'); SELECT ts_lexize('unaccent', 'À'); +SELECT ts_lexize('unaccent', '℃℉'); +SELECT ts_lexize('unaccent', '℗'); + +-- Controversial case. Black-Letter Capital H (U+210C) is translated by +-- Latin-ASCII.xml as 'x', but it should be 'H'. +SELECT unaccent('ℌ');