diff options
Diffstat (limited to 'contrib/unaccent/generate_unaccent_rules.py')
-rw-r--r-- | contrib/unaccent/generate_unaccent_rules.py | 31 |
1 files changed, 30 insertions, 1 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index 4419a771edf..58b6e7deb74 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA +# Combining marks follow a "base" character, and result in a composite +# character. Example: "U&'A\0300'" produces "À". There are three types of +# combining marks: enclosing (Me), non-spacing combining (Mn), spacing +# combining (Mc). We identify the ranges of marks we feel safe removing. +# References: +# https://en.wikipedia.org/wiki/Combining_character +# https://www.unicode.org/charts/PDF/U0300.pdf +# https://www.unicode.org/charts/PDF/U20D0.pdf +COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA + (0x20dd, 0x20E0), # Me: Symbols + (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle + def print_record(codepoint, letter): - print (chr(codepoint) + "\t" + letter) + if letter: + output = chr(codepoint) + "\t" + letter + else: + output = chr(codepoint) + + print(output) class Codepoint: def __init__(self, id, general_category, combining_ids): @@ -70,6 +87,16 @@ class Codepoint: self.general_category = general_category self.combining_ids = combining_ids +def is_mark_to_remove(codepoint): + """Return true if this is a combining mark to remove.""" + if not is_mark(codepoint): + return False + + for begin, end in COMBINING_MARK_RANGES: + if codepoint.id >= begin and codepoint.id <= end: + return True + return False + def is_plain_letter(codepoint): """Return true if codepoint represents a "plain letter".""" for begin, end in PLAIN_LETTER_RANGES: @@ -234,6 +261,8 @@ def main(args): "".join(chr(combining_codepoint.id) for combining_codepoint \ in get_plain_letters(codepoint, table)))) + elif 
is_mark_to_remove(codepoint): + charactersSet.add((codepoint.id, None)) # add CLDR Latin-ASCII characters if not args.noLigaturesExpansion: |