1 files changed, 12 insertions, 7 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index cffb7db7cee..40822d0c176 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -104,10 +104,11 @@ def is_letter_with_marks(codepoint, table):
     """Returns true for letters combined with one or more marks."""
     # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
 
-    # Letter may have no combining characters, in which case it has
-    # no marks.
-    if len(codepoint.combining_ids) == 1:
-        return False
+    # Some codepoints decompose to exactly one other codepoint (a plain
+    # redirect, not a base letter plus marks): follow it and test the target.
+    # If the target is absent from the table, fall through to the checks below.
+    if len(codepoint.combining_ids) == 1 and codepoint.combining_ids[0] in table:
+        return is_letter_with_marks(table[codepoint.combining_ids[0]], table)
 
     # A letter without diacritical marks has none of them.
     if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False:
@@ -148,8 +149,7 @@ def get_plain_letter(codepoint, table):
 
 def is_ligature(codepoint, table):
     """Return true for letters combined with letters."""
-    return all(is_letter(table[i], table) for i in codepoint.combining_ids)
-
+    return all(i in table and is_letter(table[i], table) for i in codepoint.combining_ids)
 
 def get_plain_letters(codepoint, table):
     """Return a list of plain letters from a ligature."""
@@ -200,6 +200,11 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
             # the parser of unaccent only accepts non-whitespace characters
             # for "src" and "trg" (see unaccent.c)
             if not src.isspace() and not trg.isspace():
+                if src == "\u210c":
+                    # Skip U+210C: its mapping here seems to be in error, and it
+                    # collides with the main Unicode database file.  See:
+                    # https://unicode-org.atlassian.net/browse/CLDR-17656
+                    continue
                 charactersSet.add((ord(src), trg))
 
     return charactersSet
@@ -251,7 +256,7 @@ def main(args):
     # walk through all the codepoints looking for interesting mappings
     for codepoint in all:
         if codepoint.general_category.startswith('L') and \
-           len(codepoint.combining_ids) > 1:
+           len(codepoint.combining_ids) > 0:
             if is_letter_with_marks(codepoint, table):
                 charactersSet.add((codepoint.id,
                                    chr(get_plain_letter(codepoint, table).id)))