diff options
Diffstat (limited to 'unicode_gen.c')
-rw-r--r-- | unicode_gen.c | 541 |
1 files changed, 539 insertions, 2 deletions
diff --git a/unicode_gen.c b/unicode_gen.c index 0f11ef8..1b43538 100644 --- a/unicode_gen.c +++ b/unicode_gen.c @@ -156,6 +156,153 @@ char *get_line(char *buf, int buf_size, FILE *f) return buf; } +typedef struct REString { + struct REString *next; + uint32_t hash; + uint32_t len; + uint32_t flags; + uint32_t buf[]; +} REString; + +typedef struct { + uint32_t n_strings; + uint32_t hash_size; + int hash_bits; + REString **hash_table; +} REStringList; + +static uint32_t re_string_hash(int len, const uint32_t *buf) +{ + int i; + uint32_t h; + h = 1; + for(i = 0; i < len; i++) + h = h * 263 + buf[i]; + return h * 0x61C88647; +} + +static void re_string_list_init(REStringList *s) +{ + s->n_strings = 0; + s->hash_size = 0; + s->hash_bits = 0; + s->hash_table = NULL; +} + +static __maybe_unused void re_string_list_free(REStringList *s) +{ + REString *p, *p_next; + int i; + for(i = 0; i < s->hash_size; i++) { + for(p = s->hash_table[i]; p != NULL; p = p_next) { + p_next = p->next; + free(p); + } + } + free(s->hash_table); +} + +static void lre_print_char(int c, BOOL is_range) +{ + if (c == '\'' || c == '\\' || + (is_range && (c == '-' || c == ']'))) { + printf("\\%c", c); + } else if (c >= ' ' && c <= 126) { + printf("%c", c); + } else { + printf("\\u{%04x}", c); + } +} + +static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s) +{ + REString *p; + int i, j, k; + + printf("%s:\n", str); + + j = 0; + for(i = 0; i < s->hash_size; i++) { + for(p = s->hash_table[i]; p != NULL; p = p->next) { + printf(" %d/%d: '", j, s->n_strings); + for(k = 0; k < p->len; k++) { + lre_print_char(p->buf[k], FALSE); + } + printf("'\n"); + j++; + } + } +} + +static REString *re_string_find2(REStringList *s, int len, const uint32_t *buf, + uint32_t h0, BOOL add_flag) +{ + uint32_t h = 0; /* avoid warning */ + REString *p; + if (s->n_strings != 0) { + h = h0 >> (32 - s->hash_bits); + for(p = s->hash_table[h]; p != NULL; p = p->next) { + if (p->hash == h0 && p->len == len && + !memcmp(p->buf, buf, len * sizeof(buf[0]))) { + return p; + } + } + } + /* not found */ + if (!add_flag) + return NULL; + /* increase the size of the hash table if needed */ + if (unlikely((s->n_strings + 1) > s->hash_size)) { + REString **new_hash_table, *p_next; + int new_hash_bits, i; + uint32_t new_hash_size; + new_hash_bits = max_int(s->hash_bits + 1, 4); + new_hash_size = 1 << new_hash_bits; + new_hash_table = malloc(sizeof(new_hash_table[0]) * new_hash_size); + if (!new_hash_table) + return NULL; + memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size); + for(i = 0; i < s->hash_size; i++) { + for(p = s->hash_table[i]; p != NULL; p = p_next) { + p_next = p->next; + h = p->hash >> (32 - new_hash_bits); + p->next = new_hash_table[h]; + new_hash_table[h] = p; + } + } + free(s->hash_table); + s->hash_bits = new_hash_bits; + s->hash_size = new_hash_size; + s->hash_table = new_hash_table; + h = h0 >> (32 - s->hash_bits); + } + + p = malloc(sizeof(REString) + len * sizeof(buf[0])); + if (!p) + return NULL; + p->next = s->hash_table[h]; + s->hash_table[h] = p; + s->n_strings++; + p->hash = h0; + p->len = len; + p->flags = 0; + memcpy(p->buf, buf, sizeof(buf[0]) * len); + return p; +} + +static REString *re_string_find(REStringList *s, int len, const uint32_t *buf, + BOOL add_flag) +{ + uint32_t h0; + h0 = re_string_hash(len, buf); + return re_string_find2(s, len, buf, h0, add_flag); +} + +static void re_string_add(REStringList *s, int len, const uint32_t *buf) +{ + re_string_find(s, len, buf, TRUE); +} + #define UNICODE_GENERAL_CATEGORY typedef enum { @@ -225,6 +372,23 @@ static const char *unicode_prop_short_name[] = { #undef UNICODE_PROP_LIST +#define UNICODE_SEQUENCE_PROP_LIST + +typedef enum { +#define DEF(id) SEQUENCE_PROP_ ## id, +#include "unicode_gen_def.h" +#undef DEF + SEQUENCE_PROP_COUNT, +} UnicodeSequencePropEnum1; + +static const char *unicode_sequence_prop_name[] = { +#define DEF(id) #id, +#include "unicode_gen_def.h" +#undef DEF +}; + +#undef UNICODE_SEQUENCE_PROP_LIST + typedef struct { /* case conv */ uint8_t u_len; @@ -247,7 +411,15 @@ typedef struct { int *decomp_data; } CCInfo; +typedef struct { + int count; + int size; + int *tab; +} UnicodeSequenceProperties; + CCInfo *unicode_db; +REStringList rgi_emoji_zwj_sequence; +DynBuf rgi_emoji_tag_sequence; int find_name(const char **tab, int tab_len, const char *name) { @@ -751,6 +923,147 @@ void parse_prop_list(const char *filename) fclose(f); } +#define SEQ_MAX_LEN 16 + +static BOOL is_emoji_modifier(uint32_t c) +{ + return (c >= 0x1f3fb && c <= 0x1f3ff); +} + +static void add_sequence_prop(int idx, int seq_len, int *seq) +{ + int i; + + assert(idx < SEQUENCE_PROP_COUNT); + switch(idx) { + case SEQUENCE_PROP_Basic_Emoji: + /* convert to 2 properties lists */ + if (seq_len == 1) { + set_prop(seq[0], PROP_Basic_Emoji1, 1); + } else if (seq_len == 2 && seq[1] == 0xfe0f) { + set_prop(seq[0], PROP_Basic_Emoji2, 1); + } else { + abort(); + } + break; + case SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence: + assert(seq_len == 2); + assert(is_emoji_modifier(seq[1])); + assert(get_prop(seq[0], PROP_Emoji_Modifier_Base)); + set_prop(seq[0], PROP_RGI_Emoji_Modifier_Sequence, 1); + break; + case SEQUENCE_PROP_RGI_Emoji_Flag_Sequence: + { + int code; + assert(seq_len == 2); + assert(seq[0] >= 0x1F1E6 && seq[0] <= 0x1F1FF); + assert(seq[1] >= 0x1F1E6 && seq[1] <= 0x1F1FF); + code = (seq[0] - 0x1F1E6) * 26 + (seq[1] - 0x1F1E6); + /* XXX: would be more compact with a simple bitmap -> 676 bits */ + set_prop(code, PROP_RGI_Emoji_Flag_Sequence, 1); + } + break; + case SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence: + re_string_add(&rgi_emoji_zwj_sequence, seq_len, (uint32_t *)seq); + break; + case SEQUENCE_PROP_RGI_Emoji_Tag_Sequence: + { + assert(seq_len >= 3); + assert(seq[0] == 0x1F3F4); + assert(seq[seq_len - 1] == 0xE007F); + for(i = 1; i < seq_len - 1; i++) { + assert(seq[i] >= 0xe0001 && seq[i] <= 0xe007e); + dbuf_putc(&rgi_emoji_tag_sequence, seq[i] - 0xe0000); + } + dbuf_putc(&rgi_emoji_tag_sequence, 0); + } + break; + case SEQUENCE_PROP_Emoji_Keycap_Sequence: + assert(seq_len == 3); + assert(seq[1] == 0xfe0f); + assert(seq[2] == 0x20e3); + set_prop(seq[0], PROP_Emoji_Keycap_Sequence, 1); + break; + default: + assert(0); + } +} + +void parse_sequence_prop_list(const char *filename) +{ + FILE *f; + char line[4096], *p, buf[256], *q, *p_start; + uint32_t c0, c1, c; + int idx, seq_len; + int seq[SEQ_MAX_LEN]; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#' || *p == '@' || *p == '\0') + continue; + p_start = p; + + /* find the sequence property name */ + p = strchr(p, ';'); + if (!p) + continue; + p++; + p += strspn(p, " \t"); + q = buf; + while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t' && *p != ';') { + if ((q - buf) < sizeof(buf) - 1) + *q++ = *p; + p++; + } + *q = '\0'; + idx = find_name(unicode_sequence_prop_name, + countof(unicode_sequence_prop_name), buf); + if (idx < 0) { + fprintf(stderr, "Property not found: %s\n", buf); + exit(1); + } + + p = p_start; + c0 = strtoul(p, (char **)&p, 16); + assert(c0 <= CHARCODE_MAX); + + if (*p == '.' && p[1] == '.') { + p += 2; + c1 = strtoul(p, (char **)&p, 16); + assert(c1 <= CHARCODE_MAX); + for(c = c0; c <= c1; c++) { + seq[0] = c; + add_sequence_prop(idx, 1, seq); + } + } else { + seq_len = 0; + seq[seq_len++] = c0; + for(;;) { + while (isspace(*p)) + p++; + if (*p == ';' || *p == '\0') + break; + c0 = strtoul(p, (char **)&p, 16); + assert(c0 <= CHARCODE_MAX); + assert(seq_len < countof(seq)); + seq[seq_len++] = c0; + } + add_sequence_prop(idx, seq_len, seq); + } + } + fclose(f); +} + void parse_scripts(const char *filename) { FILE *f; @@ -1654,7 +1967,7 @@ void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len, maxw = 0; for(i = 0; i < len; i++) { w = strlen(tab_name[i]); - if (tab_short_name[i][0] != '\0') { + if (tab_short_name && tab_short_name[i][0] != '\0') { w += 1 + strlen(tab_short_name[i]); } if (maxw < w) @@ -1666,7 +1979,7 @@ void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len, for(i = 0; i < len; i++) { fprintf(f, " \""); w = fprintf(f, "%s", tab_name[i]); - if (tab_short_name[i][0] != '\0') { + if (tab_short_name && tab_short_name[i][0] != '\0') { w += fprintf(f, ",%s", tab_short_name[i]); } fprintf(f, "\"%*s\"\\0\"\n", 1 + maxw - w, ""); @@ -1930,6 +2243,218 @@ void build_prop_list_table(FILE *f) fprintf(f, "};\n\n"); } +static BOOL is_emoji_hair_color(uint32_t c) +{ + return (c >= 0x1F9B0 && c <= 0x1F9B3); +} + +#define EMOJI_MOD_NONE 0 +#define EMOJI_MOD_TYPE1 1 +#define EMOJI_MOD_TYPE2 2 +#define EMOJI_MOD_TYPE2D 3 + +static BOOL mark_zwj_string(REStringList *sl, uint32_t *buf, int len, int mod_type, int *mod_pos, + int hc_pos, BOOL mark_flag) +{ + REString *p; + int i, n_mod, i0, i1, hc_count, j; + +#if 0 + if (mark_flag) + printf("mod_type=%d\n", mod_type); +#endif + + switch(mod_type) { + case EMOJI_MOD_NONE: + n_mod = 1; + break; + case EMOJI_MOD_TYPE1: + n_mod = 5; + break; + case EMOJI_MOD_TYPE2: + n_mod = 25; + break; + case EMOJI_MOD_TYPE2D: + n_mod = 20; + break; + default: + assert(0); + } + if (hc_pos >= 0) + hc_count = 4; + else + hc_count = 1; + /* check that all the related strings are present */ + for(j = 0; j < hc_count; j++) { + for(i = 0; i < n_mod; i++) { + switch(mod_type) { + case EMOJI_MOD_NONE: + break; + case EMOJI_MOD_TYPE1: + buf[mod_pos[0]] = 0x1f3fb + i; + break; + case EMOJI_MOD_TYPE2: + case EMOJI_MOD_TYPE2D: + i0 = i / 5; + i1 = i % 5; + /* avoid identical values */ + if (mod_type == EMOJI_MOD_TYPE2D && i0 >= i1) + i0++; + buf[mod_pos[0]] = 0x1f3fb + i0; + buf[mod_pos[1]] = 0x1f3fb + i1; + break; + default: + assert(0); + } + + if (hc_pos >= 0) + buf[hc_pos] = 0x1F9B0 + j; + + p = re_string_find(sl, len, buf, FALSE); + if (!p) + return FALSE; + if (mark_flag) + p->flags |= 1; + } + } + return TRUE; +} + +static void zwj_encode_string(DynBuf *dbuf, const uint32_t *buf, int len, int mod_type, int *mod_pos, + int hc_pos) +{ + int i, j; + int c, code; + uint32_t buf1[SEQ_MAX_LEN]; + + j = 0; + for(i = 0; i < len;) { + c = buf[i++]; + if (c >= 0x2000 && c <= 0x2fff) { + code = c - 0x2000; + } else if (c >= 0x1f000 && c <= 0x1ffff) { + code = c - 0x1f000 + 0x1000; + } else { + assert(0); + } + if (i < len && is_emoji_modifier(buf[i])) { + /* modifier */ + code |= (mod_type << 13); + i++; + } + if (i < len && buf[i] == 0xfe0f) { + /* presentation selector present */ + code |= 0x8000; + i++; + } + if (i < len) { + /* zero width join */ + assert(buf[i] == 0x200d); + i++; + } + buf1[j++] = code; + } + dbuf_putc(dbuf, j); + for(i = 0; i < j; i++) { + dbuf_putc(dbuf, buf1[i]); + dbuf_putc(dbuf, buf1[i] >> 8); + } +} + +static void build_rgi_emoji_zwj_sequence(FILE *f, REStringList *sl) +{ + int mod_pos[2], mod_count, hair_color_pos, j, h; + REString *p; + uint32_t buf[SEQ_MAX_LEN]; + DynBuf dbuf; + +#if 0 + { + for(h = 0; h < sl->hash_size; h++) { + for(p = sl->hash_table[h]; p != NULL; p = p->next) { + for(j = 0; j < p->len; j++) + printf(" %04x", p->buf[j]); + printf("\n"); + } + } + exit(0); + } +#endif + // printf("rgi_emoji_zwj_sequence: n=%d\n", sl->n_strings); + + dbuf_init(&dbuf); + + /* avoid duplicating strings with emoji modifiers or hair colors */ + for(h = 0; h < sl->hash_size; h++) { + for(p = sl->hash_table[h]; p != NULL; p = p->next) { + if (p->flags) /* already examined */ + continue; + mod_count = 0; + hair_color_pos = -1; + for(j = 0; j < p->len; j++) { + if (is_emoji_modifier(p->buf[j])) { + assert(mod_count < 2); + mod_pos[mod_count++] = j; + } else if (is_emoji_hair_color(p->buf[j])) { + hair_color_pos = j; + } + buf[j] = p->buf[j]; + } + + if (mod_count != 0 || hair_color_pos >= 0) { + int mod_type; + if (mod_count == 0) + mod_type = EMOJI_MOD_NONE; + else if (mod_count == 1) + mod_type = EMOJI_MOD_TYPE1; + else + mod_type = EMOJI_MOD_TYPE2; + + if (mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, FALSE)) { + mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, TRUE); + } else if (mod_type == EMOJI_MOD_TYPE2) { + mod_type = EMOJI_MOD_TYPE2D; + if (mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, FALSE)) { + mark_zwj_string(sl, buf, p->len, mod_type, mod_pos, hair_color_pos, TRUE); + } else { + dump_str("not_found", (int *)p->buf, p->len); + goto keep; + } + } + if (hair_color_pos >= 0) + buf[hair_color_pos] = 0x1f9b0; + /* encode the string */ + zwj_encode_string(&dbuf, buf, p->len, mod_type, mod_pos, hair_color_pos); + } else { + keep: + zwj_encode_string(&dbuf, buf, p->len, EMOJI_MOD_NONE, NULL, -1); + } + } + } + + /* Encode */ + dump_byte_table(f, "unicode_rgi_emoji_zwj_sequence", dbuf.buf, dbuf.size); + + dbuf_free(&dbuf); +} + +void build_sequence_prop_list_table(FILE *f) +{ + int i; + fprintf(f, "typedef enum {\n"); + for(i = 0; i < SEQUENCE_PROP_COUNT; i++) + fprintf(f, " UNICODE_SEQUENCE_PROP_%s,\n", unicode_sequence_prop_name[i]); + fprintf(f, " UNICODE_SEQUENCE_PROP_COUNT,\n"); + fprintf(f, "} UnicodeSequencePropertyEnum;\n\n"); + + dump_name_table(f, "unicode_sequence_prop_name_table", + unicode_sequence_prop_name, SEQUENCE_PROP_COUNT, NULL); + + dump_byte_table(f, "unicode_rgi_emoji_tag_sequence", rgi_emoji_tag_sequence.buf, rgi_emoji_tag_sequence.size); + + build_rgi_emoji_zwj_sequence(f, &rgi_emoji_zwj_sequence); +} + #ifdef USE_TEST int check_conv(uint32_t *res, uint32_t c, int conv_type) { @@ -3156,6 +3681,8 @@ int main(int argc, char *argv[]) outfilename = argv[arg++]; unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1)); + re_string_list_init(&rgi_emoji_zwj_sequence); + dbuf_init(&rgi_emoji_tag_sequence); snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path); @@ -3190,6 +3717,14 @@ int main(int argc, char *argv[]) unicode_db_path); parse_prop_list(filename); + snprintf(filename, sizeof(filename), "%s/emoji-sequences.txt", + unicode_db_path); + parse_sequence_prop_list(filename); + + snprintf(filename, sizeof(filename), "%s/emoji-zwj-sequences.txt", + unicode_db_path); + parse_sequence_prop_list(filename); + // dump_unicode_data(unicode_db); build_conv_table(unicode_db); @@ -3234,10 +3769,12 @@ int main(int argc, char *argv[]) build_script_table(fo); build_script_ext_table(fo); build_prop_list_table(fo); + build_sequence_prop_list_table(fo); fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n"); fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n", total_tables, total_table_bytes, total_index, total_index_bytes); fclose(fo); } + re_string_list_free(&rgi_emoji_zwj_sequence); return 0; } |