diff options
Diffstat (limited to 'libunicode.c')
-rw-r--r-- | libunicode.c | 215 |
1 files changed, 213 insertions, 2 deletions
diff --git a/libunicode.c b/libunicode.c index d1bf1e9..b4a0206 100644 --- a/libunicode.c +++ b/libunicode.c @@ -499,6 +499,9 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, case CR_OP_XOR: is_in = (a_idx & 1) ^ (b_idx & 1); break; + case CR_OP_SUB: + is_in = (a_idx & 1) & ((b_idx & 1) ^ 1); /* set when bit 0 of a_idx is 1 and bit 0 of b_idx is 0 */ + break; default: abort(); } @@ -511,14 +514,14 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, return 0; } -int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len) +int cr_op1(CharRange *cr, const uint32_t *b_pt, int b_len, int op) /* in-place variant of cr_op(): the old contents of 'cr' are moved to 'a' and used as the first operand, 'cr' is emptied to receive the result, then 'a' is freed; returns the cr_op() result */ { CharRange a = *cr; int ret; cr->len = 0; cr->size = 0; cr->points = NULL; - ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION); + ret = cr_op(cr, a.points, a.len, b_pt, b_len, op); cr_free(&a); return ret; } @@ -1554,6 +1557,7 @@ static int unicode_prop_ops(CharRange *cr, ...) cr2 = &stack[stack_len - 1]; cr3 = &stack[stack_len++]; cr_init(cr3, cr->mem_opaque, cr->realloc_func); + /* CR_OP_XOR may be used here */ if (cr_op(cr3, cr1->points, cr1->len, cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION)) goto fail; @@ -1908,3 +1912,210 @@ BOOL lre_is_space_non_ascii(uint32_t c) } return FALSE; } + +#define SEQ_MAX_LEN 16 + +static int unicode_sequence_prop1(int seq_prop_idx, UnicodeSequencePropCB *cb, void *opaque, + CharRange *cr) +{ /* calls cb(opaque, seq, len) once for every sequence generated for 'seq_prop_idx'; 'cr' is scratch storage filled by unicode_prop1(). Returns 0 on success, -1 when unicode_prop1() fails, -2 when 'seq_prop_idx' matches no case. */ + int i, c, j; + uint32_t seq[SEQ_MAX_LEN]; /* NOTE(review): writes to seq[] in the ZWJ and Tag cases are not checked against SEQ_MAX_LEN; presumably the generated tables guarantee short entries - confirm */ + + switch(seq_prop_idx) { + case UNICODE_SEQUENCE_PROP_Basic_Emoji: + if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji1) < 0) + return -1; + for(i = 0; i < cr->len; i += 2) { + for(c = cr->points[i]; c < cr->points[i + 1]; c++) { + seq[0] = c; + cb(opaque, seq, 1); + } + } + + cr->len = 0; + + if (unicode_prop1(cr, UNICODE_PROP_Basic_Emoji2) < 0) + return -1; + for(i = 0; i < cr->len; i += 2) { + for(c = cr->points[i]; c < cr->points[i + 1]; c++) { + seq[0] = c; + seq[1] = 0xfe0f; + cb(opaque, seq, 2); + } + } + + break; + case UNICODE_SEQUENCE_PROP_RGI_Emoji_Modifier_Sequence: + if (unicode_prop1(cr, UNICODE_PROP_Emoji_Modifier_Base) < 
0) + return -1; + for(i = 0; i < cr->len; i += 2) { + for(c = cr->points[i]; c < cr->points[i + 1]; c++) { + for(j = 0; j < 5; j++) { + seq[0] = c; + seq[1] = 0x1f3fb + j; + cb(opaque, seq, 2); + } + } + } + break; + case UNICODE_SEQUENCE_PROP_RGI_Emoji_Flag_Sequence: + if (unicode_prop1(cr, UNICODE_PROP_RGI_Emoji_Flag_Sequence) < 0) + return -1; + for(i = 0; i < cr->len; i += 2) { + for(c = cr->points[i]; c < cr->points[i + 1]; c++) { + int c0, c1; /* 'c' is split into quotient and remainder by 26; each one is added to 0x1F1E6 */ + c0 = c / 26; + c1 = c % 26; + seq[0] = 0x1F1E6 + c0; + seq[1] = 0x1F1E6 + c1; + cb(opaque, seq, 2); + } + } + break; + case UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence: + { + int len, code, pres, k, mod, mod_count, mod_pos[2], hc_pos, n_mod, n_hc, mod1; + int mod_idx, hc_idx, i0, i1; + const uint8_t *tab = unicode_rgi_emoji_zwj_sequence; + + for(i = 0; i < countof(unicode_rgi_emoji_zwj_sequence);) { + len = tab[i++]; /* first byte of a record: number of 2-byte elements that follow */ + k = 0; + mod = 0; + mod_count = 0; + hc_pos = -1; + for(j = 0; j < len; j++) { + code = tab[i++]; + code |= tab[i++] << 8; /* 16-bit little-endian element: bit 15 -> 'pres', bits 13-14 -> 'mod1', bits 0-12 -> code point index */ + pres = code >> 15; + mod1 = (code >> 13) & 3; + code &= 0x1fff; + if (code < 0x1000) { + c = code + 0x2000; + } else { + c = 0x1f000 + (code - 0x1000); + } + if (c == 0x1f9b0) + hc_pos = k; + seq[k++] = c; + if (mod1 != 0) { + assert(mod_count < 2); + mod = mod1; + mod_pos[mod_count++] = k; + seq[k++] = 0; /* will be filled later */ + } + if (pres) { + seq[k++] = 0xfe0f; + } + if (j < len - 1) { /* 0x200d is inserted between elements, not after the last one */ + seq[k++] = 0x200d; + } + } + + /* generate all the variants: n_hc values for the slot at hc_pos times n_mod values for the slots in mod_pos[] */ + switch(mod) { + case 1: + n_mod = 5; + break; + case 2: + n_mod = 25; + break; + case 3: + n_mod = 20; + break; + default: + n_mod = 1; + break; + } + if (hc_pos >= 0) + n_hc = 4; + else + n_hc = 1; + for(hc_idx = 0; hc_idx < n_hc; hc_idx++) { + for(mod_idx = 0; mod_idx < n_mod; mod_idx++) { + if (hc_pos >= 0) + seq[hc_pos] = 0x1f9b0 + hc_idx; + + switch(mod) { + case 1: + seq[mod_pos[0]] = 0x1f3fb + mod_idx; + break; + case 2: + case 3: + i0 = mod_idx / 5; + i1 = mod_idx % 5; + /* avoid identical values */ + if (mod == 3 
&& i0 >= i1) + i0++; + seq[mod_pos[0]] = 0x1f3fb + i0; + seq[mod_pos[1]] = 0x1f3fb + i1; + break; + default: + break; + } +#if 0 + for(j = 0; j < k; j++) + printf(" %04x", seq[j]); + printf("\n"); +#endif + cb(opaque, seq, k); + } + } + } + } + break; + case UNICODE_SEQUENCE_PROP_RGI_Emoji_Tag_Sequence: + { + for(i = 0; i < countof(unicode_rgi_emoji_tag_sequence);) { + j = 0; + seq[j++] = 0x1F3F4; + for(;;) { + c = unicode_rgi_emoji_tag_sequence[i++]; /* a zero byte ends the current table entry; other bytes are offsets added to 0xe0000 */ + if (c == 0x00) + break; + seq[j++] = 0xe0000 + c; + } + seq[j++] = 0xe007f; + cb(opaque, seq, j); + } + } + break; + case UNICODE_SEQUENCE_PROP_Emoji_Keycap_Sequence: + if (unicode_prop1(cr, UNICODE_PROP_Emoji_Keycap_Sequence) < 0) + return -1; + for(i = 0; i < cr->len; i += 2) { + for(c = cr->points[i]; c < cr->points[i + 1]; c++) { + seq[0] = c; + seq[1] = 0xfe0f; + seq[2] = 0x20e3; + cb(opaque, seq, 3); + } + } + break; + case UNICODE_SEQUENCE_PROP_RGI_Emoji: + /* all previous sequences. NOTE(review): relies on the enum values from Basic_Emoji to RGI_Emoji_ZWJ_Sequence being contiguous and not including RGI_Emoji itself - confirm against the enum declaration */ + for(i = UNICODE_SEQUENCE_PROP_Basic_Emoji; i <= UNICODE_SEQUENCE_PROP_RGI_Emoji_ZWJ_Sequence; i++) { + int ret; + ret = unicode_sequence_prop1(i, cb, opaque, cr); + if (ret < 0) + return ret; + cr->len = 0; + } + break; + default: + return -2; + } + return 0; +} + +/* build a unicode sequence property: 'prop_name' is looked up in unicode_sequence_prop_name_table and each sequence of the matching property is passed to 'cb' */ +/* return -2 if not found, -1 if other error. 'cr' is used as temporary memory. */ +int unicode_sequence_prop(const char *prop_name, UnicodeSequencePropCB *cb, void *opaque, + CharRange *cr) +{ + int seq_prop_idx; + seq_prop_idx = unicode_find_name(unicode_sequence_prop_name_table, prop_name); + if (seq_prop_idx < 0) + return -2; + return unicode_sequence_prop1(seq_prop_idx, cb, opaque, cr); +} |