diff options
author | Charlie Gordon <github@chqrlie.org> | 2024-05-05 12:10:24 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-05 12:10:24 +0200 |
commit | 1402478d8d280a1a62dfb76327dd569d6307a025 (patch) | |
tree | e9f4f938b677b069db88d1004f3dcb4ab3d88755 /libunicode.c | |
parent | 3b45d155c77bbdfe9177b1e03db830d2aff0b2a8 (diff) | |
download | quickjs-1402478d8d280a1a62dfb76327dd569d6307a025.tar.gz quickjs-1402478d8d280a1a62dfb76327dd569d6307a025.zip |
Improve unicode table handling (#286)
- Document table and index formats
- Add size statistics
- Fix UBSAN issue in `get_le24()`
Fixes #285
Diffstat (limited to 'libunicode.c')
-rw-r--r-- | libunicode.c | 36 |
1 file changed, 32 insertions, 4 deletions
```diff
diff --git a/libunicode.c b/libunicode.c
index 4200252..a631bbd 100644
--- a/libunicode.c
+++ b/libunicode.c
@@ -262,11 +262,7 @@ int lre_canonicalize(uint32_t c, BOOL is_unicode)
 
 static uint32_t get_le24(const uint8_t *ptr)
 {
-#if defined(__x86__) || defined(__x86_64__)
-    return *(uint16_t *)ptr | (ptr[2] << 16);
-#else
     return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
-#endif
 }
 
 #define UNICODE_INDEX_BLOCK_LEN 32
@@ -317,6 +313,14 @@ static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
         return FALSE; /* outside the table */
     p = table + pos;
     bit = 0;
+    /* Compressed run length encoding:
+       00..3F: 2 packed lengths: 3-bit + 3-bit
+       40..5F: 5-bits plus extra byte for length
+       60..7F: 5-bits plus 2 extra bytes for length
+       80..FF: 7-bit length
+       lengths must be incremented to get character count
+       Ranges alternate between false and true return value.
+     */
     for(;;) {
         b = *p++;
         if (b < 64) {
@@ -833,6 +837,13 @@ static int unicode_get_cc(uint32_t c)
     if (pos < 0)
         return 0;
     p = unicode_cc_table + pos;
+    /* Compressed run length encoding:
+       - 2 high order bits are combining class type
+       -         0:0, 1:230, 2:extra byte linear progression, 3:extra byte
+       - 00..2F: range length (add 1)
+       - 30..37: 3-bit range-length + 1 extra byte
+       - 38..3F: 3-bit range-length + 2 extra byte
+     */
     for(;;) {
         b = *p++;
         type = b >> 6;
@@ -1185,6 +1196,15 @@ static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
     p = unicode_gc_table;
     p_end = unicode_gc_table + countof(unicode_gc_table);
     c = 0;
+    /* Compressed range encoding:
+       initial byte:
+       bits 0..4: category number (special case 31)
+       bits 5..7: range length (add 1)
+       special case bits 5..7 == 7: read an extra byte
+       - 00..7F: range length (add 7 + 1)
+       - 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
+       - C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
+     */
     while (p < p_end) {
         b = *p++;
         n = b >> 5;
@@ -1238,6 +1258,14 @@ static int unicode_prop1(CharRange *cr, int prop_idx)
     p_end = p + unicode_prop_len_table[prop_idx];
     c = 0;
     bit = 0;
+    /* Compressed range encoding:
+       00..3F: 2 packed lengths: 3-bit + 3-bit
+       40..5F: 5-bits plus extra byte for length
+       60..7F: 5-bits plus 2 extra bytes for length
+       80..FF: 7-bit length
+       lengths must be incremented to get character count
+       Ranges alternate between false and true return value.
+     */
     while (p < p_end) {
         c0 = c;
         b = *p++;
```