diff options
author | Charlie Gordon <github@chqrlie.org> | 2024-05-05 12:10:24 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-05 12:10:24 +0200 |
commit | 1402478d8d280a1a62dfb76327dd569d6307a025 (patch) | |
tree | e9f4f938b677b069db88d1004f3dcb4ab3d88755 /unicode_gen.c | |
parent | 3b45d155c77bbdfe9177b1e03db830d2aff0b2a8 (diff) | |
download | quickjs-1402478d8d280a1a62dfb76327dd569d6307a025.tar.gz quickjs-1402478d8d280a1a62dfb76327dd569d6307a025.zip |
Improve unicode table handling (#286)
- Document table and index formats
- Add size statistics
- Fix UBSAN issue in `get_le24()`
Fixes #285
Diffstat (limited to 'unicode_gen.c')
-rw-r--r-- | unicode_gen.c | 116 |
1 files changed, 86 insertions, 30 deletions
diff --git a/unicode_gen.c b/unicode_gen.c index 9a7babb..14811ef 100644 --- a/unicode_gen.c +++ b/unicode_gen.c @@ -33,6 +33,11 @@ #include "cutils.h" +uint32_t total_tables; +uint32_t total_table_bytes; +uint32_t total_index; +uint32_t total_index_bytes; + /* define it to be able to test unicode.c */ //#define USE_TEST /* profile tests */ @@ -1328,7 +1333,9 @@ void dump_case_conv_table(FILE *f) uint32_t v; const TableEntry *te; - fprintf(f, "static const uint32_t case_conv_table1[%u] = {", conv_table_len); + total_tables++; + total_table_bytes += conv_table_len * sizeof(uint32_t); + fprintf(f, "static const uint32_t case_conv_table1[%d] = {", conv_table_len); for(i = 0; i < conv_table_len; i++) { if (i % 4 == 0) fprintf(f, "\n "); @@ -1341,7 +1348,9 @@ void dump_case_conv_table(FILE *f) } fprintf(f, "\n};\n\n"); - fprintf(f, "static const uint8_t case_conv_table2[%u] = {", conv_table_len); + total_tables++; + total_table_bytes += conv_table_len; + fprintf(f, "static const uint8_t case_conv_table2[%d] = {", conv_table_len); for(i = 0; i < conv_table_len; i++) { if (i % 8 == 0) fprintf(f, "\n "); @@ -1350,7 +1359,9 @@ void dump_case_conv_table(FILE *f) } fprintf(f, "\n};\n\n"); - fprintf(f, "static const uint16_t case_conv_ext[%u] = {", ext_data_len); + total_tables++; + total_table_bytes += ext_data_len * sizeof(uint16_t); + fprintf(f, "static const uint16_t case_conv_ext[%d] = {", ext_data_len); for(i = 0; i < ext_data_len; i++) { if (i % 8 == 0) fprintf(f, "\n "); @@ -1470,6 +1481,9 @@ void compute_internal_props(void) void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len) { int i; + + total_tables++; + total_table_bytes += len; fprintf(f, "static const uint8_t %s[%d] = {", cname, len); for(i = 0; i < len; i++) { if (i % 8 == 0) @@ -1479,9 +1493,26 @@ void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len) fprintf(f, "\n};\n\n"); } +void dump_index_table(FILE *f, const char *cname, const uint8_t *tab, int len) +{ + int i, code, offset; + + total_index++; + total_index_bytes += len; + fprintf(f, "static const uint8_t %s[%d] = {\n", cname, len); + for(i = 0; i < len; i += 3) { + code = tab[i] + (tab[i+1] << 8) + ((tab[i+2] & 0x1f) << 16); + offset = ((i / 3) + 1) * 32 + (tab[i+2] >> 5); + fprintf(f, " 0x%02x, 0x%02x, 0x%02x,", tab[i], tab[i+1], tab[i+2]); + fprintf(f, " // %6.5X at %d%s\n", code, offset, + i == len - 3 ? " (upper bound)" : ""); + } + fprintf(f, "};\n\n"); +} + #define PROP_BLOCK_LEN 32 -void build_prop_table(FILE *f, int prop_index, BOOL add_index) +void build_prop_table(FILE *f, const char *name, int prop_index, BOOL add_index) { int i, j, n, v, offset, code; DynBuf dbuf_s, *dbuf = &dbuf_s; @@ -1533,6 +1564,14 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index) block_end_pos += PROP_BLOCK_LEN; } + /* Compressed byte encoding: + 00..3F: 2 packed lengths: 3-bit + 3-bit + 40..5F: 5-bits plus extra byte for length + 60..7F: 5-bits plus 2 extra bytes for length + 80..FF: 7-bit length + lengths must be incremented to get character count + Ranges alternate between false and true return value. + */ v = buf[i]; code += v + 1; bit ^= 1; @@ -1573,7 +1612,7 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index) dump_byte_table(f, cname, dbuf->buf, dbuf->size); if (add_index) { snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]); - dump_byte_table(f, cname, dbuf2->buf, dbuf2->size); + dump_index_table(f, cname, dbuf2->buf, dbuf2->size); } dbuf_free(dbuf); @@ -1583,10 +1622,10 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index) void build_flags_tables(FILE *f) { - build_prop_table(f, PROP_Cased1, TRUE); - build_prop_table(f, PROP_Case_Ignorable, TRUE); - build_prop_table(f, PROP_ID_Start, TRUE); - build_prop_table(f, PROP_ID_Continue1, TRUE); + build_prop_table(f, "Cased1", PROP_Cased1, TRUE); + build_prop_table(f, "Case_Ignorable", PROP_Case_Ignorable, TRUE); + build_prop_table(f, "ID_Start", PROP_ID_Start, TRUE); + build_prop_table(f, "ID_Continue1", PROP_ID_Continue1, TRUE); } void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len, @@ -1845,7 +1884,7 @@ void build_prop_list_table(FILE *f) i == PROP_ID_Continue1) { /* already generated */ } else { - build_prop_table(f, i, FALSE); + build_prop_table(f, unicode_prop_name[i], i, FALSE); } } @@ -1997,6 +2036,8 @@ void check_flags(void) void build_cc_table(FILE *f) { + // Compress combining class table + // see: https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values int i, cc, n, type, n1, block_end_pos; DynBuf dbuf_s, *dbuf = &dbuf_s; DynBuf dbuf1_s, *dbuf1 = &dbuf1_s; @@ -2055,6 +2096,13 @@ void build_cc_table(FILE *f) #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) cw_start = dbuf->size; #endif + /* Compressed run length encoding: + - 2 high order bits are combining class type + - 0:0, 1:230, 2:extra byte linear progression, 3:extra byte + - 00..2F: range length (add 1) + - 30..37: 3-bit range-length + 1 extra byte + - 38..3F: 3-bit range-length + 2 extra byte + */ if (n1 < 48) { dbuf_putc(dbuf, n1 | (type << 6)); } else if (n1 < 48 + (1 << 11)) { @@ -2084,7 +2132,7 @@ void build_cc_table(FILE *f) dbuf_putc(dbuf1, v >> 16); dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size); - dump_byte_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size); + dump_index_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size); #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) printf("CC table: size=%d (%d entries) [", @@ -2765,8 +2813,9 @@ void build_decompose_table(FILE *f) } #endif - fprintf(f, "static const uint32_t unicode_decomp_table1[%u] = {", - array_len); + total_tables++; + total_table_bytes += array_len * sizeof(uint32_t); + fprintf(f, "static const uint32_t unicode_decomp_table1[%d] = {", array_len); count = 0; for(i = 0; i <= code_max; i++) { de = &tab_de[i]; @@ -2784,8 +2833,9 @@ void build_decompose_table(FILE *f) } fprintf(f, "\n};\n\n"); - fprintf(f, "static const uint16_t unicode_decomp_table2[%u] = {", - array_len); + total_tables++; + total_table_bytes += array_len * sizeof(uint16_t); + fprintf(f, "static const uint16_t unicode_decomp_table2[%d] = {", array_len); count = 0; for(i = 0; i <= code_max; i++) { de = &tab_de[i]; @@ -2798,8 +2848,9 @@ void build_decompose_table(FILE *f) } fprintf(f, "\n};\n\n"); - fprintf(f, "static const uint8_t unicode_decomp_data[%u] = {", - data_len); + total_tables++; + total_table_bytes += data_len; + fprintf(f, "static const uint8_t unicode_decomp_data[%d] = {", data_len); for(i = 0; i < data_len; i++) { if (i % 8 == 0) fprintf(f, "\n "); @@ -2890,8 +2941,9 @@ void build_compose_table(FILE *f, const DecompEntry *tab_de) } #endif - fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", - tab_ce_len); + total_tables++; + total_table_bytes += tab_ce_len * sizeof(uint16_t); + fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", tab_ce_len); for(i = 0; i < tab_ce_len; i++) { if (i % 8 == 0) fprintf(f, "\n "); @@ -3066,22 +3118,24 @@ void normalization_test(const char *filename) } #endif -int main(int argc, char **argv) +int main(int argc, char *argv[]) { const char *unicode_db_path, *outfilename; char filename[1024]; - - if (argc < 2) { - printf("usage: %s unicode_db_path [output_file]\n" - "\n" - "If no output_file is given, a self test is done using the current unicode library\n", - argv[0]); - exit(1); + int arg = 1; + + if (arg >= argc || (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help"))) { + printf("usage: %s PATH [OUTPUT]\n" + " PATH path to the Unicode database directory\n" + " OUTPUT name of the output file. If omitted, a self test is performed\n" + " using the files from the Unicode library\n" + , argv[0]); + return 1; } - unicode_db_path = argv[1]; + unicode_db_path = argv[arg++]; outfilename = NULL; - if (argc >= 3) - outfilename = argv[2]; + if (arg < argc) + outfilename = argv[arg++]; unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1)); @@ -3163,6 +3217,8 @@ int main(int argc, char **argv) build_script_ext_table(fo); build_prop_list_table(fo); fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n"); + fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n", + total_tables, total_table_bytes, total_index, total_index_bytes); fclose(fo); } return 0; |