From c18a3a4aa6ca91505badfe67081e81045e028f91 Mon Sep 17 00:00:00 2001
From: Alexander Borisov
Date: Wed, 26 Aug 2020 21:05:46 +0300
Subject: [PATCH] Improved UTF-8 encoding/decoding.

---
 src/njs_encoding.c | 135 ++++++---------------------------------------
 src/njs_parser.c   |  18 ++----
 src/njs_utf8.c     |  97 +++++++++++++++++++-------------
 src/njs_utf8.h     |  37 +++++++++++--
 4 files changed, 115 insertions(+), 172 deletions(-)

diff --git a/src/njs_encoding.c b/src/njs_encoding.c
index d0e60ab0..c68ecfad 100644
--- a/src/njs_encoding.c
+++ b/src/njs_encoding.c
@@ -18,7 +18,6 @@ typedef struct {
     njs_bool_t            fatal;
     njs_bool_t            ignore_bom;
 
-    uint32_t              codepoint;
     njs_unicode_decode_t  ctx;
 } njs_encoding_decode_t;
 
@@ -87,11 +86,10 @@ njs_text_encoder_encode(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
     njs_index_t unused)
 {
     u_char                *dst;
-    int64_t               size;
-    uint32_t              cp;
+    size_t                size;
     njs_int_t             ret;
     njs_value_t           *this, *input, value;
-    const u_char          *p, *start, *end;
+    const u_char          *start, *end;
     njs_string_prop_t     prop;
     njs_typed_array_t     *array;
     njs_unicode_decode_t  ctx;
@@ -126,30 +124,9 @@ njs_text_encoder_encode(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
         end = start + prop.size;
     }
 
-    p = start;
-
-    cp = 0;
-    size = 0;
-
     njs_utf8_decode_init(&ctx);
 
-    while (p < end) {
-        cp = njs_utf8_decode(&ctx, &p, end);
-
-        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
-            if (cp == NJS_UNICODE_CONTINUE) {
-                continue;
-            }
-
-            cp = NJS_UNICODE_REPLACEMENT;
-        }
-
-        size += njs_utf8_size(cp);
-    }
-
-    if (cp == NJS_UNICODE_CONTINUE) {
-        size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
-    }
+    (void) njs_utf8_stream_length(&ctx, start, end - start, 1, 0, &size);
 
     njs_set_number(&value, size);
 
@@ -161,23 +138,7 @@ njs_text_encoder_encode(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
     dst = njs_typed_array_buffer(array)->u.u8;
     njs_utf8_decode_init(&ctx);
 
-    while (start < end) {
-        cp = njs_utf8_decode(&ctx, &start, end);
-
-        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
-            if (cp == NJS_UNICODE_CONTINUE) {
-                continue;
-            }
-
-            cp = NJS_UNICODE_REPLACEMENT;
-        }
-
-        dst = njs_utf8_encode(dst, cp);
-    }
-
-    if (cp == NJS_UNICODE_CONTINUE) {
-        (void) njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
-    }
+    (void) njs_utf8_stream_encode(&ctx, start, end, dst, 1, 0);
 
     njs_set_typed_array(&vm->retval, array);
 
@@ -410,7 +371,6 @@ njs_text_decoder_constructor(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
         return ret;
     }
 
-    data->codepoint = 0;
     njs_utf8_decode_init(&data->ctx);
 
     njs_set_data(&ov->value, data, NJS_DATA_TAG_TEXT_DECODER);
@@ -573,12 +533,12 @@ njs_text_decoder_decode(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
     njs_index_t unused)
 {
     u_char                   *dst;
-    uint32_t                 length, cp;
-    uint64_t                 size;
+    size_t                   size;
+    ssize_t                  length;
     njs_int_t                ret;
     njs_bool_t               stream;
     njs_value_t              retval, *this, *typed_array, *options;
-    const u_char             *start, *end, *p;
+    const u_char             *start, *end;
     njs_unicode_decode_t     ctx;
     njs_encoding_decode_t    *data;
     const njs_typed_array_t  *array;
@@ -632,52 +592,18 @@ njs_text_decoder_decode(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
 
     data = njs_object_data(this);
     ctx = data->ctx;
-    cp = data->codepoint;
-
-    size = 0;
-    length = 0;
-
-    p = start;
 
     /* Looking for BOM. */
 
-    if (!data->ignore_bom && p + 3 <= end) {
-        cp = njs_utf8_decode(&ctx, &p, end);
-
-        if (cp == NJS_UNICODE_BOM) {
-            start = p;
-
-        } else {
-            p = start;
-        }
+    if (!data->ignore_bom) {
+        start += njs_utf8_bom(start, end);
     }
 
-    while (p < end) {
-        cp = njs_utf8_decode(&ctx, &p, end);
-
-        if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
-            if (cp == NJS_UNICODE_CONTINUE) {
-                break;
-            }
-
-            if (data->fatal) {
-                goto fatal;
-            }
-
-            cp = NJS_UNICODE_REPLACEMENT;
-        }
-
-        size += njs_utf8_size(cp);
-        length++;
-    }
-
-    if (cp == NJS_UNICODE_CONTINUE && !stream) {
-        if (data->fatal) {
-            goto fatal;
-        }
-
-        size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
-        length++;
+    length = njs_utf8_stream_length(&ctx, start, end - start, !stream,
+                                    data->fatal, &size);
+    if (length == -1) {
+        njs_type_error(vm, "The encoded data was not valid");
+        return NJS_ERROR;
     }
 
     dst = njs_string_alloc(vm, &vm->retval, size, length);
@@ -685,40 +611,13 @@ njs_text_decoder_decode(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
         return NJS_ERROR;
     }
 
-    while (start < end) {
-        cp = njs_utf8_decode(&data->ctx, &start, end);
-
-        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
-            if (cp == NJS_UNICODE_CONTINUE) {
-                break;
-            }
-
-            cp = NJS_UNICODE_REPLACEMENT;
-        }
-
-        dst = njs_utf8_encode(dst, cp);
-    }
-
-    if (stream) {
-        data->codepoint = cp;
-        return NJS_OK;
-    }
+    (void) njs_utf8_stream_encode(&data->ctx, start, end, dst, !stream, 0);
 
-    if (cp == NJS_UNICODE_CONTINUE) {
-        (void) njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
+    if (!stream) {
+        njs_utf8_decode_init(&data->ctx);
     }
 
-    data->codepoint = 0;
-
-    njs_utf8_decode_init(&data->ctx);
-
     return NJS_OK;
-
-fatal:
-
-    njs_type_error(vm, "The encoded data was not valid");
-
-    return NJS_ERROR;
 }
 
 
diff --git a/src/njs_parser.c b/src/njs_parser.c
index 168ae985..5b806190 100644
--- a/src/njs_parser.c
+++ b/src/njs_parser.c
@@ -7897,15 +7897,16 @@ njs_parser_string_create(njs_vm_t *vm, njs_lexer_token_t *token,
     njs_value_t *value)
 {
     u_char                *dst;
-    ssize_t               size, length;
-    uint32_t              cp;
+    size_t                size, length;
     njs_str_t             *src;
     const u_char          *p, *end;
     njs_unicode_decode_t  ctx;
 
     src = &token->text;
 
-    length = njs_utf8_safe_length(src->start, src->length, &size);
+    njs_utf8_decode_init(&ctx);
+
+    length = njs_utf8_stream_length(&ctx, src->start, src->length, 1, 0, &size);
 
     dst = njs_string_alloc(vm, value, size, length);
     if (njs_slow_path(dst == NULL)) {
@@ -7917,16 +7918,7 @@ njs_parser_string_create(njs_vm_t *vm, njs_lexer_token_t *token,
 
     njs_utf8_decode_init(&ctx);
 
-    while (p < end) {
-        cp = njs_utf8_decode(&ctx, &p, end);
-
-        if (cp <= NJS_UNICODE_MAX_CODEPOINT) {
-            dst = njs_utf8_encode(dst, cp);
-
-        } else {
-            dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
-        }
-    }
+    (void) njs_utf8_stream_encode(&ctx, p, end, dst, 1, 0);
 
     if (length > NJS_STRING_MAP_STRIDE && size != length) {
         njs_string_offset_map_init(value->long_string.data->start, size);
diff --git a/src/njs_utf8.c b/src/njs_utf8.c
index 01cb5506..b4ba5181 100644
--- a/src/njs_utf8.c
+++ b/src/njs_utf8.c
@@ -213,6 +213,43 @@ failed:
     return NJS_UNICODE_ERROR;
 }
 
+
+u_char *
+njs_utf8_stream_encode(njs_unicode_decode_t *ctx, const u_char *start,
+    const u_char *end, u_char *dst, njs_bool_t last, njs_bool_t fatal)
+{
+    uint32_t  cp;
+
+    while (start < end) {
+        cp = njs_utf8_decode(ctx, &start, end);
+
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            if (cp == NJS_UNICODE_CONTINUE) {
+                break;
+            }
+
+            if (fatal) {
+                return NULL;
+            }
+
+            cp = NJS_UNICODE_REPLACEMENT;
+        }
+
+        dst = njs_utf8_encode(dst, cp);
+    }
+
+    if (last && ctx->need != 0x00) {
+        if (fatal) {
+            return NULL;
+        }
+
+        dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
+    }
+
+    return dst;
+}
+
+
 /*
  * njs_utf8_casecmp() tests only up to the minimum of given lengths, but
  * requires lengths of both strings because otherwise njs_utf8_decode()
@@ -314,57 +351,43 @@ njs_utf8_upper_case(const u_char **start, const u_char *end)
 
 
 ssize_t
-njs_utf8_length(const u_char *p, size_t len)
+njs_utf8_stream_length(njs_unicode_decode_t *ctx, const u_char *p, size_t len,
+    njs_bool_t last, njs_bool_t fatal, size_t *out_size)
 {
-    ssize_t               length;
-    const u_char          *end;
-    njs_unicode_decode_t  ctx;
+    size_t        size, length;
+    uint32_t      codepoint;
+    const u_char  *end;
 
+    size = 0;
     length = 0;
 
     end = p + len;
 
-    njs_utf8_decode_init(&ctx);
-
     while (p < end) {
-        if (njs_slow_path(njs_utf8_decode(&ctx, &p, end)
-                          > NJS_UNICODE_MAX_CODEPOINT))
-        {
-            return -1;
-        }
-
-        length++;
-    }
-
-    return length;
-}
+        codepoint = njs_utf8_decode(ctx, &p, end);
 
+        if (codepoint > NJS_UNICODE_MAX_CODEPOINT) {
+            if (codepoint == NJS_UNICODE_CONTINUE) {
+                break;
+            }
 
-ssize_t
-njs_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size)
-{
-    ssize_t               size, length;
-    uint32_t              codepoint;
-    const u_char          *end;
-    njs_unicode_decode_t  ctx;
-
-    size = 0;
-    length = 0;
-
-    end = p + len;
-
-    njs_utf8_decode_init(&ctx);
+            if (fatal) {
+                return -1;
+            }
 
-    while (p < end) {
-        codepoint = njs_utf8_decode(&ctx, &p, end);
+            codepoint = NJS_UNICODE_REPLACEMENT;
+        }
 
-        if (codepoint <= NJS_UNICODE_MAX_CODEPOINT) {
-            size += njs_utf8_size(codepoint);
+        size += njs_utf8_size(codepoint);
+        length++;
+    }
 
-        } else {
-            size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
+    if (last && ctx->need != 0x00) {
+        if (fatal) {
+            return -1;
         }
 
+        size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
         length++;
     }
 
diff --git a/src/njs_utf8.h b/src/njs_utf8.h
index 8e019cfd..26c23dbb 100644
--- a/src/njs_utf8.h
+++ b/src/njs_utf8.h
@@ -8,18 +8,21 @@
 #define _NJS_UTF8_H_INCLUDED_
 
 
-NJS_EXPORT u_char *njs_utf8_encode(u_char *p, uint32_t u);
 NJS_EXPORT uint32_t njs_utf8_decode(njs_unicode_decode_t *ctx,
     const u_char **data, const u_char *end);
+NJS_EXPORT u_char *njs_utf8_encode(u_char *p, uint32_t u);
+NJS_EXPORT u_char *njs_utf8_stream_encode(njs_unicode_decode_t *ctx,
+    const u_char *start, const u_char *end, u_char *dst, njs_bool_t last,
+    njs_bool_t fatal);
 NJS_EXPORT njs_int_t njs_utf8_casecmp(const u_char *start1,
     const u_char *start2, size_t len1, size_t len2);
 NJS_EXPORT uint32_t njs_utf8_lower_case(const u_char **start,
     const u_char *end);
 NJS_EXPORT uint32_t njs_utf8_upper_case(const u_char **start,
     const u_char *end);
-NJS_EXPORT ssize_t njs_utf8_length(const u_char *p, size_t len);
-NJS_EXPORT ssize_t njs_utf8_safe_length(const u_char *p, size_t len,
-    ssize_t *out_size);
+NJS_EXPORT ssize_t njs_utf8_stream_length(njs_unicode_decode_t *ctx,
+    const u_char *p, size_t len, njs_bool_t last, njs_bool_t fatal,
+    size_t *out_size);
 NJS_EXPORT njs_bool_t njs_utf8_is_valid(const u_char *p, size_t len);
 
 
@@ -119,6 +122,32 @@ njs_utf8_decode_init(njs_unicode_decode_t *ctx)
 }
 
 
+njs_inline ssize_t
+njs_utf8_length(const u_char *p, size_t len)
+{
+    njs_unicode_decode_t  ctx;
+
+    njs_utf8_decode_init(&ctx);
+
+    return njs_utf8_stream_length(&ctx, p, len, 1, 1, NULL);
+}
+
+
+njs_inline size_t
+njs_utf8_bom(const u_char *start, const u_char *end)
+{
+    if (start + 3 > end) {
+        return 0;
+    }
+
+    if (start[0] == 0xEF && start[1] == 0xBB && start[2] == 0xBF) {
+        return 3;
+    }
+
+    return 0;
+}
+
+
 njs_inline size_t
 njs_utf8_size(uint32_t cp)
 {
-- 
2.47.3