From: Dmitry Volyntsev Date: Thu, 15 Sep 2022 05:14:50 +0000 (-0700) Subject: Parser: properly handling unicode space characters. X-Git-Tag: 0.7.8~29 X-Git-Url: http://www.kaiwu.me/postgresql/commit/static/gitweb.js?a=commitdiff_plain;h=af46e2733a0c87635af400a83114811d8497eb58;p=njs.git Parser: properly handling unicode space characters. --- diff --git a/src/njs_lexer.c b/src/njs_lexer.c index 7c1a8270..91b08e94 100644 --- a/src/njs_lexer.c +++ b/src/njs_lexer.c @@ -45,8 +45,8 @@ static const uint8_t njs_tokens[256] njs_aligned(64) = { NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL, /* \t */ NJS_TOKEN_ILLEGAL, NJS_TOKEN_SPACE, - /* \n */ NJS_TOKEN_LINE_END, NJS_TOKEN_ILLEGAL, - /* \r */ NJS_TOKEN_ILLEGAL, NJS_TOKEN_SPACE, + /* \n */ NJS_TOKEN_LINE_END, NJS_TOKEN_SPACE, + /* \r */ NJS_TOKEN_SPACE, NJS_TOKEN_SPACE, NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL, /* 0x10 */ NJS_TOKEN_ILLEGAL, NJS_TOKEN_ILLEGAL, @@ -437,15 +437,38 @@ njs_lexer_consume_token(njs_lexer_t *lexer, unsigned length) njs_int_t njs_lexer_make_token(njs_lexer_t *lexer, njs_lexer_token_t *token) { - u_char c, *p; + u_char c, *p; + uint32_t cp; + njs_unicode_decode_t ctx; c = ' '; + njs_utf8_decode_init(&ctx); + while (lexer->start < lexer->end) { - c = *lexer->start++; + c = *lexer->start; - if (njs_tokens[c] != NJS_TOKEN_SPACE) { - break; + if (njs_fast_path(!(c & 0x80))) { + lexer->start++; + + if (njs_tokens[c] != NJS_TOKEN_SPACE) { + break; + } + + } else { + + /* Unicode. 
*/ + + cp = njs_utf8_decode(&ctx, (const u_char **) &lexer->start, + lexer->end); + if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) { + c = '\0'; + break; + } + + if (!njs_utf8_is_whitespace(cp)) { + break; + } } } diff --git a/src/njs_str.h b/src/njs_str.h index ae4dd082..8c7e60b0 100644 --- a/src/njs_str.h +++ b/src/njs_str.h @@ -51,7 +51,6 @@ njs_is_whitespace(u_char c) case 0x0C: /* <FF> */ case 0x0D: /* <CR> */ case 0x20: /* <SP> */ - case 0xA0: /* <NBSP> */ return 1; default: diff --git a/src/test/njs_unit_test.c b/src/test/njs_unit_test.c index b42d0123..74a82d50 100644 --- a/src/test/njs_unit_test.c +++ b/src/test/njs_unit_test.c @@ -7341,6 +7341,11 @@ static njs_unit_test_t njs_test[] = "[a.length, a[33], a[34]]"), njs_str("35,a,�") }, + /* Spaces: U+0009U+000BU+000CU+0020U+00A0U+000AU+000DU+2028U+2029 */ + + { njs_str("\x09\x0a\x0b\x0c\x0d \xc2\xa0'a'\xe2\x80\xa8+\xe2\x80\xa9'b'"), + njs_str("ab") }, + /* Escape strings. */ { njs_str("'\\a \\' \\\" \\\\ \\0 \\b \\f \\n \\r \\t \\v'"),