git.kaiwu.me - njs.git/commitdiff
Parser: properly handling unicode space characters.
author: Dmitry Volyntsev <xeioex@nginx.com>
Thu, 15 Sep 2022 05:14:50 +0000 (22:14 -0700)
committer: Dmitry Volyntsev <xeioex@nginx.com>
Thu, 15 Sep 2022 05:14:50 +0000 (22:14 -0700)
src/njs_lexer.c
src/njs_str.h
src/test/njs_unit_test.c

index 7c1a8270e848e91488dea4d8160936e22d6209a9..91b08e94155ff447aa5370e0a49dc3d79003d6b4 100644 (file)
@@ -45,8 +45,8 @@ static const uint8_t  njs_tokens[256]  njs_aligned(64) = {
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
     /* \t */    NJS_TOKEN_ILLEGAL,           NJS_TOKEN_SPACE,
-    /* \n */    NJS_TOKEN_LINE_END,          NJS_TOKEN_ILLEGAL,
-    /* \r */    NJS_TOKEN_ILLEGAL,           NJS_TOKEN_SPACE,
+    /* \n */    NJS_TOKEN_LINE_END,          NJS_TOKEN_SPACE,
+    /* \r */    NJS_TOKEN_SPACE,             NJS_TOKEN_SPACE,
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
 
     /* 0x10 */  NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
@@ -437,15 +437,38 @@ njs_lexer_consume_token(njs_lexer_t *lexer, unsigned length)
 njs_int_t
 njs_lexer_make_token(njs_lexer_t *lexer, njs_lexer_token_t *token)
 {
-    u_char  c, *p;
+    u_char                c, *p;
+    uint32_t              cp;
+    njs_unicode_decode_t  ctx;
 
     c = ' ';
 
+    njs_utf8_decode_init(&ctx);
+
     while (lexer->start < lexer->end) {
-        c = *lexer->start++;
+        c = *lexer->start;
 
-        if (njs_tokens[c] != NJS_TOKEN_SPACE) {
-            break;
+        if (njs_fast_path(!(c & 0x80))) {
+            lexer->start++;
+
+            if (njs_tokens[c] != NJS_TOKEN_SPACE) {
+                break;
+            }
+
+        } else {
+
+            /* Unicode. */
+
+            cp = njs_utf8_decode(&ctx, (const u_char **) &lexer->start,
+                                 lexer->end);
+            if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
+                c = '\0';
+                break;
+            }
+
+            if (!njs_utf8_is_whitespace(cp)) {
+                break;
+            }
         }
     }
 
index ae4dd082e6b1bd2dab746bf9dee433277f5a00dd..8c7e60b066079dbfe0dfa074ba9ea703b84a736f 100644 (file)
@@ -51,7 +51,6 @@ njs_is_whitespace(u_char c)
     case 0x0C:  /* <FF>   */
     case 0x0D:  /* <CR>   */
     case 0x20:  /* <SP>   */
-    case 0xA0:  /* <NBSP> */
         return 1;
 
     default:
index b42d0123a4d4da232b4526a57766128b57926113..74a82d5003f18658b9a90a91569a7a540ed2897e 100644 (file)
@@ -7341,6 +7341,11 @@ static njs_unit_test_t  njs_test[] =
                  "[a.length, a[33], a[34]]"),
       njs_str("35,a,�") },
 
+    /* Spaces: U+0009U+000BU+000CU+0020U+00A0U+000AU+000DU+2028U+2029 */
+
+    { njs_str("\x09\x0a\x0b\x0c\x0d \xc2\xa0'a'\xe2\x80\xa8+\xe2\x80\xa9'b'"),
+      njs_str("ab") },
+
     /* Escape strings. */
 
     { njs_str("'\\a \\' \\\" \\\\ \\0 \\b \\f \\n \\r \\t \\v'"),