]> git.kaiwu.me - njs.git/commitdiff
Improved processing of invalid surrogate pairs in strings.
author: Alexander Borisov <alexander.borisov@nginx.com>
Tue, 28 May 2019 17:49:58 +0000 (20:49 +0300)
committer: Alexander Borisov <alexander.borisov@nginx.com>
Tue, 28 May 2019 17:49:58 +0000 (20:49 +0300)
Previously, a SyntaxError exception was thrown on invalid surrogate pairs.
Now, all such pairs are converted to the replacement character (U+FFFD).

This closes issue #170 on GitHub.

njs/njs_parser_terminal.c
njs/test/njs_unit_test.c
nxt/nxt_utf8.h

index 7f1122b72a3baf4667f8de0fa2f705fd0cf8e721..3c85ccdac26acc62ceee5775a331bb8f81206942 100644 (file)
@@ -1049,12 +1049,27 @@ njs_parser_escape_string_create(njs_vm_t *vm, njs_parser_t *parser,
         }
 
         if (cp_pair != 0) {
-            cp = njs_string_surrogate_pair(cp_pair, cp);
+            if (nxt_fast_path(cp >= 0xdc00 && cp <= 0xdfff)) {
+                cp = njs_string_surrogate_pair(cp_pair, cp);
+
+            } else if (nxt_slow_path(cp >= 0xd800 && cp <= 0xdbff)) {
+                cp = NXT_UTF8_REPLACEMENT;
+
+                dst = nxt_utf8_encode(dst, (uint32_t) cp);
+
+            } else {
+                dst = nxt_utf8_encode(dst, NXT_UTF8_REPLACEMENT);
+            }
+
             cp_pair = 0;
 
         } else if (cp >= 0xd800 && cp <= 0xdfff) {
-            cp_pair = cp;
-            continue;
+            if (cp <= 0xdbff && src[0] == '\\' && src[1] == 'u') {
+                cp_pair = cp;
+                continue;
+            }
+
+            cp = NXT_UTF8_REPLACEMENT;
         }
 
         dst = nxt_utf8_encode(dst, (uint32_t) cp);
@@ -1183,20 +1198,29 @@ njs_parser_escape_string_calc_length(njs_vm_t *vm, njs_parser_t *parser,
         }
 
         if (cp_pair != 0) {
-            if (nxt_slow_path(cp < 0xdc00 || cp > 0xdfff)) {
-                goto invalid_pair;
+            if (nxt_fast_path(cp >= 0xdc00 && cp <= 0xdfff)) {
+                cp = njs_string_surrogate_pair(cp_pair, cp);
+
+            } else if (nxt_slow_path(cp >= 0xd800 && cp <= 0xdbff)) {
+                cp = NXT_UTF8_REPLACEMENT;
+
+                size += nxt_utf8_size(cp);
+                length++;
+
+            } else {
+                size += nxt_utf8_size(NXT_UTF8_REPLACEMENT);
+                length++;
             }
 
-            cp = njs_string_surrogate_pair(cp_pair, cp);
             cp_pair = 0;
 
         } else if (cp >= 0xd800 && cp <= 0xdfff) {
-            if (nxt_slow_path(cp > 0xdbff || src[0] != '\\' || src[1] != 'u')) {
-                goto invalid_pair;
+            if (cp <= 0xdbff && src[0] == '\\' && src[1] == 'u') {
+                cp_pair = cp;
+                continue;
             }
 
-            cp_pair = cp;
-            continue;
+            cp = NXT_UTF8_REPLACEMENT;
         }
 
         size += nxt_utf8_size(cp);
@@ -1214,11 +1238,4 @@ invalid:
                             njs_parser_text(parser));
 
     return NJS_ERROR;
-
-invalid_pair:
-
-    njs_parser_syntax_error(vm, parser, "Invalid surrogate pair \"%V\"",
-                            njs_parser_text(parser));
-
-    return NJS_ERROR;
 }
index 458dc8ff210aabf06e4caff3fedd4b7c7b84a1cd..921e98a32efae861e0e6d8621906256f1c45adab 100644 (file)
@@ -4448,15 +4448,25 @@ static njs_unit_test_t  njs_test[] =
       nxt_string("1") },
 
     { nxt_string("'\\ud83d abc \\udc4d'"),
-      nxt_string("SyntaxError: Invalid surrogate pair "
-                 "\"\\ud83d abc \\udc4d\" in 1") },
+      nxt_string("� abc �") },
 
     { nxt_string("'\\ud83d'"),
-      nxt_string("SyntaxError: Invalid surrogate pair \"\\ud83d\" in 1") },
+      nxt_string("�") },
 
     { nxt_string("'\\ud83d\\uabcd'"),
-      nxt_string("SyntaxError: Invalid surrogate pair "
-                 "\"\\ud83d\\uabcd\" in 1") },
+      nxt_string("�ꯍ") },
+
+    { nxt_string("'\\u{d800}\\u{dB00}'"),
+      nxt_string("��") },
+
+    { nxt_string("'\\u{d800}\\u{d7ff}'"),
+      nxt_string("�퟿") },
+
+    { nxt_string("'\\u{d800}['"),
+      nxt_string("�[") },
+
+    { nxt_string("'\\u{D800}\\u{'"),
+      nxt_string("SyntaxError: Invalid Unicode code point \"\\u{D800}\\u{\" in 1") },
 
     { nxt_string("''.hasOwnProperty('length')"),
       nxt_string("true") },
index 8362b40a16a312ceccd1e69e0b09b6d803c3bf5d..dc77f02efa8ab689c40cb4b6593634d4758f40cd 100644 (file)
@@ -15,6 +15,8 @@
  */
 #define NXT_UTF8_SORT_INVALID  0x0EEE0EEE
 
+#define NXT_UTF8_REPLACEMENT   0xFFFD
+
 
 NXT_EXPORT u_char *nxt_utf8_encode(u_char *p, uint32_t u);
 NXT_EXPORT uint32_t nxt_utf8_decode(const u_char **start, const u_char *end);