git.kaiwu.me - njs.git/commitdiff
Improved surrogate pairs support for PCRE2 backend.
author Dmitry Volyntsev <xeioex@nginx.com>
Fri, 6 May 2022 03:25:05 +0000 (20:25 -0700)
committer Dmitry Volyntsev <xeioex@nginx.com>
Fri, 6 May 2022 03:25:05 +0000 (20:25 -0700)
In collaboration with Javier Evans.

external/njs_regex.c
src/test/njs_unit_test.c

index 183e52e6efb9269c77b2b5361f9f829cd9425676..2dd893e9d4ef9745ee755a9fde36b0862286161d 100644 (file)
@@ -60,8 +60,26 @@ njs_regex_compile_ctx_t *
 njs_regex_compile_ctx_create(njs_regex_generic_ctx_t *ctx)
 {
 #ifdef NJS_HAVE_PCRE2
+    pcre2_compile_context  *cc;
+
+    cc = pcre2_compile_context_create(ctx);
+
+#ifdef PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
+    if (njs_fast_path(cc != NULL)) {
+        /* Workaround for surrogate pairs in regular expressions
+         *
+         * This option is needed because njs, unlike the standard ECMAScript,
+         * stores and processes strings in UTF-8 encoding.
+         * PCRE2 does not support surrogate pairs by default when it
+         * is compiled for UTF-8 only strings. But many polyfills
+         * and transpilers use such surrogate pairs expressions.
+         */
+        pcre2_set_compile_extra_options(cc,
+                                        PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES);
+    }
+#endif
 
-    return pcre2_compile_context_create(ctx);
+    return cc;
 
 #else
 
index 215f2dd24874ca66dbb5814423a61426b650de08..87d8d46c0c87d4bec5c8cfd53da6aebf153098cf 100644 (file)
@@ -10841,6 +10841,11 @@ static njs_unit_test_t  njs_test[] =
       njs_str("true") },
 #endif
 
+#ifdef PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
+    { njs_str("/\\u200d\\ud800-/"),
+      njs_str("/\\u200d\\ud800-/") },
+#endif
+
     { njs_str("/(\\.(?!com|org)|\\/)/.test('ah.info')"),
       njs_str("true") },