From 99239ae377a71a4c79213f4be69d828c399e23da Mon Sep 17 00:00:00 2001 From: Dmitry Volyntsev Date: Thu, 5 May 2022 20:25:05 -0700 Subject: [PATCH] Improved surrogate pairs support for PCRE2 backend. In collaboration with Javier Evans. --- external/njs_regex.c | 20 +++++++++++++++++++- src/test/njs_unit_test.c | 5 +++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/external/njs_regex.c b/external/njs_regex.c index 183e52e6..2dd893e9 100644 --- a/external/njs_regex.c +++ b/external/njs_regex.c @@ -60,8 +60,26 @@ njs_regex_compile_ctx_t * njs_regex_compile_ctx_create(njs_regex_generic_ctx_t *ctx) { #ifdef NJS_HAVE_PCRE2 + pcre2_compile_context *cc; + + cc = pcre2_compile_context_create(ctx); + +#ifdef PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES + if (njs_fast_path(cc != NULL)) { + /* Workaround for surrogate pairs in regular expressions. + * + * This option is needed because njs, unlike standard ECMAScript, + * stores and processes strings in UTF-8 encoding. + * By default, PCRE2 does not support surrogate pairs when it + * is compiled for UTF-8 only strings, but many polyfills + * and transpilers use such surrogate pair expressions. + */ + pcre2_set_compile_extra_options(cc, + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES); + } +#endif - return pcre2_compile_context_create(ctx); + return cc; #else diff --git a/src/test/njs_unit_test.c b/src/test/njs_unit_test.c index 215f2dd2..87d8d46c 100644 --- a/src/test/njs_unit_test.c +++ b/src/test/njs_unit_test.c @@ -10841,6 +10841,11 @@ static njs_unit_test_t njs_test[] = njs_str("true") }, #endif +#ifdef PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES + { njs_str("/\\u200d\\ud800-/"), + njs_str("/\\u200d\\ud800-/") }, +#endif + { njs_str("/(\\.(?!com|org)|\\/)/.test('ah.info')"), njs_str("true") }, -- 2.47.3