From a82142246c30da9ad3b7d67c0e7b3c18fa4c66fa Mon Sep 17 00:00:00 2001 From: Dmitry Volyntsev Date: Thu, 11 Nov 2021 14:27:15 +0000 Subject: [PATCH] Introduced PCRE2 RegExp backend. --- auto/options | 4 + auto/pcre | 76 +++++++++---- auto/sources | 15 ++- nginx/config.make | 2 +- src/njs_pcre2.c | 240 +++++++++++++++++++++++++++++++++++++++ src/njs_regex.h | 28 +++-- src/test/njs_unit_test.c | 96 +++++++++++----- 7 files changed, 398 insertions(+), 63 deletions(-) create mode 100644 src/njs_pcre2.c diff --git a/auto/options b/auto/options index 89bcd3b4..f4b3cad1 100644 --- a/auto/options +++ b/auto/options @@ -11,6 +11,8 @@ NJS_DEBUG_MEMORY=NO NJS_ADDRESS_SANITIZER=NO NJS_TEST262=YES +NJS_TRY_PCRE2=YES + NJS_CONFIGURE_OPTIONS= for njs_option @@ -31,6 +33,8 @@ do --debug-memory=*) NJS_DEBUG_MEMORY="$value" ;; --test262=*) NJS_TEST262="$value" ;; + --no-pcre2) NJS_TRY_PCRE2=NO ;; + --help) . auto/help exit 0 diff --git a/auto/pcre b/auto/pcre index be5dbae5..e7e696b2 100644 --- a/auto/pcre +++ b/auto/pcre @@ -2,33 +2,67 @@ # Copyright (C) Igor Sysoev # Copyright (C) NGINX, Inc. +njs_found=no +NJS_HAVE_PCRE2=NO -NJS_PCRE_CFLAGS= -NJS_PCRE_LIB= +if [ $NJS_TRY_PCRE2 = YES ]; then + if /bin/sh -c "(pcre2-config --version)" >> $NJS_AUTOCONF_ERR 2>&1; then -njs_found=no + NJS_PCRE_CFLAGS=`pcre2-config --cflags` + NJS_PCRE_LIB=`pcre2-config --libs8` -if /bin/sh -c "(pcre-config --version)" >> $NJS_AUTOCONF_ERR 2>&1; then + njs_feature="PCRE2 library" + njs_feature_name=NJS_HAVE_PCRE2 + njs_feature_run=no + njs_feature_incs="-DPCRE2_CODE_UNIT_WIDTH=8 $NJS_PCRE_CFLAGS" + njs_feature_libs=$NJS_PCRE_LIB + njs_feature_test="#include - NJS_PCRE_CFLAGS=`pcre-config --cflags` - NJS_PCRE_LIB=`pcre-config --libs` + int main(void) { + pcre2_code *re; - njs_feature="PCRE library" - njs_feature_name=NJS_HAVE_PCRE - njs_feature_run=no - njs_feature_incs=$NJS_PCRE_CFLAGS - njs_feature_libs=$NJS_PCRE_LIB - njs_feature_test="#include + re = pcre2_compile((PCRE2_SPTR)\"\", + PCRE2_ZERO_TERMINATED, 0, + NULL, NULL, NULL); + return (re == NULL); + }" - int main(void) { - pcre *re; + . auto/feature - re = pcre_compile(NULL, 0, NULL, 0, NULL); - if (re == NULL) - return 1; - return 0; - }" - . auto/feature + if [ $njs_found = yes ]; then + NJS_HAVE_PCRE2=YES + echo " + PCRE2 version: `pcre2-config --version`" + fi + fi +fi + +if [ $njs_found = no ]; then + if /bin/sh -c "(pcre-config --version)" >> $NJS_AUTOCONF_ERR 2>&1; then + + NJS_PCRE_CFLAGS=`pcre-config --cflags` + NJS_PCRE_LIB=`pcre-config --libs` + + njs_feature="PCRE library" + njs_feature_name=NJS_HAVE_PCRE + njs_feature_run=no + njs_feature_incs=$NJS_PCRE_CFLAGS + njs_feature_libs=$NJS_PCRE_LIB + njs_feature_test="#include + + int main(void) { + pcre *re; + + re = pcre_compile(NULL, 0, NULL, 0, NULL); + if (re == NULL) + return 1; + return 0; + }" + . auto/feature + + if [ $njs_found = yes ]; then + echo " + PCRE version: `pcre-config --version`" + fi + fi fi if [ $njs_found = no ]; then @@ -37,5 +71,3 @@ if [ $njs_found = no ]; then echo exit 1; fi - -echo " + PCRE version: `pcre-config --version`" diff --git a/auto/sources b/auto/sources index 9544e097..18f56883 100644 --- a/auto/sources +++ b/auto/sources @@ -16,7 +16,6 @@ NJS_LIB_SRCS=" \ src/njs_md5.c \ src/njs_sha1.c \ src/njs_sha2.c \ - src/njs_pcre.c \ src/njs_time.c \ src/njs_file.c \ src/njs_malloc.c \ @@ -64,6 +63,14 @@ NJS_LIB_SRCS=" \ src/njs_async.c \ " +NJS_LIB_PCRE_SRCS=" \ + src/njs_pcre.c \ +" + +NJS_LIB_PCRE2_SRCS=" \ + src/njs_pcre2.c \ +" + NJS_LIB_TEST_SRCS=" \ src/test/lvlhsh_unit_test.c \ src/test/random_unit_test.c \ @@ -76,6 +83,12 @@ NJS_TEST_SRCS=" \ src/test/njs_benchmark.c \ " +if [ "$NJS_HAVE_PCRE2" = "YES" ]; then + NJS_LIB_SRCS="$NJS_LIB_SRCS $NJS_LIB_PCRE2_SRCS" +else + NJS_LIB_SRCS="$NJS_LIB_SRCS $NJS_LIB_PCRE_SRCS" +fi + NJS_TS_SRCS=$(find ts/ -name "*.d.ts" -o -name "*.json") NJS_TEST_TS_SRCS=$(find test/ts/ -name "*.ts" -o -name "*.json") diff --git a/nginx/config.make b/nginx/config.make index 09ce410a..3623216c 100644 --- a/nginx/config.make +++ b/nginx/config.make @@ -3,7 +3,7 @@ cat << END >> $NGX_MAKEFILE $ngx_addon_dir/../build/libnjs.a: $NGX_MAKEFILE cd $ngx_addon_dir/.. \\ && if [ -f build/Makefile ]; then \$(MAKE) clean; fi \\ - && CFLAGS="\$(CFLAGS)" CC="\$(CC)" ./configure \\ + && CFLAGS="\$(CFLAGS)" CC="\$(CC)" ./configure --no-pcre2 \\ && \$(MAKE) END diff --git a/src/njs_pcre2.c b/src/njs_pcre2.c new file mode 100644 index 00000000..1cebc45d --- /dev/null +++ b/src/njs_pcre2.c @@ -0,0 +1,240 @@ + +/* + * Copyright (C) Dmitry Volyntsev + * Copyright (C) NGINX, Inc. + */ + + +#include + +#define PCRE2_CODE_UNIT_WIDTH 8 +#include + + +static const u_char* njs_regex_pcre2_error(int errcode, u_char buffer[128]); + + +njs_regex_generic_ctx_t * +njs_regex_generic_ctx_create(njs_pcre_malloc_t private_malloc, + njs_pcre_free_t private_free, void *memory_data) +{ + return pcre2_general_context_create(private_malloc, private_free, + memory_data); +} + + +njs_regex_compile_ctx_t * +njs_regex_compile_ctx_create(njs_regex_generic_ctx_t *ctx) +{ + return pcre2_compile_context_create(ctx); +} + + +njs_int_t +njs_regex_escape(njs_mp_t *mp, njs_str_t *text) +{ + return NJS_OK; +} + + +njs_int_t +njs_regex_compile(njs_regex_t *regex, u_char *source, size_t len, + njs_regex_flags_t flags, njs_regex_compile_ctx_t *ctx, njs_trace_t *trace) +{ + int ret; + u_char *error; + size_t erroff; + njs_uint_t options; + u_char errstr[128]; + + options = PCRE2_ALT_BSUX | PCRE2_MATCH_UNSET_BACKREF; + + if ((flags & NJS_REGEX_IGNORE_CASE)) { + options |= PCRE2_CASELESS; + } + + if ((flags & NJS_REGEX_MULTILINE)) { + options |= PCRE2_MULTILINE; + } + + if ((flags & NJS_REGEX_STICKY)) { + options |= PCRE2_ANCHORED; + } + + if ((flags & NJS_REGEX_UTF8)) { + options |= PCRE2_UTF; + } + + regex->code = pcre2_compile(source, len, options, &ret, &erroff, ctx); + + if (njs_slow_path(regex->code == NULL)) { + error = &source[erroff]; + + njs_alert(trace, NJS_LEVEL_ERROR, + "pcre_compile2(\"%s\") failed: %s at \"%s\"", + source, njs_regex_pcre2_error(ret, errstr), error); + + return NJS_DECLINED; + } + + ret = pcre2_pattern_info(regex->code, PCRE2_INFO_CAPTURECOUNT, + ®ex->ncaptures); + + if (njs_slow_path(ret < 0)) { + njs_alert(trace, NJS_LEVEL_ERROR, + "pcre2_pattern_info(\"%s\", PCRE2_INFO_CAPTURECOUNT) failed: %s", + source, njs_regex_pcre2_error(ret, errstr)); + + return NJS_ERROR; + } + + ret = pcre2_pattern_info(regex->code, PCRE2_INFO_BACKREFMAX, + ®ex->backrefmax); + + if (njs_slow_path(ret < 0)) { + njs_alert(trace, NJS_LEVEL_ERROR, + "pcre2_pattern_info(\"%s\", PCRE2_INFO_BACKREFMAX) failed: %s", + source, njs_regex_pcre2_error(ret, errstr)); + + return NJS_ERROR; + } + + /* Reserve additional elements for the first "$0" capture. */ + regex->ncaptures++; + + if (regex->ncaptures > 1) { + ret = pcre2_pattern_info(regex->code, PCRE2_INFO_NAMECOUNT, + ®ex->nentries); + + if (njs_slow_path(ret < 0)) { + njs_alert(trace, NJS_LEVEL_ERROR, + "pcre2_pattern_info(\"%s\", PCRE2_INFO_NAMECOUNT) failed: %s", + source, njs_regex_pcre2_error(ret, errstr)); + + return NJS_ERROR; + } + + if (regex->nentries != 0) { + ret = pcre2_pattern_info(regex->code, PCRE2_INFO_NAMEENTRYSIZE, + ®ex->entry_size); + + if (njs_slow_path(ret < 0)) { + njs_alert(trace, NJS_LEVEL_ERROR, + "pcre2_pattern_info(\"%s\", PCRE2_INFO_NAMEENTRYSIZE)" + " failed: %s", source, + njs_regex_pcre2_error(ret, errstr)); + + return NJS_ERROR; + } + + ret = pcre2_pattern_info(regex->code, PCRE2_INFO_NAMETABLE, + ®ex->entries); + + if (njs_slow_path(ret < 0)) { + njs_alert(trace, NJS_LEVEL_ERROR, + "pcre2_pattern_info(\"%s\", PCRE2_INFO_NAMETABLE) " + "failed: %s", source, + njs_regex_pcre2_error(ret, errstr)); + + return NJS_ERROR; + } + } + } + + return NJS_OK; +} + + +njs_bool_t +njs_regex_is_valid(njs_regex_t *regex) +{ + return (regex->code != NULL); +} + + +njs_int_t +njs_regex_named_captures(njs_regex_t *regex, njs_str_t *name, int n) +{ + char *entry; + + if (name == NULL) { + return regex->nentries; + } + + if (n >= regex->nentries) { + return NJS_ERROR; + } + + entry = regex->entries + regex->entry_size * n; + + name->start = (u_char *) entry + 2; + name->length = njs_strlen(name->start); + + return (entry[0] << 8) + entry[1]; +} + + +njs_regex_match_data_t * +njs_regex_match_data(njs_regex_t *regex, njs_regex_generic_ctx_t *ctx) +{ + if (regex != NULL) { + return pcre2_match_data_create_from_pattern(regex->code, ctx); + } + + return pcre2_match_data_create(0, ctx); +} + + +void +njs_regex_match_data_free(njs_regex_match_data_t *match_data, + njs_regex_generic_ctx_t *unused) +{ + pcre2_match_data_free(match_data); +} + + +njs_int_t +njs_regex_match(njs_regex_t *regex, const u_char *subject, size_t off, + size_t len, njs_regex_match_data_t *match_data, njs_trace_t *trace) +{ + int ret; + u_char errstr[128]; + + ret = pcre2_match(regex->code, subject, len, off, 0, match_data, NULL); + + if (ret < 0) { + if (ret == PCRE2_ERROR_NOMATCH) { + return NJS_DECLINED; + } + + njs_alert(trace, NJS_LEVEL_ERROR, "pcre2_match() failed: %s", + njs_regex_pcre2_error(ret, errstr)); + return NJS_ERROR; + } + + return ret; +} + + +size_t +njs_regex_capture(njs_regex_match_data_t *match_data, njs_uint_t n) +{ + size_t c; + + c = pcre2_get_ovector_pointer(match_data)[n]; + + if (c == PCRE2_UNSET) { + return NJS_REGEX_UNSET; + } + + return c; +} + + +static const u_char * +njs_regex_pcre2_error(int errcode, u_char buffer[128]) +{ + pcre2_get_error_message(errcode, buffer, 128); + + return buffer; +} diff --git a/src/njs_regex.h b/src/njs_regex.h index ee08fe1f..5cf09cf2 100644 --- a/src/njs_regex.h +++ b/src/njs_regex.h @@ -25,16 +25,6 @@ typedef void *(*njs_pcre_malloc_t)(size_t size, void *memory_data); typedef void (*njs_pcre_free_t)(void *p, void *memory_data); -typedef struct { - njs_pcre_malloc_t private_malloc; - njs_pcre_free_t private_free; - void *memory_data; -} njs_regex_generic_ctx_t; - - -#define njs_regex_compile_ctx_t void - - typedef struct { void *code; void *extra; @@ -46,6 +36,22 @@ typedef struct { } njs_regex_t; +#ifdef NJS_HAVE_PCRE2 + +#define njs_regex_generic_ctx_t void +#define njs_regex_compile_ctx_t void +#define njs_regex_match_data_t void + +#else + +typedef struct { + njs_pcre_malloc_t private_malloc; + njs_pcre_free_t private_free; + void *memory_data; +} njs_regex_generic_ctx_t; + +#define njs_regex_compile_ctx_t void + typedef struct { int ncaptures; /* @@ -57,6 +63,8 @@ typedef struct { int captures[3]; } njs_regex_match_data_t; +#endif + NJS_EXPORT njs_regex_generic_ctx_t * njs_regex_generic_ctx_create(njs_pcre_malloc_t private_malloc, diff --git a/src/test/njs_unit_test.c b/src/test/njs_unit_test.c index 11193dea..438a0fc3 100644 --- a/src/test/njs_unit_test.c +++ b/src/test/njs_unit_test.c @@ -6,7 +6,9 @@ #include +#ifndef NJS_HAVE_PCRE2 #include +#endif #include "njs_externals_test.h" @@ -19,6 +21,12 @@ #define njs_evar(little, big) big #endif +#ifdef NJS_HAVE_PCRE2 +#define njs_pcre_var(pcre2, pcre) pcre2 +#else +#define njs_pcre_var(pcre2, pcre) pcre +#endif + #define njs_declare_sparse_array(nm, sz) \ "var " nm " = Array(" njs_stringify(sz) "); " \ @@ -8632,9 +8640,6 @@ static njs_unit_test_t njs_test[] = { njs_str("String.bytesFrom([255,149,15,97,95]).replace(/_/g, 'X')[4]"), njs_str("X") }, - { njs_str("/]/"), - njs_str("/\\]/") }, - { njs_str("/=/"), njs_str("/=/") }, @@ -8647,11 +8652,12 @@ static njs_unit_test_t njs_test[] = { njs_str("/\\s*;\\s*/"), njs_str("/\\s*;\\s*/") }, - { njs_str("RegExp(']')"), +#ifndef NJS_HAVE_PCRE2 + { njs_str("/]/"), njs_str("/\\]/") }, - { njs_str("RegExp('[\\\\')"), - njs_str("SyntaxError: pcre_compile(\"[\\\") failed: \\ at end of pattern") }, + { njs_str("RegExp(']')"), + njs_str("/\\]/") }, { njs_str("RegExp('[\\\\\\\\]]')"), njs_str("/[\\\\]\\]/") }, @@ -8673,6 +8679,12 @@ static njs_unit_test_t njs_test[] = { njs_str("/]cd/"), njs_str("/\\]cd/") }, +#endif + + { njs_str("RegExp('[\\\\')"), + njs_str("SyntaxError: " + njs_pcre_var("pcre_compile2(\"[\\\") failed: \\ at end of pattern at \"\"", + "pcre_compile(\"[\\\") failed: \\ at end of pattern")) }, { njs_str("RegExp('\\\\0').source[1]"), njs_str("0") }, @@ -10698,8 +10710,10 @@ static njs_unit_test_t njs_test[] = { njs_str("/a\r/"), njs_str("SyntaxError: Unterminated RegExp \"/a\" in 1") }, +#ifndef NJS_HAVE_PCRE2 { njs_str("/a\\q/"), njs_str("/a\\q/") }, +#endif { njs_str("/\\\\/"), njs_str("/\\\\/") }, @@ -10750,17 +10764,23 @@ static njs_unit_test_t njs_test[] = ".every(ch=>/[\\]\\[!\"#$%&'()*+,.\\/:;<=>?@\\^_`{|}-]/.test(ch))"), njs_str("true") }, +#ifndef NJS_HAVE_PCRE2 { njs_str("/a\\q/.test('a\\q')"), njs_str("true") }, +#endif { njs_str("/(\\.(?!com|org)|\\/)/.test('ah.info')"), njs_str("true") }, { njs_str("/(/.test('')"), - njs_str("SyntaxError: pcre_compile(\"(\") failed: missing ) in 1") }, + njs_str("SyntaxError: " + njs_pcre_var("pcre_compile2(\"(\") failed: missing closing parenthesis at \"\" in 1", + "pcre_compile(\"(\") failed: missing ) in 1")) }, { njs_str("/+/.test('')"), - njs_str("SyntaxError: pcre_compile(\"+\") failed: nothing to repeat at \"+\" in 1") }, + njs_str("SyntaxError: " + njs_pcre_var("pcre_compile2(\"+\") failed: quantifier does not follow a repeatable item at \"+\" in 1", + "pcre_compile(\"+\") failed: nothing to repeat at \"+\" in 1")) }, { njs_str("/^$/.test('')"), njs_str("true") }, @@ -11040,17 +11060,27 @@ static njs_unit_test_t njs_test[] = njs_str("true") }, { njs_str("new RegExp('[')"), - njs_str("SyntaxError: pcre_compile(\"[\") failed: missing terminating ] for character class") }, + njs_str("SyntaxError: " + njs_pcre_var("pcre_compile2(\"[\") failed: missing terminating ] for character class at \"\"", + "pcre_compile(\"[\") failed: missing terminating ] for character class")) }, { njs_str("new RegExp('['.repeat(16))"), - njs_str("SyntaxError: pcre_compile(\"[[[[[[[[[[[[[[[[\") failed: missing terminating ] for character class") }, + njs_str("SyntaxError: " + njs_pcre_var("pcre_compile2(\"[[[[[[[[[[[[[[[[\") failed: missing terminating ] for character class at \"\"", + "pcre_compile(\"[[[[[[[[[[[[[[[[\") failed: missing terminating ] for character class")) }, { njs_str("new RegExp('\\\\')"), - njs_str("SyntaxError: pcre_compile(\"\\\") failed: \\ at end of pattern") }, + njs_str("SyntaxError: " + njs_pcre_var("pcre_compile2(\"\\\") failed: \\ at end of pattern at \"\"", + "pcre_compile(\"\\\") failed: \\ at end of pattern")) }, { njs_str("[0].map(RegExp().toString)"), njs_str("TypeError: \"this\" argument is not an object") }, + { njs_str("var arr = /\\1(A)/.exec('AA');" + "[arr[0], arr[1]]"), + njs_str("A,A") }, + /* Non-standard ECMA-262 features. */ /* 0x10400 is not a surrogate pair of 0xD801 and 0xDC00. */ @@ -21230,7 +21260,7 @@ static njs_unit_test_t njs_tz_test[] = }; -static njs_unit_test_t njs_regexp_test[] = +static njs_unit_test_t njs_regexp_optional_tests[] = { { njs_str("/[\\\\u02E0-\\\\u02E4]/"), njs_str("/[\\\\u02E0-\\\\u02E4]/") }, @@ -21262,6 +21292,7 @@ static njs_unit_test_t njs_regexp_test[] = { njs_str("RegExp('\x00').test('\0')"), njs_str("true") }, +#ifndef NJS_HAVE_PCRE2 { njs_str("RegExp('\x00\\\\x00').source"), njs_str("\\u0000\\x00") }, @@ -21270,6 +21301,7 @@ static njs_unit_test_t njs_regexp_test[] = { njs_str("RegExp('\\\\\\0').source"), njs_str("\\\\u0000") }, +#endif { njs_str("RegExp('[\0]').test('\0')"), njs_str("true") }, @@ -22154,9 +22186,11 @@ static njs_int_t njs_regexp_optional_test(njs_unit_test_t tests[], size_t num, njs_str_t *name, njs_opts_t *opts, njs_stat_t *stat) { + njs_bool_t safe; + +#ifndef NJS_HAVE_PCRE2 int erroff; pcre *re1, *re2; - njs_int_t ret; const char *errstr; /* @@ -22169,6 +22203,10 @@ njs_regexp_optional_test(njs_unit_test_t tests[], size_t num, njs_str_t *name, re1 = pcre_compile("/[\\u0410]/", PCRE_JAVASCRIPT_COMPAT, &errstr, &erroff, NULL); + if (re1 != NULL) { + pcre_free(re1); + } + /* * pcre-7.8 fails to compile unicode escape codes inside square brackets * even when PCRE_UTF8 option is provided. @@ -22176,24 +22214,24 @@ njs_regexp_optional_test(njs_unit_test_t tests[], size_t num, njs_str_t *name, re2 = pcre_compile("/[\\u0410]/", PCRE_JAVASCRIPT_COMPAT | PCRE_UTF8, &errstr, &erroff, NULL); - if (re1 == NULL && re2 != NULL) { - ret = njs_unit_test(tests, num, name, opts, stat); - if (ret != NJS_OK) { - return ret; - } - - } else { - njs_printf("njs unicode regexp tests skipped, libpcre fails\n"); + if (re2 != NULL) { + pcre_free(re2); } - if (re1 != NULL) { - pcre_free(re1); - } + safe = (re1 == NULL && re2 != NULL); - if (re2 != NULL) { - pcre_free(re2); +#else + + safe = 1; + +#endif + + if (safe) { + return njs_unit_test(tests, num, name, opts, stat); } + njs_printf("regexp optional tests skipped\n"); + return NJS_OK; } @@ -23331,10 +23369,10 @@ static njs_test_suite_t njs_suites[] = njs_nitems(njs_tz_test), njs_timezone_optional_test }, - { njs_str("regexp"), + { njs_str("regexp optional"), { .repeat = 1, .unsafe = 1 }, - njs_regexp_test, - njs_nitems(njs_regexp_test), + njs_regexp_optional_tests, + njs_nitems(njs_regexp_optional_tests), njs_regexp_optional_test }, { njs_str("vm_json"), -- 2.47.3