From: Dmitry Volyntsev Date: Wed, 9 Jun 2021 17:14:10 +0000 (+0000) Subject: Fixed String.prototype.split() according to the specification. X-Git-Tag: 0.6.0~5 X-Git-Url: http://www.kaiwu.me/postgresql/commit/?a=commitdiff_plain;h=308fea06c9a328de88fccbdfc08576b911552d6f;p=njs.git Fixed String.prototype.split() according to the specification. This closes #359 issue on GitHub. --- diff --git a/src/njs_regexp.c b/src/njs_regexp.c index 4709abc5..838be7a6 100644 --- a/src/njs_regexp.c +++ b/src/njs_regexp.c @@ -1612,6 +1612,250 @@ exception: } +static njs_int_t +njs_regexp_prototype_symbol_split(njs_vm_t *vm, njs_value_t *args, + njs_uint_t nargs, njs_index_t unused) +{ + u_char *dst; + int64_t e, i, p, q, ncaptures, length; + uint32_t limit; + njs_int_t ret; + njs_bool_t sticky; + njs_utf8_t utf8; + njs_array_t *array; + njs_value_t *rx, *string, *value; + njs_value_t r, z, this, s_lvalue, retval, setval, constructor; + njs_object_t *object; + const u_char *start, *end; + njs_string_prop_t s; + njs_value_t arguments[2]; + + static const njs_value_t string_lindex = njs_string("lastIndex"); + static const njs_value_t string_flags = njs_string("flags"); + + rx = njs_argument(args, 0); + + if (njs_slow_path(!njs_is_object(rx))) { + njs_type_error(vm, "\"this\" is not object"); + return NJS_ERROR; + } + + string = njs_lvalue_arg(&s_lvalue, args, nargs, 1); + + ret = njs_value_to_string(vm, string, string); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } + + njs_set_function(&constructor, &vm->constructors[NJS_OBJ_TYPE_REGEXP]); + + ret = njs_value_species_constructor(vm, rx, &constructor, &constructor); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } + + ret = njs_value_property(vm, rx, njs_value_arg(&string_flags), &retval); + if (njs_slow_path(ret == NJS_ERROR)) { + return NJS_ERROR; + } + + ret = njs_value_to_string(vm, &retval, &retval); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } + + (void) njs_string_prop(&s, &retval); + + sticky = memchr(s.start, 'y', s.size) != NULL; + + object = njs_function_new_object(vm, &constructor); + if (njs_slow_path(object == NULL)) { + return NJS_ERROR; + } + + njs_set_object(&this, object); + + arguments[0] = *rx; + + if (!sticky) { + length = njs_is_byte_string(&s) ? 0 : s.length + 1; + + dst = njs_string_alloc(vm, &arguments[1], s.size + 1, length); + if (njs_slow_path(dst == NULL)) { + return NJS_ERROR; + } + + dst = njs_cpymem(dst, s.start, s.size); + *dst++ = 'y'; + + } else { + arguments[1] = retval; + } + + ret = njs_function_call2(vm, njs_function(&constructor), &this, + njs_value_arg(&arguments), 2, &r, 1); + if (njs_slow_path(ret != NJS_OK)) { + return NJS_ERROR; + } + + rx = &r; + + array = njs_array_alloc(vm, 0, 0, NJS_ARRAY_SPARE); + if (njs_slow_path(array == NULL)) { + return NJS_ERROR; + } + + value = njs_arg(args, nargs, 2); + limit = UINT32_MAX; + + if (njs_is_defined(value)) { + ret = njs_value_to_uint32(vm, value, &limit); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } + } + + if (njs_slow_path(limit == 0)) { + goto done; + } + + length = njs_string_prop(&s, string); + + if (njs_slow_path(s.size == 0)) { + ret = njs_regexp_exec(vm, rx, string, &z); + if (njs_slow_path(ret != NJS_OK)) { + return NJS_ERROR; + } + + if (!njs_is_null(&z)) { + goto done; + } + + goto single; + } + + utf8 = NJS_STRING_BYTE; + + if (s.length != 0 && s.length != s.size) { + utf8 = NJS_STRING_UTF8; + } + + p = 0; + q = 0; + + while (q < length) { + njs_set_number(&setval, q); + ret = njs_value_property_set(vm, rx, njs_value_arg(&string_lindex), + &setval); + if (njs_slow_path(ret != NJS_OK)) { + return NJS_ERROR; + } + + ret = njs_regexp_exec(vm, rx, string, &z); + if (njs_slow_path(ret != NJS_OK)) { + return NJS_ERROR; + } + + if (njs_is_null(&z)) { + q = q + 1; + continue; + } + + ret = njs_value_property(vm, rx, njs_value_arg(&string_lindex), + &retval); + if (njs_slow_path(ret == NJS_ERROR)) { + return NJS_ERROR; + } + + ret = njs_value_to_length(vm, &retval, &e); + if (njs_slow_path(ret != NJS_OK)) { + return NJS_ERROR; + } + + e = njs_min(e, length); + + if (e == p) { + q = q + 1; + continue; + } + + if (utf8 == NJS_STRING_UTF8) { + start = njs_string_offset(s.start, s.start + s.size, p); + end = njs_string_offset(s.start, s.start + s.size, q); + + } else { + start = &s.start[p]; + end = &s.start[q]; + } + + ret = njs_string_split_part_add(vm, array, utf8, start, end - start); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } + + if (array->length == limit) { + goto done; + } + + p = e; + + ret = njs_object_length(vm, &z, &ncaptures); + if (njs_slow_path(ret != NJS_OK)) { + return NJS_ERROR; + } + + ncaptures = njs_max(ncaptures - 1, 0); + + for (i = 1; i <= ncaptures; i++) { + value = njs_array_push(vm, array); + if (njs_slow_path(value == NULL)) { + return NJS_ERROR; + } + + ret = njs_value_property_i64(vm, &z, i, value); + if (njs_slow_path(ret == NJS_ERROR)) { + return NJS_ERROR; + } + + if (array->length == limit) { + goto done; + } + } + + q = p; + } + + end = &s.start[s.size]; + + if (utf8 == NJS_STRING_UTF8) { + start = njs_string_offset(s.start, s.start + s.size, p); + + } else { + start = &s.start[p]; + } + + ret = njs_string_split_part_add(vm, array, utf8, start, end - start); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } + + goto done; + +single: + + value = njs_array_push(vm, array); + if (njs_slow_path(value == NULL)) { + return NJS_ERROR; + } + + *value = *string; + +done: + + njs_set_array(&vm->retval, array); + + return NJS_OK; +} static const njs_object_prop_t njs_regexp_constructor_properties[] = @@ -1755,6 +1999,14 @@ static const njs_object_prop_t njs_regexp_prototype_properties[] = .writable = 1, .configurable = 1, }, + + { + .type = NJS_PROPERTY, + .name = njs_wellknown_symbol(NJS_SYMBOL_SPLIT), + .value = njs_native_function(njs_regexp_prototype_symbol_split, 2), + .writable = 1, + .configurable = 1, + }, }; diff --git a/src/njs_string.c b/src/njs_string.c index 9326ce0b..0b7a06c5 100644 --- a/src/njs_string.c +++ b/src/njs_string.c @@ -72,8 +72,6 @@ static njs_int_t njs_string_bytes_from_string(njs_vm_t *vm, const njs_value_t *string, const njs_value_t *encoding); static njs_int_t njs_string_match_multiple(njs_vm_t *vm, njs_value_t *args, njs_regexp_pattern_t *pattern); -static njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array, - njs_utf8_t utf8, const u_char *start, size_t size); #define njs_base64_encoded_length(len) (((len + 2) / 3) * 4) @@ -3338,181 +3336,151 @@ static njs_int_t njs_string_prototype_split(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused) { - int *captures; - size_t size; - uint32_t limit; - njs_int_t ret; - njs_utf8_t utf8; - njs_value_t *value; - njs_array_t *array; - const u_char *p, *start, *next, *last, *end; - njs_regexp_utf8_t type; - njs_string_prop_t string, split; - njs_regexp_pattern_t *pattern; + size_t size; + uint32_t limit; + njs_int_t ret; + njs_utf8_t utf8; + njs_bool_t undefined; + njs_value_t *this, *separator, *value; + njs_value_t separator_lvalue, limit_lvalue, splitter; + njs_array_t *array; + const u_char *p, *start, *next, *last, *end; + njs_string_prop_t string, split; + njs_value_t arguments[3]; - ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0)); - if (njs_slow_path(ret != NJS_OK)) { - return ret; - } + static const njs_value_t split_key = + njs_wellknown_symbol(NJS_SYMBOL_SPLIT); - array = njs_array_alloc(vm, 0, 0, NJS_ARRAY_SPARE); - if (njs_slow_path(array == NULL)) { + this = njs_argument(args, 0); + + if (njs_slow_path(njs_is_null_or_undefined(this))) { + njs_type_error(vm, "cannot convert \"%s\"to object", + njs_type_string(this->type)); return NJS_ERROR; } - if (nargs > 1) { - - if (nargs > 2) { - value = njs_argument(args, 2); - - if (njs_slow_path(!njs_is_number(value))) { - ret = njs_value_to_uint32(vm, value, &limit); - if (njs_slow_path(ret != NJS_OK)) { - return ret; - } - - } else { - limit = njs_number_to_uint32(njs_number(value)); - } - - if (limit == 0) { - goto done; - } + separator = njs_lvalue_arg(&separator_lvalue, args, nargs, 1); + value = njs_lvalue_arg(&limit_lvalue, args, nargs, 2); - } else { - limit = (uint32_t) -1; + if (!njs_is_null_or_undefined(separator)) { + ret = njs_value_method(vm, separator, njs_value_arg(&split_key), + &splitter); + if (njs_slow_path(ret != NJS_OK)) { + return ret; } - (void) njs_string_prop(&string, &args[0]); + if (njs_is_defined(&splitter)) { + arguments[0] = *this; + arguments[1] = *value; - if (string.size == 0) { - goto single; + return njs_function_call(vm, njs_function(&splitter), separator, + arguments, 2, &vm->retval); } + } - utf8 = NJS_STRING_BYTE; - type = NJS_REGEXP_BYTE; - - if (string.length != 0) { - utf8 = NJS_STRING_ASCII; - type = NJS_REGEXP_UTF8; + ret = njs_value_to_string(vm, this, this); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } - if (string.length != string.size) { - utf8 = NJS_STRING_UTF8; - } - } + array = njs_array_alloc(vm, 0, 0, NJS_ARRAY_SPARE); + if (njs_slow_path(array == NULL)) { + return NJS_ERROR; + } - switch (args[1].type) { + limit = UINT32_MAX; - case NJS_REGEXP: - pattern = njs_regexp_pattern(&args[1]); + if (njs_is_defined(value)) { + ret = njs_value_to_uint32(vm, value, &limit); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } + } - if (!njs_regex_is_valid(&pattern->regex[type])) { - goto single; - } + undefined = njs_is_undefined(separator); - start = string.start; - end = string.start + string.size; - - do { - ret = njs_regexp_match(vm, &pattern->regex[type], start, 0, - end - start, vm->single_match_data); - if (ret >= 0) { - captures = njs_regex_captures(vm->single_match_data); + ret = njs_value_to_string(vm, separator, separator); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } - p = start + captures[0]; - next = start + captures[1]; + if (njs_slow_path(limit == 0)) { + goto done; + } - } else if (ret == NJS_REGEX_NOMATCH) { - p = (u_char *) end; - next = (u_char *) end + 1; + if (njs_slow_path(undefined)) { + goto single; + } - } else { - return NJS_ERROR; - } + (void) njs_string_prop(&string, this); + (void) njs_string_prop(&split, separator); - /* Empty split regexp. */ - if (p == next) { - p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end) - : p + 1; - next = p; - } + if (njs_slow_path(string.size == 0)) { + if (split.size != 0) { + goto single; + } - size = p - start; + goto done; + } - ret = njs_string_split_part_add(vm, array, utf8, start, size); - if (njs_slow_path(ret != NJS_OK)) { - return ret; - } + utf8 = NJS_STRING_BYTE; - start = next; - limit--; + if (string.length != 0) { + utf8 = NJS_STRING_ASCII; - } while (limit != 0 && p < end); + if (string.length != string.size) { + utf8 = NJS_STRING_UTF8; + } + } - goto done; + start = string.start; + end = string.start + string.size; + last = end - split.size; - case NJS_UNDEFINED: - break; + do { - default: - if (njs_slow_path(!njs_is_string(&args[1]))) { - ret = njs_value_to_string(vm, &args[1], &args[1]); - if (njs_slow_path(ret != NJS_OK)) { - return ret; - } + for (p = start; p <= last; p++) { + if (memcmp(p, split.start, split.size) == 0) { + goto found; } + } - (void) njs_string_prop(&split, &args[1]); + p = end; - if (string.size < split.size) { - goto single; - } +found: - start = string.start; - end = string.start + string.size; - last = end - split.size; + next = p + split.size; - do { - for (p = start; p <= last; p++) { - if (memcmp(p, split.start, split.size) == 0) { - goto found; - } - } + /* Empty split string. */ - p = end; - -found: + if (p == next) { + p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end) + : p + 1; + next = p; + } - next = p + split.size; + size = p - start; - /* Empty split string. */ - if (p == next) { - p = (utf8 != NJS_STRING_BYTE) ? njs_utf8_next(p, end) - : p + 1; - next = p; - } + ret = njs_string_split_part_add(vm, array, utf8, start, size); + if (njs_slow_path(ret != NJS_OK)) { + return ret; + } - size = p - start; + start = next; + limit--; - ret = njs_string_split_part_add(vm, array, utf8, start, size); - if (njs_slow_path(ret != NJS_OK)) { - return ret; - } + } while (limit != 0 && p < end); - start = next; - limit--; + goto done; - } while (limit != 0 && p < end); +single: - goto done; - } + value = njs_array_push(vm, array); + if (njs_slow_path(value == NULL)) { + return NJS_ERROR; } -single: - - /* GC: retain. */ - array->start[0] = args[0]; - array->length = 1; + *value = *this; done: @@ -3522,7 +3490,7 @@ done: } -static njs_int_t +njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array, njs_utf8_t utf8, const u_char *start, size_t size) { diff --git a/src/njs_string.h b/src/njs_string.h index 2f43d09b..5874790a 100644 --- a/src/njs_string.h +++ b/src/njs_string.h @@ -239,6 +239,8 @@ njs_int_t njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_int_t njs_string_prototype_concat(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused); +njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array, + njs_utf8_t utf8, const u_char *start, size_t size); njs_int_t njs_string_get_substitution(njs_vm_t *vm, njs_value_t *matched, njs_value_t *string, int64_t pos, njs_value_t *captures, int64_t ncaptures, njs_value_t *groups, njs_value_t *replacement, njs_value_t *retval); diff --git a/src/test/njs_benchmark.c b/src/test/njs_benchmark.c index ff8f3807..018b87bc 100644 --- a/src/test/njs_benchmark.c +++ b/src/test/njs_benchmark.c @@ -317,8 +317,10 @@ static njs_benchmark_test_t njs_test[] = 1 }, { "regexp split", - njs_str("'a a'.split(/ /).length"), - njs_str("2"), + njs_str("var s = Array(26).fill(0).map((v,i)=> {" + " var u = String.fromCodePoint(65+i), l = u.toLowerCase(); return u+l+l;}).join('');" + "s.split(/(?=[A-Z])/).length"), + njs_str("26"), 100 }, { "regexp 10K split", diff --git a/src/test/njs_unit_test.c b/src/test/njs_unit_test.c index 62a51626..5bb3f1cb 100644 --- a/src/test/njs_unit_test.c +++ b/src/test/njs_unit_test.c @@ -8635,6 +8635,9 @@ static njs_unit_test_t njs_test[] = { njs_str("/[\\"), njs_str("SyntaxError: Unterminated RegExp \"/[\\\" in 1") }, + { njs_str("/\\s*;\\s*/"), + njs_str("/\\s*;\\s*/") }, + { njs_str("RegExp(']')"), njs_str("/\\]/") }, @@ -8802,7 +8805,7 @@ static njs_unit_test_t njs_test[] = njs_str("abc") }, { njs_str("''.split('').length"), - njs_str("1") }, + njs_str("0") }, { njs_str("'abc'.split('')"), njs_str("a,b,c") }, @@ -8858,9 +8861,40 @@ static njs_unit_test_t njs_test[] = { njs_str("'abc'.split(/abc/)"), njs_str(",") }, + { njs_str("'AbcDefGhi'.split(/([A-Z][a-z]+)/)"), + njs_str(",Abc,,Def,,Ghi,") }, + + { njs_str("'myCamelCaseString'.split(/(?=[A-Z])/)"), + njs_str("my,Camel,Case,String") }, + + { njs_str("'мояВерблюжьяСтрока'.split(/(?=[А-Я])/)"), + njs_str("моя,Верблюжья,Строка") }, + + { njs_str("'Harry Trump ;Fred Barney; Helen Rigby ; Bill Abel ;Chris Hand '.split( /\\s*(?:;|$)\\s*/)"), + njs_str("Harry Trump,Fred Barney,Helen Rigby,Bill Abel,Chris Hand,") }, + + { njs_str("'Гарри Трамп ;Фрэд Барни; Хелен Ригби ; Билл Абель'.split(/\\s*;\\s*/)"), + njs_str("Гарри Трамп,Фрэд Барни,Хелен Ригби,Билл Абель") }, + + { njs_str("'Hello 1 world. Sentence number 2.'.split(/(\\d)/)"), + njs_str("Hello ,1, world. Sentence number ,2,.") }, + + { njs_str("'Привет 1 мир. Предложение номер 2.'.split(/(\\d)/)"), + njs_str("Привет ,1, мир. Предложение номер ,2,.") }, + { njs_str("'0123456789'.split('').reverse().join('')"), njs_str("9876543210") }, + { njs_str("/-/[Symbol.split]('a-b-c')"), + njs_str("a,b,c") }, + + { njs_str("var O = RegExp.prototype[Symbol.split];" + "RegExp.prototype[Symbol.split] = function (s, limit) { " + " return O.call(this, s, limit).map(v => `@${v}#`); " + "};" + "'2016-01-02'.split(/-/)"), + njs_str("@2016#,@01#,@02#") }, + { njs_str("'abc'.repeat(3)"), njs_str("abcabcabc") }, @@ -17006,11 +17040,13 @@ static njs_unit_test_t njs_test[] = { njs_str("var a = [1]; a[2] = 'x'; JSON.stringify(a)"), njs_str("[1,null,\"x\"]") }, +#if (!NJS_HAVE_MEMORY_SANITIZER) /* very long test under MSAN */ { njs_str(njs_declare_sparse_array("a", 32769) "a[32] = 'a'; a[64] = 'b';" "var s = JSON.stringify(a); " "[s.length,s.substring(162,163),s.match(/null/g).length]"), njs_str("163844,a,32767") }, +#endif { njs_str(njs_declare_sparse_array("a", 8) "a[2] = 'a'; a[4] = 'b'; a.length = 3;"