diff options
author | bellard <6490144+bellard@users.noreply.github.com> | 2020-09-06 18:53:08 +0200 |
---|---|---|
committer | bellard <6490144+bellard@users.noreply.github.com> | 2020-09-06 18:53:08 +0200 |
commit | 91459fb6723e29e923380cec0023af93819ae69d (patch) | |
tree | 6a1aff8d9b290ed184d1481da50d0e6b4a9a324c /libregexp.c | |
parent | 9096e544ba2357eeadc6f09fc6e5cf58db7751bc (diff) | |
download | quickjs-91459fb6723e29e923380cec0023af93819ae69d.tar.gz quickjs-91459fb6723e29e923380cec0023af93819ae69d.zip |
2020-01-05 release
Diffstat (limited to 'libregexp.c')
-rw-r--r-- | libregexp.c | 2541 |
1 files changed, 2541 insertions, 0 deletions
diff --git a/libregexp.c b/libregexp.c new file mode 100644 index 0000000..4425ce5 --- /dev/null +++ b/libregexp.c @@ -0,0 +1,2541 @@ +/* + * Regular Expression Engine + * + * Copyright (c) 2017-2018 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <inttypes.h> +#include <string.h> +#include <assert.h> + +#include "cutils.h" +#include "libregexp.h" + +/* + TODO: + + - Add full unicode canonicalize rules for character ranges (not + really useful but needed for exact "ignorecase" compatibility). + + - Add a lock step execution mode (=linear time execution guaranteed) + when the regular expression is "simple" i.e. no backreference nor + complicated lookahead. The opcodes are designed for this execution + model. +*/ + +#if defined(TEST) +#define DUMP_REOP +#endif + +typedef enum { +#define DEF(id, size) REOP_ ## id, +#include "libregexp-opcode.h" +#undef DEF + REOP_COUNT, +} REOPCodeEnum; + +#define CAPTURE_COUNT_MAX 255 +#define STACK_SIZE_MAX 255 + +/* unicode code points */ +#define CP_LS 0x2028 +#define CP_PS 0x2029 + +#define TMP_BUF_SIZE 128 + +typedef struct { + DynBuf byte_code; + const uint8_t *buf_ptr; + const uint8_t *buf_end; + const uint8_t *buf_start; + int re_flags; + BOOL is_utf16; + BOOL ignore_case; + BOOL dotall; + int capture_count; + int total_capture_count; /* -1 = not computed yet */ + int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */ + void *mem_opaque; + DynBuf group_names; + union { + char error_msg[TMP_BUF_SIZE]; + char tmp_buf[TMP_BUF_SIZE]; + } u; +} REParseState; + +typedef struct { +#ifdef DUMP_REOP + const char *name; +#endif + uint8_t size; +} REOpCode; + +static const REOpCode reopcode_info[REOP_COUNT] = { +#ifdef DUMP_REOP +#define DEF(id, size) { #id, size }, +#else +#define DEF(id, size) { size }, +#endif +#include "libregexp-opcode.h" +#undef DEF +}; + +#define RE_HEADER_FLAGS 0 +#define RE_HEADER_CAPTURE_COUNT 1 +#define RE_HEADER_STACK_SIZE 2 + +#define RE_HEADER_LEN 7 + +static inline int is_digit(int c) { + return c >= '0' && c <= '9'; +} + +/* insert 'len' bytes at position 'pos' */ +static void dbuf_insert(DynBuf *s, int pos, int len) +{ + dbuf_realloc(s, s->size + len); + memmove(s->buf + pos + len, s->buf + pos, s->size - pos); + s->size += len; +} + +/* canonicalize with the specific JS regexp rules */ +static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + int len; + if (is_utf16) { + if (likely(c < 128)) { + if (c >= 'A' && c <= 'Z') + c = c - 'A' + 'a'; + } else { + lre_case_conv(res, c, 2); + c = res[0]; + } + } else { + if (likely(c < 128)) { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A'; + } else { + /* legacy regexp: to upper case if single char >= 128 */ + len = lre_case_conv(res, c, FALSE); + if (len == 1 && res[0] >= 128) + c = res[0]; + } + } + return c; +} + +static const uint16_t char_range_d[] = { + 1, + 0x0030, 0x0039 + 1, +}; + +/* code point ranges for Zs,Zl or Zp property */ +static const uint16_t char_range_s[] = { + 10, + 0x0009, 0x000D + 1, + 0x0020, 0x0020 + 1, + 0x00A0, 0x00A0 + 1, + 0x1680, 0x1680 + 1, + 0x2000, 0x200A + 1, + /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ + /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ + 0x2028, 0x2029 + 1, + 0x202F, 0x202F + 1, + 0x205F, 0x205F + 1, + 0x3000, 0x3000 + 1, + /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ + 0xFEFF, 0xFEFF + 1, +}; + +BOOL lre_is_space(int c) +{ + int i, n, low, high; + n = (countof(char_range_s) - 1) / 2; + for(i = 0; i < n; i++) { + low = char_range_s[2 * i + 1]; + if (c < low) + return FALSE; + high = char_range_s[2 * i + 2]; + if (c < high) + return TRUE; + } + return FALSE; +} + +uint32_t const lre_id_start_table_ascii[4] = { + /* $ A-Z _ a-z */ + 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE +}; + +uint32_t const lre_id_continue_table_ascii[4] = { + /* $ 0-9 A-Z _ a-z */ + 0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE +}; + + +static const uint16_t char_range_w[] = { + 4, + 0x0030, 0x0039 + 1, + 0x0041, 0x005A + 1, + 0x005F, 0x005F + 1, + 0x0061, 0x007A + 1, +}; + +#define CLASS_RANGE_BASE 0x40000000 + +typedef enum { + CHAR_RANGE_d, + CHAR_RANGE_D, + CHAR_RANGE_s, + CHAR_RANGE_S, + CHAR_RANGE_w, + CHAR_RANGE_W, +} CharRangeEnum; + +static const uint16_t *char_range_table[] = { + char_range_d, + char_range_s, + char_range_w, +}; + +static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) +{ + BOOL invert; + const uint16_t *c_pt; + int len, i; + + invert = c & 1; + c_pt = char_range_table[c >> 1]; + len = *c_pt++; + cr_init(cr, s->mem_opaque, lre_realloc); + for(i = 0; i < len * 2; i++) { + if (cr_add_point(cr, c_pt[i])) + goto fail; + } + if (invert) { + if (cr_invert(cr)) + goto fail; + } + return 0; + fail: + cr_free(cr); + return -1; +} + +static int cr_canonicalize(CharRange *cr) +{ + CharRange a; + uint32_t pt[2]; + int i, ret; + + cr_init(&a, cr->mem_opaque, lre_realloc); + pt[0] = 'a'; + pt[1] = 'z' + 1; + ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER); + if (ret) + goto fail; + /* convert to upper case */ + /* XXX: the generic unicode case would be much more complicated + and not really useful */ + for(i = 0; i < a.len; i++) { + a.points[i] += 'A' - 'a'; + } + /* Note: for simplicity we keep the lower case ranges */ + ret = cr_union1(cr, a.points, a.len); + fail: + cr_free(&a); + return ret; +} + +#ifdef DUMP_REOP +static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, + int buf_len) +{ + int pos, len, opcode, bc_len, re_flags, i; + uint32_t val; + + assert(buf_len >= RE_HEADER_LEN); + + re_flags= buf[0]; + bc_len = get_u32(buf + 3); + assert(bc_len + RE_HEADER_LEN <= buf_len); + printf("flags: 0x%x capture_count=%d stack_size=%d\n", + re_flags, buf[1], buf[2]); + if (re_flags & LRE_FLAG_NAMED_GROUPS) { + const char *p; + p = (char *)buf + RE_HEADER_LEN + bc_len; + printf("named groups: "); + for(i = 1; i < buf[1]; i++) { + if (i != 1) + printf(","); + printf("<%s>", p); + p += strlen(p) + 1; + } + printf("\n"); + assert(p == (char *)(buf + buf_len)); + } + printf("bytecode_len=%d\n", bc_len); + + buf += RE_HEADER_LEN; + pos = 0; + while (pos < bc_len) { + printf("%5u: ", pos); + opcode = buf[pos]; + len = reopcode_info[opcode].size; + if (opcode >= REOP_COUNT) { + printf(" invalid opcode=0x%02x\n", opcode); + break; + } + if ((pos + len) > bc_len) { + printf(" buffer overflow (opcode=0x%02x)\n", opcode); + break; + } + printf("%s", reopcode_info[opcode].name); + switch(opcode) { + case REOP_char: + val = get_u16(buf + pos + 1); + if (val >= ' ' && val <= 126) + printf(" '%c'", val); + else + printf(" 0x%04x", val); + break; + case REOP_char32: + val = get_u32(buf + pos + 1); + if (val >= ' ' && val <= 126) + printf(" '%c'", val); + else + printf(" 0x%08x", val); + break; + case REOP_goto: + case REOP_split_goto_first: + case REOP_split_next_first: + case REOP_loop: + case REOP_lookahead: + case REOP_negative_lookahead: + case REOP_bne_char_pos: + val = get_u32(buf + pos + 1); + val += (pos + 5); + printf(" %u", val); + break; + case REOP_simple_greedy_quant: + printf(" %u %u %u %u", + get_u32(buf + pos + 1) + (pos + 17), + get_u32(buf + pos + 1 + 4), + get_u32(buf + pos + 1 + 8), + get_u32(buf + pos + 1 + 12)); + break; + case REOP_save_start: + case REOP_save_end: + case REOP_back_reference: + case REOP_backward_back_reference: + printf(" %u", buf[pos + 1]); + break; + case REOP_save_reset: + printf(" %u %u", buf[pos + 1], buf[pos + 2]); + break; + case REOP_push_i32: + val = get_u32(buf + pos + 1); + printf(" %d", val); + break; + case REOP_range: + { + int n, i; + n = get_u16(buf + pos + 1); + len += n * 4; + for(i = 0; i < n * 2; i++) { + val = get_u16(buf + pos + 3 + i * 2); + printf(" 0x%04x", val); + } + } + break; + case REOP_range32: + { + int n, i; + n = get_u16(buf + pos + 1); + len += n * 8; + for(i = 0; i < n * 2; i++) { + val = get_u32(buf + pos + 3 + i * 4); + printf(" 0x%08x", val); + } + } + break; + default: + break; + } + printf("\n"); + pos += len; + } +} +#endif + +static void re_emit_op(REParseState *s, int op) +{ + dbuf_putc(&s->byte_code, op); +} + +/* return the offset of the u32 value */ +static int re_emit_op_u32(REParseState *s, int op, uint32_t val) +{ + int pos; + dbuf_putc(&s->byte_code, op); + pos = s->byte_code.size; + dbuf_put_u32(&s->byte_code, val); + return pos; +} + +static int re_emit_goto(REParseState *s, int op, uint32_t val) +{ + int pos; + dbuf_putc(&s->byte_code, op); + pos = s->byte_code.size; + dbuf_put_u32(&s->byte_code, val - (pos + 4)); + return pos; +} + +static void re_emit_op_u8(REParseState *s, int op, uint32_t val) +{ + dbuf_putc(&s->byte_code, op); + dbuf_putc(&s->byte_code, val); +} + +static void re_emit_op_u16(REParseState *s, int op, uint32_t val) +{ + dbuf_putc(&s->byte_code, op); + dbuf_put_u16(&s->byte_code, val); +} + +static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap); + va_end(ap); + return -1; +} + +/* Return -1 in case of overflow */ +static int parse_digits(const uint8_t **pp) +{ + const uint8_t *p; + uint64_t v; + int c; + + p = *pp; + v = 0; + for(;;) { + c = *p; + if (c < '0' || c > '9') + break; + v = v * 10 + c - '0'; + if (v >= INT32_MAX) + return -1; + p++; + } + *pp = p; + return v; +} + +static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) +{ + const uint8_t *p; + p = *pp; + if (*p != c) + return re_parse_error(s, "expecting '%c'", c); + p++; + *pp = p; + return 0; +} + +/* Parse an escape sequence, *pp points after the '\': + allow_utf16 value: + 0 : no UTF-16 escapes allowed + 1 : UTF-16 escapes allowed + 2 : UTF-16 escapes allowed and escapes of surrogate pairs are + converted to a unicode character (unicode regexp case). + + Return the unicode char and update *pp if recognized, + return -1 if malformed escape, + return -2 otherwise. */ +int lre_parse_escape(const uint8_t **pp, int allow_utf16) +{ + const uint8_t *p; + uint32_t c; + + p = *pp; + c = *p++; + switch(c) { + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; + break; + case 'v': + c = '\v'; + break; + case 'x': + case 'u': + { + int h, n, i; + uint32_t c1; + + if (*p == '{' && allow_utf16) { + p++; + c = 0; + for(;;) { + h = from_hex(*p++); + if (h < 0) + return -1; + c = (c << 4) | h; + if (c > 0x10FFFF) + return -1; + if (*p == '}') + break; + } + p++; + } else { + if (c == 'x') { + n = 2; + } else { + n = 4; + } + + c = 0; + for(i = 0; i < n; i++) { + h = from_hex(*p++); + if (h < 0) { + return -1; + } + c = (c << 4) | h; + } + if (c >= 0xd800 && c < 0xdc00 && + allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { + /* convert an escaped surrogate pair into a + unicode char */ + c1 = 0; + for(i = 0; i < 4; i++) { + h = from_hex(p[2 + i]); + if (h < 0) + break; + c1 = (c1 << 4) | h; + } + if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) { + p += 6; + c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + } + } + } + } + break; + case '0' ... '7': + c -= '0'; + if (allow_utf16 == 2) { + /* only accept \0 not followed by digit */ + if (c != 0 || is_digit(*p)) + return -1; + } else { + /* parse a legacy octal sequence */ + uint32_t v; + v = *p - '0'; + if (v > 7) + break; + c = (c << 3) | v; + p++; + if (c >= 32) + break; + v = *p - '0'; + if (v > 7) + break; + c = (c << 3) | v; + p++; + } + break; + default: + return -2; + } + *pp = p; + return c; +} + +#ifdef CONFIG_ALL_UNICODE +/* XXX: we use the same chars for name and value */ +static BOOL is_unicode_char(int c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c == '_')); +} + +static int parse_unicode_property(REParseState *s, CharRange *cr, + const uint8_t **pp, BOOL is_inv) +{ + const uint8_t *p; + char name[64], value[64]; + char *q; + BOOL script_ext; + int ret; + + p = *pp; + if (*p != '{') + return re_parse_error(s, "expecting '{' after \\p"); + p++; + q = name; + while (is_unicode_char(*p)) { + if ((q - name) > sizeof(name) - 1) + goto unknown_property_name; + *q++ = *p++; + } + *q = '\0'; + q = value; + if (*p == '=') { + p++; + while (is_unicode_char(*p)) { + if ((q - value) > sizeof(value) - 1) + return re_parse_error(s, "unknown unicode property value"); + *q++ = *p++; + } + } + *q = '\0'; + if (*p != '}') + return re_parse_error(s, "expecting '}'"); + p++; + // printf("name=%s value=%s\n", name, value); + + if (!strcmp(name, "Script") || !strcmp(name, "sc")) { + script_ext = FALSE; + goto do_script; + } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) { + script_ext = TRUE; + do_script: + cr_init(cr, s->mem_opaque, lre_realloc); + ret = unicode_script(cr, value, script_ext); + if (ret) { + cr_free(cr); + if (ret == -2) + return re_parse_error(s, "unknown unicode script"); + else + goto out_of_memory; + } + } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) { + cr_init(cr, s->mem_opaque, lre_realloc); + ret = unicode_general_category(cr, value); + if (ret) { + cr_free(cr); + if (ret == -2) + return re_parse_error(s, "unknown unicode general category"); + else + goto out_of_memory; + } + } else if (value[0] == '\0') { + cr_init(cr, s->mem_opaque, lre_realloc); + ret = unicode_general_category(cr, name); + if (ret == -1) { + cr_free(cr); + goto out_of_memory; + } + if (ret < 0) { + ret = unicode_prop(cr, name); + if (ret) { + cr_free(cr); + if (ret == -2) + goto unknown_property_name; + else + goto out_of_memory; + } + } + } else { + unknown_property_name: + return re_parse_error(s, "unknown unicode property name"); + } + + if (is_inv) { + if (cr_invert(cr)) { + cr_free(cr); + return -1; + } + } + *pp = p; + return 0; + out_of_memory: + return re_parse_error(s, "out of memory"); +} +#endif /* CONFIG_ALL_UNICODE */ + +/* return -1 if error otherwise the character or a class range + (CLASS_RANGE_BASE). In case of class range, 'cr' is + initialized. Otherwise, it is ignored. */ +static int get_class_atom(REParseState *s, CharRange *cr, + const uint8_t **pp, BOOL inclass) +{ + const uint8_t *p; + uint32_t c; + int ret; + + p = *pp; + + c = *p; + switch(c) { + case '\\': + p++; + if (p >= s->buf_end) + goto unexpected_end; + c = *p++; + switch(c) { + case 'd': + c = CHAR_RANGE_d; + goto class_range; + case 'D': + c = CHAR_RANGE_D; + goto class_range; + case 's': + c = CHAR_RANGE_s; + goto class_range; + case 'S': + c = CHAR_RANGE_S; + goto class_range; + case 'w': + c = CHAR_RANGE_w; + goto class_range; + case 'W': + c = CHAR_RANGE_W; + class_range: + if (cr_init_char_range(s, cr, c)) + return -1; + c = CLASS_RANGE_BASE; + break; + case 'c': + c = *p; + if ((c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (((c >= '0' && c <= '9') || c == '_') && + inclass && !s->is_utf16)) { /* Annex B.1.4 */ + c &= 0x1f; + p++; + } else if (s->is_utf16) { + goto invalid_escape; + } else { + /* otherwise return '\' and 'c' */ + p--; + c = '\\'; + } + break; +#ifdef CONFIG_ALL_UNICODE + case 'p': + case 'P': + if (s->is_utf16) { + if (parse_unicode_property(s, cr, &p, (c == 'P'))) + return -1; + c = CLASS_RANGE_BASE; + break; + } + /* fall thru */ +#endif + default: + p--; + ret = lre_parse_escape(&p, s->is_utf16 * 2); + if (ret >= 0) { + c = ret; + } else { + if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) { + /* always valid to escape these characters */ + goto normal_char; + } else if (s->is_utf16) { + invalid_escape: + return re_parse_error(s, "invalid escape sequence in regular expression"); + } else { + /* just ignore the '\' */ + goto normal_char; + } + } + break; + } + break; + case '\0': + if (p >= s->buf_end) { + unexpected_end: + return re_parse_error(s, "unexpected end"); + } + /* fall thru */ + default: + normal_char: + /* normal char */ + if (c >= 128) { + c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + if ((unsigned)c > 0xffff && !s->is_utf16) { + /* XXX: should handle non BMP-1 code points */ + return re_parse_error(s, "malformed unicode char"); + } + } else { + p++; + } + break; + } + *pp = p; + return c; +} + +static int re_emit_range(REParseState *s, const CharRange *cr) +{ + int len, i; + uint32_t high; + + len = (unsigned)cr->len / 2; + if (len >= 65535) + return re_parse_error(s, "too many ranges"); + if (len == 0) { + /* not sure it can really happen. Emit a match that is always + false */ + re_emit_op_u32(s, REOP_char32, -1); + } else { + high = cr->points[cr->len - 1]; + if (high == UINT32_MAX) + high = cr->points[cr->len - 2]; + if (high <= 0xffff) { + /* can use 16 bit ranges with the conversion that 0xffff = + infinity */ + re_emit_op_u16(s, REOP_range, len); + for(i = 0; i < cr->len; i += 2) { + dbuf_put_u16(&s->byte_code, cr->points[i]); + high = cr->points[i + 1] - 1; + if (high == UINT32_MAX - 1) + high = 0xffff; + dbuf_put_u16(&s->byte_code, high); + } + } else { + re_emit_op_u16(s, REOP_range32, len); + for(i = 0; i < cr->len; i += 2) { + dbuf_put_u32(&s->byte_code, cr->points[i]); + dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1); + } + } + } + return 0; +} + +static int re_parse_char_class(REParseState *s, const uint8_t **pp) +{ + const uint8_t *p; + uint32_t c1, c2; + CharRange cr_s, *cr = &cr_s; + CharRange cr1_s, *cr1 = &cr1_s; + BOOL invert; + + cr_init(cr, s->mem_opaque, lre_realloc); + p = *pp; + p++; /* skip '[' */ + invert = FALSE; + if (*p == '^') { + p++; + invert = TRUE; + } + for(;;) { + if (*p == ']') + break; + c1 = get_class_atom(s, cr1, &p, TRUE); + if ((int)c1 < 0) + goto fail; + if (*p == '-' && p[1] != ']') { + const uint8_t *p0 = p + 1; + if (c1 >= CLASS_RANGE_BASE) { + if (s->is_utf16) { + cr_free(cr1); + goto invalid_class_range; + } + /* Annex B: match '-' character */ + goto class_atom; + } + c2 = get_class_atom(s, cr1, &p0, TRUE); + if ((int)c2 < 0) + goto fail; + if (c2 >= CLASS_RANGE_BASE) { + cr_free(cr1); + if (s->is_utf16) { + goto invalid_class_range; + } + /* Annex B: match '-' character */ + goto class_atom; + } + p = p0; + if (c2 < c1) { + invalid_class_range: + re_parse_error(s, "invalid class range"); + goto fail; + } + if (cr_union_interval(cr, c1, c2)) + goto memory_error; + } else { + class_atom: + if (c1 >= CLASS_RANGE_BASE) { + int ret; + ret = cr_union1(cr, cr1->points, cr1->len); + cr_free(cr1); + if (ret) + goto memory_error; + } else { + if (cr_union_interval(cr, c1, c1)) + goto memory_error; + } + } + } + if (s->ignore_case) { + if (cr_canonicalize(cr)) + goto memory_error; + } + if (invert) { + if (cr_invert(cr)) + goto memory_error; + } + if (re_emit_range(s, cr)) + goto fail; + cr_free(cr); + p++; /* skip ']' */ + *pp = p; + return 0; + memory_error: + re_parse_error(s, "out of memory"); + fail: + cr_free(cr); + return -1; +} + +/* Return: + 1 if the opcodes in bc_buf[] always advance the character pointer. + 0 if the character pointer may not be advanced. + -1 if the code may depend on side effects of its previous execution (backreference) +*/ +static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) +{ + int pos, opcode, ret, len, i; + uint32_t val, last; + BOOL has_back_reference; + uint8_t capture_bitmap[CAPTURE_COUNT_MAX]; + + ret = -2; /* not known yet */ + pos = 0; + has_back_reference = FALSE; + memset(capture_bitmap, 0, sizeof(capture_bitmap)); + + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + switch(opcode) { + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + goto simple_char; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + goto simple_char; + case REOP_char: + case REOP_char32: + case REOP_dot: + case REOP_any: + simple_char: + if (ret == -2) + ret = 1; + break; + case REOP_line_start: + case REOP_line_end: + case REOP_push_i32: + case REOP_push_char_pos: + case REOP_drop: + case REOP_word_boundary: + case REOP_not_word_boundary: + case REOP_prev: + /* no effect */ + break; + case REOP_save_start: + case REOP_save_end: + val = bc_buf[pos + 1]; + capture_bitmap[val] |= 1; + break; + case REOP_save_reset: + { + val = bc_buf[pos + 1]; + last = bc_buf[pos + 2]; + while (val < last) + capture_bitmap[val++] |= 1; + } + break; + case REOP_back_reference: + case REOP_backward_back_reference: + val = bc_buf[pos + 1]; + capture_bitmap[val] |= 2; + has_back_reference = TRUE; + break; + default: + /* safe behvior: we cannot predict the outcome */ + if (ret == -2) + ret = 0; + break; + } + pos += len; + } + if (has_back_reference) { + /* check if there is back reference which references a capture + made in the some code */ + for(i = 0; i < CAPTURE_COUNT_MAX; i++) { + if (capture_bitmap[i] == 3) + return -1; + } + } + if (ret == -2) + ret = 0; + return ret; +} + +/* return -1 if a simple quantifier cannot be used. Otherwise return + the number of characters in the atom. */ +static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) +{ + int pos, opcode, len, count; + uint32_t val; + + count = 0; + pos = 0; + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + switch(opcode) { + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + goto simple_char; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + goto simple_char; + case REOP_char: + case REOP_char32: + case REOP_dot: + case REOP_any: + simple_char: + count++; + break; + case REOP_line_start: + case REOP_line_end: + case REOP_word_boundary: + case REOP_not_word_boundary: + break; + default: + return -1; + } + pos += len; + } + return count; +} + +/* '*pp' is the first char after '<' */ +static int re_parse_group_name(char *buf, int buf_size, + const uint8_t **pp, BOOL is_utf16) +{ + const uint8_t *p; + uint32_t c; + char *q; + + p = *pp; + q = buf; + for(;;) { + c = *p; + if (c == '\\') { + p++; + if (*p != 'u') + return -1; + c = lre_parse_escape(&p, is_utf16 * 2); + } else if (c == '>') { + break; + } else if (c >= 128) { + c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + } else { + p++; + } + if (c > 0x10FFFF) + return -1; + if (q == buf) { + if (!lre_js_is_ident_first(c)) + return -1; + } else { + if (!lre_js_is_ident_next(c)) + return -1; + } + if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) + return -1; + if (c < 128) { + *q++ = c; + } else { + q += unicode_to_utf8((uint8_t*)q, c); + } + } + if (q == buf) + return -1; + *q = '\0'; + p++; + *pp = p; + return 0; +} + +/* if capture_name = NULL: return the number of captures + 1. + Otherwise, return the capture index corresponding to capture_name + or -1 if none */ +static int re_parse_captures(REParseState *s, int *phas_named_captures, + const char *capture_name) +{ + const uint8_t *p; + int capture_index; + char name[TMP_BUF_SIZE]; + + capture_index = 1; + *phas_named_captures = 0; + for (p = s->buf_start; p < s->buf_end; p++) { + switch (*p) { + case '(': + if (p[1] == '?') { + if (p[2] == '<' && p[3] != '=' && p[3] != '!') { + *phas_named_captures = 1; + /* potential named capture */ + if (capture_name) { + p += 3; + if (re_parse_group_name(name, sizeof(name), &p, + s->is_utf16) == 0) { + if (!strcmp(name, capture_name)) + return capture_index; + } + } + capture_index++; + } + } else { + capture_index++; + } + break; + case '\\': + p++; + break; + case '[': + for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) { + if (*p == '\\') + p++; + } + break; + } + } + if (capture_name) + return -1; + else + return capture_index; +} + +static int re_count_captures(REParseState *s) +{ + if (s->total_capture_count < 0) { + s->total_capture_count = re_parse_captures(s, &s->has_named_captures, + NULL); + } + return s->total_capture_count; +} + +static BOOL re_has_named_captures(REParseState *s) +{ + if (s->has_named_captures < 0) + re_count_captures(s); + return s->has_named_captures; +} + +static int find_group_name(REParseState *s, const char *name) +{ + const char *p, *buf_end; + size_t len, name_len; + int capture_index; + + name_len = strlen(name); + p = (char *)s->group_names.buf; + buf_end = (char *)s->group_names.buf + s->group_names.size; + capture_index = 1; + while (p < buf_end) { + len = strlen(p); + if (len == name_len && memcmp(name, p, name_len) == 0) + return capture_index; + p += len + 1; + capture_index++; + } + return -1; +} + +static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir); + +static int re_parse_term(REParseState *s, BOOL is_backward_dir) +{ + const uint8_t *p; + int c, last_atom_start, quant_min, quant_max, last_capture_count; + BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead; + CharRange cr_s, *cr = &cr_s; + + last_atom_start = -1; + last_capture_count = 0; + p = s->buf_ptr; + c = *p; + switch(c) { + case '^': + p++; + re_emit_op(s, REOP_line_start); + break; + case '$': + p++; + re_emit_op(s, REOP_line_end); + break; + case '.': + p++; + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + re_emit_op(s, s->dotall ? REOP_any : REOP_dot); + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; + case '{': + /* As an extension (see ES6 annex B), we accept '{' not + followed by digits as a normal atom */ + if (!is_digit(p[1])) { + if (s->is_utf16) + goto invalid_quant_count; + goto parse_class_atom; + } + /* fall tru */ + case '*': + case '+': + case '?': + return re_parse_error(s, "nothing to repeat"); + case '(': + if (p[1] == '?') { + if (p[2] == ':') { + p += 3; + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_dir)) + return -1; + p = s->buf_ptr; + if (re_parse_expect(s, &p, ')')) + return -1; + } else if ((p[2] == '=' || p[2] == '!')) { + is_neg = (p[2] == '!'); + is_backward_lookahead = FALSE; + p += 3; + goto lookahead; + } else if (p[2] == '<' && + (p[3] == '=' || p[3] == '!')) { + int pos; + is_neg = (p[3] == '!'); + is_backward_lookahead = TRUE; + p += 4; + /* lookahead */ + lookahead: + /* Annex B allows lookahead to be used as an atom for + the quantifiers */ + if (!s->is_utf16 && !is_backward_lookahead) { + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + } + pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0); + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_lookahead)) + return -1; + p = s->buf_ptr; + if (re_parse_expect(s, &p, ')')) + return -1; + re_emit_op(s, REOP_match); + /* jump after the 'match' after the lookahead is successful */ + put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4)); + } else if (p[2] == '<') { + p += 3; + if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), + &p, s->is_utf16)) { + return re_parse_error(s, "invalid group name"); + } + if (find_group_name(s, s->u.tmp_buf) > 0) { + return re_parse_error(s, "duplicate group name"); + } + /* group name with a trailing zero */ + dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf, + strlen(s->u.tmp_buf) + 1); + s->has_named_captures = 1; + goto parse_capture; + } else { + return re_parse_error(s, "invalid group"); + } + } else { + int capture_index; + p++; + /* capture without group name */ + dbuf_putc(&s->group_names, 0); + parse_capture: + if (s->capture_count >= CAPTURE_COUNT_MAX) + return re_parse_error(s, "too many captures"); + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + capture_index = s->capture_count++; + re_emit_op_u8(s, REOP_save_start + is_backward_dir, + capture_index); + + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_dir)) + return -1; + p = s->buf_ptr; + + re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir, + capture_index); + + if (re_parse_expect(s, &p, ')')) + return -1; + } + break; + case '\\': + switch(p[1]) { + case 'b': + case 'B': + re_emit_op(s, REOP_word_boundary + (p[1] != 'b')); + p += 2; + break; + case 'k': + { + const uint8_t *p1; + int dummy_res; + + p1 = p; + if (p1[2] != '<') { + /* annex B: we tolerate invalid group names in non + unicode mode if there is no named capture + definition */ + if (s->is_utf16 || re_has_named_captures(s)) + return re_parse_error(s, "expecting group name"); + else + goto parse_class_atom; + } + p1 += 3; + if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), + &p1, s->is_utf16)) { + if (s->is_utf16 || re_has_named_captures(s)) + return re_parse_error(s, "invalid group name"); + else + goto parse_class_atom; + } + c = find_group_name(s, s->u.tmp_buf); + if (c < 0) { + /* no capture name parsed before, try to look + after (inefficient, but hopefully not common */ + c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); + if (c < 0) { + if (s->is_utf16 || re_has_named_captures(s)) + return re_parse_error(s, "group name not defined"); + else + goto parse_class_atom; + } + } + p = p1; + } + goto emit_back_reference; + case '0': + p += 2; + c = 0; + if (s->is_utf16) { + if (is_digit(*p)) { + return re_parse_error(s, "invalid decimal escape in regular expression"); + } + } else { + /* Annex B.1.4: accept legacy octal */ + if (*p >= '0' && *p <= '7') { + c = *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + } + } + } + goto normal_char; + case '1' ... '9': + { + const uint8_t *q = ++p; + + c = parse_digits(&p); + if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { + if (!s->is_utf16) { + /* Annex B.1.4: accept legacy octal */ + p = q; + if (*p <= '7') { + c = 0; + if (*p <= '3') + c = *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + } + } + } else { + c = *p++; + } + goto normal_char; + } + return re_parse_error(s, "back reference out of range in reguar expression"); + } + emit_back_reference: + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c); + } + break; + default: + goto parse_class_atom; + } + break; + case '[': + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + if (re_parse_char_class(s, &p)) + return -1; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; + case ']': + case '}': + if (s->is_utf16) + return re_parse_error(s, "syntax error"); + goto parse_class_atom; + default: + parse_class_atom: + c = get_class_atom(s, cr, &p, FALSE); + if ((int)c < 0) + return -1; + normal_char: + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + if (c >= CLASS_RANGE_BASE) { + int ret; + /* Note: canonicalization is not needed */ + ret = re_emit_range(s, cr); + cr_free(cr); + if (ret) + return -1; + } else { + if (s->ignore_case) + c = lre_canonicalize(c, s->is_utf16); + if (c <= 0xffff) + re_emit_op_u16(s, REOP_char, c); + else + re_emit_op_u32(s, REOP_char32, c); + } + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; + } + + /* quantifier */ + if (last_atom_start >= 0) { + c = *p; + switch(c) { + case '*': + p++; + quant_min = 0; + quant_max = INT32_MAX; + goto quantifier; + case '+': + p++; + quant_min = 1; + quant_max = INT32_MAX; + goto quantifier; + case '?': + p++; + quant_min = 0; + quant_max = 1; + goto quantifier; + case '{': + /* As an extension (see ES6 annex B), we accept '{' not + followed by digits as a normal atom */ + if (!is_digit(p[1])) { + if (s->is_utf16) + goto invalid_quant_count; + break; + } + p++; + quant_min = parse_digits(&p); + if (quant_min < 0) { + invalid_quant_count: + return re_parse_error(s, "invalid repetition count"); + } + quant_max = quant_min; + if (*p == ',') { + p++; + if (is_digit(*p)) { + quant_max = parse_digits(&p); + if (quant_max < 0 || quant_max < quant_min) + goto invalid_quant_count; + } else { + quant_max = INT32_MAX; /* infinity */ + } + } + if (re_parse_expect(s, &p, '}')) + return -1; + quantifier: + greedy = TRUE; + if (*p == '?') { + p++; + greedy = FALSE; + } + if (last_atom_start < 0) { + return re_parse_error(s, "nothing to repeat"); + } + if (greedy) { + int len, pos; + + if (quant_max > 0) { + /* specific optimization for simple quantifiers */ + len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start, + s->byte_code.size - last_atom_start); + if (len > 0) { + re_emit_op(s, REOP_match); + + dbuf_insert(&s->byte_code, last_atom_start, 17); + pos = last_atom_start; + s->byte_code.buf[pos++] = REOP_simple_greedy_quant; + put_u32(&s->byte_code.buf[pos], + s->byte_code.size - last_atom_start - 17); + pos += 4; + put_u32(&s->byte_code.buf[pos], quant_min); + pos += 4; + put_u32(&s->byte_code.buf[pos], quant_max); + pos += 4; + put_u32(&s->byte_code.buf[pos], len); + pos += 4; + goto done; + } + } + + add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start, + s->byte_code.size - last_atom_start) == 0); + } else { + add_zero_advance_check = FALSE; + } + + { + int len, pos; + len = s->byte_code.size - last_atom_start; + if (quant_min == 0) { + /* need to reset the capture in case the atom is + not executed */ + if (last_capture_count != s->capture_count) { + dbuf_insert(&s->byte_code, last_atom_start, 3); + s->byte_code.buf[last_atom_start++] = REOP_save_reset; + s->byte_code.buf[last_atom_start++] = last_capture_count; + s->byte_code.buf[last_atom_start++] = s->capture_count - 1; + } + if (quant_max == 0) { + s->byte_code.size = last_atom_start; + } else if (quant_max == 1) { + dbuf_insert(&s->byte_code, last_atom_start, 5); + s->byte_code.buf[last_atom_start] = REOP_split_goto_first + + greedy; + put_u32(s->byte_code.buf + last_atom_start + 1, len); + } else if (quant_max == INT32_MAX) { + dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check); + s->byte_code.buf[last_atom_start] = REOP_split_goto_first + + greedy; + put_u32(s->byte_code.buf + last_atom_start + 1, + len + 5 + add_zero_advance_check); + if (add_zero_advance_check) { + /* avoid infinite loop by stoping the + recursion if no advance was made in the + atom (only works if the atom has no + side effect) */ + s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; + re_emit_goto(s, REOP_bne_char_pos, last_atom_start); + } else { + re_emit_goto(s, REOP_goto, last_atom_start); + } + } else { + dbuf_insert(&s->byte_code, last_atom_start, 10); + pos = last_atom_start; + s->byte_code.buf[pos++] = REOP_push_i32; + put_u32(s->byte_code.buf + pos, quant_max); + pos += 4; + s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; + put_u32(s->byte_code.buf + pos, len + 5); + re_emit_goto(s, REOP_loop, last_atom_start + 5); + re_emit_op(s, REOP_drop); + } + } else if (quant_min == 1 && quant_max == INT32_MAX && + !add_zero_advance_check) { + re_emit_goto(s, REOP_split_next_first - greedy, + last_atom_start); + } else { + if (quant_min == 1) { + /* nothing to add */ + } else { + dbuf_insert(&s->byte_code, last_atom_start, 5); + s->byte_code.buf[last_atom_start] = REOP_push_i32; + put_u32(s->byte_code.buf + last_atom_start + 1, + quant_min); + last_atom_start += 5; + re_emit_goto(s, REOP_loop, last_atom_start); + re_emit_op(s, REOP_drop); + } + if (quant_max == INT32_MAX) { + pos = s->byte_code.size; + re_emit_op_u32(s, REOP_split_goto_first + greedy, + len + 5 + add_zero_advance_check); + if (add_zero_advance_check) + re_emit_op(s, REOP_push_char_pos); + /* copy the atom */ + dbuf_put_self(&s->byte_code, last_atom_start, len); + if (add_zero_advance_check) + re_emit_goto(s, REOP_bne_char_pos, pos); + else + re_emit_goto(s, REOP_goto, pos); + } else if (quant_max > quant_min) { + re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); + pos = s->byte_code.size; + re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5); + /* copy the atom */ + dbuf_put_self(&s->byte_code, last_atom_start, len); + + re_emit_goto(s, REOP_loop, pos); + re_emit_op(s, REOP_drop); + } + } + last_atom_start = -1; + } + break; + default: + break; + } + } + done: + s->buf_ptr = p; + return 0; +} + +static int re_parse_alternative(REParseState *s, BOOL is_backward_dir) +{ + const uint8_t *p; + int ret; + size_t start, term_start, end, term_size; + + start = s->byte_code.size; + for(;;) { + p = s->buf_ptr; + if (p >= s->buf_end) + break; + if (*p == '|' || *p == ')') + break; + term_start = s->byte_code.size; + ret = re_parse_term(s, is_backward_dir); + if (ret) + return ret; + if (is_backward_dir) { + /* reverse the order of the terms (XXX: inefficient, but + speed is not really critical here) */ + end = s->byte_code.size; + term_size = end - term_start; + if (dbuf_realloc(&s->byte_code, end + term_size)) + return -1; + memmove(s->byte_code.buf + start + term_size, + s->byte_code.buf + start, + end - start); + memcpy(s->byte_code.buf + start, s->byte_code.buf + end, + term_size); + } + } + return 0; +} + +static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir) +{ + int start, len, pos; + + start = s->byte_code.size; + if (re_parse_alternative(s, is_backward_dir)) + return -1; + while (*s->buf_ptr == '|') { + s->buf_ptr++; + + len = s->byte_code.size - start; + + /* insert a split before the first alternative */ + dbuf_insert(&s->byte_code, start, 5); + s->byte_code.buf[start] = REOP_split_next_first; + put_u32(s->byte_code.buf + start + 1, len + 5); + + pos = re_emit_op_u32(s, REOP_goto, 0); + + if (re_parse_alternative(s, is_backward_dir)) + return -1; + + /* patch the goto */ + len = s->byte_code.size - (pos + 4); + put_u32(s->byte_code.buf + pos, len); + } + return 0; +} + +/* the control flow is recursive so the analysis can be linear */ +static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) +{ + int stack_size, stack_size_max, pos, opcode, len; + uint32_t val; + + stack_size = 0; + stack_size_max = 0; + bc_buf += RE_HEADER_LEN; + bc_buf_len -= RE_HEADER_LEN; + pos = 0; + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + assert(opcode < REOP_COUNT); + assert((pos + len) <= bc_buf_len); + switch(opcode) { + case REOP_push_i32: + case REOP_push_char_pos: + stack_size++; + if (stack_size > stack_size_max) { + if (stack_size > STACK_SIZE_MAX) + return -1; + stack_size_max = stack_size; + } + break; + case REOP_drop: + case REOP_bne_char_pos: + assert(stack_size > 0); + stack_size--; + break; + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + break; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + break; + } + pos += len; + } + return stack_size_max; +} + +/* 'buf' must be a zero terminated UTF-8 string of length buf_len. + Return NULL if error and allocate an error message in *perror_msg, + otherwise the compiled bytecode and its length in plen. +*/ +uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, + const char *buf, size_t buf_len, int re_flags, + void *opaque) +{ + REParseState s_s, *s = &s_s; + int stack_size; + BOOL is_sticky; + + memset(s, 0, sizeof(*s)); + s->mem_opaque = opaque; + s->buf_ptr = (const uint8_t *)buf; + s->buf_end = s->buf_ptr + buf_len; + s->buf_start = s->buf_ptr; + s->re_flags = re_flags; + s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0); + is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); + s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); + s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); + s->capture_count = 1; + s->total_capture_count = -1; + s->has_named_captures = -1; + + dbuf_init2(&s->byte_code, opaque, lre_realloc); + dbuf_init2(&s->group_names, opaque, lre_realloc); + + dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */ + dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */ + dbuf_putc(&s->byte_code, 0); /* stack size */ + dbuf_put_u32(&s->byte_code, 0); /* bytecode length */ + + if (!is_sticky) { + /* iterate thru all positions (about the same as .*?( ... ) ) + . We do it without an explicit loop so that lock step + thread execution will be possible in an optimized + implementation */ + re_emit_op_u32(s, REOP_split_goto_first, 1 + 5); + re_emit_op(s, REOP_any); + re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5)); + } + re_emit_op_u8(s, REOP_save_start, 0); + + if (re_parse_disjunction(s, FALSE)) { + error: + dbuf_free(&s->byte_code); + dbuf_free(&s->group_names); + pstrcpy(error_msg, error_msg_size, s->u.error_msg); + *plen = 0; + return NULL; + } + + re_emit_op_u8(s, REOP_save_end, 0); + + re_emit_op(s, REOP_match); + + if (*s->buf_ptr != '\0') { + re_parse_error(s, "extraneous characters at the end"); + goto error; + } + + if (dbuf_error(&s->byte_code)) { + re_parse_error(s, "out of memory"); + goto error; + } + + stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); + if (stack_size < 0) { + re_parse_error(s, "too many imbricated quantifiers"); + goto error; + } + + s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; + s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; + put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN); + + /* add the named groups if needed */ + if (s->group_names.size > (s->capture_count - 1)) { + dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size); + s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS; + } + dbuf_free(&s->group_names); + +#ifdef DUMP_REOP + lre_dump_bytecode(s->byte_code.buf, s->byte_code.size); +#endif + + error_msg[0] = '\0'; + *plen = s->byte_code.size; + return s->byte_code.buf; +} + +static BOOL is_line_terminator(uint32_t c) +{ + return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); +} + +static BOOL is_word_char(uint32_t c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c == '_')); +} + +#define GET_CHAR(c, cptr, cbuf_end) \ + do { \ + if (cbuf_type == 0) { \ + c = *cptr++; \ + } else { \ + uint32_t __c1; \ + c = *(uint16_t *)cptr; \ + cptr += 2; \ + if (c >= 0xd800 && c < 0xdc00 && \ + cbuf_type == 2 && cptr < cbuf_end) { \ + __c1 = *(uint16_t *)cptr; \ + if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ + c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ + cptr += 2; \ + } \ + } \ + } \ + } while (0) + +#define PEEK_CHAR(c, cptr, cbuf_end) \ + do { \ + if (cbuf_type == 0) { \ + c = cptr[0]; \ + } else { \ + uint32_t __c1; \ + c = ((uint16_t *)cptr)[0]; \ + if (c >= 0xd800 && c < 0xdc00 && \ + cbuf_type == 2 && (cptr + 2) < cbuf_end) { \ + __c1 = ((uint16_t *)cptr)[1]; \ + if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ + c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ + } \ + } \ + } \ + } while (0) + +#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + c = cptr[-1]; \ + } else { \ + uint32_t __c1; \ + c = ((uint16_t *)cptr)[-1]; \ + if (c >= 0xdc00 && c < 0xe000 && \ + cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \ + __c1 = ((uint16_t *)cptr)[-2]; \ + if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ + c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ + } \ + } \ + } \ + } while (0) + +#define GET_PREV_CHAR(c, cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + cptr--; \ + c = cptr[0]; \ + } else { \ + uint32_t __c1; \ + cptr -= 2; \ + c = ((uint16_t *)cptr)[0]; \ + if (c >= 0xdc00 && c < 0xe000 && \ + cbuf_type == 2 && cptr > cbuf_start) { \ + __c1 = ((uint16_t *)cptr)[-1]; \ + if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ + cptr -= 2; \ + c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ + } \ + } \ + } \ + } while (0) + +#define PREV_CHAR(cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + cptr--; \ + } else { \ + cptr -= 2; \ + if (cbuf_type == 2) { \ + c = ((uint16_t *)cptr)[0]; \ + if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \ + c = ((uint16_t *)cptr)[-1]; \ + if (c >= 0xd800 && c < 0xdc00) \ + cptr -= 2; \ + } \ + } \ + } \ + } while (0) + +typedef uintptr_t StackInt; + +typedef enum { + RE_EXEC_STATE_SPLIT, + RE_EXEC_STATE_LOOKAHEAD, + RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, + RE_EXEC_STATE_GREEDY_QUANT, +} REExecStateEnum; + +typedef struct REExecState { + REExecStateEnum type : 8; + uint8_t stack_len; + size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */ + const uint8_t *cptr; + const uint8_t *pc; + void *buf[0]; +} REExecState; + +typedef struct { + const uint8_t *cbuf; + const uint8_t *cbuf_end; + /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */ + int cbuf_type; + int capture_count; + int stack_size_max; + BOOL multi_line; + BOOL ignore_case; + BOOL is_utf16; + void *opaque; /* used for stack overflow check */ + + size_t state_size; + uint8_t *state_stack; + size_t state_stack_size; + size_t state_stack_len; +} REExecContext; + +static int push_state(REExecContext *s, + uint8_t **capture, + StackInt *stack, size_t stack_len, + const uint8_t *pc, const uint8_t *cptr, + REExecStateEnum type, size_t count) +{ + REExecState *rs; + uint8_t *new_stack; + size_t new_size, i, n; + StackInt *stack_buf; + + if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) { + /* reallocate the stack */ + new_size = s->state_stack_size * 3 / 2; + if (new_size < 8) + new_size = 8; + new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size); + if (!new_stack) + return -1; + s->state_stack_size = new_size; + s->state_stack = new_stack; + } + rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size); + s->state_stack_len++; + rs->type = type; + rs->count = count; + rs->stack_len = stack_len; + rs->cptr = cptr; + rs->pc = pc; + n = 2 * s->capture_count; + for(i = 0; i < n; i++) + rs->buf[i] = capture[i]; + stack_buf = (StackInt *)(rs->buf + n); + for(i = 0; i < stack_len; i++) + stack_buf[i] = stack[i]; + return 0; +} + +/* return 1 if match, 0 if not match or -1 if error. */ +static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, + StackInt *stack, int stack_len, + const uint8_t *pc, const uint8_t *cptr, + BOOL no_recurse) +{ + int opcode, ret; + int cbuf_type; + uint32_t val, c; + const uint8_t *cbuf_end; + + cbuf_type = s->cbuf_type; + cbuf_end = s->cbuf_end; + + for(;;) { + // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN))); + opcode = *pc++; + switch(opcode) { + case REOP_match: + { + REExecState *rs; + if (no_recurse) + return (intptr_t)cptr; + ret = 1; + goto recurse; + no_match: + if (no_recurse) + return 0; + ret = 0; + recurse: + for(;;) { + if (s->state_stack_len == 0) + return ret; + rs = (REExecState *)(s->state_stack + + (s->state_stack_len - 1) * s->state_size); + if (rs->type == RE_EXEC_STATE_SPLIT) { + if (!ret) { + pop_state: + memcpy(capture, rs->buf, + sizeof(capture[0]) * 2 * s->capture_count); + pop_state1: + pc = rs->pc; + cptr = rs->cptr; + stack_len = rs->stack_len; + memcpy(stack, rs->buf + 2 * s->capture_count, + stack_len * sizeof(stack[0])); + s->state_stack_len--; + break; + } + } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) { + if (!ret) { + uint32_t char_count, i; + memcpy(capture, rs->buf, + sizeof(capture[0]) * 2 * s->capture_count); + stack_len = rs->stack_len; + memcpy(stack, rs->buf + 2 * s->capture_count, + stack_len * sizeof(stack[0])); + pc = rs->pc; + cptr = rs->cptr; + /* go backward */ + char_count = get_u32(pc + 12); + for(i = 0; i < char_count; i++) { + PREV_CHAR(cptr, s->cbuf); + } + pc = (pc + 16) + (int)get_u32(pc); + rs->cptr = cptr; + rs->count--; + if (rs->count == 0) { + s->state_stack_len--; + } + break; + } + } else { + ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) || + (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret)); + if (ret) { + /* keep the capture in case of positive lookahead */ + if (rs->type == RE_EXEC_STATE_LOOKAHEAD) + goto pop_state1; + else + goto pop_state; + } + } + s->state_stack_len--; + } + } + break; + case REOP_char32: + val = get_u32(pc); + pc += 4; + goto test_char; + case REOP_char: + val = get_u16(pc); + pc += 2; + test_char: + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + if (s->ignore_case) { + c = lre_canonicalize(c, s->is_utf16); + } + if (val != c) + goto no_match; + break; + case REOP_split_goto_first: + case REOP_split_next_first: + { + const uint8_t *pc1; + + val = get_u32(pc); + pc += 4; + if (opcode == REOP_split_next_first) { + pc1 = pc + (int)val; + } else { + pc1 = pc; + pc = pc + (int)val; + } + ret = push_state(s, capture, stack, stack_len, + pc1, cptr, RE_EXEC_STATE_SPLIT, 0); + if (ret < 0) + return -1; + break; + } + case REOP_lookahead: + case REOP_negative_lookahead: + val = get_u32(pc); + pc += 4; + ret = push_state(s, capture, stack, stack_len, + pc + (int)val, cptr, + RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, + 0); + if (ret < 0) + return -1; + break; + + case REOP_goto: + val = get_u32(pc); + pc += 4 + (int)val; + break; + case REOP_line_start: + if (cptr == s->cbuf) + break; + if (!s->multi_line) + goto no_match; + PEEK_PREV_CHAR(c, cptr, s->cbuf); + if (!is_line_terminator(c)) + goto no_match; + break; + case REOP_line_end: + if (cptr == cbuf_end) + break; + if (!s->multi_line) + goto no_match; + PEEK_CHAR(c, cptr, cbuf_end); + if (!is_line_terminator(c)) + goto no_match; + break; + case REOP_dot: + if (cptr == cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + if (is_line_terminator(c)) + goto no_match; + break; + case REOP_any: + if (cptr == cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + break; + case REOP_save_start: + case REOP_save_end: + val = *pc++; + assert(val < s->capture_count); + capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr; + break; + case REOP_save_reset: + { + uint32_t val2; + val = pc[0]; + val2 = pc[1]; + pc += 2; + assert(val2 < s->capture_count); + while (val <= val2) { + capture[2 * val] = NULL; + capture[2 * val + 1] = NULL; + val++; + } + } + break; + case REOP_push_i32: + val = get_u32(pc); + pc += 4; + stack[stack_len++] = val; + break; + case REOP_drop: + stack_len--; + break; + case REOP_loop: + val = get_u32(pc); + pc += 4; + if (--stack[stack_len - 1] != 0) { + pc += (int)val; + } + break; + case REOP_push_char_pos: + stack[stack_len++] = (uintptr_t)cptr; + break; + case REOP_bne_char_pos: + val = get_u32(pc); + pc += 4; + if (stack[--stack_len] != (uintptr_t)cptr) + pc += (int)val; + break; + case REOP_word_boundary: + case REOP_not_word_boundary: + { + BOOL v1, v2; + /* char before */ + if (cptr == s->cbuf) { + v1 = FALSE; + } else { + PEEK_PREV_CHAR(c, cptr, s->cbuf); + v1 = is_word_char(c); + } + /* current char */ + if (cptr >= cbuf_end) { + v2 = FALSE; + } else { + PEEK_CHAR(c, cptr, cbuf_end); + v2 = is_word_char(c); + } + if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) + goto no_match; + } + break; + case REOP_back_reference: + case REOP_backward_back_reference: + { + const uint8_t *cptr1, *cptr1_end, *cptr1_start; + uint32_t c1, c2; + + val = *pc++; + if (val >= s->capture_count) + goto no_match; + cptr1_start = capture[2 * val]; + cptr1_end = capture[2 * val + 1]; + if (!cptr1_start || !cptr1_end) + break; + if (opcode == REOP_back_reference) { + cptr1 = cptr1_start; + while (cptr1 < cptr1_end) { + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c1, cptr1, cptr1_end); + GET_CHAR(c2, cptr, cbuf_end); + if (s->ignore_case) { + c1 = lre_canonicalize(c1, s->is_utf16); + c2 = lre_canonicalize(c2, s->is_utf16); + } + if (c1 != c2) + goto no_match; + } + } else { + cptr1 = cptr1_end; + while (cptr1 > cptr1_start) { + if (cptr == s->cbuf) + goto no_match; + GET_PREV_CHAR(c1, cptr1, cptr1_start); + GET_PREV_CHAR(c2, cptr, s->cbuf); + if (s->ignore_case) { + c1 = lre_canonicalize(c1, s->is_utf16); + c2 = lre_canonicalize(c2, s->is_utf16); + } + if (c1 != c2) + goto no_match; + } + } + } + break; + case REOP_range: + { + int n; + uint32_t low, high, idx_min, idx_max, idx; + + n = get_u16(pc); /* n must be >= 1 */ + pc += 2; + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + if (s->ignore_case) { + c = lre_canonicalize(c, s->is_utf16); + } + idx_min = 0; + low = get_u16(pc + 0 * 4); + if (c < low) + goto no_match; + idx_max = n - 1; + high = get_u16(pc + idx_max * 4 + 2); + /* 0xffff in for last value means +infinity */ + if (unlikely(c >= 0xffff) && high == 0xffff) + goto range_match; + if (c > high) + goto no_match; + while (idx_min <= idx_max) { + idx = (idx_min + idx_max) / 2; + low = get_u16(pc + idx * 4); + high = get_u16(pc + idx * 4 + 2); + if (c < low) + idx_max = idx - 1; + else if (c > high) + idx_min = idx + 1; + else + goto range_match; + } + goto no_match; + range_match: + pc += 4 * n; + } + break; + case REOP_range32: + { + int n; + uint32_t low, high, idx_min, idx_max, idx; + + n = get_u16(pc); /* n must be >= 1 */ + pc += 2; + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + if (s->ignore_case) { + c = lre_canonicalize(c, s->is_utf16); + } + idx_min = 0; + low = get_u32(pc + 0 * 8); + if (c < low) + goto no_match; + idx_max = n - 1; + high = get_u32(pc + idx_max * 8 + 4); + if (c > high) + goto no_match; + while (idx_min <= idx_max) { + idx = (idx_min + idx_max) / 2; + low = get_u32(pc + idx * 8); + high = get_u32(pc + idx * 8 + 4); + if (c < low) + idx_max = idx - 1; + else if (c > high) + idx_min = idx + 1; + else + goto range32_match; + } + goto no_match; + range32_match: + pc += 8 * n; + } + break; + case REOP_prev: + /* go to the previous char */ + if (cptr == s->cbuf) + goto no_match; + PREV_CHAR(cptr, s->cbuf); + break; + case REOP_simple_greedy_quant: + { + uint32_t next_pos, quant_min, quant_max; + size_t q; + intptr_t res; + const uint8_t *pc1; + + next_pos = get_u32(pc); + quant_min = get_u32(pc + 4); + quant_max = get_u32(pc + 8); + pc += 16; + pc1 = pc; + pc += (int)next_pos; + + q = 0; + for(;;) { + res = lre_exec_backtrack(s, capture, stack, stack_len, + pc1, cptr, TRUE); + if (res == -1) + return res; + if (!res) + break; + cptr = (uint8_t *)res; + q++; + if (q >= quant_max && quant_max != INT32_MAX) + break; + } + if (q < quant_min) + goto no_match; + if (q > quant_min) { + /* will examine all matches down to quant_min */ + ret = push_state(s, capture, stack, stack_len, + pc1 - 16, cptr, + RE_EXEC_STATE_GREEDY_QUANT, + q - quant_min); + if (ret < 0) + return -1; + } + } + break; + default: + abort(); + } + } +} + +/* Return 1 if match, 0 if not match or -1 if error. cindex is the + starting position of the match and must be such as 0 <= cindex <= + clen. */ +int lre_exec(uint8_t **capture, + const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen, + int cbuf_type, void *opaque) +{ + REExecContext s_s, *s = &s_s; + int re_flags, i, alloca_size, ret; + StackInt *stack_buf; + + re_flags = bc_buf[RE_HEADER_FLAGS]; + s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; + s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; + s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0; + s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; + s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; + s->cbuf = cbuf; + s->cbuf_end = cbuf + (clen << cbuf_type); + s->cbuf_type = cbuf_type; + if (s->cbuf_type == 1 && s->is_utf16) + s->cbuf_type = 2; + s->opaque = opaque; + + s->state_size = sizeof(REExecState) + + s->capture_count * sizeof(capture[0]) * 2 + + s->stack_size_max * sizeof(stack_buf[0]); + s->state_stack = NULL; + s->state_stack_len = 0; + s->state_stack_size = 0; + + for(i = 0; i < s->capture_count * 2; i++) + capture[i] = NULL; + alloca_size = s->stack_size_max * sizeof(stack_buf[0]); + stack_buf = alloca(alloca_size); + ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN, + cbuf + (cindex << cbuf_type), FALSE); + lre_realloc(s->opaque, s->state_stack, 0); + return ret; +} + +int lre_get_capture_count(const uint8_t *bc_buf) +{ + return bc_buf[RE_HEADER_CAPTURE_COUNT]; +} + +int lre_get_flags(const uint8_t *bc_buf) +{ + return bc_buf[RE_HEADER_FLAGS]; +} + +#ifdef TEST + +BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size) +{ + return FALSE; +} + +void *lre_realloc(void *opaque, void *ptr, size_t size) +{ + return realloc(ptr, size); +} + +int main(int argc, char **argv) +{ + int len, ret, i; + uint8_t *bc; + char error_msg[64]; + uint8_t *capture[CAPTURE_COUNT_MAX * 2]; + const char *input; + int input_len, capture_count; + + if (argc < 3) { + printf("usage: %s regexp input\n", argv[0]); + exit(1); + } + bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], + strlen(argv[1]), 0, NULL); + if (!bc) { + fprintf(stderr, "error: %s\n", error_msg); + exit(1); + } + + input = argv[2]; + input_len = strlen(input); + + ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); + printf("ret=%d\n", ret); + if (ret == 1) { + capture_count = lre_get_capture_count(bc); + for(i = 0; i < 2 * capture_count; i++) { + uint8_t *ptr; + ptr = capture[i]; + printf("%d: ", i); + if (!ptr) + printf("<nil>"); + else + printf("%u", (int)(ptr - (uint8_t *)input)); + printf("\n"); + } + } + return 0; +} +#endif |