diff options
Diffstat (limited to 'libregexp.c')
-rw-r--r-- | libregexp.c | 828 |
1 files changed, 721 insertions, 107 deletions
diff --git a/libregexp.c b/libregexp.c index 8c47389..cca2197 100644 --- a/libregexp.c +++ b/libregexp.c @@ -71,6 +71,7 @@ typedef struct { const uint8_t *buf_start; int re_flags; BOOL is_unicode; + BOOL unicode_sets; /* if set, is_unicode is also set */ BOOL ignore_case; BOOL dotall; int capture_count; @@ -102,11 +103,11 @@ static const REOpCode reopcode_info[REOP_COUNT] = { }; #define RE_HEADER_FLAGS 0 -#define RE_HEADER_CAPTURE_COUNT 1 -#define RE_HEADER_STACK_SIZE 2 -#define RE_HEADER_BYTECODE_LEN 3 +#define RE_HEADER_CAPTURE_COUNT 2 +#define RE_HEADER_STACK_SIZE 3 +#define RE_HEADER_BYTECODE_LEN 4 -#define RE_HEADER_LEN 7 +#define RE_HEADER_LEN 8 static inline int is_digit(int c) { return c >= '0' && c <= '9'; @@ -122,6 +123,264 @@ static int dbuf_insert(DynBuf *s, int pos, int len) return 0; } +typedef struct REString { + struct REString *next; + uint32_t hash; + uint32_t len; + uint32_t buf[]; +} REString; + +typedef struct { + /* the string list is the union of 'char_range' and of the strings + in hash_table[]. The strings in hash_table[] have a length != + 1. */ + CharRange cr; + uint32_t n_strings; + uint32_t hash_size; + int hash_bits; + REString **hash_table; +} REStringList; + +static uint32_t re_string_hash(int len, const uint32_t *buf) +{ + int i; + uint32_t h; + h = 1; + for(i = 0; i < len; i++) + h = h * 263 + buf[i]; + return h * 0x61C88647; +} + +static void re_string_list_init(REParseState *s1, REStringList *s) +{ + cr_init(&s->cr, s1->opaque, lre_realloc); + s->n_strings = 0; + s->hash_size = 0; + s->hash_bits = 0; + s->hash_table = NULL; +} + +static void re_string_list_free(REStringList *s) +{ + REString *p, *p_next; + int i; + for(i = 0; i < s->hash_size; i++) { + for(p = s->hash_table[i]; p != NULL; p = p_next) { + p_next = p->next; + lre_realloc(s->cr.mem_opaque, p, 0); + } + } + lre_realloc(s->cr.mem_opaque, s->hash_table, 0); + + cr_free(&s->cr); +} + +static void lre_print_char(int c, BOOL is_range) +{ + if (c == '\'' || c == '\\' || + (is_range && (c == '-' || c == ']'))) { + printf("\\%c", c); + } else if (c >= ' ' && c <= 126) { + printf("%c", c); + } else { + printf("\\u{%04x}", c); + } +} + +static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s) +{ + REString *p; + const CharRange *cr; + int i, j, k; + + printf("%s:\n", str); + printf(" ranges: ["); + cr = &s->cr; + for(i = 0; i < cr->len; i += 2) { + lre_print_char(cr->points[i], TRUE); + if (cr->points[i] != cr->points[i + 1] - 1) { + printf("-"); + lre_print_char(cr->points[i + 1] - 1, TRUE); + } + } + printf("]\n"); + + j = 0; + for(i = 0; i < s->hash_size; i++) { + for(p = s->hash_table[i]; p != NULL; p = p->next) { + printf(" %d/%d: '", j, s->n_strings); + for(k = 0; k < p->len; k++) { + lre_print_char(p->buf[k], FALSE); + } + printf("'\n"); + j++; + } + } +} + +static int re_string_find2(REStringList *s, int len, const uint32_t *buf, + uint32_t h0, BOOL add_flag) +{ + uint32_t h = 0; /* avoid warning */ + REString *p; + if (s->n_strings != 0) { + h = h0 >> (32 - s->hash_bits); + for(p = s->hash_table[h]; p != NULL; p = p->next) { + if (p->hash == h0 && p->len == len && + !memcmp(p->buf, buf, len * sizeof(buf[0]))) { + return 1; + } + } + } + /* not found */ + if (!add_flag) + return 0; + /* increase the size of the hash table if needed */ + if (unlikely((s->n_strings + 1) > s->hash_size)) { + REString **new_hash_table, *p_next; + int new_hash_bits, i; + uint32_t new_hash_size; + new_hash_bits = max_int(s->hash_bits + 1, 4); + new_hash_size = 1 << new_hash_bits; + new_hash_table = lre_realloc(s->cr.mem_opaque, NULL, + sizeof(new_hash_table[0]) * new_hash_size); + if (!new_hash_table) + return -1; + memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size); + for(i = 0; i < s->hash_size; i++) { + for(p = s->hash_table[i]; p != NULL; p = p_next) { + p_next = p->next; + h = p->hash >> (32 - new_hash_bits); + p->next = new_hash_table[h]; + new_hash_table[h] = p; + } + } + lre_realloc(s->cr.mem_opaque, s->hash_table, 0); + s->hash_bits = new_hash_bits; + s->hash_size = new_hash_size; + s->hash_table = new_hash_table; + h = h0 >> (32 - s->hash_bits); + } + + p = lre_realloc(s->cr.mem_opaque, NULL, sizeof(REString) + len * sizeof(buf[0])); + if (!p) + return -1; + p->next = s->hash_table[h]; + s->hash_table[h] = p; + s->n_strings++; + p->hash = h0; + p->len = len; + memcpy(p->buf, buf, sizeof(buf[0]) * len); + return 1; +} + +static int re_string_find(REStringList *s, int len, const uint32_t *buf, + BOOL add_flag) +{ + uint32_t h0; + h0 = re_string_hash(len, buf); + return re_string_find2(s, len, buf, h0, add_flag); +} + +/* return -1 if memory error, 0 if OK */ +static int re_string_add(REStringList *s, int len, const uint32_t *buf) +{ + if (len == 1) { + return cr_union_interval(&s->cr, buf[0], buf[0]); + } + if (re_string_find(s, len, buf, TRUE) < 0) + return -1; + return 0; +} + +/* a = a op b */ +static int re_string_list_op(REStringList *a, REStringList *b, int op) +{ + int i, ret; + REString *p, **pp; + + if (cr_op1(&a->cr, b->cr.points, b->cr.len, op)) + return -1; + + switch(op) { + case CR_OP_UNION: + if (b->n_strings != 0) { + for(i = 0; i < b->hash_size; i++) { + for(p = b->hash_table[i]; p != NULL; p = p->next) { + if (re_string_find2(a, p->len, p->buf, p->hash, TRUE) < 0) + return -1; + } + } + } + break; + case CR_OP_INTER: + case CR_OP_SUB: + for(i = 0; i < a->hash_size; i++) { + pp = &a->hash_table[i]; + for(;;) { + p = *pp; + if (p == NULL) + break; + ret = re_string_find2(b, p->len, p->buf, p->hash, FALSE); + if (op == CR_OP_SUB) + ret = !ret; + if (!ret) { + /* remove it */ + *pp = p->next; + a->n_strings--; + lre_realloc(a->cr.mem_opaque, p, 0); + } else { + /* keep it */ + pp = &p->next; + } + } + } + break; + default: + abort(); + } + return 0; +} + +static int re_string_list_canonicalize(REParseState *s1, + REStringList *s, BOOL is_unicode) +{ + if (cr_regexp_canonicalize(&s->cr, is_unicode)) + return -1; + if (s->n_strings != 0) { + REStringList a_s, *a = &a_s; + int i, j; + REString *p; + + /* XXX: simplify */ + re_string_list_init(s1, a); + + a->n_strings = s->n_strings; + a->hash_size = s->hash_size; + a->hash_bits = s->hash_bits; + a->hash_table = s->hash_table; + + s->n_strings = 0; + s->hash_size = 0; + s->hash_bits = 0; + s->hash_table = NULL; + + for(i = 0; i < a->hash_size; i++) { + for(p = a->hash_table[i]; p != NULL; p = p->next) { + for(j = 0; j < p->len; j++) { + p->buf[j] = lre_canonicalize(p->buf[j], is_unicode); + } + if (re_string_add(s, p->len, p->buf)) { + re_string_list_free(a); + return -1; + } + } + } + re_string_list_free(a); + } + return 0; +} + static const uint16_t char_range_d[] = { 1, 0x0030, 0x0039 + 1, @@ -170,7 +429,7 @@ static const uint16_t * const char_range_table[] = { char_range_w, }; -static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) +static int cr_init_char_range(REParseState *s, REStringList *cr, uint32_t c) { BOOL invert; const uint16_t *c_pt; @@ -179,18 +438,18 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) invert = c & 1; c_pt = char_range_table[c >> 1]; len = *c_pt++; - cr_init(cr, s->opaque, lre_realloc); + re_string_list_init(s, cr); for(i = 0; i < len * 2; i++) { - if (cr_add_point(cr, c_pt[i])) + if (cr_add_point(&cr->cr, c_pt[i])) goto fail; } if (invert) { - if (cr_invert(cr)) + if (cr_invert(&cr->cr)) goto fail; } return 0; fail: - cr_free(cr); + re_string_list_free(cr); return -1; } @@ -533,8 +792,16 @@ static BOOL is_unicode_char(int c) (c == '_')); } -static int parse_unicode_property(REParseState *s, CharRange *cr, - const uint8_t **pp, BOOL is_inv) +/* XXX: memory error test */ +static void seq_prop_cb(void *opaque, const uint32_t *seq, int seq_len) +{ + REStringList *sl = opaque; + re_string_add(sl, seq_len, seq); +} + +static int parse_unicode_property(REParseState *s, REStringList *cr, + const uint8_t **pp, BOOL is_inv, + BOOL allow_sequence_prop) { const uint8_t *p; char name[64], value[64]; @@ -574,51 +841,76 @@ static int parse_unicode_property(REParseState *s, CharRange *cr, } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) { script_ext = TRUE; do_script: - cr_init(cr, s->opaque, lre_realloc); - ret = unicode_script(cr, value, script_ext); + re_string_list_init(s, cr); + ret = unicode_script(&cr->cr, value, script_ext); if (ret) { - cr_free(cr); + re_string_list_free(cr); if (ret == -2) return re_parse_error(s, "unknown unicode script"); else goto out_of_memory; } } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) { - cr_init(cr, s->opaque, lre_realloc); - ret = unicode_general_category(cr, value); + re_string_list_init(s, cr); + ret = unicode_general_category(&cr->cr, value); if (ret) { - cr_free(cr); + re_string_list_free(cr); if (ret == -2) return re_parse_error(s, "unknown unicode general category"); else goto out_of_memory; } } else if (value[0] == '\0') { - cr_init(cr, s->opaque, lre_realloc); - ret = unicode_general_category(cr, name); + re_string_list_init(s, cr); + ret = unicode_general_category(&cr->cr, name); if (ret == -1) { - cr_free(cr); + re_string_list_free(cr); goto out_of_memory; } if (ret < 0) { - ret = unicode_prop(cr, name); - if (ret) { - cr_free(cr); - if (ret == -2) - goto unknown_property_name; - else - goto out_of_memory; + ret = unicode_prop(&cr->cr, name); + if (ret == -1) { + re_string_list_free(cr); + goto out_of_memory; } } + if (ret < 0 && !is_inv && allow_sequence_prop) { + CharRange cr_tmp; + cr_init(&cr_tmp, s->opaque, lre_realloc); + ret = unicode_sequence_prop(name, seq_prop_cb, cr, &cr_tmp); + cr_free(&cr_tmp); + if (ret == -1) { + re_string_list_free(cr); + goto out_of_memory; + } + } + if (ret < 0) + goto unknown_property_name; } else { unknown_property_name: return re_parse_error(s, "unknown unicode property name"); } + /* the ordering of case folding and inversion differs with + unicode_sets. 'unicode_sets' ordering is more consistent */ + /* XXX: the spec seems incorrect, we do it as the other engines + seem to do it. */ + if (s->ignore_case && s->unicode_sets) { + if (re_string_list_canonicalize(s, cr, s->is_unicode)) { + re_string_list_free(cr); + goto out_of_memory; + } + } if (is_inv) { - if (cr_invert(cr)) { - cr_free(cr); - return -1; + if (cr_invert(&cr->cr)) { + re_string_list_free(cr); + goto out_of_memory; + } + } + if (s->ignore_case && !s->unicode_sets) { + if (re_string_list_canonicalize(s, cr, s->is_unicode)) { + re_string_list_free(cr); + goto out_of_memory; } } *pp = p; @@ -628,10 +920,61 @@ static int parse_unicode_property(REParseState *s, CharRange *cr, } #endif /* CONFIG_ALL_UNICODE */ +static int get_class_atom(REParseState *s, REStringList *cr, + const uint8_t **pp, BOOL inclass); + +static int parse_class_string_disjunction(REParseState *s, REStringList *cr, + const uint8_t **pp) +{ + const uint8_t *p; + DynBuf str; + int c; + + p = *pp; + if (*p != '{') + return re_parse_error(s, "expecting '{' after \\q"); + + dbuf_init2(&str, s->opaque, lre_realloc); + re_string_list_init(s, cr); + + p++; + for(;;) { + str.size = 0; + while (*p != '}' && *p != '|') { + c = get_class_atom(s, NULL, &p, FALSE); + if (c < 0) + goto fail; + if (dbuf_put_u32(&str, c)) { + re_parse_out_of_memory(s); + goto fail; + } + } + if (re_string_add(cr, str.size / 4, (uint32_t *)str.buf)) { + re_parse_out_of_memory(s); + goto fail; + } + if (*p == '}') + break; + p++; + } + if (s->ignore_case) { + if (re_string_list_canonicalize(s, cr, TRUE)) + goto fail; + } + p++; /* skip the '}' */ + dbuf_free(&str); + *pp = p; + return 0; + fail: + dbuf_free(&str); + re_string_list_free(cr); + return -1; +} + /* return -1 if error otherwise the character or a class range - (CLASS_RANGE_BASE). In case of class range, 'cr' is + (CLASS_RANGE_BASE) if cr != NULL. In case of class range, 'cr' is initialized. Otherwise, it is ignored. */ -static int get_class_atom(REParseState *s, CharRange *cr, +static int get_class_atom(REParseState *s, REStringList *cr, const uint8_t **pp, BOOL inclass) { const uint8_t *p; @@ -666,6 +1009,8 @@ static int get_class_atom(REParseState *s, CharRange *cr, case 'W': c = CHAR_RANGE_W; class_range: + if (!cr) + goto default_escape; if (cr_init_char_range(s, cr, c)) return -1; c = CLASS_RANGE_BASE; @@ -690,27 +1035,50 @@ static int get_class_atom(REParseState *s, CharRange *cr, if (!inclass && s->is_unicode) goto invalid_escape; break; + case '^': + case '$': + case '\\': + case '.': + case '*': + case '+': + case '?': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '|': + case '/': + /* always valid to escape these characters */ + break; #ifdef CONFIG_ALL_UNICODE case 'p': case 'P': - if (s->is_unicode) { - if (parse_unicode_property(s, cr, &p, (c == 'P'))) + if (s->is_unicode && cr) { + if (parse_unicode_property(s, cr, &p, (c == 'P'), s->unicode_sets)) return -1; c = CLASS_RANGE_BASE; break; } - /* fall thru */ + goto default_escape; #endif + case 'q': + if (s->unicode_sets && cr && inclass) { + if (parse_class_string_disjunction(s, cr, &p)) + return -1; + c = CLASS_RANGE_BASE; + break; + } + goto default_escape; default: + default_escape: p--; ret = lre_parse_escape(&p, s->is_unicode * 2); if (ret >= 0) { c = ret; } else { - if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) { - /* always valid to escape these characters */ - goto normal_char; - } else if (s->is_unicode) { + if (s->is_unicode) { invalid_escape: return re_parse_error(s, "invalid escape sequence in regular expression"); } else { @@ -727,6 +1095,48 @@ static int get_class_atom(REParseState *s, CharRange *cr, return re_parse_error(s, "unexpected end"); } /* fall thru */ + goto normal_char; + + case '&': + case '!': + case '#': + case '$': + case '%': + case '*': + case '+': + case ',': + case '.': + case ':': + case ';': + case '<': + case '=': + case '>': + case '?': + case '@': + case '^': + case '`': + case '~': + if (s->unicode_sets && p[1] == c) { + /* forbidden double characters */ + return re_parse_error(s, "invalid class set operation in regular expression"); + } + goto normal_char; + + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '/': + case '-': + case '|': + if (s->unicode_sets) { + /* invalid characters in unicode sets */ + return re_parse_error(s, "invalid character in class in regular expression"); + } + goto normal_char; + default: normal_char: /* normal char */ @@ -754,8 +1164,6 @@ static int re_emit_range(REParseState *s, const CharRange *cr) if (len >= 65535) return re_parse_error(s, "too many ranges"); if (len == 0) { - /* not sure it can really happen. Emit a match that is always - false */ re_emit_op_u32(s, REOP_char32, -1); } else { high = cr->points[cr->len - 1]; @@ -783,15 +1191,135 @@ static int re_emit_range(REParseState *s, const CharRange *cr) return 0; } -static int re_parse_char_class(REParseState *s, const uint8_t **pp) +static int re_string_cmp_len(const void *a, const void *b, void *arg) +{ + REString *p1 = *(REString **)a; + REString *p2 = *(REString **)b; + return (p1->len < p2->len) - (p1->len > p2->len); +} + +static int re_emit_string_list(REParseState *s, const REStringList *sl) +{ + REString **tab, *p; + int i, j, c, split_pos, last_match_pos, n; + BOOL has_empty_string, is_last; + + // re_string_list_dump("sl", sl); + if (sl->n_strings == 0) { + /* simple case: only characters */ + if (re_emit_range(s, &sl->cr)) + return -1; + } else { + /* at least one string list is present : match the longest ones first */ + /* XXX: add a new op_switch opcode to compile as a trie */ + tab = lre_realloc(s->opaque, NULL, sizeof(tab[0]) * sl->n_strings); + if (!tab) { + re_parse_out_of_memory(s); + return -1; + } + has_empty_string = FALSE; + n = 0; + for(i = 0; i < sl->hash_size; i++) { + for(p = sl->hash_table[i]; p != NULL; p = p->next) { + if (p->len == 0) { + has_empty_string = TRUE; + } else { + tab[n++] = p; + } + } + } + assert(n <= sl->n_strings); + + rqsort(tab, n, sizeof(tab[0]), re_string_cmp_len, NULL); + + last_match_pos = -1; + for(i = 0; i < n; i++) { + p = tab[i]; + is_last = !has_empty_string && sl->cr.len == 0 && i == (n - 1); + if (!is_last) + split_pos = re_emit_op_u32(s, REOP_split_next_first, 0); + else + split_pos = 0; + for(j = 0; j < p->len; j++) { + c = p->buf[j]; + if (c <= 0xffff) + re_emit_op_u16(s, REOP_char, c); + else + re_emit_op_u32(s, REOP_char32, c); + } + if (!is_last) { + last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos); + put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4)); + } + } + + if (sl->cr.len != 0) { + /* char range */ + is_last = !has_empty_string; + if (!is_last) + split_pos = re_emit_op_u32(s, REOP_split_next_first, 0); + else + split_pos = 0; /* not used */ + if (re_emit_range(s, &sl->cr)) { + lre_realloc(s->opaque, tab, 0); + return -1; + } + if (!is_last) + put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4)); + } + + /* patch the 'goto match' */ + while (last_match_pos != -1) { + int next_pos = get_u32(s->byte_code.buf + last_match_pos); + put_u32(s->byte_code.buf + last_match_pos, s->byte_code.size - (last_match_pos + 4)); + last_match_pos = next_pos; + } + + lre_realloc(s->opaque, tab, 0); + } + return 0; +} + +static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp); + +static int re_parse_class_set_operand(REParseState *s, REStringList *cr, const uint8_t **pp) +{ + int c1; + const uint8_t *p = *pp; + + if (*p == '[') { + if (re_parse_nested_class(s, cr, pp)) + return -1; + } else { + c1 = get_class_atom(s, cr, pp, TRUE); + if (c1 < 0) + return -1; + if (c1 < CLASS_RANGE_BASE) { + /* create a range with a single character */ + re_string_list_init(s, cr); + if (s->ignore_case) + c1 = lre_canonicalize(c1, s->is_unicode); + if (cr_union_interval(&cr->cr, c1, c1)) { + re_string_list_free(cr); + return -1; + } + } + } + return 0; +} + +static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp) { const uint8_t *p; uint32_t c1, c2; - CharRange cr_s, *cr = &cr_s; - CharRange cr1_s, *cr1 = &cr1_s; - BOOL invert; + int ret; + REStringList cr1_s, *cr1 = &cr1_s; + BOOL invert, is_first; + + if (lre_check_stack_overflow(s->opaque, 0)) + return re_parse_error(s, "stack overflow"); - cr_init(cr, s->opaque, lre_realloc); + re_string_list_init(s, cr); p = *pp; p++; /* skip '[' */ @@ -800,74 +1328,155 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp) p++; invert = TRUE; } - + + /* handle unions */ + is_first = TRUE; for(;;) { if (*p == ']') break; - c1 = get_class_atom(s, cr1, &p, TRUE); - if ((int)c1 < 0) - goto fail; - if (*p == '-' && p[1] != ']') { - const uint8_t *p0 = p + 1; - if (c1 >= CLASS_RANGE_BASE) { - if (s->is_unicode) { - cr_free(cr1); - goto invalid_class_range; - } - /* Annex B: match '-' character */ - goto class_atom; - } - c2 = get_class_atom(s, cr1, &p0, TRUE); - if ((int)c2 < 0) - goto fail; - if (c2 >= CLASS_RANGE_BASE) { - cr_free(cr1); - if (s->is_unicode) { - goto invalid_class_range; - } - /* Annex B: match '-' character */ - goto class_atom; - } - p = p0; - if (c2 < c1) { - invalid_class_range: - re_parse_error(s, "invalid class range"); + if (*p == '[' && s->unicode_sets) { + if (re_parse_nested_class(s, cr1, &p)) goto fail; - } - if (cr_union_interval(cr, c1, c2)) - goto memory_error; + goto class_union; } else { - class_atom: - if (c1 >= CLASS_RANGE_BASE) { - int ret; - ret = cr_union1(cr, cr1->points, cr1->len); - cr_free(cr1); - if (ret) - goto memory_error; + c1 = get_class_atom(s, cr1, &p, TRUE); + if ((int)c1 < 0) + goto fail; + if (*p == '-' && p[1] != ']') { + const uint8_t *p0 = p + 1; + if (p[1] == '-' && s->unicode_sets && is_first) + goto class_atom; /* first character class followed by '--' */ + if (c1 >= CLASS_RANGE_BASE) { + if (s->is_unicode) { + re_string_list_free(cr1); + goto invalid_class_range; + } + /* Annex B: match '-' character */ + goto class_atom; + } + c2 = get_class_atom(s, cr1, &p0, TRUE); + if ((int)c2 < 0) + goto fail; + if (c2 >= CLASS_RANGE_BASE) { + re_string_list_free(cr1); + if (s->is_unicode) { + goto invalid_class_range; + } + /* Annex B: match '-' character */ + goto class_atom; + } + p = p0; + if (c2 < c1) { + invalid_class_range: + re_parse_error(s, "invalid class range"); + goto fail; + } + if (s->ignore_case) { + CharRange cr2_s, *cr2 = &cr2_s; + cr_init(cr2, s->opaque, lre_realloc); + if (cr_add_interval(cr2, c1, c2 + 1) || + cr_regexp_canonicalize(cr2, s->is_unicode) || + cr_op1(&cr->cr, cr2->points, cr2->len, CR_OP_UNION)) { + cr_free(cr2); + goto memory_error; + } + cr_free(cr2); + } else { + if (cr_union_interval(&cr->cr, c1, c2)) + goto memory_error; + } + is_first = FALSE; /* union operation */ } else { - if (cr_union_interval(cr, c1, c1)) - goto memory_error; + class_atom: + if (c1 >= CLASS_RANGE_BASE) { + class_union: + ret = re_string_list_op(cr, cr1, CR_OP_UNION); + re_string_list_free(cr1); + if (ret) + goto memory_error; + } else { + if (s->ignore_case) + c1 = lre_canonicalize(c1, s->is_unicode); + if (cr_union_interval(&cr->cr, c1, c1)) + goto memory_error; + } } } + if (s->unicode_sets && is_first) { + if (*p == '&' && p[1] == '&' && p[2] != '&') { + /* handle '&&' */ + for(;;) { + if (*p == ']') { + break; + } else if (*p == '&' && p[1] == '&' && p[2] != '&') { + p += 2; + } else { + goto invalid_operation; + } + if (re_parse_class_set_operand(s, cr1, &p)) + goto fail; + ret = re_string_list_op(cr, cr1, CR_OP_INTER); + re_string_list_free(cr1); + if (ret) + goto memory_error; + } + } else if (*p == '-' && p[1] == '-') { + /* handle '--' */ + for(;;) { + if (*p == ']') { + break; + } else if (*p == '-' && p[1] == '-') { + p += 2; + } else { + invalid_operation: + re_parse_error(s, "invalid operation in regular expression"); + goto fail; + } + if (re_parse_class_set_operand(s, cr1, &p)) + goto fail; + ret = re_string_list_op(cr, cr1, CR_OP_SUB); + re_string_list_free(cr1); + if (ret) + goto memory_error; + } + } + } + is_first = FALSE; } - if (s->ignore_case) { - if (cr_regexp_canonicalize(cr, s->is_unicode)) - goto memory_error; - } + + p++; /* skip ']' */ + *pp = p; if (invert) { - if (cr_invert(cr)) + /* XXX: add may_contain_string syntax check to be fully + compliant. The test here accepts more input than the + spec. */ + if (cr->n_strings != 0) { + re_parse_error(s, "negated character class with strings in regular expression debugger eval code"); + goto fail; + } + if (cr_invert(&cr->cr)) goto memory_error; } - if (re_emit_range(s, cr)) - goto fail; - cr_free(cr); - p++; /* skip ']' */ - *pp = p; return 0; memory_error: re_parse_out_of_memory(s); fail: - cr_free(cr); + re_string_list_free(cr); + return -1; +} + +static int re_parse_char_class(REParseState *s, const uint8_t **pp) +{ + REStringList cr_s, *cr = &cr_s; + + if (re_parse_nested_class(s, cr, pp)) + return -1; + if (re_emit_string_list(s, cr)) + goto fail; + re_string_list_free(cr); + return 0; + fail: + re_string_list_free(cr); return -1; } @@ -1121,7 +1730,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) const uint8_t *p; int c, last_atom_start, quant_min, quant_max, last_capture_count; BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead; - CharRange cr_s, *cr = &cr_s; + REStringList cr_s, *cr = &cr_s; last_atom_start = -1; last_capture_count = 0; @@ -1385,9 +1994,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) re_emit_op(s, REOP_prev); if (c >= CLASS_RANGE_BASE) { int ret; - /* Note: canonicalization is not needed */ - ret = re_emit_range(s, cr); - cr_free(cr); + ret = re_emit_string_list(s, cr); + re_string_list_free(cr); if (ret) return -1; } else { @@ -1737,10 +2345,11 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, s->buf_end = s->buf_ptr + buf_len; s->buf_start = s->buf_ptr; s->re_flags = re_flags; - s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0); + s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0); is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); + s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0); s->capture_count = 1; s->total_capture_count = -1; s->has_named_captures = -1; @@ -1748,7 +2357,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, dbuf_init2(&s->byte_code, opaque, lre_realloc); dbuf_init2(&s->group_names, opaque, lre_realloc); - dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */ + dbuf_put_u16(&s->byte_code, re_flags); /* first element is the flags */ dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */ dbuf_putc(&s->byte_code, 0); /* stack size */ dbuf_put_u32(&s->byte_code, 0); /* bytecode length */ @@ -1801,7 +2410,8 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, /* add the named groups if needed */ if (s->group_names.size > (s->capture_count - 1)) { dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size); - s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS; + put_u16(s->byte_code.buf + RE_HEADER_FLAGS, + lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS); } dbuf_free(&s->group_names); @@ -2221,6 +2831,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, v1 = FALSE; } else { PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type); + if (s->ignore_case) + c = lre_canonicalize(c, s->is_unicode); v1 = is_word_char(c); } /* current char */ @@ -2228,6 +2840,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, v2 = FALSE; } else { PEEK_CHAR(c, cptr, cbuf_end, cbuf_type); + if (s->ignore_case) + c = lre_canonicalize(c, s->is_unicode); v2 = is_word_char(c); } if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) @@ -2424,7 +3038,7 @@ int lre_exec(uint8_t **capture, re_flags = lre_get_flags(bc_buf); s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; - s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0; + s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0; s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; s->cbuf = cbuf; @@ -2459,7 +3073,7 @@ int lre_get_capture_count(const uint8_t *bc_buf) int lre_get_flags(const uint8_t *bc_buf) { - return bc_buf[RE_HEADER_FLAGS]; + return get_u16(bc_buf + RE_HEADER_FLAGS); } /* Return NULL if no group names. Otherwise, return a pointer to |