summaryrefslogtreecommitdiff
path: root/libregexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'libregexp.c')
-rw-r--r--libregexp.c828
1 files changed, 721 insertions, 107 deletions
diff --git a/libregexp.c b/libregexp.c
index 8c47389..cca2197 100644
--- a/libregexp.c
+++ b/libregexp.c
@@ -71,6 +71,7 @@ typedef struct {
const uint8_t *buf_start;
int re_flags;
BOOL is_unicode;
+ BOOL unicode_sets; /* if set, is_unicode is also set */
BOOL ignore_case;
BOOL dotall;
int capture_count;
@@ -102,11 +103,11 @@ static const REOpCode reopcode_info[REOP_COUNT] = {
};
#define RE_HEADER_FLAGS 0
-#define RE_HEADER_CAPTURE_COUNT 1
-#define RE_HEADER_STACK_SIZE 2
-#define RE_HEADER_BYTECODE_LEN 3
+#define RE_HEADER_CAPTURE_COUNT 2
+#define RE_HEADER_STACK_SIZE 3
+#define RE_HEADER_BYTECODE_LEN 4
-#define RE_HEADER_LEN 7
+#define RE_HEADER_LEN 8
static inline int is_digit(int c) {
return c >= '0' && c <= '9';
@@ -122,6 +123,264 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
return 0;
}
+typedef struct REString {
+ struct REString *next;
+ uint32_t hash;
+ uint32_t len;
+ uint32_t buf[];
+} REString;
+
+typedef struct {
+ /* the string list is the union of 'char_range' and of the strings
+ in hash_table[]. The strings in hash_table[] have a length !=
+ 1. */
+ CharRange cr;
+ uint32_t n_strings;
+ uint32_t hash_size;
+ int hash_bits;
+ REString **hash_table;
+} REStringList;
+
+static uint32_t re_string_hash(int len, const uint32_t *buf)
+{
+ int i;
+ uint32_t h;
+ h = 1;
+ for(i = 0; i < len; i++)
+ h = h * 263 + buf[i];
+ return h * 0x61C88647;
+}
+
+static void re_string_list_init(REParseState *s1, REStringList *s)
+{
+ cr_init(&s->cr, s1->opaque, lre_realloc);
+ s->n_strings = 0;
+ s->hash_size = 0;
+ s->hash_bits = 0;
+ s->hash_table = NULL;
+}
+
+static void re_string_list_free(REStringList *s)
+{
+ REString *p, *p_next;
+ int i;
+ for(i = 0; i < s->hash_size; i++) {
+ for(p = s->hash_table[i]; p != NULL; p = p_next) {
+ p_next = p->next;
+ lre_realloc(s->cr.mem_opaque, p, 0);
+ }
+ }
+ lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
+
+ cr_free(&s->cr);
+}
+
+static void lre_print_char(int c, BOOL is_range)
+{
+ if (c == '\'' || c == '\\' ||
+ (is_range && (c == '-' || c == ']'))) {
+ printf("\\%c", c);
+ } else if (c >= ' ' && c <= 126) {
+ printf("%c", c);
+ } else {
+ printf("\\u{%04x}", c);
+ }
+}
+
+static __maybe_unused void re_string_list_dump(const char *str, const REStringList *s)
+{
+ REString *p;
+ const CharRange *cr;
+ int i, j, k;
+
+ printf("%s:\n", str);
+ printf(" ranges: [");
+ cr = &s->cr;
+ for(i = 0; i < cr->len; i += 2) {
+ lre_print_char(cr->points[i], TRUE);
+ if (cr->points[i] != cr->points[i + 1] - 1) {
+ printf("-");
+ lre_print_char(cr->points[i + 1] - 1, TRUE);
+ }
+ }
+ printf("]\n");
+
+ j = 0;
+ for(i = 0; i < s->hash_size; i++) {
+ for(p = s->hash_table[i]; p != NULL; p = p->next) {
+ printf(" %d/%d: '", j, s->n_strings);
+ for(k = 0; k < p->len; k++) {
+ lre_print_char(p->buf[k], FALSE);
+ }
+ printf("'\n");
+ j++;
+ }
+ }
+}
+
+static int re_string_find2(REStringList *s, int len, const uint32_t *buf,
+ uint32_t h0, BOOL add_flag)
+{
+ uint32_t h = 0; /* avoid warning */
+ REString *p;
+ if (s->n_strings != 0) {
+ h = h0 >> (32 - s->hash_bits);
+ for(p = s->hash_table[h]; p != NULL; p = p->next) {
+ if (p->hash == h0 && p->len == len &&
+ !memcmp(p->buf, buf, len * sizeof(buf[0]))) {
+ return 1;
+ }
+ }
+ }
+ /* not found */
+ if (!add_flag)
+ return 0;
+ /* increase the size of the hash table if needed */
+ if (unlikely((s->n_strings + 1) > s->hash_size)) {
+ REString **new_hash_table, *p_next;
+ int new_hash_bits, i;
+ uint32_t new_hash_size;
+ new_hash_bits = max_int(s->hash_bits + 1, 4);
+ new_hash_size = 1 << new_hash_bits;
+ new_hash_table = lre_realloc(s->cr.mem_opaque, NULL,
+ sizeof(new_hash_table[0]) * new_hash_size);
+ if (!new_hash_table)
+ return -1;
+ memset(new_hash_table, 0, sizeof(new_hash_table[0]) * new_hash_size);
+ for(i = 0; i < s->hash_size; i++) {
+ for(p = s->hash_table[i]; p != NULL; p = p_next) {
+ p_next = p->next;
+ h = p->hash >> (32 - new_hash_bits);
+ p->next = new_hash_table[h];
+ new_hash_table[h] = p;
+ }
+ }
+ lre_realloc(s->cr.mem_opaque, s->hash_table, 0);
+ s->hash_bits = new_hash_bits;
+ s->hash_size = new_hash_size;
+ s->hash_table = new_hash_table;
+ h = h0 >> (32 - s->hash_bits);
+ }
+
+ p = lre_realloc(s->cr.mem_opaque, NULL, sizeof(REString) + len * sizeof(buf[0]));
+ if (!p)
+ return -1;
+ p->next = s->hash_table[h];
+ s->hash_table[h] = p;
+ s->n_strings++;
+ p->hash = h0;
+ p->len = len;
+ memcpy(p->buf, buf, sizeof(buf[0]) * len);
+ return 1;
+}
+
+static int re_string_find(REStringList *s, int len, const uint32_t *buf,
+ BOOL add_flag)
+{
+ uint32_t h0;
+ h0 = re_string_hash(len, buf);
+ return re_string_find2(s, len, buf, h0, add_flag);
+}
+
+/* return -1 if memory error, 0 if OK */
+static int re_string_add(REStringList *s, int len, const uint32_t *buf)
+{
+ if (len == 1) {
+ return cr_union_interval(&s->cr, buf[0], buf[0]);
+ }
+ if (re_string_find(s, len, buf, TRUE) < 0)
+ return -1;
+ return 0;
+}
+
+/* a = a op b */
+static int re_string_list_op(REStringList *a, REStringList *b, int op)
+{
+ int i, ret;
+ REString *p, **pp;
+
+ if (cr_op1(&a->cr, b->cr.points, b->cr.len, op))
+ return -1;
+
+ switch(op) {
+ case CR_OP_UNION:
+ if (b->n_strings != 0) {
+ for(i = 0; i < b->hash_size; i++) {
+ for(p = b->hash_table[i]; p != NULL; p = p->next) {
+ if (re_string_find2(a, p->len, p->buf, p->hash, TRUE) < 0)
+ return -1;
+ }
+ }
+ }
+ break;
+ case CR_OP_INTER:
+ case CR_OP_SUB:
+ for(i = 0; i < a->hash_size; i++) {
+ pp = &a->hash_table[i];
+ for(;;) {
+ p = *pp;
+ if (p == NULL)
+ break;
+ ret = re_string_find2(b, p->len, p->buf, p->hash, FALSE);
+ if (op == CR_OP_SUB)
+ ret = !ret;
+ if (!ret) {
+ /* remove it */
+ *pp = p->next;
+ a->n_strings--;
+ lre_realloc(a->cr.mem_opaque, p, 0);
+ } else {
+ /* keep it */
+ pp = &p->next;
+ }
+ }
+ }
+ break;
+ default:
+ abort();
+ }
+ return 0;
+}
+
+static int re_string_list_canonicalize(REParseState *s1,
+ REStringList *s, BOOL is_unicode)
+{
+ if (cr_regexp_canonicalize(&s->cr, is_unicode))
+ return -1;
+ if (s->n_strings != 0) {
+ REStringList a_s, *a = &a_s;
+ int i, j;
+ REString *p;
+
+ /* XXX: simplify */
+ re_string_list_init(s1, a);
+
+ a->n_strings = s->n_strings;
+ a->hash_size = s->hash_size;
+ a->hash_bits = s->hash_bits;
+ a->hash_table = s->hash_table;
+
+ s->n_strings = 0;
+ s->hash_size = 0;
+ s->hash_bits = 0;
+ s->hash_table = NULL;
+
+ for(i = 0; i < a->hash_size; i++) {
+ for(p = a->hash_table[i]; p != NULL; p = p->next) {
+ for(j = 0; j < p->len; j++) {
+ p->buf[j] = lre_canonicalize(p->buf[j], is_unicode);
+ }
+ if (re_string_add(s, p->len, p->buf)) {
+ re_string_list_free(a);
+ return -1;
+ }
+ }
+ }
+ re_string_list_free(a);
+ }
+ return 0;
+}
+
static const uint16_t char_range_d[] = {
1,
0x0030, 0x0039 + 1,
@@ -170,7 +429,7 @@ static const uint16_t * const char_range_table[] = {
char_range_w,
};
-static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
+static int cr_init_char_range(REParseState *s, REStringList *cr, uint32_t c)
{
BOOL invert;
const uint16_t *c_pt;
@@ -179,18 +438,18 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
invert = c & 1;
c_pt = char_range_table[c >> 1];
len = *c_pt++;
- cr_init(cr, s->opaque, lre_realloc);
+ re_string_list_init(s, cr);
for(i = 0; i < len * 2; i++) {
- if (cr_add_point(cr, c_pt[i]))
+ if (cr_add_point(&cr->cr, c_pt[i]))
goto fail;
}
if (invert) {
- if (cr_invert(cr))
+ if (cr_invert(&cr->cr))
goto fail;
}
return 0;
fail:
- cr_free(cr);
+ re_string_list_free(cr);
return -1;
}
@@ -533,8 +792,16 @@ static BOOL is_unicode_char(int c)
(c == '_'));
}
-static int parse_unicode_property(REParseState *s, CharRange *cr,
- const uint8_t **pp, BOOL is_inv)
+/* XXX: memory error test */
+static void seq_prop_cb(void *opaque, const uint32_t *seq, int seq_len)
+{
+ REStringList *sl = opaque;
+ re_string_add(sl, seq_len, seq);
+}
+
+static int parse_unicode_property(REParseState *s, REStringList *cr,
+ const uint8_t **pp, BOOL is_inv,
+ BOOL allow_sequence_prop)
{
const uint8_t *p;
char name[64], value[64];
@@ -574,51 +841,76 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
} else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
script_ext = TRUE;
do_script:
- cr_init(cr, s->opaque, lre_realloc);
- ret = unicode_script(cr, value, script_ext);
+ re_string_list_init(s, cr);
+ ret = unicode_script(&cr->cr, value, script_ext);
if (ret) {
- cr_free(cr);
+ re_string_list_free(cr);
if (ret == -2)
return re_parse_error(s, "unknown unicode script");
else
goto out_of_memory;
}
} else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
- cr_init(cr, s->opaque, lre_realloc);
- ret = unicode_general_category(cr, value);
+ re_string_list_init(s, cr);
+ ret = unicode_general_category(&cr->cr, value);
if (ret) {
- cr_free(cr);
+ re_string_list_free(cr);
if (ret == -2)
return re_parse_error(s, "unknown unicode general category");
else
goto out_of_memory;
}
} else if (value[0] == '\0') {
- cr_init(cr, s->opaque, lre_realloc);
- ret = unicode_general_category(cr, name);
+ re_string_list_init(s, cr);
+ ret = unicode_general_category(&cr->cr, name);
if (ret == -1) {
- cr_free(cr);
+ re_string_list_free(cr);
goto out_of_memory;
}
if (ret < 0) {
- ret = unicode_prop(cr, name);
- if (ret) {
- cr_free(cr);
- if (ret == -2)
- goto unknown_property_name;
- else
- goto out_of_memory;
+ ret = unicode_prop(&cr->cr, name);
+ if (ret == -1) {
+ re_string_list_free(cr);
+ goto out_of_memory;
}
}
+ if (ret < 0 && !is_inv && allow_sequence_prop) {
+ CharRange cr_tmp;
+ cr_init(&cr_tmp, s->opaque, lre_realloc);
+ ret = unicode_sequence_prop(name, seq_prop_cb, cr, &cr_tmp);
+ cr_free(&cr_tmp);
+ if (ret == -1) {
+ re_string_list_free(cr);
+ goto out_of_memory;
+ }
+ }
+ if (ret < 0)
+ goto unknown_property_name;
} else {
unknown_property_name:
return re_parse_error(s, "unknown unicode property name");
}
+ /* the ordering of case folding and inversion differs with
+ unicode_sets. 'unicode_sets' ordering is more consistent */
+ /* XXX: the spec seems incorrect, we do it as the other engines
+ seem to do it. */
+ if (s->ignore_case && s->unicode_sets) {
+ if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
+ re_string_list_free(cr);
+ goto out_of_memory;
+ }
+ }
if (is_inv) {
- if (cr_invert(cr)) {
- cr_free(cr);
- return -1;
+ if (cr_invert(&cr->cr)) {
+ re_string_list_free(cr);
+ goto out_of_memory;
+ }
+ }
+ if (s->ignore_case && !s->unicode_sets) {
+ if (re_string_list_canonicalize(s, cr, s->is_unicode)) {
+ re_string_list_free(cr);
+ goto out_of_memory;
}
}
*pp = p;
@@ -628,10 +920,61 @@ static int parse_unicode_property(REParseState *s, CharRange *cr,
}
#endif /* CONFIG_ALL_UNICODE */
+static int get_class_atom(REParseState *s, REStringList *cr,
+ const uint8_t **pp, BOOL inclass);
+
+static int parse_class_string_disjunction(REParseState *s, REStringList *cr,
+ const uint8_t **pp)
+{
+ const uint8_t *p;
+ DynBuf str;
+ int c;
+
+ p = *pp;
+ if (*p != '{')
+ return re_parse_error(s, "expecting '{' after \\q");
+
+ dbuf_init2(&str, s->opaque, lre_realloc);
+ re_string_list_init(s, cr);
+
+ p++;
+ for(;;) {
+ str.size = 0;
+ while (*p != '}' && *p != '|') {
+ c = get_class_atom(s, NULL, &p, FALSE);
+ if (c < 0)
+ goto fail;
+ if (dbuf_put_u32(&str, c)) {
+ re_parse_out_of_memory(s);
+ goto fail;
+ }
+ }
+ if (re_string_add(cr, str.size / 4, (uint32_t *)str.buf)) {
+ re_parse_out_of_memory(s);
+ goto fail;
+ }
+ if (*p == '}')
+ break;
+ p++;
+ }
+ if (s->ignore_case) {
+ if (re_string_list_canonicalize(s, cr, TRUE))
+ goto fail;
+ }
+ p++; /* skip the '}' */
+ dbuf_free(&str);
+ *pp = p;
+ return 0;
+ fail:
+ dbuf_free(&str);
+ re_string_list_free(cr);
+ return -1;
+}
+
/* return -1 if error otherwise the character or a class range
- (CLASS_RANGE_BASE). In case of class range, 'cr' is
+ (CLASS_RANGE_BASE) if cr != NULL. In case of class range, 'cr' is
initialized. Otherwise, it is ignored. */
-static int get_class_atom(REParseState *s, CharRange *cr,
+static int get_class_atom(REParseState *s, REStringList *cr,
const uint8_t **pp, BOOL inclass)
{
const uint8_t *p;
@@ -666,6 +1009,8 @@ static int get_class_atom(REParseState *s, CharRange *cr,
case 'W':
c = CHAR_RANGE_W;
class_range:
+ if (!cr)
+ goto default_escape;
if (cr_init_char_range(s, cr, c))
return -1;
c = CLASS_RANGE_BASE;
@@ -690,27 +1035,50 @@ static int get_class_atom(REParseState *s, CharRange *cr,
if (!inclass && s->is_unicode)
goto invalid_escape;
break;
+ case '^':
+ case '$':
+ case '\\':
+ case '.':
+ case '*':
+ case '+':
+ case '?':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '|':
+ case '/':
+ /* always valid to escape these characters */
+ break;
#ifdef CONFIG_ALL_UNICODE
case 'p':
case 'P':
- if (s->is_unicode) {
- if (parse_unicode_property(s, cr, &p, (c == 'P')))
+ if (s->is_unicode && cr) {
+ if (parse_unicode_property(s, cr, &p, (c == 'P'), s->unicode_sets))
return -1;
c = CLASS_RANGE_BASE;
break;
}
- /* fall thru */
+ goto default_escape;
#endif
+ case 'q':
+ if (s->unicode_sets && cr && inclass) {
+ if (parse_class_string_disjunction(s, cr, &p))
+ return -1;
+ c = CLASS_RANGE_BASE;
+ break;
+ }
+ goto default_escape;
default:
+ default_escape:
p--;
ret = lre_parse_escape(&p, s->is_unicode * 2);
if (ret >= 0) {
c = ret;
} else {
- if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
- /* always valid to escape these characters */
- goto normal_char;
- } else if (s->is_unicode) {
+ if (s->is_unicode) {
invalid_escape:
return re_parse_error(s, "invalid escape sequence in regular expression");
} else {
@@ -727,6 +1095,48 @@ static int get_class_atom(REParseState *s, CharRange *cr,
return re_parse_error(s, "unexpected end");
}
/* fall thru */
+ goto normal_char;
+
+ case '&':
+ case '!':
+ case '#':
+ case '$':
+ case '%':
+ case '*':
+ case '+':
+ case ',':
+ case '.':
+ case ':':
+ case ';':
+ case '<':
+ case '=':
+ case '>':
+ case '?':
+ case '@':
+ case '^':
+ case '`':
+ case '~':
+ if (s->unicode_sets && p[1] == c) {
+ /* forbidden double characters */
+ return re_parse_error(s, "invalid class set operation in regular expression");
+ }
+ goto normal_char;
+
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '/':
+ case '-':
+ case '|':
+ if (s->unicode_sets) {
+ /* invalid characters in unicode sets */
+ return re_parse_error(s, "invalid character in class in regular expression");
+ }
+ goto normal_char;
+
default:
normal_char:
/* normal char */
@@ -754,8 +1164,6 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
if (len >= 65535)
return re_parse_error(s, "too many ranges");
if (len == 0) {
- /* not sure it can really happen. Emit a match that is always
- false */
re_emit_op_u32(s, REOP_char32, -1);
} else {
high = cr->points[cr->len - 1];
@@ -783,15 +1191,135 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
return 0;
}
-static int re_parse_char_class(REParseState *s, const uint8_t **pp)
+static int re_string_cmp_len(const void *a, const void *b, void *arg)
+{
+ REString *p1 = *(REString **)a;
+ REString *p2 = *(REString **)b;
+ return (p1->len < p2->len) - (p1->len > p2->len);
+}
+
+static int re_emit_string_list(REParseState *s, const REStringList *sl)
+{
+ REString **tab, *p;
+ int i, j, c, split_pos, last_match_pos, n;
+ BOOL has_empty_string, is_last;
+
+ // re_string_list_dump("sl", sl);
+ if (sl->n_strings == 0) {
+ /* simple case: only characters */
+ if (re_emit_range(s, &sl->cr))
+ return -1;
+ } else {
+ /* at least one string list is present : match the longest ones first */
+ /* XXX: add a new op_switch opcode to compile as a trie */
+ tab = lre_realloc(s->opaque, NULL, sizeof(tab[0]) * sl->n_strings);
+ if (!tab) {
+ re_parse_out_of_memory(s);
+ return -1;
+ }
+ has_empty_string = FALSE;
+ n = 0;
+ for(i = 0; i < sl->hash_size; i++) {
+ for(p = sl->hash_table[i]; p != NULL; p = p->next) {
+ if (p->len == 0) {
+ has_empty_string = TRUE;
+ } else {
+ tab[n++] = p;
+ }
+ }
+ }
+ assert(n <= sl->n_strings);
+
+ rqsort(tab, n, sizeof(tab[0]), re_string_cmp_len, NULL);
+
+ last_match_pos = -1;
+ for(i = 0; i < n; i++) {
+ p = tab[i];
+ is_last = !has_empty_string && sl->cr.len == 0 && i == (n - 1);
+ if (!is_last)
+ split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
+ else
+ split_pos = 0;
+ for(j = 0; j < p->len; j++) {
+ c = p->buf[j];
+ if (c <= 0xffff)
+ re_emit_op_u16(s, REOP_char, c);
+ else
+ re_emit_op_u32(s, REOP_char32, c);
+ }
+ if (!is_last) {
+ last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos);
+ put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
+ }
+ }
+
+ if (sl->cr.len != 0) {
+ /* char range */
+ is_last = !has_empty_string;
+ if (!is_last)
+ split_pos = re_emit_op_u32(s, REOP_split_next_first, 0);
+ else
+ split_pos = 0; /* not used */
+ if (re_emit_range(s, &sl->cr)) {
+ lre_realloc(s->opaque, tab, 0);
+ return -1;
+ }
+ if (!is_last)
+ put_u32(s->byte_code.buf + split_pos, s->byte_code.size - (split_pos + 4));
+ }
+
+ /* patch the 'goto match' */
+ while (last_match_pos != -1) {
+ int next_pos = get_u32(s->byte_code.buf + last_match_pos);
+ put_u32(s->byte_code.buf + last_match_pos, s->byte_code.size - (last_match_pos + 4));
+ last_match_pos = next_pos;
+ }
+
+ lre_realloc(s->opaque, tab, 0);
+ }
+ return 0;
+}
+
+static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp);
+
+static int re_parse_class_set_operand(REParseState *s, REStringList *cr, const uint8_t **pp)
+{
+ int c1;
+ const uint8_t *p = *pp;
+
+ if (*p == '[') {
+ if (re_parse_nested_class(s, cr, pp))
+ return -1;
+ } else {
+ c1 = get_class_atom(s, cr, pp, TRUE);
+ if (c1 < 0)
+ return -1;
+ if (c1 < CLASS_RANGE_BASE) {
+ /* create a range with a single character */
+ re_string_list_init(s, cr);
+ if (s->ignore_case)
+ c1 = lre_canonicalize(c1, s->is_unicode);
+ if (cr_union_interval(&cr->cr, c1, c1)) {
+ re_string_list_free(cr);
+ return -1;
+ }
+ }
+ }
+ return 0;
+}
+
+static int re_parse_nested_class(REParseState *s, REStringList *cr, const uint8_t **pp)
{
const uint8_t *p;
uint32_t c1, c2;
- CharRange cr_s, *cr = &cr_s;
- CharRange cr1_s, *cr1 = &cr1_s;
- BOOL invert;
+ int ret;
+ REStringList cr1_s, *cr1 = &cr1_s;
+ BOOL invert, is_first;
+
+ if (lre_check_stack_overflow(s->opaque, 0))
+ return re_parse_error(s, "stack overflow");
- cr_init(cr, s->opaque, lre_realloc);
+ re_string_list_init(s, cr);
p = *pp;
p++; /* skip '[' */
@@ -800,74 +1328,155 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
p++;
invert = TRUE;
}
-
+
+ /* handle unions */
+ is_first = TRUE;
for(;;) {
if (*p == ']')
break;
- c1 = get_class_atom(s, cr1, &p, TRUE);
- if ((int)c1 < 0)
- goto fail;
- if (*p == '-' && p[1] != ']') {
- const uint8_t *p0 = p + 1;
- if (c1 >= CLASS_RANGE_BASE) {
- if (s->is_unicode) {
- cr_free(cr1);
- goto invalid_class_range;
- }
- /* Annex B: match '-' character */
- goto class_atom;
- }
- c2 = get_class_atom(s, cr1, &p0, TRUE);
- if ((int)c2 < 0)
- goto fail;
- if (c2 >= CLASS_RANGE_BASE) {
- cr_free(cr1);
- if (s->is_unicode) {
- goto invalid_class_range;
- }
- /* Annex B: match '-' character */
- goto class_atom;
- }
- p = p0;
- if (c2 < c1) {
- invalid_class_range:
- re_parse_error(s, "invalid class range");
+ if (*p == '[' && s->unicode_sets) {
+ if (re_parse_nested_class(s, cr1, &p))
goto fail;
- }
- if (cr_union_interval(cr, c1, c2))
- goto memory_error;
+ goto class_union;
} else {
- class_atom:
- if (c1 >= CLASS_RANGE_BASE) {
- int ret;
- ret = cr_union1(cr, cr1->points, cr1->len);
- cr_free(cr1);
- if (ret)
- goto memory_error;
+ c1 = get_class_atom(s, cr1, &p, TRUE);
+ if ((int)c1 < 0)
+ goto fail;
+ if (*p == '-' && p[1] != ']') {
+ const uint8_t *p0 = p + 1;
+ if (p[1] == '-' && s->unicode_sets && is_first)
+ goto class_atom; /* first character class followed by '--' */
+ if (c1 >= CLASS_RANGE_BASE) {
+ if (s->is_unicode) {
+ re_string_list_free(cr1);
+ goto invalid_class_range;
+ }
+ /* Annex B: match '-' character */
+ goto class_atom;
+ }
+ c2 = get_class_atom(s, cr1, &p0, TRUE);
+ if ((int)c2 < 0)
+ goto fail;
+ if (c2 >= CLASS_RANGE_BASE) {
+ re_string_list_free(cr1);
+ if (s->is_unicode) {
+ goto invalid_class_range;
+ }
+ /* Annex B: match '-' character */
+ goto class_atom;
+ }
+ p = p0;
+ if (c2 < c1) {
+ invalid_class_range:
+ re_parse_error(s, "invalid class range");
+ goto fail;
+ }
+ if (s->ignore_case) {
+ CharRange cr2_s, *cr2 = &cr2_s;
+ cr_init(cr2, s->opaque, lre_realloc);
+ if (cr_add_interval(cr2, c1, c2 + 1) ||
+ cr_regexp_canonicalize(cr2, s->is_unicode) ||
+ cr_op1(&cr->cr, cr2->points, cr2->len, CR_OP_UNION)) {
+ cr_free(cr2);
+ goto memory_error;
+ }
+ cr_free(cr2);
+ } else {
+ if (cr_union_interval(&cr->cr, c1, c2))
+ goto memory_error;
+ }
+ is_first = FALSE; /* union operation */
} else {
- if (cr_union_interval(cr, c1, c1))
- goto memory_error;
+ class_atom:
+ if (c1 >= CLASS_RANGE_BASE) {
+ class_union:
+ ret = re_string_list_op(cr, cr1, CR_OP_UNION);
+ re_string_list_free(cr1);
+ if (ret)
+ goto memory_error;
+ } else {
+ if (s->ignore_case)
+ c1 = lre_canonicalize(c1, s->is_unicode);
+ if (cr_union_interval(&cr->cr, c1, c1))
+ goto memory_error;
+ }
}
}
+ if (s->unicode_sets && is_first) {
+ if (*p == '&' && p[1] == '&' && p[2] != '&') {
+ /* handle '&&' */
+ for(;;) {
+ if (*p == ']') {
+ break;
+ } else if (*p == '&' && p[1] == '&' && p[2] != '&') {
+ p += 2;
+ } else {
+ goto invalid_operation;
+ }
+ if (re_parse_class_set_operand(s, cr1, &p))
+ goto fail;
+ ret = re_string_list_op(cr, cr1, CR_OP_INTER);
+ re_string_list_free(cr1);
+ if (ret)
+ goto memory_error;
+ }
+ } else if (*p == '-' && p[1] == '-') {
+ /* handle '--' */
+ for(;;) {
+ if (*p == ']') {
+ break;
+ } else if (*p == '-' && p[1] == '-') {
+ p += 2;
+ } else {
+ invalid_operation:
+ re_parse_error(s, "invalid operation in regular expression");
+ goto fail;
+ }
+ if (re_parse_class_set_operand(s, cr1, &p))
+ goto fail;
+ ret = re_string_list_op(cr, cr1, CR_OP_SUB);
+ re_string_list_free(cr1);
+ if (ret)
+ goto memory_error;
+ }
+ }
+ }
+ is_first = FALSE;
}
- if (s->ignore_case) {
- if (cr_regexp_canonicalize(cr, s->is_unicode))
- goto memory_error;
- }
+
+ p++; /* skip ']' */
+ *pp = p;
if (invert) {
- if (cr_invert(cr))
+ /* XXX: add may_contain_string syntax check to be fully
+ compliant. The test here accepts more input than the
+ spec. */
+ if (cr->n_strings != 0) {
+ re_parse_error(s, "negated character class with strings in regular expression debugger eval code");
+ goto fail;
+ }
+ if (cr_invert(&cr->cr))
goto memory_error;
}
- if (re_emit_range(s, cr))
- goto fail;
- cr_free(cr);
- p++; /* skip ']' */
- *pp = p;
return 0;
memory_error:
re_parse_out_of_memory(s);
fail:
- cr_free(cr);
+ re_string_list_free(cr);
+ return -1;
+}
+
+static int re_parse_char_class(REParseState *s, const uint8_t **pp)
+{
+ REStringList cr_s, *cr = &cr_s;
+
+ if (re_parse_nested_class(s, cr, pp))
+ return -1;
+ if (re_emit_string_list(s, cr))
+ goto fail;
+ re_string_list_free(cr);
+ return 0;
+ fail:
+ re_string_list_free(cr);
return -1;
}
@@ -1121,7 +1730,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
const uint8_t *p;
int c, last_atom_start, quant_min, quant_max, last_capture_count;
BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
- CharRange cr_s, *cr = &cr_s;
+ REStringList cr_s, *cr = &cr_s;
last_atom_start = -1;
last_capture_count = 0;
@@ -1385,9 +1994,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
re_emit_op(s, REOP_prev);
if (c >= CLASS_RANGE_BASE) {
int ret;
- /* Note: canonicalization is not needed */
- ret = re_emit_range(s, cr);
- cr_free(cr);
+ ret = re_emit_string_list(s, cr);
+ re_string_list_free(cr);
if (ret)
return -1;
} else {
@@ -1737,10 +2345,11 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
s->buf_end = s->buf_ptr + buf_len;
s->buf_start = s->buf_ptr;
s->re_flags = re_flags;
- s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
+ s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
+ s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0);
s->capture_count = 1;
s->total_capture_count = -1;
s->has_named_captures = -1;
@@ -1748,7 +2357,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
dbuf_init2(&s->byte_code, opaque, lre_realloc);
dbuf_init2(&s->group_names, opaque, lre_realloc);
- dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
+ dbuf_put_u16(&s->byte_code, re_flags); /* first element is the flags */
dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
dbuf_putc(&s->byte_code, 0); /* stack size */
dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
@@ -1801,7 +2410,8 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
/* add the named groups if needed */
if (s->group_names.size > (s->capture_count - 1)) {
dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
- s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
+ put_u16(s->byte_code.buf + RE_HEADER_FLAGS,
+ lre_get_flags(s->byte_code.buf) | LRE_FLAG_NAMED_GROUPS);
}
dbuf_free(&s->group_names);
@@ -2221,6 +2831,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
v1 = FALSE;
} else {
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
+ if (s->ignore_case)
+ c = lre_canonicalize(c, s->is_unicode);
v1 = is_word_char(c);
}
/* current char */
@@ -2228,6 +2840,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
v2 = FALSE;
} else {
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
+ if (s->ignore_case)
+ c = lre_canonicalize(c, s->is_unicode);
v2 = is_word_char(c);
}
if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
@@ -2424,7 +3038,7 @@ int lre_exec(uint8_t **capture,
re_flags = lre_get_flags(bc_buf);
s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
- s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
+ s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
s->cbuf = cbuf;
@@ -2459,7 +3073,7 @@ int lre_get_capture_count(const uint8_t *bc_buf)
int lre_get_flags(const uint8_t *bc_buf)
{
- return bc_buf[RE_HEADER_FLAGS];
+ return get_u16(bc_buf + RE_HEADER_FLAGS);
}
/* Return NULL if no group names. Otherwise, return a pointer to