summary refs log tree commit diff
diff options
context:
space:
mode:
author Fabrice Bellard <fabrice@bellard.org> 2025-05-16 20:34:18 +0200
committer Fabrice Bellard <fabrice@bellard.org> 2025-05-16 20:34:18 +0200
commit f95b8ba1bbf2f6ec85c340d470ad10d335b243c2 (patch)
tree 1f9a98250e5b5b1d959a32bc5efc0627181bd842
parent 9c973a8923145c5682afb904621cfbc59a7ec7f9 (diff)
download quickjs-f95b8ba1bbf2f6ec85c340d470ad10d335b243c2.tar.gz
quickjs-f95b8ba1bbf2f6ec85c340d470ad10d335b243c2.zip
added regexp modifiers
-rw-r--r-- libregexp-opcode.h 12
-rw-r--r-- libregexp.c 178
-rw-r--r-- test262.conf 2
3 files changed, 159 insertions, 33 deletions
diff --git a/libregexp-opcode.h b/libregexp-opcode.h
index f255e09..ebab751 100644
--- a/libregexp-opcode.h
+++ b/libregexp-opcode.h
@@ -26,11 +26,15 @@
DEF(invalid, 1) /* never used */
DEF(char, 3)
+DEF(char_i, 3)
DEF(char32, 5)
+DEF(char32_i, 5)
DEF(dot, 1)
DEF(any, 1) /* same as dot but match any character including line terminator */
DEF(line_start, 1)
+DEF(line_start_m, 1)
DEF(line_end, 1)
+DEF(line_end_m, 1)
DEF(goto, 5)
DEF(split_goto_first, 5)
DEF(split_next_first, 5)
@@ -42,11 +46,17 @@ DEF(loop, 5) /* decrement the top of the stack and goto if != 0 */
DEF(push_i32, 5) /* push integer on the stack */
DEF(drop, 1)
DEF(word_boundary, 1)
+DEF(word_boundary_i, 1)
DEF(not_word_boundary, 1)
+DEF(not_word_boundary_i, 1)
DEF(back_reference, 2)
-DEF(backward_back_reference, 2) /* must come after back_reference */
+DEF(back_reference_i, 2) /* must come immediately after back_reference: the emitter selects the opcode as REOP_back_reference + 2 * is_backward_dir + ignore_case */
+DEF(backward_back_reference, 2) /* must come immediately after back_reference_i (offset 2 from back_reference) */
+DEF(backward_back_reference_i, 2) /* must come immediately after backward_back_reference (offset 3 from back_reference) */
DEF(range, 3) /* variable length */
+DEF(range_i, 3) /* variable length */
DEF(range32, 3) /* variable length */
+DEF(range32_i, 3) /* variable length */
DEF(lookahead, 5)
DEF(negative_lookahead, 5)
DEF(push_char_pos, 1) /* push the character position on the stack */
diff --git a/libregexp.c b/libregexp.c
index cca2197..2b33c86 100644
--- a/libregexp.c
+++ b/libregexp.c
@@ -73,6 +73,7 @@ typedef struct {
BOOL is_unicode;
BOOL unicode_sets; /* if set, is_unicode is also set */
BOOL ignore_case;
+ BOOL multi_line;
BOOL dotall;
int capture_count;
int total_capture_count; /* -1 = not computed yet */
@@ -499,6 +500,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
printf("%s", reopcode_info[opcode].name);
switch(opcode) {
case REOP_char:
+ case REOP_char_i:
val = get_u16(buf + pos + 1);
if (val >= ' ' && val <= 126)
printf(" '%c'", val);
@@ -506,6 +508,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
printf(" 0x%04x", val);
break;
case REOP_char32:
+ case REOP_char32_i:
val = get_u32(buf + pos + 1);
if (val >= ' ' && val <= 126)
printf(" '%c'", val);
@@ -532,7 +535,9 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
case REOP_save_start:
case REOP_save_end:
case REOP_back_reference:
+ case REOP_back_reference_i:
case REOP_backward_back_reference:
+ case REOP_backward_back_reference_i:
printf(" %u", buf[pos + 1]);
break;
case REOP_save_reset:
@@ -543,6 +548,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
printf(" %d", val);
break;
case REOP_range:
+ case REOP_range_i:
{
int n, i;
n = get_u16(buf + pos + 1);
@@ -554,6 +560,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
}
break;
case REOP_range32:
+ case REOP_range32_i:
{
int n, i;
n = get_u16(buf + pos + 1);
@@ -1172,7 +1179,7 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
if (high <= 0xffff) {
/* can use 16 bit ranges with the conversion that 0xffff =
infinity */
- re_emit_op_u16(s, REOP_range, len);
+ re_emit_op_u16(s, s->ignore_case ? REOP_range_i : REOP_range, len);
for(i = 0; i < cr->len; i += 2) {
dbuf_put_u16(&s->byte_code, cr->points[i]);
high = cr->points[i + 1] - 1;
@@ -1181,7 +1188,7 @@ static int re_emit_range(REParseState *s, const CharRange *cr)
dbuf_put_u16(&s->byte_code, high);
}
} else {
- re_emit_op_u16(s, REOP_range32, len);
+ re_emit_op_u16(s, s->ignore_case ? REOP_range32_i : REOP_range32, len);
for(i = 0; i < cr->len; i += 2) {
dbuf_put_u32(&s->byte_code, cr->points[i]);
dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
@@ -1198,10 +1205,18 @@ static int re_string_cmp_len(const void *a, const void *b, void *arg)
return (p1->len < p2->len) - (p1->len > p2->len);
}
+static void re_emit_char(REParseState *s, int c) /* Emit a single-character match opcode for c; the "_i" opcode variant is chosen when s->ignore_case is set. */
+{
+ if (c <= 0xffff) /* value fits in a 16-bit operand */
+ re_emit_op_u16(s, s->ignore_case ? REOP_char_i : REOP_char, c);
+ else /* larger values need the 32-bit operand form */
+ re_emit_op_u32(s, s->ignore_case ? REOP_char32_i : REOP_char32, c);
+}
+
static int re_emit_string_list(REParseState *s, const REStringList *sl)
{
REString **tab, *p;
- int i, j, c, split_pos, last_match_pos, n;
+ int i, j, split_pos, last_match_pos, n;
BOOL has_empty_string, is_last;
// re_string_list_dump("sl", sl);
@@ -1241,11 +1256,7 @@ static int re_emit_string_list(REParseState *s, const REStringList *sl)
else
split_pos = 0;
for(j = 0; j < p->len; j++) {
- c = p->buf[j];
- if (c <= 0xffff)
- re_emit_op_u16(s, REOP_char, c);
- else
- re_emit_op_u32(s, REOP_char32, c);
+ re_emit_char(s, p->buf[j]);
}
if (!is_last) {
last_match_pos = re_emit_op_u32(s, REOP_goto, last_match_pos);
@@ -1497,27 +1508,35 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
len = reopcode_info[opcode].size;
switch(opcode) {
case REOP_range:
+ case REOP_range_i:
val = get_u16(bc_buf + pos + 1);
len += val * 4;
goto simple_char;
case REOP_range32:
+ case REOP_range32_i:
val = get_u16(bc_buf + pos + 1);
len += val * 8;
goto simple_char;
case REOP_char:
+ case REOP_char_i:
case REOP_char32:
+ case REOP_char32_i:
case REOP_dot:
case REOP_any:
simple_char:
ret = FALSE;
break;
case REOP_line_start:
+ case REOP_line_start_m:
case REOP_line_end:
+ case REOP_line_end_m:
case REOP_push_i32:
case REOP_push_char_pos:
case REOP_drop:
case REOP_word_boundary:
+ case REOP_word_boundary_i:
case REOP_not_word_boundary:
+ case REOP_not_word_boundary_i:
case REOP_prev:
/* no effect */
break;
@@ -1525,7 +1544,9 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
case REOP_save_end:
case REOP_save_reset:
case REOP_back_reference:
+ case REOP_back_reference_i:
case REOP_backward_back_reference:
+ case REOP_backward_back_reference_i:
break;
default:
/* safe behavior: we cannot predict the outcome */
@@ -1550,24 +1571,32 @@ static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
len = reopcode_info[opcode].size;
switch(opcode) {
case REOP_range:
+ case REOP_range_i:
val = get_u16(bc_buf + pos + 1);
len += val * 4;
goto simple_char;
case REOP_range32:
+ case REOP_range32_i:
val = get_u16(bc_buf + pos + 1);
len += val * 8;
goto simple_char;
case REOP_char:
+ case REOP_char_i:
case REOP_char32:
+ case REOP_char32_i:
case REOP_dot:
case REOP_any:
simple_char:
count++;
break;
case REOP_line_start:
+ case REOP_line_start_m:
case REOP_line_end:
+ case REOP_line_end_m:
case REOP_word_boundary:
+ case REOP_word_boundary_i:
case REOP_not_word_boundary:
+ case REOP_not_word_boundary_i:
break;
default:
return -1;
@@ -1725,6 +1754,41 @@ static int find_group_name(REParseState *s, const char *name)
static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
+static int re_parse_modifiers(REParseState *s, const uint8_t **pp) /* Parse a run of 'i', 'm', 's' modifier letters at *pp. Returns the OR of the matching LRE_FLAG_* values (0 if there are none) and advances *pp past them; on a repeated letter returns the result of re_parse_error(). */
+{
+ const uint8_t *p = *pp;
+ int mask = 0; /* flags collected so far */
+ int val;
+
+ for(;;) {
+ if (*p == 'i') {
+ val = LRE_FLAG_IGNORECASE;
+ } else if (*p == 'm') {
+ val = LRE_FLAG_MULTILINE;
+ } else if (*p == 's') {
+ val = LRE_FLAG_DOTALL;
+ } else { /* the first character that is not a modifier letter ends the run */
+ break;
+ }
+ if (mask & val) /* each letter may appear at most once in a run */
+ return re_parse_error(s, "duplicate modifier: '%c'", *p); /* the visible caller treats a negative return value as failure */
+ mask |= val;
+ p++;
+ }
+ *pp = p; /* only reached on success: *pp is left untouched on the error path */
+ return mask;
+}
+
+static BOOL update_modifier(BOOL val, int add_mask, int remove_mask,
+ int mask) /* Return the new state of the flag selected by mask: TRUE if add_mask contains it, FALSE if remove_mask contains it, otherwise val unchanged. */
+{
+ if (add_mask & mask)
+ val = TRUE;
+ if (remove_mask & mask) /* tested last, so removal would win if both masks held the bit; the caller visible in re_parse_term rejects overlapping masks */
+ val = FALSE;
+ return val;
+}
+
static int re_parse_term(REParseState *s, BOOL is_backward_dir)
{
const uint8_t *p;
@@ -1739,11 +1803,11 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
switch(c) {
case '^':
p++;
- re_emit_op(s, REOP_line_start);
+ re_emit_op(s, s->multi_line ? REOP_line_start_m : REOP_line_start);
break;
case '$':
p++;
- re_emit_op(s, REOP_line_end);
+ re_emit_op(s, s->multi_line ? REOP_line_end_m : REOP_line_end);
break;
case '.':
p++;
@@ -1793,6 +1857,44 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
p = s->buf_ptr;
if (re_parse_expect(s, &p, ')'))
return -1;
+ } else if (p[2] == 'i' || p[2] == 'm' || p[2] == 's' || p[2] == '-') {
+ BOOL saved_ignore_case, saved_multi_line, saved_dotall;
+ int add_mask, remove_mask;
+ p += 2;
+ remove_mask = 0;
+ add_mask = re_parse_modifiers(s, &p);
+ if (add_mask < 0)
+ return -1;
+ if (*p == '-') {
+ p++;
+ remove_mask = re_parse_modifiers(s, &p);
+ if (remove_mask < 0)
+ return -1;
+ }
+ if ((add_mask == 0 && remove_mask == 0) ||
+ (add_mask & remove_mask) != 0) {
+ return re_parse_error(s, "invalid modifiers");
+ }
+ if (re_parse_expect(s, &p, ':'))
+ return -1;
+ saved_ignore_case = s->ignore_case;
+ saved_multi_line = s->multi_line;
+ saved_dotall = s->dotall;
+ s->ignore_case = update_modifier(s->ignore_case, add_mask, remove_mask, LRE_FLAG_IGNORECASE);
+ s->multi_line = update_modifier(s->multi_line, add_mask, remove_mask, LRE_FLAG_MULTILINE);
+ s->dotall = update_modifier(s->dotall, add_mask, remove_mask, LRE_FLAG_DOTALL);
+
+ last_atom_start = s->byte_code.size;
+ last_capture_count = s->capture_count;
+ s->buf_ptr = p;
+ if (re_parse_disjunction(s, is_backward_dir))
+ return -1;
+ p = s->buf_ptr;
+ if (re_parse_expect(s, &p, ')'))
+ return -1;
+ s->ignore_case = saved_ignore_case;
+ s->multi_line = saved_multi_line;
+ s->dotall = saved_dotall;
} else if ((p[2] == '=' || p[2] == '!')) {
is_neg = (p[2] == '!');
is_backward_lookahead = FALSE;
@@ -1871,7 +1973,11 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
switch(p[1]) {
case 'b':
case 'B':
- re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
+ if (p[1] != 'b') {
+ re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary);
+ } else {
+ re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary);
+ }
p += 2;
break;
case 'k':
@@ -1960,7 +2066,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
emit_back_reference:
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
- re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
+
+ re_emit_op_u8(s, REOP_back_reference + 2 * is_backward_dir + s->ignore_case, c);
}
break;
default:
@@ -2001,10 +2108,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
} else {
if (s->ignore_case)
c = lre_canonicalize(c, s->is_unicode);
- if (c <= 0xffff)
- re_emit_op_u16(s, REOP_char, c);
- else
- re_emit_op_u32(s, REOP_char32, c);
+ re_emit_char(s, c);
}
if (is_backward_dir)
re_emit_op(s, REOP_prev);
@@ -2314,10 +2418,12 @@ static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
stack_size--;
break;
case REOP_range:
+ case REOP_range_i:
val = get_u16(bc_buf + pos + 1);
len += val * 4;
break;
case REOP_range32:
+ case REOP_range32_i:
val = get_u16(bc_buf + pos + 1);
len += val * 8;
break;
@@ -2348,6 +2454,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
s->is_unicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
+ s->multi_line = ((re_flags & LRE_FLAG_MULTILINE) != 0);
s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
s->unicode_sets = ((re_flags & LRE_FLAG_UNICODE_SETS) != 0);
s->capture_count = 1;
@@ -2545,8 +2652,6 @@ typedef struct {
int cbuf_type;
int capture_count;
int stack_size_max;
- BOOL multi_line;
- BOOL ignore_case;
BOOL is_unicode;
int interrupt_counter;
void *opaque; /* used for stack overflow check */
@@ -2695,17 +2800,19 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
}
break;
case REOP_char32:
+ case REOP_char32_i:
val = get_u32(pc);
pc += 4;
goto test_char;
case REOP_char:
+ case REOP_char_i:
val = get_u16(pc);
pc += 2;
test_char:
if (cptr >= cbuf_end)
goto no_match;
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
- if (s->ignore_case) {
+ if (opcode == REOP_char_i || opcode == REOP_char32_i) {
c = lre_canonicalize(c, s->is_unicode);
}
if (val != c)
@@ -2749,18 +2856,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
return LRE_RET_TIMEOUT;
break;
case REOP_line_start:
+ case REOP_line_start_m:
if (cptr == s->cbuf)
break;
- if (!s->multi_line)
+ if (opcode == REOP_line_start)
goto no_match;
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
if (!is_line_terminator(c))
goto no_match;
break;
case REOP_line_end:
+ case REOP_line_end_m:
if (cptr == cbuf_end)
break;
- if (!s->multi_line)
+ if (opcode == REOP_line_end)
goto no_match;
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
if (!is_line_terminator(c))
@@ -2823,15 +2932,19 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match;
break;
case REOP_word_boundary:
+ case REOP_word_boundary_i:
case REOP_not_word_boundary:
+ case REOP_not_word_boundary_i:
{
BOOL v1, v2;
+ int ignore_case = (opcode == REOP_word_boundary_i || opcode == REOP_not_word_boundary_i);
+ BOOL is_boundary = (opcode == REOP_word_boundary || opcode == REOP_word_boundary_i);
/* char before */
if (cptr == s->cbuf) {
v1 = FALSE;
} else {
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
- if (s->ignore_case)
+ if (ignore_case)
c = lre_canonicalize(c, s->is_unicode);
v1 = is_word_char(c);
}
@@ -2840,16 +2953,18 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
v2 = FALSE;
} else {
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
- if (s->ignore_case)
+ if (ignore_case)
c = lre_canonicalize(c, s->is_unicode);
v2 = is_word_char(c);
}
- if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
+ if (v1 ^ v2 ^ is_boundary)
goto no_match;
}
break;
case REOP_back_reference:
+ case REOP_back_reference_i:
case REOP_backward_back_reference:
+ case REOP_backward_back_reference_i:
{
const uint8_t *cptr1, *cptr1_end, *cptr1_start;
uint32_t c1, c2;
@@ -2861,14 +2976,15 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
cptr1_end = capture[2 * val + 1];
if (!cptr1_start || !cptr1_end)
break;
- if (opcode == REOP_back_reference) {
+ if (opcode == REOP_back_reference ||
+ opcode == REOP_back_reference_i) {
cptr1 = cptr1_start;
while (cptr1 < cptr1_end) {
if (cptr >= cbuf_end)
goto no_match;
GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
- if (s->ignore_case) {
+ if (opcode == REOP_back_reference_i) {
c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_unicode);
}
@@ -2882,7 +2998,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match;
GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
- if (s->ignore_case) {
+ if (opcode == REOP_backward_back_reference_i) {
c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_unicode);
}
@@ -2893,6 +3009,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
}
break;
case REOP_range:
+ case REOP_range_i:
{
int n;
uint32_t low, high, idx_min, idx_max, idx;
@@ -2902,7 +3019,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
if (cptr >= cbuf_end)
goto no_match;
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
- if (s->ignore_case) {
+ if (opcode == REOP_range_i) {
c = lre_canonicalize(c, s->is_unicode);
}
idx_min = 0;
@@ -2933,6 +3050,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
}
break;
case REOP_range32:
+ case REOP_range32_i:
{
int n;
uint32_t low, high, idx_min, idx_max, idx;
@@ -2942,7 +3060,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
if (cptr >= cbuf_end)
goto no_match;
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
- if (s->ignore_case) {
+ if (opcode == REOP_range32_i) {
c = lre_canonicalize(c, s->is_unicode);
}
idx_min = 0;
@@ -3036,8 +3154,6 @@ int lre_exec(uint8_t **capture,
StackInt *stack_buf;
re_flags = lre_get_flags(bc_buf);
- s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
- s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
s->is_unicode = (re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0;
s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
diff --git a/test262.conf b/test262.conf
index e99e9da..02df380 100644
--- a/test262.conf
+++ b/test262.conf
@@ -177,7 +177,7 @@ regexp-dotall
regexp-duplicate-named-groups=skip
regexp-lookbehind
regexp-match-indices
-regexp-modifiers=skip
+regexp-modifiers
regexp-named-groups
regexp-unicode-property-escapes
regexp-v-flag