summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libregexp.c29
-rw-r--r--libregexp.h43
-rw-r--r--libunicode.c94
-rw-r--r--libunicode.h103
-rw-r--r--quickjs.c96
-rw-r--r--unicode_gen.c12
6 files changed, 243 insertions, 134 deletions
diff --git a/libregexp.c b/libregexp.c
index d73a19f..1091506 100644
--- a/libregexp.c
+++ b/libregexp.c
@@ -30,6 +30,7 @@
#include "cutils.h"
#include "libregexp.h"
+#include "libunicode.h"
/*
TODO:
@@ -141,32 +142,6 @@ static const uint16_t char_range_s[] = {
0xFEFF, 0xFEFF + 1,
};
-BOOL lre_is_space(int c)
-{
- int i, n, low, high;
- n = (countof(char_range_s) - 1) / 2;
- for(i = 0; i < n; i++) {
- low = char_range_s[2 * i + 1];
- if (c < low)
- return FALSE;
- high = char_range_s[2 * i + 2];
- if (c < high)
- return TRUE;
- }
- return FALSE;
-}
-
-uint32_t const lre_id_start_table_ascii[4] = {
- /* $ A-Z _ a-z */
- 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
-};
-
-uint32_t const lre_id_continue_table_ascii[4] = {
- /* $ 0-9 A-Z _ a-z */
- 0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
-};
-
-
static const uint16_t char_range_w[] = {
4,
0x0030, 0x0039 + 1,
@@ -186,7 +161,7 @@ typedef enum {
CHAR_RANGE_W,
} CharRangeEnum;
-static const uint16_t *char_range_table[] = {
+static const uint16_t * const char_range_table[] = {
char_range_d,
char_range_s,
char_range_w,
diff --git a/libregexp.h b/libregexp.h
index 757b277..7af7ece 100644
--- a/libregexp.h
+++ b/libregexp.h
@@ -25,10 +25,7 @@
#define LIBREGEXP_H
#include <stddef.h>
-
-#include "libunicode.h"
-
-#define LRE_BOOL int /* for documentation purposes */
+#include <stdint.h>
#define LRE_FLAG_GLOBAL (1 << 0)
#define LRE_FLAG_IGNORECASE (1 << 1)
@@ -50,43 +47,9 @@ int lre_exec(uint8_t **capture,
int cbuf_type, void *opaque);
int lre_parse_escape(const uint8_t **pp, int allow_utf16);
-LRE_BOOL lre_is_space(int c);
-/* must be provided by the user */
-LRE_BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size);
+/* must be provided by the user, return non zero if overflow */
+int lre_check_stack_overflow(void *opaque, size_t alloca_size);
void *lre_realloc(void *opaque, void *ptr, size_t size);
-/* JS identifier test */
-extern uint32_t const lre_id_start_table_ascii[4];
-extern uint32_t const lre_id_continue_table_ascii[4];
-
-static inline int lre_js_is_ident_first(int c)
-{
- if ((uint32_t)c < 128) {
- return (lre_id_start_table_ascii[c >> 5] >> (c & 31)) & 1;
- } else {
-#ifdef CONFIG_ALL_UNICODE
- return lre_is_id_start(c);
-#else
- return !lre_is_space(c);
-#endif
- }
-}
-
-static inline int lre_js_is_ident_next(int c)
-{
- if ((uint32_t)c < 128) {
- return (lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1;
- } else {
- /* ZWNJ and ZWJ are accepted in identifiers */
-#ifdef CONFIG_ALL_UNICODE
- return lre_is_id_continue(c) || c == 0x200C || c == 0x200D;
-#else
- return !lre_is_space(c) || c == 0x200C || c == 0x200D;
-#endif
- }
-}
-
-#undef LRE_BOOL
-
#endif /* LIBREGEXP_H */
diff --git a/libunicode.c b/libunicode.c
index a631bbd..c80d2f3 100644
--- a/libunicode.c
+++ b/libunicode.c
@@ -1814,3 +1814,97 @@ int unicode_prop(CharRange *cr, const char *prop_name)
}
#endif /* CONFIG_ALL_UNICODE */
+
+/*---- lre codepoint categorizing functions ----*/
+
+#define S UNICODE_C_SPACE
+#define D UNICODE_C_DIGIT
+#define X UNICODE_C_XDIGIT
+#define U UNICODE_C_UPPER
+#define L UNICODE_C_LOWER
+#define _ UNICODE_C_UNDER
+#define d UNICODE_C_DOLLAR
+
+uint8_t const lre_ctype_bits[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, S, S, S, S, S, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+ S, 0, 0, 0, d, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
+ X|D, X|D, 0, 0, 0, 0, 0, 0,
+
+ 0, X|U, X|U, X|U, X|U, X|U, X|U, U,
+ U, U, U, U, U, U, U, U,
+ U, U, U, U, U, U, U, U,
+ U, U, U, 0, 0, 0, 0, _,
+
+ 0, X|L, X|L, X|L, X|L, X|L, X|L, L,
+ L, L, L, L, L, L, L, L,
+ L, L, L, L, L, L, L, L,
+ L, L, L, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+ S, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+#undef S
+#undef D
+#undef X
+#undef U
+#undef L
+#undef _
+#undef d
+
+/* code point ranges for Zs,Zl or Zp property */
+static const uint16_t char_range_s[] = {
+ 10,
+ 0x0009, 0x000D + 1,
+ 0x0020, 0x0020 + 1,
+ 0x00A0, 0x00A0 + 1,
+ 0x1680, 0x1680 + 1,
+ 0x2000, 0x200A + 1,
+ /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
+ /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
+ 0x2028, 0x2029 + 1,
+ 0x202F, 0x202F + 1,
+ 0x205F, 0x205F + 1,
+ 0x3000, 0x3000 + 1,
+ /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
+ 0xFEFF, 0xFEFF + 1,
+};
+
+BOOL lre_is_space_non_ascii(uint32_t c)
+{
+ size_t i, n;
+
+ n = countof(char_range_s);
+ for(i = 5; i < n; i += 2) {
+ uint32_t low = char_range_s[i];
+ uint32_t high = char_range_s[i + 1];
+ if (c < low)
+ return FALSE;
+ if (c < high)
+ return TRUE;
+ }
+ return FALSE;
+}
diff --git a/libunicode.h b/libunicode.h
index f416157..cc2f244 100644
--- a/libunicode.h
+++ b/libunicode.h
@@ -24,27 +24,13 @@
#ifndef LIBUNICODE_H
#define LIBUNICODE_H
-#include <inttypes.h>
-
-#define LRE_BOOL int /* for documentation purposes */
+#include <stdint.h>
/* define it to include all the unicode tables (40KB larger) */
#define CONFIG_ALL_UNICODE
#define LRE_CC_RES_LEN_MAX 3
-typedef enum {
- UNICODE_NFC,
- UNICODE_NFD,
- UNICODE_NFKC,
- UNICODE_NFKD,
-} UnicodeNormalizationEnum;
-
-int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
-int lre_canonicalize(uint32_t c, LRE_BOOL is_unicode);
-LRE_BOOL lre_is_cased(uint32_t c);
-LRE_BOOL lre_is_case_ignorable(uint32_t c);
-
/* char ranges */
typedef struct {
@@ -102,12 +88,14 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
int cr_invert(CharRange *cr);
-int cr_regexp_canonicalize(CharRange *cr, LRE_BOOL is_unicode);
-
-#ifdef CONFIG_ALL_UNICODE
+int cr_regexp_canonicalize(CharRange *cr, int is_unicode);
-LRE_BOOL lre_is_id_start(uint32_t c);
-LRE_BOOL lre_is_id_continue(uint32_t c);
+typedef enum {
+ UNICODE_NFC,
+ UNICODE_NFD,
+ UNICODE_NFKC,
+ UNICODE_NFKD,
+} UnicodeNormalizationEnum;
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
UnicodeNormalizationEnum n_type,
@@ -115,13 +103,80 @@ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
/* Unicode character range functions */
-int unicode_script(CharRange *cr,
- const char *script_name, LRE_BOOL is_ext);
+int unicode_script(CharRange *cr, const char *script_name, int is_ext);
int unicode_general_category(CharRange *cr, const char *gc_name);
int unicode_prop(CharRange *cr, const char *prop_name);
-#endif /* CONFIG_ALL_UNICODE */
+int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
+int lre_canonicalize(uint32_t c, int is_unicode);
+
+/* Code point type categories */
+enum {
+ UNICODE_C_SPACE = (1 << 0),
+ UNICODE_C_DIGIT = (1 << 1),
+ UNICODE_C_UPPER = (1 << 2),
+ UNICODE_C_LOWER = (1 << 3),
+ UNICODE_C_UNDER = (1 << 4),
+ UNICODE_C_DOLLAR = (1 << 5),
+ UNICODE_C_XDIGIT = (1 << 6),
+};
+extern uint8_t const lre_ctype_bits[256];
+
+/* zero or non-zero return value */
+int lre_is_cased(uint32_t c);
+int lre_is_case_ignorable(uint32_t c);
+int lre_is_id_start(uint32_t c);
+int lre_is_id_continue(uint32_t c);
+
+static inline int lre_is_space_byte(uint8_t c) {
+ return lre_ctype_bits[c] & UNICODE_C_SPACE;
+}
+
+static inline int lre_is_id_start_byte(uint8_t c) {
+ return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
+ UNICODE_C_UNDER | UNICODE_C_DOLLAR);
+}
-#undef LRE_BOOL
+static inline int lre_is_id_continue_byte(uint8_t c) {
+ return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
+ UNICODE_C_UNDER | UNICODE_C_DOLLAR |
+ UNICODE_C_DIGIT);
+}
+
+int lre_is_space_non_ascii(uint32_t c);
+
+static inline int lre_is_space(uint32_t c) {
+ if (c < 256)
+ return lre_is_space_byte(c);
+ else
+ return lre_is_space_non_ascii(c);
+}
+
+static inline int lre_js_is_ident_first(uint32_t c) {
+ if (c < 128) {
+ return lre_is_id_start_byte(c);
+ } else {
+#ifdef CONFIG_ALL_UNICODE
+ return lre_is_id_start(c);
+#else
+ return !lre_is_space_non_ascii(c);
+#endif
+ }
+}
+
+static inline int lre_js_is_ident_next(uint32_t c) {
+ if (c < 128) {
+ return lre_is_id_continue_byte(c);
+ } else {
+ /* ZWNJ and ZWJ are accepted in identifiers */
+ if (c >= 0x200C && c <= 0x200D)
+ return TRUE;
+#ifdef CONFIG_ALL_UNICODE
+ return lre_is_id_continue(c);
+#else
+ return !lre_is_space_non_ascii(c);
+#endif
+ }
+}
#endif /* LIBUNICODE_H */
diff --git a/quickjs.c b/quickjs.c
index e8fdd8a..2834195 100644
--- a/quickjs.c
+++ b/quickjs.c
@@ -44,6 +44,7 @@
#include "list.h"
#include "quickjs.h"
#include "libregexp.h"
+#include "libunicode.h"
#include "libbf.h"
#define OPTIMIZE 1
@@ -21188,8 +21189,7 @@ static JSAtom json_parse_ident(JSParseState *s, const uint8_t **pp, int c)
for(;;) {
buf[ident_pos++] = c;
c = *p;
- if (c >= 128 ||
- !((lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1))
+ if (c >= 128 || !lre_is_id_continue_byte(c))
break;
p++;
if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) {
@@ -21401,9 +21401,29 @@ static __exception int json_next_token(JSParseState *s)
return -1;
}
-/* only used for ':' and '=>', 'let' or 'function' look-ahead. *pp is
- only set if TOK_IMPORT is returned */
-/* XXX: handle all unicode cases */
+static int match_identifier(const uint8_t *p, const char *s) {
+ uint32_t c;
+ while (*s) {
+ if ((uint8_t)*s++ != *p++)
+ return 0;
+ }
+ c = *p;
+ if (c >= 128)
+ c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
+ return !lre_js_is_ident_next(c);
+}
+
+/* simple_next_token() is used to check for the next token in simple cases.
+ It is only used for ':' and '=>', 'let' or 'function' look-ahead.
+ (*pp) is only set if TOK_IMPORT is returned for JS_DetectModule()
+ Whitespace and comments are skipped correctly.
+ Then the next token is analyzed, only for specific words.
+ Return values:
+ - '\n' if !no_line_terminator
+ - TOK_ARROW, TOK_IN, TOK_IMPORT, TOK_OF, TOK_EXPORT, TOK_FUNCTION
+ - TOK_IDENT is returned for other identifiers and keywords
+ - otherwise the next character or unicode codepoint is returned.
+ */
static int simple_next_token(const uint8_t **pp, BOOL no_line_terminator)
{
const uint8_t *p;
@@ -21447,33 +21467,42 @@ static int simple_next_token(const uint8_t **pp, BOOL no_line_terminator)
if (*p == '>')
return TOK_ARROW;
break;
+ case 'i':
+ if (match_identifier(p, "n"))
+ return TOK_IN;
+ if (match_identifier(p, "mport")) {
+ *pp = p + 5;
+ return TOK_IMPORT;
+ }
+ return TOK_IDENT;
+ case 'o':
+ if (match_identifier(p, "f"))
+ return TOK_OF;
+ return TOK_IDENT;
+ case 'e':
+ if (match_identifier(p, "xport"))
+ return TOK_EXPORT;
+ return TOK_IDENT;
+ case 'f':
+ if (match_identifier(p, "unction"))
+ return TOK_FUNCTION;
+ return TOK_IDENT;
+ case '\\':
+ if (*p == 'u') {
+ if (lre_js_is_ident_first(lre_parse_escape(&p, TRUE)))
+ return TOK_IDENT;
+ }
+ break;
default:
- if (lre_js_is_ident_first(c)) {
- if (c == 'i') {
- if (p[0] == 'n' && !lre_js_is_ident_next(p[1])) {
- return TOK_IN;
- }
- if (p[0] == 'm' && p[1] == 'p' && p[2] == 'o' &&
- p[3] == 'r' && p[4] == 't' &&
- !lre_js_is_ident_next(p[5])) {
- *pp = p + 5;
- return TOK_IMPORT;
- }
- } else if (c == 'o' && *p == 'f' && !lre_js_is_ident_next(p[1])) {
- return TOK_OF;
- } else if (c == 'e' &&
- p[0] == 'x' && p[1] == 'p' && p[2] == 'o' &&
- p[3] == 'r' && p[4] == 't' &&
- !lre_js_is_ident_next(p[5])) {
- *pp = p + 5;
- return TOK_EXPORT;
- } else if (c == 'f' && p[0] == 'u' && p[1] == 'n' &&
- p[2] == 'c' && p[3] == 't' && p[4] == 'i' &&
- p[5] == 'o' && p[6] == 'n' && !lre_js_is_ident_next(p[7])) {
- return TOK_FUNCTION;
- }
- return TOK_IDENT;
+ if (c >= 128) {
+ c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p);
+ if (no_line_terminator && (c == CP_PS || c == CP_LS))
+ return '\n';
}
+ if (lre_is_space(c))
+ continue;
+ if (lre_js_is_ident_first(c))
+ return TOK_IDENT;
break;
}
return c;
@@ -26211,7 +26240,6 @@ static int is_let(JSParseState *s, int decl_mask)
int res = FALSE;
if (token_is_pseudo_keyword(s, JS_ATOM_let)) {
-#if 1
JSParsePos pos;
js_parse_get_pos(s, &pos);
for (;;) {
@@ -26244,12 +26272,6 @@ static int is_let(JSParseState *s, int decl_mask)
if (js_parse_seek_token(s, &pos)) {
res = -1;
}
-#else
- int tok = peek_token(s, TRUE);
- if (tok == '{' || tok == TOK_IDENT || peek_token(s, FALSE) == '[') {
- res = TRUE;
- }
-#endif
}
return res;
}
diff --git a/unicode_gen.c b/unicode_gen.c
index 14811ef..4f38052 100644
--- a/unicode_gen.c
+++ b/unicode_gen.c
@@ -273,7 +273,7 @@ int find_name(const char **tab, int tab_len, const char *name)
return -1;
}
-static int get_prop(uint32_t c, int prop_idx)
+static BOOL get_prop(uint32_t c, int prop_idx)
{
return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1;
}
@@ -1981,7 +1981,7 @@ void check_flags(void)
BOOL flag_ref, flag;
for(c = 0; c <= CHARCODE_MAX; c++) {
flag_ref = get_prop(c, PROP_Cased);
- flag = lre_is_cased(c);
+ flag = !!lre_is_cased(c);
if (flag != flag_ref) {
printf("ERROR: c=%05x cased=%d ref=%d\n",
c, flag, flag_ref);
@@ -1989,7 +1989,7 @@ void check_flags(void)
}
flag_ref = get_prop(c, PROP_Case_Ignorable);
- flag = lre_is_case_ignorable(c);
+ flag = !!lre_is_case_ignorable(c);
if (flag != flag_ref) {
printf("ERROR: c=%05x case_ignorable=%d ref=%d\n",
c, flag, flag_ref);
@@ -1997,7 +1997,7 @@ void check_flags(void)
}
flag_ref = get_prop(c, PROP_ID_Start);
- flag = lre_is_id_start(c);
+ flag = !!lre_is_id_start(c);
if (flag != flag_ref) {
printf("ERROR: c=%05x id_start=%d ref=%d\n",
c, flag, flag_ref);
@@ -2005,7 +2005,7 @@ void check_flags(void)
}
flag_ref = get_prop(c, PROP_ID_Continue);
- flag = lre_is_id_continue(c);
+ flag = !!lre_is_id_continue(c);
if (flag != flag_ref) {
printf("ERROR: c=%05x id_cont=%d ref=%d\n",
c, flag, flag_ref);
@@ -2019,7 +2019,7 @@ void check_flags(void)
count = 0;
for(c = 0x20; c <= 0xffff; c++) {
flag_ref = get_prop(c, PROP_ID_Start);
- flag = lre_is_id_start(c);
+ flag = !!lre_is_id_start(c);
assert(flag == flag_ref);
count++;
}