diff options
Diffstat (limited to 'ext/misc/regexp.c')
-rw-r--r-- | ext/misc/regexp.c | 127 |
1 files changed, 117 insertions, 10 deletions
diff --git a/ext/misc/regexp.c b/ext/misc/regexp.c index b626ca424..7413ab80e 100644 --- a/ext/misc/regexp.c +++ b/ext/misc/regexp.c @@ -72,6 +72,7 @@ SQLITE_EXTENSION_INIT1 /* The end-of-input character */ #define RE_EOF 0 /* End of input */ +#define RE_START 0xfffffff /* Start of input - larger than an UTF-8 */ /* The NFA is implemented as sequence of opcodes taken from the following ** set. Each opcode has a single integer argument. @@ -93,6 +94,33 @@ SQLITE_EXTENSION_INIT1 #define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */ #define RE_OP_NOTSPACE 16 /* Not a digit */ #define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */ +#define RE_OP_ATSTART 18 /* Currently at the start of the string */ + +#if defined(SQLITE_DEBUG) +/* Opcode names used for symbolic debugging */ +static const char *ReOpName[] = { + "EOF", + "MATCH", + "ANY", + "ANYSTAR", + "FORK", + "GOTO", + "ACCEPT", + "CC_INC", + "CC_EXC", + "CC_VALUE", + "CC_RANGE", + "WORD", + "NOTWORD", + "DIGIT", + "NOTDIGIT", + "SPACE", + "NOTSPACE", + "BOUNDARY", + "ATSTART", +}; +#endif /* SQLITE_DEBUG */ + /* Each opcode is a "state" in the NFA */ typedef unsigned short ReStateNumber; @@ -127,7 +155,7 @@ struct ReCompiled { int *aArg; /* Arguments to each operator */ unsigned (*xNextChar)(ReInput*); /* Next character function */ unsigned char zInit[12]; /* Initial text to match */ - int nInit; /* Number of characters in zInit */ + int nInit; /* Number of bytes in zInit */ unsigned nState; /* Number of entries in aOp[] and aArg[] */ unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */ }; @@ -200,7 +228,7 @@ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){ ReStateNumber *pToFree; unsigned int i = 0; unsigned int iSwap = 0; - int c = RE_EOF+1; + int c = RE_START; int cPrev = 0; int rc = 0; ReInput in; @@ -219,6 +247,7 @@ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){ in.i++; } if( in.i+pRe->nInit>in.mx ) return 0; + c = RE_START-1; } if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ @@ -247,6 +276,10 @@ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){ if( pRe->aArg[x]==c ) re_add_state(pNext, x+1); break; } + case RE_OP_ATSTART: { + if( cPrev==RE_START ) re_add_state(pThis, x+1); + break; + } case RE_OP_ANY: { if( c!=0 ) re_add_state(pNext, x+1); break; @@ -328,7 +361,9 @@ static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){ } } for(i=0; i<pNext->nState; i++){ - if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; } + int x = pNext->aState[i]; + while( pRe->aOp[x]==RE_OP_GOTO ) x += pRe->aArg[x]; + if( pRe->aOp[x]==RE_OP_ACCEPT ){ rc = 1; break; } } re_match_end: sqlite3_free(pToFree); @@ -483,7 +518,6 @@ static const char *re_subcompile_string(ReCompiled *p){ iStart = p->nState; switch( c ){ case '|': - case '$': case ')': { p->sIn.i--; return 0; @@ -520,6 +554,14 @@ static const char *re_subcompile_string(ReCompiled *p){ re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1); break; } + case '$': { + re_append(p, RE_OP_MATCH, RE_EOF); + break; + } + case '^': { + re_append(p, RE_OP_ATSTART, 0); + break; + } case '{': { int m = 0, n = 0; int sz, j; @@ -538,6 +580,7 @@ static const char *re_subcompile_string(ReCompiled *p){ if( m==0 ){ if( n==0 ) return "both m and n are zero in '{m,n}'"; re_insert(p, iPrev, RE_OP_FORK, sz+1); + iPrev++; n--; }else{ for(j=1; j<m; j++) re_copy(p, iPrev, sz); @@ -656,11 +699,7 @@ static const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){ re_free(pRe); return zErr; } - if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){ - re_append(pRe, RE_OP_MATCH, RE_EOF); - re_append(pRe, RE_OP_ACCEPT, 0); - *ppRe = pRe; - }else if( pRe->sIn.i>=pRe->sIn.mx ){ + if( pRe->sIn.i>=pRe->sIn.mx ){ re_append(pRe, RE_OP_ACCEPT, 0); *ppRe = pRe; }else{ @@ -685,7 +724,7 @@ static const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){ pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6)); pRe->zInit[j++] = 0x80 | (x&0x3f); }else if( x<=0xffff ){ - pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12)); + pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12)); pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); pRe->zInit[j++] = 0x80 | (x&0x3f); }else{ @@ -744,6 +783,67 @@ static void re_sql_func( } } +#if defined(SQLITE_DEBUG) +/* +** This function is used for testing and debugging only. It is only available +** if the SQLITE_DEBUG compile-time option is used. +** +** Compile a regular expression and then convert the compiled expression into +** text and return that text. +*/ +static void re_bytecode_func( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + const char *zPattern; + const char *zErr; + ReCompiled *pRe; + sqlite3_str *pStr; + int i; + int n; + char *z; + + zPattern = (const char*)sqlite3_value_text(argv[0]); + if( zPattern==0 ) return; + zErr = re_compile(&pRe, zPattern, sqlite3_user_data(context)!=0); + if( zErr ){ + re_free(pRe); + sqlite3_result_error(context, zErr, -1); + return; + } + if( pRe==0 ){ + sqlite3_result_error_nomem(context); + return; + } + pStr = sqlite3_str_new(0); + if( pStr==0 ) goto re_bytecode_func_err; + if( pRe->nInit>0 ){ + sqlite3_str_appendf(pStr, "INIT "); + for(i=0; i<pRe->nInit; i++){ + sqlite3_str_appendf(pStr, "%02x", pRe->zInit[i]); + } + sqlite3_str_appendf(pStr, "\n"); + } + for(i=0; i<pRe->nState; i++){ + sqlite3_str_appendf(pStr, "%-8s %4d\n", + ReOpName[(unsigned char)pRe->aOp[i]], pRe->aArg[i]); + } + n = sqlite3_str_length(pStr); + z = sqlite3_str_finish(pStr); + if( n==0 ){ + sqlite3_free(z); + }else{ + sqlite3_result_text(context, z, n-1, sqlite3_free); + } + +re_bytecode_func_err: + re_free(pRe); +} + +#endif /* SQLITE_DEBUG */ + + /* ** Invoke this routine to register the regexp() function with the ** SQLite database connection. @@ -768,6 +868,13 @@ int sqlite3_regexp_init( rc = sqlite3_create_function(db, "regexpi", 2, SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC, (void*)db, re_sql_func, 0, 0); +#if defined(SQLITE_DEBUG) + if( rc==SQLITE_OK ){ + rc = sqlite3_create_function(db, "regexp_bytecode", 1, + SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC, + 0, re_bytecode_func, 0, 0); + } +#endif /* SQLITE_DEBUG */ } return rc; } |