diff options
author | drh <drh@noemail.net> | 2016-02-08 02:30:50 +0000 |
---|---|---|
committer | drh <drh@noemail.net> | 2016-02-08 02:30:50 +0000 |
commit | 8974331fc35a22b48ccf1ca2c709890646f10a3d (patch) | |
tree | e5838f56a193bc1b47f486985dfefff5dacbf000 /src | |
parent | fd0d93fd71b2c9f1f889bb2e2022a1119a2b52bd (diff) | |
download | sqlite-8974331fc35a22b48ccf1ca2c709890646f10a3d.tar.gz sqlite-8974331fc35a22b48ccf1ca2c709890646f10a3d.zip |
Demonstrate a much faster sqlite3GetToken() routine by using a lookup table
to map initial token characters into a character class. This check-in does
not work for EBCDIC. More optimization needed.
FossilOrigin-Name: 9115baa1919584dc8ca25bbff54d3b65748a9631
Diffstat (limited to 'src')
-rw-r--r-- | src/tokenize.c | 133 |
1 files changed, 92 insertions, 41 deletions
diff --git a/src/tokenize.c b/src/tokenize.c index 5bee3d5a8..c4b36c475 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -18,6 +18,56 @@ #include "sqliteInt.h" #include <stdlib.h> +/* Character classes for tokenizing */ +#define CC_X 0 /* The letter 'x' or 'X'. Start of x'01234fed' */ +#define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */ +#define CC_ID 2 /* unicode characters usable in IDs */ +#define CC_DIGIT 3 /* Digits */ +#define CC_DOLLAR 4 /* '$' */ +#define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */ +#define CC_VARNUM 6 /* '?'. Numeric SQL variables */ +#define CC_SPACE 7 /* Space characters */ +#define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */ +#define CC_QUOTE2 9 /* '['. [...] style quoted ids */ +#define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */ +#define CC_MINUS 11 /* '-'. Minus or SQL-style comment */ +#define CC_LT 12 /* '<'. Part of < or <= or <> */ +#define CC_GT 13 /* '>'. Part of > or >= */ +#define CC_EQ 14 /* '='. Part of = or == */ +#define CC_BANG 15 /* '!'. Part of != */ +#define CC_SLASH 16 /* '/'. / or c-style comment */ +#define CC_LP 17 /* '(' */ +#define CC_RP 18 /* ')' */ +#define CC_SEMI 19 /* ';' */ +#define CC_PLUS 20 /* '+' */ +#define CC_STAR 21 /* '*' */ +#define CC_PERCENT 22 /* '%' */ +#define CC_COMMA 23 /* ',' */ +#define CC_AND 24 /* '&' */ +#define CC_TILDA 25 /* '~' */ +#define CC_DOT 26 /* '.' */ +#define CC_ILLEGAL 27 /* Illegal character */ + +static const unsigned char aiClass[] = { +/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ +/* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27, +/* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, +/* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16, +/* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6, +/* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1, +/* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27, +/* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; + /* ** The charMap() macro maps alphabetic characters into their ** lower-case ASCII equivalent. On ASCII machines, this is just @@ -115,8 +165,8 @@ int sqlite3IsIdChar(u8 c){ return IdChar(c); } */ int sqlite3GetToken(const unsigned char *z, int *tokenType){ int i, c; - switch( *z ){ - case ' ': case '\t': case '\n': case '\f': case '\r': { + switch( aiClass[*z] ){ + case CC_SPACE: { testcase( z[0]==' ' ); testcase( z[0]=='\t' ); testcase( z[0]=='\n' ); @@ -126,7 +176,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ *tokenType = TK_SPACE; return i; } - case '-': { + case CC_MINUS: { if( z[1]=='-' ){ for(i=2; (c=z[i])!=0 && c!='\n'; i++){} *tokenType = TK_SPACE; /* IMP: R-22934-25134 */ @@ -135,27 +185,27 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ *tokenType = TK_MINUS; return 1; } - case '(': { + case CC_LP: { *tokenType = TK_LP; return 1; } - case ')': { + case CC_RP: { *tokenType = TK_RP; return 1; } - case ';': { + case CC_SEMI: { *tokenType = TK_SEMI; return 1; } - case '+': { + case CC_PLUS: { *tokenType = TK_PLUS; return 1; } - case '*': { + case CC_STAR: { *tokenType = TK_STAR; return 1; } - case '/': { + case CC_SLASH: { if( z[1]!='*' || z[2]==0 ){ *tokenType = TK_SLASH; return 1; @@ -165,15 +215,15 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ *tokenType = TK_SPACE; /* IMP: R-22934-25134 */ return i; } - case '%': { + case CC_PERCENT: { *tokenType = TK_REM; return 1; } - case '=': { + case CC_EQ: { *tokenType = TK_EQ; return 1 + (z[1]=='='); } - case '<': { + case CC_LT: { if( (c=z[1])=='=' ){ *tokenType = TK_LE; return 2; @@ -188,7 +238,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 1; } } - case '>': { + case CC_GT: { if( (c=z[1])=='=' ){ *tokenType = TK_GE; return 2; @@ -200,7 +250,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 1; } } - case '!': { + case CC_BANG: { if( z[1]!='=' ){ *tokenType = TK_ILLEGAL; return 2; @@ -209,7 +259,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 2; } } - case '|': { + case CC_PIPE: { if( z[1]!='|' ){ *tokenType = TK_BITOR; return 1; @@ -218,21 +268,19 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return 2; } } - case ',': { + case CC_COMMA: { *tokenType = TK_COMMA; return 1; } - case '&': { + case CC_AND: { *tokenType = TK_BITAND; return 1; } - case '~': { + case CC_TILDA: { *tokenType = TK_BITNOT; return 1; } - case '`': - case '\'': - case '"': { + case CC_QUOTE: { int delim = z[0]; testcase( delim=='`' ); testcase( delim=='\'' ); @@ -257,7 +305,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return i; } } - case '.': { + case CC_DOT: { #ifndef SQLITE_OMIT_FLOATING_POINT if( !sqlite3Isdigit(z[1]) ) #endif @@ -268,8 +316,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ /* If the next character is a digit, this is a floating point ** number that begins with ".". Fall thru into the next case */ } - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': { + case CC_DIGIT: { testcase( z[0]=='0' ); testcase( z[0]=='1' ); testcase( z[0]=='2' ); testcase( z[0]=='3' ); testcase( z[0]=='4' ); testcase( z[0]=='5' ); testcase( z[0]=='6' ); testcase( z[0]=='7' ); testcase( z[0]=='8' ); @@ -304,22 +351,18 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ } return i; } - case '[': { + case CC_QUOTE2: { for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){} *tokenType = c==']' ? TK_ID : TK_ILLEGAL; return i; } - case '?': { + case CC_VARNUM: { *tokenType = TK_VARIABLE; for(i=1; sqlite3Isdigit(z[i]); i++){} return i; } -#ifndef SQLITE_OMIT_TCL_VARIABLE - case '$': -#endif - case '@': /* For compatibility with MS SQL Server */ - case '#': - case ':': { + case CC_DOLLAR: + case CC_VARALPHA: { int n = 0; testcase( z[0]=='$' ); testcase( z[0]=='@' ); testcase( z[0]==':' ); testcase( z[0]=='#' ); @@ -349,7 +392,7 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ return i; } #ifndef SQLITE_OMIT_BLOB_LITERAL - case 'x': case 'X': { + case CC_X: { testcase( z[0]=='x' ); testcase( z[0]=='X' ); if( z[1]=='\'' ){ *tokenType = TK_BLOB; @@ -361,20 +404,28 @@ int sqlite3GetToken(const unsigned char *z, int *tokenType){ if( z[i] ) i++; return i; } - /* Otherwise fall through to the next case */ + i = 1; + break; } #endif - default: { - if( !IdChar(*z) ){ - break; - } - for(i=1; IdChar(z[i]); i++){} + case CC_KYWD: { + for(i=1; aiClass[z[i]]<=CC_KYWD; i++){} + if( aiClass[z[i]]<=CC_DOLLAR ){ i++; break; } *tokenType = TK_ID; return keywordCode((char*)z, i, tokenType); } + case CC_ID: { + i = 1; + break; + } + default: { + *tokenType = TK_ILLEGAL; + return 1; + } } - *tokenType = TK_ILLEGAL; - return 1; + while( aiClass[z[i]]<=CC_DOLLAR ){ i++; } + *tokenType = TK_ID; + return i; } /* |