aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.in1
-rw-r--r--Makefile.msc1
-rw-r--r--ext/misc/normalize.c707
-rw-r--r--main.mk1
-rw-r--r--manifest21
-rw-r--r--manifest.uuid2
-rw-r--r--src/test1.c30
-rw-r--r--test/normalize.test72
8 files changed, 825 insertions, 10 deletions
diff --git a/Makefile.in b/Makefile.in
index f340178c7..7bc413e84 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -440,6 +440,7 @@ TESTSRC += \
$(TOP)/ext/misc/ieee754.c \
$(TOP)/ext/misc/mmapwarm.c \
$(TOP)/ext/misc/nextchar.c \
+ $(TOP)/ext/misc/normalize.c \
$(TOP)/ext/misc/percentile.c \
$(TOP)/ext/misc/regexp.c \
$(TOP)/ext/misc/remember.c \
diff --git a/Makefile.msc b/Makefile.msc
index 73c8df2e2..875aeedea 100644
--- a/Makefile.msc
+++ b/Makefile.msc
@@ -1500,6 +1500,7 @@ TESTEXT = \
$(TOP)\ext\misc\ieee754.c \
$(TOP)\ext\misc\mmapwarm.c \
$(TOP)\ext\misc\nextchar.c \
+ $(TOP)\ext\misc\normalize.c \
$(TOP)\ext\misc\percentile.c \
$(TOP)\ext\misc\regexp.c \
$(TOP)\ext\misc\remember.c \
diff --git a/ext/misc/normalize.c b/ext/misc/normalize.c
new file mode 100644
index 000000000..fd656f130
--- /dev/null
+++ b/ext/misc/normalize.c
@@ -0,0 +1,707 @@
+/*
+** 2018-01-08
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+******************************************************************************
+**
+** This file contains code to implement the sqlite3_normalize() function.
+**
+** char *sqlite3_normalize(const char *zSql);
+**
+** This function takes an SQL string as input and returns a "normalized"
+** version of that string in memory obtained from sqlite3_malloc64(). The
+** caller is responsible for ensuring that the returned memory is freed.
+**
+** If a memory allocation error occurs, this routine returns NULL.
+**
+** The normalization consists of the following transformations:
+**
+** (1) Convert every literal (string, blob literal, numeric constant,
+** or "NULL" constant) into a ?
+**
+** (2) Remove all superfluous whitespace, including comments. Change
+** all required whitespace to a single space character.
+**
+** (3) Lowercase all ASCII characters.
+**
+** (4) If an IN or NOT IN operator is followed by a list of 1 or more
+** values, convert that list into "(?,?,?)".
+**
+** The purpose of normalization is two-fold:
+**
+** (1) Sanitize queries by removing potentially private or sensitive
+** information contained in literals.
+**
+** (2) Identify structurally identical queries by comparing their
+** normalized forms.
+**
+** Command-Line Utility
+** --------------------
+**
+** This file also contains code for a command-line utility that converts
+** SQL queries in text files into their normalized forms. To build the
+** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
+** and link it against the SQLite library.
+*/
+#include <sqlite3.h>
+#include <string.h>
+
+/*
+** Implementation note:
+**
+** Much of the tokenizer logic is copied out of the tokenize.c source file
+** of SQLite. That logic could be simplified for this particular application,
+** but that would impose a risk of introducing subtle errors. It is best to
+** keep the code as close to the original as possible.
+**
+** The tokenize code is in sync with the SQLite core as of 2018-01-08.
+** Any future changes to the core tokenizer might require corresponding
+** adjustments to the tokenizer logic in this module.
+*/
+
+
+/* Character classes for tokenizing
+**
+** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
+** using a lookup table, whereas a switch() directly on c uses a binary search.
+** The lookup table is much faster. To maximize speed, and to ensure that
+** a lookup table is used, all of the classes need to be small integers and
+** all of them need to be used within the switch.
+*/
+#define CC_X 0 /* The letter 'x', or start of BLOB literal */
+#define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */
+#define CC_ID 2 /* unicode characters usable in IDs */
+#define CC_DIGIT 3 /* Digits */
+#define CC_DOLLAR 4 /* '$' */
+#define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */
+#define CC_VARNUM 6 /* '?'. Numeric SQL variables */
+#define CC_SPACE 7 /* Space characters */
+#define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */
+#define CC_QUOTE2 9 /* '['. [...] style quoted ids */
+#define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */
+#define CC_MINUS 11 /* '-'. Minus or SQL-style comment */
+#define CC_LT 12 /* '<'. Part of < or <= or <> */
+#define CC_GT 13 /* '>'. Part of > or >= */
+#define CC_EQ 14 /* '='. Part of = or == */
+#define CC_BANG 15 /* '!'. Part of != */
+#define CC_SLASH 16 /* '/'. / or c-style comment */
+#define CC_LP 17 /* '(' */
+#define CC_RP 18 /* ')' */
+#define CC_SEMI 19 /* ';' */
+#define CC_PLUS 20 /* '+' */
+#define CC_STAR 21 /* '*' */
+#define CC_PERCENT 22 /* '%' */
+#define CC_COMMA 23 /* ',' */
+#define CC_AND 24 /* '&' */
+#define CC_TILDA 25 /* '~' */
+#define CC_DOT 26 /* '.' */
+#define CC_ILLEGAL 27 /* Illegal character */
+
+static const unsigned char aiClass[] = {
+/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
+/* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27,
+/* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+/* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16,
+/* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6,
+/* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+/* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1,
+/* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+/* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27,
+/* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+/* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+/* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+/* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+/* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+/* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+/* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+/* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+
+/* An array to map all upper-case characters into their corresponding
+** lower-case character.
+**
+** SQLite only considers US-ASCII (or EBCDIC) characters. We do not
+** handle case conversions for the UTF character set since the tables
+** involved are nearly as big or bigger than SQLite itself.
+*/
+static const unsigned char sqlite3UpperToLower[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
+ 104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
+ 122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
+ 108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
+ 126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
+ 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
+ 162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
+ 180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
+ 198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
+ 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
+ 234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
+ 252,253,254,255
+};
+
+/*
+** The following 256 byte lookup table is used to support SQLites built-in
+** equivalents to the following standard library functions:
+**
+** isspace() 0x01
+** isalpha() 0x02
+** isdigit() 0x04
+** isalnum() 0x06
+** isxdigit() 0x08
+** toupper() 0x20
+** SQLite identifier character 0x40
+** Quote character 0x80
+**
+** Bit 0x20 is set if the mapped character requires translation to upper
+** case. i.e. if the character is a lower-case ASCII character.
+** If x is a lower-case ASCII character, then its upper-case equivalent
+** is (x - 0x20). Therefore toupper() can be implemented as:
+**
+** (x & ~(map[x]&0x20))
+**
+** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
+** array. tolower() is used more often than toupper() by SQLite.
+**
+** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
+** SQLite identifier. Identifiers are alphanumerics, "_", "$", and any
+** non-ASCII UTF character. Hence the test for whether or not a character is
+** part of an identifier is 0x46.
+*/
+static const unsigned char sqlite3CtypeMap[256] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18..1f ........ */
+ 0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, /* 20..27 !"#$%&' */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 28..2f ()*+,-./ */
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 30..37 01234567 */
+ 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 38..3f 89:;<=>? */
+
+ 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, /* 40..47 @ABCDEFG */
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 48..4f HIJKLMNO */
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 50..57 PQRSTUVW */
+ 0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40, /* 58..5f XYZ[\]^_ */
+ 0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, /* 60..67 `abcdefg */
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 68..6f hijklmno */
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 70..77 pqrstuvw */
+ 0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, /* 78..7f xyz{|}~. */
+
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 80..87 ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 88..8f ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 90..97 ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 98..9f ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a0..a7 ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a8..af ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b0..b7 ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b8..bf ........ */
+
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c0..c7 ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c8..cf ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d0..d7 ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d8..df ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e0..e7 ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e8..ef ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* f0..f7 ........ */
+ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* f8..ff ........ */
+};
+#define sqlite3Toupper(x) ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
+#define sqlite3Isspace(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
+#define sqlite3Isalnum(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
+#define sqlite3Isalpha(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
+#define sqlite3Isdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
+#define sqlite3Isxdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
+#define sqlite3Tolower(x) (sqlite3UpperToLower[(unsigned char)(x)])
+#define sqlite3Isquote(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
+
+
+/*
+** If X is a character that can be used in an identifier then
+** IdChar(X) will be true. Otherwise it is false.
+**
+** For ASCII, any character with the high-order bit set is
+** allowed in an identifier. For 7-bit characters,
+** sqlite3IsIdChar[X] must be 1.
+**
+** For EBCDIC, the rules are more complex but have the same
+** end result.
+**
+** Ticket #1066. the SQL standard does not allow '$' in the
+** middle of identifiers. But many SQL implementations do.
+** SQLite will allow '$' in identifiers for compatibility.
+** But the feature is undocumented.
+*/
+#define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
+
+/*
+** Ignore testcase() macros
+*/
+#define testcase(X)
+
+/*
+** Token values
+*/
+#define TK_SPACE 0
+#define TK_NAME 1
+#define TK_LITERAL 2
+#define TK_PUNCT 3
+#define TK_ERROR 4
+
+#define TK_MINUS TK_PUNCT
+#define TK_LP TK_PUNCT
+#define TK_RP TK_PUNCT
+#define TK_SEMI TK_PUNCT
+#define TK_PLUS TK_PUNCT
+#define TK_STAR TK_PUNCT
+#define TK_SLASH TK_PUNCT
+#define TK_REM TK_PUNCT
+#define TK_EQ TK_PUNCT
+#define TK_LE TK_PUNCT
+#define TK_NE TK_PUNCT
+#define TK_LSHIFT TK_PUNCT
+#define TK_LT TK_PUNCT
+#define TK_GE TK_PUNCT
+#define TK_RSHIFT TK_PUNCT
+#define TK_GT TK_PUNCT
+#define TK_GE TK_PUNCT
+#define TK_BITOR TK_PUNCT
+#define TK_CONCAT TK_PUNCT
+#define TK_COMMA TK_PUNCT
+#define TK_BITAND TK_PUNCT
+#define TK_BITNOT TK_PUNCT
+#define TK_STRING TK_LITERAL
+#define TK_ID TK_NAME
+#define TK_ILLEGAL TK_ERROR
+#define TK_DOT TK_PUNCT
+#define TK_INTEGER TK_LITERAL
+#define TK_FLOAT TK_LITERAL
+#define TK_VARIABLE TK_LITERAL
+#define TK_BLOB TK_LITERAL
+
+/*
+** Return the length (in bytes) of the token that begins at z[0].
+** Store the token type in *tokenType before returning.
+*/
+static int sqlite3GetToken(const unsigned char *z, int *tokenType){
+ int i, c;
+ switch( aiClass[*z] ){ /* Switch on the character-class of the first byte
+ ** of the token. See the comment on the CC_ defines
+ ** above. */
+ case CC_SPACE: {
+ for(i=1; sqlite3Isspace(z[i]); i++){}
+ *tokenType = TK_SPACE;
+ return i;
+ }
+ case CC_MINUS: {
+ if( z[1]=='-' ){
+ for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
+ *tokenType = TK_SPACE;
+ return i;
+ }
+ *tokenType = TK_MINUS;
+ return 1;
+ }
+ case CC_LP: {
+ *tokenType = TK_LP;
+ return 1;
+ }
+ case CC_RP: {
+ *tokenType = TK_RP;
+ return 1;
+ }
+ case CC_SEMI: {
+ *tokenType = TK_SEMI;
+ return 1;
+ }
+ case CC_PLUS: {
+ *tokenType = TK_PLUS;
+ return 1;
+ }
+ case CC_STAR: {
+ *tokenType = TK_STAR;
+ return 1;
+ }
+ case CC_SLASH: {
+ if( z[1]!='*' || z[2]==0 ){
+ *tokenType = TK_SLASH;
+ return 1;
+ }
+ for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
+ if( c ) i++;
+ *tokenType = TK_SPACE;
+ return i;
+ }
+ case CC_PERCENT: {
+ *tokenType = TK_REM;
+ return 1;
+ }
+ case CC_EQ: {
+ *tokenType = TK_EQ;
+ return 1 + (z[1]=='=');
+ }
+ case CC_LT: {
+ if( (c=z[1])=='=' ){
+ *tokenType = TK_LE;
+ return 2;
+ }else if( c=='>' ){
+ *tokenType = TK_NE;
+ return 2;
+ }else if( c=='<' ){
+ *tokenType = TK_LSHIFT;
+ return 2;
+ }else{
+ *tokenType = TK_LT;
+ return 1;
+ }
+ }
+ case CC_GT: {
+ if( (c=z[1])=='=' ){
+ *tokenType = TK_GE;
+ return 2;
+ }else if( c=='>' ){
+ *tokenType = TK_RSHIFT;
+ return 2;
+ }else{
+ *tokenType = TK_GT;
+ return 1;
+ }
+ }
+ case CC_BANG: {
+ if( z[1]!='=' ){
+ *tokenType = TK_ILLEGAL;
+ return 1;
+ }else{
+ *tokenType = TK_NE;
+ return 2;
+ }
+ }
+ case CC_PIPE: {
+ if( z[1]!='|' ){
+ *tokenType = TK_BITOR;
+ return 1;
+ }else{
+ *tokenType = TK_CONCAT;
+ return 2;
+ }
+ }
+ case CC_COMMA: {
+ *tokenType = TK_COMMA;
+ return 1;
+ }
+ case CC_AND: {
+ *tokenType = TK_BITAND;
+ return 1;
+ }
+ case CC_TILDA: {
+ *tokenType = TK_BITNOT;
+ return 1;
+ }
+ case CC_QUOTE: {
+ int delim = z[0];
+ testcase( delim=='`' );
+ testcase( delim=='\'' );
+ testcase( delim=='"' );
+ for(i=1; (c=z[i])!=0; i++){
+ if( c==delim ){
+ if( z[i+1]==delim ){
+ i++;
+ }else{
+ break;
+ }
+ }
+ }
+ if( c=='\'' ){
+ *tokenType = TK_STRING;
+ return i+1;
+ }else if( c!=0 ){
+ *tokenType = TK_ID;
+ return i+1;
+ }else{
+ *tokenType = TK_ILLEGAL;
+ return i;
+ }
+ }
+ case CC_DOT: {
+ if( !sqlite3Isdigit(z[1]) ){
+ *tokenType = TK_DOT;
+ return 1;
+ }
+ /* If the next character is a digit, this is a floating point
+ ** number that begins with ".". Fall thru into the next case */
+ }
+ case CC_DIGIT: {
+ *tokenType = TK_INTEGER;
+ if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
+ for(i=3; sqlite3Isxdigit(z[i]); i++){}
+ return i;
+ }
+ for(i=0; sqlite3Isdigit(z[i]); i++){}
+ if( z[i]=='.' ){
+ i++;
+ while( sqlite3Isdigit(z[i]) ){ i++; }
+ *tokenType = TK_FLOAT;
+ }
+ if( (z[i]=='e' || z[i]=='E') &&
+ ( sqlite3Isdigit(z[i+1])
+ || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
+ )
+ ){
+ i += 2;
+ while( sqlite3Isdigit(z[i]) ){ i++; }
+ *tokenType = TK_FLOAT;
+ }
+ while( IdChar(z[i]) ){
+ *tokenType = TK_ILLEGAL;
+ i++;
+ }
+ return i;
+ }
+ case CC_QUOTE2: {
+ for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
+ *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
+ return i;
+ }
+ case CC_VARNUM: {
+ *tokenType = TK_VARIABLE;
+ for(i=1; sqlite3Isdigit(z[i]); i++){}
+ return i;
+ }
+ case CC_DOLLAR:
+ case CC_VARALPHA: {
+ int n = 0;
+ testcase( z[0]=='$' ); testcase( z[0]=='@' );
+ testcase( z[0]==':' ); testcase( z[0]=='#' );
+ *tokenType = TK_VARIABLE;
+ for(i=1; (c=z[i])!=0; i++){
+ if( IdChar(c) ){
+ n++;
+ }else if( c=='(' && n>0 ){
+ do{
+ i++;
+ }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
+ if( c==')' ){
+ i++;
+ }else{
+ *tokenType = TK_ILLEGAL;
+ }
+ break;
+ }else if( c==':' && z[i+1]==':' ){
+ i++;
+ }else{
+ break;
+ }
+ }
+ if( n==0 ) *tokenType = TK_ILLEGAL;
+ return i;
+ }
+ case CC_KYWD: {
+ for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
+ if( IdChar(z[i]) ){
+ /* This token started out using characters that can appear in keywords,
+ ** but z[i] is a character not allowed within keywords, so this must
+ ** be an identifier instead */
+ i++;
+ break;
+ }
+ *tokenType = TK_ID;
+ return i;
+ }
+ case CC_X: {
+ testcase( z[0]=='x' ); testcase( z[0]=='X' );
+ if( z[1]=='\'' ){
+ *tokenType = TK_BLOB;
+ for(i=2; sqlite3Isxdigit(z[i]); i++){}
+ if( z[i]!='\'' || i%2 ){
+ *tokenType = TK_ILLEGAL;
+ while( z[i] && z[i]!='\'' ){ i++; }
+ }
+ if( z[i] ) i++;
+ return i;
+ }
+ /* If it is not a BLOB literal, then it must be an ID, since no
+ ** SQL keywords start with the letter 'x'. Fall through */
+ }
+ case CC_ID: {
+ i = 1;
+ break;
+ }
+ default: {
+ *tokenType = TK_ILLEGAL;
+ return 1;
+ }
+ }
+ while( IdChar(z[i]) ){ i++; }
+ *tokenType = TK_ID;
+ return i;
+}
+
+char *sqlite3_normalize(const char *zSql){
+ char *z; /* The output string */
+ sqlite3_int64 nZ; /* Size of the output string in bytes */
+ sqlite3_int64 nSql; /* Size of the input string in bytes */
+ int i; /* Next character to read from zSql[] */
+ int j; /* Next slot to fill in on z[] */
+ int tokenType; /* Type of the next token */
+ int n; /* Size of the next token */
+ int k; /* Loop counter */
+
+ nSql = strlen(zSql);
+ nZ = nSql;
+ z = sqlite3_malloc64( nZ+2 );
+ if( z==0 ) return 0;
+ for(i=j=0; zSql[i]; i += n){
+ n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
+ switch( tokenType ){
+ case TK_SPACE: {
+ break;
+ }
+ case TK_ERROR: {
+ sqlite3_free(z);
+ return 0;
+ }
+ case TK_LITERAL: {
+ z[j++] = '?';
+ break;
+ }
+ case TK_PUNCT:
+ case TK_NAME: {
+ if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
+ if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
+ || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
+ ){
+ /* NULL is a keyword in this case, not a literal value */
+ }else{
+ /* Here the NULL is a literal value */
+ z[j++] = '?';
+ break;
+ }
+ }
+ if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
+ for(k=0; k<n; k++){
+ z[j++] = sqlite3Tolower(zSql[i+k]);
+ }
+ break;
+ }
+ }
+ }
+ while( j>0 && z[j-1]==' ' ){ j--; }
+ if( i>0 && z[j-1]!=';' ){ z[j++] = ';'; }
+ z[j] = 0;
+
+ /* Make a second pass converting "in(...)" where the "..." is not a
+ ** SELECT statement into "in(?,?,?)" */
+ for(i=0; i<j; i=n){
+ char *zIn = strstr(z+i, "in(");
+ int nParen;
+ if( zIn==0 ) break;
+ n = (int)(zIn-z)+3; /* Index of first char past "in(" */
+ if( n && IdChar(zIn[-1]) ) continue;
+ if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
+ if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
+ for(nParen=1, k=0; z[n+k]; k++){
+ if( z[n+k]=='(' ) nParen++;
+ if( z[n+k]==')' ){
+ nParen--;
+ if( nParen==0 ) break;
+ }
+ }
+ /* k is the number of bytes in the "..." within "in(...)" */
+ if( k<5 ){
+ z = sqlite3_realloc64(z, j+(5-k)+1);
+ if( z==0 ) return 0;
+ memmove(z+n+5, z+n+k, j-(n+k));
+ }else if( k>5 ){
+ memmove(z+n+5, z+n+k, j-(n+k));
+ }
+ j = j-k+5;
+ z[j] = 0;
+ memcpy(z+n, "?,?,?", 5);
+ }
+ return z;
+}
+
+/*
+** For testing purposes, or to build a stand-alone SQL normalizer program,
+** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
+** it against any SQLite library. The resulting command-line program will
+** run sqlite3_normalize() over the text of all files named on the command-
+** line and show the result on standard output.
+*/
+#ifdef SQLITE_NORMALIZE_CLI
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+** Break zIn up into separate SQL statements and run sqlite3_normalize()
+** on each one. Print the result of each run.
+*/
+static void normalizeFile(char *zIn){
+ int i;
+ if( zIn==0 ) return;
+ for(i=0; zIn[i]; i++){
+ char cSaved;
+ if( zIn[i]!=';' ) continue;
+ cSaved = zIn[i+1];
+ zIn[i+1] = 0;
+ if( sqlite3_complete(zIn) ){
+ char *zOut = sqlite3_normalize(zIn);
+ if( zOut ){
+ printf("%s\n", zOut);
+ sqlite3_free(zOut);
+ }else{
+ fprintf(stderr, "ERROR: %s\n", zIn);
+ }
+ zIn[i+1] = cSaved;
+ zIn += i+1;
+ i = -1;
+ }else{
+ zIn[i+1] = cSaved;
+ }
+ }
+}
+
+/*
+** The main routine for "sql_normalize". Read files named on the
+** command-line and run the text of each through sqlite3_normalize().
+*/
+int main(int argc, char **argv){
+ int i;
+ FILE *in;
+ char *zBuf = 0;
+ sqlite3_int64 sz, got;
+
+ for(i=1; i<argc; i++){
+ in = fopen(argv[i], "rb");
+ if( in==0 ){
+ fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
+ continue;
+ }
+ fseek(in, 0, SEEK_END);
+ sz = ftell(in);
+ rewind(in);
+ zBuf = sqlite3_realloc64(zBuf, sz+1);
+ if( zBuf==0 ){
+ fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
+ exit(1);
+ }
+ got = fread(zBuf, 1, sz, in);
+ fclose(in);
+ if( got!=sz ){
+ fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
+ got, sz, argv[i]);
+ }else{
+ zBuf[got] = 0;
+ normalizeFile(zBuf);
+ }
+ }
+ sqlite3_free(zBuf);
+}
+#endif /* SQLITE_NORMALIZE_CLI */
diff --git a/main.mk b/main.mk
index f2640b433..2eddfc6f2 100644
--- a/main.mk
+++ b/main.mk
@@ -361,6 +361,7 @@ TESTSRC += \
$(TOP)/ext/misc/ieee754.c \
$(TOP)/ext/misc/mmapwarm.c \
$(TOP)/ext/misc/nextchar.c \
+ $(TOP)/ext/misc/normalize.c \
$(TOP)/ext/misc/percentile.c \
$(TOP)/ext/misc/regexp.c \
$(TOP)/ext/misc/remember.c \
diff --git a/manifest b/manifest
index ffdd6c23c..57e34466e 100644
--- a/manifest
+++ b/manifest
@@ -1,10 +1,10 @@
-C Interchange\sthe\snumeric\scodes\sfor\sCURSOR_VALID\sand\sCURSOR_INVALID\sto\sobtain\na\ssmall\ssize\sdecrease\sand\sperformance\sincrease.
-D 2018-01-24T14:40:01.721
+C Add\sthe\snormalize.c\sextension.
+D 2018-01-24T15:07:37.708
F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1
F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea
-F Makefile.in 38f84f301cbef443b2d269f67a74b8cc536469831f70df7c3e912acc04932cc2
+F Makefile.in 7a3f714b4fcf793108042b7b0a5c720b0b310ec84314d61ba7f3f49f27e550ea
F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434
-F Makefile.msc ede26e3fb675e0b3b07627640ce5917154a6ee7f8f2c97424eb5ab5f651cbd56
+F Makefile.msc b5e9eb7212c5a61d4800172c710376d1158fd5eea2edc0be8e05c0c372e767e4
F README.md d748f58e3ab0fe0307fb4ae0942b415d93dcc4288756e366cc9e7cf8260c093f
F VERSION cdf91ac446255ecf3d8f6d8c3ee40d64123235ae5b3cef29d344e61b45ec3759
F aclocal.m4 a5c22d164aff7ed549d53a90fa56d56955281f50
@@ -284,6 +284,7 @@ F ext/misc/json1.c dbe086615b9546c156bf32b9378fc09383b58bd17513b866cfd24c1e15281
F ext/misc/memvfs.c ab36f49e02ebcdf85a1e08dc4d8599ea8f343e073ac9e0bca18a98b7e1ec9567
F ext/misc/mmapwarm.c 70b618f2d0bde43fae288ad0b7498a629f2b6f61b50a27e06fae3cd23c83af29
F ext/misc/nextchar.c 35c8b8baacb96d92abbb34a83a997b797075b342
+F ext/misc/normalize.c 19262ef3ef29d4de2f281b423326865c8916c63d0cb09f1dc98d24d5c1e8ba64
F ext/misc/percentile.c 92699c8cd7d517ff610e6037e56506f8904dae2e
F ext/misc/regexp.c a68d25c659bd2d893cd1215667bbf75ecb9dc7d4
F ext/misc/remember.c add730f0f7e7436cd15ea3fd6a90fd83c3f706ab44169f7f048438b7d6baa69c
@@ -409,7 +410,7 @@ F ext/userauth/userauth.c 3410be31283abba70255d71fd24734e017a4497f
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 x
F ltmain.sh 3ff0879076df340d2e23ae905484d8c15d5fdea8
F magic.txt 8273bf49ba3b0c8559cb2774495390c31fd61c60
-F main.mk 4d19895cb268021474ade6244119c80b8a3f1d910cb5b13cf5e04f5f37c3d61b
+F main.mk c8c473bd91d553acab3fb0608ddb69fc769c7bcf6d9e258800504bfda86c792b
F mkso.sh fd21c06b063bb16a5d25deea1752c2da6ac3ed83
F mptest/config01.test 3c6adcbc50b991866855f1977ff172eb6d901271
F mptest/config02.test 4415dfe36c48785f751e16e32c20b077c28ae504
@@ -496,7 +497,7 @@ F src/sqliteLimit.h 1513bfb7b20378aa0041e7022d04acb73525de35b80b252f1b83fedb4de6
F src/status.c 9737ed017279a9e0c5da748701c3c7bf1e8ae0dae459aad20dd64fcff97a7e35
F src/table.c b46ad567748f24a326d9de40e5b9659f96ffff34
F src/tclsqlite.c 11a2618c227fd13ccad73ee02d1199f9880c59db2b3144fd7432db1980a2577d
-F src/test1.c b52f9e7fe62016d357c3266fcfa0793cc1883d3cb2b11dfa39fcba2e70b0305c
+F src/test1.c 1ab7cbbb6693e08364c1a9241e2aee17f8c4925e4cc52396be77ae6845a05828
F src/test2.c 3efb99ab7f1fc8d154933e02ae1378bac9637da5
F src/test3.c b8434949dfb8aff8dfa082c8b592109e77844c2135ed3c492113839b6956255b
F src/test4.c 18ec393bb4d0ad1de729f0b94da7267270f3d8e6
@@ -1095,6 +1096,7 @@ F test/mutex2.test bfeaeac2e73095b2ac32285d2756e3a65e681660
F test/nan.test 437d40e6d0778b050d7750726c0cbd2c9936b81962926e8f8c48ca698f00f4d1
F test/nockpt.test 9a436a7213ba5ef7a32304998d386d3ea3f76c9d
F test/nolock.test f196cf8b8fbea4e2ca345140a2b3f3b0da45c76e
+F test/normalize.test 501630ab49b0b26b65c74124bf03e3374c1b57fa97aae750f84803609141d167
F test/notify1.test 669b2b743618efdc18ca4b02f45423d5d2304abf
F test/notify2.test 2ecabaa1305083856b7c39cf32816b612740c161
F test/notify3.test 10ff25cde502e72a92053a2f215d64bece4ef934
@@ -1700,7 +1702,8 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93
F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc
F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e
F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0
-P e4766cabdf64d8e998048ae43154466fc9c3fad0b86102a42b65122abfcbbe55
-R db620f4cdcf6bbd9d15c47f8bc8f1349
+P e0f192ea6dda4fa0b243d58c8ce41932519141bcae0689a90318b4f866f54edd 658f42257d56a3562dfa8e55023c6e497f55c565cd751d942de3a9e0cb2fa708
+R 01bd832eb21dc9b5a19a52a2b0f11c48
+T +closed 658f42257d56a3562dfa8e55023c6e497f55c565cd751d942de3a9e0cb2fa708
U drh
-Z 93685aa7f4c0affe18e092c9d91e885b
+Z 043a35201b0c771ecddffe95c53bb65f
diff --git a/manifest.uuid b/manifest.uuid
index e81bb2abc..498f994dc 100644
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-e0f192ea6dda4fa0b243d58c8ce41932519141bcae0689a90318b4f866f54edd \ No newline at end of file
+16ebe558d2f0982f630c39b6c1b00d14d27011d454cced213f17c26491a2c06e \ No newline at end of file
diff --git a/src/test1.c b/src/test1.c
index 55d92eb1f..bc8f389db 100644
--- a/src/test1.c
+++ b/src/test1.c
@@ -4560,6 +4560,35 @@ static int SQLITE_TCLAPI test_complete16(
}
/*
+** Usage: sqlite3_normalize SQL
+**
+** Return the normalized value for an SQL statement.
+*/
+static int SQLITE_TCLAPI test_normalize(
+ void * clientData,
+ Tcl_Interp *interp,
+ int objc,
+ Tcl_Obj *CONST objv[]
+){
+ char *zSql;
+ char *zNorm;
+ extern char *sqlite3_normalize(const char*);
+
+ if( objc!=2 ){
+ Tcl_WrongNumArgs(interp, 1, objv, "SQL");
+ return TCL_ERROR;
+ }
+
+ zSql = (char*)Tcl_GetString(objv[1]);
+ zNorm = sqlite3_normalize(zSql);
+ if( zNorm ){
+ Tcl_SetObjResult(interp, Tcl_NewStringObj(zNorm, -1));
+ sqlite3_free(zNorm);
+ }
+ return TCL_OK;
+}
+
+/*
** Usage: sqlite3_step STMT
**
** Advance the statement to the next row.
@@ -7547,6 +7576,7 @@ int Sqlitetest1_Init(Tcl_Interp *interp){
{ "sqlite3_open16", test_open16 ,0 },
{ "sqlite3_open_v2", test_open_v2 ,0 },
{ "sqlite3_complete16", test_complete16 ,0 },
+ { "sqlite3_normalize", test_normalize ,0 },
{ "sqlite3_prepare", test_prepare ,0 },
{ "sqlite3_prepare16", test_prepare16 ,0 },
diff --git a/test/normalize.test b/test/normalize.test
new file mode 100644
index 000000000..8932650c8
--- /dev/null
+++ b/test/normalize.test
@@ -0,0 +1,72 @@
+# 2018-01-08
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+# Tests for the sqlite3_normalize() extension function.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+set testprefix normalize
+
+foreach {tnum sql norm} {
+ 100
+ {SELECT * FROM t1 WHERE a IN (1) AND b=51.42}
+ {select*from t1 where a in(?,?,?)and b=?;}
+
+ 110
+ {SELECT a, b+15, c FROM t1 WHERE d NOT IN (SELECT x FROM t2);}
+ {select a,b+?,c from t1 where d not in(select x from t2);}
+
+ 120
+ { SELECT NULL, b FROM t1 -- comment text
+ WHERE d IN (WITH t(a) AS (VALUES(5)) /* CTE */
+ SELECT a FROM t)
+ OR e='hello';
+ }
+ {select?,b from t1 where d in(with t(a)as(values(?))select a from t)or e=?;}
+
+ 121
+ {/*Initial comment*/
+ -- another comment line
+ SELECT NULL /* comment */ , b FROM t1 -- comment text
+ WHERE d IN (WITH t(a) AS (VALUES(5)) /* CTE */
+ SELECT a FROM t)
+ OR e='hello';
+ }
+ {select?,b from t1 where d in(with t(a)as(values(?))select a from t)or e=?;}
+
+ 130
+ {/* Query containing parameters */
+ SELECT x,$::abc(15),y,@abc,z,?99,w FROM t1 /* Trailing comment */}
+ {select x,?,y,?,z,?,w from t1;}
+
+ 140
+ {/* Long list on the RHS of IN */
+ SELECT 15 IN (1,2,3,(SELECT * FROM t1),'xyz',x'abcd',22*(x+5),null);}
+ {select?in(?,?,?);}
+
+ 150
+ {SELECT x'abc'; -- illegal token}
+ {}
+
+ 160
+ {SELECT a,NULL,b FROM t1 WHERE c IS NOT NULL or D is null or e=5}
+ {select a,?,b from t1 where c is not null or d is null or e=?;}
+
+ 170
+ {/* IN list exactly 5 bytes long */
+ SELECT * FROM t1 WHERE x IN (1,2,3);}
+ {select*from t1 where x in(?,?,?);}
+} {
+ do_test $tnum [list sqlite3_normalize $sql] $norm
+}
+
+finish_test