From a41f2855e9310602d0680fe6056c3653504705a4 Mon Sep 17 00:00:00 2001 From: drh Date: Mon, 8 Jan 2018 16:54:26 +0000 Subject: [PATCH 1/4] First code for an auxiliary function that will normalize an SQL statement. FossilOrigin-Name: 84814aac81e54f03430f180926156ab0fc01e22bbce2bb228b698ea0d4a588ea --- ext/misc/normalize.c | 639 +++++++++++++++++++++++++++++++++++++++++++ manifest | 14 +- manifest.uuid | 2 +- 3 files changed, 649 insertions(+), 6 deletions(-) create mode 100644 ext/misc/normalize.c diff --git a/ext/misc/normalize.c b/ext/misc/normalize.c new file mode 100644 index 0000000000..914a154d9a --- /dev/null +++ b/ext/misc/normalize.c @@ -0,0 +1,639 @@ +/* +** 2018-01-08 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** This file contains code to implement the sqlite3_normalize() function. +** +** char *sqlite3_normalize(const char *zSql); +** +** This function takes an SQL string as input and returns a "normalized" +** version of that string in memory obtained from sqlite3_malloc64(). The +** caller is responsible for ensuring that the returned memory is freed. +** +** If a memory allocation error occurs, this routine returns NULL. +** +** The normalization consists of the following transformations: +** +** (1) Convert every literal (string, blob literal, numeric constant, +** or "NULL" constant) into a ? +** +** (2) Remove all superfluous whitespace, including comments. Change +** all required whitespace to a single space character. +** +** (3) Lowercase all ASCII characters. +** +** (4) If an IN or NOT IN operator is followed by a list of 1 or more +** values, convert that list into "(?,?,?)". 
+** +** The purpose of normalization is two-fold: +** +** (1) Sanitize queries by removing possibly sensitive information contained +** in literals. +** +** (2) Identify structurally identical queries by comparing their +** normalized forms. +*/ +#include +#include + +/* +** Implementation note: +** +** Much of the tokenizer logic is copied out of the tokenize.c source file +** of SQLite. This logic could be simplified for this particular application, +** but that would impose a risk of introducing subtle errors. It is best to +** keep the code as close to the original as possible. +*/ + + +/* Character classes for tokenizing +** +** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented +** using a lookup table, whereas a switch() directly on c uses a binary search. +** The lookup table is much faster. To maximize speed, and to ensure that +** a lookup table is used, all of the classes need to be small integers and +** all of them need to be used within the switch. +*/ +#define CC_X 0 /* The letter 'x', or start of BLOB literal */ +#define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */ +#define CC_ID 2 /* unicode characters usable in IDs */ +#define CC_DIGIT 3 /* Digits */ +#define CC_DOLLAR 4 /* '$' */ +#define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */ +#define CC_VARNUM 6 /* '?'. Numeric SQL variables */ +#define CC_SPACE 7 /* Space characters */ +#define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */ +#define CC_QUOTE2 9 /* '['. [...] style quoted ids */ +#define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */ +#define CC_MINUS 11 /* '-'. Minus or SQL-style comment */ +#define CC_LT 12 /* '<'. Part of < or <= or <> */ +#define CC_GT 13 /* '>'. Part of > or >= */ +#define CC_EQ 14 /* '='. Part of = or == */ +#define CC_BANG 15 /* '!'. Part of != */ +#define CC_SLASH 16 /* '/'. 
/ or c-style comment */ +#define CC_LP 17 /* '(' */ +#define CC_RP 18 /* ')' */ +#define CC_SEMI 19 /* ';' */ +#define CC_PLUS 20 /* '+' */ +#define CC_STAR 21 /* '*' */ +#define CC_PERCENT 22 /* '%' */ +#define CC_COMMA 23 /* ',' */ +#define CC_AND 24 /* '&' */ +#define CC_TILDA 25 /* '~' */ +#define CC_DOT 26 /* '.' */ +#define CC_ILLEGAL 27 /* Illegal character */ + +static const unsigned char aiClass[] = { +/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ +/* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27, +/* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, +/* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16, +/* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6, +/* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1, +/* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27, +/* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; + +/* An array to map all upper-case characters into their corresponding +** lower-case character. +** +** SQLite only considers US-ASCII (or EBCDIC) characters. We do not +** handle case conversions for the UTF character set since the tables +** involved are nearly as big or bigger than SQLite itself. 
+*/ +const unsigned char sqlite3UpperToLower[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103, + 104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121, + 122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107, + 108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125, + 126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143, + 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161, + 162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179, + 180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197, + 198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215, + 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233, + 234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251, + 252,253,254,255 +}; + +/* +** The following 256 byte lookup table is used to support SQLites built-in +** equivalents to the following standard library functions: +** +** isspace() 0x01 +** isalpha() 0x02 +** isdigit() 0x04 +** isalnum() 0x06 +** isxdigit() 0x08 +** toupper() 0x20 +** SQLite identifier character 0x40 +** Quote character 0x80 +** +** Bit 0x20 is set if the mapped character requires translation to upper +** case. i.e. if the character is a lower-case ASCII character. +** If x is a lower-case ASCII character, then its upper-case equivalent +** is (x - 0x20). Therefore toupper() can be implemented as: +** +** (x & ~(map[x]&0x20)) +** +** The equivalent of tolower() is implemented using the sqlite3UpperToLower[] +** array. tolower() is used more often than toupper() by SQLite. +** +** Bit 0x40 is set if the character is non-alphanumeric and can be used in an +** SQLite identifier. 
Identifiers are alphanumerics, "_", "$", and any +** non-ASCII UTF character. Hence the test for whether or not a character is +** part of an identifier is 0x46. +*/ +const unsigned char sqlite3CtypeMap[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */ + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18..1f ........ */ + 0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, /* 20..27 !"#$%&' */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 28..2f ()*+,-./ */ + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 30..37 01234567 */ + 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 38..3f 89:;<=>? */ + + 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, /* 40..47 @ABCDEFG */ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 48..4f HIJKLMNO */ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 50..57 PQRSTUVW */ + 0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40, /* 58..5f XYZ[\]^_ */ + 0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, /* 60..67 `abcdefg */ + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 68..6f hijklmno */ + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 70..77 pqrstuvw */ + 0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, /* 78..7f xyz{|}~. */ + + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 80..87 ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 88..8f ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 90..97 ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 98..9f ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a0..a7 ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a8..af ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b0..b7 ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b8..bf ........ 
*/ + + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c0..c7 ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c8..cf ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d0..d7 ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d8..df ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e0..e7 ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e8..ef ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* f0..f7 ........ */ + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* f8..ff ........ */ +}; +#define sqlite3Toupper(x) ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20)) +#define sqlite3Isspace(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x01) +#define sqlite3Isalnum(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x06) +#define sqlite3Isalpha(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x02) +#define sqlite3Isdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x04) +#define sqlite3Isxdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x08) +#define sqlite3Tolower(x) (sqlite3UpperToLower[(unsigned char)(x)]) +#define sqlite3Isquote(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x80) + + +/* +** If X is a character that can be used in an identifier then +** IdChar(X) will be true. Otherwise it is false. +** +** For ASCII, any character with the high-order bit set is +** allowed in an identifier. For 7-bit characters, +** sqlite3IsIdChar[X] must be 1. +** +** For EBCDIC, the rules are more complex but have the same +** end result. +** +** Ticket #1066. the SQL standard does not allow '$' in the +** middle of identifiers. But many SQL implementations do. +** SQLite will allow '$' in identifiers for compatibility. +** But the feature is undocumented. 
+*/ +#define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0) + +/* +** Ignore testcase() macros +*/ +#define testcase(X) + +/* +** Token values +*/ +#define TK_SPACE 0 +#define TK_NAME 1 +#define TK_LITERAL 2 +#define TK_PUNCT 3 +#define TK_ERROR 4 + +#define TK_MINUS TK_PUNCT +#define TK_LP TK_PUNCT +#define TK_RP TK_PUNCT +#define TK_SEMI TK_PUNCT +#define TK_PLUS TK_PUNCT +#define TK_STAR TK_PUNCT +#define TK_SLASH TK_PUNCT +#define TK_REM TK_PUNCT +#define TK_EQ TK_PUNCT +#define TK_LE TK_PUNCT +#define TK_NE TK_PUNCT +#define TK_LSHIFT TK_PUNCT +#define TK_LT TK_PUNCT +#define TK_GE TK_PUNCT +#define TK_RSHIFT TK_PUNCT +#define TK_GT TK_PUNCT +#define TK_GE TK_PUNCT +#define TK_BITOR TK_PUNCT +#define TK_CONCAT TK_PUNCT +#define TK_COMMA TK_PUNCT +#define TK_BITAND TK_PUNCT +#define TK_BITNOT TK_PUNCT +#define TK_STRING TK_LITERAL +#define TK_ID TK_NAME +#define TK_ILLEGAL TK_ERROR +#define TK_DOT TK_PUNCT +#define TK_INTEGER TK_LITERAL +#define TK_FLOAT TK_LITERAL +#define TK_VARIABLE TK_LITERAL +#define TK_BLOB TK_LITERAL + +/* +** Return the length (in bytes) of the token that begins at z[0]. +** Store the token type in *tokenType before returning. +*/ +static int sqlite3GetToken(const unsigned char *z, int *tokenType){ + int i, c; + switch( aiClass[*z] ){ /* Switch on the character-class of the first byte + ** of the token. See the comment on the CC_ defines + ** above. 
*/ + case CC_SPACE: { + for(i=1; sqlite3Isspace(z[i]); i++){} + *tokenType = TK_SPACE; + return i; + } + case CC_MINUS: { + if( z[1]=='-' ){ + for(i=2; (c=z[i])!=0 && c!='\n'; i++){} + *tokenType = TK_SPACE; + return i; + } + *tokenType = TK_MINUS; + return 1; + } + case CC_LP: { + *tokenType = TK_LP; + return 1; + } + case CC_RP: { + *tokenType = TK_RP; + return 1; + } + case CC_SEMI: { + *tokenType = TK_SEMI; + return 1; + } + case CC_PLUS: { + *tokenType = TK_PLUS; + return 1; + } + case CC_STAR: { + *tokenType = TK_STAR; + return 1; + } + case CC_SLASH: { + if( z[1]!='*' || z[2]==0 ){ + *tokenType = TK_SLASH; + return 1; + } + for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){} + if( c ) i++; + *tokenType = TK_SPACE; + return i; + } + case CC_PERCENT: { + *tokenType = TK_REM; + return 1; + } + case CC_EQ: { + *tokenType = TK_EQ; + return 1 + (z[1]=='='); + } + case CC_LT: { + if( (c=z[1])=='=' ){ + *tokenType = TK_LE; + return 2; + }else if( c=='>' ){ + *tokenType = TK_NE; + return 2; + }else if( c=='<' ){ + *tokenType = TK_LSHIFT; + return 2; + }else{ + *tokenType = TK_LT; + return 1; + } + } + case CC_GT: { + if( (c=z[1])=='=' ){ + *tokenType = TK_GE; + return 2; + }else if( c=='>' ){ + *tokenType = TK_RSHIFT; + return 2; + }else{ + *tokenType = TK_GT; + return 1; + } + } + case CC_BANG: { + if( z[1]!='=' ){ + *tokenType = TK_ILLEGAL; + return 1; + }else{ + *tokenType = TK_NE; + return 2; + } + } + case CC_PIPE: { + if( z[1]!='|' ){ + *tokenType = TK_BITOR; + return 1; + }else{ + *tokenType = TK_CONCAT; + return 2; + } + } + case CC_COMMA: { + *tokenType = TK_COMMA; + return 1; + } + case CC_AND: { + *tokenType = TK_BITAND; + return 1; + } + case CC_TILDA: { + *tokenType = TK_BITNOT; + return 1; + } + case CC_QUOTE: { + int delim = z[0]; + testcase( delim=='`' ); + testcase( delim=='\'' ); + testcase( delim=='"' ); + for(i=1; (c=z[i])!=0; i++){ + if( c==delim ){ + if( z[i+1]==delim ){ + i++; + }else{ + break; + } + } + } + if( c=='\'' ){ + 
*tokenType = TK_STRING; + return i+1; + }else if( c!=0 ){ + *tokenType = TK_ID; + return i+1; + }else{ + *tokenType = TK_ILLEGAL; + return i; + } + } + case CC_DOT: { + if( !sqlite3Isdigit(z[1]) ){ + *tokenType = TK_DOT; + return 1; + } + /* If the next character is a digit, this is a floating point + ** number that begins with ".". Fall thru into the next case */ + } + case CC_DIGIT: { + *tokenType = TK_INTEGER; + if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){ + for(i=3; sqlite3Isxdigit(z[i]); i++){} + return i; + } + for(i=0; sqlite3Isdigit(z[i]); i++){} + if( z[i]=='.' ){ + i++; + while( sqlite3Isdigit(z[i]) ){ i++; } + *tokenType = TK_FLOAT; + } + if( (z[i]=='e' || z[i]=='E') && + ( sqlite3Isdigit(z[i+1]) + || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2])) + ) + ){ + i += 2; + while( sqlite3Isdigit(z[i]) ){ i++; } + *tokenType = TK_FLOAT; + } + while( IdChar(z[i]) ){ + *tokenType = TK_ILLEGAL; + i++; + } + return i; + } + case CC_QUOTE2: { + for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){} + *tokenType = c==']' ? 
TK_ID : TK_ILLEGAL; + return i; + } + case CC_VARNUM: { + *tokenType = TK_VARIABLE; + for(i=1; sqlite3Isdigit(z[i]); i++){} + return i; + } + case CC_DOLLAR: + case CC_VARALPHA: { + int n = 0; + testcase( z[0]=='$' ); testcase( z[0]=='@' ); + testcase( z[0]==':' ); testcase( z[0]=='#' ); + *tokenType = TK_VARIABLE; + for(i=1; (c=z[i])!=0; i++){ + if( IdChar(c) ){ + n++; + }else if( c=='(' && n>0 ){ + do{ + i++; + }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' ); + if( c==')' ){ + i++; + }else{ + *tokenType = TK_ILLEGAL; + } + break; + }else if( c==':' && z[i+1]==':' ){ + i++; + }else{ + break; + } + } + if( n==0 ) *tokenType = TK_ILLEGAL; + return i; + } + case CC_KYWD: { + for(i=1; aiClass[z[i]]<=CC_KYWD; i++){} + if( IdChar(z[i]) ){ + /* This token started out using characters that can appear in keywords, + ** but z[i] is a character not allowed within keywords, so this must + ** be an identifier instead */ + i++; + break; + } + *tokenType = TK_ID; + return i; + } + case CC_X: { + testcase( z[0]=='x' ); testcase( z[0]=='X' ); + if( z[1]=='\'' ){ + *tokenType = TK_BLOB; + for(i=2; sqlite3Isxdigit(z[i]); i++){} + if( z[i]!='\'' || i%2 ){ + *tokenType = TK_ILLEGAL; + while( z[i] && z[i]!='\'' ){ i++; } + } + if( z[i] ) i++; + return i; + } + /* If it is not a BLOB literal, then it must be an ID, since no + ** SQL keywords start with the letter 'x'. 
Fall through */ + } + case CC_ID: { + i = 1; + break; + } + default: { + *tokenType = TK_ILLEGAL; + return 1; + } + } + while( IdChar(z[i]) ){ i++; } + *tokenType = TK_ID; + return i; +} + +char *sqlite3_normalize(const char *zSql){ + char *z; /* The output string */ + sqlite3_int64 nZ; /* Size of the output string in bytes */ + sqlite3_int64 nSql; /* Size of the input string in bytes */ + int i; /* Next character to read from zSql[] */ + int j; /* Next slot to fill in on z[] */ + int tokenType; /* Type of the next token */ + int n; /* Size of the next token */ + int k; /* Loop counter */ + + nSql = strlen(zSql); + nZ = nSql; + z = sqlite3_malloc64( nZ+2 ); + if( z==0 ) return 0; + for(i=j=0; zSql[i]; i += n){ + n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType); + switch( tokenType ){ + case TK_SPACE: { + break; + } + case TK_ERROR: { + sqlite3_free(z); + return 0; + } + case TK_LITERAL: { + z[j++] = '?'; + break; + } + case TK_PUNCT: + case TK_NAME: { + if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' '; + for(k=0; k0 && z[j-1]==' ' ){ j--; } + if( i>0 && z[j-1]!=';' ){ z[j++] = ';'; } + z[j] = 0; + return z; +} + +#ifdef NORMALIZE_TEST +#include +#include + +static void normalizeFile(char *zIn){ + int i; + if( zIn==0 ) return; + for(i=0; zIn[i]; i++){ + char cSaved; + if( zIn[i]!=';' ) continue; + cSaved = zIn[i+1]; + zIn[i+1] = 0; + if( sqlite3_complete(zIn) ){ + char *zOut = sqlite3_normalize(zIn); + if( zOut ){ + printf("%s\n", zOut); + sqlite3_free(zOut); + }else{ + fprintf(stderr, "ERROR: %s\n", zIn); + } + zIn[i+1] = cSaved; + zIn += i+1; + i = -1; + }else{ + zIn[i+1] = cSaved; + } + } +} + +int main(int argc, char **argv){ + int i; + FILE *in; + char *zBuf = 0; + sqlite3_int64 sz, got; + + for(i=1; i Date: Mon, 8 Jan 2018 19:18:27 +0000 Subject: [PATCH 2/4] Add IN-operator normalizating and the SQLITE_NORMALIZE_CLI compile-time option for generating a stand-alone program. 
FossilOrigin-Name: d77dbb398afa80c1b3373c55f278491e83d5c80ddc22dbc303876bdcbc127df9 --- ext/misc/normalize.c | 67 ++++++++++++++++++++++++++++++++++++++++---- manifest | 15 ++++------ manifest.uuid | 2 +- 3 files changed, 69 insertions(+), 15 deletions(-) diff --git a/ext/misc/normalize.c b/ext/misc/normalize.c index 914a154d9a..fe5fb01b4d 100644 --- a/ext/misc/normalize.c +++ b/ext/misc/normalize.c @@ -35,11 +35,19 @@ ** ** The purpose of normalization is two-fold: ** -** (1) Sanitize queries by removing possibly sensitive information contained -** in literals. +** (1) Sanitize queries by removing potentially private or sensitive +** information contained in literals. ** ** (2) Identify structurally identical queries by comparing their ** normalized forms. +** +** Command-Line Utility +** -------------------- +** +** This file also contains code for a command-line utility that converts +** SQL queries in text files into their normalized forms. To build the +** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI +** and link it against the SQLite library. */ #include #include @@ -48,9 +56,13 @@ ** Implementation note: ** ** Much of the tokenizer logic is copied out of the tokenize.c source file -** of SQLite. This logic could be simplified for this particular application, +** of SQLite. That logic could be simplified for this particular application, ** but that would impose a risk of introducing subtle errors. It is best to ** keep the code as close to the original as possible. +** +** The tokenize code is in sync with the SQLite core as of 2018-01-08. +** Any future changes to the core tokenizer might require corresponding +** adjustments to the tokenizer logic in this module. */ @@ -572,13 +584,54 @@ char *sqlite3_normalize(const char *zSql){ while( j>0 && z[j-1]==' ' ){ j--; } if( i>0 && z[j-1]!=';' ){ z[j++] = ';'; } z[j] = 0; + + /* Make a second pass converting "in(...)" where the "..." 
is not a + ** SELECT statement into "in(?,?,?)" */ + for(i=0; i5 ){ + memmove(z+n+5, z+n+k, j-(n+k)); + } + j = j-k+5; + z[j] = 0; + memcpy(z+n, "?,?,?", 5); + } return z; } -#ifdef NORMALIZE_TEST +/* +** For testing purposes, or to build a stand-alone SQL normalizer program, +** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link +** it against any SQLite library. The resulting command-line program will +** run sqlite3_normalize() over the text of all files named on the command- +** line and show the result on standard output. +*/ +#ifdef SQLITE_NORMALIZE_CLI #include #include +/* +** Break zIn up into separate SQL statements and run sqlite3_normalize() +** on each one. Print the result of each run. +*/ static void normalizeFile(char *zIn){ int i; if( zIn==0 ) return; @@ -604,6 +657,10 @@ static void normalizeFile(char *zIn){ } } +/* +** The main routine for "sql_normalize". Read files named on the +** command-line and run the text of each through sqlite3_normalize(). +*/ int main(int argc, char **argv){ int i; FILE *in; @@ -636,4 +693,4 @@ int main(int argc, char **argv){ } sqlite3_free(zBuf); } -#endif /* NORMALIZE_TEST */ +#endif /* SQLITE_NORMALIZE_CLI */ diff --git a/manifest b/manifest index 7cd42ecfd6..4cc5e208a8 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C First\scode\sfor\san\sauxiliary\sfunction\sthat\swill\snormalize\san\sSQL\sstatement. -D 2018-01-08T16:54:26.258 +C Add\sIN-operator\snormalizating\sand\sthe\sSQLITE_NORMALIZE_CLI\scompile-time\noption\sfor\sgenerating\sa\sstand-alone\sprogram. 
+D 2018-01-08T19:18:27.144 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea F Makefile.in 12b6daa4bdb03fa87da27cbc205ff88ace645475b5be79414a3038b68ade14cb @@ -284,7 +284,7 @@ F ext/misc/json1.c dbe086615b9546c156bf32b9378fc09383b58bd17513b866cfd24c1e15281 F ext/misc/memvfs.c ab36f49e02ebcdf85a1e08dc4d8599ea8f343e073ac9e0bca18a98b7e1ec9567 F ext/misc/mmapwarm.c 70b618f2d0bde43fae288ad0b7498a629f2b6f61b50a27e06fae3cd23c83af29 F ext/misc/nextchar.c 35c8b8baacb96d92abbb34a83a997b797075b342 -F ext/misc/normalize.c 5ca65b1332e41d679d7c07374398b49d020281d4ea30f267c6a6fb500e71f7c7 +F ext/misc/normalize.c c5f9b3968bf0bfac6ca1c1f802a30ca08e96a9ac7ab3297190b2745046d7d838 F ext/misc/percentile.c 92699c8cd7d517ff610e6037e56506f8904dae2e F ext/misc/regexp.c a68d25c659bd2d893cd1215667bbf75ecb9dc7d4 F ext/misc/remember.c add730f0f7e7436cd15ea3fd6a90fd83c3f706ab44169f7f048438b7d6baa69c @@ -1698,10 +1698,7 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P 90cb01d8d6ac12d0b88f2952a75aeefa81ba66f5e4a5377fdd8b9f86aec8e927 -R 9bd921419b4f2c3b5e55f041d45b29ce -T *branch * normalize -T *sym-normalize * -T -sym-trunk * +P 84814aac81e54f03430f180926156ab0fc01e22bbce2bb228b698ea0d4a588ea +R 52aee785605536a5becd57b22eca4f38 U drh -Z c8d4afe60d21d90fa2f1cb97ff587c09 +Z be9e0b00c5811d466485564c364b04ac diff --git a/manifest.uuid b/manifest.uuid index 68c796f48f..6aaff97c39 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -84814aac81e54f03430f180926156ab0fc01e22bbce2bb228b698ea0d4a588ea \ No newline at end of file +d77dbb398afa80c1b3373c55f278491e83d5c80ddc22dbc303876bdcbc127df9 \ No newline at end of 
file From d19866bb98c548049757d4e54efda009badd655c Mon Sep 17 00:00:00 2001 From: drh Date: Mon, 8 Jan 2018 19:29:28 +0000 Subject: [PATCH 3/4] Special handling of the NULL keyword. Sometimes it is a literal, and sometimes it is a keyword. FossilOrigin-Name: db5d138e97f22ad4d4d11dbef96df93696ba0e557809066bc263ca3c3898f349 --- ext/misc/normalize.c | 11 +++++++++++ manifest | 12 ++++++------ manifest.uuid | 2 +- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/ext/misc/normalize.c b/ext/misc/normalize.c index fe5fb01b4d..a3c00210f6 100644 --- a/ext/misc/normalize.c +++ b/ext/misc/normalize.c @@ -573,6 +573,17 @@ char *sqlite3_normalize(const char *zSql){ } case TK_PUNCT: case TK_NAME: { + if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){ + if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3])) + || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4])) + ){ + /* NULL is a keyword in this case, not a literal value */ + }else{ + /* Here the NULL is a literal value */ + z[j++] = '?'; + break; + } + } if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' '; for(k=0; k Date: Mon, 8 Jan 2018 20:04:57 +0000 Subject: [PATCH 4/4] Test cases for sqlite3_normalize(). 
FossilOrigin-Name: 658f42257d56a3562dfa8e55023c6e497f55c565cd751d942de3a9e0cb2fa708 --- Makefile.in | 1 + Makefile.msc | 1 + ext/misc/normalize.c | 4 +-- main.mk | 1 + manifest | 21 +++++++------ manifest.uuid | 2 +- src/test1.c | 30 ++++++++++++++++++ test/normalize.test | 72 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 119 insertions(+), 13 deletions(-) create mode 100644 test/normalize.test diff --git a/Makefile.in b/Makefile.in index af1bd61548..98655bb820 100644 --- a/Makefile.in +++ b/Makefile.in @@ -440,6 +440,7 @@ TESTSRC += \ $(TOP)/ext/misc/ieee754.c \ $(TOP)/ext/misc/mmapwarm.c \ $(TOP)/ext/misc/nextchar.c \ + $(TOP)/ext/misc/normalize.c \ $(TOP)/ext/misc/percentile.c \ $(TOP)/ext/misc/regexp.c \ $(TOP)/ext/misc/remember.c \ diff --git a/Makefile.msc b/Makefile.msc index 4d6c5c0cc2..640f95deaa 100644 --- a/Makefile.msc +++ b/Makefile.msc @@ -1500,6 +1500,7 @@ TESTEXT = \ $(TOP)\ext\misc\ieee754.c \ $(TOP)\ext\misc\mmapwarm.c \ $(TOP)\ext\misc\nextchar.c \ + $(TOP)\ext\misc\normalize.c \ $(TOP)\ext\misc\percentile.c \ $(TOP)\ext\misc\regexp.c \ $(TOP)\ext\misc\remember.c \ diff --git a/ext/misc/normalize.c b/ext/misc/normalize.c index a3c00210f6..fd656f1303 100644 --- a/ext/misc/normalize.c +++ b/ext/misc/normalize.c @@ -130,7 +130,7 @@ static const unsigned char aiClass[] = { ** handle case conversions for the UTF character set since the tables ** involved are nearly as big or bigger than SQLite itself. */ -const unsigned char sqlite3UpperToLower[] = { +static const unsigned char sqlite3UpperToLower[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, @@ -176,7 +176,7 @@ const unsigned char sqlite3UpperToLower[] = { ** non-ASCII UTF character. Hence the test for whether or not a character is ** part of an identifier is 0x46. 
*/ -const unsigned char sqlite3CtypeMap[256] = { +static const unsigned char sqlite3CtypeMap[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */ 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */ diff --git a/main.mk b/main.mk index fc68d9d307..6271e4b682 100644 --- a/main.mk +++ b/main.mk @@ -361,6 +361,7 @@ TESTSRC += \ $(TOP)/ext/misc/ieee754.c \ $(TOP)/ext/misc/mmapwarm.c \ $(TOP)/ext/misc/nextchar.c \ + $(TOP)/ext/misc/normalize.c \ $(TOP)/ext/misc/percentile.c \ $(TOP)/ext/misc/regexp.c \ $(TOP)/ext/misc/remember.c \ diff --git a/manifest b/manifest index 2c752a40db..14067d0999 100644 --- a/manifest +++ b/manifest @@ -1,10 +1,10 @@ -C Special\shandling\sof\sthe\sNULL\skeyword.\s\sSometimes\sit\sis\sa\sliteral,\sand\ssometimes\nit\sis\sa\skeyword. -D 2018-01-08T19:29:28.438 +C Test\scases\sfor\ssqlite3_normalize(). +D 2018-01-08T20:04:57.992 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea -F Makefile.in 12b6daa4bdb03fa87da27cbc205ff88ace645475b5be79414a3038b68ade14cb +F Makefile.in 822052988bbf6204ca2c3e2665618b5a48acec24105d5afa8c4ebb59cb941d3e F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434 -F Makefile.msc b33f630258ce72fb3150c1a6819a484eba3950dc9397571d48672087e12abf4a +F Makefile.msc 0f2875c7d4cb93ae4dab661f58f9d68699d323cfb87ebd7d8dccbb9949979b83 F README.md eeae1e552f93ef72ef7c5b8f6647b368a001c28820ad1df179d3dae602bef681 F VERSION 0c10cdfed866fdd2d80434f64f042c3330f1daaed12e54287beb104f04b3faaf F aclocal.m4 a5c22d164aff7ed549d53a90fa56d56955281f50 @@ -284,7 +284,7 @@ F ext/misc/json1.c dbe086615b9546c156bf32b9378fc09383b58bd17513b866cfd24c1e15281 F ext/misc/memvfs.c ab36f49e02ebcdf85a1e08dc4d8599ea8f343e073ac9e0bca18a98b7e1ec9567 F ext/misc/mmapwarm.c 
70b618f2d0bde43fae288ad0b7498a629f2b6f61b50a27e06fae3cd23c83af29 F ext/misc/nextchar.c 35c8b8baacb96d92abbb34a83a997b797075b342 -F ext/misc/normalize.c f26e69c7539f74dbc0a00468df117bb9085f229ecb64d65c10ebf59a010511bb +F ext/misc/normalize.c 19262ef3ef29d4de2f281b423326865c8916c63d0cb09f1dc98d24d5c1e8ba64 F ext/misc/percentile.c 92699c8cd7d517ff610e6037e56506f8904dae2e F ext/misc/regexp.c a68d25c659bd2d893cd1215667bbf75ecb9dc7d4 F ext/misc/remember.c add730f0f7e7436cd15ea3fd6a90fd83c3f706ab44169f7f048438b7d6baa69c @@ -407,7 +407,7 @@ F ext/userauth/userauth.c 3410be31283abba70255d71fd24734e017a4497f F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 x F ltmain.sh 3ff0879076df340d2e23ae905484d8c15d5fdea8 F magic.txt 8273bf49ba3b0c8559cb2774495390c31fd61c60 -F main.mk 7965d01485f7bec7945407c5238985ea8c7cb2cb686d2bfdbe3d5f79d6fd4eb2 +F main.mk c6b8ee2fa871e01ad6a460d750cd074010292464a60a37a2207930ae0df629dc F mkso.sh fd21c06b063bb16a5d25deea1752c2da6ac3ed83 F mptest/config01.test 3c6adcbc50b991866855f1977ff172eb6d901271 F mptest/config02.test 4415dfe36c48785f751e16e32c20b077c28ae504 @@ -494,7 +494,7 @@ F src/sqliteLimit.h 1513bfb7b20378aa0041e7022d04acb73525de35b80b252f1b83fedb4de6 F src/status.c 9737ed017279a9e0c5da748701c3c7bf1e8ae0dae459aad20dd64fcff97a7e35 F src/table.c b46ad567748f24a326d9de40e5b9659f96ffff34 F src/tclsqlite.c 1833388c01e3b77f4c712185ee7250b9423ee0981ce6ae7e401e47db0319a696 -F src/test1.c b52f9e7fe62016d357c3266fcfa0793cc1883d3cb2b11dfa39fcba2e70b0305c +F src/test1.c 1ab7cbbb6693e08364c1a9241e2aee17f8c4925e4cc52396be77ae6845a05828 F src/test2.c 3efb99ab7f1fc8d154933e02ae1378bac9637da5 F src/test3.c b8434949dfb8aff8dfa082c8b592109e77844c2135ed3c492113839b6956255b F src/test4.c 18ec393bb4d0ad1de729f0b94da7267270f3d8e6 @@ -1093,6 +1093,7 @@ F test/mutex2.test bfeaeac2e73095b2ac32285d2756e3a65e681660 F test/nan.test 437d40e6d0778b050d7750726c0cbd2c9936b81962926e8f8c48ca698f00f4d1 F test/nockpt.test 9a436a7213ba5ef7a32304998d386d3ea3f76c9d F 
test/nolock.test f196cf8b8fbea4e2ca345140a2b3f3b0da45c76e +F test/normalize.test 501630ab49b0b26b65c74124bf03e3374c1b57fa97aae750f84803609141d167 F test/notify1.test 669b2b743618efdc18ca4b02f45423d5d2304abf F test/notify2.test 2ecabaa1305083856b7c39cf32816b612740c161 F test/notify3.test 10ff25cde502e72a92053a2f215d64bece4ef934 @@ -1698,7 +1699,7 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0 -P d77dbb398afa80c1b3373c55f278491e83d5c80ddc22dbc303876bdcbc127df9 -R 52c05b49206cf2984d560c493371a137 +P db5d138e97f22ad4d4d11dbef96df93696ba0e557809066bc263ca3c3898f349 +R 5585a6831c91176202d5e0f81c59c944 U drh -Z 26ab19c5b91f60ca62a12106abc7db8e +Z ca698384a23dd84f9bb875fa9da10ba1 diff --git a/manifest.uuid b/manifest.uuid index c338d86fc5..ac2ec7cffb 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -db5d138e97f22ad4d4d11dbef96df93696ba0e557809066bc263ca3c3898f349 \ No newline at end of file +658f42257d56a3562dfa8e55023c6e497f55c565cd751d942de3a9e0cb2fa708 \ No newline at end of file diff --git a/src/test1.c b/src/test1.c index 55d92eb1fd..bc8f389dbd 100644 --- a/src/test1.c +++ b/src/test1.c @@ -4559,6 +4559,35 @@ static int SQLITE_TCLAPI test_complete16( return TCL_OK; } +/* +** Usage: sqlite3_normalize SQL +** +** Return the normalized value for an SQL statement. 
+*/ +static int SQLITE_TCLAPI test_normalize( + void * clientData, /* Not referenced by this routine */ + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + char *zSql; /* SQL text taken from objv[1] */ + char *zNorm; /* Value returned by sqlite3_normalize(); freed below */ + extern char *sqlite3_normalize(const char*); + + if( objc!=2 ){ /* Exactly one argument after the command name is required */ + Tcl_WrongNumArgs(interp, 1, objv, "SQL"); + return TCL_ERROR; + } + + zSql = (char*)Tcl_GetString(objv[1]); + zNorm = sqlite3_normalize(zSql); + if( zNorm ){ /* When NULL, no result is set here and TCL_OK is still returned */ + Tcl_SetObjResult(interp, Tcl_NewStringObj(zNorm, -1)); + sqlite3_free(zNorm); + } + return TCL_OK; +} + /* ** Usage: sqlite3_step STMT ** @@ -7547,6 +7576,7 @@ int Sqlitetest1_Init(Tcl_Interp *interp){ { "sqlite3_open16", test_open16 ,0 }, { "sqlite3_open_v2", test_open_v2 ,0 }, { "sqlite3_complete16", test_complete16 ,0 }, + { "sqlite3_normalize", test_normalize ,0 }, { "sqlite3_prepare", test_prepare ,0 }, { "sqlite3_prepare16", test_prepare16 ,0 }, diff --git a/test/normalize.test b/test/normalize.test new file mode 100644 index 0000000000..8932650c83 --- /dev/null +++ b/test/normalize.test @@ -0,0 +1,72 @@ +# 2018-01-08 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# +# Tests for the sqlite3_normalize() extension function. 
+# Each table entry below is a triple: test number, input SQL, and the text that sqlite3_normalize is expected to return for it. + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +set testprefix normalize + +foreach {tnum sql norm} { + 100 + {SELECT * FROM t1 WHERE a IN (1) AND b=51.42} + {select*from t1 where a in(?,?,?)and b=?;} + + 110 + {SELECT a, b+15, c FROM t1 WHERE d NOT IN (SELECT x FROM t2);} + {select a,b+?,c from t1 where d not in(select x from t2);} + + 120 + { SELECT NULL, b FROM t1 -- comment text + WHERE d IN (WITH t(a) AS (VALUES(5)) /* CTE */ + SELECT a FROM t) + OR e='hello'; + } + {select?,b from t1 where d in(with t(a)as(values(?))select a from t)or e=?;} + + 121 + {/*Initial comment*/ + -- another comment line + SELECT NULL /* comment */ , b FROM t1 -- comment text + WHERE d IN (WITH t(a) AS (VALUES(5)) /* CTE */ + SELECT a FROM t) + OR e='hello'; + } + {select?,b from t1 where d in(with t(a)as(values(?))select a from t)or e=?;} + + 130 + {/* Query containing parameters */ + SELECT x,$::abc(15),y,@abc,z,?99,w FROM t1 /* Trailing comment */} + {select x,?,y,?,z,?,w from t1;} + + 140 + {/* Long list on the RHS of IN */ + SELECT 15 IN (1,2,3,(SELECT * FROM t1),'xyz',x'abcd',22*(x+5),null);} + {select?in(?,?,?);} + + 150 + {SELECT x'abc'; -- illegal token} + {} + + 160 + {SELECT a,NULL,b FROM t1 WHERE c IS NOT NULL or D is null or e=5} + {select a,?,b from t1 where c is not null or d is null or e=?;} + + 170 + {/* IN list exactly 5 bytes long */ + SELECT * FROM t1 WHERE x IN (1,2,3);} + {select*from t1 where x in(?,?,?);} +} { + do_test $tnum [list sqlite3_normalize $sql] $norm +} + +finish_test