diff --git a/ext/fts1/fts1.c b/ext/fts1/fts1.c index 286655253a..ddd4529c7e 100644 --- a/ext/fts1/fts1.c +++ b/ext/fts1/fts1.c @@ -177,6 +177,25 @@ static int getVarint32(const char *p, int *pi){ * the previous token to make the estimate a tiny bit more precise. */ +/* It is not safe to call isspace(), tolower(), or isalnum() on +** hi-bit-set characters. This is the same solution used in the +** tokenizer. +*/ +/* TODO(shess) The snippet-generation code should be using the +** tokenizer-generated tokens rather than doing its own local +** tokenization. +*/ +/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */ +static int safe_isspace(char c){ + return (c&0x80)==0 ? isspace(c) : 0; +} +static int safe_tolower(char c){ + return (c&0x80)==0 ? tolower(c) : c; +} +static int safe_isalnum(char c){ + return (c&0x80)==0 ? isalnum(c) : 0; +} + typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ @@ -1536,7 +1555,7 @@ static int getToken(const char *z, int *tokenType){ return 0; } case ' ': case '\t': case '\n': case '\f': case '\r': { - for(i=1; isspace(z[i]); i++){} + for(i=1; safe_isspace(z[i]); i++){} *tokenType = TOKEN_SPACE; return i; } @@ -1688,7 +1707,7 @@ static void tokenListToIdList(char **azIn){ int i, j; if( azIn ){ for(i=0, j=-1; azIn[i]; i++){ - if( isalnum(azIn[i][0]) || azIn[i][1] ){ + if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){ dequoteString(azIn[i]); if( j>=0 ){ azIn[j] = azIn[i]; @@ -1737,11 +1756,11 @@ static char *firstToken(char *zIn, char **pzTail){ ** s[] is t[]. */ static int startsWith(const char *s, const char *t){ - while( isspace(*s) ){ s++; } + while( safe_isspace(*s) ){ s++; } while( *t ){ - if( tolower(*s++)!=tolower(*t++) ) return 0; + if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0; } - return *s!='_' && !isalnum(*s); + return *s!='_' && !safe_isalnum(*s); } /* @@ -1853,7 +1872,7 @@ static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char *p; pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); for (p = pSpec->azContentColumn[i]; *p ; ++p) { - if( !isalnum(*p) ) *p = '_'; + if( !safe_isalnum(*p) ) *p = '_'; } } @@ -2330,10 +2349,10 @@ static int wordBoundary( } } for(i=1; i<=10; i++){ - if( isspace(zDoc[iBreak-i]) ){ + if( safe_isspace(zDoc[iBreak-i]) ){ return iBreak - i + 1; } - if( isspace(zDoc[iBreak+i]) ){ + if( safe_isspace(zDoc[iBreak+i]) ){ return iBreak + i + 1; } } @@ -2346,7 +2365,7 @@ static int wordBoundary( */ static void appendWhiteSpace(StringBuffer *p){ if( p->len==0 ) return; - if( isspace(p->s[p->len-1]) ) return; + if( safe_isspace(p->s[p->len-1]) ) return; append(p, " "); } @@ -2354,7 +2373,7 @@ static void appendWhiteSpace(StringBuffer *p){ ** Remove white space from teh end of the StringBuffer */ static void trimWhiteSpace(StringBuffer *p){ - while( p->len>0 && isspace(p->s[p->len-1]) ){ + while( p->len>0 && safe_isspace(p->s[p->len-1]) ){ p->len--; } } diff --git a/ext/fts2/fts2.c b/ext/fts2/fts2.c index 3f49a2958b..2955e731a1 100644 --- a/ext/fts2/fts2.c +++ b/ext/fts2/fts2.c @@ -304,6 +304,25 @@ SQLITE_EXTENSION_INIT1 # define TRACE(A) #endif +/* It is not safe to call isspace(), tolower(), or isalnum() on +** hi-bit-set characters. This is the same solution used in the +** tokenizer. +*/ +/* TODO(shess) The snippet-generation code should be using the +** tokenizer-generated tokens rather than doing its own local +** tokenization. +*/ +/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */ +static int safe_isspace(char c){ + return (c&0x80)==0 ? isspace(c) : 0; +} +static int safe_tolower(char c){ + return (c&0x80)==0 ? tolower(c) : c; +} +static int safe_isalnum(char c){ + return (c&0x80)==0 ? isalnum(c) : 0; +} + typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ @@ -504,7 +523,7 @@ static void appendList(StringBuffer *sb, int nString, char **azString){ static int endsInWhiteSpace(StringBuffer *p){ return stringBufferLength(p)>0 && - isspace(stringBufferData(p)[stringBufferLength(p)-1]); + safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]); } /* If the StringBuffer ends in something other than white space, add a @@ -2194,7 +2213,7 @@ static int getToken(const char *z, int *tokenType){ return 0; } case ' ': case '\t': case '\n': case '\f': case '\r': { - for(i=1; isspace(z[i]); i++){} + for(i=1; safe_isspace(z[i]); i++){} *tokenType = TOKEN_SPACE; return i; } @@ -2346,7 +2365,7 @@ static void tokenListToIdList(char **azIn){ int i, j; if( azIn ){ for(i=0, j=-1; azIn[i]; i++){ - if( isalnum(azIn[i][0]) || azIn[i][1] ){ + if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){ dequoteString(azIn[i]); if( j>=0 ){ azIn[j] = azIn[i]; @@ -2395,11 +2414,11 @@ static char *firstToken(char *zIn, char **pzTail){ ** s[] is t[]. */ static int startsWith(const char *s, const char *t){ - while( isspace(*s) ){ s++; } + while( safe_isspace(*s) ){ s++; } while( *t ){ - if( tolower(*s++)!=tolower(*t++) ) return 0; + if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0; } - return *s!='_' && !isalnum(*s); + return *s!='_' && !safe_isalnum(*s); } /* @@ -2511,7 +2530,7 @@ static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char *p; pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); for (p = pSpec->azContentColumn[i]; *p ; ++p) { - if( !isalnum(*p) ) *p = '_'; + if( !safe_isalnum(*p) ) *p = '_'; } } @@ -2971,10 +2990,10 @@ static int wordBoundary( } } for(i=1; i<=10; i++){ - if( isspace(zDoc[iBreak-i]) ){ + if( safe_isspace(zDoc[iBreak-i]) ){ return iBreak - i + 1; } - if( isspace(zDoc[iBreak+i]) ){ + if( safe_isspace(zDoc[iBreak+i]) ){ return iBreak + i + 1; } } diff --git a/manifest b/manifest index f34f66db28..f0ff011d8b 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Assume\sthe\smalloc-failed\sflag\scannot\salready\sbe\sset\swhen\scalling\ssqlite3_errmsg(16)().\s(CVS\s3745) -D 2007-03-29T15:00:53 +C Don't\scall\sctype\sfunctions\son\shi-bit\schars.\s\sSome\splatforms\sraise\nassertions\swhen\sthis\soccurs,\sand\sit's\salmost\scertainly\snot\sthe\sright\nthing\sto\sdo\sin\sthe\sfirst\splace.\s(CVS\s3746) +D 2007-03-29T16:30:39 F Makefile.in 2f2c3bf69faf0ae7b8e8af4f94f1986849034530 F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -22,7 +22,7 @@ F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1 F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5 -F ext/fts1/fts1.c 0aab3cf20eefd38935c8f525494d689cb2785f1d +F ext/fts1/fts1.c 7585d9cb7ad7bcdf162936ab1fd64868f2f55ea5 F ext/fts1/fts1.h 6060b8f62c1d925ea8356cb1a6598073eb9159a6 F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114 F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089 @@ -34,7 +34,7 @@ F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9 F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d -F ext/fts2/fts2.c de8321a2ad1edea1f0dd223cb86cf008451784a4 +F ext/fts2/fts2.c 2e3cb46d28b0dd17b2ad3b48409618ace73caec6 F ext/fts2/fts2.h bbdab26d34f91974d5b9ade8b7836c140a7c4ce1 F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e @@ -214,6 +214,7 @@ F test/fts1e.test 77244843e925560b5a0b70069c3e7ab62f181ed2 F test/fts1f.test 2d6cb10d8b7a4e6edc321bbdb3982f1f48774714 F test/fts1i.test 6bfe08cdfdced063a39a50c8601da65e6274d879 F test/fts1j.test e4c0ffcd0ba2adce09c6b7b43ffd0749b5fda5c7 +F test/fts1k.test fdf295cb797ba6a2ef81ec41cb98df0ceb2e572c F test/fts1porter.test d86e9c3e0c7f8ff95add6582b4b585fb4e02b96d F test/fts2a.test 103fc178d134c54c44c1938a4331e9e2030792d9 F test/fts2b.test 964abc0236c849c07ca1ae496bb25c268ae94816 @@ -225,6 +226,7 @@ F test/fts2g.test c69a8ab43ec77d123976ba6cf9422d647ae63032 F test/fts2h.test 223af921323b409d4b5b18ff4e51619541b174bb F test/fts2i.test 1b22451d1f13f7c509baec620dc3a4a754885dd6 F test/fts2j.test f68d7611f76309bc8b94170f3740d9fbbc061d9b +F test/fts2l.test 4c53c89ce3919003765ff4fd8d98ecf724d97dd3 F test/func.test 019d706b2458dfdf239c74cc31143446de1ee44a F test/hook.test 7e7645fd9a033f79cce8fdff151e32715e7ec50a F test/in.test 369cb2aa1eab02296b4ec470732fe8c131260b1d @@ -444,7 +446,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513 -P 3714ac173289e580a0302a5a3beac05823d92c5b -R 3cfcb502e90a93f72d96670b4207913a -U danielk1977 -Z 7e6377bdbc94cfb816f281bbc3868b86 +P 54fa22273d551e00e1abd86992ff7c62ec4e0daf +R 6645d4541d0d9e478c5b564689374f5f +U shess +Z 5e17544799ed91760b443021ffc206bc diff --git a/manifest.uuid b/manifest.uuid index e342a433b9..7d1b46cd55 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -54fa22273d551e00e1abd86992ff7c62ec4e0daf \ No newline at end of file +f6c3abdc6c5e916e5366ba28fb1cd06ca3554303 \ No newline at end of file diff --git a/test/fts1k.test b/test/fts1k.test new file mode 100644 index 0000000000..2fffa41cdb --- /dev/null +++ b/test/fts1k.test @@ -0,0 +1,69 @@ +# 2007 March 28 +# +# The author disclaims copyright to this source code. +# +#************************************************************************* +# This file implements regression tests for SQLite library. The focus +# of this script is testing isspace/isalnum/tolower problems with the +# FTS1 module. Unfortunately, this code isn't a really principled set +# of tests, because it's impossible to know where new uses of these +# functions might appear. +# +# $Id: fts1k.test,v 1.1 2007/03/29 16:30:41 shess Exp $ +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl + +# If SQLITE_ENABLE_FTS1 is defined, omit this file. +ifcapable !fts1 { + finish_test + return +} + +# Tests that startsWith() (calls isspace, tolower, isalnum) can handle +# hi-bit chars. parseSpec() also calls isalnum here. +do_test fts1k-1.1 { + execsql "CREATE VIRTUAL TABLE t1 USING fts1(content, \x80)" +} {} + +# Additionally tests isspace() call in getToken(), and isalnum() call +# in tokenListToIdList(). +do_test fts1k-1.2 { + catch { + execsql "CREATE VIRTUAL TABLE t2 USING fts1(content, tokenize \x80)" + } + sqlite3_errmsg $DB +} "unknown tokenizer: \x80" + +# Additionally test final isalnum() in startsWith(). +do_test fts1k-1.3 { + execsql "CREATE VIRTUAL TABLE t3 USING fts1(content, tokenize\x80)" +} {} + +# The snippet-generation code has calls to isspace() which are sort of +# hard to get to. It finds convenient breakpoints by starting ~40 +# chars before and after the matched term, and scanning ~10 chars +# around that position for isspace() characters. The long word with +# embedded hi-bit chars causes one of these isspace() calls to be +# exercised. The version with a couple extra spaces should cause the +# other isspace() call to be exercised. [Both cases have been tested +# in the debugger, but I'm hoping to continue to catch it if simple +# constant changes change things slightly. +# +# The trailing and leading hi-bit chars help with code which tests for +# isspace() to coalesce multiple spaces. + +set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80" +set phrase1 "$word $word $word target $word $word $word" +set phrase2 "$word $word $word target $word $word $word" + +db eval {CREATE VIRTUAL TABLE t4 USING fts1(content)} +db eval "INSERT INTO t4 (content) VALUES ('$phrase1')" +db eval "INSERT INTO t4 (content) VALUES ('$phrase2')" + +do_test fts1k-1.4 { + execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'} +} {1 111 2 117} + +finish_test diff --git a/test/fts2l.test b/test/fts2l.test new file mode 100644 index 0000000000..739eb5073c --- /dev/null +++ b/test/fts2l.test @@ -0,0 +1,69 @@ +# 2007 March 28 +# +# The author disclaims copyright to this source code. +# +#************************************************************************* +# This file implements regression tests for SQLite library. The focus +# of this script is testing isspace/isalnum/tolower problems with the +# FTS2 module. Unfortunately, this code isn't a really principled set +# of tests, because it's impossible to know where new uses of these +# functions might appear. +# +# $Id: fts2l.test,v 1.1 2007/03/29 16:30:41 shess Exp $ +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl + +# If SQLITE_ENABLE_FTS2 is defined, omit this file. +ifcapable !fts2 { + finish_test + return +} + +# Tests that startsWith() (calls isspace, tolower, isalnum) can handle +# hi-bit chars. parseSpec() also calls isalnum here. +do_test fts2l-1.1 { + execsql "CREATE VIRTUAL TABLE t1 USING fts2(content, \x80)" +} {} + +# Additionally tests isspace() call in getToken(), and isalnum() call +# in tokenListToIdList(). +do_test fts2l-1.2 { + catch { + execsql "CREATE VIRTUAL TABLE t2 USING fts2(content, tokenize \x80)" + } + sqlite3_errmsg $DB +} "unknown tokenizer: \x80" + +# Additionally test final isalnum() in startsWith(). +do_test fts2l-1.3 { + execsql "CREATE VIRTUAL TABLE t3 USING fts2(content, tokenize\x80)" +} {} + +# The snippet-generation code has calls to isspace() which are sort of +# hard to get to. It finds convenient breakpoints by starting ~40 +# chars before and after the matched term, and scanning ~10 chars +# around that position for isspace() characters. The long word with +# embedded hi-bit chars causes one of these isspace() calls to be +# exercised. The version with a couple extra spaces should cause the +# other isspace() call to be exercised. [Both cases have been tested +# in the debugger, but I'm hoping to continue to catch it if simple +# constant changes change things slightly. +# +# The trailing and leading hi-bit chars help with code which tests for +# isspace() to coalesce multiple spaces. + +set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80" +set phrase1 "$word $word $word target $word $word $word" +set phrase2 "$word $word $word target $word $word $word" + +db eval {CREATE VIRTUAL TABLE t4 USING fts2(content)} +db eval "INSERT INTO t4 (content) VALUES ('$phrase1')" +db eval "INSERT INTO t4 (content) VALUES ('$phrase2')" + +do_test fts2l-1.4 { + execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'} +} {1 111 2 117} + +finish_test