From 73f7d6ed75ede36611aa3a69d70d802eecf900fa Mon Sep 17 00:00:00 2001 From: dan Date: Mon, 12 Jan 2015 17:58:04 +0000 Subject: [PATCH] Optimize the unicode61 tokenizer so that it handles ascii text faster. Make it the default tokenizer. Change the name of the simple tokenizer to "ascii". FossilOrigin-Name: f22dbccad9499624880ddd48df1b07fb42b1ad66 --- ext/fts5/fts5.c | 18 ++- ext/fts5/fts5_tokenize.c | 231 +++++++++++++++++++------------ ext/fts5/test/fts5near.test | 2 +- ext/fts5/test/fts5tokenizer.test | 4 +- ext/fts5/test/fts5unicode.test | 22 ++- manifest | 20 +-- manifest.uuid | 2 +- 7 files changed, 190 insertions(+), 109 deletions(-) diff --git a/ext/fts5/fts5.c b/ext/fts5/fts5.c index 1dd026bb4b..054851cba2 100644 --- a/ext/fts5/fts5.c +++ b/ext/fts5/fts5.c @@ -72,6 +72,7 @@ struct Fts5Global { i64 iNextId; /* Used to allocate unique cursor ids */ Fts5Auxiliary *pAux; /* First in list of all aux. functions */ Fts5TokenizerModule *pTok; /* First in list of all tokenizer modules */ + Fts5TokenizerModule *pDfltTok; /* Default tokenizer module */ Fts5Cursor *pCsr; /* First in list of all open cursors */ }; @@ -771,7 +772,7 @@ static int fts5FindRankFunction(Fts5Cursor *pCsr){ Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab); Fts5Config *pConfig = pTab->pConfig; int rc = SQLITE_OK; - Fts5Auxiliary *pAux; + Fts5Auxiliary *pAux = 0; const char *zRank = pCsr->zRank; const char *zRankArgs = pCsr->zRankArgs; @@ -1028,7 +1029,6 @@ static int fts5SeekCursor(Fts5Cursor *pCsr){ } static void fts5SetVtabError(Fts5Table *p, const char *zFormat, ...){ - int rc; va_list ap; /* ... printf arguments */ va_start(ap, zFormat); assert( p->base.zErrMsg==0 ); @@ -1796,6 +1796,9 @@ static int fts5CreateTokenizer( pNew->xDestroy = xDestroy; pNew->pNext = pGlobal->pTok; pGlobal->pTok = pNew; + if( pNew->pNext==0 ){ + pGlobal->pDfltTok = pNew; + } }else{ rc = SQLITE_NOMEM; } @@ -1817,8 +1820,12 @@ static int fts5FindTokenizer( int rc = SQLITE_OK; Fts5TokenizerModule *pTok; - for(pTok=pGlobal->pTok; pTok; pTok=pTok->pNext){ - if( sqlite3_stricmp(zName, pTok->zName)==0 ) break; + if( zName==0 ){ + pTok = pGlobal->pDfltTok; + }else{ + for(pTok=pGlobal->pTok; pTok; pTok=pTok->pNext){ + if( sqlite3_stricmp(zName, pTok->zName)==0 ) break; + } } if( pTok ){ @@ -1841,8 +1848,9 @@ int sqlite3Fts5GetTokenizer( ){ Fts5TokenizerModule *pMod = 0; int rc = SQLITE_OK; + if( nArg==0 ){ - pMod = pGlobal->pTok; + pMod = pGlobal->pDfltTok; }else{ for(pMod=pGlobal->pTok; pMod; pMod=pMod->pNext){ if( sqlite3_stricmp(azArg[0], pMod->zName)==0 ) break; diff --git a/ext/fts5/fts5_tokenize.c b/ext/fts5/fts5_tokenize.c index c3f3e5aaa8..feb3513a46 100644 --- a/ext/fts5/fts5_tokenize.c +++ b/ext/fts5/fts5_tokenize.c @@ -16,14 +16,14 @@ #include /************************************************************************** -** Start of simple tokenizer implementation. +** Start of ascii tokenizer implementation. */ /* ** For tokenizers with no "unicode" modifier, the set of token characters ** is the same as the set of ASCII range alphanumeric characters. */ -static unsigned char aSimpleTokenChar[128] = { +static unsigned char aAsciiTokenChar[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2F */ @@ -34,13 +34,13 @@ static unsigned char aSimpleTokenChar[128] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70..0x7F */ }; -typedef struct SimpleTokenizer SimpleTokenizer; -struct SimpleTokenizer { +typedef struct AsciiTokenizer AsciiTokenizer; +struct AsciiTokenizer { unsigned char aTokenChar[128]; }; -static void fts5SimpleAddExceptions( - SimpleTokenizer *p, +static void fts5AsciiAddExceptions( + AsciiTokenizer *p, const char *zArg, int bTokenChars ){ @@ -53,32 +53,32 @@ static void fts5SimpleAddExceptions( } /* -** Create a "simple" tokenizer. +** Create a "ascii" tokenizer. */ -static int fts5SimpleCreate( +static int fts5AsciiCreate( void *pCtx, const char **azArg, int nArg, Fts5Tokenizer **ppOut ){ int rc = SQLITE_OK; - SimpleTokenizer *p = 0; + AsciiTokenizer *p = 0; if( nArg%2 ){ rc = SQLITE_ERROR; }else{ - p = sqlite3_malloc(sizeof(SimpleTokenizer)); + p = sqlite3_malloc(sizeof(AsciiTokenizer)); if( p==0 ){ rc = SQLITE_NOMEM; }else{ int i; - memset(p, 0, sizeof(SimpleTokenizer)); - memcpy(p->aTokenChar, aSimpleTokenChar, sizeof(aSimpleTokenChar)); + memset(p, 0, sizeof(AsciiTokenizer)); + memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar)); for(i=0; rc==SQLITE_OK && iiCode ) break; + if( iCode<128 ){ + p->aTokenChar[iCode] = bTokenChars; + }else{ + bToken = sqlite3Fts5UnicodeIsalnum(iCode); + assert( (bToken==0 || bToken==1) ); + assert( (bTokenChars==0 || bTokenChars==1) ); + if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){ + int i; + for(i=0; iiCode ) break; + } + memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int)); + aNew[i] = iCode; + nNew++; } - memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int)); - aNew[i] = iCode; - nNew++; } } p->aiException = aNew; @@ -301,6 +309,19 @@ static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){ return 0; } +/* +** Delete a "unicode61" tokenizer. +*/ +static void fts5UnicodeDelete(Fts5Tokenizer *pTok){ + if( pTok ){ + Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok; + sqlite3_free(p->aiException); + sqlite3_free(p->aFold); + sqlite3_free(p); + } + return; +} + /* ** Create a "unicode61" tokenizer. */ @@ -319,7 +340,13 @@ static int fts5UnicodeCreate( if( p ){ int i; memset(p, 0, sizeof(Unicode61Tokenizer)); + memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar)); p->bRemoveDiacritic = 1; + p->nFold = 64; + p->aFold = sqlite3_malloc(p->nFold * sizeof(char)); + if( p->aFold==0 ){ + rc = SQLITE_NOMEM; + } for(i=0; rc==SQLITE_OK && iaiException); - sqlite3_free(p); - return; -} - /* ** Return true if, for the purposes of tokenizing with the tokenizer ** passed as the first argument, codepoint iCode is considered a token @@ -365,9 +386,6 @@ static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){ return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode); } -/* -** Tokenize some text using a unicode61 tokenizer. -*/ static int fts5UnicodeTokenize( Fts5Tokenizer *pTokenizer, void *pCtx, @@ -375,59 +393,94 @@ static int fts5UnicodeTokenize( int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd) ){ Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer; - const unsigned char *zInput = (const unsigned char*)pText; - const unsigned char *zTerm = &zInput[nText]; - const unsigned char *z = zInput; int rc = SQLITE_OK; - int nBuf = 0; - unsigned char *zBuf = 0; - unsigned char *zOut = 0; + unsigned char *a = p->aTokenChar; - while( rc==SQLITE_OK && zzBuf ){ - bAlnum = sqlite3Fts5UnicodeIsdiacritic(iCode); + /* Output buffer */ + char *aFold = p->aFold; + int nFold = p->nFold; + + /* Each iteration of this loop gobbles up a contiguous run of separators, + ** then the next token. */ + while( rc==SQLITE_OK ){ + int iCode; /* non-ASCII codepoint read from input */ + char *zOut = aFold; + int is; + int ie; + + /* Skip any separator characters. */ + while( 1 ){ + if( zCsr>=zTerm ) goto tokenize_done; + if( *zCsr & 0x80 ) { + /* A character outside of the ascii range. Skip past it if it is + ** a separator character. Or break out of the loop if it is not. */ + is = zCsr - (unsigned char*)pText; + READ_UTF8(zCsr, zTerm, iCode); + if( fts5UnicodeIsAlnum(p, iCode) ){ + goto non_ascii_tokenchar; + } + }else{ + if( a[*zCsr] ){ + is = zCsr - (unsigned char*)pText; + goto ascii_tokenchar; + } + zCsr++; + } } - if( bAlnum ){ - int iOut; + /* Run through the tokenchars. Fold them into the output buffer along + ** the way. */ + while( zCsr=nBuf ){ - unsigned char *zNew; - nBuf = (nBuf ? nBuf*2 : 128); - zNew = sqlite3_realloc(zBuf, nBuf); - if( zNew==0 ){ + /* Grow the output buffer so that there is sufficient space to fit the + ** largest possible utf-8 character. */ + if( (zOut-aFold)+6>nFold ){ + aFold = sqlite3_malloc(nFold*2); + if( aFold==0 ){ rc = SQLITE_NOMEM; - goto tokenize_finished; - }else{ - zOut = &zNew[zOut-zBuf]; - zBuf = zNew; + goto tokenize_done; } + memcpy(aFold, p->aFold, nFold); + sqlite3_free(p->aFold); + p->aFold = aFold; + p->nFold = nFold = nFold*2; } - /* Write the new character to it */ - iOut = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic); - if( iOut ) WRITE_UTF8(zOut, iOut); + if( *zCsr & 0x80 ){ + /* An non-ascii-range character. Fold it into the output buffer if + ** it is a token character, or break out of the loop if it is not. */ + READ_UTF8(zCsr, zTerm, iCode); + if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){ + non_ascii_tokenchar: + iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic); + if( iCode ) WRITE_UTF8(zOut, iCode); + }else{ + break; + } + }else if( a[*zCsr]==0 ){ + /* An ascii-range separator character. End of token. */ + break; + }else{ + ascii_tokenchar: + if( *zCsr>='A' && *zCsr<='Z' ){ + *zOut++ = *zCsr + 32; + }else{ + *zOut++ = *zCsr; + } + zCsr++; + } + ie = zCsr - (unsigned char*)pText; } - if( zOut>zBuf && (bAlnum==0 || z>=zTerm) ){ - int ie = (bAlnum ? z : zCode) - zInput; - rc = xToken(pCtx, (const char*)zBuf, zOut-zBuf, zStart-zInput, ie); - zOut = zBuf; - } + /* Invoke the token callback */ + rc = xToken(pCtx, aFold, zOut-aFold, is, ie); } - - tokenize_finished: - sqlite3_free(zBuf); + + tokenize_done: + if( rc==SQLITE_DONE ) rc = SQLITE_OK; return rc; } @@ -475,7 +528,7 @@ static int fts5PorterCreate( pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer)); if( pRet ){ memset(pRet, 0, sizeof(PorterTokenizer)); - rc = pApi->xFindTokenizer(pApi, "simple", &pUserdata, &pRet->tokenizer); + rc = pApi->xFindTokenizer(pApi, "ascii", &pUserdata, &pRet->tokenizer); }else{ rc = SQLITE_NOMEM; } @@ -789,9 +842,9 @@ int sqlite3Fts5TokenizerInit(fts5_api *pApi){ const char *zName; fts5_tokenizer x; } aBuiltin[] = { - { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }}, { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}}, - { "simple", {fts5SimpleCreate, fts5SimpleDelete, fts5SimpleTokenize }} + { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }}, + { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }}, }; int rc = SQLITE_OK; /* Return code */ diff --git a/ext/fts5/test/fts5near.test b/ext/fts5/test/fts5near.test index 7425a4f24e..f545447e6f 100644 --- a/ext/fts5/test/fts5near.test +++ b/ext/fts5/test/fts5near.test @@ -24,7 +24,7 @@ proc do_near_test {tn doc near res} { } execsql { - CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'simple tokenchars .') + CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'ascii tokenchars .') } do_near_test 1.1 ". . a . . . b . ." { NEAR(a b, 5) } 1 diff --git a/ext/fts5/test/fts5tokenizer.test b/ext/fts5/test/fts5tokenizer.test index e45f7fd89a..d8c4f20f0e 100644 --- a/ext/fts5/test/fts5tokenizer.test +++ b/ext/fts5/test/fts5tokenizer.test @@ -33,7 +33,7 @@ do_execsql_test 1.3 { DROP TABLE ft1; } do_execsql_test 1.4 { - CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter simple'); + CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii'); DROP TABLE ft1; } @@ -75,7 +75,7 @@ do_catchsql_test 4.2 { #------------------------------------------------------------------------- # Test the "separators" and "tokenchars" options a bit. # -foreach {tn tokenizer} {1 simple 2 unicode61} { +foreach {tn tokenizer} {1 ascii 2 unicode61} { reset_db set T "$tokenizer tokenchars ',.:' separators 'xyz'" execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")" diff --git a/ext/fts5/test/fts5unicode.test b/ext/fts5/test/fts5unicode.test index 737604c1f1..0018a49030 100644 --- a/ext/fts5/test/fts5unicode.test +++ b/ext/fts5/test/fts5unicode.test @@ -25,12 +25,32 @@ proc tokenize_test {tn tokenizer input output} { }] [list {*}$output]] } -foreach {tn t} {1 simple 2 unicode61} { +foreach {tn t} {1 ascii 2 unicode61} { tokenize_test 1.$tn.0 $t {A B C D} {a b c d} tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely} tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely} tokenize_test 1.$tn.3 $t {} {} } +#------------------------------------------------------------------------- +# Check that "unicode61" really is the default tokenizer. +# + +do_execsql_test 2.0 " + CREATE VIRTUAL TABLE t1 USING fts5(x); + CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61); + CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii); + INSERT INTO t1 VALUES('\xC0\xC8\xCC'); + INSERT INTO t2 VALUES('\xC0\xC8\xCC'); + INSERT INTO t3 VALUES('\xC0\xC8\xCC'); +" +breakpoint +do_execsql_test 2.1 " + SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC'; + SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC'; + SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC'; +" {t1 t2} + + finish_test diff --git a/manifest b/manifest index d065fd8d57..2c25d46d59 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\ssome\sdocumentation\sissues\sin\sfts5. -D 2015-01-10T20:34:27.199 +C Optimize\sthe\sunicode61\stokenizer\sso\sthat\sit\shandles\sascii\stext\sfaster.\sMake\sit\sthe\sdefault\stokenizer.\sChange\sthe\sname\sof\sthe\ssimple\stokenizer\sto\s"ascii". +D 2015-01-12T17:58:04.627 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 7cd23e4fc91004a6bd081623e1bc6932e44828c0 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -104,7 +104,7 @@ F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7 F ext/fts3/unicode/mkunicode.tcl 4199cb887040ee3c3cd59a5171ddb0566904586e F ext/fts5/extract_api_docs.tcl 55a6d648d516f35d9a1e580ac00de27154e1904a -F ext/fts5/fts5.c c90004f4a91ce4f4dfad2fc980ade0d9314ebb10 +F ext/fts5/fts5.c 790880afffb249c79f9a36b38f9d774515f5cf7b F ext/fts5/fts5.h f931954065693898d26c51f23f1d27200184a69a F ext/fts5/fts5Int.h 0142ba4c3c70e1976578604c0e738670f7689726 F ext/fts5/fts5_aux.c 549aef152b0fd46020f5595d861b1fd60b3f9b4f @@ -115,7 +115,7 @@ F ext/fts5/fts5_hash.c 63fa8379c5f2ac107d47c2b7d9ac04c95ef8a279 F ext/fts5/fts5_index.c ea36c1e42aaf8038b6139be95575eb7fe01f34e4 F ext/fts5/fts5_storage.c 8bc9e5b6654e1545e9513def277ef3f025921664 F ext/fts5/fts5_tcl.c 1293fac2bb26903fd3d5cdee59c5885ba7e620d5 -F ext/fts5/fts5_tokenize.c 4c30cf32c63e59bec5b38533e0a65987df262851 +F ext/fts5/fts5_tokenize.c bdb6a1f599a94ec6e9c1cad037d1071e823dcb5d F ext/fts5/fts5_unicode2.c 9c7dd640d1f014bf5c3ee029759adfbb4d7e95a9 F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9 F ext/fts5/test/fts5_common.tcl 08e939096a07eb77a7a986613e960f31d3cab2cc @@ -135,12 +135,12 @@ F ext/fts5/test/fts5auxdata.test c69b86092bf1a157172de5f9169731af3403179b F ext/fts5/test/fts5content.test 4234e0b11e003fe1e80472aa637f70464396fdd0 F ext/fts5/test/fts5ea.test 04695560a444fcc00c3c4f27783bdcfbf71f030c F ext/fts5/test/fts5fault1.test f3f4c6ed15cc7a4dc8d517c0d1969d8e5a35a65c -F ext/fts5/test/fts5near.test 70a568a1211a5b6d5a17282790d5f8cbbe086ce0 +F ext/fts5/test/fts5near.test 3f9f64e16cac82725d03d4e04c661090f0b3b947 F ext/fts5/test/fts5optimize.test 0028c90a7817d3e576d1148fc8dff17d89054e54 F ext/fts5/test/fts5porter.test 50322599823cb8080a99f0ec0c39f7d0c12bcb5e F ext/fts5/test/fts5rebuild.test 2a5e98205393487b4a732c8290999af7c0b907b4 -F ext/fts5/test/fts5tokenizer.test f951bb9be29232bd057b0ac4d535b879d9cd9a89 -F ext/fts5/test/fts5unicode.test 9ae93296e59917c1210336388f6d3b98051b50c9 +F ext/fts5/test/fts5tokenizer.test b34ae592db66f6e89546d791ce1f905ba0b3395c +F ext/fts5/test/fts5unicode.test 79b3e34eb29ce4929628aa514a40cb467fdabe4d F ext/fts5/test/fts5unicode2.test 64a5267fd6082fcb46439892ebd0cbaa5c38acee F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43 F ext/icu/icu.c d415ccf984defeb9df2c0e1afcfaa2f6dc05eacb @@ -1274,7 +1274,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32 F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P e749be563d8e738af113bd301770e2f22763ab77 -R 5c59d3558d2a230e6048c600760933d7 +P 512e1bdb4093b59d1494dfc63391476eadd52aea +R 30a0c3c40d1701cf92ddf5b1410b6af9 U dan -Z 6c17e3ae4cf92b8841424ff4d00c314d +Z 9b7b348d489cfd6e15d4a8bf3e2c22e9 diff --git a/manifest.uuid b/manifest.uuid index de97eaee58..4575836fe1 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -512e1bdb4093b59d1494dfc63391476eadd52aea \ No newline at end of file +f22dbccad9499624880ddd48df1b07fb42b1ad66 \ No newline at end of file