1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-08-05 15:55:57 +03:00

Optimize the unicode61 tokenizer so that it handles ascii text faster. Make it the default tokenizer. Change the name of the simple tokenizer to "ascii".

FossilOrigin-Name: f22dbccad9499624880ddd48df1b07fb42b1ad66
This commit is contained in:
dan
2015-01-12 17:58:04 +00:00
parent 27277c4e3c
commit 73f7d6ed75
7 changed files with 190 additions and 109 deletions

View File

@@ -72,6 +72,7 @@ struct Fts5Global {
i64 iNextId; /* Used to allocate unique cursor ids */ i64 iNextId; /* Used to allocate unique cursor ids */
Fts5Auxiliary *pAux; /* First in list of all aux. functions */ Fts5Auxiliary *pAux; /* First in list of all aux. functions */
Fts5TokenizerModule *pTok; /* First in list of all tokenizer modules */ Fts5TokenizerModule *pTok; /* First in list of all tokenizer modules */
Fts5TokenizerModule *pDfltTok; /* Default tokenizer module */
Fts5Cursor *pCsr; /* First in list of all open cursors */ Fts5Cursor *pCsr; /* First in list of all open cursors */
}; };
@@ -771,7 +772,7 @@ static int fts5FindRankFunction(Fts5Cursor *pCsr){
Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab); Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
Fts5Config *pConfig = pTab->pConfig; Fts5Config *pConfig = pTab->pConfig;
int rc = SQLITE_OK; int rc = SQLITE_OK;
Fts5Auxiliary *pAux; Fts5Auxiliary *pAux = 0;
const char *zRank = pCsr->zRank; const char *zRank = pCsr->zRank;
const char *zRankArgs = pCsr->zRankArgs; const char *zRankArgs = pCsr->zRankArgs;
@@ -1028,7 +1029,6 @@ static int fts5SeekCursor(Fts5Cursor *pCsr){
} }
static void fts5SetVtabError(Fts5Table *p, const char *zFormat, ...){ static void fts5SetVtabError(Fts5Table *p, const char *zFormat, ...){
int rc;
va_list ap; /* ... printf arguments */ va_list ap; /* ... printf arguments */
va_start(ap, zFormat); va_start(ap, zFormat);
assert( p->base.zErrMsg==0 ); assert( p->base.zErrMsg==0 );
@@ -1796,6 +1796,9 @@ static int fts5CreateTokenizer(
pNew->xDestroy = xDestroy; pNew->xDestroy = xDestroy;
pNew->pNext = pGlobal->pTok; pNew->pNext = pGlobal->pTok;
pGlobal->pTok = pNew; pGlobal->pTok = pNew;
if( pNew->pNext==0 ){
pGlobal->pDfltTok = pNew;
}
}else{ }else{
rc = SQLITE_NOMEM; rc = SQLITE_NOMEM;
} }
@@ -1817,9 +1820,13 @@ static int fts5FindTokenizer(
int rc = SQLITE_OK; int rc = SQLITE_OK;
Fts5TokenizerModule *pTok; Fts5TokenizerModule *pTok;
if( zName==0 ){
pTok = pGlobal->pDfltTok;
}else{
for(pTok=pGlobal->pTok; pTok; pTok=pTok->pNext){ for(pTok=pGlobal->pTok; pTok; pTok=pTok->pNext){
if( sqlite3_stricmp(zName, pTok->zName)==0 ) break; if( sqlite3_stricmp(zName, pTok->zName)==0 ) break;
} }
}
if( pTok ){ if( pTok ){
*pTokenizer = pTok->x; *pTokenizer = pTok->x;
@@ -1841,8 +1848,9 @@ int sqlite3Fts5GetTokenizer(
){ ){
Fts5TokenizerModule *pMod = 0; Fts5TokenizerModule *pMod = 0;
int rc = SQLITE_OK; int rc = SQLITE_OK;
if( nArg==0 ){ if( nArg==0 ){
pMod = pGlobal->pTok; pMod = pGlobal->pDfltTok;
}else{ }else{
for(pMod=pGlobal->pTok; pMod; pMod=pMod->pNext){ for(pMod=pGlobal->pTok; pMod; pMod=pMod->pNext){
if( sqlite3_stricmp(azArg[0], pMod->zName)==0 ) break; if( sqlite3_stricmp(azArg[0], pMod->zName)==0 ) break;

View File

@@ -16,14 +16,14 @@
#include <assert.h> #include <assert.h>
/************************************************************************** /**************************************************************************
** Start of simple tokenizer implementation. ** Start of ascii tokenizer implementation.
*/ */
/* /*
** For tokenizers with no "unicode" modifier, the set of token characters ** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters. ** is the same as the set of ASCII range alphanumeric characters.
*/ */
static unsigned char aSimpleTokenChar[128] = { static unsigned char aAsciiTokenChar[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2F */
@@ -34,13 +34,13 @@ static unsigned char aSimpleTokenChar[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70..0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70..0x7F */
}; };
typedef struct SimpleTokenizer SimpleTokenizer; typedef struct AsciiTokenizer AsciiTokenizer;
struct SimpleTokenizer { struct AsciiTokenizer {
unsigned char aTokenChar[128]; unsigned char aTokenChar[128];
}; };
static void fts5SimpleAddExceptions( static void fts5AsciiAddExceptions(
SimpleTokenizer *p, AsciiTokenizer *p,
const char *zArg, const char *zArg,
int bTokenChars int bTokenChars
){ ){
@@ -53,32 +53,32 @@ static void fts5SimpleAddExceptions(
} }
/* /*
** Create a "simple" tokenizer. ** Create an "ascii" tokenizer.
*/ */
static int fts5SimpleCreate( static int fts5AsciiCreate(
void *pCtx, void *pCtx,
const char **azArg, int nArg, const char **azArg, int nArg,
Fts5Tokenizer **ppOut Fts5Tokenizer **ppOut
){ ){
int rc = SQLITE_OK; int rc = SQLITE_OK;
SimpleTokenizer *p = 0; AsciiTokenizer *p = 0;
if( nArg%2 ){ if( nArg%2 ){
rc = SQLITE_ERROR; rc = SQLITE_ERROR;
}else{ }else{
p = sqlite3_malloc(sizeof(SimpleTokenizer)); p = sqlite3_malloc(sizeof(AsciiTokenizer));
if( p==0 ){ if( p==0 ){
rc = SQLITE_NOMEM; rc = SQLITE_NOMEM;
}else{ }else{
int i; int i;
memset(p, 0, sizeof(SimpleTokenizer)); memset(p, 0, sizeof(AsciiTokenizer));
memcpy(p->aTokenChar, aSimpleTokenChar, sizeof(aSimpleTokenChar)); memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
const char *zArg = azArg[i+1]; const char *zArg = azArg[i+1];
if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
fts5SimpleAddExceptions(p, zArg, 1); fts5AsciiAddExceptions(p, zArg, 1);
}else }else
if( 0==sqlite3_stricmp(azArg[i], "separators") ){ if( 0==sqlite3_stricmp(azArg[i], "separators") ){
fts5SimpleAddExceptions(p, zArg, 0); fts5AsciiAddExceptions(p, zArg, 0);
}else{ }else{
rc = SQLITE_ERROR; rc = SQLITE_ERROR;
} }
@@ -91,14 +91,14 @@ static int fts5SimpleCreate(
} }
/* /*
** Delete a "simple" tokenizer. ** Delete an "ascii" tokenizer.
*/ */
static void fts5SimpleDelete(Fts5Tokenizer *p){ static void fts5AsciiDelete(Fts5Tokenizer *p){
sqlite3_free(p); sqlite3_free(p);
} }
static void simpleFold(char *aOut, const char *aIn, int nByte){ static void asciiFold(char *aOut, const char *aIn, int nByte){
int i; int i;
for(i=0; i<nByte; i++){ for(i=0; i<nByte; i++){
char c = aIn[i]; char c = aIn[i];
@@ -108,15 +108,15 @@ static void simpleFold(char *aOut, const char *aIn, int nByte){
} }
/* /*
** Tokenize some text using the simple tokenizer. ** Tokenize some text using the ascii tokenizer.
*/ */
static int fts5SimpleTokenize( static int fts5AsciiTokenize(
Fts5Tokenizer *pTokenizer, Fts5Tokenizer *pTokenizer,
void *pCtx, void *pCtx,
const char *pText, int nText, const char *pText, int nText,
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd) int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){ ){
SimpleTokenizer *p = (SimpleTokenizer*)pTokenizer; AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
int rc = SQLITE_OK; int rc = SQLITE_OK;
int ie; int ie;
int is = 0; int is = 0;
@@ -130,14 +130,14 @@ static int fts5SimpleTokenize(
int nByte; int nByte;
/* Skip any leading divider characters. */ /* Skip any leading divider characters. */
while( is<nText && ((pText[is]&0x80) || a[(int)pText[is]]==0) ){ while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
is++; is++;
} }
if( is==nText ) break; if( is==nText ) break;
/* Count the token characters */ /* Count the token characters */
ie = is+1; ie = is+1;
while( ie<nText && ((pText[ie]&0x80)==0 && a[(int)pText[ie]] ) ){ while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
ie++; ie++;
} }
@@ -152,7 +152,7 @@ static int fts5SimpleTokenize(
} }
nFold = nByte*2; nFold = nByte*2;
} }
simpleFold(pFold, &pText[is], nByte); asciiFold(pFold, &pText[is], nByte);
/* Invoke the token callback */ /* Invoke the token callback */
rc = xToken(pCtx, pFold, nByte, is, ie); rc = xToken(pCtx, pFold, nByte, is, ie);
@@ -206,6 +206,7 @@ static const unsigned char sqlite3Utf8Trans1[] = {
|| (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
} }
#define WRITE_UTF8(zOut, c) { \ #define WRITE_UTF8(zOut, c) { \
if( c<0x00080 ){ \ if( c<0x00080 ){ \
*zOut++ = (unsigned char)(c&0xFF); \ *zOut++ = (unsigned char)(c&0xFF); \
@@ -230,6 +231,9 @@ static const unsigned char sqlite3Utf8Trans1[] = {
typedef struct Unicode61Tokenizer Unicode61Tokenizer; typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer { struct Unicode61Tokenizer {
unsigned char aTokenChar[128]; /* ASCII range token characters */
char *aFold; /* Buffer to fold text into */
int nFold; /* Size of aFold[] in bytes */
int bRemoveDiacritic; /* True if remove_diacritics=1 is set */ int bRemoveDiacritic; /* True if remove_diacritics=1 is set */
int nException; int nException;
int *aiException; int *aiException;
@@ -254,6 +258,9 @@ static int fts5UnicodeAddExceptions(
int iCode; int iCode;
int bToken; int bToken;
READ_UTF8(zCsr, zTerm, iCode); READ_UTF8(zCsr, zTerm, iCode);
if( iCode<128 ){
p->aTokenChar[iCode] = bTokenChars;
}else{
bToken = sqlite3Fts5UnicodeIsalnum(iCode); bToken = sqlite3Fts5UnicodeIsalnum(iCode);
assert( (bToken==0 || bToken==1) ); assert( (bToken==0 || bToken==1) );
assert( (bTokenChars==0 || bTokenChars==1) ); assert( (bTokenChars==0 || bTokenChars==1) );
@@ -267,6 +274,7 @@ static int fts5UnicodeAddExceptions(
nNew++; nNew++;
} }
} }
}
p->aiException = aNew; p->aiException = aNew;
p->nException = nNew; p->nException = nNew;
}else{ }else{
@@ -301,6 +309,19 @@ static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
return 0; return 0;
} }
/*
** Release a "unicode61" tokenizer object: its aiException array, its
** aFold buffer and finally the object itself. Passing a NULL pointer is
** a harmless no-op.
*/
static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
  Unicode61Tokenizer *pUni = (Unicode61Tokenizer*)pTok;
  if( pUni==0 ) return;
  sqlite3_free(pUni->aFold);
  sqlite3_free(pUni->aiException);
  sqlite3_free(pUni);
}
/* /*
** Create a "unicode61" tokenizer. ** Create a "unicode61" tokenizer.
*/ */
@@ -319,7 +340,13 @@ static int fts5UnicodeCreate(
if( p ){ if( p ){
int i; int i;
memset(p, 0, sizeof(Unicode61Tokenizer)); memset(p, 0, sizeof(Unicode61Tokenizer));
memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
p->bRemoveDiacritic = 1; p->bRemoveDiacritic = 1;
p->nFold = 64;
p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
if( p->aFold==0 ){
rc = SQLITE_NOMEM;
}
for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
const char *zArg = azArg[i+1]; const char *zArg = azArg[i+1];
if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
@@ -340,21 +367,15 @@ static int fts5UnicodeCreate(
}else{ }else{
rc = SQLITE_NOMEM; rc = SQLITE_NOMEM;
} }
if( rc!=SQLITE_OK ){
fts5UnicodeDelete((Fts5Tokenizer*)p);
p = 0;
}
*ppOut = (Fts5Tokenizer*)p; *ppOut = (Fts5Tokenizer*)p;
} }
return rc; return rc;
} }
/*
** Delete a "unicode61" tokenizer.
**
** Frees the aiException array and then the tokenizer object itself.
** pTok is dereferenced without a NULL check, so the caller must pass a
** valid tokenizer handle.
*/
static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
sqlite3_free(p->aiException);
sqlite3_free(p);
return;
}
/* /*
** Return true if, for the purposes of tokenizing with the tokenizer ** Return true if, for the purposes of tokenizing with the tokenizer
** passed as the first argument, codepoint iCode is considered a token ** passed as the first argument, codepoint iCode is considered a token
@@ -365,9 +386,6 @@ static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode); return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode);
} }
/*
** Tokenize some text using a unicode61 tokenizer.
*/
static int fts5UnicodeTokenize( static int fts5UnicodeTokenize(
Fts5Tokenizer *pTokenizer, Fts5Tokenizer *pTokenizer,
void *pCtx, void *pCtx,
@@ -375,59 +393,94 @@ static int fts5UnicodeTokenize(
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd) int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){ ){
Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer; Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
const unsigned char *zInput = (const unsigned char*)pText;
const unsigned char *zTerm = &zInput[nText];
const unsigned char *z = zInput;
int rc = SQLITE_OK; int rc = SQLITE_OK;
int nBuf = 0; unsigned char *a = p->aTokenChar;
unsigned char *zBuf = 0;
unsigned char *zOut = 0;
while( rc==SQLITE_OK && z<zTerm ){ unsigned char *zTerm = (unsigned char*)&pText[nText];
int iCode; unsigned char *zCsr = (unsigned char *)pText;
int bAlnum;
const unsigned char *zStart;
const unsigned char *zCode;
if( zOut==zBuf ) zStart = z; /* Output buffer */
zCode = z; char *aFold = p->aFold;
READ_UTF8(z, zTerm, iCode); int nFold = p->nFold;
bAlnum = fts5UnicodeIsAlnum(p, iCode);
if( bAlnum==0 && zOut>zBuf ){ /* Each iteration of this loop gobbles up a contiguous run of separators,
bAlnum = sqlite3Fts5UnicodeIsdiacritic(iCode); ** then the next token. */
while( rc==SQLITE_OK ){
int iCode; /* non-ASCII codepoint read from input */
char *zOut = aFold;
int is;
int ie;
/* Skip any separator characters. */
while( 1 ){
if( zCsr>=zTerm ) goto tokenize_done;
if( *zCsr & 0x80 ) {
/* A character outside of the ascii range. Skip past it if it is
** a separator character. Or break out of the loop if it is not. */
is = zCsr - (unsigned char*)pText;
READ_UTF8(zCsr, zTerm, iCode);
if( fts5UnicodeIsAlnum(p, iCode) ){
goto non_ascii_tokenchar;
} }
if( bAlnum ){
int iOut;
/* Grow the output buffer if required */
while( (zOut-zBuf)+4>=nBuf ){
unsigned char *zNew;
nBuf = (nBuf ? nBuf*2 : 128);
zNew = sqlite3_realloc(zBuf, nBuf);
if( zNew==0 ){
rc = SQLITE_NOMEM;
goto tokenize_finished;
}else{ }else{
zOut = &zNew[zOut-zBuf]; if( a[*zCsr] ){
zBuf = zNew; is = zCsr - (unsigned char*)pText;
goto ascii_tokenchar;
}
zCsr++;
} }
} }
/* Write the new character to it */ /* Run through the tokenchars. Fold them into the output buffer along
iOut = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic); ** the way. */
if( iOut ) WRITE_UTF8(zOut, iOut); while( zCsr<zTerm ){
/* Grow the output buffer so that there is sufficient space to fit the
** largest possible utf-8 character. */
if( (zOut-aFold)+6>nFold ){
aFold = sqlite3_malloc(nFold*2);
if( aFold==0 ){
rc = SQLITE_NOMEM;
goto tokenize_done;
}
memcpy(aFold, p->aFold, nFold);
sqlite3_free(p->aFold);
p->aFold = aFold;
p->nFold = nFold = nFold*2;
} }
if( zOut>zBuf && (bAlnum==0 || z>=zTerm) ){ if( *zCsr & 0x80 ){
int ie = (bAlnum ? z : zCode) - zInput; /* An non-ascii-range character. Fold it into the output buffer if
rc = xToken(pCtx, (const char*)zBuf, zOut-zBuf, zStart-zInput, ie); ** it is a token character, or break out of the loop if it is not. */
zOut = zBuf; READ_UTF8(zCsr, zTerm, iCode);
if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
non_ascii_tokenchar:
iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
if( iCode ) WRITE_UTF8(zOut, iCode);
}else{
break;
} }
}else if( a[*zCsr]==0 ){
/* An ascii-range separator character. End of token. */
break;
}else{
ascii_tokenchar:
if( *zCsr>='A' && *zCsr<='Z' ){
*zOut++ = *zCsr + 32;
}else{
*zOut++ = *zCsr;
}
zCsr++;
}
ie = zCsr - (unsigned char*)pText;
} }
tokenize_finished: /* Invoke the token callback */
sqlite3_free(zBuf); rc = xToken(pCtx, aFold, zOut-aFold, is, ie);
}
tokenize_done:
if( rc==SQLITE_DONE ) rc = SQLITE_OK;
return rc; return rc;
} }
@@ -475,7 +528,7 @@ static int fts5PorterCreate(
pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer)); pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
if( pRet ){ if( pRet ){
memset(pRet, 0, sizeof(PorterTokenizer)); memset(pRet, 0, sizeof(PorterTokenizer));
rc = pApi->xFindTokenizer(pApi, "simple", &pUserdata, &pRet->tokenizer); rc = pApi->xFindTokenizer(pApi, "ascii", &pUserdata, &pRet->tokenizer);
}else{ }else{
rc = SQLITE_NOMEM; rc = SQLITE_NOMEM;
} }
@@ -789,9 +842,9 @@ int sqlite3Fts5TokenizerInit(fts5_api *pApi){
const char *zName; const char *zName;
fts5_tokenizer x; fts5_tokenizer x;
} aBuiltin[] = { } aBuiltin[] = {
{ "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
{ "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}}, { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
{ "simple", {fts5SimpleCreate, fts5SimpleDelete, fts5SimpleTokenize }} { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
{ "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
}; };
int rc = SQLITE_OK; /* Return code */ int rc = SQLITE_OK; /* Return code */

View File

@@ -24,7 +24,7 @@ proc do_near_test {tn doc near res} {
} }
execsql { execsql {
CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'simple tokenchars .') CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'ascii tokenchars .')
} }
do_near_test 1.1 ". . a . . . b . ." { NEAR(a b, 5) } 1 do_near_test 1.1 ". . a . . . b . ." { NEAR(a b, 5) } 1

View File

@@ -33,7 +33,7 @@ do_execsql_test 1.3 {
DROP TABLE ft1; DROP TABLE ft1;
} }
do_execsql_test 1.4 { do_execsql_test 1.4 {
CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter simple'); CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
DROP TABLE ft1; DROP TABLE ft1;
} }
@@ -75,7 +75,7 @@ do_catchsql_test 4.2 {
#------------------------------------------------------------------------- #-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit. # Test the "separators" and "tokenchars" options a bit.
# #
foreach {tn tokenizer} {1 simple 2 unicode61} { foreach {tn tokenizer} {1 ascii 2 unicode61} {
reset_db reset_db
set T "$tokenizer tokenchars ',.:' separators 'xyz'" set T "$tokenizer tokenchars ',.:' separators 'xyz'"
execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")" execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"

View File

@@ -25,12 +25,32 @@ proc tokenize_test {tn tokenizer input output} {
}] [list {*}$output]] }] [list {*}$output]]
} }
foreach {tn t} {1 simple 2 unicode61} { foreach {tn t} {1 ascii 2 unicode61} {
tokenize_test 1.$tn.0 $t {A B C D} {a b c d} tokenize_test 1.$tn.0 $t {A B C D} {a b c d}
tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely} tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely}
tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely} tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
tokenize_test 1.$tn.3 $t {} {} tokenize_test 1.$tn.3 $t {} {}
} }
#-------------------------------------------------------------------------
# Check that "unicode61" really is the default tokenizer.
#
# Table t1 names no tokenizer, t2 requests unicode61 and t3 requests ascii.
# Each table receives one row holding the upper-case accented characters
# U+00C0 U+00C8 U+00CC, and is then queried with their lower-case forms
# U+00E0 U+00E8 U+00EC. Only t1 and t2 are expected to match, which shows
# that the table created without a tokenize= option behaves like the
# unicode61 table and not like the ascii one.
#
do_execsql_test 2.0 "
CREATE VIRTUAL TABLE t1 USING fts5(x);
CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61);
CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii);
INSERT INTO t1 VALUES('\xC0\xC8\xCC');
INSERT INTO t2 VALUES('\xC0\xC8\xCC');
INSERT INTO t3 VALUES('\xC0\xC8\xCC');
"

do_execsql_test 2.1 "
SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC';
" {t1 t2}
finish_test finish_test

View File

@@ -1,5 +1,5 @@
C Fix\ssome\sdocumentation\sissues\sin\sfts5. C Optimize\sthe\sunicode61\stokenizer\sso\sthat\sit\shandles\sascii\stext\sfaster.\sMake\sit\sthe\sdefault\stokenizer.\sChange\sthe\sname\sof\sthe\ssimple\stokenizer\sto\s"ascii".
D 2015-01-10T20:34:27.199 D 2015-01-12T17:58:04.627
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in 7cd23e4fc91004a6bd081623e1bc6932e44828c0 F Makefile.in 7cd23e4fc91004a6bd081623e1bc6932e44828c0
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -104,7 +104,7 @@ F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7 F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
F ext/fts3/unicode/mkunicode.tcl 4199cb887040ee3c3cd59a5171ddb0566904586e F ext/fts3/unicode/mkunicode.tcl 4199cb887040ee3c3cd59a5171ddb0566904586e
F ext/fts5/extract_api_docs.tcl 55a6d648d516f35d9a1e580ac00de27154e1904a F ext/fts5/extract_api_docs.tcl 55a6d648d516f35d9a1e580ac00de27154e1904a
F ext/fts5/fts5.c c90004f4a91ce4f4dfad2fc980ade0d9314ebb10 F ext/fts5/fts5.c 790880afffb249c79f9a36b38f9d774515f5cf7b
F ext/fts5/fts5.h f931954065693898d26c51f23f1d27200184a69a F ext/fts5/fts5.h f931954065693898d26c51f23f1d27200184a69a
F ext/fts5/fts5Int.h 0142ba4c3c70e1976578604c0e738670f7689726 F ext/fts5/fts5Int.h 0142ba4c3c70e1976578604c0e738670f7689726
F ext/fts5/fts5_aux.c 549aef152b0fd46020f5595d861b1fd60b3f9b4f F ext/fts5/fts5_aux.c 549aef152b0fd46020f5595d861b1fd60b3f9b4f
@@ -115,7 +115,7 @@ F ext/fts5/fts5_hash.c 63fa8379c5f2ac107d47c2b7d9ac04c95ef8a279
F ext/fts5/fts5_index.c ea36c1e42aaf8038b6139be95575eb7fe01f34e4 F ext/fts5/fts5_index.c ea36c1e42aaf8038b6139be95575eb7fe01f34e4
F ext/fts5/fts5_storage.c 8bc9e5b6654e1545e9513def277ef3f025921664 F ext/fts5/fts5_storage.c 8bc9e5b6654e1545e9513def277ef3f025921664
F ext/fts5/fts5_tcl.c 1293fac2bb26903fd3d5cdee59c5885ba7e620d5 F ext/fts5/fts5_tcl.c 1293fac2bb26903fd3d5cdee59c5885ba7e620d5
F ext/fts5/fts5_tokenize.c 4c30cf32c63e59bec5b38533e0a65987df262851 F ext/fts5/fts5_tokenize.c bdb6a1f599a94ec6e9c1cad037d1071e823dcb5d
F ext/fts5/fts5_unicode2.c 9c7dd640d1f014bf5c3ee029759adfbb4d7e95a9 F ext/fts5/fts5_unicode2.c 9c7dd640d1f014bf5c3ee029759adfbb4d7e95a9
F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9 F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9
F ext/fts5/test/fts5_common.tcl 08e939096a07eb77a7a986613e960f31d3cab2cc F ext/fts5/test/fts5_common.tcl 08e939096a07eb77a7a986613e960f31d3cab2cc
@@ -135,12 +135,12 @@ F ext/fts5/test/fts5auxdata.test c69b86092bf1a157172de5f9169731af3403179b
F ext/fts5/test/fts5content.test 4234e0b11e003fe1e80472aa637f70464396fdd0 F ext/fts5/test/fts5content.test 4234e0b11e003fe1e80472aa637f70464396fdd0
F ext/fts5/test/fts5ea.test 04695560a444fcc00c3c4f27783bdcfbf71f030c F ext/fts5/test/fts5ea.test 04695560a444fcc00c3c4f27783bdcfbf71f030c
F ext/fts5/test/fts5fault1.test f3f4c6ed15cc7a4dc8d517c0d1969d8e5a35a65c F ext/fts5/test/fts5fault1.test f3f4c6ed15cc7a4dc8d517c0d1969d8e5a35a65c
F ext/fts5/test/fts5near.test 70a568a1211a5b6d5a17282790d5f8cbbe086ce0 F ext/fts5/test/fts5near.test 3f9f64e16cac82725d03d4e04c661090f0b3b947
F ext/fts5/test/fts5optimize.test 0028c90a7817d3e576d1148fc8dff17d89054e54 F ext/fts5/test/fts5optimize.test 0028c90a7817d3e576d1148fc8dff17d89054e54
F ext/fts5/test/fts5porter.test 50322599823cb8080a99f0ec0c39f7d0c12bcb5e F ext/fts5/test/fts5porter.test 50322599823cb8080a99f0ec0c39f7d0c12bcb5e
F ext/fts5/test/fts5rebuild.test 2a5e98205393487b4a732c8290999af7c0b907b4 F ext/fts5/test/fts5rebuild.test 2a5e98205393487b4a732c8290999af7c0b907b4
F ext/fts5/test/fts5tokenizer.test f951bb9be29232bd057b0ac4d535b879d9cd9a89 F ext/fts5/test/fts5tokenizer.test b34ae592db66f6e89546d791ce1f905ba0b3395c
F ext/fts5/test/fts5unicode.test 9ae93296e59917c1210336388f6d3b98051b50c9 F ext/fts5/test/fts5unicode.test 79b3e34eb29ce4929628aa514a40cb467fdabe4d
F ext/fts5/test/fts5unicode2.test 64a5267fd6082fcb46439892ebd0cbaa5c38acee F ext/fts5/test/fts5unicode2.test 64a5267fd6082fcb46439892ebd0cbaa5c38acee
F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43 F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43
F ext/icu/icu.c d415ccf984defeb9df2c0e1afcfaa2f6dc05eacb F ext/icu/icu.c d415ccf984defeb9df2c0e1afcfaa2f6dc05eacb
@@ -1274,7 +1274,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32 F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
P e749be563d8e738af113bd301770e2f22763ab77 P 512e1bdb4093b59d1494dfc63391476eadd52aea
R 5c59d3558d2a230e6048c600760933d7 R 30a0c3c40d1701cf92ddf5b1410b6af9
U dan U dan
Z 6c17e3ae4cf92b8841424ff4d00c314d Z 9b7b348d489cfd6e15d4a8bf3e2c22e9

View File

@@ -1 +1 @@
512e1bdb4093b59d1494dfc63391476eadd52aea f22dbccad9499624880ddd48df1b07fb42b1ad66