1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-07-30 19:03:16 +03:00

Fix the fts5 trigram tokenizer so that it handles non-nul-terminated strings.

FossilOrigin-Name: 84f4e37178a65e3128ac0240d37ac40df08b4050ab070d10707e35d11dcbeb10
This commit is contained in:
dan
2024-11-11 19:49:26 +00:00
parent be46f935dc
commit 0cd2ffffb7
5 changed files with 45 additions and 18 deletions

View File

@ -730,8 +730,9 @@ static int SQLITE_TCLAPI f5tTokenize(
int objc,
Tcl_Obj *CONST objv[]
){
char *zText;
Tcl_Size nText;
char *pCopy = 0;
char *zText = 0;
Tcl_Size nText = 0;
sqlite3 *db = 0;
fts5_api *pApi = 0;
Fts5Tokenizer *pTok = 0;
@ -778,22 +779,33 @@ static int SQLITE_TCLAPI f5tTokenize(
return TCL_ERROR;
}
if( nText>0 ){
pCopy = sqlite3_malloc(nText);
if( pCopy==0 ){
tokenizer.xDelete(pTok);
Tcl_AppendResult(interp, "error in sqlite3_malloc()", (char*)0);
return TCL_ERROR;
}else{
memcpy(pCopy, zText, nText);
}
}
pRet = Tcl_NewObj();
Tcl_IncrRefCount(pRet);
ctx.bSubst = (objc==5);
ctx.pRet = pRet;
ctx.zInput = zText;
ctx.zInput = pCopy;
rc = tokenizer.xTokenize(
pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, zText,(int)nText, xTokenizeCb2
pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, pCopy,(int)nText, xTokenizeCb2
);
tokenizer.xDelete(pTok);
sqlite3_free(pCopy);
if( rc!=SQLITE_OK ){
Tcl_AppendResult(interp, "error in tokenizer.xTokenize()", (char*)0);
Tcl_DecrRefCount(pRet);
return TCL_ERROR;
}
Tcl_Free((void*)azArg);
Tcl_SetObjResult(interp, pRet);
Tcl_DecrRefCount(pRet);

View File

@ -1354,7 +1354,7 @@ static int fts5TriTokenize(
int ii;
const unsigned char *zIn = (const unsigned char*)pText;
const unsigned char *zEof = &zIn[nText];
u32 iCode;
u32 iCode = 0;
int aStart[3]; /* Input offset of each character in aBuf[] */
UNUSED_PARAM(unusedFlags);
@ -1363,8 +1363,8 @@ static int fts5TriTokenize(
for(ii=0; ii<3; ii++){
do {
aStart[ii] = zIn - (const unsigned char*)pText;
if( zIn>=zEof ) return SQLITE_OK;
READ_UTF8(zIn, zEof, iCode);
if( iCode==0 ) return SQLITE_OK;
if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
}while( iCode==0 );
WRITE_UTF8(zOut, iCode);
@ -1385,8 +1385,11 @@ static int fts5TriTokenize(
/* Read characters from the input up until the first non-diacritic */
do {
iNext = zIn - (const unsigned char*)pText;
if( zIn>=zEof ){
iCode = 0;
break;
}
READ_UTF8(zIn, zEof, iCode);
if( iCode==0 ) break;
if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
}while( iCode==0 );

View File

@ -350,5 +350,17 @@ do_execsql_test 11.1 {
INSERT INTO t4 VALUES( str('') );
}
# Tests 12.*: run the trigram tokenizer directly through the
# sqlite3_fts5_tokenize test command, on inputs of four, one and zero
# characters.
# NOTE(review): presumably these are the regression tests for handling
# input that is not nul-terminated - confirm against the implementation
# of the test command.

# 12.0: a four-character input yields two overlapping trigrams. Each
# token in the expected list is followed by two integers, which look like
# its start and end offsets in the input - TODO confirm against the
# tokenizer callback.
do_test 12.0 {
sqlite3_fts5_tokenize db trigram "abcd"
} {abc 0 3 bcd 1 4}
# 12.1: an input shorter than three characters produces no tokens.
do_test 12.1 {
sqlite3_fts5_tokenize db trigram "a"
} {}
# 12.2: an empty input produces no tokens.
do_test 12.2 {
sqlite3_fts5_tokenize db trigram ""
} {}
finish_test