1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-07-29 08:01:23 +03:00

Ensure that tokendata=1 queries avoid loading large doclists for queries like "common AND uncommon", just as tokendata=0 queries do.

FossilOrigin-Name: 7bda09ab404a110d57449e149a3281fca8dc4cacf7bd9832ea2a1356ad20fe8e
This commit is contained in:
dan
2023-12-02 17:32:16 +00:00
parent f4c2962558
commit c44041e03b
9 changed files with 217 additions and 34 deletions

View File

@ -397,6 +397,7 @@ struct Fts5IndexIter {
#define FTS5INDEX_QUERY_NOOUTPUT 0x0020
#define FTS5INDEX_QUERY_SKIPHASH 0x0040
#define FTS5INDEX_QUERY_NOTOKENDATA 0x0080
#define FTS5INDEX_QUERY_SCANONETERM 0x0100
/*
** Create/destroy an Fts5Index object.
@ -786,6 +787,7 @@ int sqlite3Fts5ExprPhraseCollist(Fts5Expr *, int, const u8 **, int *);
int sqlite3Fts5ExprQueryToken(Fts5Expr*, int, int, const char**, int*);
int sqlite3Fts5ExprInstToken(Fts5Expr*, int, int, int, int, const char**, int*);
void sqlite3Fts5ExprClearTokens(Fts5Expr*);
/*******************************************
** The fts5_expr.c API above this point is used by the other hand-written

View File

@ -3050,17 +3050,6 @@ int sqlite3Fts5ExprPopulatePoslists(
sCtx.aPopulator = aPopulator;
sCtx.iOff = (((i64)iCol) << 32) - 1;
/* If this is a tokendata=1 table, clear out the hash tables of
** full-terms. */
if( pConfig->bTokendata ){
for(i=0; i<pExpr->nPhrase; i++){
Fts5ExprTerm *pT;
for(pT=&pExpr->apExprPhrase[i]->aTerm[0]; pT; pT=pT->pSynonym){
sqlite3Fts5IndexIterClearTokendata(pT->pIter);
}
}
}
for(i=0; i<pExpr->nPhrase; i++){
Fts5ExprNode *pNode = pExpr->apExprPhrase[i]->pNode;
Fts5Colset *pColset = pNode->pNear->pColset;
@ -3225,3 +3214,17 @@ int sqlite3Fts5ExprInstToken(
return sqlite3Fts5IterToken(pIter, iRowid, iCol, iOff+iToken, ppOut, pnOut);
}
/*
** Clear the token mappings of every Fts5IndexIter object managed by the
** expression passed as the only argument. For each phrase in the
** expression, the first term and each synonym linked to it through the
** pSynonym chain is visited, and its iterator is handed to
** sqlite3Fts5IndexIterClearTokendata().
*/
void sqlite3Fts5ExprClearTokens(Fts5Expr *pExpr){
  int iPhrase = 0;                /* Index of current phrase */
  while( iPhrase<pExpr->nPhrase ){
    /* Start at the first term of the phrase, then walk its synonyms. */
    Fts5ExprTerm *pTerm = &pExpr->apExprPhrase[iPhrase]->aTerm[0];
    while( pTerm ){
      sqlite3Fts5IndexIterClearTokendata(pTerm->pIter);
      pTerm = pTerm->pSynonym;
    }
    iPhrase++;
  }
}

View File

@ -366,6 +366,7 @@ struct Fts5Index {
sqlite3_stmt *pIdxWriter; /* "INSERT ... %_idx VALUES(?,?,?,?)" */
sqlite3_stmt *pIdxDeleter; /* "DELETE FROM %_idx WHERE segid=?" */
sqlite3_stmt *pIdxSelect;
sqlite3_stmt *pIdxNextSelect;
int nRead; /* Total number of blocks read */
sqlite3_stmt *pDeleteFromIdx;
@ -2660,7 +2661,7 @@ static void fts5SegIterSeekInit(
fts5LeafSeek(p, bGe, pIter, pTerm, nTerm);
}
if( p->rc==SQLITE_OK && bGe==0 ){
if( p->rc==SQLITE_OK && (bGe==0 || (flags & FTS5INDEX_QUERY_SCANONETERM)) ){
pIter->flags |= FTS5_SEGITER_ONETERM;
if( pIter->pLeaf ){
if( flags & FTS5INDEX_QUERY_DESC ){
@ -2693,6 +2694,79 @@ static void fts5SegIterSeekInit(
);
}
/*
** Return the SQL statement used by fts5SegIterNextInit() to find the page
** to open. The statement is compiled the first time this function is
** called and cached in p->pIdxNextSelect; later calls return the cached
** handle.
**
** The statement takes two parameters - a segment id and a term - and
** returns the "pgno" value of the %_idx row with the smallest term that
** is larger than the term supplied, for the specified segment.
**
** The caller must be prepared for a NULL return value (presumably this
** happens if fts5IndexPrepareStmt() fails - confirm against that
** function).
*/
static sqlite3_stmt *fts5IdxNextStmt(Fts5Index *p){
  Fts5Config *pCfg;               /* Configuration of index p */
  char *zSql;                     /* Text of SQL statement to compile */

  /* Fast path: the statement has already been compiled. */
  if( p->pIdxNextSelect ) return p->pIdxNextSelect;

  pCfg = p->pConfig;
  zSql = sqlite3_mprintf(
      "SELECT pgno FROM '%q'.'%q_idx' WHERE "
      "segid=? AND term>? ORDER BY term ASC LIMIT 1",
      pCfg->zDb, pCfg->zName
  );
  fts5IndexPrepareStmt(p, &p->pIdxNextSelect, zSql);
  return p->pIdxNextSelect;
}
/*
** This is similar to fts5SegIterSeekInit(), except that it initializes
** the segment iterator to point to the first term following the page
** with pTerm/nTerm on it.
**
** The page to open is found by querying the %_idx table (see
** fts5IdxNextStmt()) for the first entry belonging to segment pSeg with
** a term larger than pTerm/nTerm. The value returned by that query packs
** two fields: the least significant bit is a flag (stored in bDlidx) and
** the remaining bits are the page number.
**
** If the %_idx query fails, the error code is left in p->rc and this
** function returns without modifying pIter. Otherwise pIter is zeroed
** and populated. If the query returned no row, or if no leaf page could
** be loaded, pIter is left with pIter->pLeaf==0.
*/
static void fts5SegIterNextInit(
  Fts5Index *p,
  const char *pTerm, int nTerm,
  Fts5StructureSegment *pSeg,     /* Description of segment */
  Fts5SegIter *pIter              /* Object to populate */
){
  int iPg = -1;                   /* Page of segment to open */
  int bDlidx = 0;                 /* Low bit of the %_idx pgno value */
  sqlite3_stmt *pSel = 0;         /* SELECT to find iPg */

  pSel = fts5IdxNextStmt(p);
  if( pSel ){
    assert( p->rc==SQLITE_OK );
    sqlite3_bind_int(pSel, 1, pSeg->iSegid);
    sqlite3_bind_blob(pSel, 2, pTerm, nTerm, SQLITE_STATIC);

    if( sqlite3_step(pSel)==SQLITE_ROW ){
      i64 val = sqlite3_column_int64(pSel, 0);
      iPg = (int)(val>>1);
      bDlidx = (val & 0x0001);
    }
    p->rc = sqlite3_reset(pSel);

    /* sqlite3_reset() does not clear bindings. Since the blob was bound
    ** with SQLITE_STATIC, the cached statement would otherwise keep a
    ** pointer to the caller's pTerm buffer after this function returns.
    ** Drop the binding so that no dangling pointer is retained. */
    sqlite3_bind_null(pSel, 2);
    if( p->rc ) return;
  }

  memset(pIter, 0, sizeof(*pIter));
  pIter->pSeg = pSeg;
  pIter->flags |= FTS5_SEGITER_ONETERM;
  if( iPg>=0 ){
    /* Position the iterator one page before the target, then step forward
    ** onto it. */
    pIter->iLeafPgno = iPg - 1;
    fts5SegIterNextPage(p, pIter);
    fts5SegIterSetNext(p, pIter);
    fts5SegIterAllocTombstone(p, pIter);
  }
  if( pIter->pLeaf ){
    const u8 *a = pIter->pLeaf->p;
    int iTermOff = 0;

    /* Read the varint stored at offset szLeaf of the page and use it as
    ** the offset of the term to load (presumably the first entry of the
    ** page-index, i.e. the first term on the page - confirm against the
    ** leaf page format). */
    pIter->iPgidxOff = pIter->pLeaf->szLeaf;
    pIter->iPgidxOff += fts5GetVarint32(&a[pIter->iPgidxOff], iTermOff);
    pIter->iLeafOffset = iTermOff;
    fts5SegIterLoadTerm(p, pIter, 0);
    fts5SegIterLoadNPos(p, pIter);
    if( bDlidx ) fts5SegIterLoadDlidx(p, pIter);

    /* The term loaded must be strictly larger than pTerm/nTerm. */
    assert( p->rc!=SQLITE_OK ||
        fts5BufferCompareBlob(&pIter->term, pTerm, nTerm)>0
    );
  }
}
/*
** Initialize the object pIter to point to term pTerm/nTerm within the
** in-memory hash table. If there is no such term in the hash-table, the
@ -6346,6 +6420,7 @@ int sqlite3Fts5IndexClose(Fts5Index *p){
sqlite3_finalize(p->pIdxWriter);
sqlite3_finalize(p->pIdxDeleter);
sqlite3_finalize(p->pIdxSelect);
sqlite3_finalize(p->pIdxNextSelect);
sqlite3_finalize(p->pDataVersion);
sqlite3_finalize(p->pDeleteFromIdx);
sqlite3Fts5HashFree(p->pHash);
@ -6496,7 +6571,7 @@ static Fts5TokenDataIter *fts5AppendTokendataIter(
if( p->rc==SQLITE_OK ){
if( pIn==0 || pIn->nIter==pIn->nIterAlloc ){
int nAlloc = pIn ? pIn->nIterAlloc*2 : 16;
int nByte = nAlloc * sizeof(Fts5Iter*);
int nByte = nAlloc * sizeof(Fts5Iter*) + sizeof(Fts5TokenDataIter);
Fts5TokenDataIter *pNew = (Fts5TokenDataIter*)sqlite3_realloc(pIn, nByte);
if( pNew==0 ){
@ -6513,6 +6588,7 @@ static Fts5TokenDataIter *fts5AppendTokendataIter(
}else{
pRet->apIter[pRet->nIter++] = pAppend;
}
assert( pRet==0 || pRet->nIter<=pRet->nIterAlloc );
return pRet;
}
@ -6747,6 +6823,10 @@ static void fts5TokendataSetTermIfEof(Fts5Iter *pIter, Fts5Buffer *pTerm){
}
}
/*
** This function sets up an iterator to use for a non-prefix query on a
** tokendata=1 table.
*/
static Fts5Iter *fts5SetupTokendataIter(
Fts5Index *p, /* FTS index to query */
const u8 *pToken, /* Buffer containing query term */
@ -6756,7 +6836,7 @@ static Fts5Iter *fts5SetupTokendataIter(
Fts5Iter *pRet = 0;
Fts5TokenDataIter *pSet = 0;
Fts5Structure *pStruct = 0;
const int flags = FTS5INDEX_QUERY_SKIPEMPTY | FTS5INDEX_QUERY_SCAN;
const int flags = FTS5INDEX_QUERY_SCANONETERM | FTS5INDEX_QUERY_SCAN;
Fts5Buffer bSeek = {0, 0, 0};
Fts5Buffer *pSmall = 0;
@ -6787,20 +6867,32 @@ static Fts5Iter *fts5SetupTokendataIter(
for(iLvl=0; iLvl<pStruct->nLevel; iLvl++){
for(iSeg=pStruct->aLevel[iLvl].nSeg-1; iSeg>=0; iSeg--){
Fts5StructureSegment *pSeg = &pStruct->aLevel[iLvl].aSeg[iSeg];
fts5SegIterSeekInit(p, bSeek.p, bSeek.n, flags, pSeg, pNewIter);
int bDone = 0;
pNewIter++;
if( pPrevIter ){
if( fts5BufferCompare(pSmall, &pPrevIter->term) ){
fts5SegIterSetEOF(pPrevIter);
memcpy(pNewIter, pPrevIter, sizeof(Fts5SegIter));
memset(pPrevIter, 0, sizeof(Fts5SegIter));
bDone = 1;
}else if( pPrevIter->pLeaf
&& pPrevIter->iEndofDoclist>pPrevIter->pLeaf->szLeaf
){
fts5SegIterNextInit(p,(const char*)bSeek.p,bSeek.n-1,pSeg,pNewIter);
bDone = 1;
}
pPrevIter++;
}
if( bDone==0 ){
fts5SegIterSeekInit(p, bSeek.p, bSeek.n, flags, pSeg, pNewIter);
}
pNewIter++;
if( pPrevIter ) pPrevIter++;
}
}
fts5TokendataSetTermIfEof(pPrev, pSmall);
pNew->bSkipEmpty = (0!=(flags & FTS5INDEX_QUERY_SKIPEMPTY));
pNew->bSkipEmpty = 1;
pNew->pColset = pColset;
fts5IterSetOutputCb(&p->rc, pNew);
@ -7043,7 +7135,6 @@ int sqlite3Fts5IterToken(
*/
void sqlite3Fts5IndexIterClearTokendata(Fts5IndexIter *pIndexIter){
Fts5Iter *pIter = (Fts5Iter*)pIndexIter;
assert( pIter->pIndex->pConfig->eDetail!=FTS5_DETAIL_FULL );
if( pIter->pTokenDataIter ){
pIter->pTokenDataIter->nMap = 0;
}

View File

@ -916,6 +916,16 @@ static int fts5NextMethod(sqlite3_vtab_cursor *pCursor){
);
assert( !CsrFlagTest(pCsr, FTS5CSR_EOF) );
/* If this cursor uses FTS5_PLAN_MATCH and this is a tokendata=1 table,
** clear any token mappings accumulated at the fts5_index.c level. In
** other cases, specifically FTS5_PLAN_SOURCE and FTS5_PLAN_SORTED_MATCH,
** we need to retain the mappings for the entire query. */
if( pCsr->ePlan==FTS5_PLAN_MATCH
&& ((Fts5Table*)pCursor->pVtab)->pConfig->bTokendata
){
sqlite3Fts5ExprClearTokens(pCsr->pExpr);
}
if( pCsr->ePlan<3 ){
int bSkip = 0;
if( (rc = fts5CursorReseek(pCsr, &bSkip)) || bSkip ) return rc;

View File

@ -98,7 +98,6 @@ do_execsql_test 1.10 {
INSERT INTO ft VALUES('WORLD');
}
breakpoint
do_execsql_test 1.11 { SELECT rowid FROM ft('hello'); } {1 2 3}
do_execsql_test 1.12 { SELECT rowid FROM ft('today'); } {4 5 6}
do_execsql_test 1.13 { SELECT rowid FROM ft('world'); } {7 8 9}

View File

@ -46,7 +46,6 @@ foreach_detail_mode $testprefix {
SELECT fts5_test_poslist(ft) FROM ft('hello');
} {{0.0.0 0.0.2 0.0.4}}
breakpoint
do_execsql_test 1.3 {
SELECT
insttoken(ft, 0, 0),
@ -63,6 +62,18 @@ breakpoint
FROM ft('hello') ORDER BY rank;
} {hello.Hello hello.HELLO hello}
do_execsql_test 1.5 {
CREATE VIRTUAL TABLE ft2 USING fts5(
x, tokenize="origintext unicode61", tokendata=1, detail=%DETAIL%
);
INSERT INTO ft2(rowid, x) VALUES(1, 'ONE one two three ONE');
INSERT INTO ft2(rowid, x) VALUES(2, 'TWO one two three TWO');
INSERT INTO ft2(rowid, x) VALUES(3, 'THREE one two three THREE');
}
do_execsql_test 1.6 {
SELECT insttoken(ft2, 0, 0), rowid FROM ft2('three') ORDER BY rank;
} {three.THREE 3 three 1 three 2}
}
finish_test

View File

@ -0,0 +1,66 @@
# 2023 November 22
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focused on the page-cache memory used by queries on tokendata=1
# tables.
#
source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5origintext4
# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
finish_test
return
}
# Presumably makes the "origintext" tokenizer named in the CREATE VIRTUAL
# TABLE statement below available on connection [db] - see the test
# harness for the definition of this command.
sqlite3_fts5_register_origintext db
# Create a tokendata=1 table that uses the origintext tokenizer.
do_execsql_test 1.0 {
CREATE VIRTUAL TABLE ft USING fts5(
x, tokenize="origintext unicode61", tokendata=1
);
}
# Populate the table with 90000 'The second thing' rows, bracketed by a
# single 'the first thing' row on either side, then run the FTS5 'optimize'
# command. As a result "the"/"The" is very common in the table while
# "first" appears in only two rows.
do_execsql_test 1.1 {
BEGIN;
INSERT INTO ft SELECT 'the first thing';
WITH s(i) AS (
SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<90000
)
INSERT INTO ft SELECT 'The second thing' FROM s;
INSERT INTO ft SELECT 'the first thing';
COMMIT;
INSERT INTO ft(ft) VALUES('optimize');
}
# For each {tn sql expr} triple: reopen the database, run query $sql, then
# read the CACHE_USED value for the connection into $mem and check that
# expression $expr evaluates to true. The query for the common token alone
# is expected to use more than 250000 bytes of page-cache, whereas the
# queries that involve the uncommon token must stay below 50000 bytes.
foreach {tn sql expr} {
1 { SELECT rowid FROM ft('the') } {$mem > 250000}
2 { SELECT rowid FROM ft('first') } {$mem < 50000}
3 { SELECT rowid FROM ft('the first') } {$mem < 50000}
} {
db close
sqlite3 db test.db
sqlite3_fts5_register_origintext db
execsql $sql
do_test 1.2.$tn {
set mem [lindex [sqlite3_db_status db CACHE_USED 0] 1]
expr $expr
} 1
}
# Debugging aid: SQL function b() returns its argument with each 0x00 byte
# replaced by ".". It is only used by the commented-out query below.
proc b {x} { string map [list "\0" "."] $x }
db func b b
# execsql_pp { SELECT segid, b(term), pgno from ft_idx }
finish_test