mirror of
https://github.com/sqlite/sqlite.git
synced 2025-07-29 08:01:23 +03:00
Ensure that tokendata=1 queries avoid loading large doclists for queries like "common AND uncommon", just as tokendata=0 queries do.
FossilOrigin-Name: 7bda09ab404a110d57449e149a3281fca8dc4cacf7bd9832ea2a1356ad20fe8e
This commit is contained in:
@ -397,6 +397,7 @@ struct Fts5IndexIter {
|
||||
#define FTS5INDEX_QUERY_NOOUTPUT 0x0020
|
||||
#define FTS5INDEX_QUERY_SKIPHASH 0x0040
|
||||
#define FTS5INDEX_QUERY_NOTOKENDATA 0x0080
|
||||
#define FTS5INDEX_QUERY_SCANONETERM 0x0100
|
||||
|
||||
/*
|
||||
** Create/destroy an Fts5Index object.
|
||||
@ -786,6 +787,7 @@ int sqlite3Fts5ExprPhraseCollist(Fts5Expr *, int, const u8 **, int *);
|
||||
|
||||
int sqlite3Fts5ExprQueryToken(Fts5Expr*, int, int, const char**, int*);
|
||||
int sqlite3Fts5ExprInstToken(Fts5Expr*, int, int, int, int, const char**, int*);
|
||||
void sqlite3Fts5ExprClearTokens(Fts5Expr*);
|
||||
|
||||
/*******************************************
|
||||
** The fts5_expr.c API above this point is used by the other hand-written
|
||||
|
@ -3050,17 +3050,6 @@ int sqlite3Fts5ExprPopulatePoslists(
|
||||
sCtx.aPopulator = aPopulator;
|
||||
sCtx.iOff = (((i64)iCol) << 32) - 1;
|
||||
|
||||
/* If this is a tokendata=1 table, clear out the hash tables of
|
||||
** full-terms. */
|
||||
if( pConfig->bTokendata ){
|
||||
for(i=0; i<pExpr->nPhrase; i++){
|
||||
Fts5ExprTerm *pT;
|
||||
for(pT=&pExpr->apExprPhrase[i]->aTerm[0]; pT; pT=pT->pSynonym){
|
||||
sqlite3Fts5IndexIterClearTokendata(pT->pIter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(i=0; i<pExpr->nPhrase; i++){
|
||||
Fts5ExprNode *pNode = pExpr->apExprPhrase[i]->pNode;
|
||||
Fts5Colset *pColset = pNode->pNear->pColset;
|
||||
@ -3225,3 +3214,17 @@ int sqlite3Fts5ExprInstToken(
|
||||
return sqlite3Fts5IterToken(pIter, iRowid, iCol, iOff+iToken, ppOut, pnOut);
|
||||
}
|
||||
|
||||
/*
|
||||
** Clear the token mappings for all Fts5IndexIter objects mannaged by
|
||||
** the expression passed as the only argument.
|
||||
*/
|
||||
void sqlite3Fts5ExprClearTokens(Fts5Expr *pExpr){
|
||||
int ii;
|
||||
for(ii=0; ii<pExpr->nPhrase; ii++){
|
||||
Fts5ExprTerm *pT;
|
||||
for(pT=&pExpr->apExprPhrase[ii]->aTerm[0]; pT; pT=pT->pSynonym){
|
||||
sqlite3Fts5IndexIterClearTokendata(pT->pIter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -366,6 +366,7 @@ struct Fts5Index {
|
||||
sqlite3_stmt *pIdxWriter; /* "INSERT ... %_idx VALUES(?,?,?,?)" */
|
||||
sqlite3_stmt *pIdxDeleter; /* "DELETE FROM %_idx WHERE segid=?" */
|
||||
sqlite3_stmt *pIdxSelect;
|
||||
sqlite3_stmt *pIdxNextSelect;
|
||||
int nRead; /* Total number of blocks read */
|
||||
|
||||
sqlite3_stmt *pDeleteFromIdx;
|
||||
@ -2660,7 +2661,7 @@ static void fts5SegIterSeekInit(
|
||||
fts5LeafSeek(p, bGe, pIter, pTerm, nTerm);
|
||||
}
|
||||
|
||||
if( p->rc==SQLITE_OK && bGe==0 ){
|
||||
if( p->rc==SQLITE_OK && (bGe==0 || (flags & FTS5INDEX_QUERY_SCANONETERM)) ){
|
||||
pIter->flags |= FTS5_SEGITER_ONETERM;
|
||||
if( pIter->pLeaf ){
|
||||
if( flags & FTS5INDEX_QUERY_DESC ){
|
||||
@ -2693,6 +2694,79 @@ static void fts5SegIterSeekInit(
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
** SQL used by fts5SegIterNextInit() to find the page to open.
|
||||
*/
|
||||
static sqlite3_stmt *fts5IdxNextStmt(Fts5Index *p){
|
||||
if( p->pIdxNextSelect==0 ){
|
||||
Fts5Config *pConfig = p->pConfig;
|
||||
fts5IndexPrepareStmt(p, &p->pIdxNextSelect, sqlite3_mprintf(
|
||||
"SELECT pgno FROM '%q'.'%q_idx' WHERE "
|
||||
"segid=? AND term>? ORDER BY term ASC LIMIT 1",
|
||||
pConfig->zDb, pConfig->zName
|
||||
));
|
||||
|
||||
}
|
||||
return p->pIdxNextSelect;
|
||||
}
|
||||
|
||||
/*
|
||||
** This is similar to fts5SegIterSeekInit(), except that it initializes
|
||||
** the segment iterator to point to the first term following the page
|
||||
** with pToken/nToken on it.
|
||||
*/
|
||||
static void fts5SegIterNextInit(
|
||||
Fts5Index *p,
|
||||
const char *pTerm, int nTerm,
|
||||
Fts5StructureSegment *pSeg, /* Description of segment */
|
||||
Fts5SegIter *pIter /* Object to populate */
|
||||
){
|
||||
int iPg = -1; /* Page of segment to open */
|
||||
int bDlidx = 0;
|
||||
sqlite3_stmt *pSel = 0; /* SELECT to find iPg */
|
||||
|
||||
pSel = fts5IdxNextStmt(p);
|
||||
if( pSel ){
|
||||
assert( p->rc==SQLITE_OK );
|
||||
sqlite3_bind_int(pSel, 1, pSeg->iSegid);
|
||||
sqlite3_bind_blob(pSel, 2, pTerm, nTerm, SQLITE_STATIC);
|
||||
|
||||
if( sqlite3_step(pSel)==SQLITE_ROW ){
|
||||
i64 val = sqlite3_column_int64(pSel, 0);
|
||||
iPg = (int)(val>>1);
|
||||
bDlidx = (val & 0x0001);
|
||||
}
|
||||
p->rc = sqlite3_reset(pSel);
|
||||
if( p->rc ) return;
|
||||
}
|
||||
|
||||
memset(pIter, 0, sizeof(*pIter));
|
||||
pIter->pSeg = pSeg;
|
||||
pIter->flags |= FTS5_SEGITER_ONETERM;
|
||||
if( iPg>=0 ){
|
||||
pIter->iLeafPgno = iPg - 1;
|
||||
fts5SegIterNextPage(p, pIter);
|
||||
fts5SegIterSetNext(p, pIter);
|
||||
fts5SegIterAllocTombstone(p, pIter);
|
||||
}
|
||||
if( pIter->pLeaf ){
|
||||
const u8 *a = pIter->pLeaf->p;
|
||||
int iTermOff = 0;
|
||||
|
||||
pIter->iPgidxOff = pIter->pLeaf->szLeaf;
|
||||
pIter->iPgidxOff += fts5GetVarint32(&a[pIter->iPgidxOff], iTermOff);
|
||||
pIter->iLeafOffset = iTermOff;
|
||||
fts5SegIterLoadTerm(p, pIter, 0);
|
||||
fts5SegIterLoadNPos(p, pIter);
|
||||
if( bDlidx ) fts5SegIterLoadDlidx(p, pIter);
|
||||
|
||||
assert( p->rc!=SQLITE_OK ||
|
||||
fts5BufferCompareBlob(&pIter->term, pTerm, nTerm)>0
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
** Initialize the object pIter to point to term pTerm/nTerm within the
|
||||
** in-memory hash table. If there is no such term in the hash-table, the
|
||||
@ -6346,6 +6420,7 @@ int sqlite3Fts5IndexClose(Fts5Index *p){
|
||||
sqlite3_finalize(p->pIdxWriter);
|
||||
sqlite3_finalize(p->pIdxDeleter);
|
||||
sqlite3_finalize(p->pIdxSelect);
|
||||
sqlite3_finalize(p->pIdxNextSelect);
|
||||
sqlite3_finalize(p->pDataVersion);
|
||||
sqlite3_finalize(p->pDeleteFromIdx);
|
||||
sqlite3Fts5HashFree(p->pHash);
|
||||
@ -6496,7 +6571,7 @@ static Fts5TokenDataIter *fts5AppendTokendataIter(
|
||||
if( p->rc==SQLITE_OK ){
|
||||
if( pIn==0 || pIn->nIter==pIn->nIterAlloc ){
|
||||
int nAlloc = pIn ? pIn->nIterAlloc*2 : 16;
|
||||
int nByte = nAlloc * sizeof(Fts5Iter*);
|
||||
int nByte = nAlloc * sizeof(Fts5Iter*) + sizeof(Fts5TokenDataIter);
|
||||
Fts5TokenDataIter *pNew = (Fts5TokenDataIter*)sqlite3_realloc(pIn, nByte);
|
||||
|
||||
if( pNew==0 ){
|
||||
@ -6513,6 +6588,7 @@ static Fts5TokenDataIter *fts5AppendTokendataIter(
|
||||
}else{
|
||||
pRet->apIter[pRet->nIter++] = pAppend;
|
||||
}
|
||||
assert( pRet==0 || pRet->nIter<=pRet->nIterAlloc );
|
||||
|
||||
return pRet;
|
||||
}
|
||||
@ -6747,6 +6823,10 @@ static void fts5TokendataSetTermIfEof(Fts5Iter *pIter, Fts5Buffer *pTerm){
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
** This function sets up an iterator to use for a non-prefix query on a
|
||||
** tokendata=1 table.
|
||||
*/
|
||||
static Fts5Iter *fts5SetupTokendataIter(
|
||||
Fts5Index *p, /* FTS index to query */
|
||||
const u8 *pToken, /* Buffer containing query term */
|
||||
@ -6756,7 +6836,7 @@ static Fts5Iter *fts5SetupTokendataIter(
|
||||
Fts5Iter *pRet = 0;
|
||||
Fts5TokenDataIter *pSet = 0;
|
||||
Fts5Structure *pStruct = 0;
|
||||
const int flags = FTS5INDEX_QUERY_SKIPEMPTY | FTS5INDEX_QUERY_SCAN;
|
||||
const int flags = FTS5INDEX_QUERY_SCANONETERM | FTS5INDEX_QUERY_SCAN;
|
||||
|
||||
Fts5Buffer bSeek = {0, 0, 0};
|
||||
Fts5Buffer *pSmall = 0;
|
||||
@ -6787,20 +6867,32 @@ static Fts5Iter *fts5SetupTokendataIter(
|
||||
for(iLvl=0; iLvl<pStruct->nLevel; iLvl++){
|
||||
for(iSeg=pStruct->aLevel[iLvl].nSeg-1; iSeg>=0; iSeg--){
|
||||
Fts5StructureSegment *pSeg = &pStruct->aLevel[iLvl].aSeg[iSeg];
|
||||
fts5SegIterSeekInit(p, bSeek.p, bSeek.n, flags, pSeg, pNewIter);
|
||||
int bDone = 0;
|
||||
|
||||
pNewIter++;
|
||||
if( pPrevIter ){
|
||||
if( fts5BufferCompare(pSmall, &pPrevIter->term) ){
|
||||
fts5SegIterSetEOF(pPrevIter);
|
||||
memcpy(pNewIter, pPrevIter, sizeof(Fts5SegIter));
|
||||
memset(pPrevIter, 0, sizeof(Fts5SegIter));
|
||||
bDone = 1;
|
||||
}else if( pPrevIter->pLeaf
|
||||
&& pPrevIter->iEndofDoclist>pPrevIter->pLeaf->szLeaf
|
||||
){
|
||||
fts5SegIterNextInit(p,(const char*)bSeek.p,bSeek.n-1,pSeg,pNewIter);
|
||||
bDone = 1;
|
||||
}
|
||||
pPrevIter++;
|
||||
}
|
||||
|
||||
if( bDone==0 ){
|
||||
fts5SegIterSeekInit(p, bSeek.p, bSeek.n, flags, pSeg, pNewIter);
|
||||
}
|
||||
|
||||
pNewIter++;
|
||||
if( pPrevIter ) pPrevIter++;
|
||||
}
|
||||
}
|
||||
fts5TokendataSetTermIfEof(pPrev, pSmall);
|
||||
|
||||
pNew->bSkipEmpty = (0!=(flags & FTS5INDEX_QUERY_SKIPEMPTY));
|
||||
pNew->bSkipEmpty = 1;
|
||||
pNew->pColset = pColset;
|
||||
fts5IterSetOutputCb(&p->rc, pNew);
|
||||
|
||||
@ -7043,7 +7135,6 @@ int sqlite3Fts5IterToken(
|
||||
*/
|
||||
void sqlite3Fts5IndexIterClearTokendata(Fts5IndexIter *pIndexIter){
|
||||
Fts5Iter *pIter = (Fts5Iter*)pIndexIter;
|
||||
assert( pIter->pIndex->pConfig->eDetail!=FTS5_DETAIL_FULL );
|
||||
if( pIter->pTokenDataIter ){
|
||||
pIter->pTokenDataIter->nMap = 0;
|
||||
}
|
||||
|
@ -916,6 +916,16 @@ static int fts5NextMethod(sqlite3_vtab_cursor *pCursor){
|
||||
);
|
||||
assert( !CsrFlagTest(pCsr, FTS5CSR_EOF) );
|
||||
|
||||
/* If this cursor uses FTS5_PLAN_MATCH and this is a tokendata=1 table,
|
||||
** clear any token mappings accumulated at the fts5_index.c level. In
|
||||
** other cases, specifically FTS5_PLAN_SOURCE and FTS5_PLAN_SORTED_MATCH,
|
||||
** we need to retain the mappings for the entire query. */
|
||||
if( pCsr->ePlan==FTS5_PLAN_MATCH
|
||||
&& ((Fts5Table*)pCursor->pVtab)->pConfig->bTokendata
|
||||
){
|
||||
sqlite3Fts5ExprClearTokens(pCsr->pExpr);
|
||||
}
|
||||
|
||||
if( pCsr->ePlan<3 ){
|
||||
int bSkip = 0;
|
||||
if( (rc = fts5CursorReseek(pCsr, &bSkip)) || bSkip ) return rc;
|
||||
|
@ -98,7 +98,6 @@ do_execsql_test 1.10 {
|
||||
INSERT INTO ft VALUES('WORLD');
|
||||
}
|
||||
|
||||
breakpoint
|
||||
do_execsql_test 1.11 { SELECT rowid FROM ft('hello'); } {1 2 3}
|
||||
do_execsql_test 1.12 { SELECT rowid FROM ft('today'); } {4 5 6}
|
||||
do_execsql_test 1.13 { SELECT rowid FROM ft('world'); } {7 8 9}
|
||||
|
@ -46,7 +46,6 @@ foreach_detail_mode $testprefix {
|
||||
SELECT fts5_test_poslist(ft) FROM ft('hello');
|
||||
} {{0.0.0 0.0.2 0.0.4}}
|
||||
|
||||
breakpoint
|
||||
do_execsql_test 1.3 {
|
||||
SELECT
|
||||
insttoken(ft, 0, 0),
|
||||
@ -63,6 +62,18 @@ breakpoint
|
||||
FROM ft('hello') ORDER BY rank;
|
||||
} {hello.Hello hello.HELLO hello}
|
||||
|
||||
do_execsql_test 1.5 {
|
||||
CREATE VIRTUAL TABLE ft2 USING fts5(
|
||||
x, tokenize="origintext unicode61", tokendata=1, detail=%DETAIL%
|
||||
);
|
||||
INSERT INTO ft2(rowid, x) VALUES(1, 'ONE one two three ONE');
|
||||
INSERT INTO ft2(rowid, x) VALUES(2, 'TWO one two three TWO');
|
||||
INSERT INTO ft2(rowid, x) VALUES(3, 'THREE one two three THREE');
|
||||
}
|
||||
|
||||
do_execsql_test 1.6 {
|
||||
SELECT insttoken(ft2, 0, 0), rowid FROM ft2('three') ORDER BY rank;
|
||||
} {three.THREE 3 three 1 three 2}
|
||||
}
|
||||
|
||||
finish_test
|
||||
|
66
ext/fts5/test/fts5origintext4.test
Normal file
66
ext/fts5/test/fts5origintext4.test
Normal file
@ -0,0 +1,66 @@
|
||||
# 2023 November 22
|
||||
#
|
||||
# The author disclaims copyright to this source code. In place of
|
||||
# a legal notice, here is a blessing:
|
||||
#
|
||||
# May you do good and not evil.
|
||||
# May you find forgiveness for yourself and forgive others.
|
||||
# May you share freely, never taking more than you give.
|
||||
#
|
||||
#***********************************************************************
|
||||
#
|
||||
# Tests that tokendata=1 queries avoid loading large doclists unnecessarily.
|
||||
#
|
||||
|
||||
source [file join [file dirname [info script]] fts5_common.tcl]
|
||||
set testprefix fts5origintext4
|
||||
|
||||
# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
|
||||
ifcapable !fts5 {
|
||||
finish_test
|
||||
return
|
||||
}
|
||||
|
||||
sqlite3_fts5_register_origintext db
|
||||
do_execsql_test 1.0 {
|
||||
CREATE VIRTUAL TABLE ft USING fts5(
|
||||
x, tokenize="origintext unicode61", tokendata=1
|
||||
);
|
||||
}
|
||||
|
||||
do_execsql_test 1.1 {
|
||||
BEGIN;
|
||||
INSERT INTO ft SELECT 'the first thing';
|
||||
|
||||
WITH s(i) AS (
|
||||
SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<90000
|
||||
)
|
||||
INSERT INTO ft SELECT 'The second thing' FROM s;
|
||||
|
||||
INSERT INTO ft SELECT 'the first thing';
|
||||
COMMIT;
|
||||
INSERT INTO ft(ft) VALUES('optimize');
|
||||
}
|
||||
|
||||
foreach {tn sql expr} {
|
||||
1 { SELECT rowid FROM ft('the') } {$mem > 250000}
|
||||
2 { SELECT rowid FROM ft('first') } {$mem < 50000}
|
||||
3 { SELECT rowid FROM ft('the first') } {$mem < 50000}
|
||||
} {
|
||||
db close
|
||||
sqlite3 db test.db
|
||||
sqlite3_fts5_register_origintext db
|
||||
|
||||
execsql $sql
|
||||
do_test 1.2.$tn {
|
||||
set mem [lindex [sqlite3_db_status db CACHE_USED 0] 1]
|
||||
expr $expr
|
||||
} 1
|
||||
}
|
||||
|
||||
proc b {x} { string map [list "\0" "."] $x }
|
||||
db func b b
|
||||
# execsql_pp { SELECT segid, b(term), pgno from ft_idx }
|
||||
|
||||
finish_test
|
||||
|
Reference in New Issue
Block a user