diff --git a/ext/fts3/fts3.c b/ext/fts3/fts3.c index 7a5c63d06a..1d52b687d2 100644 --- a/ext/fts3/fts3.c +++ b/ext/fts3/fts3.c @@ -797,6 +797,7 @@ static int fulltextClose(sqlite3_vtab_cursor *pCursor){ sqlite3_finalize(pCsr->pStmt); sqlite3Fts3ExprFree(pCsr->pExpr); sqlite3_free(pCsr->aDoclist); + sqlite3_free(pCsr->aMatchinfo); sqlite3_free(pCsr); return SQLITE_OK; } @@ -842,6 +843,7 @@ static int fts3NextMethod(sqlite3_vtab_cursor *pCursor){ sqlite3_reset(pCsr->pStmt); fts3GetDeltaVarint(&pCsr->pNextId, &pCsr->iPrevId); pCsr->isRequireSeek = 1; + pCsr->isMatchinfoOk = 1; } return rc; } @@ -1004,25 +1006,6 @@ static void fts3ColumnlistCopy(char **pp, char **ppPoslist){ *ppPoslist = pEnd; } -/* -** This function is used to count the entries in a column-list (delta-encoded -** list of term offsets within a single column of a single row). -*/ -static int fts3ColumnlistCount(char **ppCollist){ - char *pEnd = *ppCollist; - char c = 0; - int nEntry = 0; - - /* A column-list is terminated by either a 0x01 or 0x00. */ - while( 0xFE & (*pEnd | c) ){ - c = *pEnd++ & 0x80; - if( !c ) nEntry++; - } - - *ppCollist = pEnd; - return nEntry; -} - /* ** Value used to signify the end of an offset-list. This is safe because ** it is not possible to have a document with 2^31 terms. @@ -2035,200 +2018,60 @@ static int fts3RollbackMethod(sqlite3_vtab *pVtab){ } /* -** The following flags affect the format of the blob of unsigned int values -** returned by the matchinfo() function. The format is defined as follows: -** -** Integer 0: Number of 'simple queries' that make up the FTS3 query. -** Integer 1: Number of columns in queried table. -** -** followed by the data for (query 0, column 0), (query 0, column 1) ... -** (query 1, column 0) and so on. -** -** The first integer in each data is the number of hits that the simple -** query has in the current column. 
-** -** If the GLOBALCOUNT flag is set, then this is followed by the total -** number of hits the simple query has in the current column of *all* -** selected rows. -** -** If the PHRASELENGTH flag is set, this is followed by the number of -** tokens in the phrase. -** -** If the POSITIONLIST flag is set, then this is followed by -** integers - the positions of each of the hits for the current column/query. +** Load the doclist associated with expression pExpr to pExpr->aDoclist. +** The loaded doclist contains positions as well as the document ids. +** This is used by the matchinfo(), snippet() and offsets() auxiliary +** functions. */ -#define FTS3_MATCHINFO_GLOBALCOUNT 0x00000001 -#define FTS3_MATCHINFO_POSITIONLIST 0x00000002 -#define FTS3_MATCHINFO_PHRASELENGTH 0x00000004 - -typedef struct MatchInfo MatchInfo; -struct MatchInfo { - int rc; /* Return code. SQLITE_OK if no error */ - sqlite3_int64 iDocid; /* Docid of entry to return data for */ - Fts3Table *pTab; /* FTS3 Virtual table */ - int flags; /* Output flags (see above) */ - int nQuery; /* Number of simple queries */ - - /* Malloced output buffer */ - unsigned int *aOut; - int nOut; - int nAlloc; -}; - -static void fts3MatchInfoAppend(MatchInfo *pInfo, unsigned int iVal){ - if( pInfo->rc!=SQLITE_OK ) return; - - if( pInfo->nOut==pInfo->nAlloc ){ - int nNew = pInfo->nAlloc*2+100; - unsigned int *aNew = (unsigned int *)sqlite3_realloc( - pInfo->aOut, nNew * sizeof(unsigned int) - ); - if( !aNew ){ - pInfo->rc = SQLITE_NOMEM; - return; - } - pInfo->aOut = aNew; - pInfo->nAlloc = nNew; - } - - pInfo->aOut[pInfo->nOut++] = iVal; +int sqlite3Fts3ExprLoadDoclist(Fts3Table *pTab, Fts3Expr *pExpr){ + return evalFts3Expr(pTab, pExpr, &pExpr->aDoclist, &pExpr->nDoclist, 1); } /* -** Iterate through each simple query that makes up the query expression -** implemented by the cursor passed as the first argument. 
+** After ExprLoadDoclist() (see above) has been called, this function is +** used to iterate through the position lists that make up the doclist +** stored in pExpr->aDoclist. */ -static void fts3ExprMatchInfo( - sqlite3_context *pCtx, - Fts3Expr *pExpr, - MatchInfo *pInfo +char *sqlite3Fts3FindPositions( + Fts3Expr *pExpr, /* Access this expressions doclist */ + sqlite3_int64 iDocid, /* Docid associated with requested pos-list */ + int iCol /* Column of requested pos-list */ ){ - int eType = pExpr->eType; - if( eType==FTSQUERY_NOT || pInfo->rc ){ - return; - }else if( eType!=FTSQUERY_PHRASE ){ - assert( pExpr->pLeft && pExpr->pRight ); - fts3ExprMatchInfo(pCtx, pExpr->pLeft, pInfo); - if( pInfo->rc==SQLITE_OK ){ - fts3ExprMatchInfo(pCtx, pExpr->pRight, pInfo); - } - }else{ - int nPhrase = pExpr->pPhrase->nToken; - Fts3Table *pTab = pInfo->pTab; + assert( pExpr->isLoaded ); + if( pExpr->aDoclist ){ + char *pEnd = &pExpr->aDoclist[pExpr->nDoclist]; + char *pCsr = pExpr->pCurrent; - /* If it is not loaded already, load the doclist for this simple query - ** from the FTS3 full-text index. - */ - if( pExpr->isLoaded==0 ){ - pInfo->rc = evalFts3Expr(pTab,pExpr,&pExpr->aDoclist,&pExpr->nDoclist,1); - if( pInfo->rc ) return; - pExpr->isLoaded = 1; - } - - /* If aDoclist is not NULL, search for the doclist entry in pExpr->aDoclist - ** associated with the docid pInfo->iDocid. - */ - if( pExpr->aDoclist ){ - char *pEnd = &pExpr->aDoclist[pExpr->nDoclist]; - sqlite3_int64 iSearch = pInfo->iDocid; - - if( pExpr->pCurrent==0 ){ - assert( pExpr->iDocid==0 ); - pExpr->pCurrent = pExpr->aDoclist; - fts3GetDeltaVarint(&pExpr->pCurrent, &pExpr->iDocid); - } - - while( pExpr->iDocidpCurrentpCurrent); - if( pExpr->pCurrentpCurrent, &pExpr->iDocid); - } - } - - if( pExpr->iDocid==iSearch ){ - int i; - for(i=0; inColumn; i++){ - unsigned int iLocalOff; - - /* Add space for the "local-count" field. 
*/ - iLocalOff = pInfo->nOut; - fts3MatchInfoAppend(pInfo, 0); - if( pInfo->rc ) return; - - /* If the GLOBALCOUNT field is required, write the global-count - ** value for this query/column to the output buffer. - */ - if( pInfo->flags&FTS3_MATCHINFO_GLOBALCOUNT ){ - if( !pExpr->aHist ){ - char *pCsr = pExpr->aDoclist; - - /* Allocate a zeroed buffer to store the global-counts - ** corresponding to this simple query for each table column. - */ - int nByte = sizeof(unsigned int)*pTab->nColumn; - pExpr->aHist = (unsigned int *)sqlite3_malloc(nByte); - if( !pExpr->aHist ){ - pInfo->rc = SQLITE_NOMEM; - return; - } - memset(pExpr->aHist, 0, nByte); - - /* Scan the entire doclist to populate Fts3Expr.aHist[]. */ - while( pCsraHist[iCol] += fts3ColumnlistCount(&pCsr); - } - pCsr++; - } - } - - fts3MatchInfoAppend(pInfo, pExpr->aHist[i]); - } - - if( pInfo->flags&FTS3_MATCHINFO_PHRASELENGTH ){ - fts3MatchInfoAppend(pInfo, nPhrase); - } - - if( i==0 ){ - if( *pExpr->pCurrent==0x01 ) continue; - }else{ - sqlite3_int64 iCol; - char *pList = pExpr->pCurrent; - if( *pList==0x00 ) continue; - pList++; - pList += sqlite3Fts3GetVarint(pList, &iCol); - if( iCol!=i ) continue; - pExpr->pCurrent = pList; - } - - if( pInfo->flags&FTS3_MATCHINFO_POSITIONLIST ){ - int nLocal = 0; - sqlite3_int64 iOffset = 0; - char *pList = pExpr->pCurrent; - while( *pList&0xFE ){ - fts3GetDeltaVarint(&pList, &iOffset); - iOffset -= 2; - fts3MatchInfoAppend(pInfo, (unsigned int)(iOffset+1-nPhrase)); - nLocal++; - } - pExpr->pCurrent = pList; - pInfo->aOut[iLocalOff] = nLocal; - }else{ - pInfo->aOut[iLocalOff] = fts3ColumnlistCount(&pExpr->pCurrent); - } - } - pExpr->pCurrent++; - if( pExpr->pCurrentpCurrent, &pExpr->iDocid); + assert( pCsr ); + while( pCsriCurrentiCurrent); + pExpr->pCurrent = pCsr; + }else{ + if( pExpr->iCurrent==iDocid ){ + int iThis = 0; + if( iCol<0 ){ + /* If iCol is negative, return a pointer to the start of the + ** position-list (instead of a pointer to the start of a list + ** 
of offsets associated with a specific column). + */ + return pCsr; + } + while( iThisnQuery++; } + + return 0; } /* @@ -2298,6 +2141,47 @@ static void fts3SnippetFunc( } } +/* +** Implementation of the snippet2() function for FTS3 +*/ +static void fts3Snippet2Func( + sqlite3_context *pContext, /* SQLite function call context */ + int nVal, /* Size of apVal[] array */ + sqlite3_value **apVal /* Array of arguments */ +){ + Fts3Cursor *pCsr; /* Cursor handle passed through apVal[0] */ + const char *zStart = ""; + const char *zEnd = ""; + const char *zEllipsis = "..."; + int iCol = -1; + int nToken = 10; + + /* There must be at least one argument passed to this function (otherwise + ** the non-overloaded version would have been called instead of this one). + */ + assert( nVal>=1 ); + + if( nVal>6 ){ + sqlite3_result_error(pContext, + "wrong number of arguments to function snippet()", -1); + return; + } + if( fts3FunctionArg(pContext, "snippet", apVal[0], &pCsr) ) return; + + switch( nVal ){ + case 6: nToken = sqlite3_value_int(apVal[5]); + case 5: iCol = sqlite3_value_int(apVal[4]); + case 4: zEllipsis = (const char*)sqlite3_value_text(apVal[3]); + case 3: zEnd = (const char*)sqlite3_value_text(apVal[2]); + case 2: zStart = (const char*)sqlite3_value_text(apVal[1]); + } + if( !zEllipsis || !zEnd || !zStart ){ + sqlite3_result_error_nomem(pContext); + }else if( SQLITE_OK==fts3CursorSeek(pContext, pCsr) ){ + sqlite3Fts3Snippet2(pContext, pCsr, zStart, zEnd, zEllipsis, iCol, nToken); + } +} + /* ** Implementation of the offsets() function for FTS3 */ @@ -2367,55 +2251,15 @@ static void fts3MatchinfoFunc( sqlite3_value **apVal /* Array of arguments */ ){ Fts3Cursor *pCsr; /* Cursor handle passed through apVal[0] */ - int flags = 0; - - if( nVal==2 ){ - int i; - const unsigned char *zFlags = sqlite3_value_text(apVal[1]); - for(i=0; zFlags[i]; i++){ - switch( zFlags[i] ){ - case 'g': flags |= FTS3_MATCHINFO_GLOBALCOUNT; break; - case 'p': flags |= 
FTS3_MATCHINFO_POSITIONLIST; break; - case 'n': flags |= FTS3_MATCHINFO_PHRASELENGTH; break; - default: { - char zErr[18]; - memcpy(zErr, "Unknown flag: \"%c\"", 18); - zErr[16] = (char)zFlags[i]; - sqlite3_result_error(pContext, zErr, -1); - return; - } - } - } - }else if( nVal!=1 ){ + if( nVal!=1 ){ sqlite3_result_error(pContext, "wrong number of arguments to function matchinfo()", -1); return; } if( SQLITE_OK==fts3FunctionArg(pContext, "matchinfo", apVal[0], &pCsr) ){ - MatchInfo ctx; - memset(&ctx, 0, sizeof(ctx)); - ctx.iDocid = pCsr->iPrevId; - ctx.pTab = (Fts3Table *)pCsr->base.pVtab; - ctx.flags = flags; - - fts3MatchInfoAppend(&ctx, 0); - fts3MatchInfoAppend(&ctx, ctx.pTab->nColumn); - - /* Iterate through each of the 'simple' queries that make up the query - ** expression. A 'simple' query is a phrase (including token and token - ** prefix) or NEAR query. - */ - fts3ExprMatchInfo(pContext, pCsr->pExpr, &ctx); - if( ctx.rc ){ - sqlite3_free(ctx.aOut); - sqlite3_result_error_code(pContext, ctx.rc); - }else{ - int nByte = ctx.nOut*sizeof(unsigned int); - ctx.aOut[0] = ctx.nQuery; - sqlite3_result_blob(pContext, ctx.aOut, nByte, sqlite3_free); - } + sqlite3Fts3Matchinfo(pContext, pCsr); } } @@ -2435,6 +2279,7 @@ static int fts3FindFunctionMethod( void (*xFunc)(sqlite3_context*,int,sqlite3_value**); } aOverload[] = { { "snippet", fts3SnippetFunc }, + { "snippet2", fts3Snippet2Func }, { "offsets", fts3OffsetsFunc }, { "optimize", fts3OptimizeFunc }, { "matchinfo", fts3MatchinfoFunc }, @@ -2584,6 +2429,7 @@ int sqlite3Fts3Init(sqlite3 *db){ if( SQLITE_OK==rc && SQLITE_OK==(rc = sqlite3Fts3InitHashTable(db, pHash, "fts3_tokenizer")) && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1)) + && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet2", -1)) && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", 1)) && SQLITE_OK==(rc = sqlite3_overload_function(db, "matchinfo", -1)) && SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", 1)) 
diff --git a/ext/fts3/fts3Int.h b/ext/fts3/fts3Int.h index e3e18e701b..ceb13ee7d6 100644 --- a/ext/fts3/fts3Int.h +++ b/ext/fts3/fts3Int.h @@ -70,6 +70,8 @@ */ typedef unsigned char u8; /* 1-byte (or larger) unsigned integer */ typedef short int i16; /* 2-byte (or larger) signed integer */ +typedef unsigned int u32; /* 4-byte unsigned integer */ +typedef sqlite3_uint64 u64; /* 8-byte unsigned integer */ /* ** Macro used to suppress compiler warnings for unused parameters. */ @@ -146,6 +148,8 @@ struct Fts3Cursor { char *pNextId; /* Pointer into the body of aDoclist */ char *aDoclist; /* List of docids for full-text queries */ int nDoclist; /* Size of buffer at aDoclist */ + int isMatchinfoOk; /* True when aMatchinfo[] matches iPrevId */ + u32 *aMatchinfo; }; /* @@ -205,12 +209,12 @@ struct Fts3Expr { Fts3Expr *pRight; /* Right operand */ Fts3Phrase *pPhrase; /* Valid if eType==FTSQUERY_PHRASE */ - int isLoaded; - sqlite3_int64 iDocid; - char *aDoclist; - int nDoclist; + int isLoaded; /* True if aDoclist/nDoclist are initialized. 
*/ + char *aDoclist; /* Buffer containing doclist */ + int nDoclist; /* Size of aDoclist in bytes */ + + sqlite3_int64 iCurrent; char *pCurrent; - unsigned int *aHist; }; /* @@ -273,6 +277,9 @@ int sqlite3Fts3GetVarint32(const char *, int *); int sqlite3Fts3VarintLen(sqlite3_uint64); void sqlite3Fts3Dequote(char *); +char *sqlite3Fts3FindPositions(Fts3Expr *, sqlite3_int64, int); +int sqlite3Fts3ExprLoadDoclist(Fts3Table *, Fts3Expr *); + /* fts3_tokenizer.c */ const char *sqlite3Fts3NextToken(const char *, int *); int sqlite3Fts3InitHashTable(sqlite3 *, Fts3Hash *, const char *); @@ -285,6 +292,10 @@ void sqlite3Fts3Offsets(sqlite3_context*, Fts3Cursor*); void sqlite3Fts3Snippet(sqlite3_context*, Fts3Cursor*, const char *, const char *, const char * ); +void sqlite3Fts3Snippet2(sqlite3_context *, Fts3Cursor *, const char *, + const char *, const char *, int, int +); +void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *); /* fts3_expr.c */ int sqlite3Fts3ExprParse(sqlite3_tokenizer *, diff --git a/ext/fts3/fts3_expr.c b/ext/fts3/fts3_expr.c index 0841082962..7542c28a97 100644 --- a/ext/fts3/fts3_expr.c +++ b/ext/fts3/fts3_expr.c @@ -736,7 +736,6 @@ void sqlite3Fts3ExprFree(Fts3Expr *p){ sqlite3Fts3ExprFree(p->pLeft); sqlite3Fts3ExprFree(p->pRight); sqlite3_free(p->aDoclist); - sqlite3_free(p->aHist); sqlite3_free(p); } } diff --git a/ext/fts3/fts3_snippet.c b/ext/fts3/fts3_snippet.c index cdc55aff46..8d3b32f12b 100644 --- a/ext/fts3/fts3_snippet.c +++ b/ext/fts3/fts3_snippet.c @@ -731,4 +731,614 @@ void sqlite3Fts3Snippet( fts3SnippetFree(p); } +/************************************************************************* +** Below this point is the alternative, experimental snippet() implementation. 
+*/ + +#define SNIPPET_BUFFER_CHUNK 64 +#define SNIPPET_BUFFER_SIZE SNIPPET_BUFFER_CHUNK*4 +#define SNIPPET_BUFFER_MASK (SNIPPET_BUFFER_SIZE-1) + +static void fts3GetDeltaPosition(char **pp, int *piPos){ + int iVal; + *pp += sqlite3Fts3GetVarint32(*pp, &iVal); + *piPos += (iVal-2); +} + +/* +** Iterate through all phrase nodes in an FTS3 query, except those that +** are part of a sub-tree that is the right-hand-side of a NOT operator. +** For each phrase node found, the supplied callback function is invoked. +** +** If the callback function returns anything other than SQLITE_OK, +** the iteration is abandoned and the error code returned immediately. +** Otherwise, SQLITE_OK is returned after a callback has been made for +** all eligible phrase nodes. +*/ +static int fts3ExprIterate( + Fts3Expr *pExpr, /* Expression to iterate phrases of */ + int (*x)(Fts3Expr *, void *), /* Callback function to invoke for phrases */ + void *pCtx /* Second argument to pass to callback */ +){ + int rc; + int eType = pExpr->eType; + if( eType==FTSQUERY_NOT ){ + rc = SQLITE_OK; + }else if( eType!=FTSQUERY_PHRASE ){ + assert( pExpr->pLeft && pExpr->pRight ); + rc = fts3ExprIterate(pExpr->pLeft, x, pCtx); + if( rc==SQLITE_OK ){ + rc = fts3ExprIterate(pExpr->pRight, x, pCtx); + } + }else{ + rc = x(pExpr, pCtx); + } + return rc; +} + +typedef struct LoadDoclistCtx LoadDoclistCtx; +struct LoadDoclistCtx { + Fts3Table *pTab; /* FTS3 Table */ + int nPhrase; /* Number of phrases so far */ +}; + +static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, void *ctx){ + int rc = SQLITE_OK; + LoadDoclistCtx *p = (LoadDoclistCtx *)ctx; + p->nPhrase++; + if( pExpr->isLoaded==0 ){ + rc = sqlite3Fts3ExprLoadDoclist(p->pTab, pExpr); + pExpr->isLoaded = 1; + if( rc==SQLITE_OK && pExpr->aDoclist ){ + pExpr->pCurrent = pExpr->aDoclist; + pExpr->pCurrent += sqlite3Fts3GetVarint(pExpr->pCurrent,&pExpr->iCurrent); + } + } + return rc; +} + +static int fts3ExprLoadDoclists(Fts3Cursor *pCsr, int *pnPhrase){ + int rc; 
+ LoadDoclistCtx sCtx = {0, 0}; + sCtx.pTab = (Fts3Table *)pCsr->base.pVtab; + rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx); + *pnPhrase = sCtx.nPhrase; + return rc; +} + +/* +** Each call to this function populates a chunk of a snippet-buffer +** SNIPPET_BUFFER_CHUNK bytes in size. +** +** Return true if the end of the data has been reached (and all subsequent +** calls to fts3LoadSnippetBuffer() with the same arguments will be no-ops), +** or false otherwise. +*/ +static int fts3LoadSnippetBuffer( + int iPos, /* Document token offset to load data for */ + u8 *aBuffer, /* Circular snippet buffer to populate */ + int nList, /* Number of position lists in appList */ + char **apList, /* IN/OUT: nList position list pointers */ + int *aiPrev /* IN/OUT: Previous positions read */ +){ + int i; + int nFin = 0; + + assert( (iPos&(SNIPPET_BUFFER_CHUNK-1))==0 ); + + memset(&aBuffer[iPos&SNIPPET_BUFFER_MASK], 0, SNIPPET_BUFFER_CHUNK); + + for(i=0; i=iPos ){ + aBuffer[iPrev&SNIPPET_BUFFER_MASK] = i+1; + } + if( 0==((*pList)&0xFE) ){ + nFin++; + break; + } + fts3GetDeltaPosition(&pList, &iPrev); + } + + aiPrev[i] = iPrev; + apList[i] = pList; + } + + return (nFin==nList); +} + +typedef struct SnippetCtx SnippetCtx; +struct SnippetCtx { + Fts3Cursor *pCsr; + int iCol; + int iPhrase; + int *aiPrev; + int *anToken; + char **apList; +}; + +static int fts3SnippetFindPositions(Fts3Expr *pExpr, void *ctx){ + SnippetCtx *p = (SnippetCtx *)ctx; + int iPhrase = p->iPhrase++; + char *pCsr; + + p->anToken[iPhrase] = pExpr->pPhrase->nToken; + pCsr = sqlite3Fts3FindPositions(pExpr, p->pCsr->iPrevId, p->iCol); + + if( pCsr ){ + int iVal; + pCsr += sqlite3Fts3GetVarint32(pCsr, &iVal); + p->apList[iPhrase] = pCsr; + p->aiPrev[iPhrase] = iVal-2; + } + return SQLITE_OK; +} + +static void fts3SnippetCnt( + int iIdx, + int nSnippet, + int *anCnt, + u8 *aBuffer, + int *anToken, + u64 *pHlmask +){ + int iSub = (iIdx-1)&SNIPPET_BUFFER_MASK; + int iAdd = 
(iIdx+nSnippet-1)&SNIPPET_BUFFER_MASK; + int iSub2 = (iIdx+(nSnippet/3)-1)&SNIPPET_BUFFER_MASK; + int iAdd2 = (iIdx+(nSnippet*2/3)-1)&SNIPPET_BUFFER_MASK; + + u64 h = *pHlmask; + + anCnt[ aBuffer[iSub] ]--; + anCnt[ aBuffer[iSub2] ]--; + anCnt[ aBuffer[iAdd] ]++; + anCnt[ aBuffer[iAdd2] ]++; + + h = h >> 1; + if( aBuffer[iAdd] ){ + int j; + for(j=anToken[aBuffer[iAdd]-1]; j>=1; j--){ + h |= (u64)1 << (nSnippet-j); + } + } + *pHlmask = h; +} + +static int fts3SnippetScore(int n, int *anCnt){ + int j; + int iScore = 0; + for(j=1; j<=n; j++){ + int nCnt = anCnt[j]; + iScore += nCnt + (nCnt ? 1000 : 0); + } + return iScore; +} + +static int fts3BestSnippet( + int nSnippet, /* Desired snippet length */ + Fts3Cursor *pCsr, /* Cursor to create snippet for */ + int iCol, /* Index of column to create snippet from */ + int *piPos, /* OUT: Starting token for best snippet */ + u64 *pHlmask /* OUT: Highlight mask for best snippet */ +){ + int rc; /* Return Code */ + u8 aBuffer[SNIPPET_BUFFER_SIZE];/* Circular snippet buffer */ + int *aiPrev; /* Used by fts3LoadSnippetBuffer() */ + int *anToken; /* Number of tokens in each phrase */ + char **apList; /* Array of position lists */ + int *anCnt; /* Running totals of phrase occurrences */ + int nList; + + int i; + + u64 hlmask = 0; /* Current mask of highlighted terms */ + u64 besthlmask = 0; /* Mask of highlighted terms for iBestPos */ + int iBestPos = 0; /* Starting position of 'best' snippet */ + int iBestScore = 0; /* Score of best snippet higher->better */ + SnippetCtx sCtx; + + /* Iterate through the phrases in the expression to count them. The same + ** callback makes sure the doclists are loaded for each phrase. + */ + rc = fts3ExprLoadDoclists(pCsr, &nList); + if( rc!=SQLITE_OK ){ + return rc; + } + + /* Now that it is known how many phrases there are, allocate and zero + ** the required arrays using malloc(). 
+ */ + apList = sqlite3_malloc( + sizeof(u8*)*nList + /* apList */ + sizeof(int)*(nList) + /* anToken */ + sizeof(int)*nList + /* aiPrev */ + sizeof(int)*(nList+1) /* anCnt */ + ); + if( !apList ){ + return SQLITE_NOMEM; + } + memset(apList, 0, sizeof(u8*)*nList+sizeof(int)*nList+sizeof(int)*nList); + anToken = (int *)&apList[nList]; + aiPrev = &anToken[nList]; + anCnt = &aiPrev[nList]; + + /* Initialize the contents of the apList, aiPrev and anToken arrays. */ + sCtx.pCsr = pCsr; + sCtx.iCol = iCol; + sCtx.apList = apList; + sCtx.aiPrev = aiPrev; + sCtx.anToken = anToken; + sCtx.iPhrase = 0; + (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sCtx); + + /* Load the first two chunks of data into the buffer. */ + memset(aBuffer, 0, SNIPPET_BUFFER_SIZE); + fts3LoadSnippetBuffer(0, aBuffer, nList, apList, aiPrev); + fts3LoadSnippetBuffer(SNIPPET_BUFFER_CHUNK, aBuffer, nList, apList, aiPrev); + + /* Set the initial contents of the highlight-mask and anCnt[] array. */ + for(i=1-nSnippet; i<=0; i++){ + fts3SnippetCnt(i, nSnippet, anCnt, aBuffer, anToken, &hlmask); + } + iBestScore = fts3SnippetScore(nList, anCnt); + besthlmask = hlmask; + iBestPos = 0; + + for(i=1; 1; i++){ + int iScore; + + if( 0==(i&(SNIPPET_BUFFER_CHUNK-1)) ){ + int iLoad = i + SNIPPET_BUFFER_CHUNK; + if( fts3LoadSnippetBuffer(iLoad, aBuffer, nList, apList, aiPrev) ) break; + } + + /* Figure out how highly a snippet starting at token offset i scores + ** according to fts3SnippetScore(). If it is higher than any previously + ** considered position, save the current position, score and hlmask as + ** the best snippet candidate found so far. 
+ */ + fts3SnippetCnt(i, nSnippet, anCnt, aBuffer, anToken, &hlmask); + iScore = fts3SnippetScore(nList, anCnt); + if( iScore>iBestScore ){ + iBestPos = i; + iBestScore = iScore; + besthlmask = hlmask; + } + } + + sqlite3_free(apList); + *piPos = iBestPos; + *pHlmask = besthlmask; + return SQLITE_OK; +} + +typedef struct StrBuffer StrBuffer; +struct StrBuffer { + char *z; + int n; + int nAlloc; +}; + +static int fts3StringAppend( + StrBuffer *pStr, + const char *zAppend, + int nAppend +){ + if( nAppend<0 ){ + nAppend = strlen(zAppend); + } + + if( pStr->n+nAppend+1>=pStr->nAlloc ){ + int nAlloc = pStr->nAlloc+nAppend+100; + char *zNew = sqlite3_realloc(pStr->z, nAlloc); + if( !zNew ){ + return SQLITE_NOMEM; + } + pStr->z = zNew; + pStr->nAlloc = nAlloc; + } + + memcpy(&pStr->z[pStr->n], zAppend, nAppend); + pStr->n += nAppend; + pStr->z[pStr->n] = '\0'; + + return SQLITE_OK; +} + +static int fts3SnippetText( + Fts3Cursor *pCsr, /* FTS3 Cursor */ + const char *zDoc, /* Document to extract snippet from */ + int nDoc, /* Size of zDoc in bytes */ + int nSnippet, /* Number of tokens in extracted snippet */ + int iPos, /* Index of first document token in snippet */ + u64 hlmask, /* Bitmask of terms to highlight in snippet */ + const char *zOpen, /* String inserted before highlighted term */ + const char *zClose, /* String inserted after highlighted term */ + const char *zEllipsis, + char **pzSnippet /* OUT: Snippet text */ +){ + Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; + int rc; /* Return code */ + int iCurrent = 0; + int iStart = 0; + int iEnd; + + sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */ + sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */ + const char *ZDUMMY; /* Dummy arguments used with tokenizer */ + int DUMMY1, DUMMY2, DUMMY3; /* Dummy arguments used with tokenizer */ + + StrBuffer res = {0, 0, 0}; /* Result string */ + + /* Open a token cursor on the document. 
Read all tokens up to and + ** including token iPos (the first token of the snippet). Set variable + ** iStart to the byte offset in zDoc of the start of token iPos. + */ + pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; + rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); + while( rc==SQLITE_OK && iCurrentxNext(pC, &ZDUMMY, &DUMMY1, &iStart, &DUMMY2, &iCurrent); + } + iEnd = iStart; + + if( rc==SQLITE_OK && iStart>0 ){ + rc = fts3StringAppend(&res, zEllipsis, -1); + } + + while( rc==SQLITE_OK ){ + int iBegin; + int iFin; + rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent); + + if( rc==SQLITE_OK ){ + if( iCurrent>=(iPos+nSnippet) ){ + rc = SQLITE_DONE; + }else{ + iEnd = iFin; + if( hlmask & ((u64)1 << (iCurrent-iPos)) ){ + if( fts3StringAppend(&res, &zDoc[iStart], iBegin-iStart) + || fts3StringAppend(&res, zOpen, -1) + || fts3StringAppend(&res, &zDoc[iBegin], iEnd-iBegin) + || fts3StringAppend(&res, zClose, -1) + ){ + rc = SQLITE_NOMEM; + } + iStart = iEnd; + } + } + } + } + assert( rc!=SQLITE_OK ); + if( rc==SQLITE_DONE ){ + rc = fts3StringAppend(&res, &zDoc[iStart], iEnd-iStart); + if( rc==SQLITE_OK ){ + rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent); + if( rc==SQLITE_OK ){ + rc = fts3StringAppend(&res, zEllipsis, -1); + }else if( rc==SQLITE_DONE ){ + rc = fts3StringAppend(&res, &zDoc[iEnd], -1); + } + } + } + + pMod->xClose(pC); + if( rc!=SQLITE_OK ){ + sqlite3_free(res.z); + }else{ + *pzSnippet = res.z; + } + return rc; +} + + +/* +** An instance of this structure is used to collect the 'global' part of +** the matchinfo statistics. The 'global' part consists of the following: +** +** 1. The number of phrases in the query (nPhrase). +** +** 2. The number of columns in the FTS3 table (nCol). +** +** 3. A matrix of (nPhrase*nCol) integers containing the sum of the +** number of hits for each phrase in each column across all rows +** of the table. 
+** +** The total size of the global matchinfo array, assuming the number of +** columns is N and the number of phrases is P is: +** +** 2 + P*N +** +** The number of hits for the 3rd phrase in the second column is found +** using the expression: +** +** aGlobal[2 + N*2 + 1] +*/ +typedef struct MatchInfo MatchInfo; +struct MatchInfo { + Fts3Table *pTab; /* FTS3 Table */ + Fts3Cursor *pCursor; /* FTS3 Cursor */ + int iPhrase; /* Number of phrases so far */ + int nCol; /* Number of columns in table */ + u32 *aGlobal; /* Pre-allocated buffer */ +}; + +/* +** This function is used to count the entries in a column-list (delta-encoded +** list of term offsets within a single column of a single row). +*/ +static int fts3ColumnlistCount(char **ppCollist){ + char *pEnd = *ppCollist; + char c = 0; + int nEntry = 0; + + /* A column-list is terminated by either a 0x01 or 0x00. */ + while( 0xFE & (*pEnd | c) ){ + c = *pEnd++ & 0x80; + if( !c ) nEntry++; + } + + *ppCollist = pEnd; + return nEntry; +} + +static void fts3LoadColumnlistCounts(char **pp, u32 *aOut){ + char *pCsr = *pp; + while( *pCsr ){ + sqlite3_int64 iCol = 0; + if( *pCsr==0x01 ){ + pCsr++; + pCsr += sqlite3Fts3GetVarint(pCsr, &iCol); + } + aOut[iCol] += fts3ColumnlistCount(&pCsr); + } + pCsr++; + *pp = pCsr; +} + +/* +** fts3ExprIterate() callback used to collect the "global" matchinfo stats +** for a single query. +*/ +static int fts3ExprGlobalMatchinfoCb( + Fts3Expr *pExpr, /* Phrase expression node */ + void *pCtx /* Pointer to MatchInfo structure */ +){ + MatchInfo *p = (MatchInfo *)pCtx; + char *pCsr; + char *pEnd; + const int iStart = 2 + p->nCol*p->iPhrase; + + assert( pExpr->isLoaded ); + + /* Fill in the global hit count matrix row for this phrase. 
*/ + pCsr = pExpr->aDoclist; + pEnd = &pExpr->aDoclist[pExpr->nDoclist]; + while( pCsraGlobal[iStart]); + } + + p->iPhrase++; + return SQLITE_OK; +} + +static int fts3ExprLocalMatchinfoCb( + Fts3Expr *pExpr, /* Phrase expression node */ + void *pCtx /* Pointer to MatchInfo structure */ +){ + MatchInfo *p = (MatchInfo *)pCtx; + int iPhrase = p->iPhrase++; + + if( pExpr->aDoclist ){ + char *pCsr; + int iOffset = 2 + p->nCol*(p->aGlobal[0]+iPhrase); + + memset(&p->aGlobal[iOffset], 0, p->nCol*sizeof(u32)); + pCsr = sqlite3Fts3FindPositions(pExpr, p->pCursor->iPrevId, -1); + if( pCsr ) fts3LoadColumnlistCounts(&pCsr, &p->aGlobal[iOffset]); + } + + return SQLITE_OK; +} + +/* +** Populate pCsr->aMatchinfo[] with data for the current row. The 'matchinfo' +** data is an array of 32-bit unsigned integers (C type u32). +*/ +static int fts3GetMatchinfo(Fts3Cursor *pCsr){ + MatchInfo g; + Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; + if( pCsr->aMatchinfo==0 ){ + int rc; + int nPhrase; + int nMatchinfo; + + g.pTab = pTab; + g.nCol = pTab->nColumn; + g.iPhrase = 0; + rc = fts3ExprLoadDoclists(pCsr, &nPhrase); + if( rc!=SQLITE_OK ){ + return rc; + } + + nMatchinfo = 2 + 2*g.nCol*nPhrase; + + g.iPhrase = 0; + g.aGlobal = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo); + if( !g.aGlobal ){ + return SQLITE_NOMEM; + } + memset(g.aGlobal, 0, sizeof(u32)*nMatchinfo); + + g.aGlobal[0] = nPhrase; + g.aGlobal[1] = g.nCol; + (void)fts3ExprIterate(pCsr->pExpr, fts3ExprGlobalMatchinfoCb, (void *)&g); + + pCsr->aMatchinfo = g.aGlobal; + } + + g.pTab = pTab; + g.pCursor = pCsr; + g.nCol = pTab->nColumn; + g.iPhrase = 0; + g.aGlobal = pCsr->aMatchinfo; + + if( pCsr->isMatchinfoOk ){ + (void)fts3ExprIterate(pCsr->pExpr, fts3ExprLocalMatchinfoCb, (void *)&g); + pCsr->isMatchinfoOk = 0; + } + + return SQLITE_OK; +} + +void sqlite3Fts3Snippet2( + sqlite3_context *pCtx, /* SQLite function call context */ + Fts3Cursor *pCsr, /* Cursor object */ + const char *zStart, /* Snippet start text - "" */ + 
const char *zEnd, /* Snippet end text - "" */ + const char *zEllipsis, /* Snippet ellipsis text - "..." */ + int iCol, /* Extract snippet from this column */ + int nToken /* Approximate number of tokens in snippet */ +){ + int rc; + int iPos = 0; + u64 hlmask = 0; + char *z = 0; + int nDoc; + const char *zDoc; + + rc = fts3BestSnippet(nToken, pCsr, iCol, &iPos, &hlmask); + + nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1); + zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1); + + if( rc==SQLITE_OK ){ + rc = fts3SnippetText( + pCsr, zDoc, nDoc, nToken, iPos, hlmask, zStart, zEnd, zEllipsis, &z); + } + if( rc!=SQLITE_OK ){ + sqlite3_result_error_code(pCtx, rc); + }else{ + sqlite3_result_text(pCtx, z, -1, sqlite3_free); + } +} + +void sqlite3Fts3Matchinfo(sqlite3_context *pContext, Fts3Cursor *pCsr){ + int rc = fts3GetMatchinfo(pCsr); + if( rc!=SQLITE_OK ){ + sqlite3_result_error_code(pContext, rc); + }else{ + int n = sizeof(u32)*(2+pCsr->aMatchinfo[0]*pCsr->aMatchinfo[1]*2); + sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT); + } +} + #endif diff --git a/manifest b/manifest index 8b046ed9e5..ed324b541d 100644 --- a/manifest +++ b/manifest @@ -1,8 +1,5 @@ ------BEGIN PGP SIGNED MESSAGE----- -Hash: SHA1 - -C Fix\sa\sbug\sin\sthe\snew\ssqlite3_test_control\scase\sof\sthe\sprevious\scheck-in. -D 2010-01-02T03:46:44 +C Add\sexperimental\simplementation\sof\sFTS3\sfunctions\smatchinfo()\sand\ssnippet()\s(not\senabled\sby\sdefault). 
+D 2010-01-02T19:02:02 F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0 F Makefile.in c5827ead754ab32b9585487177c93bb00b9497b3 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 @@ -59,15 +56,15 @@ F ext/fts2/mkfts2amal.tcl 974d5d438cb3f7c4a652639262f82418c1e4cff0 F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a F ext/fts3/README.tokenizers 998756696647400de63d5ba60e9655036cb966e9 F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d -F ext/fts3/fts3.c ac757a4561401c5b3cea6e387e758503ec4770b5 +F ext/fts3/fts3.c 15fb87c1f00dfd88c2fbbbd9e50f319ea77834f0 F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe -F ext/fts3/fts3Int.h 0d7c8d66ff9be8c79710438a46a7d046fcdedfc2 -F ext/fts3/fts3_expr.c 541de159278cfa694c584c763d23c3e23d796851 +F ext/fts3/fts3Int.h 9326800fa10e06d8e9d6d519f873b1371252968a +F ext/fts3/fts3_expr.c f4ff02ebe854e97ac03ff00b38b728a9ab57fd4b F ext/fts3/fts3_hash.c 3c8f6387a4a7f5305588b203fa7c887d753e1f1c F ext/fts3/fts3_hash.h 8331fb2206c609f9fc4c4735b9ab5ad6137c88ec F ext/fts3/fts3_icu.c ac494aed69835008185299315403044664bda295 F ext/fts3/fts3_porter.c a651e287e02b49b565a6ccf9441959d434489156 -F ext/fts3/fts3_snippet.c 6c2eb6d872d66b2a9aa5663f2662e993f18a6496 +F ext/fts3/fts3_snippet.c a11d9f293eb92fb07ac26a994d0aa2bc35fe1c2a F ext/fts3/fts3_tokenizer.c 1a49ee3d79cbf0b9386250370d9cbfe4bb89c8ff F ext/fts3/fts3_tokenizer.h 13ffd9fcb397fec32a05ef5cd9e0fa659bf3dbd3 F ext/fts3/fts3_tokenizer1.c 11a604a53cff5e8c28882727bf794e5252e5227b @@ -406,8 +403,8 @@ F test/fts3expr.test 05dab77387801e4900009917bb18f556037d82da F test/fts3expr2.test 18da930352e5693eaa163a3eacf96233b7290d1a F test/fts3malloc.test d02ee86b21edd2b43044e0d6dfdcd26cb6efddcb F test/fts3near.test dc196dd17b4606f440c580d45b3d23aa975fd077 -F test/fts3query.test 2cba25181dac298abc10c3086a88b308f90a93c4 -F test/fts3rnd.test 654daa6206f9d63ed3388858c60bba3fd4004a5f +F test/fts3query.test ca21717993f51caa7e36231dba2499868f3f8a6f 
+F test/fts3rnd.test 153b4214bad6084a348814f3dd651a92e2f31d9b F test/func.test af106ed834001738246d276659406823e35cde7b F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f F test/fuzz.test a4174c3009a3e2c2e14b31b364ebf7ddb49de2c9 @@ -786,14 +783,7 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224 F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f -P d3cdc4b12be7f1ed2249ad210482200868956d12 -R 047d1c5bb153bc040a549a907f16107f -U drh -Z 6dc4f01f89c08e391eb9a66f4b2613a9 ------BEGIN PGP SIGNATURE----- -Version: GnuPG v1.4.6 (GNU/Linux) - -iD8DBQFLPsGnoxKgR168RlERAkrWAJ0ZGJJy+5KFTMkbrfz90FsJmwxY5QCeN3vh -oKIjY83kRZfkqbPt0/cbgkk= -=58hz ------END PGP SIGNATURE----- +P 3b77701bc854997346e9cc33fe64d00d4b6332bd +R 352836846f02a941ba33453a9ca0d839 +U dan +Z 0ec00194a72e0933026761576b69b758 diff --git a/manifest.uuid b/manifest.uuid index cb61a85891..7d99a01c9d 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -3b77701bc854997346e9cc33fe64d00d4b6332bd \ No newline at end of file +51f7ee844057086789dcfcdcba7daf45343cae62 \ No newline at end of file diff --git a/test/fts3query.test b/test/fts3query.test index 8b10f24775..2a4d6e720e 100644 --- a/test/fts3query.test +++ b/test/fts3query.test @@ -99,22 +99,7 @@ db func mit mit do_test fts3query-3.3 { execsql { SELECT mit(matchinfo(foobar)) FROM foobar WHERE foobar MATCH 'the' } -} {{1 1 3}} -do_test fts3query-3.4 { - execsql { - SELECT mit(matchinfo(foobar, 'g')) FROM foobar WHERE foobar MATCH 'the' - } } {{1 1 3 3}} -do_test fts3query-3.5 { - execsql { - SELECT mit(matchinfo(foobar, 'p')) FROM foobar WHERE foobar MATCH 'the' - } -} {{1 1 3 27 74 79}} -do_test fts3query-3.5 { - execsql { - SELECT mit(matchinfo(foobar, 'pg')) FROM foobar WHERE foobar MATCH 'the' - } -} {{1 1 3 3 27 74 79}} finish_test diff --git a/test/fts3rnd.test b/test/fts3rnd.test index 
a527214b18..c319d35d69 100644 --- a/test/fts3rnd.test +++ b/test/fts3rnd.test @@ -160,18 +160,24 @@ proc simple_phrase {zPrefix} { } proc simple_token_matchinfo {zToken} { + set total(0) 0 + set total(1) 0 + set total(2) 0 + foreach key [lsort -integer [array names ::t1]] { set value $::t1($key) set cnt [list] - foreach col $value { - lappend cnt [llength [lsearch -all $col $zToken]] + foreach i {0 1 2} col $value { + set n [llength [lsearch -all $col $zToken]] + lappend cnt $n + incr total($i) $n } if {[lindex [lsort $cnt] end]} { - lappend ret $key [concat 1 3 $cnt] + lappend ret $key [concat 1 3 XXX $cnt] } } - set ret + string map [list XXX "$total(0) $total(1) $total(2)"] $ret } proc simple_near {termlist nNear} {