/* ** 2009 Oct 23 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ****************************************************************************** */ #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) #include "fts3Int.h" #include #include #include typedef struct Snippet Snippet; /* ** An instance of the following structure keeps track of generated ** matching-word offset information and snippets. */ struct Snippet { int nMatch; /* Total number of matches */ int nAlloc; /* Space allocated for aMatch[] */ struct snippetMatch { /* One entry for each matching term */ char snStatus; /* Status flag for use while constructing snippets */ short int nByte; /* Number of bytes in the term */ short int iCol; /* The column that contains the match */ short int iTerm; /* The index in Query.pTerms[] of the matching term */ int iToken; /* The index of the matching document token */ int iStart; /* The offset to the first character of the term */ } *aMatch; /* Points to space obtained from malloc */ char *zOffset; /* Text rendering of aMatch[] */ int nOffset; /* strlen(zOffset) */ char *zSnippet; /* Snippet text */ int nSnippet; /* strlen(zSnippet) */ }; /* It is not safe to call isspace(), tolower(), or isalnum() on ** hi-bit-set characters. This is the same solution used in the ** tokenizer. */ static int fts3snippetIsspace(char c){ return (c&0x80)==0 ? isspace(c) : 0; } /* ** A StringBuffer object holds a zero-terminated string that grows ** arbitrarily by appending. Space to hold the string is obtained ** from sqlite3_malloc(). After any memory allocation failure, ** StringBuffer.z is set to NULL and no further allocation is attempted. */ typedef struct StringBuffer { char *z; /* Text of the string. Space from malloc. 
*/ int nUsed; /* Number bytes of z[] used, not counting \000 terminator */ int nAlloc; /* Bytes allocated for z[] */ } StringBuffer; /* ** Initialize a new StringBuffer. */ static void fts3SnippetSbInit(StringBuffer *p){ p->nAlloc = 100; p->nUsed = 0; p->z = sqlite3_malloc( p->nAlloc ); } /* ** Append text to the string buffer. */ static void fts3SnippetAppend(StringBuffer *p, const char *zNew, int nNew){ if( p->z==0 ) return; if( nNew<0 ) nNew = (int)strlen(zNew); if( p->nUsed + nNew >= p->nAlloc ){ int nAlloc; char *zNew; nAlloc = p->nUsed + nNew + p->nAlloc; zNew = sqlite3_realloc(p->z, nAlloc); if( zNew==0 ){ sqlite3_free(p->z); p->z = 0; return; } p->z = zNew; p->nAlloc = nAlloc; } memcpy(&p->z[p->nUsed], zNew, nNew); p->nUsed += nNew; p->z[p->nUsed] = 0; } /* If the StringBuffer ends in something other than white space, add a ** single space character to the end. */ static void fts3SnippetAppendWhiteSpace(StringBuffer *p){ if( p->z && p->nUsed && !fts3snippetIsspace(p->z[p->nUsed-1]) ){ fts3SnippetAppend(p, " ", 1); } } /* Remove white space from the end of the StringBuffer */ static void fts3SnippetTrimWhiteSpace(StringBuffer *p){ if( p->z ){ while( p->nUsed && fts3snippetIsspace(p->z[p->nUsed-1]) ){ p->nUsed--; } p->z[p->nUsed] = 0; } } /* ** Release all memory associated with the Snippet structure passed as ** an argument. */ static void fts3SnippetFree(Snippet *p){ if( p ){ sqlite3_free(p->aMatch); sqlite3_free(p->zOffset); sqlite3_free(p->zSnippet); sqlite3_free(p); } } /* ** Append a single entry to the p->aMatch[] log. 
*/ static int snippetAppendMatch( Snippet *p, /* Append the entry to this snippet */ int iCol, int iTerm, /* The column and query term */ int iToken, /* Matching token in document */ int iStart, int nByte /* Offset and size of the match */ ){ int i; struct snippetMatch *pMatch; if( p->nMatch+1>=p->nAlloc ){ struct snippetMatch *pNew; p->nAlloc = p->nAlloc*2 + 10; pNew = sqlite3_realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); if( pNew==0 ){ p->aMatch = 0; p->nMatch = 0; p->nAlloc = 0; return SQLITE_NOMEM; } p->aMatch = pNew; } i = p->nMatch++; pMatch = &p->aMatch[i]; pMatch->iCol = (short)iCol; pMatch->iTerm = (short)iTerm; pMatch->iToken = iToken; pMatch->iStart = iStart; pMatch->nByte = (short)nByte; return SQLITE_OK; } /* ** Sizing information for the circular buffer used in snippetOffsetsOfColumn() */ #define FTS3_ROTOR_SZ (32) #define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1) /* ** Function to iterate through the tokens of a compiled expression. ** ** Except, skip all tokens on the right-hand side of a NOT operator. ** This function is used to find tokens as part of snippet and offset ** generation and we do nt want snippets and offsets to report matches ** for tokens on the RHS of a NOT. */ static int fts3NextExprToken(Fts3Expr **ppExpr, int *piToken){ Fts3Expr *p = *ppExpr; int iToken = *piToken; if( iToken<0 ){ /* In this case the expression p is the root of an expression tree. ** Move to the first token in the expression tree. */ while( p->pLeft ){ p = p->pLeft; } iToken = 0; }else{ assert(p && p->eType==FTSQUERY_PHRASE ); if( iToken<(p->pPhrase->nToken-1) ){ iToken++; }else{ iToken = 0; while( p->pParent && p->pParent->pLeft!=p ){ assert( p->pParent->pRight==p ); p = p->pParent; } p = p->pParent; if( p ){ assert( p->pRight!=0 ); p = p->pRight; while( p->pLeft ){ p = p->pLeft; } } } } *ppExpr = p; *piToken = iToken; return p?1:0; } /* ** Return TRUE if the expression node pExpr is located beneath the ** RHS of a NOT operator. 
*/
static int fts3ExprBeneathNot(Fts3Expr *p){
  Fts3Expr *pParent;
  /* Walk from p towards the root.  At each step, check whether the node
  ** we came from is the right child of a NOT node. */
  while( p ){
    pParent = p->pParent;
    if( pParent && pParent->eType==FTSQUERY_NOT && pParent->pRight==p ){
      return 1;
    }
    p = pParent;
  }
  return 0;
}

/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
**
** The document is re-tokenized with the table's tokenizer.  Returns
** SQLITE_OK once the tokenizer reports SQLITE_DONE, otherwise the error
** code returned by the tokenizer or by snippetAppendMatch().
*/
static int snippetOffsetsOfColumn(
  Fts3Cursor *pCur,         /* The full-text search cursor */
  Snippet *pSnippet,        /* The Snippet object to be filled in */
  int iColumn,              /* Index of fulltext table column */
  const char *zDoc,         /* Text of the fulltext table column */
  int nDoc                  /* Length of zDoc in bytes */
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  Fts3Table *pVtab;                /* The full text index */
  int nColumn;                     /* Number of columns in the index */
  int i, j;                        /* Loop counters */
  int rc;                          /* Return code */
  unsigned int match, prevMatch;   /* Phrase search bitmasks */
  const char *zToken;              /* Next token from the tokenizer */
  int nToken;                      /* Size of zToken */
  int iBegin, iEnd, iPos;          /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS3_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS3_ROTOR_SZ];        /* Length of token */

  pVtab = (Fts3Table *)pCur->base.pVtab;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return rc;
  /* The cursor's back-pointer to its tokenizer is filled in here, by the
  ** caller of xOpen(). */
  pTCursor->pTokenizer = pTokenizer;

  prevMatch = 0;
  while( (rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos))==SQLITE_OK ){
    Fts3Expr *pIter = pCur->pExpr;
    int iIter = -1;
    /* Remember where the current document token starts and how long it
    ** is, so that earlier tokens of a multi-token phrase can be logged
    ** once the final token of the phrase is seen. */
    iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    /* Compare the document token against each query token in turn; i is
    ** the running index of the query token. */
    for(i=0; i<(FTS3_ROTOR_SZ-1) && fts3NextExprToken(&pIter, &iIter); i++){
      int nPhrase;                    /* Number of tokens in current phrase */
      struct PhraseToken *pToken;     /* Current token */
      int iCol;                       /* Column index */

      /* Tokens on the RHS of a NOT never produce offsets. */
      if( fts3ExprBeneathNot(pIter) ) continue;
      nPhrase = pIter->pPhrase->nToken;
      pToken = &pIter->pPhrase->aToken[iIter];
      iCol = pIter->pPhrase->iColumn;
      /* NOTE(review): the next three statements are corrupted in this copy
      ** of the file.  Text between a '<' and a later '>' appears to have
      ** been stripped (as if by an HTML-tag filter), fusing unrelated
      ** tokens ("iColn", "nn<=nToken", "(1<=0; j--)").  The column filter,
      ** the token-length checks, the prevMatch phrase-continuity test, the
      ** update of "match" and the header of the loop over j are therefore
      ** incomplete.  Restore them from a pristine copy of the source; do
      ** not attempt to compile this function as it stands. */
      if( iCol>=0 && iColn>nToken ) continue;
      if( !pToken->isPrefix && pToken->nn<=nToken );
      if( memcmp(pToken->z, zToken, pToken->n) ) continue;
      if( iIter>0 && (prevMatch & (1<=0; j--){
          /* k indexes the rotor slot of the document token j positions
          ** before the current one. */
          int k = (iRotor-j) & FTS3_ROTOR_MASK;
          rc = snippetAppendMatch(pSnippet, iColumn, i-j, iPos-j, iRotorBegin[k], iRotorLen[k]);
          if( rc ) goto end_offsets_of_column;
        }
      }
    }
    /* Shift the mask so that bit i+1 records that query token i matched
    ** the document token just processed. */
    prevMatch = match<<1;
    iRotor++;
  }
end_offsets_of_column:
  pTModule->xClose(pTCursor);
  return rc==SQLITE_DONE ? SQLITE_OK : rc;
}

/*
** Remove entries from the pSnippet structure to account for the NEAR
** operator. When this is called, pSnippet contains the list of token
** offsets produced by treating all NEAR operators as AND operators.
** This function removes any entries that should not be present after
** accounting for the NEAR restriction. For example, if the queried
** document is:
**
**     "A B C D E A"
**
** and the query is:
**
**     A NEAR/0 E
**
** then when this function is called the Snippet contains token offsets
** 0, 4 and 5. This function removes the "0" entry (because the first A
** is not near enough to an E).
**
** When this function is called, the value pointed to by parameter piLeft is
** the integer id of the left-most token in the expression tree headed by
** pExpr. This function increments *piLeft by the total number of tokens
** in the expression tree headed by pExpr.
**
** Return 1 if any trimming occurs.  Return 0 if no trimming is required.
*/
static int trimSnippetOffsets(
  Fts3Expr *pExpr,      /* The search expression */
  Snippet *pSnippet,    /* The set of snippet offsets to be trimmed */
  int *piLeft           /* Index of left-most token in pExpr */
){
  /* NOTE(review): several loop conditions in this function are corrupted
  ** in this copy of the file ("iinMatch", "jjnMatch", "kkpPhrase"): the
  ** text from a '<' up to the following '>' appears to have been stripped.
  ** The intended bounds look like "<pSnippet->nMatch" and
  ** "<pRight->pPhrase->nToken" / "<pLeft->pPhrase->nToken", but confirm
  ** against a pristine copy before relying on that. */
  if( pExpr ){
    /* In-order traversal: left subtree, this node, right subtree.  The
    ** traversal stops as soon as any entry has been trimmed. */
    if( trimSnippetOffsets(pExpr->pLeft, pSnippet, piLeft) ){
      return 1;
    }

    switch( pExpr->eType ){
      case FTSQUERY_PHRASE:
        *piLeft += pExpr->pPhrase->nToken;
        break;
      case FTSQUERY_NEAR: {
        /* The right-hand-side of a NEAR operator is always a phrase. The
        ** left-hand-side is either a phrase or an expression tree that is
        ** itself headed by a NEAR operator. The following initializations
        ** set local variable iLeft to the token number of the left-most
        ** token in the right-hand phrase, and iRight to the right most
        ** token in the same phrase. For example, if we had:
        **
        **     MATCH '"abc def" NEAR/2 "ghi jkl"'
        **
        ** then iLeft will be set to 2 (token number of ghi) and nToken will
        ** be set to 4.
        */
        Fts3Expr *pLeft = pExpr->pLeft;
        Fts3Expr *pRight = pExpr->pRight;
        int iLeft = *piLeft;
        int nNear = pExpr->nNear;
        int nToken = pRight->pPhrase->nToken;
        int jj, ii;
        if( pLeft->eType==FTSQUERY_NEAR ){
          pLeft = pLeft->pRight;
        }
        assert( pRight->eType==FTSQUERY_PHRASE );
        assert( pLeft->eType==FTSQUERY_PHRASE );
        nToken += pLeft->pPhrase->nToken;

        for(ii=0; iinMatch; ii++){
          struct snippetMatch *p = &pSnippet->aMatch[ii];
          if( p->iTerm==iLeft ){
            int isOk = 0;
            /* Snippet ii is an occurrence of query term iLeft in the document.
            ** It occurs at position (p->iToken) of the document. We now
            ** search for an instance of token (iLeft-1) somewhere in the
            ** range (p->iToken - nNear)...(p->iToken + nNear + nToken) within
            ** the set of snippetMatch structures. If one is found, proceed.
            ** If one cannot be found, then remove snippets ii..(ii+N-1)
            ** from the matching snippets, where N is the number of tokens
            ** in phrase pRight->pPhrase.
            */
            for(jj=0; isOk==0 && jjnMatch; jj++){
              struct snippetMatch *p2 = &pSnippet->aMatch[jj];
              if( p2->iTerm==(iLeft-1) ){
                if( p2->iToken>=(p->iToken-nNear-1)
                 && p2->iToken<(p->iToken+nNear+nToken)
                ){
                  isOk = 1;
                }
              }
            }
            if( !isOk ){
              int kk;
              /* Entries are discarded by setting iTerm to -2 rather than
              ** by removing them from the array. */
              for(kk=0; kkpPhrase->nToken; kk++){
                pSnippet->aMatch[kk+ii].iTerm = -2;
              }
              return 1;
            }
          }
          if( p->iTerm==(iLeft-1) ){
            /* Mirror image of the case above: entry ii is an occurrence of
            ** term (iLeft-1); look for a nearby occurrence of term iLeft. */
            int isOk = 0;
            for(jj=0; isOk==0 && jjnMatch; jj++){
              struct snippetMatch *p2 = &pSnippet->aMatch[jj];
              if( p2->iTerm==iLeft ){
                if( p2->iToken<=(p->iToken+nNear+1)
                 && p2->iToken>(p->iToken-nNear-nToken)
                ){
                  isOk = 1;
                }
              }
            }
            if( !isOk ){
              int kk;
              for(kk=0; kkpPhrase->nToken; kk++){
                pSnippet->aMatch[ii-kk].iTerm = -2;
              }
              return 1;
            }
          }
        }
        break;
      }
    }

    if( trimSnippetOffsets(pExpr->pRight, pSnippet, piLeft) ){
      return 1;
    }
  }
  return 0;
}

/*
** Compute all offsets for the current row of the query.
**
** On success a newly allocated Snippet is stored in *ppSnippet; the
** caller releases it with fts3SnippetFree().  Returns SQLITE_OK,
** SQLITE_NOMEM, or an error code from snippetOffsetsOfColumn().
**
** NOTE: if the cursor has no expression (pCsr->pExpr==0) this returns
** SQLITE_OK without writing to *ppSnippet at all, so callers must
** initialize their pointer before the call.  If the allocation fails,
** *ppSnippet is set to NULL.  If a later step fails, *ppSnippet still
** points at the partially filled Snippet.
*/
static int snippetAllOffsets(Fts3Cursor *pCsr, Snippet **ppSnippet){
  Fts3Table *p = (Fts3Table *)pCsr->base.pVtab;  /* The FTS3 virtual table */
  int nColumn;           /* Number of columns.  Docid does count */
  int iColumn;           /* Index of a column */
  int i;                 /* Loop index */
  int iFirst;            /* First column to search */
  int iLast;             /* Last column to search */
  int iTerm = 0;         /* Running token id passed to trimSnippetOffsets() */
  Snippet *pSnippet;     /* The object being built */
  int rc = SQLITE_OK;    /* Return code */

  if( pCsr->pExpr==0 ){
    return SQLITE_OK;
  }

  pSnippet = (Snippet *)sqlite3_malloc(sizeof(Snippet));
  *ppSnippet = pSnippet;
  if( !pSnippet ){
    return SQLITE_NOMEM;
  }
  memset(pSnippet, 0, sizeof(Snippet));

  nColumn = p->nColumn;
  iColumn = (pCsr->eSearch - 2);
  if( iColumn<0 || iColumn>=nColumn ){
    /* Look for matches over all columns of the full-text index */
    iFirst = 0;
    iLast = nColumn-1;
  }else{
    /* Look for matches in the iColumn-th column of the index only */
    iFirst = iColumn;
    iLast = iColumn;
  }
  for(i=iFirst; rc==SQLITE_OK && i<=iLast; i++){
    const char *zDoc;
    int nDoc;
    /* Table column i is read from statement column i+1. */
    zDoc = (const char*)sqlite3_column_text(pCsr->pStmt, i+1);
    nDoc = sqlite3_column_bytes(pCsr->pStmt, i+1);
    /* A NULL pointer for a value that is not SQL NULL is treated as an
    ** out-of-memory condition. */
    if( zDoc==0 && sqlite3_column_type(pCsr->pStmt, i+1)!=SQLITE_NULL ){
      rc = SQLITE_NOMEM;
    }else{
      rc = snippetOffsetsOfColumn(pCsr, pSnippet, i, zDoc, nDoc);
    }
  }

  /* Each call trims at most one group of entries and then returns 1;
  ** restart from the root (token id 0) until nothing more is trimmed. */
  while( trimSnippetOffsets(pCsr->pExpr, pSnippet, &iTerm) ){
    iTerm = 0;
  }

  return rc;
}

/*
** Convert the information in the aMatch[] array of the snippet
** into the string zOffset[0..nOffset-1].  This string is used as
** the return of the SQL offsets() function.
**
** Each surviving match contributes four space-separated integers:
** column, term, byte offset and byte length.  If p->zOffset is already
** set this is a no-op.  After an allocation failure p->zOffset is NULL
** and p->nOffset is 0.
*/
static void snippetOffsetText(Snippet *p){
  int i;
  int cnt = 0;          /* Number of matches rendered so far */
  StringBuffer sb;      /* Accumulates the result */
  char zBuf[200];       /* Rendering of a single match */
  if( p->zOffset ) return;
  fts3SnippetSbInit(&sb);
  /* NOTE(review): the loop bound below is corrupted in this copy
  ** ("inMatch"); it presumably read "i<p->nMatch" -- confirm against a
  ** pristine copy. */
  for(i=0; inMatch; i++){
    struct snippetMatch *pMatch = &p->aMatch[i];
    if( pMatch->iTerm>=0 ){
      /* If snippetMatch.iTerm is less than 0, then the match was
      ** discarded as part of processing the NEAR operator (see the
      ** trimSnippetOffsets() function for details). Ignore
      ** it in this case
      */
      /* zBuf[0] is a separator.  For the first entry (cnt==0) the
      ** numbers are written over it starting at zBuf[0]; for later
      ** entries they start at zBuf[1] so the leading space is kept. */
      zBuf[0] = ' ';
      sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
          pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
      fts3SnippetAppend(&sb, zBuf, -1);
      cnt++;
    }
  }
  p->zOffset = sb.z;
  p->nOffset = sb.z ? sb.nUsed : 0;
}

/*
** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words some of which might be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  /* Break points within 10 bytes of either end of the document snap to
  ** that end. */
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  /* NOTE(review): the statement below is corrupted in this copy of the
  ** file.  Everything between "ALWAYS(i" and "0 && aMatch[i-1]..." has
  ** been lost (the text from a '<' to a later '>' appears to have been
  ** stripped), so the search of aMatch[] for a match near iBreak is
  ** incomplete.  Restore it from a pristine copy. */
  for(i=0; ALWAYS(i0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  /* Otherwise look up to 10 bytes either side of iBreak for white space
  ** and break just after it. */
  for(i=1; i<=10; i++){
    if( fts3snippetIsspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( fts3snippetIsspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}

/*
** Allowed values for Snippet.aMatch[].snStatus
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */

/*
** Generate the text of a snippet.
*/ static void snippetText( Fts3Cursor *pCursor, /* The cursor we need the snippet for */ Snippet *pSnippet, const char *zStartMark, /* Markup to appear before each match */ const char *zEndMark, /* Markup to appear after each match */ const char *zEllipsis /* Ellipsis mark */ ){ int i, j; struct snippetMatch *aMatch; int nMatch; int nDesired; StringBuffer sb; int tailCol; int tailOffset; int iCol; int nDoc; const char *zDoc; int iStart, iEnd; int tailEllipsis = 0; int iMatch; sqlite3_free(pSnippet->zSnippet); pSnippet->zSnippet = 0; aMatch = pSnippet->aMatch; nMatch = pSnippet->nMatch; fts3SnippetSbInit(&sb); for(i=0; i0; i++){ if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue; nDesired--; iCol = aMatch[i].iCol; zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1); nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1); iStart = aMatch[i].iStart - 40; iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol); if( iStart<=10 ){ iStart = 0; } if( iCol==tailCol && iStart<=tailOffset+20 ){ iStart = tailOffset; } if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){ fts3SnippetTrimWhiteSpace(&sb); fts3SnippetAppendWhiteSpace(&sb); fts3SnippetAppend(&sb, zEllipsis, -1); fts3SnippetAppendWhiteSpace(&sb); } iEnd = aMatch[i].iStart + aMatch[i].nByte + 40; iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol); if( iEnd>=nDoc-10 ){ iEnd = nDoc; tailEllipsis = 0; }else{ tailEllipsis = 1; } while( iMatchzSnippet = sb.z; pSnippet->nSnippet = sb.z ? 
sb.nUsed : 0; } void sqlite3Fts3Offsets( sqlite3_context *pCtx, /* SQLite function call context */ Fts3Cursor *pCsr /* Cursor object */ ){ Snippet *p; /* Snippet structure */ int rc = snippetAllOffsets(pCsr, &p); if( rc==SQLITE_OK ){ snippetOffsetText(p); if( p->zOffset ){ sqlite3_result_text(pCtx, p->zOffset, p->nOffset, SQLITE_TRANSIENT); }else{ sqlite3_result_error_nomem(pCtx); } }else{ sqlite3_result_error_nomem(pCtx); } fts3SnippetFree(p); } void sqlite3Fts3Snippet( sqlite3_context *pCtx, /* SQLite function call context */ Fts3Cursor *pCsr, /* Cursor object */ const char *zStart, /* Snippet start text - "" */ const char *zEnd, /* Snippet end text - "" */ const char *zEllipsis /* Snippet ellipsis text - "..." */ ){ Snippet *p; /* Snippet structure */ int rc = snippetAllOffsets(pCsr, &p); if( rc==SQLITE_OK ){ snippetText(pCsr, p, zStart, zEnd, zEllipsis); if( p->zSnippet ){ sqlite3_result_text(pCtx, p->zSnippet, p->nSnippet, SQLITE_TRANSIENT); }else{ sqlite3_result_error_nomem(pCtx); } }else{ sqlite3_result_error_nomem(pCtx); } fts3SnippetFree(p); } /************************************************************************* ** Below this point is the alternative, experimental snippet() implementation. */ #define SNIPPET_BUFFER_CHUNK 64 #define SNIPPET_BUFFER_SIZE SNIPPET_BUFFER_CHUNK*4 #define SNIPPET_BUFFER_MASK (SNIPPET_BUFFER_SIZE-1) static void fts3GetDeltaPosition(char **pp, int *piPos){ int iVal; *pp += sqlite3Fts3GetVarint32(*pp, &iVal); *piPos += (iVal-2); } /* ** Iterate through all phrase nodes in an FTS3 query, except those that ** are part of a sub-tree that is the right-hand-side of a NOT operator. ** For each phrase node found, the supplied callback function is invoked. ** ** If the callback function returns anything other than SQLITE_OK, ** the iteration is abandoned and the error code returned immediately. ** Otherwise, SQLITE_OK is returned after a callback has been made for ** all eligible phrase nodes. 
*/ static int fts3ExprIterate( Fts3Expr *pExpr, /* Expression to iterate phrases of */ int (*x)(Fts3Expr *, void *), /* Callback function to invoke for phrases */ void *pCtx /* Second argument to pass to callback */ ){ int rc; int eType = pExpr->eType; if( eType==FTSQUERY_NOT ){ rc = SQLITE_OK; }else if( eType!=FTSQUERY_PHRASE ){ assert( pExpr->pLeft && pExpr->pRight ); rc = fts3ExprIterate(pExpr->pLeft, x, pCtx); if( rc==SQLITE_OK ){ rc = fts3ExprIterate(pExpr->pRight, x, pCtx); } }else{ rc = x(pExpr, pCtx); } return rc; } typedef struct LoadDoclistCtx LoadDoclistCtx; struct LoadDoclistCtx { Fts3Table *pTab; /* FTS3 Table */ int nPhrase; /* Number of phrases so far */ }; static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, void *ctx){ int rc = SQLITE_OK; LoadDoclistCtx *p = (LoadDoclistCtx *)ctx; p->nPhrase++; if( pExpr->isLoaded==0 ){ rc = sqlite3Fts3ExprLoadDoclist(p->pTab, pExpr); pExpr->isLoaded = 1; if( rc==SQLITE_OK && pExpr->aDoclist ){ pExpr->pCurrent = pExpr->aDoclist; pExpr->pCurrent += sqlite3Fts3GetVarint(pExpr->pCurrent,&pExpr->iCurrent); } } return rc; } static int fts3ExprLoadDoclists(Fts3Cursor *pCsr, int *pnPhrase){ int rc; LoadDoclistCtx sCtx = {0, 0}; sCtx.pTab = (Fts3Table *)pCsr->base.pVtab; rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx); *pnPhrase = sCtx.nPhrase; return rc; } /* ** Each call to this function populates a chunk of a snippet-buffer ** SNIPPET_BUFFER_CHUNK bytes in size. ** ** Return true if the end of the data has been reached (and all subsequent ** calls to fts3LoadSnippetBuffer() with the same arguments will be no-ops), ** or false otherwise. 
*/ static int fts3LoadSnippetBuffer( int iPos, /* Document token offset to load data for */ u8 *aBuffer, /* Circular snippet buffer to populate */ int nList, /* Number of position lists in appList */ char **apList, /* IN/OUT: nList position list pointers */ int *aiPrev /* IN/OUT: Previous positions read */ ){ int i; int nFin = 0; assert( (iPos&(SNIPPET_BUFFER_CHUNK-1))==0 ); memset(&aBuffer[iPos&SNIPPET_BUFFER_MASK], 0, SNIPPET_BUFFER_CHUNK); for(i=0; i=iPos ){ aBuffer[iPrev&SNIPPET_BUFFER_MASK] = i+1; } if( 0==((*pList)&0xFE) ){ nFin++; break; } fts3GetDeltaPosition(&pList, &iPrev); } aiPrev[i] = iPrev; apList[i] = pList; } return (nFin==nList); } typedef struct SnippetCtx SnippetCtx; struct SnippetCtx { Fts3Cursor *pCsr; int iCol; int iPhrase; int *aiPrev; int *anToken; char **apList; }; static int fts3SnippetFindPositions(Fts3Expr *pExpr, void *ctx){ SnippetCtx *p = (SnippetCtx *)ctx; int iPhrase = p->iPhrase++; char *pCsr; p->anToken[iPhrase] = pExpr->pPhrase->nToken; pCsr = sqlite3Fts3FindPositions(pExpr, p->pCsr->iPrevId, p->iCol); if( pCsr ){ int iVal; pCsr += sqlite3Fts3GetVarint32(pCsr, &iVal); p->apList[iPhrase] = pCsr; p->aiPrev[iPhrase] = iVal-2; } return SQLITE_OK; } static void fts3SnippetCnt( int iIdx, int nSnippet, int *anCnt, u8 *aBuffer, int *anToken, u64 *pHlmask ){ int iSub = (iIdx-1)&SNIPPET_BUFFER_MASK; int iAdd = (iIdx+nSnippet-1)&SNIPPET_BUFFER_MASK; int iSub2 = (iIdx+(nSnippet/3)-1)&SNIPPET_BUFFER_MASK; int iAdd2 = (iIdx+(nSnippet*2/3)-1)&SNIPPET_BUFFER_MASK; u64 h = *pHlmask; anCnt[ aBuffer[iSub] ]--; anCnt[ aBuffer[iSub2] ]--; anCnt[ aBuffer[iAdd] ]++; anCnt[ aBuffer[iAdd2] ]++; h = h >> 1; if( aBuffer[iAdd] ){ int j; for(j=anToken[aBuffer[iAdd]-1]; j>=1; j--){ h |= (u64)1 << (nSnippet-j); } } *pHlmask = h; } static int fts3SnippetScore(int n, int *anCnt){ int j; int iScore = 0; for(j=1; j<=n; j++){ int nCnt = anCnt[j]; iScore += nCnt + (nCnt ? 
1000 : 0); } return iScore; } static int fts3BestSnippet( int nSnippet, /* Desired snippet length */ Fts3Cursor *pCsr, /* Cursor to create snippet for */ int iCol, /* Index of column to create snippet from */ int *piPos, /* OUT: Starting token for best snippet */ u64 *pHlmask /* OUT: Highlight mask for best snippet */ ){ int rc; /* Return Code */ u8 aBuffer[SNIPPET_BUFFER_SIZE];/* Circular snippet buffer */ int *aiPrev; /* Used by fts3LoadSnippetBuffer() */ int *anToken; /* Number of tokens in each phrase */ char **apList; /* Array of position lists */ int *anCnt; /* Running totals of phrase occurences */ int nList; int i; u64 hlmask = 0; /* Current mask of highlighted terms */ u64 besthlmask = 0; /* Mask of highlighted terms for iBestPos */ int iBestPos = 0; /* Starting position of 'best' snippet */ int iBestScore = 0; /* Score of best snippet higher->better */ SnippetCtx sCtx; /* Iterate through the phrases in the expression to count them. The same ** callback makes sure the doclists are loaded for each phrase. */ rc = fts3ExprLoadDoclists(pCsr, &nList); if( rc!=SQLITE_OK ){ return rc; } /* Now that it is known how many phrases there are, allocate and zero ** the required arrays using malloc(). */ apList = sqlite3_malloc( sizeof(u8*)*nList + /* apList */ sizeof(int)*(nList) + /* anToken */ sizeof(int)*nList + /* aiPrev */ sizeof(int)*(nList+1) /* anCnt */ ); if( !apList ){ return SQLITE_NOMEM; } memset(apList, 0, sizeof(u8*)*nList+sizeof(int)*nList+sizeof(int)*nList); anToken = (int *)&apList[nList]; aiPrev = &anToken[nList]; anCnt = &aiPrev[nList]; /* Initialize the contents of the aiPrev and aiList arrays. */ sCtx.pCsr = pCsr; sCtx.iCol = iCol; sCtx.apList = apList; sCtx.aiPrev = aiPrev; sCtx.anToken = anToken; sCtx.iPhrase = 0; (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sCtx); /* Load the first two chunks of data into the buffer. 
*/ memset(aBuffer, 0, SNIPPET_BUFFER_SIZE); fts3LoadSnippetBuffer(0, aBuffer, nList, apList, aiPrev); fts3LoadSnippetBuffer(SNIPPET_BUFFER_CHUNK, aBuffer, nList, apList, aiPrev); /* Set the initial contents of the highlight-mask and anCnt[] array. */ for(i=1-nSnippet; i<=0; i++){ fts3SnippetCnt(i, nSnippet, anCnt, aBuffer, anToken, &hlmask); } iBestScore = fts3SnippetScore(nList, anCnt); besthlmask = hlmask; iBestPos = 0; for(i=1; 1; i++){ int iScore; if( 0==(i&(SNIPPET_BUFFER_CHUNK-1)) ){ int iLoad = i + SNIPPET_BUFFER_CHUNK; if( fts3LoadSnippetBuffer(iLoad, aBuffer, nList, apList, aiPrev) ) break; } /* Figure out how highly a snippet starting at token offset i scores ** according to fts3SnippetScore(). If it is higher than any previously ** considered position, save the current position, score and hlmask as ** the best snippet candidate found so far. */ fts3SnippetCnt(i, nSnippet, anCnt, aBuffer, anToken, &hlmask); iScore = fts3SnippetScore(nList, anCnt); if( iScore>iBestScore ){ iBestPos = i; iBestScore = iScore; besthlmask = hlmask; } } sqlite3_free(apList); *piPos = iBestPos; *pHlmask = besthlmask; return SQLITE_OK; } typedef struct StrBuffer StrBuffer; struct StrBuffer { char *z; int n; int nAlloc; }; static int fts3StringAppend( StrBuffer *pStr, const char *zAppend, int nAppend ){ if( nAppend<0 ){ nAppend = strlen(zAppend); } if( pStr->n+nAppend+1>=pStr->nAlloc ){ int nAlloc = pStr->nAlloc+nAppend+100; char *zNew = sqlite3_realloc(pStr->z, nAlloc); if( !zNew ){ return SQLITE_NOMEM; } pStr->z = zNew; pStr->nAlloc = nAlloc; } memcpy(&pStr->z[pStr->n], zAppend, nAppend); pStr->n += nAppend; pStr->z[pStr->n] = '\0'; return SQLITE_OK; } static int fts3SnippetText( Fts3Cursor *pCsr, /* FTS3 Cursor */ const char *zDoc, /* Document to extract snippet from */ int nDoc, /* Size of zDoc in bytes */ int nSnippet, /* Number of tokens in extracted snippet */ int iPos, /* Index of first document token in snippet */ u64 hlmask, /* Bitmask of terms to highlight in snippet */ 
const char *zOpen, /* String inserted before highlighted term */ const char *zClose, /* String inserted after highlighted term */ const char *zEllipsis, char **pzSnippet /* OUT: Snippet text */ ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; int rc; /* Return code */ int iCurrent = 0; int iStart = 0; int iEnd; sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */ sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */ const char *ZDUMMY; /* Dummy arguments used with tokenizer */ int DUMMY1, DUMMY2, DUMMY3; /* Dummy arguments used with tokenizer */ StrBuffer res = {0, 0, 0}; /* Result string */ /* Open a token cursor on the document. Read all tokens up to and ** including token iPos (the first token of the snippet). Set variable ** iStart to the byte offset in zDoc of the start of token iPos. */ pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); while( rc==SQLITE_OK && iCurrentxNext(pC, &ZDUMMY, &DUMMY1, &iStart, &DUMMY2, &iCurrent); } iEnd = iStart; if( rc==SQLITE_OK && iStart>0 ){ rc = fts3StringAppend(&res, zEllipsis, -1); } while( rc==SQLITE_OK ){ int iBegin; int iFin; rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent); if( rc==SQLITE_OK ){ if( iCurrent>=(iPos+nSnippet) ){ rc = SQLITE_DONE; }else{ iEnd = iFin; if( hlmask & ((u64)1 << (iCurrent-iPos)) ){ if( fts3StringAppend(&res, &zDoc[iStart], iBegin-iStart) || fts3StringAppend(&res, zOpen, -1) || fts3StringAppend(&res, &zDoc[iBegin], iEnd-iBegin) || fts3StringAppend(&res, zClose, -1) ){ rc = SQLITE_NOMEM; } iStart = iEnd; } } } } assert( rc!=SQLITE_OK ); if( rc==SQLITE_DONE ){ rc = fts3StringAppend(&res, &zDoc[iStart], iEnd-iStart); if( rc==SQLITE_OK ){ rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent); if( rc==SQLITE_OK ){ rc = fts3StringAppend(&res, zEllipsis, -1); }else if( rc==SQLITE_DONE ){ rc = fts3StringAppend(&res, &zDoc[iEnd], -1); } } } pMod->xClose(pC); if( 
rc!=SQLITE_OK ){ sqlite3_free(res.z); }else{ *pzSnippet = res.z; } return rc; } /* ** An instance of this structure is used to collect the 'global' part of ** the matchinfo statistics. The 'global' part consists of the following: ** ** 1. The number of phrases in the query (nPhrase). ** ** 2. The number of columns in the FTS3 table (nCol). ** ** 3. A matrix of (nPhrase*nCol) integers containing the sum of the ** number of hits for each phrase in each column across all rows ** of the table. ** ** The total size of the global matchinfo array, assuming the number of ** columns is N and the number of phrases is P is: ** ** 2 + P*(N+1) ** ** The number of hits for the 3rd phrase in the second column is found ** using the expression: ** ** aGlobal[2 + P*(1+2) + 1] */ typedef struct MatchInfo MatchInfo; struct MatchInfo { Fts3Table *pTab; /* FTS3 Table */ Fts3Cursor *pCursor; /* FTS3 Cursor */ int iPhrase; /* Number of phrases so far */ int nCol; /* Number of columns in table */ u32 *aGlobal; /* Pre-allocated buffer */ }; /* ** This function is used to count the entries in a column-list (delta-encoded ** list of term offsets within a single column of a single row). */ static int fts3ColumnlistCount(char **ppCollist){ char *pEnd = *ppCollist; char c = 0; int nEntry = 0; /* A column-list is terminated by either a 0x01 or 0x00. */ while( 0xFE & (*pEnd | c) ){ c = *pEnd++ & 0x80; if( !c ) nEntry++; } *ppCollist = pEnd; return nEntry; } static void fts3LoadColumnlistCounts(char **pp, u32 *aOut){ char *pCsr = *pp; while( *pCsr ){ sqlite3_int64 iCol = 0; if( *pCsr==0x01 ){ pCsr++; pCsr += sqlite3Fts3GetVarint(pCsr, &iCol); } aOut[iCol] += fts3ColumnlistCount(&pCsr); } pCsr++; *pp = pCsr; } /* ** fts3ExprIterate() callback used to collect the "global" matchinfo stats ** for a single query. 
*/
static int fts3ExprGlobalMatchinfoCb(
  Fts3Expr *pExpr,                /* Phrase expression node */
  void *pCtx                      /* Pointer to MatchInfo structure */
){
  MatchInfo *p = (MatchInfo *)pCtx;
  char *pCsr;                     /* Read position within the doclist */
  char *pEnd;                     /* First byte past the doclist */
  /* Offset of this phrase's row in aGlobal[]: two header slots, then
  ** nCol counters per phrase. */
  const int iStart = 2 + p->nCol*p->iPhrase;

  assert( pExpr->isLoaded );

  /* Fill in the global hit count matrix row for this phrase. */
  pCsr = pExpr->aDoclist;
  pEnd = &pExpr->aDoclist[pExpr->nDoclist];
  /* NOTE(review): the loop below is corrupted in this copy of the file.
  ** Everything between "while( pCsr" and "aGlobal[iStart]);" has been
  ** lost (text from a '<' to a later '>' appears to have been stripped),
  ** including the loop condition and the start of the statement that
  ** takes &...aGlobal[iStart] as its last argument.  Restore from a
  ** pristine copy. */
  while( pCsraGlobal[iStart]);
  }

  p->iPhrase++;
  return SQLITE_OK;
}

/*
** fts3ExprIterate() callback used to collect the per-row ("local")
** matchinfo stats for one phrase.  The local matrix follows the global
** one in the same array: p->aGlobal[0] holds the phrase count, so the row
** for phrase iPhrase starts at 2 + nCol*(aGlobal[0]+iPhrase).  The row is
** zeroed and then filled from the position data of the current docid, if
** sqlite3Fts3FindPositions() returns any.  Phrases with no doclist are
** skipped (their row is left as it was).  Always returns SQLITE_OK.
*/
static int fts3ExprLocalMatchinfoCb(
  Fts3Expr *pExpr,                /* Phrase expression node */
  void *pCtx                      /* Pointer to MatchInfo structure */
){
  MatchInfo *p = (MatchInfo *)pCtx;
  int iPhrase = p->iPhrase++;

  if( pExpr->aDoclist ){
    char *pCsr;
    int iOffset = 2 + p->nCol*(p->aGlobal[0]+iPhrase);

    memset(&p->aGlobal[iOffset], 0, p->nCol*sizeof(u32));
    /* Column argument -1: presumably "all columns" -- TODO confirm
    ** against sqlite3Fts3FindPositions(). */
    pCsr = sqlite3Fts3FindPositions(pExpr, p->pCursor->iPrevId, -1);
    if( pCsr ) fts3LoadColumnlistCounts(&pCsr, &p->aGlobal[iOffset]);
  }

  return SQLITE_OK;
}

/*
** Populate pCsr->aMatchinfo[] with data for the current row. The 'matchinfo'
** data is an array of 32-bit unsigned integers (C type u32).
*/ static int fts3GetMatchinfo(Fts3Cursor *pCsr){ MatchInfo g; Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; if( pCsr->aMatchinfo==0 ){ int rc; int nPhrase; int nMatchinfo; g.pTab = pTab; g.nCol = pTab->nColumn; g.iPhrase = 0; rc = fts3ExprLoadDoclists(pCsr, &nPhrase); if( rc!=SQLITE_OK ){ return rc; } nMatchinfo = 2 + 2*g.nCol*nPhrase; g.iPhrase = 0; g.aGlobal = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo); if( !g.aGlobal ){ return SQLITE_NOMEM; } memset(g.aGlobal, 0, sizeof(u32)*nMatchinfo); g.aGlobal[0] = nPhrase; g.aGlobal[1] = g.nCol; (void)fts3ExprIterate(pCsr->pExpr, fts3ExprGlobalMatchinfoCb, (void *)&g); pCsr->aMatchinfo = g.aGlobal; } g.pTab = pTab; g.pCursor = pCsr; g.nCol = pTab->nColumn; g.iPhrase = 0; g.aGlobal = pCsr->aMatchinfo; if( pCsr->isMatchinfoOk ){ (void)fts3ExprIterate(pCsr->pExpr, fts3ExprLocalMatchinfoCb, (void *)&g); pCsr->isMatchinfoOk = 0; } return SQLITE_OK; } void sqlite3Fts3Snippet2( sqlite3_context *pCtx, /* SQLite function call context */ Fts3Cursor *pCsr, /* Cursor object */ const char *zStart, /* Snippet start text - "" */ const char *zEnd, /* Snippet end text - "" */ const char *zEllipsis, /* Snippet ellipsis text - "..." 
*/ int iCol, /* Extract snippet from this column */ int nToken /* Approximate number of tokens in snippet */ ){ int rc; int iPos = 0; u64 hlmask = 0; char *z = 0; int nDoc; const char *zDoc; rc = fts3BestSnippet(nToken, pCsr, iCol, &iPos, &hlmask); nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1); zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1); if( rc==SQLITE_OK ){ rc = fts3SnippetText( pCsr, zDoc, nDoc, nToken, iPos, hlmask, zStart, zEnd, zEllipsis, &z); } if( rc!=SQLITE_OK ){ sqlite3_result_error_code(pCtx, rc); }else{ sqlite3_result_text(pCtx, z, -1, sqlite3_free); } } void sqlite3Fts3Matchinfo(sqlite3_context *pContext, Fts3Cursor *pCsr){ int rc = fts3GetMatchinfo(pCsr); if( rc!=SQLITE_OK ){ sqlite3_result_error_code(pContext, rc); }else{ int n = sizeof(u32)*(2+pCsr->aMatchinfo[0]*pCsr->aMatchinfo[1]*2); sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT); } } #endif