From f596da5cc6176e4f1754aee21b954eda6c3edef5 Mon Sep 17 00:00:00 2001 From: dan Date: Tue, 12 Jan 2010 17:57:30 +0000 Subject: [PATCH] Update comments in fts3_snippet.c. FossilOrigin-Name: dd9689b7c31861484f8d8062478ae73d3c940902 --- ext/fts3/fts3_snippet.c | 570 +++++++++++++++++++++++++--------------- manifest | 26 +- manifest.uuid | 2 +- test/fts3query.test | 2 +- 4 files changed, 369 insertions(+), 231 deletions(-) diff --git a/ext/fts3/fts3_snippet.c b/ext/fts3/fts3_snippet.c index 20b636305b..9299a0e714 100644 --- a/ext/fts3/fts3_snippet.c +++ b/ext/fts3/fts3_snippet.c @@ -18,24 +18,115 @@ #include #include -#define SNIPPET_BUFFER_CHUNK 64 -#define SNIPPET_BUFFER_SIZE SNIPPET_BUFFER_CHUNK*4 -#define SNIPPET_BUFFER_MASK (SNIPPET_BUFFER_SIZE-1) +/* +** Used as an fts3ExprIterate() context when loading phrase doclists to +** Fts3Expr.aDoclist[]/nDoclist. +*/ +typedef struct LoadDoclistCtx LoadDoclistCtx; +struct LoadDoclistCtx { + Fts3Table *pTab; /* FTS3 Table */ + int nPhrase; /* Number of phrases seen so far */ + int nToken; /* Number of tokens seen so far */ +}; + +/* +** The following types are used as part of the implementation of the +** fts3BestSnippet() routine. +*/ +typedef struct SnippetIter SnippetIter; +typedef struct SnippetPhrase SnippetPhrase; +typedef struct SnippetFragment SnippetFragment; + +struct SnippetIter { + Fts3Cursor *pCsr; /* Cursor snippet is being generated from */ + int iCol; /* Extract snippet from this column */ + int nSnippet; /* Requested snippet length (in tokens) */ + int nPhrase; /* Number of phrases in query */ + SnippetPhrase *aPhrase; /* Array of size nPhrase */ + int iCurrent; /* First token of current snippet */ +}; + +struct SnippetPhrase { + int nToken; /* Number of tokens in phrase */ + char *pList; /* Pointer to start of phrase position list */ + int iHead; /* Next value in position list */ + char *pHead; /* Position list data following iHead */ + int iTail; /* Next value in trailing position list */ + char *pTail; /* Position list data following iTail */ +}; + +struct SnippetFragment { + int iCol; /* Column snippet is extracted from */ + int iPos; /* Index of first token in snippet */ + u64 covered; /* Mask of query phrases covered */ + u64 hlmask; /* Mask of snippet terms to highlight */ +}; + +/* +** This type is used as an fts3ExprIterate() context object while +** accumulating the data returned by the matchinfo() function. +*/ +typedef struct MatchInfo MatchInfo; +struct MatchInfo { + Fts3Cursor *pCursor; /* FTS3 Cursor */ + int nCol; /* Number of columns in table */ + u32 *aMatchinfo; /* Pre-allocated buffer */ +}; + + + +/* +** The snippet() and offsets() functions both return text values. An instance +** of the following structure is used to accumulate those values while the +** functions are running. See fts3StringAppend() for details. +*/ +typedef struct StrBuffer StrBuffer; +struct StrBuffer { + char *z; /* Pointer to buffer containing string */ + int n; /* Length of z in bytes (excl. nul-term) */ + int nAlloc; /* Allocated size of buffer z in bytes */ +}; + + +/* +** This function is used to help iterate through a position-list. A position +** list is a list of unique integers, sorted from smallest to largest. Each +** element of the list is represented by an FTS3 varint that takes the value +** of the difference between the current element and the previous one plus +** two. For example, to store the position-list: +** +** 4 9 113 +** +** the three varints: +** +** 6 7 106 +** +** are encoded. +** +** When this function is called, *pp points to the start of an element of +** the list. *piPos contains the value of the previous entry in the list. +** After it returns, *piPos contains the value of the next element of the +** list and *pp is advanced to the following varint. +*/ static void fts3GetDeltaPosition(char **pp, int *piPos){ int iVal; *pp += sqlite3Fts3GetVarint32(*pp, &iVal); *piPos += (iVal-2); } +/* +** Helper function for fts3ExprIterate() (see below). +*/ static int fts3ExprIterate2( Fts3Expr *pExpr, /* Expression to iterate phrases of */ int *piPhrase, /* Pointer to phrase counter */ int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */ void *pCtx /* Second argument to pass to callback */ ){ - int rc; - int eType = pExpr->eType; + int rc; /* Return code */ + int eType = pExpr->eType; /* Type of expression node pExpr */ + if( eType!=FTSQUERY_PHRASE ){ assert( pExpr->pLeft && pExpr->pRight ); rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx); @@ -64,17 +155,24 @@ static int fts3ExprIterate( int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */ void *pCtx /* Second argument to pass to callback */ ){ - int iPhrase = 0; + int iPhrase = 0; /* Variable used as the phrase counter */ return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx); } -typedef struct LoadDoclistCtx LoadDoclistCtx; -struct LoadDoclistCtx { - Fts3Table *pTab; /* FTS3 Table */ - int nPhrase; /* Number of phrases so far */ - int nToken; /* Number of tokens so far */ -}; - +/* +** The argument to this function is always a phrase node. Its doclist +** (Fts3Expr.aDoclist[]) and the doclists associated with all phrase nodes +** to the left of this one in the query tree have already been loaded. +** +** If this phrase node is part of a series of phrase nodes joined by +** NEAR operators (and is not the left-most of said series), then elements are +** removed from the phrases doclist consistent with the NEAR restriction. If +** required, elements may be removed from the doclists of phrases to the +** left of this one that are part of the same series of NEAR operator +** connected phrases. +** +** If an OOM error occurs, SQLITE_NOMEM is returned. Otherwise, SQLITE_OK. +*/ static int fts3ExprNearTrim(Fts3Expr *pExpr){ int rc = SQLITE_OK; Fts3Expr *pParent = pExpr->pParent; @@ -106,6 +204,11 @@ static int fts3ExprNearTrim(Fts3Expr *pExpr){ return rc; } +/* +** This is an fts3ExprIterate() callback used while loading the doclists +** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also +** fts3ExprLoadDoclists(). +*/ static int fts3ExprLoadDoclistsCb1(Fts3Expr *pExpr, int iPhrase, void *ctx){ int rc = SQLITE_OK; LoadDoclistCtx *p = (LoadDoclistCtx *)ctx; @@ -124,6 +227,11 @@ static int fts3ExprLoadDoclistsCb1(Fts3Expr *pExpr, int iPhrase, void *ctx){ return rc; } +/* +** This is an fts3ExprIterate() callback used while loading the doclists +** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also +** fts3ExprLoadDoclists(). +*/ static int fts3ExprLoadDoclistsCb2(Fts3Expr *pExpr, int iPhrase, void *ctx){ if( pExpr->aDoclist ){ pExpr->pCurrent = pExpr->aDoclist; @@ -133,13 +241,23 @@ static int fts3ExprLoadDoclistsCb2(Fts3Expr *pExpr, int iPhrase, void *ctx){ return SQLITE_OK; } +/* +** Load the doclists for each phrase in the query associated with FTS3 cursor +** pCsr. +** +** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable +** phrases in the expression (all phrases except those directly or +** indirectly descended from the right-hand-side of a NOT operator). If +** pnToken is not NULL, then it is set to the number of tokens in all +** matchable phrases of the expression. +*/ static int fts3ExprLoadDoclists( - Fts3Cursor *pCsr, + Fts3Cursor *pCsr, /* Fts3 cursor for current query */ int *pnPhrase, /* OUT: Number of phrases in query */ int *pnToken /* OUT: Number of tokens in query */ ){ - int rc; - LoadDoclistCtx sCtx = {0, 0, 0}; + int rc; /* Return Code */ + LoadDoclistCtx sCtx = {0,0,0}; /* Context for fts3ExprIterate() */ sCtx.pTab = (Fts3Table *)pCsr->base.pVtab; rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb1, (void *)&sCtx); if( rc==SQLITE_OK ){ @@ -150,30 +268,6 @@ static int fts3ExprLoadDoclists( return rc; } -/* -** The following types are used as part of the implementation of the -** fts3BestSnippet() routine. -*/ -typedef struct SnippetCtx SnippetCtx; -typedef struct SnippetPhrase SnippetPhrase; - -struct SnippetCtx { - Fts3Cursor *pCsr; /* Cursor snippet is being generated from */ - int iCol; /* Extract snippet from this column */ - int nSnippet; /* Requested snippet length (in tokens) */ - int nPhrase; /* Number of phrases in query */ - SnippetPhrase *aPhrase; /* Array of size nPhrase */ - int iCurrent; /* First token of current snippet */ -}; -struct SnippetPhrase { - int nToken; /* Number of tokens in phrase */ - char *pList; /* Pointer to start of phrase position list */ - int iHead; /* Next value in position list */ - char *pHead; /* Position list data following iHead */ - int iTail; /* Next value in trailing position list */ - char *pTail; /* Position list data following iTail */ -}; - /* ** Advance the position list iterator specified by the first two ** arguments so that it points to the first element with a value greater @@ -198,11 +292,14 @@ static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){ } } -static int fts3SnippetNextCandidate(SnippetCtx *pIter){ +/* +** Advance the snippet iterator to the next candidate snippet. +*/ +static int fts3SnippetNextCandidate(SnippetIter *pIter){ int i; /* Loop counter */ if( pIter->iCurrent<0 ){ - /* The SnippetCtx object has just been initialized. The first snippet + /* The SnippetIter object has just been initialized. The first snippet ** candidate always starts at offset 0 (even if this candidate has a ** score of 0.0). */ @@ -240,8 +337,12 @@ static int fts3SnippetNextCandidate(SnippetCtx *pIter){ return 0; } +/* +** Retrieve information about the current candidate snippet of snippet +** iterator pIter. +*/ static void fts3SnippetDetails( - SnippetCtx *pIter, /* Snippet iterator */ + SnippetIter *pIter, /* Snippet iterator */ u64 mCovered, /* Bitmask of phrases already covered */ int *piToken, /* OUT: First token of proposed snippet */ int *piScore, /* OUT: "Score" for this snippet */ @@ -249,11 +350,10 @@ static void fts3SnippetDetails( u64 *pmHighlight /* OUT: Bitmask of terms to highlight */ ){ int iStart = pIter->iCurrent; /* First token of snippet */ - - int iScore = 0; - int i; - u64 mCover = 0; - u64 mHighlight = 0; + int iScore = 0; /* Score of this snippet */ + int i; /* Loop counter */ + u64 mCover = 0; /* Mask of phrases covered by this snippet */ + u64 mHighlight = 0; /* Mask of tokens to highlight in snippet */ for(i=0; inPhrase; i++){ SnippetPhrase *pPhrase = &pIter->aPhrase[i]; @@ -283,6 +383,7 @@ static void fts3SnippetDetails( } } + /* Set the output variables before returning. */ *piToken = iStart; *piScore = iScore; *pmCover = mCover; @@ -291,10 +392,10 @@ static void fts3SnippetDetails( /* ** This function is an fts3ExprIterate() callback used by fts3BestSnippet(). -** Each invocation populates an element of the SnippetCtx.aPhrase[] array. +** Each invocation populates an element of the SnippetIter.aPhrase[] array. */ static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){ - SnippetCtx *p = (SnippetCtx *)ctx; + SnippetIter *p = (SnippetIter *)ctx; SnippetPhrase *pPhrase = &p->aPhrase[iPhrase]; char *pCsr; @@ -316,16 +417,21 @@ static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){ return SQLITE_OK; } -#define BITMASK_SIZE 64 - -typedef struct SnippetFragment SnippetFragment; -struct SnippetFragment { - int iCol; /* Column snippet is extracted from */ - int iPos; /* Index of first token in snippet */ - u64 covered; /* Mask of query phrases covered */ - u64 hlmask; /* Mask of snippet terms to highlight */ -}; - +/* +** Select the fragment of text consisting of nFragment contiguous tokens +** from column iCol that represent the "best" snippet. The best snippet +** is the snippet with the highest score, where scores are calculated +** by adding: +** +** (a) +1 point for each occurence of a matchable phrase in the snippet. +** +** (b) +1000 points for the first occurence of each matchable phrase in +** the snippet for which the corresponding mCovered bit is not set. +** +** The selected snippet parameters are stored in structure *pFragment before +** returning. The score of the selected snippet is stored in *piScore +** before returning. +*/ static int fts3BestSnippet( int nSnippet, /* Desired snippet length */ Fts3Cursor *pCsr, /* Cursor to create snippet for */ @@ -337,12 +443,12 @@ static int fts3BestSnippet( ){ int rc; /* Return Code */ int nList; /* Number of phrases in expression */ - SnippetCtx sCtx; /* Snippet context object */ + SnippetIter sIter; /* Iterates through snippet candidates */ int nByte; /* Number of bytes of space to allocate */ - int iBestScore = -1; - int i; + int iBestScore = -1; /* Best snippet score found so far */ + int i; /* Loop counter */ - memset(&sCtx, 0, sizeof(sCtx)); + memset(&sIter, 0, sizeof(sIter)); /* Iterate through the phrases in the expression to count them. The same ** callback makes sure the doclists are loaded for each phrase. @@ -356,36 +462,39 @@ static int fts3BestSnippet( ** the required space using malloc(). */ nByte = sizeof(SnippetPhrase) * nList; - sCtx.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte); - if( !sCtx.aPhrase ){ + sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte); + if( !sIter.aPhrase ){ return SQLITE_NOMEM; } - memset(sCtx.aPhrase, 0, nByte); + memset(sIter.aPhrase, 0, nByte); - /* Initialize the contents of the SnippetCtx object. Then iterate through + /* Initialize the contents of the SnippetIter object. Then iterate through ** the set of phrases in the expression to populate the aPhrase[] array. */ - sCtx.pCsr = pCsr; - sCtx.iCol = iCol; - sCtx.nSnippet = nSnippet; - sCtx.nPhrase = nList; - sCtx.iCurrent = -1; - (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sCtx); + sIter.pCsr = pCsr; + sIter.iCol = iCol; + sIter.nSnippet = nSnippet; + sIter.nPhrase = nList; + sIter.iCurrent = -1; + (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter); + /* Set the *pmSeen output variable. */ for(i=0; iiCol = iCol; - while( !fts3SnippetNextCandidate(&sCtx) ){ + while( !fts3SnippetNextCandidate(&sIter) ){ int iPos; int iScore; u64 mCover; u64 mHighlight; - fts3SnippetDetails(&sCtx, mCovered, &iPos, &iScore, &mCover, &mHighlight); - + fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight); assert( iScore>=0 ); if( iScore>iBestScore ){ pFragment->iPos = iPos; @@ -395,28 +504,31 @@ static int fts3BestSnippet( } } - sqlite3_free(sCtx.aPhrase); + sqlite3_free(sIter.aPhrase); *piScore = iBestScore; return SQLITE_OK; } -typedef struct StrBuffer StrBuffer; -struct StrBuffer { - char *z; - int n; - int nAlloc; -}; - +/* +** Append a string to the string-buffer passed as the first argument. +** +** If nAppend is negative, then the length of the string zAppend is +** determined using strlen(). +*/ static int fts3StringAppend( - StrBuffer *pStr, - const char *zAppend, - int nAppend + StrBuffer *pStr, /* Buffer to append to */ + const char *zAppend, /* Pointer to data to append to buffer */ + int nAppend /* Size of zAppend in bytes (or -1) */ ){ if( nAppend<0 ){ nAppend = strlen(zAppend); } + /* If there is insufficient space allocated at StrBuffer.z, use realloc() + ** to grow the buffer until so that it is big enough to accomadate the + ** appended data. + */ if( pStr->n+nAppend+1>=pStr->nAlloc ){ int nAlloc = pStr->nAlloc+nAppend+100; char *zNew = sqlite3_realloc(pStr->z, nAlloc); @@ -427,6 +539,7 @@ static int fts3StringAppend( pStr->nAlloc = nAlloc; } + /* Append the data to the string buffer. */ memcpy(&pStr->z[pStr->n], zAppend, nAppend); pStr->n += nAppend; pStr->z[pStr->n] = '\0'; @@ -434,33 +547,63 @@ static int fts3StringAppend( return SQLITE_OK; } +/* +** The fts3BestSnippet() function often selects snippets that end with a +** query term. That is, the final term of the snippet is always a term +** that requires highlighting. For example, if 'X' is a highlighted term +** and '.' is a non-highlighted term, BestSnippet() may select: +** +** ........X.....X +** +** This function "shifts" the beginning of the snippet forward in the +** document so that there are approximately the same number of +** non-highlighted terms to the right of the final highlighted term as there +** are to the left of the first highlighted term. For example, to this: +** +** ....X.....X.... +** +** This is done as part of extracting the snippet text, not when selecting +** the snippet. Snippet selection is done based on doclists only, so there +** is no way for fts3BestSnippet() to know whether or not the document +** actually contains terms that follow the final highlighted term. +*/ int fts3SnippetShift( - Fts3Table *pTab, - int nSnippet, - const char *zDoc, - int nDoc, - int *piPos, - u64 *pHlmask + Fts3Table *pTab, /* FTS3 table snippet comes from */ + int nSnippet, /* Number of tokens desired for snippet */ + const char *zDoc, /* Document text to extract snippet from */ + int nDoc, /* Size of buffer zDoc in bytes */ + int *piPos, /* IN/OUT: First token of snippet */ + u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */ ){ - u64 hlmask = *pHlmask; + u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */ if( hlmask ){ - int nLeft; - int nRight; - int nDesired; + int nLeft; /* Tokens to the left of first highlight */ + int nRight; /* Tokens to the right of last highlight */ + int nDesired; /* Ideal number of tokens to shift forward */ for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++); for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++); - nDesired = (nLeft-nRight)/2; + + /* Ideally, the start of the snippet should be pushed forward in the + ** document nDesired tokens. This block checks if there are actually + ** nDesired tokens to the right of the snippet. If so, *piPos and + ** *pHlMask are updated to shift the snippet nDesired tokens to the + ** right. Otherwise, the snippet is shifted by the number of tokens + ** available. + */ if( nDesired>0 ){ - int nShift; - int iCurrent = 0; - int rc; + int nShift; /* Number of tokens to shift snippet by */ + int iCurrent = 0; /* Token counter */ + int rc; /* Return Code */ sqlite3_tokenizer_module *pMod; sqlite3_tokenizer_cursor *pC; - pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; + + /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired) + ** or more tokens in zDoc/nDoc. + */ rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); if( rc!=SQLITE_OK ){ return rc; @@ -484,6 +627,10 @@ int fts3SnippetShift( return SQLITE_OK; } +/* +** Extract the snippet text for fragment pFragment from cursor pCsr and +** append it to string buffer pOut. +*/ static int fts3SnippetText( Fts3Cursor *pCsr, /* FTS3 Cursor */ SnippetFragment *pFragment, /* Snippet to extract */ @@ -492,8 +639,8 @@ static int fts3SnippetText( int nSnippet, /* Number of tokens in extracted snippet */ const char *zOpen, /* String inserted before highlighted term */ const char *zClose, /* String inserted after highlighted term */ - const char *zEllipsis, - StrBuffer *pOut + const char *zEllipsis, /* String inserted between snippets */ + StrBuffer *pOut /* Write output here */ ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; int rc; /* Return code */ @@ -501,23 +648,23 @@ static int fts3SnippetText( int nDoc; /* Size of zDoc in bytes */ int iCurrent = 0; /* Current token number of document */ int iEnd = 0; /* Byte offset of end of current token */ - int isShiftDone = 0; - int iPos = pFragment->iPos; - u64 hlmask = pFragment->hlmask; - + int isShiftDone = 0; /* True after snippet is shifted */ + int iPos = pFragment->iPos; /* First token of snippet */ + u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */ + int iCol = pFragment->iCol+1; /* Query column to extract text from */ sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */ sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */ - const char *ZDUMMY; /* Dummy arguments used with tokenizer */ - int DUMMY1; /* Dummy arguments used with tokenizer */ + const char *ZDUMMY; /* Dummy argument used with tokenizer */ + int DUMMY1; /* Dummy argument used with tokenizer */ - zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, pFragment->iCol+1); + zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol); if( zDoc==0 ){ - if( sqlite3_column_type(pCsr->pStmt, pFragment->iCol+1)!=SQLITE_NULL ){ + if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){ return SQLITE_NOMEM; } return SQLITE_OK; } - nDoc = sqlite3_column_bytes(pCsr->pStmt, pFragment->iCol+1); + nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol); /* Open a token cursor on the document. */ pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; @@ -530,7 +677,7 @@ static int fts3SnippetText( while( rc==SQLITE_OK ){ int iBegin; /* Offset in zDoc of start of token */ int iFin; /* Offset in zDoc of end of token */ - int isHighlight; + int isHighlight; /* True for highlighted terms */ rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent); if( rc!=SQLITE_OK ){ @@ -584,39 +731,17 @@ static int fts3SnippetText( /* -** An instance of this structure is used to collect the 'global' part of -** the matchinfo statistics. The 'global' part consists of the following: +** This function is used to count the entries in a column-list (a +** delta-encoded list of term offsets within a single column of a single +** row). When this function is called, *ppCollist should point to the +** beginning of the first varint in the column-list (the varint that +** contains the position of the first matching term in the column data). +** Before returning, *ppCollist is set to point to the first byte after +** the last varint in the column-list (either the 0x00 signifying the end +** of the position-list, or the 0x01 that precedes the column number of +** the next column in the position-list). ** -** 1. The number of phrases in the query (nPhrase). -** -** 2. The number of columns in the FTS3 table (nCol). -** -** 3. A matrix of (nPhrase*nCol) integers containing the sum of the -** number of hits for each phrase in each column across all rows -** of the table. -** -** The total size of the global matchinfo array, assuming the number of -** columns is N and the number of phrases is P is: -** -** 2 + P*(N+1) -** -** The number of hits for the 3rd phrase in the second column is found -** using the expression: -** -** aGlobal[2 + P*(1+2) + 1] -*/ -typedef struct MatchInfo MatchInfo; -struct MatchInfo { - Fts3Table *pTab; /* FTS3 Table */ - Fts3Cursor *pCursor; /* FTS3 Cursor */ - int iPhrase; /* Number of phrases so far */ - int nCol; /* Number of columns in table */ - u32 *aGlobal; /* Pre-allocated buffer */ -}; - -/* -** This function is used to count the entries in a column-list (delta-encoded -** list of term offsets within a single column of a single row). +** The number of elements in the column-list is returned. */ static int fts3ColumnlistCount(char **ppCollist){ char *pEnd = *ppCollist; @@ -633,15 +758,20 @@ static int fts3ColumnlistCount(char **ppCollist){ return nEntry; } -static void fts3LoadColumnlistCounts(char **pp, u32 *aOut){ +static void fts3LoadColumnlistCounts(char **pp, u32 *aOut, int isGlobal){ char *pCsr = *pp; while( *pCsr ){ + int nHit; sqlite3_int64 iCol = 0; if( *pCsr==0x01 ){ pCsr++; pCsr += sqlite3Fts3GetVarint(pCsr, &iCol); } - aOut[iCol] += fts3ColumnlistCount(&pCsr); + nHit = fts3ColumnlistCount(&pCsr); + if( isGlobal ){ + if( nHit ) aOut[iCol*3+1]++; + } + aOut[iCol*3] += nHit; } pCsr++; *pp = pCsr; @@ -649,17 +779,18 @@ static void fts3LoadColumnlistCounts(char **pp, u32 *aOut){ /* ** fts3ExprIterate() callback used to collect the "global" matchinfo stats -** for a single query. +** for a single query. The "global" stats are those elements of the matchinfo +** array that are constant for all rows returned by the current query. */ static int fts3ExprGlobalMatchinfoCb( Fts3Expr *pExpr, /* Phrase expression node */ - int iPhrase, + int iPhrase, /* Phrase number (numbered from zero) */ void *pCtx /* Pointer to MatchInfo structure */ ){ MatchInfo *p = (MatchInfo *)pCtx; char *pCsr; char *pEnd; - const int iStart = 2 + p->nCol*p->iPhrase; + const int iStart = 2 + (iPhrase * p->nCol * 3) + 1; assert( pExpr->isLoaded ); @@ -667,84 +798,96 @@ static int fts3ExprGlobalMatchinfoCb( pCsr = pExpr->aDoclist; pEnd = &pExpr->aDoclist[pExpr->nDoclist]; while( pCsraGlobal[iStart]); - } - - p->iPhrase++; - return SQLITE_OK; -} - -static int fts3ExprLocalMatchinfoCb( - Fts3Expr *pExpr, /* Phrase expression node */ - int iPhrase, - void *pCtx /* Pointer to MatchInfo structure */ -){ - MatchInfo *p = (MatchInfo *)pCtx; - p->iPhrase++; - - if( pExpr->aDoclist ){ - char *pCsr; - int iOffset = 2 + p->nCol*(p->aGlobal[0]+iPhrase); - - memset(&p->aGlobal[iOffset], 0, p->nCol*sizeof(u32)); - pCsr = sqlite3Fts3FindPositions(pExpr, p->pCursor->iPrevId, -1); - if( pCsr ) fts3LoadColumnlistCounts(&pCsr, &p->aGlobal[iOffset]); + while( *pCsr++ & 0x80 ); /* Skip past docid. */ + fts3LoadColumnlistCounts(&pCsr, &p->aMatchinfo[iStart], 1); } return SQLITE_OK; } /* -** Populate pCsr->aMatchinfo[] with data for the current row. The 'matchinfo' -** data is an array of 32-bit unsigned integers (C type u32). +** fts3ExprIterate() callback used to collect the "local" matchinfo stats +** for a single query. The "local" stats are those elements of the matchinfo +** array that are different for each row returned by the query. +*/ +static int fts3ExprLocalMatchinfoCb( + Fts3Expr *pExpr, /* Phrase expression node */ + int iPhrase, /* Phrase number */ + void *pCtx /* Pointer to MatchInfo structure */ +){ + MatchInfo *p = (MatchInfo *)pCtx; + + if( pExpr->aDoclist ){ + char *pCsr; + int iStart = 2 + (iPhrase * p->nCol * 3); + int i; + + for(i=0; inCol; i++) p->aMatchinfo[iStart+i*3] = 0; + + pCsr = sqlite3Fts3FindPositions(pExpr, p->pCursor->iPrevId, -1); + if( pCsr ){ + fts3LoadColumnlistCounts(&pCsr, &p->aMatchinfo[iStart], 0); + } + } + + return SQLITE_OK; +} + +/* +** Populate pCsr->aMatchinfo[] with data for the current row. The +** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32). */ static int fts3GetMatchinfo(Fts3Cursor *pCsr){ - MatchInfo g; + MatchInfo sInfo; Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; - if( pCsr->aMatchinfo==0 ){ - int rc; - int nPhrase; - int nMatchinfo; - g.pTab = pTab; - g.nCol = pTab->nColumn; - g.iPhrase = 0; + sInfo.pCursor = pCsr; + sInfo.nCol = pTab->nColumn; + + if( pCsr->aMatchinfo==0 ){ + /* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the + ** matchinfo function has been called for this query. In this case + ** allocate the array used to accumulate the matchinfo data and + ** initialize those elements that are constant for every row. + */ + int rc; /* Return Code */ + int nPhrase; /* Number of phrases */ + int nMatchinfo; /* Number of u32 elements in match-info */ + + /* Load doclists for each phrase in the query. */ rc = fts3ExprLoadDoclists(pCsr, &nPhrase, 0); if( rc!=SQLITE_OK ){ return rc; } + nMatchinfo = 2 + 3*sInfo.nCol*nPhrase; - nMatchinfo = 2 + 2*g.nCol*nPhrase; - - g.iPhrase = 0; - g.aGlobal = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo); - if( !g.aGlobal ){ + sInfo.aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo); + if( !sInfo.aMatchinfo ){ return SQLITE_NOMEM; } - memset(g.aGlobal, 0, sizeof(u32)*nMatchinfo); + memset(sInfo.aMatchinfo, 0, sizeof(u32)*nMatchinfo); - g.aGlobal[0] = nPhrase; - g.aGlobal[1] = g.nCol; - (void)fts3ExprIterate(pCsr->pExpr, fts3ExprGlobalMatchinfoCb, (void *)&g); - pCsr->aMatchinfo = g.aGlobal; + /* First element of match-info is the number of phrases in the query */ + sInfo.aMatchinfo[0] = nPhrase; + sInfo.aMatchinfo[1] = sInfo.nCol; + (void)fts3ExprIterate(pCsr->pExpr, fts3ExprGlobalMatchinfoCb,(void*)&sInfo); + + pCsr->aMatchinfo = sInfo.aMatchinfo; } - g.pTab = pTab; - g.pCursor = pCsr; - g.nCol = pTab->nColumn; - g.iPhrase = 0; - g.aGlobal = pCsr->aMatchinfo; - + sInfo.aMatchinfo = pCsr->aMatchinfo; if( pCsr->isMatchinfoOk ){ - (void)fts3ExprIterate(pCsr->pExpr, fts3ExprLocalMatchinfoCb, (void *)&g); + (void)fts3ExprIterate(pCsr->pExpr, fts3ExprLocalMatchinfoCb, (void*)&sInfo); pCsr->isMatchinfoOk = 0; } return SQLITE_OK; } +/* +** Implementation of snippet() function. +*/ void sqlite3Fts3Snippet( sqlite3_context *pCtx, /* SQLite function call context */ Fts3Cursor *pCsr, /* Cursor object */ @@ -835,12 +978,13 @@ void sqlite3Fts3Snippet( typedef struct TermOffset TermOffset; +typedef struct TermOffsetCtx TermOffsetCtx; + struct TermOffset { char *pList; /* Position-list */ int iPos; /* Position just read from pList */ - int iOff; + int iOff; /* Offset of this term from read positions */ }; -typedef struct TermOffsetCtx TermOffsetCtx; struct TermOffsetCtx { int iCol; /* Column of table to populate aTerm for */ @@ -885,17 +1029,15 @@ void sqlite3Fts3Offsets( ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule; - const char *ZDUMMY; - int NDUMMY; - + const char *ZDUMMY; /* Dummy argument used with xNext() */ + int NDUMMY; /* Dummy argument used with xNext() */ int rc; /* Return Code */ int nToken; /* Number of tokens in query */ int iCol; /* Column currently being processed */ StrBuffer res = {0, 0, 0}; /* Result string */ + TermOffsetCtx sCtx; /* Context for fts3ExprTermOffsetInit() */ - TermOffsetCtx sCtx; memset(&sCtx, 0, sizeof(sCtx)); - assert( pCsr->isRequireSeek==0 ); /* Count the number of terms in the query */ @@ -910,6 +1052,9 @@ void sqlite3Fts3Offsets( } sCtx.iDocid = pCsr->iPrevId; + /* Loop through the table columns, appending offset information to + ** string-buffer res for each column. + */ for(iCol=0; iColnColumn; iCol++){ sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */ int iStart; @@ -1001,12 +1146,15 @@ void sqlite3Fts3Offsets( return; } +/* +** Implementation of matchinfo() function. +*/ void sqlite3Fts3Matchinfo(sqlite3_context *pContext, Fts3Cursor *pCsr){ int rc = fts3GetMatchinfo(pCsr); if( rc!=SQLITE_OK ){ sqlite3_result_error_code(pContext, rc); }else{ - int n = sizeof(u32)*(2+pCsr->aMatchinfo[0]*pCsr->aMatchinfo[1]*2); + int n = sizeof(u32)*(2+pCsr->aMatchinfo[0]*pCsr->aMatchinfo[1]*3); sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT); } } diff --git a/manifest b/manifest index 50046553cb..c37d774682 100644 --- a/manifest +++ b/manifest @@ -1,8 +1,5 @@ ------BEGIN PGP SIGNED MESSAGE----- -Hash: SHA1 - -C Use\s#ifdefs\sto\sdisable\sunused\scode\swhen\sSQLITE_OMIT_FLOATING_POINT\sis\sdefined. -D 2010-01-12T17:04:08 +C Update\scomments\sin\sfts3_snippet.c. +D 2010-01-12T17:57:30 F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0 F Makefile.in c5827ead754ab32b9585487177c93bb00b9497b3 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 @@ -67,7 +64,7 @@ F ext/fts3/fts3_hash.c 3c8f6387a4a7f5305588b203fa7c887d753e1f1c F ext/fts3/fts3_hash.h 8331fb2206c609f9fc4c4735b9ab5ad6137c88ec F ext/fts3/fts3_icu.c ac494aed69835008185299315403044664bda295 F ext/fts3/fts3_porter.c a651e287e02b49b565a6ccf9441959d434489156 -F ext/fts3/fts3_snippet.c 99bc7e7356ff17667afa5ef744cae5c8dc71c7d6 +F ext/fts3/fts3_snippet.c fdc2c12e9387d140f3b4422caa4adcbc9c8eddc9 F ext/fts3/fts3_tokenizer.c 1a49ee3d79cbf0b9386250370d9cbfe4bb89c8ff F ext/fts3/fts3_tokenizer.h 13ffd9fcb397fec32a05ef5cd9e0fa659bf3dbd3 F ext/fts3/fts3_tokenizer1.c 11a604a53cff5e8c28882727bf794e5252e5227b @@ -406,7 +403,7 @@ F test/fts3expr.test 05dab77387801e4900009917bb18f556037d82da F test/fts3expr2.test 18da930352e5693eaa163a3eacf96233b7290d1a F test/fts3malloc.test d02ee86b21edd2b43044e0d6dfdcd26cb6efddcb F test/fts3near.test 2e318ee434d32babd27c167142e2b94ddbab4844 -F test/fts3query.test ca21717993f51caa7e36231dba2499868f3f8a6f +F test/fts3query.test 154fe4b015fd61af523ee083570a134f508f5be7 F test/fts3rnd.test 153b4214bad6084a348814f3dd651a92e2f31d9b F test/fts3snippet.test 16a05b313bf85da4d0b5cb683549fc2279430fbe F test/func.test af106ed834001738246d276659406823e35cde7b @@ -788,14 +785,7 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224 F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f -P 8bd0f8147dbf75367243b3eecfb705da6e794ab7 -R 2f049c9b0458cd4d56b0979326210233 -U drh -Z 56a3411ed01b97bc47b7776cdc99d804 ------BEGIN PGP SIGNATURE----- -Version: GnuPG v1.4.6 (GNU/Linux) - -iD8DBQFLTKuKoxKgR168RlERArj5AJ9MklFLlBTCDE3gwJo/U2R4JCBGqQCfeywI -MozPgxZX2+OOhsyGOcvPvws= -=hPv2 ------END PGP SIGNATURE----- +P 66bab8561926963a87f15ad559cba36545c9892c +R 559c18af8cc5acebd93fdb777aca8b37 +U dan +Z e6a76bd6811266db12895b07d433a416 diff --git a/manifest.uuid b/manifest.uuid index 84dbc56256..a9f4831fca 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -66bab8561926963a87f15ad559cba36545c9892c \ No newline at end of file +dd9689b7c31861484f8d8062478ae73d3c940902 \ No newline at end of file diff --git a/test/fts3query.test b/test/fts3query.test index 2a4d6e720e..b5af50a7ab 100644 --- a/test/fts3query.test +++ b/test/fts3query.test @@ -99,7 +99,7 @@ db func mit mit do_test fts3query-3.3 { execsql { SELECT mit(matchinfo(foobar)) FROM foobar WHERE foobar MATCH 'the' } -} {{1 1 3 3}} +} {{1 1 3 3 1}} finish_test