sqlite/ext/fts3/fts3_snippet.c

/*
** 2009 Oct 23
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#include "fts3Int.h"
#include <string.h>
#include <assert.h>
#include <ctype.h>

typedef struct Snippet Snippet;

/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.
*/
struct Snippet {
  int nMatch;                     /* Total number of matches */
  int nAlloc;                     /* Space allocated for aMatch[] */
  struct snippetMatch {  /* One entry for each matching term */
    char snStatus;       /* Status flag for use while constructing snippets */
    short int nByte;     /* Number of bytes in the term */
    short int iCol;      /* The column that contains the match */
    short int iTerm;     /* The index in Query.pTerms[] of the matching term */
    int iToken;          /* The index of the matching document token */
    int iStart;          /* The offset to the first character of the term */
  } *aMatch;                      /* Points to space obtained from malloc */
  char *zOffset;                  /* Text rendering of aMatch[] */
  int nOffset;                    /* strlen(zOffset) */
  char *zSnippet;                 /* Snippet text */
  int nSnippet;                   /* strlen(zSnippet) */
};


/* It is not safe to call isspace(), tolower(), or isalnum() on
** hi-bit-set characters.  This is the same solution used in the
** tokenizer.
*/
static int fts3snippetIsspace(char c){
  return (c&0x80)==0 ? isspace(c) : 0;
}


/*
** A StringBuffer object holds a zero-terminated string that grows
** arbitrarily by appending.  Space to hold the string is obtained
** from sqlite3_malloc().  After any memory allocation failure,
** StringBuffer.z is set to NULL and no further allocation is attempted.
*/
typedef struct StringBuffer {
  char *z;         /* Text of the string.  Space from malloc. */
  int nUsed;       /* Number bytes of z[] used, not counting \000 terminator */
  int nAlloc;      /* Bytes allocated for z[] */
} StringBuffer;


/*
** Initialize a new StringBuffer.
*/
static void fts3SnippetSbInit(StringBuffer *p){
  p->nAlloc = 100;
  p->nUsed = 0;
  p->z = sqlite3_malloc( p->nAlloc );
}

/*
** Append text to the string buffer.
*/
static void fts3SnippetAppend(StringBuffer *p, const char *zNew, int nNew){
  if( p->z==0 ) return;
  if( nNew<0 ) nNew = (int)strlen(zNew);
  if( p->nUsed + nNew >= p->nAlloc ){
    int nAlloc;
    char *zNew;

    nAlloc = p->nUsed + nNew + p->nAlloc;
    zNew = sqlite3_realloc(p->z, nAlloc);
    if( zNew==0 ){
      sqlite3_free(p->z);
      p->z = 0;
      return;
    }
    p->z = zNew;
    p->nAlloc = nAlloc;
  }
  memcpy(&p->z[p->nUsed], zNew, nNew);
  p->nUsed += nNew;
  p->z[p->nUsed] = 0;
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void fts3SnippetAppendWhiteSpace(StringBuffer *p){
  if( p->z && p->nUsed && !fts3snippetIsspace(p->z[p->nUsed-1]) ){
    fts3SnippetAppend(p, " ", 1);
  }
}

/* Remove white space from the end of the StringBuffer */
static void fts3SnippetTrimWhiteSpace(StringBuffer *p){
  if( p->z ){
    while( p->nUsed && fts3snippetIsspace(p->z[p->nUsed-1]) ){
      p->nUsed--;
    }
    p->z[p->nUsed] = 0;
  }
}

/*
** Release all memory associated with the Snippet structure passed as
** an argument.
*/
static void fts3SnippetFree(Snippet *p){
  if( p ){
    sqlite3_free(p->aMatch);
    sqlite3_free(p->zOffset);
    sqlite3_free(p->zSnippet);
    sqlite3_free(p);
  }
}

/*
** Append a single entry to the p->aMatch[] log.
*/
static int snippetAppendMatch(
  Snippet *p,               /* Append the entry to this snippet */
  int iCol, int iTerm,      /* The column and query term */
  int iToken,               /* Matching token in document */
  int iStart, int nByte     /* Offset and size of the match */
){
  int i;
  struct snippetMatch *pMatch;
  if( p->nMatch+1>=p->nAlloc ){
    struct snippetMatch *pNew;
    p->nAlloc = p->nAlloc*2 + 10;
    pNew = sqlite3_realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
    if( pNew==0 ){
      p->aMatch = 0;
      p->nMatch = 0;
      p->nAlloc = 0;
      return SQLITE_NOMEM;
    }
    p->aMatch = pNew;
  }
  i = p->nMatch++;
  pMatch = &p->aMatch[i];
  pMatch->iCol = (short)iCol;
  pMatch->iTerm = (short)iTerm;
  pMatch->iToken = iToken;
  pMatch->iStart = iStart;
  pMatch->nByte = (short)nByte;
  return SQLITE_OK;
}

/*
** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
*/
#define FTS3_ROTOR_SZ   (32)
#define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1)

/*
** Function to iterate through the tokens of a compiled expression.
**
** Except, skip all tokens on the right-hand side of a NOT operator.
** This function is used to find tokens as part of snippet and offset
** generation and we do nt want snippets and offsets to report matches
** for tokens on the RHS of a NOT.
*/
static int fts3NextExprToken(Fts3Expr **ppExpr, int *piToken){
  Fts3Expr *p = *ppExpr;
  int iToken = *piToken;
  if( iToken<0 ){
    /* In this case the expression p is the root of an expression tree.
    ** Move to the first token in the expression tree.
    */
    while( p->pLeft ){
      p = p->pLeft;
    }
    iToken = 0;
  }else{
    assert(p && p->eType==FTSQUERY_PHRASE );
    if( iToken<(p->pPhrase->nToken-1) ){
      iToken++;
    }else{
      iToken = 0;
      while( p->pParent && p->pParent->pLeft!=p ){
        assert( p->pParent->pRight==p );
        p = p->pParent;
      }
      p = p->pParent;
      if( p ){
        assert( p->pRight!=0 );
        p = p->pRight;
        while( p->pLeft ){
          p = p->pLeft;
        }
      }
    }
  }

  *ppExpr = p;
  *piToken = iToken;
  return p?1:0;
}

/*
** Return TRUE if the expression node pExpr is located beneath the
** RHS of a NOT operator.
*/
static int fts3ExprBeneathNot(Fts3Expr *p){
  Fts3Expr *pParent;
  while( p ){
    pParent = p->pParent;
    if( pParent && pParent->eType==FTSQUERY_NOT && pParent->pRight==p ){
      return 1;
    }
    p = pParent;
  }
  return 0;
}

/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
*/
static int snippetOffsetsOfColumn(
  Fts3Cursor *pCur,         /* The fulltest search cursor */
  Snippet *pSnippet,             /* The Snippet object to be filled in */
  int iColumn,                   /* Index of fulltext table column */
  const char *zDoc,              /* Text of the fulltext table column */
  int nDoc                       /* Length of zDoc in bytes */
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  Fts3Table *pVtab;                /* The full text index */
  int nColumn;                         /* Number of columns in the index */
  int i, j;                            /* Loop counters */
  int rc;                              /* Return code */
  unsigned int match, prevMatch;       /* Phrase search bitmasks */
  const char *zToken;                  /* Next token from the tokenizer */
  int nToken;                          /* Size of zToken */
  int iBegin, iEnd, iPos;              /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS3_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS3_ROTOR_SZ];        /* Length of token */

  pVtab =  (Fts3Table *)pCur->base.pVtab;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return rc;
  pTCursor->pTokenizer = pTokenizer;

  prevMatch = 0;
  while( (rc = pTModule->xNext(pTCursor, &zToken, &nToken,
                               &iBegin, &iEnd, &iPos))==SQLITE_OK ){
    Fts3Expr *pIter = pCur->pExpr;
    int iIter = -1;
    iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<(FTS3_ROTOR_SZ-1) && fts3NextExprToken(&pIter, &iIter); i++){
      int nPhrase;                    /* Number of tokens in current phrase */
      struct PhraseToken *pToken;     /* Current token */
      int iCol;                       /* Column index */

      if( fts3ExprBeneathNot(pIter) ) continue;
      nPhrase = pIter->pPhrase->nToken;
      pToken = &pIter->pPhrase->aToken[iIter];
      iCol = pIter->pPhrase->iColumn;
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( pToken->n>nToken ) continue;
      if( !pToken->isPrefix && pToken->n<nToken ) continue;
      assert( pToken->n<=nToken );
      if( memcmp(pToken->z, zToken, pToken->n) ) continue;
      if( iIter>0 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      if( i==(FTS3_ROTOR_SZ-2) || nPhrase==iIter+1 ){
        for(j=nPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS3_ROTOR_MASK;
          rc = snippetAppendMatch(pSnippet, iColumn, i-j, iPos-j,
                                  iRotorBegin[k], iRotorLen[k]);
          if( rc ) goto end_offsets_of_column;
        }
      }
    }
    prevMatch = match<<1;
    iRotor++;
  }
end_offsets_of_column:
  pTModule->xClose(pTCursor);
  return rc==SQLITE_DONE ? SQLITE_OK : rc;
}

/*
** Remove entries from the pSnippet structure to account for the NEAR
** operator. When this is called, pSnippet contains the list of token
** offsets produced by treating all NEAR operators as AND operators.
** This function removes any entries that should not be present after
** accounting for the NEAR restriction. For example, if the queried
** document is:
**
**     "A B C D E A"
**
** and the query is:
**
**     A NEAR/0 E
**
** then when this function is called the Snippet contains token offsets
** 0, 4 and 5. This function removes the "0" entry (because the first A
** is not near enough to an E).
**
** When this function is called, the value pointed to by parameter piLeft is
** the integer id of the left-most token in the expression tree headed by
** pExpr. This function increments *piLeft by the total number of tokens
** in the expression tree headed by pExpr.
**
** Return 1 if any trimming occurs.  Return 0 if no trimming is required.
*/
static int trimSnippetOffsets(
  Fts3Expr *pExpr,      /* The search expression */
  Snippet *pSnippet,    /* The set of snippet offsets to be trimmed */
  int *piLeft           /* Index of left-most token in pExpr */
){
  if( pExpr ){
    if( trimSnippetOffsets(pExpr->pLeft, pSnippet, piLeft) ){
      return 1;
    }

    switch( pExpr->eType ){
      case FTSQUERY_PHRASE:
        *piLeft += pExpr->pPhrase->nToken;
        break;
      case FTSQUERY_NEAR: {
        /* The right-hand-side of a NEAR operator is always a phrase. The
        ** left-hand-side is either a phrase or an expression tree that is
        ** itself headed by a NEAR operator. The following initializations
        ** set local variable iLeft to the token number of the left-most
        ** token in the right-hand phrase, and iRight to the right most
        ** token in the same phrase. For example, if we had:
        **
        **     <col> MATCH '"abc def" NEAR/2 "ghi jkl"'
        **
        ** then iLeft will be set to 2 (token number of ghi) and nToken will
        ** be set to 4.
        */
        Fts3Expr *pLeft = pExpr->pLeft;
        Fts3Expr *pRight = pExpr->pRight;
        int iLeft = *piLeft;
        int nNear = pExpr->nNear;
        int nToken = pRight->pPhrase->nToken;
        int jj, ii;
        if( pLeft->eType==FTSQUERY_NEAR ){
          pLeft = pLeft->pRight;
        }
        assert( pRight->eType==FTSQUERY_PHRASE );
        assert( pLeft->eType==FTSQUERY_PHRASE );
        nToken += pLeft->pPhrase->nToken;

        for(ii=0; ii<pSnippet->nMatch; ii++){
          struct snippetMatch *p = &pSnippet->aMatch[ii];
          if( p->iTerm==iLeft ){
            int isOk = 0;
            /* Snippet ii is an occurence of query term iLeft in the document.
            ** It occurs at position (p->iToken) of the document. We now
            ** search for an instance of token (iLeft-1) somewhere in the
            ** range (p->iToken - nNear)...(p->iToken + nNear + nToken) within
            ** the set of snippetMatch structures. If one is found, proceed.
            ** If one cannot be found, then remove snippets ii..(ii+N-1)
            ** from the matching snippets, where N is the number of tokens
            ** in phrase pRight->pPhrase.
            */
            for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){
              struct snippetMatch *p2 = &pSnippet->aMatch[jj];
              if( p2->iTerm==(iLeft-1) ){
                if( p2->iToken>=(p->iToken-nNear-1)
                 && p2->iToken<(p->iToken+nNear+nToken)
                ){
                  isOk = 1;
                }
              }
            }
            if( !isOk ){
              int kk;
              for(kk=0; kk<pRight->pPhrase->nToken; kk++){
                pSnippet->aMatch[kk+ii].iTerm = -2;
              }
              return 1;
            }
          }
          if( p->iTerm==(iLeft-1) ){
            int isOk = 0;
            for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){
              struct snippetMatch *p2 = &pSnippet->aMatch[jj];
              if( p2->iTerm==iLeft ){
                if( p2->iToken<=(p->iToken+nNear+1)
                 && p2->iToken>(p->iToken-nNear-nToken)
                ){
                  isOk = 1;
                }
              }
            }
            if( !isOk ){
              int kk;
              for(kk=0; kk<pLeft->pPhrase->nToken; kk++){
                pSnippet->aMatch[ii-kk].iTerm = -2;
              }
              return 1;
            }
          }
        }
        break;
      }
    }

    if( trimSnippetOffsets(pExpr->pRight, pSnippet, piLeft) ){
      return 1;
    }
  }
  return 0;
}

/*
** Compute all offsets for the current row of the query.
** If the offsets have already been computed, this routine is a no-op.
*/
static int snippetAllOffsets(Fts3Cursor *pCsr, Snippet **ppSnippet){
  Fts3Table *p = (Fts3Table *)pCsr->base.pVtab;  /* The FTS3 virtual table */
  int nColumn;           /* Number of columns.  Docid does count */
  int iColumn;           /* Index of of a column */
  int i;                 /* Loop index */
  int iFirst;            /* First column to search */
  int iLast;             /* Last coumn to search */
  int iTerm = 0;
  Snippet *pSnippet;
  int rc = SQLITE_OK;

  if( pCsr->pExpr==0 ){
    return SQLITE_OK;
  }

  pSnippet = (Snippet *)sqlite3_malloc(sizeof(Snippet));
  *ppSnippet = pSnippet;
  if( !pSnippet ){
    return SQLITE_NOMEM;
  }
  memset(pSnippet, 0, sizeof(Snippet));

  nColumn = p->nColumn;
  iColumn = (pCsr->eSearch - 2);
  if( iColumn<0 || iColumn>=nColumn ){
    /* Look for matches over all columns of the full-text index */
    iFirst = 0;
    iLast = nColumn-1;
  }else{
    /* Look for matches in the iColumn-th column of the index only */
    iFirst = iColumn;
    iLast = iColumn;
  }
  for(i=iFirst; rc==SQLITE_OK && i<=iLast; i++){
    const char *zDoc;
    int nDoc;
    zDoc = (const char*)sqlite3_column_text(pCsr->pStmt, i+1);
    nDoc = sqlite3_column_bytes(pCsr->pStmt, i+1);
    if( zDoc==0 && sqlite3_column_type(pCsr->pStmt, i+1)!=SQLITE_NULL ){
      rc = SQLITE_NOMEM;
    }else{
      rc = snippetOffsetsOfColumn(pCsr, pSnippet, i, zDoc, nDoc);
    }
  }

  while( trimSnippetOffsets(pCsr->pExpr, pSnippet, &iTerm) ){
    iTerm = 0;
  }

  return rc;
}

/*
** Convert the information in the aMatch[] array of the snippet
** into the string zOffset[0..nOffset-1]. This string is used as
** the return of the SQL offsets() function.
*/
static void snippetOffsetText(Snippet *p){
  int i;
  int cnt = 0;
  StringBuffer sb;
  char zBuf[200];
  if( p->zOffset ) return;
  fts3SnippetSbInit(&sb);
  for(i=0; i<p->nMatch; i++){
    struct snippetMatch *pMatch = &p->aMatch[i];
    if( pMatch->iTerm>=0 ){
      /* If snippetMatch.iTerm is less than 0, then the match was
      ** discarded as part of processing the NEAR operator (see the
      ** trimSnippetOffsetsForNear() function for details). Ignore
      ** it in this case
      */
      zBuf[0] = ' ';
      sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
          pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
      fts3SnippetAppend(&sb, zBuf, -1);
      cnt++;
    }
  }
  p->zOffset = sb.z;
  p->nOffset = sb.z ? sb.nUsed : 0;
}

/*
** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words some of which might be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  for(i=0; ALWAYS(i<nMatch) && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  for(i=1; i<=10; i++){
    if( fts3snippetIsspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( fts3snippetIsspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}


/*
** Allowed values for Snippet.aMatch[].snStatus
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */

/*
** Generate the text of a snippet.
*/
static void snippetText(
  Fts3Cursor *pCursor,   /* The cursor we need the snippet for */
  Snippet *pSnippet,
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;
  int nMatch;
  int nDesired;
  StringBuffer sb;
  int tailCol;
  int tailOffset;
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;
  int tailEllipsis = 0;
  int iMatch;


  sqlite3_free(pSnippet->zSnippet);
  pSnippet->zSnippet = 0;
  aMatch = pSnippet->aMatch;
  nMatch = pSnippet->nMatch;
  fts3SnippetSbInit(&sb);

  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<FTS3_ROTOR_SZ; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  tailCol = -1;
  tailOffset = 0;
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      fts3SnippetTrimWhiteSpace(&sb);
      fts3SnippetAppendWhiteSpace(&sb);
      fts3SnippetAppend(&sb, zEllipsis, -1);
      fts3SnippetAppendWhiteSpace(&sb);
    }
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
             && aMatch[iMatch].iCol==iCol ){
        fts3SnippetAppend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        fts3SnippetAppend(&sb, zStartMark, -1);
        fts3SnippetAppend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        fts3SnippetAppend(&sb, zEndMark, -1);
        iStart += aMatch[iMatch].nByte;
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        fts3SnippetAppend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  fts3SnippetTrimWhiteSpace(&sb);
  if( tailEllipsis ){
    fts3SnippetAppendWhiteSpace(&sb);
    fts3SnippetAppend(&sb, zEllipsis, -1);
  }
  pSnippet->zSnippet = sb.z;
  pSnippet->nSnippet = sb.z ? sb.nUsed : 0;
}

void sqlite3Fts3Offsets(
  sqlite3_context *pCtx,          /* SQLite function call context */
  Fts3Cursor *pCsr                /* Cursor object */
){
  Snippet *p;                     /* Snippet structure */
  int rc = snippetAllOffsets(pCsr, &p);
  if( rc==SQLITE_OK ){
    snippetOffsetText(p);
    if( p->zOffset ){
      sqlite3_result_text(pCtx, p->zOffset, p->nOffset, SQLITE_TRANSIENT);
    }else{
      sqlite3_result_error_nomem(pCtx);
    }
  }else{
    sqlite3_result_error_nomem(pCtx);
  }
  fts3SnippetFree(p);
}

void sqlite3Fts3Snippet(
  sqlite3_context *pCtx,          /* SQLite function call context */
  Fts3Cursor *pCsr,               /* Cursor object */
  const char *zStart,             /* Snippet start text - "<b>" */
  const char *zEnd,               /* Snippet end text - "</b>" */
  const char *zEllipsis           /* Snippet ellipsis text - "<b>...</b>" */
){
  Snippet *p;                     /* Snippet structure */
  int rc = snippetAllOffsets(pCsr, &p);
  if( rc==SQLITE_OK ){
    snippetText(pCsr, p, zStart, zEnd, zEllipsis);
    if( p->zSnippet ){
      sqlite3_result_text(pCtx, p->zSnippet, p->nSnippet, SQLITE_TRANSIENT);
    }else{
      sqlite3_result_error_nomem(pCtx);
    }
  }else{
    sqlite3_result_error_nomem(pCtx);
  }
  fts3SnippetFree(p);
}

/*************************************************************************
** Below this point is the alternative, experimental snippet() implementation.
*/

#define SNIPPET_BUFFER_CHUNK  64
#define SNIPPET_BUFFER_SIZE   SNIPPET_BUFFER_CHUNK*4
#define SNIPPET_BUFFER_MASK   (SNIPPET_BUFFER_SIZE-1)

static void fts3GetDeltaPosition(char **pp, int *piPos){
  int iVal;
  *pp += sqlite3Fts3GetVarint32(*pp, &iVal);
  *piPos += (iVal-2);
}

/*
** Iterate through all phrase nodes in an FTS3 query, except those that
** are part of a sub-tree that is the right-hand-side of a NOT operator.
** For each phrase node found, the supplied callback function is invoked.
**
** If the callback function returns anything other than SQLITE_OK,
** the iteration is abandoned and the error code returned immediately.
** Otherwise, SQLITE_OK is returned after a callback has been made for
** all eligible phrase nodes.
*/
static int fts3ExprIterate(
  Fts3Expr *pExpr,                /* Expression to iterate phrases of */
  int (*x)(Fts3Expr *, void *),   /* Callback function to invoke for phrases */
  void *pCtx                      /* Second argument to pass to callback */
){
  int rc;
  int eType = pExpr->eType;
  if( eType==FTSQUERY_NOT ){
    rc = SQLITE_OK;
  }else if( eType!=FTSQUERY_PHRASE ){
    assert( pExpr->pLeft && pExpr->pRight );
    rc = fts3ExprIterate(pExpr->pLeft, x, pCtx);
    if( rc==SQLITE_OK ){
      rc = fts3ExprIterate(pExpr->pRight, x, pCtx);
    }
  }else{
    rc = x(pExpr, pCtx);
  }
  return rc;
}

typedef struct LoadDoclistCtx LoadDoclistCtx;
struct LoadDoclistCtx {
  Fts3Table *pTab;                /* FTS3 Table */
  int nPhrase;                    /* Number of phrases so far */
};

static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, void *ctx){
  int rc = SQLITE_OK;
  LoadDoclistCtx *p = (LoadDoclistCtx *)ctx;
  p->nPhrase++;
  if( pExpr->isLoaded==0 ){
    rc = sqlite3Fts3ExprLoadDoclist(p->pTab, pExpr);
    pExpr->isLoaded = 1;
    if( rc==SQLITE_OK && pExpr->aDoclist ){
      pExpr->pCurrent = pExpr->aDoclist;
      pExpr->pCurrent += sqlite3Fts3GetVarint(pExpr->pCurrent,&pExpr->iCurrent);
    }
  }
  return rc;
}

static int fts3ExprLoadDoclists(Fts3Cursor *pCsr, int *pnPhrase){
  int rc;
  LoadDoclistCtx sCtx = {0, 0};
  sCtx.pTab = (Fts3Table *)pCsr->base.pVtab;
  rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx);
  *pnPhrase = sCtx.nPhrase;
  return rc;
}

/*
** Each call to this function populates a chunk of a snippet-buffer
** SNIPPET_BUFFER_CHUNK bytes in size.
**
** Return true if the end of the data has been reached (and all subsequent
** calls to fts3LoadSnippetBuffer() with the same arguments will be no-ops),
** or false otherwise.
*/
static int fts3LoadSnippetBuffer(
  int iPos,                       /* Document token offset to load data for */
  u8 *aBuffer,                    /* Circular snippet buffer to populate */
  int nList,                      /* Number of position lists in appList */
  char **apList,                  /* IN/OUT: nList position list pointers */
  int *aiPrev                     /* IN/OUT: Previous positions read */
){
  int i;
  int nFin = 0;

  assert( (iPos&(SNIPPET_BUFFER_CHUNK-1))==0 );

  memset(&aBuffer[iPos&SNIPPET_BUFFER_MASK], 0, SNIPPET_BUFFER_CHUNK);

  for(i=0; i<nList; i++){
    int iPrev = aiPrev[i];
    char *pList = apList[i];

    if( !pList ){
      nFin++;
      continue;
    }

    while( iPrev<(iPos+SNIPPET_BUFFER_CHUNK) ){
      if( iPrev>=iPos ){
        aBuffer[iPrev&SNIPPET_BUFFER_MASK] = i+1;
      }
      if( 0==((*pList)&0xFE) ){
        nFin++;
        break;
      }
      fts3GetDeltaPosition(&pList, &iPrev);
    }

    aiPrev[i] = iPrev;
    apList[i] = pList;
  }

  return (nFin==nList);
}

typedef struct SnippetCtx SnippetCtx;
struct SnippetCtx {
  Fts3Cursor *pCsr;
  int iCol;
  int iPhrase;
  int *aiPrev;
  int *anToken;
  char **apList;
};

static int fts3SnippetFindPositions(Fts3Expr *pExpr, void *ctx){
  SnippetCtx *p = (SnippetCtx *)ctx;
  int iPhrase = p->iPhrase++;
  char *pCsr;

  p->anToken[iPhrase] = pExpr->pPhrase->nToken;
  pCsr = sqlite3Fts3FindPositions(pExpr, p->pCsr->iPrevId, p->iCol);

  if( pCsr ){
    int iVal;
    pCsr += sqlite3Fts3GetVarint32(pCsr, &iVal);
    p->apList[iPhrase] = pCsr;
    p->aiPrev[iPhrase] = iVal-2;
  }
  return SQLITE_OK;
}

static void fts3SnippetCnt(
  int iIdx,
  int nSnippet,
  int *anCnt,
  u8 *aBuffer,
  int *anToken,
  u64 *pHlmask
){
  int iSub =  (iIdx-1)&SNIPPET_BUFFER_MASK;
  int iAdd =  (iIdx+nSnippet-1)&SNIPPET_BUFFER_MASK;
  int iSub2 = (iIdx+(nSnippet/3)-1)&SNIPPET_BUFFER_MASK;
  int iAdd2 = (iIdx+(nSnippet*2/3)-1)&SNIPPET_BUFFER_MASK;

  u64 h = *pHlmask;

  anCnt[ aBuffer[iSub]  ]--;
  anCnt[ aBuffer[iSub2] ]--;
  anCnt[ aBuffer[iAdd]  ]++;
  anCnt[ aBuffer[iAdd2] ]++;

  h = h >> 1;
  if( aBuffer[iAdd] ){
    int j;
    for(j=anToken[aBuffer[iAdd]-1]; j>=1; j--){
      h |= (u64)1 << (nSnippet-j);
    }
  }
  *pHlmask = h;
}

static int fts3SnippetScore(int n, int *anCnt){
  int j;
  int iScore = 0;
  for(j=1; j<=n; j++){
    int nCnt = anCnt[j];
    iScore += nCnt + (nCnt ? 1000 : 0);
  }
  return iScore;
}

static int fts3BestSnippet(
  int nSnippet,                   /* Desired snippet length */
  Fts3Cursor *pCsr,               /* Cursor to create snippet for */
  int iCol,                       /* Index of column to create snippet from */
  int *piPos,                     /* OUT: Starting token for best snippet */
  u64 *pHlmask                    /* OUT: Highlight mask for best snippet */
){
  int rc;                         /* Return Code */
  u8 aBuffer[SNIPPET_BUFFER_SIZE];/* Circular snippet buffer */
  int *aiPrev;                    /* Used by fts3LoadSnippetBuffer() */
  int *anToken;                   /* Number of tokens in each phrase */
  char **apList;                  /* Array of position lists */
  int *anCnt;                     /* Running totals of phrase occurences */
  int nList;

  int i;

  u64 hlmask = 0;                 /* Current mask of highlighted terms */
  u64 besthlmask = 0;             /* Mask of highlighted terms for iBestPos */
  int iBestPos = 0;               /* Starting position of 'best' snippet */
  int iBestScore = 0;             /* Score of best snippet higher->better */
  SnippetCtx sCtx;

  /* Iterate through the phrases in the expression to count them. The same
  ** callback makes sure the doclists are loaded for each phrase.
  */
  rc = fts3ExprLoadDoclists(pCsr, &nList);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  /* Now that it is known how many phrases there are, allocate and zero
  ** the required arrays using malloc().
  */
  apList = sqlite3_malloc(
      sizeof(u8*)*nList +         /* apList */
      sizeof(int)*(nList) +       /* anToken */
      sizeof(int)*nList +         /* aiPrev */
      sizeof(int)*(nList+1)       /* anCnt */
  );
  if( !apList ){
    return SQLITE_NOMEM;
  }
  memset(apList, 0, sizeof(u8*)*nList+sizeof(int)*nList+sizeof(int)*nList);
  anToken = (int *)&apList[nList];
  aiPrev = &anToken[nList];
  anCnt = &aiPrev[nList];

  /* Initialize the contents of the aiPrev and aiList arrays. */
  sCtx.pCsr = pCsr;
  sCtx.iCol = iCol;
  sCtx.apList = apList;
  sCtx.aiPrev = aiPrev;
  sCtx.anToken = anToken;
  sCtx.iPhrase = 0;
  (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sCtx);

  /* Load the first two chunks of data into the buffer. */
  memset(aBuffer, 0, SNIPPET_BUFFER_SIZE);
  fts3LoadSnippetBuffer(0, aBuffer, nList, apList, aiPrev);
  fts3LoadSnippetBuffer(SNIPPET_BUFFER_CHUNK, aBuffer, nList, apList, aiPrev);

  /* Set the initial contents of the highlight-mask and anCnt[] array. */
  for(i=1-nSnippet; i<=0; i++){
    fts3SnippetCnt(i, nSnippet, anCnt, aBuffer, anToken, &hlmask);
  }
  iBestScore = fts3SnippetScore(nList, anCnt);
  besthlmask = hlmask;
  iBestPos = 0;

  for(i=1; 1; i++){
    int iScore;

    if( 0==(i&(SNIPPET_BUFFER_CHUNK-1)) ){
      int iLoad = i + SNIPPET_BUFFER_CHUNK;
      if( fts3LoadSnippetBuffer(iLoad, aBuffer, nList, apList, aiPrev) ) break;
    }

    /* Figure out how highly a snippet starting at token offset i scores
    ** according to fts3SnippetScore(). If it is higher than any previously
    ** considered position, save the current position, score and hlmask as
    ** the best snippet candidate found so far.
    */
    fts3SnippetCnt(i, nSnippet, anCnt, aBuffer, anToken, &hlmask);
    iScore = fts3SnippetScore(nList, anCnt);
    if( iScore>iBestScore ){
      iBestPos = i;
      iBestScore = iScore;
      besthlmask = hlmask;
    }
  }

  sqlite3_free(apList);
  *piPos = iBestPos;
  *pHlmask = besthlmask;
  return SQLITE_OK;
}

typedef struct StrBuffer StrBuffer;
struct StrBuffer {
  char *z;
  int n;
  int nAlloc;
};

static int fts3StringAppend(
  StrBuffer *pStr,
  const char *zAppend,
  int nAppend
){
  if( nAppend<0 ){
    nAppend = strlen(zAppend);
  }

  if( pStr->n+nAppend+1>=pStr->nAlloc ){
    int nAlloc = pStr->nAlloc+nAppend+100;
    char *zNew = sqlite3_realloc(pStr->z, nAlloc);
    if( !zNew ){
      return SQLITE_NOMEM;
    }
    pStr->z = zNew;
    pStr->nAlloc = nAlloc;
  }

  memcpy(&pStr->z[pStr->n], zAppend, nAppend);
  pStr->n += nAppend;
  pStr->z[pStr->n] = '\0';

  return SQLITE_OK;
}

static int fts3SnippetText(
  Fts3Cursor *pCsr,               /* FTS3 Cursor */
  const char *zDoc,               /* Document to extract snippet from */
  int nDoc,                       /* Size of zDoc in bytes */
  int nSnippet,                   /* Number of tokens in extracted snippet */
  int iPos,                       /* Index of first document token in snippet */
  u64 hlmask,                     /* Bitmask of terms to highlight in snippet */
  const char *zOpen,              /* String inserted before highlighted term */
  const char *zClose,             /* String inserted after highlighted term */
  const char *zEllipsis,
  char **pzSnippet                /* OUT: Snippet text */
){
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  int rc;                         /* Return code */
  int iCurrent = 0;
  int iStart = 0;
  int iEnd;

  sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
  sqlite3_tokenizer_cursor *pC;   /* Tokenizer cursor open on zDoc/nDoc */
  const char *ZDUMMY;             /* Dummy arguments used with tokenizer */
  int DUMMY1, DUMMY2, DUMMY3;     /* Dummy arguments used with tokenizer */

  StrBuffer res = {0, 0, 0};   /* Result string */

  /* Open a token cursor on the document. Read all tokens up to and
  ** including token iPos (the first token of the snippet). Set variable
  ** iStart to the byte offset in zDoc of the start of token iPos.
  */
  pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
  while( rc==SQLITE_OK && iCurrent<iPos ){
    rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iStart, &DUMMY2, &iCurrent);
  }
  iEnd = iStart;

  if( rc==SQLITE_OK && iStart>0 ){
    rc = fts3StringAppend(&res, zEllipsis, -1);
  }

  while( rc==SQLITE_OK ){
    int iBegin;
    int iFin;
    rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);

    if( rc==SQLITE_OK ){
      if( iCurrent>=(iPos+nSnippet) ){
        rc = SQLITE_DONE;
      }else{
        iEnd = iFin;
        if( hlmask & ((u64)1 << (iCurrent-iPos)) ){
          if( fts3StringAppend(&res, &zDoc[iStart], iBegin-iStart)
           || fts3StringAppend(&res, zOpen, -1)
           || fts3StringAppend(&res, &zDoc[iBegin], iEnd-iBegin)
           || fts3StringAppend(&res, zClose, -1)
          ){
            rc = SQLITE_NOMEM;
          }
          iStart = iEnd;
        }
      }
    }
  }
  assert( rc!=SQLITE_OK );
  if( rc==SQLITE_DONE ){
    rc = fts3StringAppend(&res, &zDoc[iStart], iEnd-iStart);
    if( rc==SQLITE_OK ){
      rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
      if( rc==SQLITE_OK ){
        rc = fts3StringAppend(&res, zEllipsis, -1);
      }else if( rc==SQLITE_DONE ){
        rc = fts3StringAppend(&res, &zDoc[iEnd], -1);
      }
    }
  }

  pMod->xClose(pC);
  if( rc!=SQLITE_OK ){
    sqlite3_free(res.z);
  }else{
    *pzSnippet = res.z;
  }
  return rc;
}


/*
** An instance of this structure is used to collect the 'global' part of
** the matchinfo statistics. The 'global' part consists of the following:
**
**   1. The number of phrases in the query (nPhrase).
**
**   2. The number of columns in the FTS3 table (nCol).
**
**   3. A matrix of (nPhrase*nCol) integers containing the sum of the
**      number of hits for each phrase in each column across all rows
**      of the table.
**
** The total size of the global matchinfo array, assuming the number of
** columns is N and the number of phrases is P is:
**
**   2 + P*(N+1)
**
** The number of hits for the 3rd phrase in the second column is found
** using the expression:
**
**   aGlobal[2 + P*(1+2) + 1]
*/
typedef struct MatchInfo MatchInfo;
struct MatchInfo {
  Fts3Table *pTab;                /* FTS3 Table */
  Fts3Cursor *pCursor;            /* FTS3 Cursor */
  int iPhrase;                    /* Number of phrases so far */
  int nCol;                       /* Number of columns in table */
  u32 *aGlobal;                   /* Pre-allocated buffer */
};

/*
** This function is used to count the entries in a column-list (delta-encoded
** list of term offsets within a single column of a single row).
*/
static int fts3ColumnlistCount(char **ppCollist){
  char *pEnd = *ppCollist;
  char c = 0;
  int nEntry = 0;

  /* A column-list is terminated by either a 0x01 or 0x00. */
  while( 0xFE & (*pEnd | c) ){
    c = *pEnd++ & 0x80;
    if( !c ) nEntry++;
  }

  *ppCollist = pEnd;
  return nEntry;
}

static void fts3LoadColumnlistCounts(char **pp, u32 *aOut){
  char *pCsr = *pp;
  while( *pCsr ){
    sqlite3_int64 iCol = 0;
    if( *pCsr==0x01 ){
      pCsr++;
      pCsr += sqlite3Fts3GetVarint(pCsr, &iCol);
    }
    aOut[iCol] += fts3ColumnlistCount(&pCsr);
  }
  pCsr++;
  *pp = pCsr;
}

/*
** fts3ExprIterate() callback used to collect the "global" matchinfo stats
** for a single query.
*/
static int fts3ExprGlobalMatchinfoCb(
  Fts3Expr *pExpr,                /* Phrase expression node */
  void *pCtx                      /* Pointer to MatchInfo structure */
){
  MatchInfo *p = (MatchInfo *)pCtx;
  char *pCsr;
  char *pEnd;
  const int iStart = 2 + p->nCol*p->iPhrase;

  assert( pExpr->isLoaded );

  /* Fill in the global hit count matrix row for this phrase. */
  pCsr = pExpr->aDoclist;
  pEnd = &pExpr->aDoclist[pExpr->nDoclist];
  while( pCsr<pEnd ){
    while( *pCsr++ & 0x80 );
    fts3LoadColumnlistCounts(&pCsr, &p->aGlobal[iStart]);
  }

  p->iPhrase++;
  return SQLITE_OK;
}

static int fts3ExprLocalMatchinfoCb(
  Fts3Expr *pExpr,                /* Phrase expression node */
  void *pCtx                      /* Pointer to MatchInfo structure */
){
  MatchInfo *p = (MatchInfo *)pCtx;
  int iPhrase = p->iPhrase++;

  if( pExpr->aDoclist ){
    char *pCsr;
    int iOffset = 2 + p->nCol*(p->aGlobal[0]+iPhrase);

    memset(&p->aGlobal[iOffset], 0, p->nCol*sizeof(u32));
    pCsr = sqlite3Fts3FindPositions(pExpr, p->pCursor->iPrevId, -1);
    if( pCsr ) fts3LoadColumnlistCounts(&pCsr, &p->aGlobal[iOffset]);
  }

  return SQLITE_OK;
}

/*
** Populate pCsr->aMatchinfo[] with data for the current row. The 'matchinfo'
** data is an array of 32-bit unsigned integers (C type u32).
*/
static int fts3GetMatchinfo(Fts3Cursor *pCsr){
  MatchInfo g;
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  if( pCsr->aMatchinfo==0 ){
    int rc;
    int nPhrase;
    int nMatchinfo;

    g.pTab = pTab;
    g.nCol = pTab->nColumn;
    g.iPhrase = 0;
    rc = fts3ExprLoadDoclists(pCsr, &nPhrase);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    nMatchinfo = 2 + 2*g.nCol*nPhrase;

    g.iPhrase = 0;
    g.aGlobal = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo);
    if( !g.aGlobal ){
      return SQLITE_NOMEM;
    }
    memset(g.aGlobal, 0, sizeof(u32)*nMatchinfo);

    g.aGlobal[0] = nPhrase;
    g.aGlobal[1] = g.nCol;
    (void)fts3ExprIterate(pCsr->pExpr, fts3ExprGlobalMatchinfoCb, (void *)&g);

    pCsr->aMatchinfo = g.aGlobal;
  }

  g.pTab = pTab;
  g.pCursor = pCsr;
  g.nCol = pTab->nColumn;
  g.iPhrase = 0;
  g.aGlobal = pCsr->aMatchinfo;

  if( pCsr->isMatchinfoOk ){
    (void)fts3ExprIterate(pCsr->pExpr, fts3ExprLocalMatchinfoCb, (void *)&g);
    pCsr->isMatchinfoOk = 0;
  }

  return SQLITE_OK;
}

void sqlite3Fts3Snippet2(
  sqlite3_context *pCtx,          /* SQLite function call context */
  Fts3Cursor *pCsr,               /* Cursor object */
  const char *zStart,             /* Snippet start text - "<b>" */
  const char *zEnd,               /* Snippet end text - "</b>" */
  const char *zEllipsis,          /* Snippet ellipsis text - "<b>...</b>" */
  int iCol,                       /* Extract snippet from this column */
  int nToken                      /* Approximate number of tokens in snippet */
){
  int rc;
  int iPos = 0;
  u64 hlmask = 0;
  char *z = 0;
  int nDoc;
  const char *zDoc;

  rc = fts3BestSnippet(nToken, pCsr, iCol, &iPos, &hlmask);

  nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1);
  zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1);

  if( rc==SQLITE_OK ){
    rc = fts3SnippetText(
        pCsr, zDoc, nDoc, nToken, iPos, hlmask, zStart, zEnd, zEllipsis, &z);
  }
  if( rc!=SQLITE_OK ){
    sqlite3_result_error_code(pCtx, rc);
  }else{
    sqlite3_result_text(pCtx, z, -1, sqlite3_free);
  }
}

void sqlite3Fts3Matchinfo(sqlite3_context *pContext, Fts3Cursor *pCsr){
  int rc = fts3GetMatchinfo(pCsr);
  if( rc!=SQLITE_OK ){
    sqlite3_result_error_code(pContext, rc);
  }else{
    int n = sizeof(u32)*(2+pCsr->aMatchinfo[0]*pCsr->aMatchinfo[1]*2);
    sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT);
  }
}

#endif