mirror of
https://github.com/sqlite/sqlite.git
synced 2025-07-30 19:03:16 +03:00
The FTS1 tables have a new automatic column named "offset" that returns
a string containing byte offset information for all matching terms. Also added a large test case based on SQLite mailing list entries. (CVS 3417) FossilOrigin-Name: f25cfa1aec0e4c1fe07176039a1b7f4e6a2c66ec
This commit is contained in:
297
ext/fts1/fts1.c
297
ext/fts1/fts1.c
@ -41,19 +41,28 @@ SQLITE_EXTENSION_INIT1
|
||||
/* utility functions */
|
||||
|
||||
typedef struct StringBuffer {
|
||||
int len; /* length, not including null terminator */
|
||||
char *s;
|
||||
int len; /* length, not including null terminator */
|
||||
int alloced; /* Space allocated for s[] */
|
||||
char *s; /* Content of the string */
|
||||
} StringBuffer;
|
||||
|
||||
void initStringBuffer(StringBuffer *sb){
|
||||
sb->len = 0;
|
||||
sb->s = malloc(1);
|
||||
sb->alloced = 100;
|
||||
sb->s = malloc(100);
|
||||
sb->s[0] = '\0';
|
||||
}
|
||||
|
||||
void append(StringBuffer *sb, const char *zFrom){
|
||||
int nFrom = strlen(zFrom);
|
||||
sb->s = realloc(sb->s, sb->len + nFrom + 1);
|
||||
if( sb->len + nFrom >= sb->alloced ){
|
||||
sb->alloced = sb->len + nFrom + 100;
|
||||
sb->s = realloc(sb->s, sb->alloced+1);
|
||||
if( sb->s==0 ){
|
||||
initStringBuffer(sb);
|
||||
return;
|
||||
}
|
||||
}
|
||||
strcpy(sb->s + sb->len, zFrom);
|
||||
sb->len += nFrom;
|
||||
}
|
||||
@ -845,6 +854,7 @@ typedef struct fulltext_vtab fulltext_vtab;
|
||||
*/
|
||||
typedef struct QueryTerm {
|
||||
short int nPhrase; /* How many following terms are part of the same phrase */
|
||||
short int iPhrase; /* This is the i-th term of a phrase. */
|
||||
short int iColumn; /* Column of the index that must match this term */
|
||||
signed char isOr; /* this term is preceded by "OR" */
|
||||
signed char isNot; /* this term is preceded by "-" */
|
||||
@ -891,6 +901,24 @@ typedef struct Query {
|
||||
} Query;
|
||||
|
||||
|
||||
/*
|
||||
** An instance of the following structure keeps track of generated
|
||||
** matching-word offset information and snippets.
|
||||
*/
|
||||
typedef struct Snippet {
|
||||
int nMatch; /* Total number of matches */
|
||||
int nAlloc; /* Space allocated for aMatch[] */
|
||||
struct { /* One entry for each matching term */
|
||||
int iCol; /* The column that contains the match */
|
||||
int iTerm; /* The index in Query.pTerms[] of the matching term */
|
||||
int iStart; /* The offset to the first character of the term */
|
||||
int nByte; /* Number of bytes in the term */
|
||||
} *aMatch; /* Points to space obtained from malloc */
|
||||
char *zOffset; /* Text rendering of aMatch[] */
|
||||
int nOffset; /* strlen(zOffset) */
|
||||
} Snippet;
|
||||
|
||||
|
||||
typedef enum QueryType {
|
||||
QUERY_GENERIC, /* table scan */
|
||||
QUERY_ROWID, /* lookup by rowid */
|
||||
@ -974,10 +1002,12 @@ struct fulltext_vtab {
|
||||
*/
|
||||
typedef struct fulltext_cursor {
|
||||
sqlite3_vtab_cursor base; /* Base class used by SQLite core */
|
||||
QueryType iCursorType; /* Type of cursor */
|
||||
QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
|
||||
sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
|
||||
int eof; /* True if at End Of Results */
|
||||
Query q; /* Parsed query string */
|
||||
Snippet snippet; /* Cached snippet for the current row */
|
||||
int iColumn; /* Column being searched */
|
||||
DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */
|
||||
} fulltext_cursor;
|
||||
|
||||
@ -1601,7 +1631,6 @@ typedef struct TableSpec {
|
||||
char **azColumn; /* Original names of columns to be indexed */
|
||||
char *zColumnList; /* Comma-separated list of names for %_content */
|
||||
char **azTokenizer; /* Name of tokenizer and its arguments */
|
||||
char **azDelimiter; /* Delimiters used for snippets */
|
||||
} TableSpec;
|
||||
|
||||
/*
|
||||
@ -1611,7 +1640,6 @@ void clearTableSpec(TableSpec *p) {
|
||||
free(p->azColumn);
|
||||
free(p->zColumnList);
|
||||
free(p->azTokenizer);
|
||||
free(p->azDelimiter);
|
||||
}
|
||||
|
||||
/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
|
||||
@ -1627,7 +1655,6 @@ int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){
|
||||
char *z, *zDummy;
|
||||
char **azArg;
|
||||
const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
|
||||
const char *zDelimiter = 0; /* argv[] entry describing the delimiters */
|
||||
|
||||
assert( argc>=3 );
|
||||
/* Current interface:
|
||||
@ -1664,13 +1691,10 @@ int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){
|
||||
pSpec->nColumn = 0;
|
||||
pSpec->azColumn = azArg;
|
||||
zTokenizer = "tokenize simple";
|
||||
zDelimiter = "delimiters('[',']','...')";
|
||||
n = 0;
|
||||
for(i=3, j=0; i<argc; ++i){
|
||||
if( startsWith(azArg[i],"tokenize") ){
|
||||
zTokenizer = azArg[i];
|
||||
}else if( startsWith(azArg[i],"delimiters") ){
|
||||
zDelimiter = azArg[i];
|
||||
}else{
|
||||
z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
|
||||
pSpec->nColumn++;
|
||||
@ -1716,12 +1740,6 @@ int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){
|
||||
pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
|
||||
tokenListToIdList(pSpec->azTokenizer);
|
||||
|
||||
/*
|
||||
** Parse the delimiter specification string.
|
||||
*/
|
||||
pSpec->azDelimiter = tokenizeString(zDelimiter, &n);
|
||||
tokenListToIdList(pSpec->azDelimiter);
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
@ -1730,7 +1748,7 @@ int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){
|
||||
** the virtual table. Return a pointer to this schema.
|
||||
**
|
||||
** If the addAllColumn parameter is true, then add a column named
|
||||
** "_all" to the end of the schema.
|
||||
** "_all" to the end of the schema. Also add the "offset" column.
|
||||
**
|
||||
** Space is obtained from sqlite3_mprintf() and should be freed
|
||||
** using sqlite3_free().
|
||||
@ -1749,7 +1767,7 @@ static char *fulltextSchema(
|
||||
zSchema = zNext;
|
||||
zSep = ",";
|
||||
}
|
||||
zNext = sqlite3_mprintf("%s,_all)", zSchema);
|
||||
zNext = sqlite3_mprintf("%s,_all,offset)", zSchema);
|
||||
sqlite3_free(zSchema);
|
||||
return zNext;
|
||||
}
|
||||
@ -1976,6 +1994,169 @@ static void queryClear(Query *q){
|
||||
memset(q, 0, sizeof(*q));
|
||||
}
|
||||
|
||||
/* Free all of the dynamically allocated memory held by the
|
||||
** Snippet
|
||||
*/
|
||||
static void snippetClear(Snippet *p){
|
||||
free(p->aMatch);
|
||||
free(p->zOffset);
|
||||
memset(p, 0, sizeof(*p));
|
||||
}
|
||||
/*
|
||||
** Append a single entry to the p->aMatch[] log.
|
||||
*/
|
||||
static void snippetAppendMatch(
|
||||
Snippet *p, /* Append the entry to this snippet */
|
||||
int iCol, int iTerm, /* The column and query term */
|
||||
int iStart, int nByte /* Offset and size of the match */
|
||||
){
|
||||
int i;
|
||||
if( p->nMatch+1>=p->nAlloc ){
|
||||
p->nAlloc = p->nAlloc*2 + 10;
|
||||
p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
|
||||
if( p->aMatch==0 ){
|
||||
p->nMatch = 0;
|
||||
p->nAlloc = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
i = p->nMatch++;
|
||||
p->aMatch[i].iCol = iCol;
|
||||
p->aMatch[i].iTerm = iTerm;
|
||||
p->aMatch[i].iStart = iStart;
|
||||
p->aMatch[i].nByte = nByte;
|
||||
}
|
||||
|
||||
/*
|
||||
** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
|
||||
*/
|
||||
#define FTS1_ROTOR_SZ (32)
|
||||
#define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)
|
||||
|
||||
/*
|
||||
** Add entries to pSnippet->aMatch[] for every match that occurs against
|
||||
** document zDoc[0..nDoc-1] which is stored in column iColumn.
|
||||
*/
|
||||
static void snippetOffsetsOfColumn(
|
||||
Query *pQuery,
|
||||
Snippet *pSnippet,
|
||||
int iColumn,
|
||||
const char *zDoc,
|
||||
int nDoc
|
||||
){
|
||||
const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
|
||||
sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
|
||||
sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
|
||||
fulltext_vtab *pVtab; /* The full text index */
|
||||
int nColumn; /* Number of columns in the index */
|
||||
const QueryTerm *aTerm; /* Query string terms */
|
||||
int nTerm; /* Number of query string terms */
|
||||
int i, j; /* Loop counters */
|
||||
int rc; /* Return code */
|
||||
unsigned int match, prevMatch; /* Phrase search bitmasks */
|
||||
const char *zToken; /* Next token from the tokenizer */
|
||||
int nToken; /* Size of zToken */
|
||||
int iBegin, iEnd, iPos; /* Offsets of beginning and end */
|
||||
|
||||
/* The following variables keep a circular buffer of the last
|
||||
** few tokens */
|
||||
unsigned int iRotor = 0; /* Index of current token */
|
||||
int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */
|
||||
int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */
|
||||
|
||||
pVtab = pQuery->pFts;
|
||||
nColumn = pVtab->nColumn;
|
||||
pTokenizer = pVtab->pTokenizer;
|
||||
pTModule = pTokenizer->pModule;
|
||||
rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
|
||||
if( rc ) return;
|
||||
pTCursor->pTokenizer = pTokenizer;
|
||||
aTerm = pQuery->pTerms;
|
||||
nTerm = pQuery->nTerms;
|
||||
if( nTerm>=FTS1_ROTOR_SZ ){
|
||||
nTerm = FTS1_ROTOR_SZ - 1;
|
||||
}
|
||||
prevMatch = 0;
|
||||
while(1){
|
||||
rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
|
||||
if( rc ) break;
|
||||
iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
|
||||
iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
|
||||
match = 0;
|
||||
for(i=0; i<nTerm; i++){
|
||||
int iCol;
|
||||
iCol = aTerm[i].iColumn;
|
||||
if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
|
||||
if( aTerm[i].nTerm!=nToken ) continue;
|
||||
if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
|
||||
if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
|
||||
match |= 1<<i;
|
||||
if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
|
||||
for(j=aTerm[i].iPhrase-1; j>=0; j--){
|
||||
int k = (iRotor-j) & FTS1_ROTOR_MASK;
|
||||
snippetAppendMatch(pSnippet, iColumn, i-j,
|
||||
iRotorBegin[k], iRotorLen[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
prevMatch = match<<1;
|
||||
iRotor++;
|
||||
}
|
||||
pTModule->xClose(pTCursor);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
** Compute all offsets for the current row of the query.
|
||||
** If the offsets have already been computed, this routine is a no-op.
|
||||
*/
|
||||
static void snippetAllOffsets(fulltext_cursor *p){
|
||||
int nColumn;
|
||||
int iColumn, i;
|
||||
int iFirst, iLast;
|
||||
fulltext_vtab *pFts;
|
||||
|
||||
if( p->snippet.nMatch ) return;
|
||||
if( p->q.nTerms==0 ) return;
|
||||
pFts = p->q.pFts;
|
||||
nColumn = pFts->nColumn;
|
||||
iColumn = p->iCursorType;
|
||||
if( iColumn<0 || iColumn>=nColumn ){
|
||||
iFirst = 0;
|
||||
iLast = nColumn-1;
|
||||
}else{
|
||||
iFirst = iColumn;
|
||||
iLast = iColumn;
|
||||
}
|
||||
for(i=iFirst; i<=iLast; i++){
|
||||
const char *zDoc;
|
||||
int nDoc;
|
||||
zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
|
||||
nDoc = sqlite3_column_bytes(p->pStmt, i+1);
|
||||
snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
** Convert the information in the aMatch[] array of the snippet
|
||||
** into the string zOffset[0..nOffset-1].
|
||||
*/
|
||||
static void snippetOffsetText(Snippet *p){
|
||||
int i;
|
||||
StringBuffer sb;
|
||||
char zBuf[200];
|
||||
if( p->zOffset ) return;
|
||||
initStringBuffer(&sb);
|
||||
for(i=0; i<p->nMatch; i++){
|
||||
zBuf[0] = ' ';
|
||||
sprintf(&zBuf[i>0], "%d %d %d %d", p->aMatch[i].iCol,
|
||||
p->aMatch[i].iTerm, p->aMatch[i].iStart, p->aMatch[i].nByte);
|
||||
append(&sb, zBuf);
|
||||
}
|
||||
p->zOffset = sb.s;
|
||||
p->nOffset = sb.len;
|
||||
}
|
||||
|
||||
/*
|
||||
** Close the cursor. For additional information see the documentation
|
||||
** on the xClose method of the virtual table interface.
|
||||
@ -1985,6 +2166,7 @@ static int fulltextClose(sqlite3_vtab_cursor *pCursor){
|
||||
TRACE(("FTS1 Close %p\n", c));
|
||||
sqlite3_finalize(c->pStmt);
|
||||
queryClear(&c->q);
|
||||
snippetClear(&c->snippet);
|
||||
if( c->result.pDoclist!=NULL ){
|
||||
docListDelete(c->result.pDoclist);
|
||||
}
|
||||
@ -1998,6 +2180,7 @@ static int fulltextNext(sqlite3_vtab_cursor *pCursor){
|
||||
int rc;
|
||||
|
||||
TRACE(("FTS1 Next %p\n", pCursor));
|
||||
snippetClear(&c->snippet);
|
||||
if( c->iCursorType < QUERY_FULLTEXT ){
|
||||
/* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
|
||||
rc = sqlite3_step(c->pStmt);
|
||||
@ -2132,6 +2315,7 @@ static int tokenizeSegment(
|
||||
sqlite3_tokenizer_cursor *pCursor;
|
||||
int firstIndex = pQuery->nTerms;
|
||||
int iCol;
|
||||
int nTerm = 1;
|
||||
|
||||
int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
|
||||
if( rc!=SQLITE_OK ) return rc;
|
||||
@ -2160,6 +2344,10 @@ static int tokenizeSegment(
|
||||
if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
|
||||
pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
|
||||
}
|
||||
pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
|
||||
if( inPhrase ){
|
||||
nTerm++;
|
||||
}
|
||||
}
|
||||
|
||||
if( inPhrase && pQuery->nTerms>firstIndex ){
|
||||
@ -2356,28 +2544,46 @@ out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* This is the xEof method of the virtual table. The SQLite core
|
||||
** calls this routine to find out if it has reached the end of
|
||||
** a query's results set.
|
||||
*/
|
||||
static int fulltextEof(sqlite3_vtab_cursor *pCursor){
|
||||
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
||||
return c->eof;
|
||||
}
|
||||
|
||||
/* This is the xColumn method of the virtual table. The SQLite
|
||||
** core calls this method during a query when it needs the value
|
||||
** of a column from the virtual table. This method needs to use
|
||||
** one of the sqlite3_result_*() routines to store the requested
|
||||
** value back in the pContext.
|
||||
*/
|
||||
static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
|
||||
sqlite3_context *pContext, int idxCol){
|
||||
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
||||
fulltext_vtab *v = cursor_vtab(c);
|
||||
const char *s;
|
||||
|
||||
if( idxCol==v->nColumn ){ /* a request for _all */
|
||||
if( idxCol<v->nColumn ){
|
||||
sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
|
||||
sqlite3_result_value(pContext, pVal);
|
||||
}else if( idxCol==v->nColumn ){
|
||||
/* The _all column */
|
||||
sqlite3_result_null(pContext);
|
||||
} else {
|
||||
assert( idxCol<v->nColumn );
|
||||
s = (const char *) sqlite3_column_text(c->pStmt, idxCol+1);
|
||||
sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT);
|
||||
}else if( idxCol==v->nColumn+1 ){
|
||||
/* The offset column */
|
||||
snippetAllOffsets(c);
|
||||
snippetOffsetText(&c->snippet);
|
||||
sqlite3_result_text(pContext, c->snippet.zOffset, c->snippet.nOffset,
|
||||
SQLITE_STATIC);
|
||||
}
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/* This is the xRowid method. The SQLite core calls this routine to
|
||||
** retrive the rowid for the current row of the result set. The
|
||||
** rowid should be written to *pRowid.
|
||||
*/
|
||||
static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
|
||||
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
||||
|
||||
@ -2589,27 +2795,34 @@ static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
|
||||
|
||||
/* ppArg[1] = rowid
|
||||
* ppArg[2..2+v->nColumn-1] = values
|
||||
* ppArg[2+v->nColumn] = value for _all (we ignore this) */
|
||||
assert( nArg==2+v->nColumn+1);
|
||||
* ppArg[2+v->nColumn] = value for _all (we ignore this)
|
||||
* ppArg[3+v->nColumn] = value of offset (we ignore this too)
|
||||
*/
|
||||
assert( nArg==2+v->nColumn+2);
|
||||
|
||||
return index_insert(v, ppArg[1], &ppArg[2], pRowid);
|
||||
}
|
||||
|
||||
static const sqlite3_module fulltextModule = {
|
||||
0,
|
||||
fulltextCreate,
|
||||
fulltextConnect,
|
||||
fulltextBestIndex,
|
||||
fulltextDisconnect,
|
||||
fulltextDestroy,
|
||||
fulltextOpen,
|
||||
fulltextClose,
|
||||
fulltextFilter,
|
||||
fulltextNext,
|
||||
fulltextEof,
|
||||
fulltextColumn,
|
||||
fulltextRowid,
|
||||
fulltextUpdate
|
||||
/* iVersion */ 0,
|
||||
/* xCreate */ fulltextCreate,
|
||||
/* xConnect */ fulltextConnect,
|
||||
/* xBestIndex */ fulltextBestIndex,
|
||||
/* xDisconnect */ fulltextDisconnect,
|
||||
/* xDestroy */ fulltextDestroy,
|
||||
/* xOpen */ fulltextOpen,
|
||||
/* xClose */ fulltextClose,
|
||||
/* xFilter */ fulltextFilter,
|
||||
/* xNext */ fulltextNext,
|
||||
/* xEof */ fulltextEof,
|
||||
/* xColumn */ fulltextColumn,
|
||||
/* xRowid */ fulltextRowid,
|
||||
/* xUpdate */ fulltextUpdate,
|
||||
/* xBegin */ 0,
|
||||
/* xSync */ 0,
|
||||
/* xCommit */ 0,
|
||||
/* xRollback */ 0,
|
||||
/* xFindFunction */ 0,
|
||||
};
|
||||
|
||||
int sqlite3Fts1Init(sqlite3 *db){
|
||||
|
Reference in New Issue
Block a user