From f800e3e63a859f6c7bcc5b6ac5c8759c3f3e1c39 Mon Sep 17 00:00:00 2001 From: drh Date: Thu, 14 Sep 2006 01:17:30 +0000 Subject: [PATCH] The FTS1 tables have a new automatic column named "offset" that returns a string containing byte offset information for all matching terms. Also added a large test case based on SQLite mailing list entries. (CVS 3417) FossilOrigin-Name: f25cfa1aec0e4c1fe07176039a1b7f4e6a2c66ec --- ext/fts1/fts1.c | 297 +++++++++++++++++++++++++++++++++++++++++------- manifest | 12 +- manifest.uuid | 2 +- 3 files changed, 262 insertions(+), 49 deletions(-) diff --git a/ext/fts1/fts1.c b/ext/fts1/fts1.c index 69a3feac60..3485c18820 100644 --- a/ext/fts1/fts1.c +++ b/ext/fts1/fts1.c @@ -41,19 +41,28 @@ SQLITE_EXTENSION_INIT1 /* utility functions */ typedef struct StringBuffer { - int len; /* length, not including null terminator */ - char *s; + int len; /* length, not including null terminator */ + int alloced; /* Space allocated for s[] */ + char *s; /* Content of the string */ } StringBuffer; void initStringBuffer(StringBuffer *sb){ sb->len = 0; - sb->s = malloc(1); + sb->alloced = 100; + sb->s = malloc(100); sb->s[0] = '\0'; } void append(StringBuffer *sb, const char *zFrom){ int nFrom = strlen(zFrom); - sb->s = realloc(sb->s, sb->len + nFrom + 1); + if( sb->len + nFrom >= sb->alloced ){ + sb->alloced = sb->len + nFrom + 100; + sb->s = realloc(sb->s, sb->alloced+1); + if( sb->s==0 ){ + initStringBuffer(sb); + return; + } + } strcpy(sb->s + sb->len, zFrom); sb->len += nFrom; } @@ -845,6 +854,7 @@ typedef struct fulltext_vtab fulltext_vtab; */ typedef struct QueryTerm { short int nPhrase; /* How many following terms are part of the same phrase */ + short int iPhrase; /* This is the i-th term of a phrase. */ short int iColumn; /* Column of the index that must match this term */ signed char isOr; /* this term is preceded by "OR" */ signed char isNot; /* this term is preceded by "-" */ @@ -891,6 +901,24 @@ typedef struct Query { } Query; +/* +** An instance of the following structure keeps track of generated +** matching-word offset information and snippets. +*/ +typedef struct Snippet { + int nMatch; /* Total number of matches */ + int nAlloc; /* Space allocated for aMatch[] */ + struct { /* One entry for each matching term */ + int iCol; /* The column that contains the match */ + int iTerm; /* The index in Query.pTerms[] of the matching term */ + int iStart; /* The offset to the first character of the term */ + int nByte; /* Number of bytes in the term */ + } *aMatch; /* Points to space obtained from malloc */ + char *zOffset; /* Text rendering of aMatch[] */ + int nOffset; /* strlen(zOffset) */ +} Snippet; + + typedef enum QueryType { QUERY_GENERIC, /* table scan */ QUERY_ROWID, /* lookup by rowid */ @@ -974,10 +1002,12 @@ struct fulltext_vtab { */ typedef struct fulltext_cursor { sqlite3_vtab_cursor base; /* Base class used by SQLite core */ - QueryType iCursorType; /* Type of cursor */ + QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */ sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */ int eof; /* True if at End Of Results */ Query q; /* Parsed query string */ + Snippet snippet; /* Cached snippet for the current row */ + int iColumn; /* Column being searched */ DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */ } fulltext_cursor; @@ -1601,7 +1631,6 @@ typedef struct TableSpec { char **azColumn; /* Original names of columns to be indexed */ char *zColumnList; /* Comma-separated list of names for %_content */ char **azTokenizer; /* Name of tokenizer and its arguments */ - char **azDelimiter; /* Delimiters used for snippets */ } TableSpec; /* @@ -1611,7 +1640,6 @@ void clearTableSpec(TableSpec *p) { free(p->azColumn); free(p->zColumnList); free(p->azTokenizer); - free(p->azDelimiter); } /* Parse a CREATE VIRTUAL TABLE statement, which looks like this: @@ -1627,7 +1655,6 @@ int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){ char *z, *zDummy; char **azArg; const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */ - const char *zDelimiter = 0; /* argv[] entry describing the delimiters */ assert( argc>=3 ); /* Current interface: @@ -1664,13 +1691,10 @@ int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){ pSpec->nColumn = 0; pSpec->azColumn = azArg; zTokenizer = "tokenize simple"; - zDelimiter = "delimiters('[',']','...')"; n = 0; for(i=3, j=0; inColumn] = firstToken(azArg[i], &zDummy); pSpec->nColumn++; @@ -1716,12 +1740,6 @@ int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){ pSpec->azTokenizer = tokenizeString(zTokenizer, &n); tokenListToIdList(pSpec->azTokenizer); - /* - ** Parse the delimiter specification string. - */ - pSpec->azDelimiter = tokenizeString(zDelimiter, &n); - tokenListToIdList(pSpec->azDelimiter); - return SQLITE_OK; } @@ -1730,7 +1748,7 @@ int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, char**pzErr){ ** the virtual table. Return a pointer to this schema. ** ** If the addAllColumn parameter is true, then add a column named -** "_all" to the end of the schema. +** "_all" to the end of the schema. Also add the "offset" column. ** ** Space is obtained from sqlite3_mprintf() and should be freed ** using sqlite3_free(). @@ -1749,7 +1767,7 @@ static char *fulltextSchema( zSchema = zNext; zSep = ","; } - zNext = sqlite3_mprintf("%s,_all)", zSchema); + zNext = sqlite3_mprintf("%s,_all,offset)", zSchema); sqlite3_free(zSchema); return zNext; } @@ -1976,6 +1994,169 @@ static void queryClear(Query *q){ memset(q, 0, sizeof(*q)); } +/* Free all of the dynamically allocated memory held by the +** Snippet +*/ +static void snippetClear(Snippet *p){ + free(p->aMatch); + free(p->zOffset); + memset(p, 0, sizeof(*p)); +} +/* +** Append a single entry to the p->aMatch[] log. +*/ +static void snippetAppendMatch( + Snippet *p, /* Append the entry to this snippet */ + int iCol, int iTerm, /* The column and query term */ + int iStart, int nByte /* Offset and size of the match */ +){ + int i; + if( p->nMatch+1>=p->nAlloc ){ + p->nAlloc = p->nAlloc*2 + 10; + p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); + if( p->aMatch==0 ){ + p->nMatch = 0; + p->nAlloc = 0; + return; + } + } + i = p->nMatch++; + p->aMatch[i].iCol = iCol; + p->aMatch[i].iTerm = iTerm; + p->aMatch[i].iStart = iStart; + p->aMatch[i].nByte = nByte; +} + +/* +** Sizing information for the circular buffer used in snippetOffsetsOfColumn() +*/ +#define FTS1_ROTOR_SZ (32) +#define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1) + +/* +** Add entries to pSnippet->aMatch[] for every match that occurs against +** document zDoc[0..nDoc-1] which is stored in column iColumn. +*/ +static void snippetOffsetsOfColumn( + Query *pQuery, + Snippet *pSnippet, + int iColumn, + const char *zDoc, + int nDoc +){ + const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */ + sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */ + sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */ + fulltext_vtab *pVtab; /* The full text index */ + int nColumn; /* Number of columns in the index */ + const QueryTerm *aTerm; /* Query string terms */ + int nTerm; /* Number of query string terms */ + int i, j; /* Loop counters */ + int rc; /* Return code */ + unsigned int match, prevMatch; /* Phrase search bitmasks */ + const char *zToken; /* Next token from the tokenizer */ + int nToken; /* Size of zToken */ + int iBegin, iEnd, iPos; /* Offsets of beginning and end */ + + /* The following variables keep a circular buffer of the last + ** few tokens */ + unsigned int iRotor = 0; /* Index of current token */ + int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */ + int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */ + + pVtab = pQuery->pFts; + nColumn = pVtab->nColumn; + pTokenizer = pVtab->pTokenizer; + pTModule = pTokenizer->pModule; + rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor); + if( rc ) return; + pTCursor->pTokenizer = pTokenizer; + aTerm = pQuery->pTerms; + nTerm = pQuery->nTerms; + if( nTerm>=FTS1_ROTOR_SZ ){ + nTerm = FTS1_ROTOR_SZ - 1; + } + prevMatch = 0; + while(1){ + rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos); + if( rc ) break; + iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin; + iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin; + match = 0; + for(i=0; i=0 && iCol1 && (prevMatch & (1<=0; j--){ + int k = (iRotor-j) & FTS1_ROTOR_MASK; + snippetAppendMatch(pSnippet, iColumn, i-j, + iRotorBegin[k], iRotorLen[k]); + } + } + } + prevMatch = match<<1; + iRotor++; + } + pTModule->xClose(pTCursor); +} + + +/* +** Compute all offsets for the current row of the query. +** If the offsets have already been computed, this routine is a no-op. +*/ +static void snippetAllOffsets(fulltext_cursor *p){ + int nColumn; + int iColumn, i; + int iFirst, iLast; + fulltext_vtab *pFts; + + if( p->snippet.nMatch ) return; + if( p->q.nTerms==0 ) return; + pFts = p->q.pFts; + nColumn = pFts->nColumn; + iColumn = p->iCursorType; + if( iColumn<0 || iColumn>=nColumn ){ + iFirst = 0; + iLast = nColumn-1; + }else{ + iFirst = iColumn; + iLast = iColumn; + } + for(i=iFirst; i<=iLast; i++){ + const char *zDoc; + int nDoc; + zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); + nDoc = sqlite3_column_bytes(p->pStmt, i+1); + snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); + } +} + +/* +** Convert the information in the aMatch[] array of the snippet +** into the string zOffset[0..nOffset-1]. +*/ +static void snippetOffsetText(Snippet *p){ + int i; + StringBuffer sb; + char zBuf[200]; + if( p->zOffset ) return; + initStringBuffer(&sb); + for(i=0; inMatch; i++){ + zBuf[0] = ' '; + sprintf(&zBuf[i>0], "%d %d %d %d", p->aMatch[i].iCol, + p->aMatch[i].iTerm, p->aMatch[i].iStart, p->aMatch[i].nByte); + append(&sb, zBuf); + } + p->zOffset = sb.s; + p->nOffset = sb.len; +} + /* ** Close the cursor. For additional information see the documentation ** on the xClose method of the virtual table interface. @@ -1985,6 +2166,7 @@ static int fulltextClose(sqlite3_vtab_cursor *pCursor){ TRACE(("FTS1 Close %p\n", c)); sqlite3_finalize(c->pStmt); queryClear(&c->q); + snippetClear(&c->snippet); if( c->result.pDoclist!=NULL ){ docListDelete(c->result.pDoclist); } @@ -1998,6 +2180,7 @@ static int fulltextNext(sqlite3_vtab_cursor *pCursor){ int rc; TRACE(("FTS1 Next %p\n", pCursor)); + snippetClear(&c->snippet); if( c->iCursorType < QUERY_FULLTEXT ){ /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ rc = sqlite3_step(c->pStmt); @@ -2132,6 +2315,7 @@ static int tokenizeSegment( sqlite3_tokenizer_cursor *pCursor; int firstIndex = pQuery->nTerms; int iCol; + int nTerm = 1; int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor); if( rc!=SQLITE_OK ) return rc; @@ -2160,6 +2344,10 @@ static int tokenizeSegment( if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){ pQuery->pTerms[pQuery->nTerms-1].isNot = 1; } + pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm; + if( inPhrase ){ + nTerm++; + } } if( inPhrase && pQuery->nTerms>firstIndex ){ @@ -2356,28 +2544,46 @@ out: return rc; } +/* This is the xEof method of the virtual table. The SQLite core +** calls this routine to find out if it has reached the end of +** a query's results set. +*/ static int fulltextEof(sqlite3_vtab_cursor *pCursor){ fulltext_cursor *c = (fulltext_cursor *) pCursor; return c->eof; } +/* This is the xColumn method of the virtual table. The SQLite +** core calls this method during a query when it needs the value +** of a column from the virtual table. This method needs to use +** one of the sqlite3_result_*() routines to store the requested +** value back in the pContext. +*/ static int fulltextColumn(sqlite3_vtab_cursor *pCursor, sqlite3_context *pContext, int idxCol){ fulltext_cursor *c = (fulltext_cursor *) pCursor; fulltext_vtab *v = cursor_vtab(c); - const char *s; - if( idxCol==v->nColumn ){ /* a request for _all */ + if( idxColnColumn ){ + sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1); + sqlite3_result_value(pContext, pVal); + }else if( idxCol==v->nColumn ){ + /* The _all column */ sqlite3_result_null(pContext); - } else { - assert( idxColnColumn ); - s = (const char *) sqlite3_column_text(c->pStmt, idxCol+1); - sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); + }else if( idxCol==v->nColumn+1 ){ + /* The offset column */ + snippetAllOffsets(c); + snippetOffsetText(&c->snippet); + sqlite3_result_text(pContext, c->snippet.zOffset, c->snippet.nOffset, + SQLITE_STATIC); } - return SQLITE_OK; } +/* This is the xRowid method. The SQLite core calls this routine to +** retrive the rowid for the current row of the result set. The +** rowid should be written to *pRowid. +*/ static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ fulltext_cursor *c = (fulltext_cursor *) pCursor; @@ -2589,27 +2795,34 @@ static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, /* ppArg[1] = rowid * ppArg[2..2+v->nColumn-1] = values - * ppArg[2+v->nColumn] = value for _all (we ignore this) */ - assert( nArg==2+v->nColumn+1); + * ppArg[2+v->nColumn] = value for _all (we ignore this) + * ppArg[3+v->nColumn] = value of offset (we ignore this too) + */ + assert( nArg==2+v->nColumn+2); return index_insert(v, ppArg[1], &ppArg[2], pRowid); } static const sqlite3_module fulltextModule = { - 0, - fulltextCreate, - fulltextConnect, - fulltextBestIndex, - fulltextDisconnect, - fulltextDestroy, - fulltextOpen, - fulltextClose, - fulltextFilter, - fulltextNext, - fulltextEof, - fulltextColumn, - fulltextRowid, - fulltextUpdate + /* iVersion */ 0, + /* xCreate */ fulltextCreate, + /* xConnect */ fulltextConnect, + /* xBestIndex */ fulltextBestIndex, + /* xDisconnect */ fulltextDisconnect, + /* xDestroy */ fulltextDestroy, + /* xOpen */ fulltextOpen, + /* xClose */ fulltextClose, + /* xFilter */ fulltextFilter, + /* xNext */ fulltextNext, + /* xEof */ fulltextEof, + /* xColumn */ fulltextColumn, + /* xRowid */ fulltextRowid, + /* xUpdate */ fulltextUpdate, + /* xBegin */ 0, + /* xSync */ 0, + /* xCommit */ 0, + /* xRollback */ 0, + /* xFindFunction */ 0, }; int sqlite3Fts1Init(sqlite3 *db){ diff --git a/manifest b/manifest index 1febb2d0d9..920421fdc1 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Modify\sthe\s".dump"\scommand\sin\sthe\scommand-line\sshell\sso\sthat\sit\sworks\nwith\svirtual\stables.\s(CVS\s3416) -D 2006-09-13T20:22:02 +C The\sFTS1\stables\shave\sa\snew\sautomatic\scolumn\snamed\s"offset"\sthat\sreturns\na\sstring\scontaining\sbyte\soffset\sinformation\sfor\sall\smatching\sterms.\nAlso\sadded\sa\slarge\stest\scase\sbased\son\sSQLite\smailing\slist\sentries.\s(CVS\s3417) +D 2006-09-14T01:17:31 F Makefile.in cabd42d34340f49260bc2a7668c38eba8d4cfd99 F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -21,7 +21,7 @@ F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1 F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5 -F ext/fts1/fts1.c 997a09e0a83390815f10e43b7545c8f61b4c7a15 +F ext/fts1/fts1.c 9fcad5a187e62928a7a573eb09522c66a68c016b F ext/fts1/fts1.h 6060b8f62c1d925ea8356cb1a6598073eb9159a6 F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114 F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089 @@ -398,7 +398,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513 -P f4ab546b2e8105422fb1baa2b86e688b5d19f20e -R 652a526cbce7142a0518bfc34577283d +P afd40184b752f641b423ceffac2476f2cfbdfd31 +R ca773c981e9d66672176c6dfb66c4d51 U drh -Z cf3bcfae5cca273a32c198de20c140a4 +Z a0cf419bdc2fb2e2cc1249db2fd8f262 diff --git a/manifest.uuid b/manifest.uuid index 8a0d98dcd7..fc7c30426c 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -afd40184b752f641b423ceffac2476f2cfbdfd31 \ No newline at end of file +f25cfa1aec0e4c1fe07176039a1b7f4e6a2c66ec \ No newline at end of file