diff --git a/ext/fts5/fts5_index.c b/ext/fts5/fts5_index.c index 4d22c41467..a7394a84e5 100644 --- a/ext/fts5/fts5_index.c +++ b/ext/fts5/fts5_index.c @@ -81,7 +81,7 @@ ** + for each segment from oldest to newest: ** + segment id (always > 0) ** + b-tree height (1 -> root is leaf, 2 -> root is parent of leaf etc.) -** + first leaf page number (often 1) +** + first leaf page number (often 1, always greater than 0) ** + final leaf page number ** ** 2. The Averages Record: @@ -4049,6 +4049,39 @@ int sqlite3Fts5IndexClose(Fts5Index *p, int bDestroy){ return rc; } +/* +** Argument p points to a buffer containing utf-8 text that is nByte bytes in +** size. Return the number of bytes in the nChar character prefix of the +** buffer, or 0 if there are fewer than nChar characters in total. +*/ +static int fts5IndexCharlenToBytelen(const char *p, int nByte, int nChar){ + int n = 0; + int i; + for(i=0; i=nByte ) return 0; /* Input contains fewer than nChar chars */ + if( (unsigned char)p[n++]>=0xc0 ){ + while( (p[n] & 0xc0)==0x80 ) n++; + } + } + return n; +} + +/* +** pIn is a UTF-8 encoded string, nIn bytes in size. Return the number of +** unicode characters in the string. +*/ +int fts5IndexCharlen(const char *pIn, int nIn){ + int nChar = 0; + int i = 0; + while( i=0xc0 ){ + while( inPrefix; iIdx++){ - int n = ((iIdx==pConfig->nPrefix) ? nTerm : pConfig->aPrefix[iIdx]); - if( n<=nTerm ){ - ret ^= fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, n); + ret = fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, nTerm); + + for(iIdx=0; iIdxnPrefix; iIdx++){ + int nByte = fts5IndexCharlenToBytelen(pTerm, nTerm, pConfig->aPrefix[iIdx]); + if( nByte ){ + ret ^= fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, nByte); } } @@ -4107,8 +4142,9 @@ int sqlite3Fts5IndexWrite( ** prefix hash tables that it is large enough for. 
*/ fts5AddTermToHash(p, 0, iCol, iPos, pToken, nToken); for(i=0; inPrefix; i++){ - if( nToken>=pConfig->aPrefix[i] ){ - fts5AddTermToHash(p, i+1, iCol, iPos, pToken, pConfig->aPrefix[i]); + int nByte = fts5IndexCharlenToBytelen(pToken, nToken, pConfig->aPrefix[i]); + if( nByte ){ + fts5AddTermToHash(p, i+1, iCol, iPos, pToken, nByte); } } @@ -4130,8 +4166,9 @@ int sqlite3Fts5IndexQuery( if( flags & FTS5INDEX_QUERY_PREFIX ){ Fts5Config *pConfig = p->pConfig; + int nChar = fts5IndexCharlen(pToken, nToken); for(iIdx=1; iIdx<=pConfig->nPrefix; iIdx++){ - if( pConfig->aPrefix[iIdx-1]==nToken ) break; + if( pConfig->aPrefix[iIdx-1]==nChar ) break; } if( iIdx>pConfig->nPrefix ){ iIdx = -1; @@ -4602,6 +4639,55 @@ static void fts5DecodeFunction( fts5BufferFree(&s); } +/* +** The implementation of user-defined scalar function fts5_rowid(): maps ('segment', idx, segid, height, pgno) or ('start-of-index', idx) to a rowid via FTS5_SEGMENT_ROWID(). +*/ +static void fts5RowidFunction( + sqlite3_context *pCtx, /* Function call context */ + int nArg, /* Number of args (2 or 5 expected) */ + sqlite3_value **apVal /* Function arguments */ +){ + const char *zArg; + if( nArg==0 ){ + sqlite3_result_error(pCtx, "should be: fts5_rowid(subject, ....)", -1); + }else{ + zArg = (const char*)sqlite3_value_text(apVal[0]); + if( 0==sqlite3_stricmp(zArg, "segment") ){ + i64 iRowid; + int idx, segid, height, pgno; + if( nArg!=5 ){ + sqlite3_result_error(pCtx, + "should be: fts5_rowid('segment', idx, segid, height, pgno))", -1 + ); + }else{ + idx = sqlite3_value_int(apVal[1]); + segid = sqlite3_value_int(apVal[2]); + height = sqlite3_value_int(apVal[3]); + pgno = sqlite3_value_int(apVal[4]); + iRowid = FTS5_SEGMENT_ROWID(idx, segid, height, pgno); + sqlite3_result_int64(pCtx, iRowid); + } + }else if( 0==sqlite3_stricmp(zArg, "start-of-index") ){ + i64 iRowid; + int idx; + if( nArg!=2 ){ + sqlite3_result_error(pCtx, + "should be: fts5_rowid('start-of-index', idx)", -1 + ); + }else{ + idx = sqlite3_value_int(apVal[1]); + iRowid = FTS5_SEGMENT_ROWID(idx, 1, 0, 0); + sqlite3_result_int64(pCtx, iRowid); + } + }else 
{ + sqlite3_result_error(pCtx, + "first arg to fts5_rowid() must be 'segment' " + "or 'start-of-index' ..." + , -1 + ); + } + } +} /* ** This is called as part of registering the FTS5 module with database @@ -4615,6 +4701,11 @@ int sqlite3Fts5IndexInit(sqlite3 *db){ int rc = sqlite3_create_function( db, "fts5_decode", 2, SQLITE_UTF8, 0, fts5DecodeFunction, 0, 0 ); + if( rc==SQLITE_OK ){ + rc = sqlite3_create_function( + db, "fts5_rowid", -1, SQLITE_UTF8, 0, fts5RowidFunction, 0, 0 + ); + } return rc; } diff --git a/ext/fts5/test/fts5prefix.test b/ext/fts5/test/fts5prefix.test new file mode 100644 index 0000000000..44c21a744c --- /dev/null +++ b/ext/fts5/test/fts5prefix.test @@ -0,0 +1,60 @@ +# 2015 Jan 13 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# +# + +source [file join [file dirname [info script]] fts5_common.tcl] +set testprefix fts5prefix + + +#------------------------------------------------------------------------- +# Check that prefix indexes really do index n-character prefixes, not +# n-byte prefixes. Use the ascii tokenizer so as not to be confused by +# diacritic removal. 
+# +do_execsql_test 1.0 { + CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = ascii, prefix = 2) +} + +do_test 1.2 { + foreach {rowid string} { + 1 "\xCA\xCB\xCC\xCD" + 2 "\u1234\u5678\u4321\u8765" + } { + execsql { INSERT INTO t1(rowid, x) VALUES($rowid, $string) } + } +} {} + +do_execsql_test 1.1.2 { + INSERT INTO t1(t1) VALUES('integrity-check'); +} + +#db eval { select fts5_decode(id, block) AS d FROM t1_data; } { puts $d } + +foreach o {1 2} { + if {$o==2} breakpoint + foreach {tn q res} { + 1 "SELECT rowid FROM t1 WHERE t1 MATCH '\xCA\xCB*'" 1 + 2 "SELECT rowid FROM t1 WHERE t1 MATCH '\u1234\u5678*'" 2 + } { + do_execsql_test 1.$o.$tn $q $res + } + + execsql { + DELETE FROM t1_data WHERE + rowid>=fts5_rowid('start-of-index', 0) AND + rowid