1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-08-01 06:27:03 +03:00

Add an experimental unicode-aware trigram tokenizer to fts5, along with support for LIKE and GLOB optimizations on fts5 tables that use that tokenizer.

FossilOrigin-Name: 0d7810c1aea93c0a3da1ccc4911dbce8a1b6e1dbfe1ab7e800289a0c783b5985
This commit is contained in:
dan
2020-09-30 20:35:37 +00:00
parent fad4dd0fd1
commit 33a99fad08
9 changed files with 385 additions and 62 deletions

View File

@ -184,6 +184,7 @@ struct Fts5Config {
Fts5Tokenizer *pTok;
fts5_tokenizer *pTokApi;
int bLock; /* True when table is preparing statement */
int ePattern; /* FTS_PATTERN_XXX constant */
/* Values loaded from the %_config table */
int iCookie; /* Incremented when %_config is modified */
@ -204,17 +205,19 @@ struct Fts5Config {
};
/* Current expected value of %_config table 'version' field */
#define FTS5_CURRENT_VERSION 4
#define FTS5_CURRENT_VERSION 4
#define FTS5_CONTENT_NORMAL 0
#define FTS5_CONTENT_NONE 1
#define FTS5_CONTENT_EXTERNAL 2
#define FTS5_DETAIL_FULL 0
#define FTS5_DETAIL_NONE 1
#define FTS5_DETAIL_COLUMNS 2
#define FTS5_DETAIL_FULL 0
#define FTS5_DETAIL_NONE 1
#define FTS5_DETAIL_COLUMNS 2
#define FTS5_PATTERN_NONE 0
#define FTS5_PATTERN_LIKE 65 /* matches SQLITE_INDEX_CONSTRAINT_LIKE */
#define FTS5_PATTERN_GLOB 66 /* matches SQLITE_INDEX_CONSTRAINT_GLOB */
int sqlite3Fts5ConfigParse(
Fts5Global*, sqlite3*, int, const char **, Fts5Config**, char**
@ -554,8 +557,7 @@ int sqlite3Fts5GetTokenizer(
Fts5Global*,
const char **azArg,
int nArg,
Fts5Tokenizer**,
fts5_tokenizer**,
Fts5Config*,
char **pzErr
);
@ -797,6 +799,10 @@ int sqlite3Fts5AuxInit(fts5_api*);
*/
int sqlite3Fts5TokenizerInit(fts5_api*);
int sqlite3Fts5TokenizerPattern(
int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
Fts5Tokenizer *pTok
);
/*
** End of interface to code in fts5_tokenizer.c.
**************************************************************************/

View File

@ -325,7 +325,7 @@ static int fts5ConfigParseSpecial(
rc = SQLITE_ERROR;
}else{
rc = sqlite3Fts5GetTokenizer(pGlobal,
(const char**)azArg, (int)nArg, &pConfig->pTok, &pConfig->pTokApi,
(const char**)azArg, (int)nArg, pConfig,
pzErr
);
}
@ -397,9 +397,7 @@ static int fts5ConfigParseSpecial(
*/
static int fts5ConfigDefaultTokenizer(Fts5Global *pGlobal, Fts5Config *pConfig){
assert( pConfig->pTok==0 && pConfig->pTokApi==0 );
return sqlite3Fts5GetTokenizer(
pGlobal, 0, 0, &pConfig->pTok, &pConfig->pTokApi, 0
);
return sqlite3Fts5GetTokenizer(pGlobal, 0, 0, pConfig, 0);
}
/*

View File

@ -284,6 +284,66 @@ int sqlite3Fts5ExprNew(
return sParse.rc;
}
/*
** Build an FTS5 expression that can be used to pre-filter rows for a LIKE
** or GLOB pattern applied to column iCol.
**
** The pattern zText is split at each wildcard character ('_' and '%' when
** pConfig->ePattern is FTS5_PATTERN_LIKE, otherwise '*', '?' and "[...]"
** character classes). Each literal run of three or more characters is
** written out as a double-quoted phrase (embedded '"' characters doubled),
** and the resulting text is handed to sqlite3Fts5ExprNew().
**
** If zText is NULL, or contains no literal run of at least three
** characters, *pp is set to NULL and SQLITE_OK is returned. SQLITE_NOMEM
** is returned if the scratch buffer cannot be allocated. Otherwise the
** return code and *pp are whatever sqlite3Fts5ExprNew() produces.
*/
int sqlite3Fts5ExprPattern(
  Fts5Config *pConfig, int iCol, const char *zText, Fts5Expr **pp
){
  i64 nText;
  char *zExpr;
  int rc = SQLITE_OK;

  /* A NULL pattern (e.g. "col LIKE NULL") contributes no phrases. Handle
  ** it here rather than passing NULL to strlen(). */
  if( zText==0 ){
    *pp = 0;
    return SQLITE_OK;
  }

  /* Worst case each input byte is a '"' that is doubled, plus three bytes
  ** of quoting/separator per phrase, so 4 bytes per input byte suffices. */
  nText = strlen(zText);
  zExpr = (char*)sqlite3_malloc64(nText*4 + 1);
  if( zExpr==0 ){
    rc = SQLITE_NOMEM;
  }else{
    char aSpec[3];
    int iOut = 0;
    int i = 0;
    int iFirst = 0;

    if( pConfig->ePattern==FTS5_PATTERN_LIKE ){
      aSpec[0] = '_';
      aSpec[1] = '%';
      aSpec[2] = 0;
    }else{
      aSpec[0] = '*';
      aSpec[1] = '?';
      aSpec[2] = '[';
    }

    while( i<=nText ){
      if( i==nText
       || zText[i]==aSpec[0] || zText[i]==aSpec[1] || zText[i]==aSpec[2]
      ){
        /* Count characters, not bytes: a trigram needs three characters,
        ** and UTF-8 continuation bytes (10xxxxxx) do not start one. */
        int nChar = 0;
        int jj;
        for(jj=iFirst; jj<i; jj++){
          if( (((unsigned char)zText[jj]) & 0xC0)!=0x80 ) nChar++;
        }

        if( nChar>=3 ){
          zExpr[iOut++] = '"';
          for(jj=iFirst; jj<i; jj++){
            zExpr[iOut++] = zText[jj];
            if( zText[jj]=='"' ) zExpr[iOut++] = '"';
          }
          zExpr[iOut++] = '"';
          zExpr[iOut++] = ' ';
        }

        /* Skip over a GLOB "[...]" character class. The i<nText test keeps
        ** the nul terminator from matching aSpec[2]==0 in LIKE mode, which
        ** would otherwise read zText[nText+1]. */
        if( i<nText && zText[i]==aSpec[2] ){
          i += 2;
          if( zText[i-1]=='^' ) i++;
          while( i<nText && zText[i]!=']' ) i++;
        }
        iFirst = i+1;
      }
      i++;
    }

    if( iOut>0 ){
      zExpr[iOut] = '\0';
      rc = sqlite3Fts5ExprNew(pConfig, iCol, zExpr, pp, pConfig->pzErrmsg);
    }else{
      *pp = 0;
    }
    sqlite3_free(zExpr);
  }

  return rc;
}
/*
** Free the expression node object passed as the only argument.
*/

View File

@ -493,7 +493,9 @@ static void fts5SetUniqueFlag(sqlite3_index_info *pIdxInfo){
**
** Match against table column: "m"
** Match against rank column: "r"
** Match against other column: "<column-number>"
** Match against other column: "M<column-number>"
** LIKE against other column: "L<column-number>"
** GLOB against other column: "G<column-number>"
** Equality constraint against the rowid: "="
** A < or <= against the rowid: "<"
** A > or >= against the rowid: ">"
@ -554,7 +556,7 @@ static int fts5BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
return SQLITE_ERROR;
}
idxStr = (char*)sqlite3_malloc(pInfo->nConstraint * 6 + 1);
idxStr = (char*)sqlite3_malloc(pInfo->nConstraint * 8 + 1);
if( idxStr==0 ) return SQLITE_NOMEM;
pInfo->idxStr = idxStr;
pInfo->needToFreeIdxStr = 1;
@ -578,25 +580,29 @@ static int fts5BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
if( bSeenRank ) continue;
idxStr[iIdxStr++] = 'r';
bSeenRank = 1;
}else{
}else if( iCol>=0 ){
bSeenMatch = 1;
idxStr[iIdxStr++] = 'm';
if( iCol<nCol ){
sqlite3_snprintf(6, &idxStr[iIdxStr], "%d", iCol);
idxStr += strlen(&idxStr[iIdxStr]);
assert( idxStr[iIdxStr]=='\0' );
}
idxStr[iIdxStr++] = 'M';
sqlite3_snprintf(6, &idxStr[iIdxStr], "%d", iCol);
idxStr += strlen(&idxStr[iIdxStr]);
assert( idxStr[iIdxStr]=='\0' );
}
pInfo->aConstraintUsage[i].argvIndex = ++iCons;
pInfo->aConstraintUsage[i].omit = 1;
}
}
else if( p->usable && bSeenEq==0
&& p->op==SQLITE_INDEX_CONSTRAINT_EQ && iCol<0
){
idxStr[iIdxStr++] = '=';
bSeenEq = 1;
pInfo->aConstraintUsage[i].argvIndex = ++iCons;
}else if( p->usable ){
if( iCol>=0 && iCol<nCol && pConfig->ePattern==p->op ){
assert( p->op==FTS5_PATTERN_LIKE || p->op==FTS5_PATTERN_GLOB );
idxStr[iIdxStr++] = p->op==FTS5_PATTERN_LIKE ? 'L' : 'G';
sqlite3_snprintf(6, &idxStr[iIdxStr], "%d", iCol);
idxStr += strlen(&idxStr[iIdxStr]);
pInfo->aConstraintUsage[i].argvIndex = ++iCons;
assert( idxStr[iIdxStr]=='\0' );
}else if( bSeenEq==0 && p->op==SQLITE_INDEX_CONSTRAINT_EQ && iCol<0 ){
idxStr[iIdxStr++] = '=';
bSeenEq = 1;
pInfo->aConstraintUsage[i].argvIndex = ++iCons;
}
}
}
@ -1229,19 +1235,14 @@ static int fts5FilterMethod(
case 'r':
pRank = apVal[i];
break;
case 'm': {
case 'M': {
const char *zText = (const char*)sqlite3_value_text(apVal[i]);
if( zText==0 ) zText = "";
if( idxStr[iIdxStr]>='0' && idxStr[iIdxStr]<='9' ){
iCol = 0;
do{
iCol = iCol*10 + (idxStr[iIdxStr]-'0');
iIdxStr++;
}while( idxStr[iIdxStr]>='0' && idxStr[iIdxStr]<='9' );
}else{
iCol = pConfig->nCol;
}
iCol = 0;
do{
iCol = iCol*10 + (idxStr[iIdxStr]-'0');
iIdxStr++;
}while( idxStr[iIdxStr]>='0' && idxStr[iIdxStr]<='9' );
if( zText[0]=='*' ){
/* The user has issued a query of the form "MATCH '*...'". This
@ -1261,6 +1262,22 @@ static int fts5FilterMethod(
break;
}
case 'L':
case 'G': {
const char *zText = (const char*)sqlite3_value_text(apVal[i]);
iCol = 0;
do{
iCol = iCol*10 + (idxStr[iIdxStr]-'0');
iIdxStr++;
}while( idxStr[iIdxStr]>='0' && idxStr[iIdxStr]<='9' );
rc = sqlite3Fts5ExprPattern(pConfig, iCol, zText, &pExpr);
if( rc==SQLITE_OK ){
rc = sqlite3Fts5ExprAnd(&pCsr->pExpr, pExpr);
pExpr = 0;
}
if( rc!=SQLITE_OK ) goto filter_out;
break;
}
case '=':
pRowidEq = apVal[i];
break;
@ -2672,8 +2689,7 @@ int sqlite3Fts5GetTokenizer(
Fts5Global *pGlobal,
const char **azArg,
int nArg,
Fts5Tokenizer **ppTok,
fts5_tokenizer **ppTokApi,
Fts5Config *pConfig,
char **pzErr
){
Fts5TokenizerModule *pMod;
@ -2685,16 +2701,22 @@ int sqlite3Fts5GetTokenizer(
rc = SQLITE_ERROR;
*pzErr = sqlite3_mprintf("no such tokenizer: %s", azArg[0]);
}else{
rc = pMod->x.xCreate(pMod->pUserData, &azArg[1], (nArg?nArg-1:0), ppTok);
*ppTokApi = &pMod->x;
if( rc!=SQLITE_OK && pzErr ){
*pzErr = sqlite3_mprintf("error in tokenizer constructor");
rc = pMod->x.xCreate(
pMod->pUserData, &azArg[1], (nArg?nArg-1:0), &pConfig->pTok
);
pConfig->pTokApi = &pMod->x;
if( rc!=SQLITE_OK ){
if( pzErr ) *pzErr = sqlite3_mprintf("error in tokenizer constructor");
}else{
pConfig->ePattern = sqlite3Fts5TokenizerPattern(
pMod->x.xCreate, pConfig->pTok
);
}
}
if( rc!=SQLITE_OK ){
*ppTokApi = 0;
*ppTok = 0;
pConfig->pTokApi = 0;
pConfig->pTok = 0;
}
return rc;

View File

@ -1258,6 +1258,118 @@ static int fts5PorterTokenize(
);
}
/**************************************************************************
** Start of trigram implementation.
*/
/*
** State for one instance of the trigram tokenizer.
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
/* True to case-fold each character before emitting tokens. Defaults to 1;
** set from the "case_sensitive" option (0 -> fold, 1 -> do not fold). */
int bFold;
};
/*
** Release a trigram tokenizer object previously returned by
** fts5TriCreate(). The object is a single allocation.
*/
static void fts5TriDelete(Fts5Tokenizer *p){
  TrigramTokenizer *pTri = (TrigramTokenizer*)p;
  sqlite3_free(pTri);
}
/*
** Allocate a trigram tokenizer.
**
** azArg[0..nArg-1] is a list of option-name/value pairs. The only option
** recognized is "case_sensitive", whose value must be exactly "0" or "1".
** Case folding is enabled by default and disabled by "case_sensitive 1".
**
** On success SQLITE_OK is returned and *ppOut points to the new tokenizer.
** SQLITE_NOMEM is returned if the allocation fails, and SQLITE_ERROR if
** the options are malformed (unknown option, bad value, or an option name
** with no value). In either error case *ppOut is set to NULL.
*/
static int fts5TriCreate(
  void *pCtx,
  const char **azArg,
  int nArg,
  Fts5Tokenizer **ppOut
){
  int rc = SQLITE_OK;
  TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
  if( pNew==0 ){
    rc = SQLITE_NOMEM;
  }else{
    int i;
    pNew->bFold = 1;

    /* Options come in name/value pairs. Reject an odd count up front so
    ** that the loop below never reads azArg[nArg]. */
    if( nArg%2 ){
      rc = SQLITE_ERROR;
    }

    for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
      const char *zArg = azArg[i+1];
      if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
        if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
          rc = SQLITE_ERROR;
        }else{
          pNew->bFold = (zArg[0]=='0');
        }
      }else{
        rc = SQLITE_ERROR;
      }
    }

    if( rc!=SQLITE_OK ){
      fts5TriDelete((Fts5Tokenizer*)pNew);
      pNew = 0;
    }
  }
  *ppOut = (Fts5Tokenizer*)pNew;
  return rc;
}
/*
** Trigram tokenizer tokenize routine.
**
** Invokes xToken() once for every run of three consecutive characters in
** the nText-byte UTF-8 buffer pText, advancing by one character between
** tokens. If the tokenizer was created with case folding enabled, each
** character is folded with sqlite3Fts5UnicodeFold() before being written
** to the token. Inputs of fewer than three characters produce no tokens.
**
** The start and end offsets passed to xToken() are byte offsets into
** pText. Returns SQLITE_OK, or the first non-OK value returned by xToken().
*/
static int fts5TriTokenize(
  Fts5Tokenizer *pTok,
  void *pCtx,
  int flags,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int, int, int)
){
  TrigramTokenizer *p = (TrigramTokenizer*)pTok;
  int rc = SQLITE_OK;
  char aBuf[32];
  const unsigned char *zIn = (const unsigned char*)pText;
  const unsigned char *zEof = &zIn[nText];
  u32 iCode;

  /* Test zIn<zEof before the first READ_UTF8() so that empty input is
  ** never dereferenced. On later iterations zIn==zNext, which is always
  ** less than zEof when the loop continues. */
  while( zIn<zEof ){
    char *zOut = aBuf;
    int iStart = (int)(zIn - (const unsigned char*)pText);
    const unsigned char *zNext;

    /* First character. zNext records where the next trigram begins. */
    READ_UTF8(zIn, zEof, iCode);
    zNext = zIn;
    if( zIn<zEof ){
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
      WRITE_UTF8(zOut, iCode);
      READ_UTF8(zIn, zEof, iCode);
    }else{
      break;
    }

    /* Second and third characters. */
    if( zIn<zEof ){
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
      WRITE_UTF8(zOut, iCode);
      READ_UTF8(zIn, zEof, iCode);
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
      WRITE_UTF8(zOut, iCode);
    }else{
      break;
    }

    /* The end offset is taken from the input position, not the length of
    ** the output token: folding (or replacement of invalid sequences) may
    ** change the number of bytes a character occupies. */
    rc = xToken(pCtx, 0, aBuf, (int)(zOut-aBuf),
        iStart, (int)(zIn - (const unsigned char*)pText)
    );
    if( rc!=SQLITE_OK ) break;
    zIn = zNext;
  }

  return rc;
}
/*
** Return the FTS5_PATTERN_XXX constant describing which pattern operator
** can be optimized for a table using tokenizer pTok, which was created by
** the xCreate callback passed as the first argument.
**
** Only the trigram tokenizer supports this: FTS5_PATTERN_LIKE when it
** folds case, FTS5_PATTERN_GLOB when it does not. Any other tokenizer
** yields FTS5_PATTERN_NONE.
*/
int sqlite3Fts5TokenizerPattern(
  int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
  Fts5Tokenizer *pTok
){
  int ePattern = FTS5_PATTERN_NONE;
  if( xCreate==fts5TriCreate ){
    const TrigramTokenizer *pTri = (const TrigramTokenizer*)pTok;
    if( pTri->bFold ){
      ePattern = FTS5_PATTERN_LIKE;
    }else{
      ePattern = FTS5_PATTERN_GLOB;
    }
  }
  return ePattern;
}
/*
** Register all built-in tokenizers with FTS5.
*/
@ -1269,6 +1381,7 @@ int sqlite3Fts5TokenizerInit(fts5_api *pApi){
{ "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
{ "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
{ "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
{ "trigram", {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
};
int rc = SQLITE_OK; /* Return code */

View File

@ -31,7 +31,7 @@ do_eqp_test 1.1 {
} {
QUERY PLAN
|--SCAN TABLE t1
`--SCAN TABLE f1 VIRTUAL TABLE INDEX 0:m
`--SCAN TABLE f1 VIRTUAL TABLE INDEX 0:M1
}
do_eqp_test 1.2 {
@ -46,7 +46,7 @@ do_eqp_test 1.3 {
SELECT * FROM f1 WHERE f1 MATCH ? ORDER BY ff
} {
QUERY PLAN
|--SCAN TABLE f1 VIRTUAL TABLE INDEX 0:m
|--SCAN TABLE f1 VIRTUAL TABLE INDEX 0:M1
`--USE TEMP B-TREE FOR ORDER BY
}

View File

@ -0,0 +1,121 @@
# 2020 September 30
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#*************************************************************************
#
# Tests for the fts5 "trigram" tokenizer.
#
source [file join [file dirname [info script]] fts5_common.tcl]
ifcapable !fts5 { finish_test ; return }
set ::testprefix fts5trigram
# Create an fts5 table that uses the trigram tokenizer with default options
# and insert one ASCII row and one non-ASCII (multi-byte) row.
do_execsql_test 1.0 {
CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize=trigram);
INSERT INTO t1 VALUES('abcdefghijklm');
INSERT INTO t1 VALUES('กรุงเทพมหานคร');
}
# For each query string $s, run a full-text query against t1 and check the
# highlight() output $res, in which the matched substring is wrapped in
# parentheses. Cases 7-9 use mixed-case queries and expect the same matches
# as their lower-case counterparts.
foreach {tn s res} {
1 abc "(abc)defghijklm"
2 defgh "abc(defgh)ijklm"
3 abcdefghijklm "(abcdefghijklm)"
4 กรุ "(กรุ)งเทพมหานคร"
5 งเทพมห "กรุ(งเทพมห)านคร"
6 กรุงเทพมหานคร "(กรุงเทพมหานคร)"
7 Abc "(abc)defghijklm"
8 deFgh "abc(defgh)ijklm"
9 aBcdefGhijKlm "(abcdefghijklm)"
} {
do_execsql_test 1.1.$tn {
SELECT highlight(t1, 0, '(', ')') FROM t1($s)
} $res
}
# fts5_expr() with the trigram tokenizer: the four-character input 'ABCD'
# is expected to become the two lower-case trigrams "abc" + "bcd".
do_execsql_test 1.2.0 {
SELECT fts5_expr('ABCD', 'tokenize=trigram')
} {{"abc" + "bcd"}}
# A LIKE query with an ESCAPE clause and an unbound parameter. No expected
# result is supplied; presumably this only checks that the statement runs
# without error -- confirm against do_execsql_test's defaults.
do_execsql_test 1.2.1 {
SELECT * FROM t1 WHERE y LIKE ? ESCAPE 'a'
}
# LIKE queries against the case-folding trigram table. $like is the pattern
# and $res the expected list of matching rowids ({} means no rows).
foreach {tn like res} {
1 {%cDef%} 1
2 {cDef%} {}
3 {%f%} 1
4 {%f_h%} 1
5 {%f_g%} {}
6 {abc%klm} 1
7 {ABCDEFG%} 1
8 {%รุงเ%} 2
} {
do_execsql_test 1.3.$tn {
SELECT rowid FROM t1 WHERE y LIKE $like
} $res
}
#-------------------------------------------------------------------------
# Start from a fresh database and repeat the setup with the tokenizer
# option "case_sensitive 1".
reset_db
do_execsql_test 2.0 {
CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize="trigram case_sensitive 1");
INSERT INTO t1 VALUES('abcdefghijklm');
INSERT INTO t1 VALUES('กรุงเทพมหานคร');
}
# Same full-text queries as tests 1.1.*. With case_sensitive set, the
# mixed-case queries (cases 7-9) are expected to return no rows.
foreach {tn s res} {
1 abc "(abc)defghijklm"
2 defgh "abc(defgh)ijklm"
3 abcdefghijklm "(abcdefghijklm)"
4 กรุ "(กรุ)งเทพมหานคร"
5 งเทพมห "กรุ(งเทพมห)านคร"
6 กรุงเทพมหานคร "(กรุงเทพมหานคร)"
7 Abc ""
8 deFgh ""
9 aBcdefGhijKlm ""
} {
do_execsql_test 2.1.$tn {
SELECT highlight(t1, 0, '(', ')') FROM t1($s)
} $res
}
# LIKE queries against the case-sensitive trigram table. The patterns and
# expected rowids are the same as in tests 1.3.*.
foreach {tn like res} {
1 {%cDef%} 1
2 {cDef%} {}
3 {%f%} 1
4 {%f_h%} 1
5 {%f_g%} {}
6 {abc%klm} 1
7 {ABCDEFG%} 1
8 {%รุงเ%} 2
} {
do_execsql_test 2.2.$tn {
SELECT rowid FROM t1 WHERE y LIKE $like
} $res
}
# GLOB queries against the case-sensitive trigram table. Cases 9-12 cover
# "[...]" character classes, including a leading ']' and a '^' negation.
# $res is the expected list of matching rowids ({} means no rows).
foreach {tn like res} {
1 {*cdef*} 1
2 {cdef*} {}
3 {*f*} 1
4 {*f?h*} 1
5 {*f?g*} {}
6 {abc*klm} 1
7 {abcdefg*} 1
8 {*รุงเ*} 2
9 {abc[d]efg*} 1
10 {abc[]d]efg*} 1
11 {abc[^]d]efg*} {}
12 {abc[^]XYZ]efg*} 1
} {
do_execsql_test 2.3.$tn {
SELECT rowid FROM t1 WHERE y GLOB $like
} $res
}
finish_test