mirror of
https://github.com/sqlite/sqlite.git
synced 2025-12-24 14:17:58 +03:00
Improve the performance of the fts5 porter tokenizer implementation.
FossilOrigin-Name: 96ea600440de05ee663e71c3f0d0de2c64108bf9
This commit is contained in:
@@ -443,6 +443,7 @@ static int fts5UnicodeTokenize(
|
||||
rc = SQLITE_NOMEM;
|
||||
goto tokenize_done;
|
||||
}
|
||||
zOut = &aFold[zOut - p->aFold];
|
||||
memcpy(aFold, p->aFold, nFold);
|
||||
sqlite3_free(p->aFold);
|
||||
p->aFold = aFold;
|
||||
@@ -528,7 +529,7 @@ static int fts5PorterCreate(
|
||||
pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
|
||||
if( pRet ){
|
||||
memset(pRet, 0, sizeof(PorterTokenizer));
|
||||
rc = pApi->xFindTokenizer(pApi, "ascii", &pUserdata, &pRet->tokenizer);
|
||||
rc = pApi->xFindTokenizer(pApi, "unicode61", &pUserdata, &pRet->tokenizer);
|
||||
}else{
|
||||
rc = SQLITE_NOMEM;
|
||||
}
|
||||
@@ -666,6 +667,448 @@ static int fts5Porter_Vowel(char *zStem, int nStem){
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
***************************************************************************
|
||||
** GENERATED CODE STARTS HERE (mkportersteps.tcl)
|
||||
*/
|
||||
|
||||
/*
** Porter stemmer step 4: strip one of a fixed set of suffixes from the
** nBuf byte word in aBuf[], provided that the condition callback for the
** matching rule accepts the remaining stem.
**
** *pnBuf is the current length of the word. If a rule fires, *pnBuf is
** reduced by the length of the removed suffix. The bytes of aBuf[] are
** never modified by this step. A suffix is only considered when the word
** is strictly longer than the suffix. Within each case the rules are tried
** in order and only the first matching suffix is considered, even if its
** condition then rejects the stem.
**
** The return value is always 0.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Rules are dispatched on the second-to-last character. A word of fewer
  ** than two bytes cannot match any rule, and indexing aBuf[nBuf-2] would
  ** read before the start of the buffer. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      /* "ement" must be tried before "ment", and "ment" before "ent", so
      ** that the longest of the overlapping suffixes is the one matched. */
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      /* "ion" is the only rule in this step that uses a different
      ** condition callback. */
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
|
||||
|
||||
|
||||
/*
** Porter stemmer step 1B, second part: if the nBuf byte word in aBuf[]
** ends in "at", "bl" or "iz" (and is longer than two bytes), append an
** "e" so that it ends in "ate", "ble" or "ize" respectively.
**
** *pnBuf is the current length of the word. When a rule fires, *pnBuf is
** increased by one and 1 is returned. Otherwise the word and *pnBuf are
** left untouched and 0 is returned.
**
** The caller must ensure that aBuf[] has room for at least one byte past
** the current end of the word, as a firing rule writes to aBuf[nBuf].
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Rules are dispatched on the second-to-last character. A word of fewer
  ** than two bytes cannot match any rule, and indexing aBuf[nBuf-2] would
  ** read before the start of the buffer. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
|
||||
|
||||
|
||||
/*
** Porter stemmer step 2: replace one of a fixed set of suffixes of the
** nBuf byte word in aBuf[] with a shorter (or equal length) suffix,
** provided that fts5Porter_MGt0() accepts the stem that precedes the
** original suffix.
**
** *pnBuf is the current length of the word. If a rule fires, the
** replacement text is written over the start of the old suffix and *pnBuf
** is updated to the new length. A suffix is only considered when the word
** is strictly longer than the suffix. Within each case the rules are tried
** in order and only the first matching suffix is considered, even if its
** condition then rejects the stem.
**
** The return value is always 0.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Rules are dispatched on the second-to-last character. A word of fewer
  ** than two bytes cannot match any rule, and indexing aBuf[nBuf-2] would
  ** read before the start of the buffer. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      /* "ization" must be tried before "ation" so that the longer of the
      ** two overlapping suffixes is the one matched. */
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

  }
  return ret;
}
|
||||
|
||||
|
||||
/*
** Porter stemmer step 3: shorten or remove one of a fixed set of suffixes
** of the nBuf byte word in aBuf[], provided that fts5Porter_MGt0() accepts
** the stem that precedes the suffix.
**
** *pnBuf is the current length of the word. If a rule fires, any
** replacement text is written over the start of the old suffix and *pnBuf
** is updated to the new length. A suffix is only considered when the word
** is strictly longer than the suffix. Within each case the rules are tried
** in order and only the first matching suffix is considered, even if its
** condition then rejects the stem.
**
** The return value is always 0.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Rules are dispatched on the second-to-last character. A word of fewer
  ** than two bytes cannot match any rule, and indexing aBuf[nBuf-2] would
  ** read before the start of the buffer. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
|
||||
|
||||
|
||||
/*
** Porter stemmer step 1B, first part. Handles the suffixes "eed", "ed"
** and "ing" of the nBuf byte word in aBuf[]:
**
**   "eed" -> "ee"   if fts5Porter_MGt0() accepts the preceding stem
**   "ed"  -> ""     if fts5Porter_Vowel() accepts the preceding stem
**   "ing" -> ""     if fts5Porter_Vowel() accepts the preceding stem
**
** *pnBuf is the current length of the word and is updated if a rule
** fires. A suffix is only considered when the word is strictly longer
** than the suffix. A word ending in "eed" is never tested against the
** "ed" rule, even if the "eed" condition rejects the stem.
**
** Returns 1 if the "ed" or "ing" rule fired, which the caller uses to
** decide whether to run the follow-up step (fts5PorterStep1B2). Returns 0
** otherwise, including when the "eed" rule fired.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Rules are dispatched on the second-to-last character. A word of fewer
  ** than two bytes cannot match any rule, and indexing aBuf[nBuf-2] would
  ** read before the start of the buffer. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'e':
      if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ee", 2);
          *pnBuf = nBuf - 3 + 2;
        }
      }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_Vowel(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
          ret = 1;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_Vowel(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
          ret = 1;
        }
      }
      break;

  }
  return ret;
}
|
||||
|
||||
/*
|
||||
** GENERATED CODE ENDS HERE (mkportersteps.tcl)
|
||||
***************************************************************************
|
||||
**************************************************************************/
|
||||
|
||||
/*
** Porter stemmer step 1A: plural handling for the nBuf byte word in
** aBuf[]. Only *pnBuf is updated; the bytes of aBuf[] are not modified.
**
**   "sses" -> "ss"    (word longer than 4 bytes)
**   "ies"  -> "i"     (word longer than 3 bytes)
**   "ss"   -> "ss"    (unchanged)
**   "s"    -> ""      (any other word ending in 's')
**
** A word ending in "es" that matches neither of the first two rules loses
** only its final 's'.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;

  /* Both the last and second-to-last characters are inspected below, so
  ** words shorter than two bytes are left alone rather than reading
  ** before the start of the buffer. */
  if( nBuf<2 || aBuf[nBuf-1]!='s' ) return;

  if( aBuf[nBuf-2]=='e' ){
    if( (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s')
     || (nBuf>3 && aBuf[nBuf-3]=='i' )
    ){
      *pnBuf = nBuf-2;          /* "sses" -> "ss",  "ies" -> "i" */
    }else{
      *pnBuf = nBuf-1;          /* "...es" -> "...e" */
    }
  }else if( aBuf[nBuf-2]!='s' ){
    *pnBuf = nBuf-1;            /* "...s" -> "..." (but "ss" is kept) */
  }
}
|
||||
|
||||
static int fts5PorterCb(
|
||||
void *pCtx,
|
||||
const char *pToken,
|
||||
@@ -675,96 +1118,8 @@ static int fts5PorterCb(
|
||||
){
|
||||
PorterContext *p = (PorterContext*)pCtx;
|
||||
|
||||
PorterRule aStep1A[] = {
|
||||
{ "sses", 4, 0, "ss", 2 },
|
||||
{ "ies", 3, 0, "i", 1 },
|
||||
{ "ss", 2, 0, "ss", 2 },
|
||||
{ "s", 1, 0, "", 0 },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
PorterRule aStep1B[] = {
|
||||
{ "eed", 3, fts5Porter_MGt0, "ee", 2 },
|
||||
{ "ed", 2, fts5Porter_Vowel, "", 0 },
|
||||
{ "ing", 3, fts5Porter_Vowel, "", 0 },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
PorterRule aStep1B2[] = {
|
||||
{ "at", 2, 0, "ate", 3 },
|
||||
{ "bl", 2, 0, "ble", 3 },
|
||||
{ "iz", 2, 0, "ize", 3 },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
PorterRule aStep1C[] = {
|
||||
{ "y", 1, fts5Porter_Vowel, "i", 1 },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
PorterRule aStep2[] = {
|
||||
{ "ational", 7, fts5Porter_MGt0, "ate", 3},
|
||||
{ "tional", 6, fts5Porter_MGt0, "tion", 4},
|
||||
{ "enci", 4, fts5Porter_MGt0, "ence", 4},
|
||||
{ "anci", 4, fts5Porter_MGt0, "ance", 4},
|
||||
{ "izer", 4, fts5Porter_MGt0, "ize", 3},
|
||||
{ "logi", 4, fts5Porter_MGt0, "log", 3}, /* added post 1979 */
|
||||
{ "bli", 3, fts5Porter_MGt0, "ble", 3}, /* modified post 1979 */
|
||||
{ "alli", 4, fts5Porter_MGt0, "al", 2},
|
||||
{ "entli", 5, fts5Porter_MGt0, "ent", 3},
|
||||
{ "eli", 3, fts5Porter_MGt0, "e", 1},
|
||||
{ "ousli", 5, fts5Porter_MGt0, "ous", 3},
|
||||
{ "ization", 7, fts5Porter_MGt0, "ize", 3},
|
||||
{ "ation", 5, fts5Porter_MGt0, "ate", 3},
|
||||
{ "ator", 4, fts5Porter_MGt0, "ate", 3},
|
||||
{ "alism", 5, fts5Porter_MGt0, "al", 2},
|
||||
{ "iveness", 7, fts5Porter_MGt0, "ive", 3},
|
||||
{ "fulness", 7, fts5Porter_MGt0, "ful", 3},
|
||||
{ "ousness", 7, fts5Porter_MGt0, "ous", 3},
|
||||
{ "aliti", 5, fts5Porter_MGt0, "al", 2},
|
||||
{ "iviti", 5, fts5Porter_MGt0, "ive", 3},
|
||||
{ "biliti", 6, fts5Porter_MGt0, "ble", 3},
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
PorterRule aStep3[] = {
|
||||
{ "icate", 5, fts5Porter_MGt0, "ic", 2},
|
||||
{ "ative", 5, fts5Porter_MGt0, "", 0},
|
||||
{ "alize", 5, fts5Porter_MGt0, "al", 2},
|
||||
{ "iciti", 5, fts5Porter_MGt0, "ic", 2},
|
||||
{ "ical", 4, fts5Porter_MGt0, "ic", 2},
|
||||
{ "ful", 3, fts5Porter_MGt0, "", 0},
|
||||
{ "ness", 4, fts5Porter_MGt0, "", 0},
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
PorterRule aStep4[] = {
|
||||
{ "al", 2, fts5Porter_MGt1, "", 0},
|
||||
{ "ance", 4, fts5Porter_MGt1, "", 0},
|
||||
{ "ence", 4, fts5Porter_MGt1, "", 0},
|
||||
{ "er", 2, fts5Porter_MGt1, "", 0},
|
||||
{ "ic", 2, fts5Porter_MGt1, "", 0},
|
||||
{ "able", 4, fts5Porter_MGt1, "", 0},
|
||||
{ "ible", 4, fts5Porter_MGt1, "", 0},
|
||||
{ "ant", 3, fts5Porter_MGt1, "", 0},
|
||||
{ "ement", 5, fts5Porter_MGt1, "", 0},
|
||||
{ "ment", 4, fts5Porter_MGt1, "", 0},
|
||||
{ "ent", 3, fts5Porter_MGt1, "", 0},
|
||||
{ "ion", 3, fts5Porter_MGt1_and_S_or_T, "", 0},
|
||||
{ "ou", 2, fts5Porter_MGt1, "", 0},
|
||||
{ "ism", 3, fts5Porter_MGt1, "", 0},
|
||||
{ "ate", 3, fts5Porter_MGt1, "", 0},
|
||||
{ "iti", 3, fts5Porter_MGt1, "", 0},
|
||||
{ "ous", 3, fts5Porter_MGt1, "", 0},
|
||||
{ "ive", 3, fts5Porter_MGt1, "", 0},
|
||||
{ "ize", 3, fts5Porter_MGt1, "", 0},
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
|
||||
char *aBuf;
|
||||
int nBuf;
|
||||
int n;
|
||||
|
||||
if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
|
||||
aBuf = p->aBuf;
|
||||
@@ -772,10 +1127,9 @@ static int fts5PorterCb(
|
||||
memcpy(aBuf, pToken, nBuf);
|
||||
|
||||
/* Step 1. */
|
||||
fts5PorterApply(aBuf, &nBuf, aStep1A);
|
||||
n = fts5PorterApply(aBuf, &nBuf, aStep1B);
|
||||
if( n==1 || n==2 ){
|
||||
if( fts5PorterApply(aBuf, &nBuf, aStep1B2)<0 ){
|
||||
fts5PorterStep1A(aBuf, &nBuf);
|
||||
if( fts5PorterStep1B(aBuf, &nBuf) ){
|
||||
if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
|
||||
char c = aBuf[nBuf-1];
|
||||
if( fts5PorterIsVowel(c, 0)==0
|
||||
&& c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
|
||||
@@ -786,12 +1140,16 @@ static int fts5PorterCb(
|
||||
}
|
||||
}
|
||||
}
|
||||
fts5PorterApply(aBuf, &nBuf, aStep1C);
|
||||
|
||||
/* Step 1C. */
|
||||
if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
|
||||
aBuf[nBuf-1] = 'i';
|
||||
}
|
||||
|
||||
/* Steps 2 through 4. */
|
||||
fts5PorterApply(aBuf, &nBuf, aStep2);
|
||||
fts5PorterApply(aBuf, &nBuf, aStep3);
|
||||
fts5PorterApply(aBuf, &nBuf, aStep4);
|
||||
fts5PorterStep2(aBuf, &nBuf);
|
||||
fts5PorterStep3(aBuf, &nBuf);
|
||||
fts5PorterStep4(aBuf, &nBuf);
|
||||
|
||||
/* Step 5a. */
|
||||
if( nBuf>0 && aBuf[nBuf-1]=='e' ){
|
||||
|
||||
222
ext/fts5/mkportersteps.tcl
Normal file
222
ext/fts5/mkportersteps.tcl
Normal file
@@ -0,0 +1,222 @@
|
||||
#
|
||||
# 2014 Jun 09
|
||||
#
|
||||
# The author disclaims copyright to this source code. In place of
|
||||
# a legal notice, here is a blessing:
|
||||
#
|
||||
# May you do good and not evil.
|
||||
# May you find forgiveness for yourself and forgive others.
|
||||
# May you share freely, never taking more than you give.
|
||||
#
|
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
# This script generates the implementations of the following C functions,
|
||||
# which are part of the porter tokenizer implementation:
|
||||
#
|
||||
# static int fts5PorterStep1B(char *aBuf, int *pnBuf);
|
||||
# static int fts5PorterStep1B2(char *aBuf, int *pnBuf);
|
||||
# static int fts5PorterStep2(char *aBuf, int *pnBuf);
|
||||
# static int fts5PorterStep3(char *aBuf, int *pnBuf);
|
||||
# static int fts5PorterStep4(char *aBuf, int *pnBuf);
|
||||
#
|
||||
|
||||
# Rule tables, one array element per generated function. Each rule is a
# list of the form:
#
#     SUFFIX  CONDITION  REPLACEMENT  ?RETFLAG?
#
# SUFFIX is the word ending to match. CONDITION is the name of a C function
# called as CONDITION(aBuf, nStem), or "" if the rule is unconditional.
# REPLACEMENT is the text that replaces SUFFIX ("" to just remove it). If
# the optional RETFLAG is present and true, the generated function returns
# 1 when the rule fires.
#
# Rules that share the same second-to-last suffix character are emitted in
# the order they appear here, so longer overlapping suffixes must be listed
# before shorter ones.
#
set O(Step1B2) {
  { "at" "" "ate" 1 }
  { "bl" "" "ble" 1 }
  { "iz" "" "ize" 1 }
}

set O(Step1B) {
  { "eed" fts5Porter_MGt0  "ee" 0 }
  { "ed"  fts5Porter_Vowel ""   1 }
  { "ing" fts5Porter_Vowel ""   1 }
}

set O(Step2) {
  { "ational" fts5Porter_MGt0 "ate"  }
  { "tional"  fts5Porter_MGt0 "tion" }
  { "enci"    fts5Porter_MGt0 "ence" }
  { "anci"    fts5Porter_MGt0 "ance" }
  { "izer"    fts5Porter_MGt0 "ize"  }
  { "logi"    fts5Porter_MGt0 "log"  }
  { "bli"     fts5Porter_MGt0 "ble"  }
  { "alli"    fts5Porter_MGt0 "al"   }
  { "entli"   fts5Porter_MGt0 "ent"  }
  { "eli"     fts5Porter_MGt0 "e"    }
  { "ousli"   fts5Porter_MGt0 "ous"  }
  { "ization" fts5Porter_MGt0 "ize"  }
  { "ation"   fts5Porter_MGt0 "ate"  }
  { "ator"    fts5Porter_MGt0 "ate"  }
  { "alism"   fts5Porter_MGt0 "al"   }
  { "iveness" fts5Porter_MGt0 "ive"  }
  { "fulness" fts5Porter_MGt0 "ful"  }
  { "ousness" fts5Porter_MGt0 "ous"  }
  { "aliti"   fts5Porter_MGt0 "al"   }
  { "iviti"   fts5Porter_MGt0 "ive"  }
  { "biliti"  fts5Porter_MGt0 "ble"  }
}

set O(Step3) {
  { "icate" fts5Porter_MGt0 "ic" }
  { "ative" fts5Porter_MGt0 ""   }
  { "alize" fts5Porter_MGt0 "al" }
  { "iciti" fts5Porter_MGt0 "ic" }
  { "ical"  fts5Porter_MGt0 "ic" }
  { "ful"   fts5Porter_MGt0 ""   }
  { "ness"  fts5Porter_MGt0 ""   }
}

set O(Step4) {
  { "al"    fts5Porter_MGt1 "" }
  { "ance"  fts5Porter_MGt1 "" }
  { "ence"  fts5Porter_MGt1 "" }
  { "er"    fts5Porter_MGt1 "" }
  { "ic"    fts5Porter_MGt1 "" }
  { "able"  fts5Porter_MGt1 "" }
  { "ible"  fts5Porter_MGt1 "" }
  { "ant"   fts5Porter_MGt1 "" }
  { "ement" fts5Porter_MGt1 "" }
  { "ment"  fts5Porter_MGt1 "" }
  { "ent"   fts5Porter_MGt1 "" }
  { "ion"   fts5Porter_MGt1_and_S_or_T "" }
  { "ou"    fts5Porter_MGt1 "" }
  { "ism"   fts5Porter_MGt1 "" }
  { "ate"   fts5Porter_MGt1 "" }
  { "iti"   fts5Porter_MGt1 "" }
  { "ous"   fts5Porter_MGt1 "" }
  { "ive"   fts5Porter_MGt1 "" }
  { "ize"   fts5Porter_MGt1 "" }
}
|
||||
|
||||
# Comparison callback for use with [lsort -command]. Orders two rules by
# the second-to-last character of their suffix (element 0 of each rule).
# Returns -1, 0 or 1, as [string compare] does.
#
proc sort_cb {lhs rhs} {
  set lhsKey [string index [lindex $lhs 0] end-1]
  set rhsKey [string index [lindex $rhs 0] end-1]
  return [string compare $lhsKey $rhsKey]
}
|
||||
|
||||
# Write the C implementation of function fts5Porter${name}() to stdout.
#
# $data is a list of rules, each of the form:
#
#     SUFFIX  CONDITION  REPLACEMENT  ?RETFLAG?
#
# The generated function switches on the second-to-last character of the
# word. Within each case, the rules whose suffix has that second-to-last
# character are tested in the order they appear in $data, chained together
# with "else". Each test requires the word to be strictly longer than the
# suffix.
#
# The generated function returns early for words shorter than two bytes,
# as the switch expression aBuf[nBuf-2] would otherwise read before the
# start of the buffer.
#
proc create_step_function {name data} {

  # Skeleton of the generated function.
  set T(function) {
static int fts5Porter${name}(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){
    ${switchbody}
  }
  return ret;
}
  }

  # One "case" of the switch statement.
  set T(case) {
    case '${k}':
      ${ifstmts}
      break;
  }

  # Templates for a single rule. The three digits in each name are, in
  # order: 1 if the rule has a condition callback, 1 if it has replacement
  # text to copy in, and 1 if the function returns 1 when the rule fires.
  set T(if_0_0_0) {
      if( ${match} ){
        *pnBuf = nBuf - $n;
      }
  }
  set T(if_1_0_0) {
      if( ${match} ){
        if( ${cond} ){
          *pnBuf = nBuf - $n;
        }
      }
  }
  set T(if_0_1_0) {
      if( ${match} ){
        ${memcpy}
        *pnBuf = nBuf - $n + $nRep;
      }
  }
  set T(if_1_1_0) {
      if( ${match} ){
        if( ${cond} ){
          ${memcpy}
          *pnBuf = nBuf - $n + $nRep;
        }
      }
  }
  set T(if_1_0_1) {
      if( ${match} ){
        if( ${cond} ){
          *pnBuf = nBuf - $n;
          ret = 1;
        }
      }
  }
  set T(if_0_1_1) {
      if( ${match} ){
        ${memcpy}
        *pnBuf = nBuf - $n + $nRep;
        ret = 1;
      }
  }
  set T(if_1_1_1) {
      if( ${match} ){
        if( ${cond} ){
          ${memcpy}
          *pnBuf = nBuf - $n + $nRep;
          ret = 1;
        }
      }
  }

  set switchbody ""

  # Group the rules by the second-to-last character of the suffix,
  # preserving their relative order within each group.
  foreach I $data {
    set k [string range [lindex $I 0] end-1 end-1]
    lappend aCase($k) $I
  }

  foreach k [lsort [array names aCase]] {
    set ifstmts ""
    foreach I $aCase($k) {
      set zSuffix [lindex $I 0]         ;# Suffix text for this rule
      set zRep [lindex $I 2]            ;# Replacement text for rule
      set xCond [lindex $I 1]           ;# Condition callback (or "")

      set n [string length $zSuffix]
      set nRep [string length $zRep]

      # C fragments substituted into the rule template.
      set match "nBuf>$n && 0==memcmp(\"$zSuffix\", &aBuf\[nBuf-$n\], $n)"
      set memcpy "memcpy(&aBuf\[nBuf-$n\], \"$zRep\", $nRep);"
      set cond "${xCond}(aBuf, nBuf-$n)"

      # Select the template that matches the features this rule uses.
      set bMemcpy [expr {$nRep>0}]
      set bCond [expr {$xCond!=""}]
      set bRet [expr {[llength $I]>3 && [lindex $I 3]}]

      set t $T(if_${bCond}_${bMemcpy}_${bRet})
      lappend ifstmts [string trim [subst -nocommands $t]]
    }

    # Chain the rules for this case into if / else-if form.
    set ifstmts [join $ifstmts "else "]

    append switchbody [subst -nocommands $T(case)]
  }

  puts [subst -nocommands $T(function)]
}
|
||||
|
||||
|
||||
# Emit the opening marker, one function per element of array O, then the
# closing marker. The step names are sorted because the order in which
# [array names] returns them is not specified; sorting keeps the generated
# output identical from run to run.
#
puts [string trim {
/**************************************************************************
***************************************************************************
** GENERATED CODE STARTS HERE (mkportersteps.tcl)
*/
}]
foreach step [lsort [array names O]] {
  create_step_function $step $O($step)
}
puts [string trim {
/*
** GENERATED CODE ENDS HERE (mkportersteps.tcl)
***************************************************************************
**************************************************************************/
}]
|
||||
|
||||
|
||||
|
||||
71
ext/fts5/tool/loadfts5.tcl
Normal file
71
ext/fts5/tool/loadfts5.tcl
Normal file
@@ -0,0 +1,71 @@
|
||||
|
||||
|
||||
# Return the entire contents of file $f.
#
# The channel is closed on every path, including when [read] raises an
# error; in that case the original error is re-raised after the close. An
# error from [open] propagates unchanged.
#
proc loadfile {f} {
  set fd [open $f]
  if {[catch {read $fd} data]} {
    set info $::errorInfo
    set code $::errorCode
    close $fd
    error $data $info $code
  }
  close $fd
  return $data
}
|
||||
|
||||
set ::nRow 0
|
||||
# Recursively walk directory $dir. For each entry that is not a directory,
# insert one row into table t1 holding the entry's path and the value of
# the loadfile() SQL function for that path, and increment ::nRow.
#
# If ::O(limit) is non-zero, the loop over the current directory stops
# once ::nRow has reached the limit. The same check is made at the top of
# every iteration at every level of the recursion.
#
proc load_hierachy {dir} {
  foreach entry [glob -nocomplain -dir $dir *] {
    if {$::O(limit) && $::nRow>=$::O(limit)} break

    if {[file isdirectory $entry]} {
      load_hierachy $entry
      continue
    }

    db eval { INSERT INTO t1 VALUES($entry, loadfile($entry)) }
    incr ::nRow
  }
}
|
||||
|
||||
# Print a usage message listing every supported switch to stderr, then
# exit with status 1.
#
proc usage {} {
  puts stderr "Usage: $::argv0 ?SWITCHES? DATABASE PATH"
  puts stderr ""
  puts stderr "Switches are:"
  puts stderr "  -fts4        (use fts4 instead of fts5)"
  puts stderr "  -fts5        (use fts5)"
  puts stderr "  -porter      (use porter tokenizer)"
  puts stderr "  -limit N     (load no more than N documents)"
  exit 1
}
|
||||
|
||||
# Default option values: virtual table module, extra tokenizer clause for
# the CREATE VIRTUAL TABLE statement, and the document limit (0 means no
# limit).
set O(vtab)  fts5
set O(tok)   ""
set O(limit) 0

# The final two arguments are always DATABASE and PATH. Everything before
# them is a switch or the value of a switch.
if {[llength $argv]<2} usage
set nSwitch [expr {[llength $argv]-2}]
for {set i 0} {$i < $nSwitch} {incr i} {
  switch -- [lindex $argv $i] {
    -fts4 {
      set O(vtab) fts4
    }

    -fts5 {
      set O(vtab) fts5
    }

    -porter {
      set O(tok) ", tokenize=porter"
    }

    -limit {
      # The value must come from the switch portion of the command line,
      # never from the DATABASE or PATH arguments, and must be an integer
      # because it is later used in an arithmetic expression.
      incr i
      if {$i >= $nSwitch} usage
      set O(limit) [lindex $argv $i]
      if {![string is integer -strict $O(limit)]} usage
    }

    default {
      usage
    }
  }
}

sqlite3 db [lindex $argv end-1]
db func loadfile loadfile

# Create the table and load every document inside a single transaction.
db transaction {
  db eval "CREATE VIRTUAL TABLE t1 USING $O(vtab) (path, content$O(tok))"
  load_hierachy [lindex $argv end]
}
|
||||
|
||||
|
||||
|
||||
14
manifest
14
manifest
@@ -1,5 +1,5 @@
|
||||
C Fix\sprefix\sindexes\sso\sthat\sthey\swork\sin\scharacters,\snot\sbytes.
|
||||
D 2015-01-13T17:25:08.235
|
||||
C Improve\sthe\sperformance\sof\sthe\sfts5\sporter\stokenizer\simplementation.
|
||||
D 2015-01-17T17:48:10.103
|
||||
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
|
||||
F Makefile.in 7cd23e4fc91004a6bd081623e1bc6932e44828c0
|
||||
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
|
||||
@@ -115,9 +115,10 @@ F ext/fts5/fts5_hash.c 63fa8379c5f2ac107d47c2b7d9ac04c95ef8a279
|
||||
F ext/fts5/fts5_index.c 6f9f98875b2ee5a16255911e1dc1b0b32cb1c350
|
||||
F ext/fts5/fts5_storage.c 8bc9e5b6654e1545e9513def277ef3f025921664
|
||||
F ext/fts5/fts5_tcl.c 1293fac2bb26903fd3d5cdee59c5885ba7e620d5
|
||||
F ext/fts5/fts5_tokenize.c bdb6a1f599a94ec6e9c1cad037d1071e823dcb5d
|
||||
F ext/fts5/fts5_tokenize.c 7c61d5c35c3449597bdeaa54dd48afe26852c7b0
|
||||
F ext/fts5/fts5_unicode2.c 9c7dd640d1f014bf5c3ee029759adfbb4d7e95a9
|
||||
F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9
|
||||
F ext/fts5/mkportersteps.tcl 5acf962d2e0074f701620bb5308155fa1e4a63ba
|
||||
F ext/fts5/test/fts5_common.tcl 08e939096a07eb77a7a986613e960f31d3cab2cc
|
||||
F ext/fts5/test/fts5aa.test 3941b54d7585153be0c5cf0026f7dd8cfef13ea9
|
||||
F ext/fts5/test/fts5ab.test 91a3faac09ad9fab5f71494db6e4071963281536
|
||||
@@ -143,6 +144,7 @@ F ext/fts5/test/fts5rebuild.test 2a5e98205393487b4a732c8290999af7c0b907b4
|
||||
F ext/fts5/test/fts5tokenizer.test b34ae592db66f6e89546d791ce1f905ba0b3395c
|
||||
F ext/fts5/test/fts5unicode.test 79b3e34eb29ce4929628aa514a40cb467fdabe4d
|
||||
F ext/fts5/test/fts5unicode2.test 64a5267fd6082fcb46439892ebd0cbaa5c38acee
|
||||
F ext/fts5/tool/loadfts5.tcl 55c1f3ebf3f4b4f54be5bbdc823e36d59fc5e2dd
|
||||
F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43
|
||||
F ext/icu/icu.c d415ccf984defeb9df2c0e1afcfaa2f6dc05eacb
|
||||
F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
|
||||
@@ -1275,7 +1277,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
|
||||
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
|
||||
F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
|
||||
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
|
||||
P f22dbccad9499624880ddd48df1b07fb42b1ad66
|
||||
R 8d592e678c3bea0440cf749de24705b7
|
||||
P af8d43a4a08528bbae25ee38fe25de8a86f8a21c
|
||||
R bbc2aaea254f25294ae3538c1336787c
|
||||
U dan
|
||||
Z 3408fdf2714814208d88a4779f5de9eb
|
||||
Z 3ca0ddccabcad41dd9682a0c32f2940d
|
||||
|
||||
@@ -1 +1 @@
|
||||
af8d43a4a08528bbae25ee38fe25de8a86f8a21c
|
||||
96ea600440de05ee663e71c3f0d0de2c64108bf9
|
||||
Reference in New Issue
Block a user