1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-12-24 14:17:58 +03:00

Improve the performance of the fts5 porter tokenizer implementation.

FossilOrigin-Name: 96ea600440de05ee663e71c3f0d0de2c64108bf9
This commit is contained in:
dan
2015-01-17 17:48:10 +00:00
parent 851ca6e715
commit 2656167f6e
5 changed files with 757 additions and 104 deletions

View File

@@ -443,6 +443,7 @@ static int fts5UnicodeTokenize(
rc = SQLITE_NOMEM;
goto tokenize_done;
}
zOut = &aFold[zOut - p->aFold];
memcpy(aFold, p->aFold, nFold);
sqlite3_free(p->aFold);
p->aFold = aFold;
@@ -528,7 +529,7 @@ static int fts5PorterCreate(
pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
if( pRet ){
memset(pRet, 0, sizeof(PorterTokenizer));
rc = pApi->xFindTokenizer(pApi, "ascii", &pUserdata, &pRet->tokenizer);
rc = pApi->xFindTokenizer(pApi, "unicode61", &pUserdata, &pRet->tokenizer);
}else{
rc = SQLITE_NOMEM;
}
@@ -666,6 +667,448 @@ static int fts5Porter_Vowel(char *zStem, int nStem){
return 0;
}
/**************************************************************************
***************************************************************************
** GENERATED CODE STARTS HERE (mkportersteps.tcl)
*/
/*
** Porter stemmer step 4. This function is emitted by mkportersteps.tcl;
** any change made here must be mirrored in that script's template or it
** will be lost the next time the code is regenerated.
**
** aBuf[0..(*pnBuf)-1] holds the word being stemmed. The switch dispatches
** on the second-to-last character of the word, then compares the tail of
** the word against each suffix that has that character in that position.
** When a suffix matches and its condition callback returns true for the
** stem that precedes the suffix, the suffix is removed by reducing *pnBuf.
** The buffer contents themselves are not modified.
**
** Within a case, only the first matching suffix is considered: if its
** condition fails, no shorter suffix is tried (e.g. a word ending in
** "ement" never falls through to the "ment" or "ent" rules).
**
** Always returns 0.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* aBuf[nBuf-2] is only a valid read when nBuf>=2. Earlier steps can
  ** shorten the word to a single byte (step 1B may turn a 3 byte token
  ** into 1 byte), so guard the dispatch. Every rule below requires
  ** nBuf>2, so nothing could have matched for such a short word anyway. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      /* "ion" is the only rule in this step with a different condition
      ** callback from the rest. */
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 1B, second part. This function is emitted by
** mkportersteps.tcl; any change made here must be mirrored in that
** script's template or it will be lost on regeneration.
**
** aBuf[0..(*pnBuf)-1] holds the word being stemmed. If the word is longer
** than two bytes and ends in "at", "bl" or "iz", an 'e' is appended
** ("at"->"ate", "bl"->"ble", "iz"->"ize"), *pnBuf is increased by one and
** 1 is returned. Otherwise the word is left untouched and 0 is returned.
**
** The replacement is one byte longer than the suffix, so the caller must
** provide room for at least (*pnBuf)+1 bytes in aBuf.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* aBuf[nBuf-2] is only a valid read when nBuf>=2. Step 1B runs just
  ** before this function and can leave a single byte word (a 3 byte token
  ** with its "ed" suffix removed), so guard the dispatch. Every rule below
  ** requires nBuf>2, so no rule could apply to such a word anyway. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 2. This function is emitted by mkportersteps.tcl;
** any change made here must be mirrored in that script's template or it
** will be lost on regeneration.
**
** aBuf[0..(*pnBuf)-1] holds the word being stemmed. The switch dispatches
** on the second-to-last character of the word, then compares the tail of
** the word against each suffix that has that character in that position.
** When a suffix matches and fts5Porter_MGt0() returns true for the stem
** that precedes it, the suffix is overwritten in place with its
** replacement and *pnBuf is adjusted. No replacement in this step is
** longer than the suffix it replaces, so the word never grows.
**
** Within a case, only the first matching suffix is considered: if its
** condition fails, no other suffix is tried.
**
** Always returns 0.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* aBuf[nBuf-2] is only a valid read when nBuf>=2. Earlier steps can
  ** shorten the word to a single byte (step 1B may turn a 3 byte token
  ** into 1 byte), so guard the dispatch. Every rule below requires
  ** nBuf>3, so nothing could have matched for such a short word anyway. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 3. This function is emitted by mkportersteps.tcl;
** any change made here must be mirrored in that script's template or it
** will be lost on regeneration.
**
** aBuf[0..(*pnBuf)-1] holds the word being stemmed. The switch dispatches
** on the second-to-last character of the word, then compares the tail of
** the word against each suffix that has that character in that position.
** When a suffix matches and fts5Porter_MGt0() returns true for the stem
** that precedes it, the suffix is either overwritten in place with a
** shorter replacement or simply removed, and *pnBuf is reduced to match.
**
** Within a case, only the first matching suffix is considered: if its
** condition fails, no other suffix is tried.
**
** Always returns 0.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* aBuf[nBuf-2] is only a valid read when nBuf>=2. Earlier steps can
  ** shorten the word to a single byte (step 1B may turn a 3 byte token
  ** into 1 byte), so guard the dispatch. Every rule below requires
  ** nBuf>3, so nothing could have matched for such a short word anyway. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 1B, first part. This function is emitted by
** mkportersteps.tcl; any change made here must be mirrored in that
** script's template or it will be lost on regeneration.
**
** aBuf[0..(*pnBuf)-1] holds the word being stemmed. Three suffixes are
** handled:
**
**   "eed" -> "ee"   if fts5Porter_MGt0() accepts the preceding stem.
**   "ed"  -> ""     if fts5Porter_Vowel() accepts the preceding stem.
**   "ing" -> ""     if fts5Porter_Vowel() accepts the preceding stem.
**
** A word ending in "eed" is never tested against the "ed" rule, even if
** the "eed" condition fails.
**
** Returns 1 if an "ed" or "ing" suffix was removed, in which case the
** caller goes on to run fts5PorterStep1B2(). Returns 0 otherwise,
** including when "eed" was rewritten to "ee".
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* aBuf[nBuf-2] is only a valid read when nBuf>=2. Every rule below
  ** requires nBuf>2, so nothing could match a shorter word anyway. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'e':
      if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ee", 2);
          *pnBuf = nBuf - 3 + 2;
        }
      }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_Vowel(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
          ret = 1;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_Vowel(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
          ret = 1;
        }
      }
      break;

  }
  return ret;
}
/*
** GENERATED CODE ENDS HERE (mkportersteps.tcl)
***************************************************************************
**************************************************************************/
/*
** Porter stemmer step 1A: plural removal.
**
** aBuf[0..(*pnBuf)-1] holds the word being stemmed; it must be at least
** two bytes long. Only *pnBuf is updated, the buffer is never written:
**
**   "sses" -> "ss"    (word longer than 4 bytes: drop 2 bytes)
**   "ies"  -> "i"     (word longer than 3 bytes: drop 2 bytes)
**   "es"   -> "e"     (any other word ending in "es": drop 1 byte)
**   "ss"   -> "ss"    (unchanged)
**   "s"    -> ""      (drop 1 byte)
**
** Words that do not end in 's' are left alone.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nBuf = *pnBuf;
  const char *zEnd = &aBuf[nBuf];     /* One past the last byte of the word */
  int nStrip = 0;                     /* Number of trailing bytes to drop */

  if( zEnd[-1]!='s' ) return;

  if( zEnd[-2]=='e' ){
    int bSses = (nBuf>4 && zEnd[-4]=='s' && zEnd[-3]=='s');
    int bIes = (nBuf>3 && zEnd[-3]=='i');
    nStrip = (bSses || bIes) ? 2 : 1;
  }else if( zEnd[-2]!='s' ){
    nStrip = 1;
  }

  *pnBuf = nBuf - nStrip;
}
static int fts5PorterCb(
void *pCtx,
const char *pToken,
@@ -675,96 +1118,8 @@ static int fts5PorterCb(
){
PorterContext *p = (PorterContext*)pCtx;
PorterRule aStep1A[] = {
{ "sses", 4, 0, "ss", 2 },
{ "ies", 3, 0, "i", 1 },
{ "ss", 2, 0, "ss", 2 },
{ "s", 1, 0, "", 0 },
{ 0, 0, 0, 0 }
};
PorterRule aStep1B[] = {
{ "eed", 3, fts5Porter_MGt0, "ee", 2 },
{ "ed", 2, fts5Porter_Vowel, "", 0 },
{ "ing", 3, fts5Porter_Vowel, "", 0 },
{ 0, 0, 0, 0 }
};
PorterRule aStep1B2[] = {
{ "at", 2, 0, "ate", 3 },
{ "bl", 2, 0, "ble", 3 },
{ "iz", 2, 0, "ize", 3 },
{ 0, 0, 0, 0 }
};
PorterRule aStep1C[] = {
{ "y", 1, fts5Porter_Vowel, "i", 1 },
{ 0, 0, 0, 0 }
};
PorterRule aStep2[] = {
{ "ational", 7, fts5Porter_MGt0, "ate", 3},
{ "tional", 6, fts5Porter_MGt0, "tion", 4},
{ "enci", 4, fts5Porter_MGt0, "ence", 4},
{ "anci", 4, fts5Porter_MGt0, "ance", 4},
{ "izer", 4, fts5Porter_MGt0, "ize", 3},
{ "logi", 4, fts5Porter_MGt0, "log", 3}, /* added post 1979 */
{ "bli", 3, fts5Porter_MGt0, "ble", 3}, /* modified post 1979 */
{ "alli", 4, fts5Porter_MGt0, "al", 2},
{ "entli", 5, fts5Porter_MGt0, "ent", 3},
{ "eli", 3, fts5Porter_MGt0, "e", 1},
{ "ousli", 5, fts5Porter_MGt0, "ous", 3},
{ "ization", 7, fts5Porter_MGt0, "ize", 3},
{ "ation", 5, fts5Porter_MGt0, "ate", 3},
{ "ator", 4, fts5Porter_MGt0, "ate", 3},
{ "alism", 5, fts5Porter_MGt0, "al", 2},
{ "iveness", 7, fts5Porter_MGt0, "ive", 3},
{ "fulness", 7, fts5Porter_MGt0, "ful", 3},
{ "ousness", 7, fts5Porter_MGt0, "ous", 3},
{ "aliti", 5, fts5Porter_MGt0, "al", 2},
{ "iviti", 5, fts5Porter_MGt0, "ive", 3},
{ "biliti", 6, fts5Porter_MGt0, "ble", 3},
{ 0, 0, 0, 0 }
};
PorterRule aStep3[] = {
{ "icate", 5, fts5Porter_MGt0, "ic", 2},
{ "ative", 5, fts5Porter_MGt0, "", 0},
{ "alize", 5, fts5Porter_MGt0, "al", 2},
{ "iciti", 5, fts5Porter_MGt0, "ic", 2},
{ "ical", 4, fts5Porter_MGt0, "ic", 2},
{ "ful", 3, fts5Porter_MGt0, "", 0},
{ "ness", 4, fts5Porter_MGt0, "", 0},
{ 0, 0, 0, 0 }
};
PorterRule aStep4[] = {
{ "al", 2, fts5Porter_MGt1, "", 0},
{ "ance", 4, fts5Porter_MGt1, "", 0},
{ "ence", 4, fts5Porter_MGt1, "", 0},
{ "er", 2, fts5Porter_MGt1, "", 0},
{ "ic", 2, fts5Porter_MGt1, "", 0},
{ "able", 4, fts5Porter_MGt1, "", 0},
{ "ible", 4, fts5Porter_MGt1, "", 0},
{ "ant", 3, fts5Porter_MGt1, "", 0},
{ "ement", 5, fts5Porter_MGt1, "", 0},
{ "ment", 4, fts5Porter_MGt1, "", 0},
{ "ent", 3, fts5Porter_MGt1, "", 0},
{ "ion", 3, fts5Porter_MGt1_and_S_or_T, "", 0},
{ "ou", 2, fts5Porter_MGt1, "", 0},
{ "ism", 3, fts5Porter_MGt1, "", 0},
{ "ate", 3, fts5Porter_MGt1, "", 0},
{ "iti", 3, fts5Porter_MGt1, "", 0},
{ "ous", 3, fts5Porter_MGt1, "", 0},
{ "ive", 3, fts5Porter_MGt1, "", 0},
{ "ize", 3, fts5Porter_MGt1, "", 0},
{ 0, 0, 0, 0 }
};
char *aBuf;
int nBuf;
int n;
if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
aBuf = p->aBuf;
@@ -772,10 +1127,9 @@ static int fts5PorterCb(
memcpy(aBuf, pToken, nBuf);
/* Step 1. */
fts5PorterApply(aBuf, &nBuf, aStep1A);
n = fts5PorterApply(aBuf, &nBuf, aStep1B);
if( n==1 || n==2 ){
if( fts5PorterApply(aBuf, &nBuf, aStep1B2)<0 ){
fts5PorterStep1A(aBuf, &nBuf);
if( fts5PorterStep1B(aBuf, &nBuf) ){
if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
char c = aBuf[nBuf-1];
if( fts5PorterIsVowel(c, 0)==0
&& c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
@@ -786,12 +1140,16 @@ static int fts5PorterCb(
}
}
}
fts5PorterApply(aBuf, &nBuf, aStep1C);
/* Step 1C. */
if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
aBuf[nBuf-1] = 'i';
}
/* Steps 2 through 4. */
fts5PorterApply(aBuf, &nBuf, aStep2);
fts5PorterApply(aBuf, &nBuf, aStep3);
fts5PorterApply(aBuf, &nBuf, aStep4);
fts5PorterStep2(aBuf, &nBuf);
fts5PorterStep3(aBuf, &nBuf);
fts5PorterStep4(aBuf, &nBuf);
/* Step 5a. */
if( nBuf>0 && aBuf[nBuf-1]=='e' ){

222
ext/fts5/mkportersteps.tcl Normal file
View File

@@ -0,0 +1,222 @@
#
# 2014 Jun 09
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#-------------------------------------------------------------------------
#
# This script generates the implementations of the following C functions,
# which are part of the porter tokenizer implementation:
#
# static int fts5PorterStep1B(char *aBuf, int *pnBuf);
# static int fts5PorterStep1B2(char *aBuf, int *pnBuf);
# static int fts5PorterStep2(char *aBuf, int *pnBuf);
# static int fts5PorterStep3(char *aBuf, int *pnBuf);
# static int fts5PorterStep4(char *aBuf, int *pnBuf);
#
# Rule tables. Each element of the global array O is a list of rules for
# one generated function; the array key becomes the suffix of the function
# name. A rule is a list of three or four elements:
#
#     SUFFIX  CONDITION  REPLACEMENT  ?RETFLAG?
#
# SUFFIX is the word ending to match. CONDITION is the name of a C function
# called with the stem that precedes the suffix, or an empty string if the
# rule is unconditional. REPLACEMENT is the text that replaces the suffix
# and may be empty. If RETFLAG is present and true, the generated function
# returns 1 when the rule is applied.
#
# Rules are grouped by the second-to-last character of SUFFIX. Within a
# group they are tested in the order listed here and testing stops at the
# first suffix that matches, so a longer suffix must be listed before any
# shorter suffix that it ends with.

# Step 1B, part 2: unconditional rules that append an "e". Only run by the
# caller when step 1B reports that it removed a suffix.
set O(Step1B2) {
{ at {} ate 1 }
{ bl {} ble 1 }
{ iz {} ize 1 }
}

# Step 1B: only the "ed" and "ing" rules set the return flag.
set O(Step1B) {
{ "eed" fts5Porter_MGt0 "ee" 0 }
{ "ed" fts5Porter_Vowel "" 1 }
{ "ing" fts5Porter_Vowel "" 1 }
}

# Step 2: all rules share the fts5Porter_MGt0 condition.
set O(Step2) {
{ "ational" fts5Porter_MGt0 "ate" }
{ "tional" fts5Porter_MGt0 "tion" }
{ "enci" fts5Porter_MGt0 "ence" }
{ "anci" fts5Porter_MGt0 "ance" }
{ "izer" fts5Porter_MGt0 "ize" }
{ "logi" fts5Porter_MGt0 "log" }
{ "bli" fts5Porter_MGt0 "ble" }
{ "alli" fts5Porter_MGt0 "al" }
{ "entli" fts5Porter_MGt0 "ent" }
{ "eli" fts5Porter_MGt0 "e" }
{ "ousli" fts5Porter_MGt0 "ous" }
{ "ization" fts5Porter_MGt0 "ize" }
{ "ation" fts5Porter_MGt0 "ate" }
{ "ator" fts5Porter_MGt0 "ate" }
{ "alism" fts5Porter_MGt0 "al" }
{ "iveness" fts5Porter_MGt0 "ive" }
{ "fulness" fts5Porter_MGt0 "ful" }
{ "ousness" fts5Porter_MGt0 "ous" }
{ "aliti" fts5Porter_MGt0 "al" }
{ "iviti" fts5Porter_MGt0 "ive" }
{ "biliti" fts5Porter_MGt0 "ble" }
}

# Step 3: all rules share the fts5Porter_MGt0 condition.
set O(Step3) {
{ "icate" fts5Porter_MGt0 "ic" }
{ "ative" fts5Porter_MGt0 "" }
{ "alize" fts5Porter_MGt0 "al" }
{ "iciti" fts5Porter_MGt0 "ic" }
{ "ical" fts5Porter_MGt0 "ic" }
{ "ful" fts5Porter_MGt0 "" }
{ "ness" fts5Porter_MGt0 "" }
}

# Step 4: pure suffix removal. "ement", "ment" and "ent" share a group and
# must stay in that order; "ion" is the only rule with its own condition.
set O(Step4) {
{ "al" fts5Porter_MGt1 "" }
{ "ance" fts5Porter_MGt1 "" }
{ "ence" fts5Porter_MGt1 "" }
{ "er" fts5Porter_MGt1 "" }
{ "ic" fts5Porter_MGt1 "" }
{ "able" fts5Porter_MGt1 "" }
{ "ible" fts5Porter_MGt1 "" }
{ "ant" fts5Porter_MGt1 "" }
{ "ement" fts5Porter_MGt1 "" }
{ "ment" fts5Porter_MGt1 "" }
{ "ent" fts5Porter_MGt1 "" }
{ "ion" fts5Porter_MGt1_and_S_or_T "" }
{ "ou" fts5Porter_MGt1 "" }
{ "ism" fts5Porter_MGt1 "" }
{ "ate" fts5Porter_MGt1 "" }
{ "iti" fts5Porter_MGt1 "" }
{ "ous" fts5Porter_MGt1 "" }
{ "ive" fts5Porter_MGt1 "" }
{ "ize" fts5Porter_MGt1 "" }
}
# Comparison callback for two rules. Rules are ordered by the
# second-to-last character of their suffix, which is the first element of
# each rule. Returns -1, 0 or 1 in the manner of [string compare].
proc sort_cb {lhs rhs} {
  set cLeft  [string index [lindex $lhs 0] end-1]
  set cRight [string index [lindex $rhs 0] end-1]
  return [string compare $cLeft $cRight]
}
# Write to stdout the C implementation of function fts5Porter<name>().
#
# Argument $data is a list of rules. Each rule is a list of the form
#
#     SUFFIX  CONDITION  REPLACEMENT  ?RETFLAG?
#
# where CONDITION may be an empty string for an unconditional rule,
# REPLACEMENT may be empty for plain suffix removal, and a true RETFLAG
# makes the generated function return 1 when the rule is applied.
#
# The generated function switches on the second-to-last character of the
# word. Rules are grouped by the second-to-last character of SUFFIX, groups
# are emitted in sorted order, and rules within a group are chained with
# "else" in the order they appear in $data.
#
# The code for one rule comes from template T(if_C_M_R), where C, M and R
# are 1 or 0 according to whether the rule has a condition, a non-empty
# replacement and a true RETFLAG. There is no template for the combination
# 0_0_1, so such a rule raises an error.
#
# Templates are expanded with [subst -nocommands], so any dollar sign or
# backslash in a template is interpreted by Tcl.
#
proc create_step_function {name data} {

  # The generated function reads aBuf[nBuf-2] before looking at any rule.
  # That read is only valid if the word is at least two bytes long, so the
  # function returns early for shorter words. No rule can match such a
  # word, since every rule requires the word to be longer than its suffix
  # and the switch key needs a two-character suffix.
  set T(function) {
static int fts5Porter${name}(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){
    ${switchbody}
  }
  return ret;
}
  }

  set T(case) {
    case '${k}':
      ${ifstmts}
      break;
  }

  set T(if_0_0_0) {
      if( ${match} ){
        *pnBuf = nBuf - $n;
      }
  }

  set T(if_1_0_0) {
      if( ${match} ){
        if( ${cond} ){
          *pnBuf = nBuf - $n;
        }
      }
  }

  set T(if_0_1_0) {
      if( ${match} ){
        ${memcpy}
        *pnBuf = nBuf - $n + $nRep;
      }
  }

  set T(if_1_1_0) {
      if( ${match} ){
        if( ${cond} ){
          ${memcpy}
          *pnBuf = nBuf - $n + $nRep;
        }
      }
  }

  set T(if_1_0_1) {
      if( ${match} ){
        if( ${cond} ){
          *pnBuf = nBuf - $n;
          ret = 1;
        }
      }
  }

  set T(if_0_1_1) {
      if( ${match} ){
        ${memcpy}
        *pnBuf = nBuf - $n + $nRep;
        ret = 1;
      }
  }

  set T(if_1_1_1) {
      if( ${match} ){
        if( ${cond} ){
          ${memcpy}
          *pnBuf = nBuf - $n + $nRep;
          ret = 1;
        }
      }
  }

  set switchbody ""

  # Group the rules by the second-to-last character of the suffix,
  # preserving their relative order within each group.
  foreach I $data {
    set k [string range [lindex $I 0] end-1 end-1]
    lappend aCase($k) $I
  }

  foreach k [lsort [array names aCase]] {
    set ifstmts ""
    foreach I $aCase($k) {
      set zSuffix [lindex $I 0]         ;# Suffix text for this rule
      set zRep [lindex $I 2]            ;# Replacement text for rule
      set xCond [lindex $I 1]           ;# Condition callback (or "")

      set n [string length $zSuffix]
      set nRep [string length $zRep]

      # C fragments substituted into the templates.
      set match "nBuf>$n && 0==memcmp(\"$zSuffix\", &aBuf\[nBuf-$n\], $n)"
      set memcpy "memcpy(&aBuf\[nBuf-$n\], \"$zRep\", $nRep);"
      set cond "${xCond}(aBuf, nBuf-$n)"

      # Select the template for this rule.
      set bMemcpy [expr {$nRep>0}]
      set bCond [expr {$xCond!=""}]
      set bRet [expr {[llength $I]>3 && [lindex $I 3]}]

      set t $T(if_${bCond}_${bMemcpy}_${bRet})
      lappend ifstmts [string trim [subst -nocommands $t]]
    }

    # Chain the rules of this group into if ... else if ... else if.
    set ifstmts [join $ifstmts "else "]

    append switchbody [subst -nocommands $T(case)]
  }

  puts [subst -nocommands $T(function)]
}
# Emit the banner, then one function per entry of array O, then the
# trailer. The output is meant to be pasted into the tokenizer source
# between the two banner comments.
puts [string trim {
/**************************************************************************
***************************************************************************
** GENERATED CODE STARTS HERE (mkportersteps.tcl)
*/
}]

# [array names] returns the keys in no particular order. Sort them so that
# the functions are always written in the same order and regenerating the
# code does not produce spurious diffs.
foreach step [lsort [array names O]] {
  create_step_function $step $O($step)
}

puts [string trim {
/*
** GENERATED CODE ENDS HERE (mkportersteps.tcl)
***************************************************************************
**************************************************************************/
}]

View File

@@ -0,0 +1,71 @@
# Return the entire contents of the file named $f. The file is opened with
# the default channel settings. Registered below as an SQL function so that
# it can be called from within INSERT statements.
proc loadfile {f} {
  set chan [open $f]
  set contents [read $chan]
  close $chan
  set contents
}
# Number of rows inserted into table t1 so far.
set ::nRow 0

# Walk the directory tree rooted at $dir. Each entry that is not a
# directory is inserted into table t1 as one row holding its path and its
# contents; directories are descended into recursively.
#
# If ::O(limit) is non-zero, loading stops once ::nRow reaches the limit.
# The limit is re-checked before every entry, so the loops of the
# enclosing recursive calls stop as well.
proc load_hierachy {dir} {
  foreach f [glob -nocomplain -directory $dir *] {
    if {$::O(limit) && $::nRow>=$::O(limit)} break
    if {![file isdirectory $f]} {
      db eval { INSERT INTO t1 VALUES($f, loadfile($f)) }
      incr ::nRow
      continue
    }
    load_hierachy $f
  }
}
# Print a usage message to stderr and exit with status 1. Every switch
# accepted by the option parsing loop in this script is listed here.
proc usage {} {
  puts stderr "Usage: $::argv0 ?SWITCHES? DATABASE PATH"
  puts stderr ""
  puts stderr "Switches are:"
  puts stderr "  -fts4        (use fts4 instead of fts5)"
  puts stderr "  -fts5        (use fts5, the default)"
  puts stderr "  -porter      (use the porter tokenizer)"
  puts stderr "  -limit N     (load no more than N documents)"
  exit 1
}
# Default option values:
#   O(vtab)   virtual table module used to create table t1
#   O(tok)    extra text appended to the column list of t1, used to select
#             a tokenizer
#   O(limit)  maximum number of rows to load, or 0 for no limit
set O(vtab)       fts5
set O(tok)        ""
set O(limit)      0

# The final two arguments are always DATABASE and PATH. Everything before
# them is a switch, or the value of a switch.
if {[llength $argv]<2} usage
set nSwitch [expr {[llength $argv]-2}]
for {set i 0} {$i < $nSwitch} {incr i} {
  set arg [lindex $argv $i]
  switch -- $arg {
    -fts4 {
      set O(vtab) fts4
    }

    -fts5 {
      set O(vtab) fts5
    }

    -porter {
      set O(tok) ", tokenize=porter"
    }

    -limit {
      # The value must be a separate argument that precedes DATABASE, and
      # it must be an integer because it is used in an arithmetic
      # expression while loading.
      incr i
      set limit [lindex $argv $i]
      if {$i >= $nSwitch || ![string is integer -strict $limit]} usage
      set O(limit) $limit
    }

    default {
      usage
    }
  }
}

# Create the table and load the whole hierarchy inside one transaction.
sqlite3 db [lindex $argv end-1]
db func loadfile loadfile

db transaction {
  db eval "CREATE VIRTUAL TABLE t1 USING $O(vtab) (path, content$O(tok))"
  load_hierachy [lindex $argv end]
}

View File

@@ -1,5 +1,5 @@
C Fix\sprefix\sindexes\sso\sthat\sthey\swork\sin\scharacters,\snot\sbytes.
D 2015-01-13T17:25:08.235
C Improve\sthe\sperformance\sof\sthe\sfts5\sporter\stokenizer\simplementation.
D 2015-01-17T17:48:10.103
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in 7cd23e4fc91004a6bd081623e1bc6932e44828c0
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -115,9 +115,10 @@ F ext/fts5/fts5_hash.c 63fa8379c5f2ac107d47c2b7d9ac04c95ef8a279
F ext/fts5/fts5_index.c 6f9f98875b2ee5a16255911e1dc1b0b32cb1c350
F ext/fts5/fts5_storage.c 8bc9e5b6654e1545e9513def277ef3f025921664
F ext/fts5/fts5_tcl.c 1293fac2bb26903fd3d5cdee59c5885ba7e620d5
F ext/fts5/fts5_tokenize.c bdb6a1f599a94ec6e9c1cad037d1071e823dcb5d
F ext/fts5/fts5_tokenize.c 7c61d5c35c3449597bdeaa54dd48afe26852c7b0
F ext/fts5/fts5_unicode2.c 9c7dd640d1f014bf5c3ee029759adfbb4d7e95a9
F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9
F ext/fts5/mkportersteps.tcl 5acf962d2e0074f701620bb5308155fa1e4a63ba
F ext/fts5/test/fts5_common.tcl 08e939096a07eb77a7a986613e960f31d3cab2cc
F ext/fts5/test/fts5aa.test 3941b54d7585153be0c5cf0026f7dd8cfef13ea9
F ext/fts5/test/fts5ab.test 91a3faac09ad9fab5f71494db6e4071963281536
@@ -143,6 +144,7 @@ F ext/fts5/test/fts5rebuild.test 2a5e98205393487b4a732c8290999af7c0b907b4
F ext/fts5/test/fts5tokenizer.test b34ae592db66f6e89546d791ce1f905ba0b3395c
F ext/fts5/test/fts5unicode.test 79b3e34eb29ce4929628aa514a40cb467fdabe4d
F ext/fts5/test/fts5unicode2.test 64a5267fd6082fcb46439892ebd0cbaa5c38acee
F ext/fts5/tool/loadfts5.tcl 55c1f3ebf3f4b4f54be5bbdc823e36d59fc5e2dd
F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43
F ext/icu/icu.c d415ccf984defeb9df2c0e1afcfaa2f6dc05eacb
F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
@@ -1275,7 +1277,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
P f22dbccad9499624880ddd48df1b07fb42b1ad66
R 8d592e678c3bea0440cf749de24705b7
P af8d43a4a08528bbae25ee38fe25de8a86f8a21c
R bbc2aaea254f25294ae3538c1336787c
U dan
Z 3408fdf2714814208d88a4779f5de9eb
Z 3ca0ddccabcad41dd9682a0c32f2940d

View File

@@ -1 +1 @@
af8d43a4a08528bbae25ee38fe25de8a86f8a21c
96ea600440de05ee663e71c3f0d0de2c64108bf9