1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-08-01 06:27:03 +03:00

Optimize the unicode61 tokenizer so that it handles ascii text faster. Make it the default tokenizer. Change the name of the simple tokenizer to "ascii".

FossilOrigin-Name: f22dbccad9499624880ddd48df1b07fb42b1ad66
This commit is contained in:
dan
2015-01-12 17:58:04 +00:00
parent 27277c4e3c
commit 73f7d6ed75
7 changed files with 190 additions and 109 deletions

View File

@ -72,6 +72,7 @@ struct Fts5Global {
i64 iNextId; /* Used to allocate unique cursor ids */
Fts5Auxiliary *pAux; /* First in list of all aux. functions */
Fts5TokenizerModule *pTok; /* First in list of all tokenizer modules */
Fts5TokenizerModule *pDfltTok; /* Default tokenizer module */
Fts5Cursor *pCsr; /* First in list of all open cursors */
};
@ -771,7 +772,7 @@ static int fts5FindRankFunction(Fts5Cursor *pCsr){
Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
Fts5Config *pConfig = pTab->pConfig;
int rc = SQLITE_OK;
Fts5Auxiliary *pAux;
Fts5Auxiliary *pAux = 0;
const char *zRank = pCsr->zRank;
const char *zRankArgs = pCsr->zRankArgs;
@ -1028,7 +1029,6 @@ static int fts5SeekCursor(Fts5Cursor *pCsr){
}
static void fts5SetVtabError(Fts5Table *p, const char *zFormat, ...){
int rc;
va_list ap; /* ... printf arguments */
va_start(ap, zFormat);
assert( p->base.zErrMsg==0 );
@ -1796,6 +1796,9 @@ static int fts5CreateTokenizer(
pNew->xDestroy = xDestroy;
pNew->pNext = pGlobal->pTok;
pGlobal->pTok = pNew;
if( pNew->pNext==0 ){
pGlobal->pDfltTok = pNew;
}
}else{
rc = SQLITE_NOMEM;
}
@ -1817,8 +1820,12 @@ static int fts5FindTokenizer(
int rc = SQLITE_OK;
Fts5TokenizerModule *pTok;
for(pTok=pGlobal->pTok; pTok; pTok=pTok->pNext){
if( sqlite3_stricmp(zName, pTok->zName)==0 ) break;
if( zName==0 ){
pTok = pGlobal->pDfltTok;
}else{
for(pTok=pGlobal->pTok; pTok; pTok=pTok->pNext){
if( sqlite3_stricmp(zName, pTok->zName)==0 ) break;
}
}
if( pTok ){
@ -1841,8 +1848,9 @@ int sqlite3Fts5GetTokenizer(
){
Fts5TokenizerModule *pMod = 0;
int rc = SQLITE_OK;
if( nArg==0 ){
pMod = pGlobal->pTok;
pMod = pGlobal->pDfltTok;
}else{
for(pMod=pGlobal->pTok; pMod; pMod=pMod->pNext){
if( sqlite3_stricmp(azArg[0], pMod->zName)==0 ) break;

View File

@ -16,14 +16,14 @@
#include <assert.h>
/**************************************************************************
** Start of simple tokenizer implementation.
** Start of ascii tokenizer implementation.
*/
/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters.
*/
static unsigned char aSimpleTokenChar[128] = {
static unsigned char aAsciiTokenChar[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2F */
@ -34,13 +34,13 @@ static unsigned char aSimpleTokenChar[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70..0x7F */
};
typedef struct SimpleTokenizer SimpleTokenizer;
struct SimpleTokenizer {
typedef struct AsciiTokenizer AsciiTokenizer;
struct AsciiTokenizer {
  unsigned char aTokenChar[128];  /* 1 for token characters, 0 for separators */
};
static void fts5SimpleAddExceptions(
SimpleTokenizer *p,
static void fts5AsciiAddExceptions(
AsciiTokenizer *p,
const char *zArg,
int bTokenChars
){
@ -53,32 +53,32 @@ static void fts5SimpleAddExceptions(
}
/*
** Create a "simple" tokenizer.
** Create an "ascii" tokenizer.
*/
static int fts5SimpleCreate(
static int fts5AsciiCreate(
void *pCtx,
const char **azArg, int nArg,
Fts5Tokenizer **ppOut
){
int rc = SQLITE_OK;
SimpleTokenizer *p = 0;
AsciiTokenizer *p = 0;
if( nArg%2 ){
rc = SQLITE_ERROR;
}else{
p = sqlite3_malloc(sizeof(SimpleTokenizer));
p = sqlite3_malloc(sizeof(AsciiTokenizer));
if( p==0 ){
rc = SQLITE_NOMEM;
}else{
int i;
memset(p, 0, sizeof(SimpleTokenizer));
memcpy(p->aTokenChar, aSimpleTokenChar, sizeof(aSimpleTokenChar));
memset(p, 0, sizeof(AsciiTokenizer));
memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
const char *zArg = azArg[i+1];
if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
fts5SimpleAddExceptions(p, zArg, 1);
fts5AsciiAddExceptions(p, zArg, 1);
}else
if( 0==sqlite3_stricmp(azArg[i], "separators") ){
fts5SimpleAddExceptions(p, zArg, 0);
fts5AsciiAddExceptions(p, zArg, 0);
}else{
rc = SQLITE_ERROR;
}
@ -91,14 +91,14 @@ static int fts5SimpleCreate(
}
/*
** Delete a "simple" tokenizer.
** Delete an "ascii" tokenizer.
*/
static void fts5SimpleDelete(Fts5Tokenizer *p){
static void fts5AsciiDelete(Fts5Tokenizer *p){
  /* AsciiTokenizer is a single allocation with no heap-allocated members,
  ** so a single free suffices. free(NULL) is a harmless no-op. */
  sqlite3_free(p);
}
static void simpleFold(char *aOut, const char *aIn, int nByte){
static void asciiFold(char *aOut, const char *aIn, int nByte){
int i;
for(i=0; i<nByte; i++){
char c = aIn[i];
@ -108,15 +108,15 @@ static void simpleFold(char *aOut, const char *aIn, int nByte){
}
/*
** Tokenize some text using the simple tokenizer.
** Tokenize some text using the ascii tokenizer.
*/
static int fts5SimpleTokenize(
static int fts5AsciiTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
const char *pText, int nText,
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
SimpleTokenizer *p = (SimpleTokenizer*)pTokenizer;
AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
int rc = SQLITE_OK;
int ie;
int is = 0;
@ -130,14 +130,14 @@ static int fts5SimpleTokenize(
int nByte;
/* Skip any leading divider characters. */
while( is<nText && ((pText[is]&0x80) || a[(int)pText[is]]==0) ){
while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
is++;
}
if( is==nText ) break;
/* Count the token characters */
ie = is+1;
while( ie<nText && ((pText[ie]&0x80)==0 && a[(int)pText[ie]] ) ){
while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
ie++;
}
@ -152,7 +152,7 @@ static int fts5SimpleTokenize(
}
nFold = nByte*2;
}
simpleFold(pFold, &pText[is], nByte);
asciiFold(pFold, &pText[is], nByte);
/* Invoke the token callback */
rc = xToken(pCtx, pFold, nByte, is, ie);
@ -206,6 +206,7 @@ static const unsigned char sqlite3Utf8Trans1[] = {
|| (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
}
#define WRITE_UTF8(zOut, c) { \
if( c<0x00080 ){ \
*zOut++ = (unsigned char)(c&0xFF); \
@ -230,6 +231,9 @@ static const unsigned char sqlite3Utf8Trans1[] = {
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
unsigned char aTokenChar[128]; /* ASCII range token characters */
char *aFold; /* Buffer to fold text into */
int nFold; /* Size of aFold[] in bytes */
int bRemoveDiacritic; /* True if remove_diacritics=1 is set */
int nException;
int *aiException;
@ -254,17 +258,21 @@ static int fts5UnicodeAddExceptions(
int iCode;
int bToken;
READ_UTF8(zCsr, zTerm, iCode);
bToken = sqlite3Fts5UnicodeIsalnum(iCode);
assert( (bToken==0 || bToken==1) );
assert( (bTokenChars==0 || bTokenChars==1) );
if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
int i;
for(i=0; i<nNew; i++){
if( aNew[i]>iCode ) break;
if( iCode<128 ){
p->aTokenChar[iCode] = bTokenChars;
}else{
bToken = sqlite3Fts5UnicodeIsalnum(iCode);
assert( (bToken==0 || bToken==1) );
assert( (bTokenChars==0 || bTokenChars==1) );
if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
int i;
for(i=0; i<nNew; i++){
if( aNew[i]>iCode ) break;
}
memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
aNew[i] = iCode;
nNew++;
}
memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
aNew[i] = iCode;
nNew++;
}
}
p->aiException = aNew;
@ -301,6 +309,19 @@ static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
return 0;
}
/*
** Delete a "unicode61" tokenizer.
*/
static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
  if( p==0 ) return;             /* deleting a NULL tokenizer is a no-op */
  sqlite3_free(p->aiException);  /* codepoint exception list */
  sqlite3_free(p->aFold);        /* case-folding output buffer */
  sqlite3_free(p);
}
/*
** Create a "unicode61" tokenizer.
*/
@ -319,7 +340,13 @@ static int fts5UnicodeCreate(
if( p ){
int i;
memset(p, 0, sizeof(Unicode61Tokenizer));
memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
p->bRemoveDiacritic = 1;
p->nFold = 64;
p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
if( p->aFold==0 ){
rc = SQLITE_NOMEM;
}
for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
const char *zArg = azArg[i+1];
if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
@ -340,21 +367,15 @@ static int fts5UnicodeCreate(
}else{
rc = SQLITE_NOMEM;
}
if( rc!=SQLITE_OK ){
fts5UnicodeDelete((Fts5Tokenizer*)p);
p = 0;
}
*ppOut = (Fts5Tokenizer*)p;
}
return rc;
}
/*
** Delete a "unicode61" tokenizer.
*/
static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
sqlite3_free(p->aiException);
sqlite3_free(p);
return;
}
/*
** Return true if, for the purposes of tokenizing with the tokenizer
** passed as the first argument, codepoint iCode is considered a token
@ -365,9 +386,6 @@ static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode);
}
/*
** Tokenize some text using a unicode61 tokenizer.
*/
static int fts5UnicodeTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
@ -375,59 +393,94 @@ static int fts5UnicodeTokenize(
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
const unsigned char *zInput = (const unsigned char*)pText;
const unsigned char *zTerm = &zInput[nText];
const unsigned char *z = zInput;
int rc = SQLITE_OK;
int nBuf = 0;
unsigned char *zBuf = 0;
unsigned char *zOut = 0;
unsigned char *a = p->aTokenChar;
while( rc==SQLITE_OK && z<zTerm ){
int iCode;
int bAlnum;
const unsigned char *zStart;
const unsigned char *zCode;
unsigned char *zTerm = (unsigned char*)&pText[nText];
unsigned char *zCsr = (unsigned char *)pText;
if( zOut==zBuf ) zStart = z;
zCode = z;
READ_UTF8(z, zTerm, iCode);
bAlnum = fts5UnicodeIsAlnum(p, iCode);
if( bAlnum==0 && zOut>zBuf ){
bAlnum = sqlite3Fts5UnicodeIsdiacritic(iCode);
/* Output buffer */
char *aFold = p->aFold;
int nFold = p->nFold;
/* Each iteration of this loop gobbles up a contiguous run of separators,
** then the next token. */
while( rc==SQLITE_OK ){
int iCode; /* non-ASCII codepoint read from input */
char *zOut = aFold;
int is;
int ie;
/* Skip any separator characters. */
while( 1 ){
if( zCsr>=zTerm ) goto tokenize_done;
if( *zCsr & 0x80 ) {
/* A character outside of the ascii range. Skip past it if it is
** a separator character. Or break out of the loop if it is not. */
is = zCsr - (unsigned char*)pText;
READ_UTF8(zCsr, zTerm, iCode);
if( fts5UnicodeIsAlnum(p, iCode) ){
goto non_ascii_tokenchar;
}
}else{
if( a[*zCsr] ){
is = zCsr - (unsigned char*)pText;
goto ascii_tokenchar;
}
zCsr++;
}
}
if( bAlnum ){
int iOut;
/* Run through the tokenchars. Fold them into the output buffer along
** the way. */
while( zCsr<zTerm ){
/* Grow the output buffer if required */
while( (zOut-zBuf)+4>=nBuf ){
unsigned char *zNew;
nBuf = (nBuf ? nBuf*2 : 128);
zNew = sqlite3_realloc(zBuf, nBuf);
if( zNew==0 ){
/* Grow the output buffer so that there is sufficient space to fit the
** largest possible utf-8 character. */
if( (zOut-aFold)+6>nFold ){
aFold = sqlite3_malloc(nFold*2);
if( aFold==0 ){
rc = SQLITE_NOMEM;
goto tokenize_finished;
}else{
zOut = &zNew[zOut-zBuf];
zBuf = zNew;
goto tokenize_done;
}
memcpy(aFold, p->aFold, nFold);
sqlite3_free(p->aFold);
p->aFold = aFold;
p->nFold = nFold = nFold*2;
}
/* Write the new character to it */
iOut = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
if( iOut ) WRITE_UTF8(zOut, iOut);
if( *zCsr & 0x80 ){
/* An non-ascii-range character. Fold it into the output buffer if
** it is a token character, or break out of the loop if it is not. */
READ_UTF8(zCsr, zTerm, iCode);
if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
non_ascii_tokenchar:
iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
if( iCode ) WRITE_UTF8(zOut, iCode);
}else{
break;
}
}else if( a[*zCsr]==0 ){
/* An ascii-range separator character. End of token. */
break;
}else{
ascii_tokenchar:
if( *zCsr>='A' && *zCsr<='Z' ){
*zOut++ = *zCsr + 32;
}else{
*zOut++ = *zCsr;
}
zCsr++;
}
ie = zCsr - (unsigned char*)pText;
}
if( zOut>zBuf && (bAlnum==0 || z>=zTerm) ){
int ie = (bAlnum ? z : zCode) - zInput;
rc = xToken(pCtx, (const char*)zBuf, zOut-zBuf, zStart-zInput, ie);
zOut = zBuf;
}
/* Invoke the token callback */
rc = xToken(pCtx, aFold, zOut-aFold, is, ie);
}
tokenize_finished:
sqlite3_free(zBuf);
tokenize_done:
if( rc==SQLITE_DONE ) rc = SQLITE_OK;
return rc;
}
@ -475,7 +528,7 @@ static int fts5PorterCreate(
pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
if( pRet ){
memset(pRet, 0, sizeof(PorterTokenizer));
rc = pApi->xFindTokenizer(pApi, "simple", &pUserdata, &pRet->tokenizer);
rc = pApi->xFindTokenizer(pApi, "ascii", &pUserdata, &pRet->tokenizer);
}else{
rc = SQLITE_NOMEM;
}
@ -789,9 +842,9 @@ int sqlite3Fts5TokenizerInit(fts5_api *pApi){
const char *zName;
fts5_tokenizer x;
} aBuiltin[] = {
{ "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
{ "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
{ "simple", {fts5SimpleCreate, fts5SimpleDelete, fts5SimpleTokenize }}
{ "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
{ "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
};
int rc = SQLITE_OK; /* Return code */

View File

@ -24,7 +24,7 @@ proc do_near_test {tn doc near res} {
}
execsql {
CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'simple tokenchars .')
CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'ascii tokenchars .')
}
do_near_test 1.1 ". . a . . . b . ." { NEAR(a b, 5) } 1

View File

@ -33,7 +33,7 @@ do_execsql_test 1.3 {
DROP TABLE ft1;
}
do_execsql_test 1.4 {
CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter simple');
CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
DROP TABLE ft1;
}
@ -75,7 +75,7 @@ do_catchsql_test 4.2 {
#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
foreach {tn tokenizer} {1 simple 2 unicode61} {
foreach {tn tokenizer} {1 ascii 2 unicode61} {
reset_db
set T "$tokenizer tokenchars ',.:' separators 'xyz'"
execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"

View File

@ -25,12 +25,32 @@ proc tokenize_test {tn tokenizer input output} {
}] [list {*}$output]]
}
foreach {tn t} {1 simple 2 unicode61} {
foreach {tn t} {1 ascii 2 unicode61} {
tokenize_test 1.$tn.0 $t {A B C D} {a b c d}
tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely}
tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
tokenize_test 1.$tn.3 $t {} {}
}
#-------------------------------------------------------------------------
# Check that "unicode61" really is the default tokenizer.
#
do_execsql_test 2.0 "
CREATE VIRTUAL TABLE t1 USING fts5(x);
CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61);
CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii);
INSERT INTO t1 VALUES('\xC0\xC8\xCC');
INSERT INTO t2 VALUES('\xC0\xC8\xCC');
INSERT INTO t3 VALUES('\xC0\xC8\xCC');
"
breakpoint
do_execsql_test 2.1 "
SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC';
" {t1 t2}
finish_test