Add the "tokenchars=" and "separators=" options, for customizing the set of characters considered to be token separators, to the unicode61 tokenizer.

FossilOrigin-Name: e56fb462aa1f11bb23303ae0dc62815c21e26a52
2025-07-30 19:03:16 +03:00 · 2012-06-07 15:53:48 +00:00
parent a879342b0e
commit 25cdf46ae4
5 changed files with 251 additions and 45 deletions
--- a/ext/fts3/fts3_tokenizer.c
+++ b/ext/fts3/fts3_tokenizer.c
@ -209,10 +209,9 @@ int sqlite3Fts3InitTokenizer(
 /*
 ** Implementation of a special SQL scalar function for testing tokenizers 
 ** designed to be used in concert with the Tcl testing framework. This
-** function must be called with two arguments:
+** function must be called with two or more arguments:
 **
-**   SELECT <function-name>(<key-name>, <input-string>);
-**   SELECT <function-name>(<key-name>, <pointer>);
+**   SELECT <function-name>(<key-name>, ..., <input-string>);
 **
 ** where <function-name> is the name passed as the second argument
 ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
@ -249,27 +248,27 @@ static void testFunc(
  const char *zInput;
  int nInput;

-  const char *zArg = 0;
+  const char *azArg[64];

  const char *zToken;
  int nToken;
  int iStart;
  int iEnd;
  int iPos;
+  int i;

  Tcl_Obj *pRet;

-  assert( argc==2 || argc==3 );
+  if( argc<2 ){
+    sqlite3_result_error(context, "insufficient arguments", -1);
+    return;
+  }

  nName = sqlite3_value_bytes(argv[0]);
  zName = (const char *)sqlite3_value_text(argv[0]);
  nInput = sqlite3_value_bytes(argv[argc-1]);
  zInput = (const char *)sqlite3_value_text(argv[argc-1]);

-  if( argc==3 ){
-    zArg = (const char *)sqlite3_value_text(argv[1]);
-  }
-
  pHash = (Fts3Hash *)sqlite3_user_data(context);
  p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);

@ -283,7 +282,11 @@ static void testFunc(
  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);

-  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
+  for(i=1; i<argc-1; i++){
+    azArg[i-1] = (const char *)sqlite3_value_text(argv[i]);
+  }
+
+  if( SQLITE_OK!=p->xCreate(argc-2, azArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
@ -467,10 +470,7 @@ int sqlite3Fts3InitHashTable(
  }
 #ifdef SQLITE_TEST
  if( SQLITE_OK==rc ){
-    rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0);
-  }
-  if( SQLITE_OK==rc ){
-    rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0);
+    rc = sqlite3_create_function(db, zTest, -1, any, p, testFunc, 0, 0);
  }
  if( SQLITE_OK==rc ){
    rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0);
--- a/ext/fts3/fts3_unicode.c
+++ b/ext/fts3/fts3_unicode.c
@ -83,6 +83,8 @@ typedef struct unicode_cursor unicode_cursor;
 struct unicode_tokenizer {
  sqlite3_tokenizer base;
  int bRemoveDiacritic;
+  int nException;
+  int *aiException;
 };

 struct unicode_cursor {
@ -95,6 +97,121 @@ struct unicode_cursor {
  int nAlloc;                     /* space allocated at zToken */
 };

+
+/*
+** Destroy a tokenizer allocated by unicodeCreate().
+*/
+static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
+  if( pTokenizer ){
+    unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
+    sqlite3_free(p->aiException);
+    sqlite3_free(p);
+  }
+  return SQLITE_OK;
+}
+
+/*
+** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
+** statement has specified that the tokenizer for this table shall consider
+** all characters in string zIn/nIn to be separators (if bAlnum==0) or
+** token characters (if bAlnum==1).
+**
+** For each codepoint in the zIn/nIn string, this function checks if the
+** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
+** If so, no action is taken. Otherwise, the codepoint is added to the 
+** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
+** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
+** codepoints in the aiException[] array.
+**
+** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
+** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
+** It is not possible to change the behaviour of the tokenizer with respect
+** to these codepoints.
+*/
+static int unicodeAddExceptions(
+  unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
+  int bAlnum,                     /* Replace Isalnum() return value with this */
+  const char *zIn,                /* Array of characters to make exceptions */
+  int nIn                         /* Length of z in bytes */
+){
+  const unsigned char *z = (const unsigned char *)zIn;
+  const unsigned char *zTerm = &z[nIn];
+  int iCode;
+  int nEntry = 0;
+
+  assert( bAlnum==0 || bAlnum==1 );
+
+  while( z<zTerm ){
+    READ_UTF8(z, zTerm, iCode);
+    assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
+    if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum 
+     && sqlite3FtsUnicodeIsdiacritic(iCode)==0 
+    ){
+      nEntry++;
+    }
+  }
+
+  if( nEntry ){
+    int *aNew;                    /* New aiException[] array */
+    int nNew;                     /* Number of valid entries in array aNew[] */
+
+    aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
+    if( aNew==0 ) return SQLITE_NOMEM;
+    nNew = p->nException;
+
+    z = (const unsigned char *)zIn;
+    while( z<zTerm ){
+      READ_UTF8(z, zTerm, iCode);
+      if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum 
+       && sqlite3FtsUnicodeIsdiacritic(iCode)==0
+      ){
+        int i, j;
+        for(i=0; i<nNew && aNew[i]<iCode; i++);
+        for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
+        aNew[i] = iCode;
+        nNew++;
+      }
+    }
+    p->aiException = aNew;
+    p->nException = nNew;
+  }
+
+  return SQLITE_OK;
+}
+
+/*
+** Return true if the p->aiException[] array contains the value iCode.
+*/
+static int unicodeIsException(unicode_tokenizer *p, int iCode){
+  if( p->nException>0 ){
+    int *a = p->aiException;
+    int iLo = 0;
+    int iHi = p->nException-1;
+
+    while( iHi>=iLo ){
+      int iTest = (iHi + iLo) / 2;
+      if( iCode==a[iTest] ){
+        return 1;
+      }else if( iCode>a[iTest] ){
+        iLo = iTest+1;
+      }else{
+        iHi = iTest-1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/*
+** Return true if, for the purposes of tokenization, codepoint iCode is
+** considered a token character (not a separator).
+*/
+static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
+  assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
+  return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
+}
+
 /*
 ** Create a new tokenizer instance.
 */
@ -105,14 +222,14 @@ static int unicodeCreate(
 ){
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
+  int rc = SQLITE_OK;
+
  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
-  if( pNew==NULL ){
-    return SQLITE_NOMEM;
-  }
+  if( pNew==NULL ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->bRemoveDiacritic = 1;

-  for(i=0; i<nArg; i++){
+  for(i=0; rc==SQLITE_OK && i<nArg; i++){
    const char *z = azArg[i];
    int n = strlen(z);

@ -122,22 +239,24 @@ static int unicodeCreate(
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->bRemoveDiacritic = 0;
    }
+    else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
+      rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
+    }
+    else if( n>=11 && memcmp("separators=", z, 11)==0 ){
+      rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
+    }
    else{
      /* Unrecognized argument */
-      return SQLITE_ERROR;
+      rc  = SQLITE_ERROR;
    }
  }

-  *pp = &pNew->base;
-  return SQLITE_OK;
-}
-
-/*
-** Destroy a tokenizer allocated by unicodeCreate().
-*/
-static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
-  sqlite3_free(pTokenizer);
-  return SQLITE_OK;
+  if( rc!=SQLITE_OK ){
+    unicodeDestroy((sqlite3_tokenizer *)pNew);
+    pNew = 0;
+  }
+  *pp = (sqlite3_tokenizer *)pNew;
+  return rc;
 }

 /*
@ -190,14 +309,15 @@ static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
 ** have been opened by a prior call to simpleOpen().
 */
 static int unicodeNext(
-  sqlite3_tokenizer_cursor *p,    /* Cursor returned by simpleOpen */
+  sqlite3_tokenizer_cursor *pC,   /* Cursor returned by simpleOpen */
  const char **paToken,           /* OUT: Token text */
  int *pnToken,                   /* OUT: Number of bytes at *paToken */
  int *piStart,                   /* OUT: Starting offset of token */
  int *piEnd,                     /* OUT: Ending offset of token */
  int *piPos                      /* OUT: Position integer of token */
 ){
-  unicode_cursor *pCsr = (unicode_cursor *)p;
+  unicode_cursor *pCsr = (unicode_cursor *)pC;
+  unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
  int iCode;
  char *zOut;
  const unsigned char *z = &pCsr->aInput[pCsr->iOff];
@ -210,7 +330,7 @@ static int unicodeNext(
  ** the input.  */
  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
-    if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
+    if( unicodeIsAlnum(p, iCode) ) break;
    zStart = z;
  }
  if( zStart>=zTerm ) return SQLITE_DONE;
@ -230,9 +350,7 @@ static int unicodeNext(

    /* Write the folded case of the last character read to the output */
    zEnd = z;
-    iOut = sqlite3FtsUnicodeFold(iCode, 
-        ((unicode_tokenizer *)pCsr->base.pTokenizer)->bRemoveDiacritic
-    );
+    iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }
@ -240,7 +358,7 @@ static int unicodeNext(
    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);
-  }while( sqlite3FtsUnicodeIsalnum(iCode) 
+  }while( unicodeIsAlnum(p, iCode) 
       || sqlite3FtsUnicodeIsdiacritic(iCode)
  );