Fix prefix indexes so that they work in characters, not bytes.

FossilOrigin-Name: af8d43a4a08528bbae25ee38fe25de8a86f8a21c
2025-08-01 06:27:03 +03:00 · 2015-01-13 17:25:08 +00:00
parent 73f7d6ed75
commit 851ca6e715
4 changed files with 167 additions and 15 deletions
--- a/ext/fts5/fts5_index.c
+++ b/ext/fts5/fts5_index.c
@ -81,7 +81,7 @@
 **     + for each segment from oldest to newest:
 **         + segment id (always > 0)
 **         + b-tree height (1 -> root is leaf, 2 -> root is parent of leaf etc.)
-**         + first leaf page number (often 1)
+**         + first leaf page number (often 1, always greater than 0)
 **         + final leaf page number
 **
 ** 2. The Averages Record:
@ -4049,6 +4049,39 @@ int sqlite3Fts5IndexClose(Fts5Index *p, int bDestroy){
  return rc;
 }

+/*
+** Argument p points to a buffer containing utf-8 text that is nByte bytes
+** in size. Return the number of bytes occupied by the first nChar
+** characters of the buffer, or 0 if the buffer contains fewer than nChar
+** characters in total (this includes the case nChar<=0).
+**
+** A character is a single byte, or a lead byte (value 0xc0 or greater)
+** together with any continuation bytes (10xxxxxx) that immediately follow
+** it. The scan for continuation bytes never moves past the end of the
+** buffer, so the return value is never greater than nByte and a multi-byte
+** character truncated by the end of the buffer is counted as one character,
+** which matches how fts5IndexCharlen() counts it.
+*/
+static int fts5IndexCharlenToBytelen(const char *p, int nByte, int nChar){
+  int n = 0;                      /* Bytes consumed so far */
+  int i;                          /* Characters consumed so far */
+  for(i=0; i<nChar; i++){
+    if( n>=nByte ) return 0;      /* Input contains fewer than nChar chars */
+    if( (unsigned char)p[n++]>=0xc0 ){
+      /* Skip continuation bytes, but never read beyond p[nByte-1]. */
+      while( n<nByte && (p[n] & 0xc0)==0x80 ) n++;
+    }
+  }
+  return n;
+}
+
+/*
+** Count the characters in the nIn byte utf-8 buffer pIn and return the
+** total. Each byte with a value of 0xc0 or greater is treated as the
+** start of a multi-byte character and absorbs the continuation bytes
+** (10xxxxxx) that follow it, stopping at the end of the buffer. Every
+** other byte counts as one character by itself.
+*/
+int fts5IndexCharlen(const char *pIn, int nIn){
+  int iByte = 0;                  /* Offset of next unread byte in pIn[] */
+  int nCount;                     /* Characters counted so far */
+  for(nCount=0; iByte<nIn; nCount++){
+    unsigned char c = (unsigned char)pIn[iByte];
+    iByte++;
+    if( c>=0xc0 ){
+      while( iByte<nIn && (pIn[iByte] & 0xc0)==0x80 ){
+        iByte++;
+      }
+    }
+  }
+  return nCount;
+}
+
 /*
 ** Calculate and return a checksum that is the XOR of the index entry
 ** checksum of all entries that would be generated by the token specified
@ -4064,10 +4097,12 @@ u64 sqlite3Fts5IndexCksum(
  u64 ret = 0;                    /* Return value */
  int iIdx;                       /* For iterating through indexes */

-  for(iIdx=0; iIdx<=pConfig->nPrefix; iIdx++){
-    int n = ((iIdx==pConfig->nPrefix) ? nTerm : pConfig->aPrefix[iIdx]);
-    if( n<=nTerm ){
-      ret ^= fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, n);
+  ret = fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, nTerm);
+
+  for(iIdx=0; iIdx<pConfig->nPrefix; iIdx++){
+    int nByte = fts5IndexCharlenToBytelen(pTerm, nTerm, pConfig->aPrefix[iIdx]);
+    if( nByte ){
+      ret ^= fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, nByte);
    }
  }

@ -4107,8 +4142,9 @@ int sqlite3Fts5IndexWrite(
  ** prefix hash tables that it is large enough for. */
  fts5AddTermToHash(p, 0, iCol, iPos, pToken, nToken);
  for(i=0; i<pConfig->nPrefix; i++){
-    if( nToken>=pConfig->aPrefix[i] ){
-      fts5AddTermToHash(p, i+1, iCol, iPos, pToken, pConfig->aPrefix[i]);
+    int nByte = fts5IndexCharlenToBytelen(pToken, nToken, pConfig->aPrefix[i]);
+    if( nByte ){
+      fts5AddTermToHash(p, i+1, iCol, iPos, pToken, nByte);
    }
  }

@ -4130,8 +4166,9 @@ int sqlite3Fts5IndexQuery(

  if( flags & FTS5INDEX_QUERY_PREFIX ){
    Fts5Config *pConfig = p->pConfig;
+    int nChar = fts5IndexCharlen(pToken, nToken);
    for(iIdx=1; iIdx<=pConfig->nPrefix; iIdx++){
-      if( pConfig->aPrefix[iIdx-1]==nToken ) break;
+      if( pConfig->aPrefix[iIdx-1]==nChar ) break;
    }
    if( iIdx>pConfig->nPrefix ){
      iIdx = -1;
@ -4602,6 +4639,55 @@ static void fts5DecodeFunction(
  fts5BufferFree(&s);
 }

+/*
+** The implementation of user-defined scalar function fts5_rowid().
+**
+** The first argument (the "subject") selects which rowid is computed:
+**
+**   fts5_rowid('segment', idx, segid, height, pgno)
+**     Returns FTS5_SEGMENT_ROWID(idx, segid, height, pgno).
+**
+**   fts5_rowid('start-of-index', idx)
+**     Returns FTS5_SEGMENT_ROWID(idx, 1, 0, 0).
+**
+** The subject is matched without regard to case and the remaining
+** arguments are read as integers. If the function is invoked with no
+** arguments, with an unrecognized subject, or with the wrong number of
+** arguments for the subject, an error message is set on pCtx instead of
+** a result.
+*/
+static void fts5RowidFunction(
+  sqlite3_context *pCtx,          /* Function call context */
+  int nArg,                       /* Number of args (depends on subject) */
+  sqlite3_value **apVal           /* Function arguments */
+){
+  const char *zArg;
+  if( nArg==0 ){
+    sqlite3_result_error(pCtx, "should be: fts5_rowid(subject, ....)", -1);
+  }else{
+    zArg = (const char*)sqlite3_value_text(apVal[0]);
+    if( 0==sqlite3_stricmp(zArg, "segment") ){
+      i64 iRowid;
+      int idx, segid, height, pgno;
+      if( nArg!=5 ){
+        sqlite3_result_error(pCtx, 
+            "should be: fts5_rowid('segment', idx, segid, height, pgno)", -1
+        );
+      }else{
+        idx = sqlite3_value_int(apVal[1]);
+        segid = sqlite3_value_int(apVal[2]);
+        height = sqlite3_value_int(apVal[3]);
+        pgno = sqlite3_value_int(apVal[4]);
+        iRowid = FTS5_SEGMENT_ROWID(idx, segid, height, pgno);
+        sqlite3_result_int64(pCtx, iRowid);
+      }
+    }else if( 0==sqlite3_stricmp(zArg, "start-of-index") ){
+      i64 iRowid;
+      int idx;
+      if( nArg!=2 ){
+        sqlite3_result_error(pCtx, 
+            "should be: fts5_rowid('start-of-index', idx)", -1
+        );
+      }else{
+        idx = sqlite3_value_int(apVal[1]);
+        /* Same rowid formula as 'segment', with segid=1, height=0, pgno=0 */
+        iRowid = FTS5_SEGMENT_ROWID(idx, 1, 0, 0);
+        sqlite3_result_int64(pCtx, iRowid);
+      }
+    }else {
+      sqlite3_result_error(pCtx, 
+        "first arg to fts5_rowid() must be 'segment' "
+        "or 'start-of-index' ..."
+        , -1
+      );
+    }
+  }
+}

 /*
 ** This is called as part of registering the FTS5 module with database
@ -4615,6 +4701,11 @@ int sqlite3Fts5IndexInit(sqlite3 *db){
  int rc = sqlite3_create_function(
      db, "fts5_decode", 2, SQLITE_UTF8, 0, fts5DecodeFunction, 0, 0
  );
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_create_function(
+        db, "fts5_rowid", -1, SQLITE_UTF8, 0, fts5RowidFunction, 0, 0
+    );
+  }
  return rc;
 }

--- a/ext/fts5/test/fts5prefix.test
+++ b/ext/fts5/test/fts5prefix.test
@ -0,0 +1,60 @@
+# 2015 Jan 13
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5prefix
+
+
+#-------------------------------------------------------------------------
+# Check that prefix indexes really do index n-character prefixes, not 
+# n-byte prefixes. Use the ascii tokenizer so as not to be confused by
+# diacritic removal.
+#
+do_execsql_test 1.0 { 
+  CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = ascii, prefix = 2) 
+}
+
+# Every character inserted here is outside the ASCII range, so each one
+# occupies more than one byte when encoded as utf-8. A 2-character prefix
+# of either token is therefore longer than 2 bytes.
+do_test 1.1 {
+  foreach {rowid string} {
+    1 "\xCA\xCB\xCC\xCD"
+    2 "\u1234\u5678\u4321\u8765"
+  } {
+    execsql { INSERT INTO t1(rowid, x) VALUES($rowid, $string) }
+  }
+} {}
+
+do_execsql_test 1.2 {
+  INSERT INTO t1(t1) VALUES('integrity-check');
+}
+
+# Run the 2-character prefix queries twice. After each pass, delete every
+# row of t1_data whose rowid lies between the start of index 0 (the index
+# that holds complete tokens) and the start of index 1; presumably this
+# forces the second pass to be answered from the prefix index alone.
+foreach o {1 2} {
+  foreach {tn q res} {
+    1 "SELECT rowid FROM t1 WHERE t1 MATCH '\xCA\xCB*'" 1
+    2 "SELECT rowid FROM t1 WHERE t1 MATCH '\u1234\u5678*'" 2
+  } {
+    do_execsql_test 1.3.$o.$tn $q $res
+  }
+
+  execsql {
+    DELETE FROM t1_data WHERE 
+    rowid>=fts5_rowid('start-of-index', 0) AND 
+    rowid<fts5_rowid('start-of-index', 1);
+  }
+}
+
+
+finish_test
+