Have the FTS unicode61 strip out diacritics when tokenizing text. This can be disabled by specifying the tokenizer option "remove_diacritics=0".

FossilOrigin-Name: 790f76a5898dad1a955d40edddf11f7b0fec0ccd
2025-07-30 19:03:16 +03:00 · 2012-06-06 19:30:38 +00:00
parent f05305477c
commit 754d3adf7c
7 changed files with 383 additions and 32 deletions
--- a/ext/fts3/fts3Int.h
+++ b/ext/fts3/fts3Int.h
@ -542,8 +542,11 @@ int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr);
 int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);
 /* fts3_unicode2.c (functions generated by parsing unicode text files) */
-int sqlite3FtsUnicodeTolower(int);
+#ifndef SQLITE_DISABLE_FTS3_UNICODE
 int sqlite3FtsUnicodeFold(int, int);
 int sqlite3FtsUnicodeIsalnum(int);
 int sqlite3FtsUnicodeIsdiacritic(int);
 #endif
 #endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
 #endif /* _FTSINT_H */
--- a/ext/fts3/fts3_unicode.c
+++ b/ext/fts3/fts3_unicode.c
@ -82,6 +82,7 @@ typedef struct unicode_cursor unicode_cursor;
 /*
 ** Tokenizer handle allocated by unicodeCreate(). The embedded base
 ** object is what is handed back to the caller; cursors cast their
 ** base.pTokenizer pointer back to this type to read the options.
 */
 struct unicode_tokenizer {
  sqlite3_tokenizer base;         /* Base class. Address returned via *pp */
  int bRemoveDiacritic;           /* True (the default) to strip diacritics */
 };
 struct unicode_cursor {
@ -103,11 +104,30 @@ static int unicodeCreate(
  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
 ){
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ){
    return SQLITE_NOMEM;
  }
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->bRemoveDiacritic = 1;
  /* Apply the tokenizer options. The only options recognized are the
  ** exact strings "remove_diacritics=1" and "remove_diacritics=0". */
  for(i=0; i<nArg; i++){
    const char *z = azArg[i];
    int n = (int)strlen(z);
    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
      pNew->bRemoveDiacritic = 1;
    }
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->bRemoveDiacritic = 0;
    }
    else{
      /* Unrecognized argument. *pp has not been set yet, so the caller
      ** has no handle through which to release the new object. Free it
      ** here to avoid leaking it. */
      sqlite3_free(pNew);
      return SQLITE_ERROR;
    }
  }
  *pp = &pNew->base;
  return SQLITE_OK;
 }
@ -197,6 +217,8 @@ static int unicodeNext(
  zOut = pCsr->zToken;
  do {
    int iOut;
    /* Grow the output buffer if required. */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
@ -208,12 +230,19 @@ static int unicodeNext(
    /* Write the folded case of the last character read to the output */
    zEnd = z;
-    WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));
+    iOut = sqlite3FtsUnicodeFold(iCode, 
        ((unicode_tokenizer *)pCsr->base.pTokenizer)->bRemoveDiacritic
    );
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }
    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);
-  }while( sqlite3FtsUnicodeIsalnum(iCode) );
+  }while( sqlite3FtsUnicodeIsalnum(iCode) 
       || sqlite3FtsUnicodeIsdiacritic(iCode)
  );
  /* Set the output variables and return. */
  pCsr->iOff = (z - pCsr->aInput);
--- a/ext/fts3/fts3_unicode2.c
+++ b/ext/fts3/fts3_unicode2.c
@ -152,6 +152,74 @@ int sqlite3FtsUnicodeIsalnum(int c){
 }
 /*
 ** If the argument is a codepoint corresponding to a lowercase letter
 ** in the ASCII range with a diacritic added, return the codepoint
 ** of the ASCII letter only. For example, if passed 235 - "LATIN
 ** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
 ** E"). If the argument falls within one of the table ranges that is
 ** mapped to '\0' (the diacritical marks themselves), return 0. Any
 ** codepoint not covered by the table is returned unchanged. The
 ** results of passing a codepoint that corresponds to an uppercase
 ** letter are undefined.
 */
 static int remove_diacritic(int c){
  /* Each aDia[] entry packs one range of codepoints: the upper 13 bits
  ** are the first codepoint of the range and the lower 3 bits are the
  ** number of codepoints in the range less one. Entries are sorted in
  ** ascending order. aChar[i] is the replacement for range aDia[i]. */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
    62924, 63050, 63082, 63274, 63390, 
  };
  static const char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
    'e',  'i',  'o',  'u',  'y',  
  };

  /* Setting the low 3 bits of the key makes it compare greater than or
  ** equal to every packed entry whose first codepoint is <= c. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last entry that is <= key. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* If c lies beyond the end of the range found, it has no mapping. */
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
 }
 /*
 ** Return true (non-zero) if the argument interpreted as a unicode
 ** codepoint is a diacritical modifier character, or zero otherwise.
 ** Only codepoints in the range 768..817 are ever reported as
 ** diacritics.
 */
 int sqlite3FtsUnicodeIsdiacritic(int c){
  /* Bit N of mask0 corresponds to codepoint 768+N and bit N of mask1
  ** to codepoint 800+N. */
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;
  /* The shift count may be as large as 31, so the value shifted must be
  ** unsigned. Left-shifting a signed 1 by 31 bits overflows an int. */
  return (c < 768+32) ?
      (mask0 & (1u << (c-768))) :
      (mask1 & (1u << (c-768-32)));
 }
 /*
 ** Interpret the argument as a unicode codepoint. If the codepoint
 ** is an upper case character that has a lower case equivalent,
@ -161,7 +229,7 @@ int sqlite3FtsUnicodeIsalnum(int c){
 ** The results are undefined if the value passed to this function
 ** is less than zero.
 */
-int sqlite3FtsUnicodeTolower(int c){
+int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
@ -284,6 +352,8 @@ int sqlite3FtsUnicodeTolower(int c){
        assert( ret>0 );
      }
    }
    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
  }
  else if( c>=66560 && c<66600 ){
--- a/ext/fts3/unicode/mkunicode.tcl
+++ b/ext/fts3/unicode/mkunicode.tcl
@ -1,4 +1,208 @@
 #
 # Parameter $zName must be a path to the file UnicodeData.txt. This command
 # reads the file and returns the list of mappings required to remove all
 # diacritical marks from a unicode string, sorted by codepoint. Each mapping
 # is a two-element list: a unicode codepoint, followed by either the single
 # lower-case ASCII character that replaces it or an empty string if the
 # codepoint is to be dropped from the input altogether. Examples:
 #
 #   { 224 a  }     (replace codepoint 224 with "a")
 #   { 769 "" }     (remove codepoint 769 from input)
 #
 # Codepoints that have an entry in the global tl_lookup_table array are
 # never returned. Mappings are only returned for non-upper case codepoints,
 # as it is assumed that the input has already been folded to lower case.
 #
 proc rd_load_unicodedata_text {zName} {
  global tl_lookup_table

  # One variable name for each semicolon-separated field of an input line.
  set lField {
    code character_name general_category canonical_combining_classes
    bidirectional_category character_decomposition_mapping
    decimal_digit_value digit_value numeric_value mirrored
    unicode_1_name iso10646_comment_field
    uppercase_mapping lowercase_mapping titlecase_mapping
  }

  set fd [open $zName]
  set lMap [list]
  while {[gets $fd zLine] >= 0} {
    if {$zLine == ""} continue

    set lValue [split $zLine ";"]
    if {[llength $lValue] != [llength $lField]} { error "parse error: $zLine" }
    foreach $lField $lValue {}

    # Only decompositions of the form "<hex> <hex>" are of interest.
    set lDecomp $character_decomposition_mapping
    if {[llength $lDecomp]!=2} continue
    if {![string is xdigit [lindex $lDecomp 0]]} continue

    set iCode [expr "0x$code"]
    set iBase [expr "0x[lindex $lDecomp 0]"]
    set iMark [expr "0x[lindex $lDecomp 1]"]
    if {[info exists tl_lookup_table($iCode)]} continue

    # Keep the mapping only if the base character is an ASCII letter, and
    # remember the second decomposition codepoint as a diacritical mark.
    set bLower [expr {$iBase >= 97 && $iBase <= 122}]
    set bUpper [expr {$iBase >= 65 && $iBase <= 90}]
    if {!$bLower && !$bUpper} continue
    lappend lMap [list $iCode [string tolower [format %c $iBase]]]
    set aMark($iMark) 1
  }
  close $fd

  # Each diacritical mark seen above maps to the empty string.
  foreach iMark [array names aMark] {
    lappend lMap [list $iMark ""]
  }

  return [lsort -integer -index 0 $lMap]
 }
 #
 # Print the C function remove_diacritic() to stdout. Parameter $map is a
 # list of mappings in the format returned by [rd_load_unicodedata_text].
 #
 # Consecutive mappings that share the same replacement character are
 # merged into a single range of at most 8 codepoints, provided that every
 # codepoint skipped over has an entry in the global tl_lookup_table array
 # (presumably upper-case codepoints, which are folded before this lookup
 # takes place - confirm against tl_load_casefolding_txt). Each range is
 # emitted as (first-codepoint<<3) + (size-1) in array aDia, with the
 # replacement character at the same index of array aChar.
 #
 proc print_rd {map} {
  global tl_lookup_table
  set aChar [list]
  set lRange [list]
  set nRange 1
  set iFirst  [lindex $map 0 0]
  set cPrev   [lindex $map 0 1]
  foreach m [lrange $map 1 end] {
    foreach {i c} $m {}
    if {$cPrev == $c} {
      for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
        if {[info exists tl_lookup_table($j)]==0} break
      }
      if {$j==$i} {
        set nNew [expr {(1 + $i - $iFirst)}]
        if {$nNew<=8} {
          set nRange $nNew
          continue
        }
      }
    }
    lappend lRange [list $iFirst $nRange]
    lappend aChar  $cPrev
    set iFirst $i
    set cPrev  $c
    set nRange 1
  }
  lappend lRange [list $iFirst $nRange]
  lappend aChar $cPrev
  puts "/*"
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 101 (\"LATIN SMALL LETTER"
  puts "** E\"). The results of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int remove_diacritic(int c)\{"
  puts "  static const unsigned short aDia\[\] = \{"
  puts -nonewline "        0, "
  set i 1
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
    incr i
    puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
    puts -nonewline ", "
  }
  puts ""
  puts "  \};"
  puts "  static const char aChar\[\] = \{"
  puts -nonewline "    '\\0', "
  set i 1
  foreach c $aChar {
    set str "'$c',  "
    if {$c == ""} { set str "'\\0', " }
    if {($i % 12)==0} {puts "" ; puts -nonewline "    " }
    incr i
    puts -nonewline "$str"
  }
  puts ""
  puts "  \};"
  puts {
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
  puts "\}"
 }
 #
 # Print to stdout a C function named $zFunc that returns non-zero if its
 # integer argument is one of the codepoints that $map maps to the empty
 # string (a diacritical mark to be removed). Parameter $map is a list of
 # mappings in the format returned by [rd_load_unicodedata_text].
 #
 # The set of codepoints is encoded as two 32-bit masks relative to the
 # smallest codepoint, so the codepoints may span at most 64 values.
 #
 proc print_isdiacritic {zFunc map} {
  set lCode [list]
  foreach m $map {
    foreach {code char} $m {}
    if {$code && $char == ""} { lappend lCode $code }
  }
  set lCode [lsort -integer $lCode]
  set iFirst [lindex $lCode 0]
  set iLast [lindex $lCode end]

  # Two 32-bit masks cannot represent a wider span. Fail loudly instead of
  # silently generating masks that do not fit in an unsigned int.
  if {[llength $lCode]>0 && ($iLast - $iFirst)>=64} {
    error "diacritic codepoints span more than 64 values: $iFirst..$iLast"
  }

  set i1 0
  set i2 0
  foreach c $lCode {
    set i [expr $c - $iFirst]
    if {$i < 32} {
      set i1 [expr {$i1 | (1<<$i)}]
    } else {
      set i2 [expr {$i2 | (1<<($i-32))}]
    }
  }
  puts "/*"
  puts "** Return true if the argument interpreted as a unicode codepoint" 
  puts "** is a diacritical modifier character."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  puts "  unsigned int mask0 = [format "0x%08X" $i1];"
  puts "  unsigned int mask1 = [format "0x%08X" $i2];"
  puts "  if( c<$iFirst || c>$iLast ) return 0;"
  # The generated code shifts an unsigned 1, as the shift count may be 31.
  puts "  return (c < $iFirst+32) ?"
  puts "      (mask0 & (1u << (c-$iFirst))) :"
  puts "      (mask1 & (1u << (c-$iFirst-32)));"
  puts "\}"
 }
 #-------------------------------------------------------------------------
 # Parameter $zName must be a path to the file UnicodeData.txt. This command
 # reads the file and returns a list of codepoints (integers). The list
@ -393,7 +597,7 @@ proc tl_print_ioff_table {liOff} {
 }
-proc print_tolower {zFunc} {
+proc print_fold {zFunc} {
  set lRecord [tl_create_records]
@ -407,7 +611,7 @@ proc print_tolower {zFunc} {
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
-  puts "int ${zFunc}\(int c)\{"
+  puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
  set liOff [tl_generate_ioff_table $lRecord]
  tl_print_table_header
@ -451,6 +655,8 @@ proc print_tolower {zFunc} {
        assert( ret>0 );
      }
    }
    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
  }
  }
@ -463,22 +669,38 @@ proc print_tolower {zFunc} {
  puts "\}"
 }
-proc print_tolower_test {zFunc} {
+proc print_fold_test {zFunc mappings} {
  global tl_lookup_table
-  puts "static int tolower_test(int *piCode)\{"
+  foreach m $mappings {
    set c [lindex $m 1]
    if {$c == ""} {
      set extra([lindex $m 0]) 0
    } else {
      scan $c %c i
      set extra([lindex $m 0]) $i
    }
  }
  puts "static int fold_test(int *piCode)\{"
  puts -nonewline "  static int aLookup\[\] = \{"
  for {set i 0} {$i < 70000} {incr i} {
    set expected $i
    catch { set expected $tl_lookup_table($i) }
-    if {($i % 8)==0}  { puts "" ; puts -nonewline "    " }
+    set expected2 $expected
-    puts -nonewline "$expected, "
+    catch { set expected2 $extra($expected2) }
    if {($i % 4)==0}  { puts "" ; puts -nonewline "    " }
    puts -nonewline "$expected, $expected2, "
  }
  puts "  \};"
  puts "  int i;"
  puts "  for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
-  puts "    if( ${zFunc}\(i)!=aLookup\[i\] )\{"
+  puts "    int iCode = (i/2);"
-  puts "      *piCode = i;"
+  puts "    int bFlag = i & 0x0001;"
  puts "    if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
  puts "      *piCode = iCode;"
  puts "      return 1;"
  puts "    \}"
  puts "  \}"
@ -524,9 +746,9 @@ proc print_test_main {} {
  puts "  r1 = isalnum_test(&code);"
  puts "  if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
  puts "  else printf(\"isalnum(): test passed\\n\");"
-  puts "  r2 = tolower_test(&code);"
+  puts "  r2 = fold_test(&code);"
-  puts "  if( r2 ) printf(\"tolower(): Problem with code %d\\n\",code);"
+  puts "  if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
-  puts "  else printf(\"tolower(): test passed\\n\");"
+  puts "  else printf(\"fold(): test passed\\n\");"
  puts "  return (r1 || r2);"
  puts "\}"
 }
@ -545,9 +767,10 @@ set unicodedata.txt [lindex $argv end]
 set casefolding.txt [lindex $argv end-1]
 set generate_test_code [expr {[llength $argv]==3}]
 print_fileheader
 # Print the isalnum() function to stdout.
 #
 print_fileheader
 set lRange [an_load_separator_ranges]
 print_isalnum sqlite3FtsUnicodeIsalnum $lRange
@ -556,17 +779,28 @@ print_isalnum sqlite3FtsUnicodeIsalnum $lRange
 puts ""
 puts ""
-# Print the tolower() function to stdout.
+# Load the fold data. This is used by the [rd_XXX] commands
-#
+# as well as [print_fold].
 tl_load_casefolding_txt ${casefolding.txt}
-print_tolower sqlite3FtsUnicodeTolower
+
 set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
 print_rd $mappings
 puts ""
 puts ""
 print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings
 puts ""
 puts ""
 # Print the fold() function to stdout.
 #
 print_fold sqlite3FtsUnicodeFold
 # Print the test routines and main() function to stdout, if -test 
 # was specified.
 #
 if {$::generate_test_code} {
  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
-  print_tolower_test sqlite3FtsUnicodeTolower 
+  print_fold_test sqlite3FtsUnicodeFold $mappings
  print_test_main 
 }
--- a/22
+++ b/22
@ -1,5 +1,5 @@
-C Avoid\sresetting\sthe\sshared-cache\sschema\swhen\son\sof\sthe\sconnections\susing\nthe\sshared\scache\scloses.\s\sDelay\sresetting\sthe\sschema\suntil\sthe\slast\sconnection\ncloses.
+C Have\sthe\sFTS\sunicode61\sstrip\sout\sdiacritics\swhen\stokenizing\stext.\sThis\scan\sbe\sdisabled\sby\sspecifying\sthe\stokenizer\soption\s"remove_diacritics=0".
-D 2012-06-06T19:01:13.928
+D 2012-06-06T19:30:38.602
 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
 F Makefile.in 4f37eb61be9d38643cdd839a74b8e3bad724cfcf
 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@ -57,7 +57,7 @@ F ext/fts3/README.tokenizers e0a8b81383ea60d0334d274fadf305ea14a8c314
 F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
 F ext/fts3/fts3.c 41824d0db7d244ca335ce98162df1244863a05c4
 F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe
-F ext/fts3/fts3Int.h 7b163fa22e7a625c404c424f2779a4d7b14c14ad
+F ext/fts3/fts3Int.h 11c711068474ffe66548d21a2a8498b3dea25348
 F ext/fts3/fts3_aux.c 5205182bd8f372782597888156404766edf5781e
 F ext/fts3/fts3_expr.c dbc7ba4c3a6061adde0f38ed8e9b349568299551
 F ext/fts3/fts3_hash.c 8dd2d06b66c72c628c2732555a32bc0943114914
@ -70,15 +70,15 @@ F ext/fts3/fts3_test.c 348f7d08cae05285794e23dc4fe8b8fdf66e264a
 F ext/fts3/fts3_tokenizer.c 3da7254a9881f7e270ab28e2004e0d22b3212bce
 F ext/fts3/fts3_tokenizer.h 66dec98e365854b6cd2d54f1a96bb6d428fc5a68
 F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004
-F ext/fts3/fts3_unicode.c 76b6f6fe6e86acd75b08272502fae74a13cef310
+F ext/fts3/fts3_unicode.c a3c1b0780f764c75844bd13afd9fba139049a121
-F ext/fts3/fts3_unicode2.c 3ddf1728a396a03b5a73ff0f11ecfd2009de117d
+F ext/fts3/fts3_unicode2.c 6381bcfd621b2800df134a560737eaa1ed07cb17
 F ext/fts3/fts3_write.c 6a6391d6b01114f885e24e1f66bbc11ffba0e9e2
 F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9
 F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
 F ext/fts3/tool/fts3view.c 6cfc5b67a5f0e09c0d698f9fd012c784bfaa9197
 F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
 F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
-F ext/fts3/unicode/mkunicode.tcl 2029991cc2cd0bf71df12768578a29c852bf54d1
+F ext/fts3/unicode/mkunicode.tcl 7a9bc018e2962abb79563c5a39fe581fcbf2f675
 F ext/icu/README.txt bf8461d8cdc6b8f514c080e4e10dc3b2bbdfefa9
 F ext/icu/icu.c eb9ae1d79046bd7871aa97ee6da51eb770134b5a
 F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
@ -501,7 +501,7 @@ F test/fts4langid.test 24a6e41063b416bbdf371ff6b4476fa41c194aa7
 F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee
 F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891
 F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7
-F test/fts4unicode.test c812e9cf843e26ba633f58b36a2629f878af20fd
+F test/fts4unicode.test f394585139ff878f9af0c83791a5f612d45a5984
 F test/func.test 9809b7622d721904a8cc33c1ffb87f46d506ed01
 F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f
 F test/func3.test 001021e5b88bd02a3b365a5c5fd8f6f49d39744a
@ -1005,7 +1005,7 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06
 F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
 F tool/warnings-clang.sh a8a0a3babda96dfb1ff51adda3cbbf3dfb7266c2
 F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
-P 61669c95859e187618fb2fb4249306a947ae8d26 c469850b2eb61a63150cc5fc7d2fe98f0b5abffb
+P 635e3a762ddeb1f952f66a08c1d4d53e3f42c9eb
-R 4e8ba0ec11cbdf49789a46888b404344
+R 24f67134e172119b7ccb50ae93a76cbd
-U drh
+U dan
-Z 7052addf7652de59fa1313dd54313af2
+Z 32c2f5c3d9b162b4ae41c62d929207f9
--- a/manifest.uuid
+++ b/manifest.uuid
@ -1 +1 @@
-635e3a762ddeb1f952f66a08c1d4d53e3f42c9eb
+790f76a5898dad1a955d40edddf11f7b0fec0ccd
--- a/test/fts4unicode.test
+++ b/test/fts4unicode.test
@ -18,6 +18,13 @@ ifcapable !fts3_unicode { finish_test ; return }
 set ::testprefix fts4unicode
 proc do_unicode_token_test {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
  " [list [list {*}$res]]]
 }
 proc do_unicode_token_test2 {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', '$input');
@ -40,6 +47,14 @@ do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
 }
 do_unicode_token_test2 1.8  {a B c D} {0 a a 1 b B 2 c c 3 d D}
 do_unicode_token_test2 1.9  {ä ö ü} {0 a ä 1 o ö 2 u ü}
 do_unicode_token_test2 1.10 {xäx xöx xüx} {0 xax xäx 1 xox xöx 2 xux xüx}
 # Check that combining diacritical marks are removed when the tokenizer
 # is used with its default setting (equivalent to remove_diacritics=1),
 # and that they do not break tokens.
 do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"
 #-------------------------------------------------------------------------
 #
 set docs [list {
		`@ -1 +1 @@`
			`635e3a762ddeb1f952f66a08c1d4d53e3f42c9eb`				`790f76a5898dad1a955d40edddf11f7b0fec0ccd`