Up until now the fts4 "unicode61" tokenizer has treated all private use codepoints except the first and last of each of the three ranges as alphanumeric (eligible to be part of tokens). This commit fixes this so that all private use codepoints are considered alphanumeric. In other words, it fixes the handling of codepoints 0xE000, 0xF8FF, 0xF0000, 0xFFFFD, 0x100000 and 0x10FFFD.

FossilOrigin-Name: 6cfd9af5250029c0d275be027b4208c48954a8a1
2025-07-29 08:01:23 +03:00 · 2013-06-05 16:17:21 +00:00
parent f5ad80397d
commit f2c9229f73
5 changed files with 56 additions and 36 deletions
--- a/test/fts4unicode.test
+++ b/test/fts4unicode.test
@ -384,5 +384,23 @@ foreach T $tokenizers {
  do_isspace_test 6.$T.19 $T   {8287 12288}
 }

+#-------------------------------------------------------------------------
+# Test that the private use ranges are treated as alphanumeric.
+#
+# Each codepoint $c below lies in the BMP private use range
+# U+E000..U+F8FF; both end points of the range are included. In every
+# template string the "*" placeholder is replaced by $c, giving two cases
+# per codepoint:
+#
+#   1. No tokenizer options: the expected output lists the whole of
+#      "hello${c}world" as one token, i.e. $c is part of the token.
+#
+#   2. "separators=$c": the expected output lists "hello" and "world"
+#      as two separate tokens, i.e. $c now splits the input.
+#
+foreach {tn1 c} {
+  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
+} {
+  foreach {tn2 config res} {
+    1 ""             "0 hello*world hello*world"
+    2 "separators=*" "0 hello hello 1 world world"
+  } {
+    # Substitute the codepoint under test for "*" in the tokenizer
+    # configuration, the input text and the expected result.
+    set config [string map [list * $c] $config]
+    set input  [string map [list * $c] "hello*world"]
+    set output [string map [list * $c] $res]
+    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
+  }
+}
+

 finish_test