mirror of
https://github.com/sqlite/sqlite.git
synced 2025-07-29 08:01:23 +03:00
Up until now the fts4 "unicode61" tokenizer has treated all private use codepoints except the first and last of each of the three ranges as alphanumeric (eligible to be part of tokens). This commit fixes this so that all private use codepoints are considered alphanumeric. In other words, it fixes the handling of codepoints 0xE000, 0xF8FF, 0xF0000, 0xFFFFD, 0x100000 and 0x10FFFD.
FossilOrigin-Name: 6cfd9af5250029c0d275be027b4208c48954a8a1
This commit is contained in:
@ -101,28 +101,27 @@ int sqlite3FtsUnicodeIsalnum(int c){
|
||||
0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
|
||||
0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
|
||||
0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
|
||||
0x037FFC02, 0x03E3FC01, 0x03EC7801, 0x03ECA401, 0x03EEC810,
|
||||
0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023,
|
||||
0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807,
|
||||
0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405,
|
||||
0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E,
|
||||
0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01,
|
||||
0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01,
|
||||
0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016,
|
||||
0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004,
|
||||
0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004,
|
||||
0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5,
|
||||
0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01,
|
||||
0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401,
|
||||
0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064,
|
||||
0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F,
|
||||
0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009,
|
||||
0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
|
||||
0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
|
||||
0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
|
||||
0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
|
||||
0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
|
||||
0x43FFF401,
|
||||
0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
|
||||
0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
|
||||
0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
|
||||
0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
|
||||
0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
|
||||
0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
|
||||
0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
|
||||
0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
|
||||
0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
|
||||
0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
|
||||
0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
|
||||
0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
|
||||
0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
|
||||
0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
|
||||
0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
|
||||
0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
|
||||
0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
|
||||
0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
|
||||
0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
|
||||
0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
|
||||
0x380400F0,
|
||||
};
|
||||
static const unsigned int aAscii[4] = {
|
||||
0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
|
||||
|
@ -239,7 +239,10 @@ proc an_load_unicodedata_text {zName} {
|
||||
foreach $lField $fields {}
|
||||
|
||||
set iCode [expr "0x$code"]
|
||||
set bAlnum [expr {[lsearch {L N} [string range $general_category 0 0]]>=0}]
|
||||
set bAlnum [expr {
|
||||
[lsearch {L N} [string range $general_category 0 0]] >= 0
|
||||
|| $general_category=="Co"
|
||||
}]
|
||||
|
||||
if { !$bAlnum } { lappend lRet $iCode }
|
||||
}
|
||||
@ -360,7 +363,7 @@ proc print_isalnum {zFunc lRange} {
|
||||
}
|
||||
assert( aEntry[0]<key );
|
||||
assert( key>=aEntry[iRes] );
|
||||
return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
|
||||
return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
|
||||
}
|
||||
return 1;}
|
||||
puts "\}"
|
||||
@ -729,7 +732,7 @@ proc print_fileheader {} {
|
||||
*/
|
||||
}]
|
||||
puts ""
|
||||
puts "#if !defined(SQLITE_DISABLE_FTS3_UNICODE)"
|
||||
puts "#if defined(SQLITE_ENABLE_FTS4_UNICODE61)"
|
||||
puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
|
||||
puts ""
|
||||
puts "#include <assert.h>"
|
||||
@ -805,4 +808,4 @@ if {$::generate_test_code} {
|
||||
}
|
||||
|
||||
puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
|
||||
puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
|
||||
puts "#endif /* !defined(SQLITE_ENABLE_FTS4_UNICODE61) */"
|
||||
|
Reference in New Issue
Block a user