mirror of
https://github.com/sqlite/sqlite.git
synced 2026-01-06 08:01:16 +03:00
Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5
and FTS3/4. FossilOrigin-Name: 06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec
This commit is contained in:
@@ -82,7 +82,7 @@ typedef struct unicode_cursor unicode_cursor;
|
||||
|
||||
struct unicode_tokenizer {
|
||||
sqlite3_tokenizer base;
|
||||
int bRemoveDiacritic;
|
||||
int eRemoveDiacritic;
|
||||
int nException;
|
||||
int *aiException;
|
||||
};
|
||||
@@ -227,17 +227,20 @@ static int unicodeCreate(
|
||||
pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
|
||||
if( pNew==NULL ) return SQLITE_NOMEM;
|
||||
memset(pNew, 0, sizeof(unicode_tokenizer));
|
||||
pNew->bRemoveDiacritic = 1;
|
||||
pNew->eRemoveDiacritic = 1;
|
||||
|
||||
for(i=0; rc==SQLITE_OK && i<nArg; i++){
|
||||
const char *z = azArg[i];
|
||||
int n = (int)strlen(z);
|
||||
|
||||
if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
|
||||
pNew->bRemoveDiacritic = 1;
|
||||
pNew->eRemoveDiacritic = 1;
|
||||
}
|
||||
else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
|
||||
pNew->bRemoveDiacritic = 0;
|
||||
pNew->eRemoveDiacritic = 0;
|
||||
}
|
||||
else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){
|
||||
pNew->eRemoveDiacritic = 2;
|
||||
}
|
||||
else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
|
||||
rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
|
||||
@@ -350,7 +353,7 @@ static int unicodeNext(
|
||||
|
||||
/* Write the folded case of the last character read to the output */
|
||||
zEnd = z;
|
||||
iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);
|
||||
iOut = sqlite3FtsUnicodeFold((int)iCode, p->eRemoveDiacritic);
|
||||
if( iOut ){
|
||||
WRITE_UTF8(zOut, iOut);
|
||||
}
|
||||
|
||||
@@ -159,32 +159,47 @@ int sqlite3FtsUnicodeIsalnum(int c){
|
||||
** E"). The results of passing a codepoint that corresponds to an
|
||||
** uppercase letter are undefined.
|
||||
*/
|
||||
static int remove_diacritic(int c){
|
||||
static int remove_diacritic(int c, int bComplex){
|
||||
unsigned short aDia[] = {
|
||||
0, 1797, 1848, 1859, 1891, 1928, 1940, 1995,
|
||||
2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286,
|
||||
2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732,
|
||||
2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336,
|
||||
3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928,
|
||||
3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234,
|
||||
4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504,
|
||||
6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529,
|
||||
61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
|
||||
61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
|
||||
62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
|
||||
62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
|
||||
62924, 63050, 63082, 63274, 63390,
|
||||
3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896,
|
||||
3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106,
|
||||
4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344,
|
||||
4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198,
|
||||
6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468,
|
||||
61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
|
||||
61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
|
||||
61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
|
||||
62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
|
||||
62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
|
||||
62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
|
||||
63182, 63242, 63274, 63310, 63368, 63390,
|
||||
};
|
||||
char aChar[] = {
|
||||
'\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',
|
||||
'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',
|
||||
's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',
|
||||
'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r',
|
||||
'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0',
|
||||
'\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h',
|
||||
'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't',
|
||||
'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a',
|
||||
'e', 'i', 'o', 'u', 'y',
|
||||
'\0', 'a'|0x00, 'c'|0x00, 'e'|0x00, 'i'|0x00, 'n'|0x00,
|
||||
'o'|0x00, 'u'|0x00, 'y'|0x00, 'y'|0x00, 'a'|0x00, 'c'|0x00,
|
||||
'd'|0x00, 'e'|0x00, 'e'|0x00, 'g'|0x00, 'h'|0x00, 'i'|0x00,
|
||||
'j'|0x00, 'k'|0x00, 'l'|0x00, 'n'|0x00, 'o'|0x00, 'r'|0x00,
|
||||
's'|0x00, 't'|0x00, 'u'|0x00, 'u'|0x00, 'w'|0x00, 'y'|0x00,
|
||||
'z'|0x00, 'o'|0x00, 'u'|0x00, 'a'|0x00, 'i'|0x00, 'o'|0x00,
|
||||
'u'|0x00, 'u'|0x80, 'a'|0x80, 'g'|0x00, 'k'|0x00, 'o'|0x00,
|
||||
'o'|0x80, 'j'|0x00, 'g'|0x00, 'n'|0x00, 'a'|0x80, 'a'|0x00,
|
||||
'e'|0x00, 'i'|0x00, 'o'|0x00, 'r'|0x00, 'u'|0x00, 's'|0x00,
|
||||
't'|0x00, 'h'|0x00, 'a'|0x00, 'e'|0x00, 'o'|0x80, 'o'|0x00,
|
||||
'o'|0x80, 'y'|0x00, '\0', '\0', '\0', '\0',
|
||||
'\0', '\0', '\0', '\0', 'a'|0x00, 'b'|0x00,
|
||||
'c'|0x80, 'd'|0x00, 'd'|0x00, 'e'|0x80, 'e'|0x00, 'e'|0x80,
|
||||
'f'|0x00, 'g'|0x00, 'h'|0x00, 'h'|0x00, 'i'|0x00, 'i'|0x80,
|
||||
'k'|0x00, 'l'|0x00, 'l'|0x80, 'l'|0x00, 'm'|0x00, 'n'|0x00,
|
||||
'o'|0x80, 'p'|0x00, 'r'|0x00, 'r'|0x80, 'r'|0x00, 's'|0x00,
|
||||
's'|0x80, 't'|0x00, 'u'|0x00, 'u'|0x80, 'v'|0x00, 'w'|0x00,
|
||||
'w'|0x00, 'x'|0x00, 'y'|0x00, 'z'|0x00, 'h'|0x00, 't'|0x00,
|
||||
'w'|0x00, 'y'|0x00, 'a'|0x00, 'a'|0x80, 'a'|0x80, 'a'|0x80,
|
||||
'e'|0x00, 'e'|0x80, 'e'|0x80, 'i'|0x00, 'o'|0x00, 'o'|0x80,
|
||||
'o'|0x80, 'o'|0x80, 'u'|0x00, 'u'|0x80, 'u'|0x80, 'y'|0x00,
|
||||
};
|
||||
|
||||
unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
|
||||
@@ -201,7 +216,8 @@ static int remove_diacritic(int c){
|
||||
}
|
||||
}
|
||||
assert( key>=aDia[iRes] );
|
||||
return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
|
||||
if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
|
||||
return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
|
||||
}
|
||||
|
||||
|
||||
@@ -228,7 +244,7 @@ int sqlite3FtsUnicodeIsdiacritic(int c){
|
||||
** The results are undefined if the value passed to this function
|
||||
** is less than zero.
|
||||
*/
|
||||
int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
|
||||
int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){
|
||||
/* Each entry in the following array defines a rule for folding a range
|
||||
** of codepoints to lower case. The rule applies to a range of nRange
|
||||
** codepoints starting at codepoint iCode.
|
||||
@@ -351,7 +367,9 @@ int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
|
||||
assert( ret>0 );
|
||||
}
|
||||
|
||||
if( bRemoveDiacritic ) ret = remove_diacritic(ret);
|
||||
if( eRemoveDiacritic ){
|
||||
ret = remove_diacritic(ret, eRemoveDiacritic==2);
|
||||
}
|
||||
}
|
||||
|
||||
else if( c>=66560 && c<66600 ){
|
||||
|
||||
@@ -9,11 +9,12 @@ proc print_rd {map} {
|
||||
set nRange 1
|
||||
set iFirst [lindex $map 0 0]
|
||||
set cPrev [lindex $map 0 1]
|
||||
set fPrev [lindex $map 0 2]
|
||||
|
||||
foreach m [lrange $map 1 end] {
|
||||
foreach {i c} $m {}
|
||||
foreach {i c f} $m {}
|
||||
|
||||
if {$cPrev == $c} {
|
||||
if {$cPrev == $c && $fPrev==$f} {
|
||||
for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
|
||||
if {[info exists tl_lookup_table($j)]==0} break
|
||||
}
|
||||
@@ -29,13 +30,16 @@ proc print_rd {map} {
|
||||
|
||||
lappend lRange [list $iFirst $nRange]
|
||||
lappend aChar $cPrev
|
||||
lappend aFlag $fPrev
|
||||
|
||||
set iFirst $i
|
||||
set cPrev $c
|
||||
set fPrev $f
|
||||
set nRange 1
|
||||
}
|
||||
lappend lRange [list $iFirst $nRange]
|
||||
lappend aChar $cPrev
|
||||
lappend aFlag $fPrev
|
||||
|
||||
puts "/*"
|
||||
puts "** If the argument is a codepoint corresponding to a lowercase letter"
|
||||
@@ -45,7 +49,7 @@ proc print_rd {map} {
|
||||
puts "** E\"). The resuls of passing a codepoint that corresponds to an"
|
||||
puts "** uppercase letter are undefined."
|
||||
puts "*/"
|
||||
puts "static int ${::remove_diacritic}(int c)\{"
|
||||
puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
|
||||
puts " unsigned short aDia\[\] = \{"
|
||||
puts -nonewline " 0, "
|
||||
set i 1
|
||||
@@ -60,13 +64,17 @@ proc print_rd {map} {
|
||||
puts ""
|
||||
puts " \};"
|
||||
puts " char aChar\[\] = \{"
|
||||
puts -nonewline " '\\0', "
|
||||
puts -nonewline " '\\0', "
|
||||
set i 1
|
||||
foreach c $aChar {
|
||||
set str "'$c', "
|
||||
if {$c == ""} { set str "'\\0', " }
|
||||
foreach c $aChar f $aFlag {
|
||||
if { $f } {
|
||||
set str "'$c'|0x80, "
|
||||
} else {
|
||||
set str "'$c'|0x00, "
|
||||
}
|
||||
if {$c == ""} { set str "'\\0', " }
|
||||
|
||||
if {($i % 12)==0} {puts "" ; puts -nonewline " " }
|
||||
if {($i % 6)==0} {puts "" ; puts -nonewline " " }
|
||||
incr i
|
||||
puts -nonewline "$str"
|
||||
}
|
||||
@@ -87,7 +95,8 @@ proc print_rd {map} {
|
||||
}
|
||||
}
|
||||
assert( key>=aDia[iRes] );
|
||||
return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
|
||||
if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
|
||||
return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
|
||||
puts "\}"
|
||||
}
|
||||
|
||||
@@ -95,7 +104,8 @@ proc print_isdiacritic {zFunc map} {
|
||||
|
||||
set lCode [list]
|
||||
foreach m $map {
|
||||
foreach {code char} $m {}
|
||||
foreach {code char flag} $m {}
|
||||
if {$flag} continue
|
||||
if {$code && $char == ""} { lappend lCode $code }
|
||||
}
|
||||
set lCode [lsort -integer $lCode]
|
||||
@@ -472,7 +482,7 @@ proc print_fold {zFunc} {
|
||||
puts "** The results are undefined if the value passed to this function"
|
||||
puts "** is less than zero."
|
||||
puts "*/"
|
||||
puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
|
||||
puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
|
||||
|
||||
set liOff [tl_generate_ioff_table $lRecord]
|
||||
tl_print_table_header
|
||||
@@ -516,7 +526,9 @@ proc print_fold {zFunc} {
|
||||
assert( ret>0 );
|
||||
}
|
||||
|
||||
if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
|
||||
if( eRemoveDiacritic ){
|
||||
ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
|
||||
}
|
||||
}
|
||||
}]
|
||||
|
||||
|
||||
@@ -7,12 +7,24 @@
|
||||
# character that it should be replaced with, or an empty string if the
|
||||
# codepoint should simply be removed from the input. Examples:
|
||||
#
|
||||
# { 224 a } (replace codepoint 224 to "a")
|
||||
# { 769 "" } (remove codepoint 769 from input)
|
||||
# { 224 a 0 } (replace codepoint 224 with "a")
|
||||
# { 769 "" 0 } (remove codepoint 769 from input)
|
||||
#
|
||||
# Mappings are only returned for non-upper case codepoints. It is assumed
|
||||
# that the input has already been folded to lower case.
|
||||
#
|
||||
# The third value in the list is always either 0 or 1. 0 if the
|
||||
# UnicodeData.txt file maps the codepoint to a single ASCII character and
|
||||
# a diacritic, or 1 if the mapping is indirect. For example, consider the
|
||||
# two entries:
|
||||
#
|
||||
# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
|
||||
# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
|
||||
#
|
||||
# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
|
||||
# diacritic). The second is an indirect mapping, as it maps to the
|
||||
# first codepoint plus 0302 (a diacritic).
|
||||
#
|
||||
proc rd_load_unicodedata_text {zName} {
|
||||
global tl_lookup_table
|
||||
|
||||
@@ -53,18 +65,29 @@ proc rd_load_unicodedata_text {zName} {
|
||||
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
|
||||
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
|
||||
|
||||
# Filter out upper-case characters, as they will be mapped to their
|
||||
# lower-case equivalents before this data is used.
|
||||
if {[info exists tl_lookup_table($iCode)]} continue
|
||||
|
||||
# Check if this is an indirect mapping. If so, set bIndirect to true
|
||||
# and change $iAscii to the indirectly mapped ASCII character.
|
||||
set bIndirect 0
|
||||
if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
|
||||
set iAscii $mapping($iAscii)
|
||||
set bIndirect 1
|
||||
}
|
||||
|
||||
if { ($iAscii >= 97 && $iAscii <= 122)
|
||||
|| ($iAscii >= 65 && $iAscii <= 90)
|
||||
} {
|
||||
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
|
||||
lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
|
||||
set mapping($iCode) $iAscii
|
||||
set dia($iDia) 1
|
||||
}
|
||||
}
|
||||
|
||||
foreach d [array names dia] {
|
||||
lappend lRet [list $d ""]
|
||||
lappend lRet [list $d "" 0]
|
||||
}
|
||||
set lRet [lsort -integer -index 0 $lRet]
|
||||
|
||||
|
||||
@@ -234,13 +234,18 @@ struct Unicode61Tokenizer {
|
||||
unsigned char aTokenChar[128]; /* ASCII range token characters */
|
||||
char *aFold; /* Buffer to fold text into */
|
||||
int nFold; /* Size of aFold[] in bytes */
|
||||
int bRemoveDiacritic; /* True if remove_diacritics=1 is set */
|
||||
int eRemoveDiacritic; /* One of the FTS5_REMOVE_DIACRITICS_* values */
|
||||
int nException;
|
||||
int *aiException;
|
||||
|
||||
unsigned char aCategory[32]; /* True for token char categories */
|
||||
};
|
||||
|
||||
/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
|
||||
#define FTS5_REMOVE_DIACRITICS_NONE 0
|
||||
#define FTS5_REMOVE_DIACRITICS_SIMPLE 1
|
||||
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
|
||||
|
||||
static int fts5UnicodeAddExceptions(
|
||||
Unicode61Tokenizer *p, /* Tokenizer object */
|
||||
const char *z, /* Characters to treat as exceptions */
|
||||
@@ -361,7 +366,7 @@ static int fts5UnicodeCreate(
|
||||
int i;
|
||||
memset(p, 0, sizeof(Unicode61Tokenizer));
|
||||
|
||||
p->bRemoveDiacritic = 1;
|
||||
p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
|
||||
p->nFold = 64;
|
||||
p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
|
||||
if( p->aFold==0 ){
|
||||
@@ -382,10 +387,15 @@ static int fts5UnicodeCreate(
|
||||
for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
|
||||
const char *zArg = azArg[i+1];
|
||||
if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
|
||||
if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
|
||||
if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
|
||||
rc = SQLITE_ERROR;
|
||||
}else{
|
||||
p->eRemoveDiacritic = (zArg[0] - '0');
|
||||
assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
|
||||
|| p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
|
||||
|| p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
|
||||
);
|
||||
}
|
||||
p->bRemoveDiacritic = (zArg[0]=='1');
|
||||
}else
|
||||
if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
|
||||
rc = fts5UnicodeAddExceptions(p, zArg, 1);
|
||||
@@ -499,7 +509,7 @@ static int fts5UnicodeTokenize(
|
||||
READ_UTF8(zCsr, zTerm, iCode);
|
||||
if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
|
||||
non_ascii_tokenchar:
|
||||
iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
|
||||
iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
|
||||
if( iCode ) WRITE_UTF8(zOut, iCode);
|
||||
}else{
|
||||
break;
|
||||
|
||||
@@ -28,32 +28,47 @@
|
||||
** E"). The results of passing a codepoint that corresponds to an
|
||||
** uppercase letter are undefined.
|
||||
*/
|
||||
static int fts5_remove_diacritic(int c){
|
||||
static int fts5_remove_diacritic(int c, int bComplex){
|
||||
unsigned short aDia[] = {
|
||||
0, 1797, 1848, 1859, 1891, 1928, 1940, 1995,
|
||||
2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286,
|
||||
2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732,
|
||||
2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336,
|
||||
3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928,
|
||||
3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234,
|
||||
4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504,
|
||||
6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529,
|
||||
61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
|
||||
61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
|
||||
62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
|
||||
62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
|
||||
62924, 63050, 63082, 63274, 63390,
|
||||
3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896,
|
||||
3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106,
|
||||
4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344,
|
||||
4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198,
|
||||
6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468,
|
||||
61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
|
||||
61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
|
||||
61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
|
||||
62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
|
||||
62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
|
||||
62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
|
||||
63182, 63242, 63274, 63310, 63368, 63390,
|
||||
};
|
||||
char aChar[] = {
|
||||
'\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',
|
||||
'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',
|
||||
's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',
|
||||
'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r',
|
||||
'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0',
|
||||
'\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h',
|
||||
'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't',
|
||||
'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a',
|
||||
'e', 'i', 'o', 'u', 'y',
|
||||
'\0', 'a'|0x00, 'c'|0x00, 'e'|0x00, 'i'|0x00, 'n'|0x00,
|
||||
'o'|0x00, 'u'|0x00, 'y'|0x00, 'y'|0x00, 'a'|0x00, 'c'|0x00,
|
||||
'd'|0x00, 'e'|0x00, 'e'|0x00, 'g'|0x00, 'h'|0x00, 'i'|0x00,
|
||||
'j'|0x00, 'k'|0x00, 'l'|0x00, 'n'|0x00, 'o'|0x00, 'r'|0x00,
|
||||
's'|0x00, 't'|0x00, 'u'|0x00, 'u'|0x00, 'w'|0x00, 'y'|0x00,
|
||||
'z'|0x00, 'o'|0x00, 'u'|0x00, 'a'|0x00, 'i'|0x00, 'o'|0x00,
|
||||
'u'|0x00, 'u'|0x80, 'a'|0x80, 'g'|0x00, 'k'|0x00, 'o'|0x00,
|
||||
'o'|0x80, 'j'|0x00, 'g'|0x00, 'n'|0x00, 'a'|0x80, 'a'|0x00,
|
||||
'e'|0x00, 'i'|0x00, 'o'|0x00, 'r'|0x00, 'u'|0x00, 's'|0x00,
|
||||
't'|0x00, 'h'|0x00, 'a'|0x00, 'e'|0x00, 'o'|0x80, 'o'|0x00,
|
||||
'o'|0x80, 'y'|0x00, '\0', '\0', '\0', '\0',
|
||||
'\0', '\0', '\0', '\0', 'a'|0x00, 'b'|0x00,
|
||||
'c'|0x80, 'd'|0x00, 'd'|0x00, 'e'|0x80, 'e'|0x00, 'e'|0x80,
|
||||
'f'|0x00, 'g'|0x00, 'h'|0x00, 'h'|0x00, 'i'|0x00, 'i'|0x80,
|
||||
'k'|0x00, 'l'|0x00, 'l'|0x80, 'l'|0x00, 'm'|0x00, 'n'|0x00,
|
||||
'o'|0x80, 'p'|0x00, 'r'|0x00, 'r'|0x80, 'r'|0x00, 's'|0x00,
|
||||
's'|0x80, 't'|0x00, 'u'|0x00, 'u'|0x80, 'v'|0x00, 'w'|0x00,
|
||||
'w'|0x00, 'x'|0x00, 'y'|0x00, 'z'|0x00, 'h'|0x00, 't'|0x00,
|
||||
'w'|0x00, 'y'|0x00, 'a'|0x00, 'a'|0x80, 'a'|0x80, 'a'|0x80,
|
||||
'e'|0x00, 'e'|0x80, 'e'|0x80, 'i'|0x00, 'o'|0x00, 'o'|0x80,
|
||||
'o'|0x80, 'o'|0x80, 'u'|0x00, 'u'|0x80, 'u'|0x80, 'y'|0x00,
|
||||
};
|
||||
|
||||
unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
|
||||
@@ -70,7 +85,8 @@ static int fts5_remove_diacritic(int c){
|
||||
}
|
||||
}
|
||||
assert( key>=aDia[iRes] );
|
||||
return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
|
||||
if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
|
||||
return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
|
||||
}
|
||||
|
||||
|
||||
@@ -97,7 +113,7 @@ int sqlite3Fts5UnicodeIsdiacritic(int c){
|
||||
** The results are undefined if the value passed to this function
|
||||
** is less than zero.
|
||||
*/
|
||||
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
|
||||
int sqlite3Fts5UnicodeFold(int c, int eRemoveDiacritic){
|
||||
/* Each entry in the following array defines a rule for folding a range
|
||||
** of codepoints to lower case. The rule applies to a range of nRange
|
||||
** codepoints starting at codepoint iCode.
|
||||
@@ -220,7 +236,9 @@ int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
|
||||
assert( ret>0 );
|
||||
}
|
||||
|
||||
if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
|
||||
if( eRemoveDiacritic ){
|
||||
ret = fts5_remove_diacritic(ret, eRemoveDiacritic==2);
|
||||
}
|
||||
}
|
||||
|
||||
else if( c>=66560 && c<66600 ){
|
||||
@@ -231,11 +249,9 @@ int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
|
||||
}
|
||||
|
||||
|
||||
#if 0
|
||||
int sqlite3Fts5UnicodeNCat(void) {
|
||||
return 32;
|
||||
}
|
||||
#endif
|
||||
|
||||
int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){
|
||||
aArray[0] = 1;
|
||||
@@ -756,7 +772,7 @@ void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
|
||||
int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
|
||||
int n = (aFts5UnicodeData[iTbl] >> 5) + i;
|
||||
for(; i<128 && i<n; i++){
|
||||
aAscii[i] = (u8)bToken;
|
||||
aAscii[i] = bToken;
|
||||
}
|
||||
iTbl++;
|
||||
}
|
||||
|
||||
@@ -189,7 +189,7 @@ do_catchsql_test 6.2 {
|
||||
} {1 {error in tokenizer constructor}}
|
||||
do_catchsql_test 6.3 {
|
||||
CREATE VIRTUAL TABLE a3 USING fts5(
|
||||
x, y, tokenize = 'unicode61 remove_diacritics 2'
|
||||
x, y, tokenize = 'unicode61 remove_diacritics 3'
|
||||
);
|
||||
} {1 {error in tokenizer constructor}}
|
||||
do_catchsql_test 6.4 {
|
||||
|
||||
65
ext/fts5/test/fts5umlaut.test
Normal file
65
ext/fts5/test/fts5umlaut.test
Normal file
@@ -0,0 +1,65 @@
|
||||
# 2014 June 17
|
||||
#
|
||||
# The author disclaims copyright to this source code. In place of
|
||||
# a legal notice, here is a blessing:
|
||||
#
|
||||
# May you do good and not evil.
|
||||
# May you find forgiveness for yourself and forgive others.
|
||||
# May you share freely, never taking more than you give.
|
||||
#
|
||||
#*************************************************************************
|
||||
# This file implements regression tests for SQLite library. The
|
||||
# focus of this script is testing the FTS5 module.
|
||||
#
|
||||
|
||||
# Pull in the common FTS5 test script that lives next to this file, and
# set the prefix used for the names of the tests below.
source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5umlaut

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}

# t1 is created with no tokenize option; t2 explicitly selects the
# unicode61 tokenizer with remove_diacritics set to 2.
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(x);
  CREATE VIRTUAL TABLE t2 USING fts5(
    x,
    tokenize="unicode61 remove_diacritics 2"
  );
}

# Each row of the table below supplies:
#
#   tn    - test case number
#   q     - text used both as a query and as a document
#   res1  - expected row count when table t1 is used
#   res2  - expected row count when table t2 is used
#
# For each row, four checks are made: $q is matched against the document
# 'Ha Noi' and 'Ha Noi' is matched against the document $q, first using
# t1 and then using t2. Every check has its own test name so that a
# failure report identifies exactly one of them.
foreach {tn q res1 res2} {
  1 "Hà Nội"                  0 1
  2 "Hà Noi"                  1 1
  3 "Ha Noi"                  1 1
  4 "Ha N\u1ed9i"             0 1
  5 "Ha N\u006fi"             1 1
  6 "Ha N\u006f\u0302i"       1 1
  7 "Ha N\u006f\u0323\u0302i" 1 1
} {
  # Table t1: query $q against the plain document.
  do_execsql_test 1.$tn.1 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t1($q)
  } $res1

  # Table t1: plain query against the document $q.
  do_execsql_test 1.$tn.2 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t1('Ha Noi')
  } $res1

  # Table t2: query $q against the plain document.
  do_execsql_test 1.$tn.3 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t2($q)
  } $res2

  # Table t2: plain query against the document $q.
  do_execsql_test 1.$tn.4 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t2('Ha Noi')
  } $res2
}

finish_test
|
||||
|
||||
@@ -36,24 +36,26 @@ foreach x [an_load_unicodedata_text $UD] {
|
||||
}
|
||||
|
||||
foreach {y} [rd_load_unicodedata_text $UD] {
|
||||
foreach {code ascii} $y {}
|
||||
foreach {code ascii f} $y {}
|
||||
if {$ascii==""} {
|
||||
set int 0
|
||||
} else {
|
||||
binary scan $ascii c int
|
||||
}
|
||||
set aDiacritic($code) $int
|
||||
set aDiacritic($code,$f) $int
|
||||
if {$f==0} { set aDiacritic($code,1) $int }
|
||||
}
|
||||
|
||||
proc tcl_fold {i {bRemoveDiacritic 0}} {
|
||||
global tl_lookup_table
|
||||
global aDiacritic
|
||||
set f [expr $bRemoveDiacritic==2]
|
||||
|
||||
if {[info exists tl_lookup_table($i)]} {
|
||||
set i $tl_lookup_table($i)
|
||||
}
|
||||
if {$bRemoveDiacritic && [info exists aDiacritic($i)]} {
|
||||
set i $aDiacritic($i)
|
||||
if {$bRemoveDiacritic && [info exists aDiacritic($i,$f)]} {
|
||||
set i $aDiacritic($i,$f)
|
||||
}
|
||||
expr $i
|
||||
}
|
||||
@@ -85,7 +87,7 @@ do_execsql_test 1.1 {
|
||||
SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
|
||||
} {0 {}}
|
||||
|
||||
do_execsql_test 1.2 {
|
||||
do_execsql_test 1.2.1 {
|
||||
WITH ii(i) AS (
|
||||
SELECT -1
|
||||
UNION ALL
|
||||
@@ -95,6 +97,16 @@ do_execsql_test 1.2 {
|
||||
WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
|
||||
} {0 {}}
|
||||
|
||||
do_execsql_test 1.2.2 {
|
||||
WITH ii(i) AS (
|
||||
SELECT -1
|
||||
UNION ALL
|
||||
SELECT i+1 FROM ii WHERE i<100000
|
||||
)
|
||||
SELECT count(*), min(i) FROM ii
|
||||
WHERE fts5_fold(i,2)!=CAST(tcl_fold(i,2) AS int);
|
||||
} {0 {}}
|
||||
|
||||
do_execsql_test 1.3 {
|
||||
WITH ii(i) AS (
|
||||
SELECT -1
|
||||
|
||||
Reference in New Issue
Block a user