mirror of
https://github.com/sqlite/sqlite.git
synced 2025-08-07 02:42:48 +03:00
Improve test coverage of fts5_unicode2.c.
FossilOrigin-Name: fea8a4db9d8c7b9a946017a0dc984cbca6ce240e
This commit is contained in:
@@ -1,77 +1,5 @@
|
|||||||
|
|
||||||
#
|
source [file join [file dirname [info script]] parseunicode.tcl]
|
||||||
# Parameter $zName must be a path to the file UnicodeData.txt. This command
|
|
||||||
# reads the file and returns a list of mappings required to remove all
|
|
||||||
# diacritical marks from a unicode string. Each mapping is itself a list
|
|
||||||
# consisting of two elements - the unicode codepoint and the single ASCII
|
|
||||||
# character that it should be replaced with, or an empty string if the
|
|
||||||
# codepoint should simply be removed from the input. Examples:
|
|
||||||
#
|
|
||||||
# { 224 a } (replace codepoint 224 to "a")
|
|
||||||
# { 769 "" } (remove codepoint 769 from input)
|
|
||||||
#
|
|
||||||
# Mappings are only returned for non-upper case codepoints. It is assumed
|
|
||||||
# that the input has already been folded to lower case.
|
|
||||||
#
|
|
||||||
proc rd_load_unicodedata_text {zName} {
|
|
||||||
global tl_lookup_table
|
|
||||||
|
|
||||||
set fd [open $zName]
|
|
||||||
set lField {
|
|
||||||
code
|
|
||||||
character_name
|
|
||||||
general_category
|
|
||||||
canonical_combining_classes
|
|
||||||
bidirectional_category
|
|
||||||
character_decomposition_mapping
|
|
||||||
decimal_digit_value
|
|
||||||
digit_value
|
|
||||||
numeric_value
|
|
||||||
mirrored
|
|
||||||
unicode_1_name
|
|
||||||
iso10646_comment_field
|
|
||||||
uppercase_mapping
|
|
||||||
lowercase_mapping
|
|
||||||
titlecase_mapping
|
|
||||||
}
|
|
||||||
set lRet [list]
|
|
||||||
|
|
||||||
while { ![eof $fd] } {
|
|
||||||
set line [gets $fd]
|
|
||||||
if {$line == ""} continue
|
|
||||||
|
|
||||||
set fields [split $line ";"]
|
|
||||||
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
|
|
||||||
foreach $lField $fields {}
|
|
||||||
if { [llength $character_decomposition_mapping]!=2
|
|
||||||
|| [string is xdigit [lindex $character_decomposition_mapping 0]]==0
|
|
||||||
} {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
set iCode [expr "0x$code"]
|
|
||||||
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
|
|
||||||
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
|
|
||||||
|
|
||||||
if {[info exists tl_lookup_table($iCode)]} continue
|
|
||||||
|
|
||||||
if { ($iAscii >= 97 && $iAscii <= 122)
|
|
||||||
|| ($iAscii >= 65 && $iAscii <= 90)
|
|
||||||
} {
|
|
||||||
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
|
|
||||||
set dia($iDia) 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach d [array names dia] {
|
|
||||||
lappend lRet [list $d ""]
|
|
||||||
}
|
|
||||||
set lRet [lsort -integer -index 0 $lRet]
|
|
||||||
|
|
||||||
close $fd
|
|
||||||
set lRet
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
proc print_rd {map} {
|
proc print_rd {map} {
|
||||||
global tl_lookup_table
|
global tl_lookup_table
|
||||||
@@ -204,53 +132,6 @@ proc print_isdiacritic {zFunc map} {
|
|||||||
|
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
|
|
||||||
# Parameter $zName must be a path to the file UnicodeData.txt. This command
|
|
||||||
# reads the file and returns a list of codepoints (integers). The list
|
|
||||||
# contains all codepoints in the UnicodeData.txt assigned to any "General
|
|
||||||
# Category" that is not a "Letter" or "Number".
|
|
||||||
#
|
|
||||||
proc an_load_unicodedata_text {zName} {
|
|
||||||
set fd [open $zName]
|
|
||||||
set lField {
|
|
||||||
code
|
|
||||||
character_name
|
|
||||||
general_category
|
|
||||||
canonical_combining_classes
|
|
||||||
bidirectional_category
|
|
||||||
character_decomposition_mapping
|
|
||||||
decimal_digit_value
|
|
||||||
digit_value
|
|
||||||
numeric_value
|
|
||||||
mirrored
|
|
||||||
unicode_1_name
|
|
||||||
iso10646_comment_field
|
|
||||||
uppercase_mapping
|
|
||||||
lowercase_mapping
|
|
||||||
titlecase_mapping
|
|
||||||
}
|
|
||||||
set lRet [list]
|
|
||||||
|
|
||||||
while { ![eof $fd] } {
|
|
||||||
set line [gets $fd]
|
|
||||||
if {$line == ""} continue
|
|
||||||
|
|
||||||
set fields [split $line ";"]
|
|
||||||
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
|
|
||||||
foreach $lField $fields {}
|
|
||||||
|
|
||||||
set iCode [expr "0x$code"]
|
|
||||||
set bAlnum [expr {
|
|
||||||
[lsearch {L N} [string range $general_category 0 0]] >= 0
|
|
||||||
|| $general_category=="Co"
|
|
||||||
}]
|
|
||||||
|
|
||||||
if { !$bAlnum } { lappend lRet $iCode }
|
|
||||||
}
|
|
||||||
|
|
||||||
close $fd
|
|
||||||
set lRet
|
|
||||||
}
|
|
||||||
|
|
||||||
proc an_load_separator_ranges {} {
|
proc an_load_separator_ranges {} {
|
||||||
global unicodedata.txt
|
global unicodedata.txt
|
||||||
set lSep [an_load_unicodedata_text ${unicodedata.txt}]
|
set lSep [an_load_unicodedata_text ${unicodedata.txt}]
|
||||||
@@ -440,29 +321,6 @@ proc print_test_isalnum {zFunc lRange} {
|
|||||||
|
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
|
|
||||||
proc tl_load_casefolding_txt {zName} {
|
|
||||||
global tl_lookup_table
|
|
||||||
|
|
||||||
set fd [open $zName]
|
|
||||||
while { ![eof $fd] } {
|
|
||||||
set line [gets $fd]
|
|
||||||
if {[string range $line 0 0] == "#"} continue
|
|
||||||
if {$line == ""} continue
|
|
||||||
|
|
||||||
foreach x {a b c d} {unset -nocomplain $x}
|
|
||||||
foreach {a b c d} [split $line ";"] {}
|
|
||||||
|
|
||||||
set a2 [list]
|
|
||||||
set c2 [list]
|
|
||||||
foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
|
|
||||||
foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
|
|
||||||
set b [string trim $b]
|
|
||||||
set d [string trim $d]
|
|
||||||
|
|
||||||
if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
proc tl_create_records {} {
|
proc tl_create_records {} {
|
||||||
global tl_lookup_table
|
global tl_lookup_table
|
||||||
|
|
||||||
@@ -635,10 +493,12 @@ proc print_fold {zFunc} {
|
|||||||
if( c<128 ){
|
if( c<128 ){
|
||||||
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
|
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
|
||||||
}else if( c<65536 ){
|
}else if( c<65536 ){
|
||||||
|
const struct TableEntry *p;
|
||||||
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
|
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
|
||||||
int iLo = 0;
|
int iLo = 0;
|
||||||
int iRes = -1;
|
int iRes = -1;
|
||||||
|
|
||||||
|
assert( c>aEntry[0].iCode );
|
||||||
while( iHi>=iLo ){
|
while( iHi>=iLo ){
|
||||||
int iTest = (iHi + iLo) / 2;
|
int iTest = (iHi + iLo) / 2;
|
||||||
int cmp = (c - aEntry[iTest].iCode);
|
int cmp = (c - aEntry[iTest].iCode);
|
||||||
@@ -649,15 +509,13 @@ proc print_fold {zFunc} {
|
|||||||
iHi = iTest-1;
|
iHi = iTest-1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert( iRes<0 || c>=aEntry[iRes].iCode );
|
|
||||||
|
|
||||||
if( iRes>=0 ){
|
assert( iRes>=0 && c>=aEntry[iRes].iCode );
|
||||||
const struct TableEntry *p = &aEntry[iRes];
|
p = &aEntry[iRes];
|
||||||
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
|
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
|
||||||
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
|
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
|
||||||
assert( ret>0 );
|
assert( ret>0 );
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
|
if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
|
||||||
}
|
}
|
||||||
|
146
ext/fts3/unicode/parseunicode.tcl
Normal file
146
ext/fts3/unicode/parseunicode.tcl
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
|
||||||
|
#--------------------------------------------------------------------------
|
||||||
|
# Parameter $zName must be a path to the file UnicodeData.txt. This command
|
||||||
|
# reads the file and returns a list of mappings required to remove all
|
||||||
|
# diacritical marks from a unicode string. Each mapping is itself a list
|
||||||
|
# consisting of two elements - the unicode codepoint and the single ASCII
|
||||||
|
# character that it should be replaced with, or an empty string if the
|
||||||
|
# codepoint should simply be removed from the input. Examples:
|
||||||
|
#
|
||||||
|
# { 224 a } (replace codepoint 224 to "a")
|
||||||
|
# { 769 "" } (remove codepoint 769 from input)
|
||||||
|
#
|
||||||
|
# Mappings are only returned for non-upper case codepoints. It is assumed
|
||||||
|
# that the input has already been folded to lower case.
|
||||||
|
#
|
||||||
|
proc rd_load_unicodedata_text {zName} {
|
||||||
|
global tl_lookup_table
|
||||||
|
|
||||||
|
set fd [open $zName]
|
||||||
|
set lField {
|
||||||
|
code
|
||||||
|
character_name
|
||||||
|
general_category
|
||||||
|
canonical_combining_classes
|
||||||
|
bidirectional_category
|
||||||
|
character_decomposition_mapping
|
||||||
|
decimal_digit_value
|
||||||
|
digit_value
|
||||||
|
numeric_value
|
||||||
|
mirrored
|
||||||
|
unicode_1_name
|
||||||
|
iso10646_comment_field
|
||||||
|
uppercase_mapping
|
||||||
|
lowercase_mapping
|
||||||
|
titlecase_mapping
|
||||||
|
}
|
||||||
|
set lRet [list]
|
||||||
|
|
||||||
|
while { ![eof $fd] } {
|
||||||
|
set line [gets $fd]
|
||||||
|
if {$line == ""} continue
|
||||||
|
|
||||||
|
set fields [split $line ";"]
|
||||||
|
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
|
||||||
|
foreach $lField $fields {}
|
||||||
|
if { [llength $character_decomposition_mapping]!=2
|
||||||
|
|| [string is xdigit [lindex $character_decomposition_mapping 0]]==0
|
||||||
|
} {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
set iCode [expr "0x$code"]
|
||||||
|
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
|
||||||
|
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
|
||||||
|
|
||||||
|
if {[info exists tl_lookup_table($iCode)]} continue
|
||||||
|
|
||||||
|
if { ($iAscii >= 97 && $iAscii <= 122)
|
||||||
|
|| ($iAscii >= 65 && $iAscii <= 90)
|
||||||
|
} {
|
||||||
|
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
|
||||||
|
set dia($iDia) 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach d [array names dia] {
|
||||||
|
lappend lRet [list $d ""]
|
||||||
|
}
|
||||||
|
set lRet [lsort -integer -index 0 $lRet]
|
||||||
|
|
||||||
|
close $fd
|
||||||
|
set lRet
|
||||||
|
}
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------------
|
||||||
|
# Parameter $zName must be a path to the file UnicodeData.txt. This command
|
||||||
|
# reads the file and returns a list of codepoints (integers). The list
|
||||||
|
# contains all codepoints in the UnicodeData.txt assigned to any "General
|
||||||
|
# Category" that is not a "Letter" or "Number".
|
||||||
|
#
|
||||||
|
proc an_load_unicodedata_text {zName} {
|
||||||
|
set fd [open $zName]
|
||||||
|
set lField {
|
||||||
|
code
|
||||||
|
character_name
|
||||||
|
general_category
|
||||||
|
canonical_combining_classes
|
||||||
|
bidirectional_category
|
||||||
|
character_decomposition_mapping
|
||||||
|
decimal_digit_value
|
||||||
|
digit_value
|
||||||
|
numeric_value
|
||||||
|
mirrored
|
||||||
|
unicode_1_name
|
||||||
|
iso10646_comment_field
|
||||||
|
uppercase_mapping
|
||||||
|
lowercase_mapping
|
||||||
|
titlecase_mapping
|
||||||
|
}
|
||||||
|
set lRet [list]
|
||||||
|
|
||||||
|
while { ![eof $fd] } {
|
||||||
|
set line [gets $fd]
|
||||||
|
if {$line == ""} continue
|
||||||
|
|
||||||
|
set fields [split $line ";"]
|
||||||
|
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
|
||||||
|
foreach $lField $fields {}
|
||||||
|
|
||||||
|
set iCode [expr "0x$code"]
|
||||||
|
set bAlnum [expr {
|
||||||
|
[lsearch {L N} [string range $general_category 0 0]] >= 0
|
||||||
|
|| $general_category=="Co"
|
||||||
|
}]
|
||||||
|
|
||||||
|
if { !$bAlnum } { lappend lRet $iCode }
|
||||||
|
}
|
||||||
|
|
||||||
|
close $fd
|
||||||
|
set lRet
|
||||||
|
}
|
||||||
|
|
||||||
|
proc tl_load_casefolding_txt {zName} {
|
||||||
|
global tl_lookup_table
|
||||||
|
|
||||||
|
set fd [open $zName]
|
||||||
|
while { ![eof $fd] } {
|
||||||
|
set line [gets $fd]
|
||||||
|
if {[string range $line 0 0] == "#"} continue
|
||||||
|
if {$line == ""} continue
|
||||||
|
|
||||||
|
foreach x {a b c d} {unset -nocomplain $x}
|
||||||
|
foreach {a b c d} [split $line ";"] {}
|
||||||
|
|
||||||
|
set a2 [list]
|
||||||
|
set c2 [list]
|
||||||
|
foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
|
||||||
|
foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
|
||||||
|
set b [string trim $b]
|
||||||
|
set d [string trim $d]
|
||||||
|
|
||||||
|
if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@@ -631,4 +631,15 @@ int sqlite3Fts5VocabInit(Fts5Global*, sqlite3*);
|
|||||||
** End of interface to code in fts5_vocab.c.
|
** End of interface to code in fts5_vocab.c.
|
||||||
**************************************************************************/
|
**************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/**************************************************************************
|
||||||
|
** Interface to automatically generated code in fts5_unicode2.c.
|
||||||
|
*/
|
||||||
|
int sqlite3Fts5UnicodeIsalnum(int c);
|
||||||
|
int sqlite3Fts5UnicodeIsdiacritic(int c);
|
||||||
|
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic);
|
||||||
|
/*
|
||||||
|
** End of interface to code in fts5_unicode2.c.
|
||||||
|
**************************************************************************/
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -1618,6 +1618,45 @@ static void fts5ExprFunctionTcl(
|
|||||||
fts5ExprFunction(pCtx, nArg, apVal, 1);
|
fts5ExprFunction(pCtx, nArg, apVal, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** The implementation of an SQLite user-defined-function that accepts a
|
||||||
|
** single integer as an argument. If the integer is an alpha-numeric
|
||||||
|
** unicode code point, 1 is returned. Otherwise 0.
|
||||||
|
*/
|
||||||
|
static void fts5ExprIsAlnum(
|
||||||
|
sqlite3_context *pCtx, /* Function call context */
|
||||||
|
int nArg, /* Number of args */
|
||||||
|
sqlite3_value **apVal /* Function arguments */
|
||||||
|
){
|
||||||
|
int iCode;
|
||||||
|
if( nArg!=1 ){
|
||||||
|
sqlite3_result_error(pCtx,
|
||||||
|
"wrong number of arguments to function fts5_isalnum", -1
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
iCode = sqlite3_value_int(apVal[0]);
|
||||||
|
sqlite3_result_int(pCtx, sqlite3Fts5UnicodeIsalnum(iCode));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void fts5ExprFold(
|
||||||
|
sqlite3_context *pCtx, /* Function call context */
|
||||||
|
int nArg, /* Number of args */
|
||||||
|
sqlite3_value **apVal /* Function arguments */
|
||||||
|
){
|
||||||
|
if( nArg!=1 && nArg!=2 ){
|
||||||
|
sqlite3_result_error(pCtx,
|
||||||
|
"wrong number of arguments to function fts5_fold", -1
|
||||||
|
);
|
||||||
|
}else{
|
||||||
|
int iCode;
|
||||||
|
int bRemoveDiacritics = 0;
|
||||||
|
iCode = sqlite3_value_int(apVal[0]);
|
||||||
|
if( nArg==2 ) bRemoveDiacritics = sqlite3_value_int(apVal[1]);
|
||||||
|
sqlite3_result_int(pCtx, sqlite3Fts5UnicodeFold(iCode, bRemoveDiacritics));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
** This is called during initialization to register the fts5_expr() scalar
|
** This is called during initialization to register the fts5_expr() scalar
|
||||||
** UDF with the SQLite handle passed as the only argument.
|
** UDF with the SQLite handle passed as the only argument.
|
||||||
@@ -1629,6 +1668,8 @@ int sqlite3Fts5ExprInit(Fts5Global *pGlobal, sqlite3 *db){
|
|||||||
} aFunc[] = {
|
} aFunc[] = {
|
||||||
{ "fts5_expr", fts5ExprFunctionHr },
|
{ "fts5_expr", fts5ExprFunctionHr },
|
||||||
{ "fts5_expr_tcl", fts5ExprFunctionTcl },
|
{ "fts5_expr_tcl", fts5ExprFunctionTcl },
|
||||||
|
{ "fts5_isalnum", fts5ExprIsAlnum },
|
||||||
|
{ "fts5_fold", fts5ExprFold },
|
||||||
};
|
};
|
||||||
int i;
|
int i;
|
||||||
int rc = SQLITE_OK;
|
int rc = SQLITE_OK;
|
||||||
|
@@ -174,13 +174,6 @@ static int fts5AsciiTokenize(
|
|||||||
** Start of unicode61 tokenizer implementation.
|
** Start of unicode61 tokenizer implementation.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
|
||||||
** Functions in fts5_unicode2.c.
|
|
||||||
*/
|
|
||||||
int sqlite3Fts5UnicodeIsalnum(int c);
|
|
||||||
int sqlite3Fts5UnicodeIsdiacritic(int c);
|
|
||||||
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic);
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
|
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
|
||||||
|
@@ -327,10 +327,12 @@ int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
|
|||||||
if( c<128 ){
|
if( c<128 ){
|
||||||
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
|
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
|
||||||
}else if( c<65536 ){
|
}else if( c<65536 ){
|
||||||
|
const struct TableEntry *p;
|
||||||
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
|
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
|
||||||
int iLo = 0;
|
int iLo = 0;
|
||||||
int iRes = -1;
|
int iRes = -1;
|
||||||
|
|
||||||
|
assert( c>aEntry[0].iCode );
|
||||||
while( iHi>=iLo ){
|
while( iHi>=iLo ){
|
||||||
int iTest = (iHi + iLo) / 2;
|
int iTest = (iHi + iLo) / 2;
|
||||||
int cmp = (c - aEntry[iTest].iCode);
|
int cmp = (c - aEntry[iTest].iCode);
|
||||||
@@ -341,15 +343,13 @@ int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
|
|||||||
iHi = iTest-1;
|
iHi = iTest-1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert( iRes<0 || c>=aEntry[iRes].iCode );
|
|
||||||
|
|
||||||
if( iRes>=0 ){
|
assert( iRes>=0 && c>=aEntry[iRes].iCode );
|
||||||
const struct TableEntry *p = &aEntry[iRes];
|
p = &aEntry[iRes];
|
||||||
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
|
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
|
||||||
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
|
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
|
||||||
assert( ret>0 );
|
assert( ret>0 );
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
|
if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
|
||||||
}
|
}
|
||||||
|
122
ext/fts5/test/fts5unicode3.test
Normal file
122
ext/fts5/test/fts5unicode3.test
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
# 2014 Dec 20
|
||||||
|
#
|
||||||
|
# The author disclaims copyright to this source code. In place of
|
||||||
|
# a legal notice, here is a blessing:
|
||||||
|
#
|
||||||
|
# May you do good and not evil.
|
||||||
|
# May you find forgiveness for yourself and forgive others.
|
||||||
|
# May you share freely, never taking more than you give.
|
||||||
|
#
|
||||||
|
#***********************************************************************
|
||||||
|
#
|
||||||
|
# Tests focusing on the fts5 tokenizers
|
||||||
|
#
|
||||||
|
|
||||||
|
proc fts3_unicode_path {file} {
|
||||||
|
file join [file dirname [info script]] .. .. fts3 unicode $file
|
||||||
|
}
|
||||||
|
|
||||||
|
source [file join [file dirname [info script]] fts5_common.tcl]
|
||||||
|
source [fts3_unicode_path parseunicode.tcl]
|
||||||
|
set testprefix fts5unicode3
|
||||||
|
|
||||||
|
set CF [fts3_unicode_path CaseFolding.txt]
|
||||||
|
set UD [fts3_unicode_path UnicodeData.txt]
|
||||||
|
|
||||||
|
tl_load_casefolding_txt $CF
|
||||||
|
foreach x [an_load_unicodedata_text $UD] {
|
||||||
|
set aNotAlnum($x) 1
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach {y} [rd_load_unicodedata_text $UD] {
|
||||||
|
foreach {code ascii} $y {}
|
||||||
|
if {$ascii==""} {
|
||||||
|
set int 0
|
||||||
|
} else {
|
||||||
|
binary scan $ascii c int
|
||||||
|
}
|
||||||
|
set aDiacritic($code) $int
|
||||||
|
}
|
||||||
|
|
||||||
|
proc tcl_fold {i {bRemoveDiacritic 0}} {
|
||||||
|
global tl_lookup_table
|
||||||
|
global aDiacritic
|
||||||
|
|
||||||
|
if {[info exists tl_lookup_table($i)]} {
|
||||||
|
set i $tl_lookup_table($i)
|
||||||
|
}
|
||||||
|
if {$bRemoveDiacritic && [info exists aDiacritic($i)]} {
|
||||||
|
set i $aDiacritic($i)
|
||||||
|
}
|
||||||
|
expr $i
|
||||||
|
}
|
||||||
|
db func tcl_fold tcl_fold
|
||||||
|
|
||||||
|
proc tcl_isalnum {i} {
|
||||||
|
global aNotAlnum
|
||||||
|
expr {![info exists aNotAlnum($i)]}
|
||||||
|
}
|
||||||
|
db func tcl_isalnum tcl_isalnum
|
||||||
|
|
||||||
|
|
||||||
|
do_catchsql_test 1.0.1 {
|
||||||
|
SELECT fts5_isalnum(1, 2, 3);
|
||||||
|
} {1 {wrong number of arguments to function fts5_isalnum}}
|
||||||
|
do_catchsql_test 1.0.2 {
|
||||||
|
SELECT fts5_fold();
|
||||||
|
} {1 {wrong number of arguments to function fts5_fold}}
|
||||||
|
do_catchsql_test 1.0.3 {
|
||||||
|
SELECT fts5_fold(1,2,3);
|
||||||
|
} {1 {wrong number of arguments to function fts5_fold}}
|
||||||
|
|
||||||
|
do_execsql_test 1.1 {
|
||||||
|
WITH ii(i) AS (
|
||||||
|
SELECT -1
|
||||||
|
UNION ALL
|
||||||
|
SELECT i+1 FROM ii WHERE i<100000
|
||||||
|
)
|
||||||
|
SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
|
||||||
|
} {0 {}}
|
||||||
|
|
||||||
|
do_execsql_test 1.2 {
|
||||||
|
WITH ii(i) AS (
|
||||||
|
SELECT -1
|
||||||
|
UNION ALL
|
||||||
|
SELECT i+1 FROM ii WHERE i<100000
|
||||||
|
)
|
||||||
|
SELECT count(*), min(i) FROM ii
|
||||||
|
WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
|
||||||
|
} {0 {}}
|
||||||
|
|
||||||
|
do_execsql_test 1.3 {
|
||||||
|
WITH ii(i) AS (
|
||||||
|
SELECT -1
|
||||||
|
UNION ALL
|
||||||
|
SELECT i+1 FROM ii WHERE i<100000
|
||||||
|
)
|
||||||
|
SELECT count(*), min(i) FROM ii
|
||||||
|
WHERE fts5_isalnum(i)!=CAST(tcl_isalnum(i) AS int);
|
||||||
|
} {0 {}}
|
||||||
|
|
||||||
|
do_test 1.4 {
|
||||||
|
set str {CREATE VIRTUAL TABLE f3 USING fts5(a, tokenize=}
|
||||||
|
append str {"unicode61 separators '}
|
||||||
|
for {set i 700} {$i<900} {incr i} {
|
||||||
|
append str [format %c $i]
|
||||||
|
}
|
||||||
|
append str {'");}
|
||||||
|
execsql $str
|
||||||
|
} {}
|
||||||
|
do_test 1.5 {
|
||||||
|
set str {CREATE VIRTUAL TABLE f5 USING fts5(a, tokenize=}
|
||||||
|
append str {"unicode61 tokenchars '}
|
||||||
|
for {set i 700} {$i<900} {incr i} {
|
||||||
|
append str [format %c $i]
|
||||||
|
}
|
||||||
|
append str {'");}
|
||||||
|
execsql $str
|
||||||
|
} {}
|
||||||
|
|
||||||
|
|
||||||
|
finish_test
|
||||||
|
|
22
manifest
22
manifest
@@ -1,5 +1,5 @@
|
|||||||
C Improve\stest\scoverage\sof\sfts5_tokenize.c.
|
C Improve\stest\scoverage\sof\sfts5_unicode2.c.
|
||||||
D 2015-05-20T09:27:51.629
|
D 2015-05-22T06:08:25.338
|
||||||
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
|
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
|
||||||
F Makefile.in 2c28e557780395095c307a6e5cb539419027eb5e
|
F Makefile.in 2c28e557780395095c307a6e5cb539419027eb5e
|
||||||
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
|
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
|
||||||
@@ -102,21 +102,22 @@ F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
|
|||||||
F ext/fts3/tool/fts3view.c 8e53d0190a7b3443764bbd32ad47be2bd852026d
|
F ext/fts3/tool/fts3view.c 8e53d0190a7b3443764bbd32ad47be2bd852026d
|
||||||
F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
|
F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
|
||||||
F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
|
F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
|
||||||
F ext/fts3/unicode/mkunicode.tcl 159c1194da0bc72f51b3c2eb71022568006dc5ad
|
F ext/fts3/unicode/mkunicode.tcl b321eea0c1604954a098775ce0b7860bc449f686
|
||||||
|
F ext/fts3/unicode/parseunicode.tcl da577d1384810fb4e2b209bf3313074353193e95
|
||||||
F ext/fts5/extract_api_docs.tcl 55a6d648d516f35d9a1e580ac00de27154e1904a
|
F ext/fts5/extract_api_docs.tcl 55a6d648d516f35d9a1e580ac00de27154e1904a
|
||||||
F ext/fts5/fts5.c 74d18b4dc7518c7cd85609f1541e83bc564619a2
|
F ext/fts5/fts5.c 74d18b4dc7518c7cd85609f1541e83bc564619a2
|
||||||
F ext/fts5/fts5.h 4266c6231094005b051dbfc8dd85d2bc57243d34
|
F ext/fts5/fts5.h 4266c6231094005b051dbfc8dd85d2bc57243d34
|
||||||
F ext/fts5/fts5Int.h 9e581dc077d4c6758eaeb0d6a85dc875f53918dc
|
F ext/fts5/fts5Int.h ba0fd64be01cf7bf47ad20fcd23b629fdde6c4dc
|
||||||
F ext/fts5/fts5_aux.c d53f00f31ad615ca4f139dd8751f9041afa00971
|
F ext/fts5/fts5_aux.c d53f00f31ad615ca4f139dd8751f9041afa00971
|
||||||
F ext/fts5/fts5_buffer.c 861599a0abe2383f0cd0352c57001140a26b0930
|
F ext/fts5/fts5_buffer.c 861599a0abe2383f0cd0352c57001140a26b0930
|
||||||
F ext/fts5/fts5_config.c 11f969ed711a0a8b611d47431d74c372ad78c713
|
F ext/fts5/fts5_config.c 11f969ed711a0a8b611d47431d74c372ad78c713
|
||||||
F ext/fts5/fts5_expr.c 0c4b50bb48740c76b8e8b89d5d40a55f8dbffd07
|
F ext/fts5/fts5_expr.c f9a2ef4efbc4b133e0173e4bf7d7ebff33eddcf1
|
||||||
F ext/fts5/fts5_hash.c 54dd25348a46ea62ea96322c572e08cd1fb37304
|
F ext/fts5/fts5_hash.c 54dd25348a46ea62ea96322c572e08cd1fb37304
|
||||||
F ext/fts5/fts5_index.c 2c4500c35072b049d1391bbb4e64e4c0e3d3dd43
|
F ext/fts5/fts5_index.c 2c4500c35072b049d1391bbb4e64e4c0e3d3dd43
|
||||||
F ext/fts5/fts5_storage.c 5d2b51adb304643d8f825ba89283d628418b20c2
|
F ext/fts5/fts5_storage.c 5d2b51adb304643d8f825ba89283d628418b20c2
|
||||||
F ext/fts5/fts5_tcl.c 7ea165878e4ae3598e89acd470a0ee1b5a00e33c
|
F ext/fts5/fts5_tcl.c 7ea165878e4ae3598e89acd470a0ee1b5a00e33c
|
||||||
F ext/fts5/fts5_tokenize.c 6f4d2cbe7ed892821d1a233c7db613dafdb3877a
|
F ext/fts5/fts5_tokenize.c 24649425adfea2c4877d8f69f2754b70374940ec
|
||||||
F ext/fts5/fts5_unicode2.c f74f53316377068812a1fa5a37819e6b8124631d
|
F ext/fts5/fts5_unicode2.c c75022368f940a38afa1d2f0164c78b11ab2f383
|
||||||
F ext/fts5/fts5_vocab.c b54301e376f59f08f662b5dde1cfaf26e86e4db6
|
F ext/fts5/fts5_vocab.c b54301e376f59f08f662b5dde1cfaf26e86e4db6
|
||||||
F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9
|
F ext/fts5/fts5parse.y 777da8e5819f75c217982c79c29d014c293acac9
|
||||||
F ext/fts5/mkportersteps.tcl 5acf962d2e0074f701620bb5308155fa1e4a63ba
|
F ext/fts5/mkportersteps.tcl 5acf962d2e0074f701620bb5308155fa1e4a63ba
|
||||||
@@ -168,6 +169,7 @@ F ext/fts5/test/fts5rowid.test ca9d91ccb3a4590fc561b2d7a884361bb21e8df5
|
|||||||
F ext/fts5/test/fts5tokenizer.test 668747fcb41de6fc7daebc478920b705164fccc1
|
F ext/fts5/test/fts5tokenizer.test 668747fcb41de6fc7daebc478920b705164fccc1
|
||||||
F ext/fts5/test/fts5unicode.test 79b3e34eb29ce4929628aa514a40cb467fdabe4d
|
F ext/fts5/test/fts5unicode.test 79b3e34eb29ce4929628aa514a40cb467fdabe4d
|
||||||
F ext/fts5/test/fts5unicode2.test ad38982b03dc9213445facb16e99f668a74cc4ba
|
F ext/fts5/test/fts5unicode2.test ad38982b03dc9213445facb16e99f668a74cc4ba
|
||||||
|
F ext/fts5/test/fts5unicode3.test 273f9086ad33935566bbc0d0c94d0d9687ef686b
|
||||||
F ext/fts5/test/fts5unindexed.test f388605341a476b6ab622b4c267cd168f59a5944
|
F ext/fts5/test/fts5unindexed.test f388605341a476b6ab622b4c267cd168f59a5944
|
||||||
F ext/fts5/test/fts5version.test dc34a735af6625a1a7a4a916a38d122071343887
|
F ext/fts5/test/fts5version.test dc34a735af6625a1a7a4a916a38d122071343887
|
||||||
F ext/fts5/test/fts5vocab.test 80fb22850dd3b2c92a3896e6021605e08c0872aa
|
F ext/fts5/test/fts5vocab.test 80fb22850dd3b2c92a3896e6021605e08c0872aa
|
||||||
@@ -1329,7 +1331,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
|
|||||||
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
|
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
|
||||||
F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
|
F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
|
||||||
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
|
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
|
||||||
P 4f90ba20e2be6ec5755fe894938ac97342d6fbf6
|
P 0e91a6a520f040b8902da6a1a4d9107dc66c0ea3
|
||||||
R 43528c0613d372060fbd8256efc47909
|
R dac2002cb3b723a15f8d8c03f8a4c974
|
||||||
U dan
|
U dan
|
||||||
Z e3c696b644b37e5798613b4f15c87656
|
Z b9f569713ab52c4f747377183dfd6e18
|
||||||
|
@@ -1 +1 @@
|
|||||||
0e91a6a520f040b8902da6a1a4d9107dc66c0ea3
|
fea8a4db9d8c7b9a946017a0dc984cbca6ce240e
|
Reference in New Issue
Block a user