1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-07-30 19:03:16 +03:00

Improve test coverage of fts5_unicode2.c.

FossilOrigin-Name: fea8a4db9d8c7b9a946017a0dc984cbca6ce240e
This commit is contained in:
dan
2015-05-22 06:08:25 +00:00
parent 8c1f46de50
commit 21b7d2a9b8
9 changed files with 349 additions and 176 deletions

View File

@ -1,77 +1,5 @@
#
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the
# codepoint should simply be removed from the input. Examples:
#
# { 224 a } (replace codepoint 224 to "a")
# { 769 "" } (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
proc rd_load_unicodedata_text {zName} {
global tl_lookup_table
set fd [open $zName]
set lField {
code
character_name
general_category
canonical_combining_classes
bidirectional_category
character_decomposition_mapping
decimal_digit_value
digit_value
numeric_value
mirrored
unicode_1_name
iso10646_comment_field
uppercase_mapping
lowercase_mapping
titlecase_mapping
}
set lRet [list]
while { ![eof $fd] } {
set line [gets $fd]
if {$line == ""} continue
set fields [split $line ";"]
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
foreach $lField $fields {}
if { [llength $character_decomposition_mapping]!=2
|| [string is xdigit [lindex $character_decomposition_mapping 0]]==0
} {
continue
}
set iCode [expr "0x$code"]
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
if {[info exists tl_lookup_table($iCode)]} continue
if { ($iAscii >= 97 && $iAscii <= 122)
|| ($iAscii >= 65 && $iAscii <= 90)
} {
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
set dia($iDia) 1
}
}
foreach d [array names dia] {
lappend lRet [list $d ""]
}
set lRet [lsort -integer -index 0 $lRet]
close $fd
set lRet
}
source [file join [file dirname [info script]] parseunicode.tcl]
proc print_rd {map} {
global tl_lookup_table
@ -204,53 +132,6 @@ proc print_isdiacritic {zFunc map} {
#-------------------------------------------------------------------------
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
# contains all codepoints in the UnicodeData.txt assigned to any "General
# Category" that is not a "Letter" or "Number".
#
proc an_load_unicodedata_text {zName} {
set fd [open $zName]
set lField {
code
character_name
general_category
canonical_combining_classes
bidirectional_category
character_decomposition_mapping
decimal_digit_value
digit_value
numeric_value
mirrored
unicode_1_name
iso10646_comment_field
uppercase_mapping
lowercase_mapping
titlecase_mapping
}
set lRet [list]
while { ![eof $fd] } {
set line [gets $fd]
if {$line == ""} continue
set fields [split $line ";"]
if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
foreach $lField $fields {}
set iCode [expr "0x$code"]
set bAlnum [expr {
[lsearch {L N} [string range $general_category 0 0]] >= 0
|| $general_category=="Co"
}]
if { !$bAlnum } { lappend lRet $iCode }
}
close $fd
set lRet
}
proc an_load_separator_ranges {} {
global unicodedata.txt
set lSep [an_load_unicodedata_text ${unicodedata.txt}]
@ -440,29 +321,6 @@ proc print_test_isalnum {zFunc lRange} {
#-------------------------------------------------------------------------
proc tl_load_casefolding_txt {zName} {
global tl_lookup_table
set fd [open $zName]
while { ![eof $fd] } {
set line [gets $fd]
if {[string range $line 0 0] == "#"} continue
if {$line == ""} continue
foreach x {a b c d} {unset -nocomplain $x}
foreach {a b c d} [split $line ";"] {}
set a2 [list]
set c2 [list]
foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
set b [string trim $b]
set d [string trim $d]
if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
}
}
proc tl_create_records {} {
global tl_lookup_table
@ -635,10 +493,12 @@ proc print_fold {zFunc} {
if( c<128 ){
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
}else if( c<65536 ){
const struct TableEntry *p;
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
int iLo = 0;
int iRes = -1;
assert( c>aEntry[0].iCode );
while( iHi>=iLo ){
int iTest = (iHi + iLo) / 2;
int cmp = (c - aEntry[iTest].iCode);
@ -649,14 +509,12 @@ proc print_fold {zFunc} {
iHi = iTest-1;
}
}
assert( iRes<0 || c>=aEntry[iRes].iCode );
if( iRes>=0 ){
const struct TableEntry *p = &aEntry[iRes];
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
assert( ret>0 );
}
assert( iRes>=0 && c>=aEntry[iRes].iCode );
p = &aEntry[iRes];
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
assert( ret>0 );
}
if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);