Improve test coverage of fts5_unicode2.c.

FossilOrigin-Name: fea8a4db9d8c7b9a946017a0dc984cbca6ce240e
2025-07-30 19:03:16 +03:00 · 2015-05-22 06:08:25 +00:00
parent 8c1f46de50
commit 21b7d2a9b8
9 changed files with 349 additions and 176 deletions
--- a/ext/fts3/unicode/mkunicode.tcl
+++ b/ext/fts3/unicode/mkunicode.tcl
@@ -1,77 +1,5 @@

-#
-# Parameter $zName must be a path to the file UnicodeData.txt. This command
-# reads the file and returns a list of mappings required to remove all
-# diacritical marks from a unicode string. Each mapping is itself a list
-# consisting of two elements - the unicode codepoint and the single ASCII
-# character that it should be replaced with, or an empty string if the 
-# codepoint should simply be removed from the input. Examples:
-#
-#   { 224 a  }     (replace codepoint 224 to "a")
-#   { 769 "" }     (remove codepoint 769 from input)
-#
-# Mappings are only returned for non-upper case codepoints. It is assumed
-# that the input has already been folded to lower case.
-#
-proc rd_load_unicodedata_text {zName} {
-  global tl_lookup_table
-
-  set fd [open $zName]
-  set lField {
-    code
-    character_name
-    general_category
-    canonical_combining_classes
-    bidirectional_category
-    character_decomposition_mapping
-    decimal_digit_value
-    digit_value
-    numeric_value
-    mirrored
-    unicode_1_name
-    iso10646_comment_field
-    uppercase_mapping
-    lowercase_mapping
-    titlecase_mapping
-  }
-  set lRet [list]
-
-  while { ![eof $fd] } {
-    set line [gets $fd]
-    if {$line == ""} continue
-
-    set fields [split $line ";"]
-    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
-    foreach $lField $fields {}
-    if { [llength $character_decomposition_mapping]!=2
-      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
-    } {
-      continue
-    }
-
-    set iCode  [expr "0x$code"]
-    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
-    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
-
-    if {[info exists tl_lookup_table($iCode)]} continue
-
-    if { ($iAscii >= 97 && $iAscii <= 122)
-      || ($iAscii >= 65 && $iAscii <= 90)
-    } {
-      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
-      set dia($iDia) 1
-    }
-  }
-
-  foreach d [array names dia] {
-    lappend lRet [list $d ""]
-  }
-  set lRet [lsort -integer -index 0 $lRet]
-
-  close $fd
-  set lRet
-}
-
+source [file join [file dirname [info script]] parseunicode.tcl]

 proc print_rd {map} {
  global tl_lookup_table
@@ -204,53 +132,6 @@ proc print_isdiacritic {zFunc map} {

 #-------------------------------------------------------------------------

-# Parameter $zName must be a path to the file UnicodeData.txt. This command
-# reads the file and returns a list of codepoints (integers). The list
-# contains all codepoints in the UnicodeData.txt assigned to any "General
-# Category" that is not a "Letter" or "Number".
-#
-proc an_load_unicodedata_text {zName} {
-  set fd [open $zName]
-  set lField {
-    code
-    character_name
-    general_category
-    canonical_combining_classes
-    bidirectional_category
-    character_decomposition_mapping
-    decimal_digit_value
-    digit_value
-    numeric_value
-    mirrored
-    unicode_1_name
-    iso10646_comment_field
-    uppercase_mapping
-    lowercase_mapping
-    titlecase_mapping
-  }
-  set lRet [list]
-
-  while { ![eof $fd] } {
-    set line [gets $fd]
-    if {$line == ""} continue
-
-    set fields [split $line ";"]
-    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
-    foreach $lField $fields {}
-
-    set iCode [expr "0x$code"]
-    set bAlnum [expr {
-         [lsearch {L N} [string range $general_category 0 0]] >= 0
-      || $general_category=="Co"
-    }]
-
-    if { !$bAlnum } { lappend lRet $iCode }
-  }
-
-  close $fd
-  set lRet
-}
-
 proc an_load_separator_ranges {} {
  global unicodedata.txt
  set lSep [an_load_unicodedata_text ${unicodedata.txt}]
@@ -440,29 +321,6 @@ proc print_test_isalnum {zFunc lRange} {

 #-------------------------------------------------------------------------

-proc tl_load_casefolding_txt {zName} {
-  global tl_lookup_table
-
-  set fd [open $zName]
-  while { ![eof $fd] } {
-    set line [gets $fd]
-    if {[string range $line 0 0] == "#"} continue
-    if {$line == ""} continue
-
-    foreach x {a b c d} {unset -nocomplain $x}
-    foreach {a b c d} [split $line ";"] {}
-
-    set a2 [list]
-    set c2 [list]
-    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
-    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
-    set b [string trim $b]
-    set d [string trim $d]
-
-    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
-  }
-}
-
 proc tl_create_records {} {
  global tl_lookup_table

@@ -635,10 +493,12 @@ proc print_fold {zFunc} {
  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
+    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

+    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
@@ -649,14 +509,12 @@ proc print_fold {zFunc} {
        iHi = iTest-1;
      }
    }
-    assert( iRes<0 || c>=aEntry[iRes].iCode );

-    if( iRes>=0 ){
-      const struct TableEntry *p = &aEntry[iRes];
-      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
-        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
-        assert( ret>0 );
-      }
+    assert( iRes>=0 && c>=aEntry[iRes].iCode );
+    p = &aEntry[iRes];
+    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
+      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
+      assert( ret>0 );
    }

    if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);