mirror of
https://github.com/sqlite/sqlite.git
synced 2025-07-30 19:03:16 +03:00
Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5
and FTS3/4. FossilOrigin-Name: 06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec
This commit is contained in:
@ -9,11 +9,12 @@ proc print_rd {map} {
|
||||
set nRange 1
|
||||
set iFirst [lindex $map 0 0]
|
||||
set cPrev [lindex $map 0 1]
|
||||
set fPrev [lindex $map 0 2]
|
||||
|
||||
foreach m [lrange $map 1 end] {
|
||||
foreach {i c} $m {}
|
||||
foreach {i c f} $m {}
|
||||
|
||||
if {$cPrev == $c} {
|
||||
if {$cPrev == $c && $fPrev==$f} {
|
||||
for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
|
||||
if {[info exists tl_lookup_table($j)]==0} break
|
||||
}
|
||||
@ -29,13 +30,16 @@ proc print_rd {map} {
|
||||
|
||||
lappend lRange [list $iFirst $nRange]
|
||||
lappend aChar $cPrev
|
||||
lappend aFlag $fPrev
|
||||
|
||||
set iFirst $i
|
||||
set cPrev $c
|
||||
set fPrev $f
|
||||
set nRange 1
|
||||
}
|
||||
lappend lRange [list $iFirst $nRange]
|
||||
lappend aChar $cPrev
|
||||
lappend aFlag $fPrev
|
||||
|
||||
puts "/*"
|
||||
puts "** If the argument is a codepoint corresponding to a lowercase letter"
|
||||
@ -45,7 +49,7 @@ proc print_rd {map} {
|
||||
puts "** E\"). The resuls of passing a codepoint that corresponds to an"
|
||||
puts "** uppercase letter are undefined."
|
||||
puts "*/"
|
||||
puts "static int ${::remove_diacritic}(int c)\{"
|
||||
puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
|
||||
puts " unsigned short aDia\[\] = \{"
|
||||
puts -nonewline " 0, "
|
||||
set i 1
|
||||
@ -60,13 +64,17 @@ proc print_rd {map} {
|
||||
puts ""
|
||||
puts " \};"
|
||||
puts " char aChar\[\] = \{"
|
||||
puts -nonewline " '\\0', "
|
||||
puts -nonewline " '\\0', "
|
||||
set i 1
|
||||
foreach c $aChar {
|
||||
set str "'$c', "
|
||||
if {$c == ""} { set str "'\\0', " }
|
||||
foreach c $aChar f $aFlag {
|
||||
if { $f } {
|
||||
set str "'$c'|0x80, "
|
||||
} else {
|
||||
set str "'$c'|0x00, "
|
||||
}
|
||||
if {$c == ""} { set str "'\\0', " }
|
||||
|
||||
if {($i % 12)==0} {puts "" ; puts -nonewline " " }
|
||||
if {($i % 6)==0} {puts "" ; puts -nonewline " " }
|
||||
incr i
|
||||
puts -nonewline "$str"
|
||||
}
|
||||
@ -87,7 +95,8 @@ proc print_rd {map} {
|
||||
}
|
||||
}
|
||||
assert( key>=aDia[iRes] );
|
||||
return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
|
||||
if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
|
||||
return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
|
||||
puts "\}"
|
||||
}
|
||||
|
||||
@ -95,7 +104,8 @@ proc print_isdiacritic {zFunc map} {
|
||||
|
||||
set lCode [list]
|
||||
foreach m $map {
|
||||
foreach {code char} $m {}
|
||||
foreach {code char flag} $m {}
|
||||
if {$flag} continue
|
||||
if {$code && $char == ""} { lappend lCode $code }
|
||||
}
|
||||
set lCode [lsort -integer $lCode]
|
||||
@ -472,7 +482,7 @@ proc print_fold {zFunc} {
|
||||
puts "** The results are undefined if the value passed to this function"
|
||||
puts "** is less than zero."
|
||||
puts "*/"
|
||||
puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
|
||||
puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
|
||||
|
||||
set liOff [tl_generate_ioff_table $lRecord]
|
||||
tl_print_table_header
|
||||
@ -516,7 +526,9 @@ proc print_fold {zFunc} {
|
||||
assert( ret>0 );
|
||||
}
|
||||
|
||||
if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
|
||||
if( eRemoveDiacritic ){
|
||||
ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
|
||||
}
|
||||
}
|
||||
}]
|
||||
|
||||
|
@ -7,12 +7,24 @@
|
||||
# character that it should be replaced with, or an empty string if the
|
||||
# codepoint should simply be removed from the input. Examples:
|
||||
#
|
||||
# { 224 a } (replace codepoint 224 to "a")
|
||||
# { 769 "" } (remove codepoint 769 from input)
|
||||
# { 224 a 0 } (replace codepoint 224 with "a")
|
||||
# { 769 "" 0 } (remove codepoint 769 from input)
|
||||
#
|
||||
# Mappings are only returned for non-upper case codepoints. It is assumed
|
||||
# that the input has already been folded to lower case.
|
||||
#
|
||||
# The third value in the list is always either 0 or 1: 0 if the
|
||||
# UnicodeData.txt file maps the codepoint to a single ASCII character and
|
||||
# a diacritic, or 1 if the mapping is indirect. For example, consider the
|
||||
# two entries:
|
||||
#
|
||||
# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
|
||||
# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
|
||||
#
|
||||
# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
|
||||
# diacritic). The second is an indirect mapping, as it maps to the
|
||||
# first codepoint plus 0302 (a diacritic).
|
||||
#
|
||||
proc rd_load_unicodedata_text {zName} {
|
||||
global tl_lookup_table
|
||||
|
||||
@ -53,18 +65,29 @@ proc rd_load_unicodedata_text {zName} {
|
||||
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
|
||||
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
|
||||
|
||||
# Filter out upper-case characters, as they will be mapped to their
|
||||
# lower-case equivalents before this data is used.
|
||||
if {[info exists tl_lookup_table($iCode)]} continue
|
||||
|
||||
# Check if this is an indirect mapping. If so, set bIndirect to true
|
||||
# and change $iAscii to the indirectly mapped ASCII character.
|
||||
set bIndirect 0
|
||||
if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
|
||||
set iAscii $mapping($iAscii)
|
||||
set bIndirect 1
|
||||
}
|
||||
|
||||
if { ($iAscii >= 97 && $iAscii <= 122)
|
||||
|| ($iAscii >= 65 && $iAscii <= 90)
|
||||
} {
|
||||
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
|
||||
lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
|
||||
set mapping($iCode) $iAscii
|
||||
set dia($iDia) 1
|
||||
}
|
||||
}
|
||||
|
||||
foreach d [array names dia] {
|
||||
lappend lRet [list $d ""]
|
||||
lappend lRet [list $d "" 0]
|
||||
}
|
||||
set lRet [lsort -integer -index 0 $lRet]
|
||||
|
||||
|
Reference in New Issue
Block a user