1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-07-30 19:03:16 +03:00

Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5

and FTS3/4.

FossilOrigin-Name: 06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec
This commit is contained in:
dan
2018-12-03 16:14:49 +00:00
parent 8c53b4e7f6
commit e89feee5c3
12 changed files with 320 additions and 94 deletions

View File

@ -7,12 +7,24 @@
# character that it should be replaced with, or an empty string if the
# codepoint should simply be removed from the input. Examples:
#
# { 224 a } (replace codepoint 224 to "a")
# { 769 "" } (remove codepoint 769 from input)
# { 224 a 0 } (replace codepoint 224 with "a")
# { 769 "" 0 } (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
# The third value in the list is always either 0 or 1. 0 if the
# UnicodeData.txt file maps the codepoint to a single ASCII character and
# a diacritic, or 1 if the mapping is indirect. For example, consider the
# two entries:
#
# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
#
# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
# diacritic). The second is an indirect mapping, as it maps to the
# first codepoint plus 0302 (a diacritic).
#
proc rd_load_unicodedata_text {zName} {
global tl_lookup_table
@ -53,18 +65,29 @@ proc rd_load_unicodedata_text {zName} {
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
# Filter out upper-case characters, as they will be mapped to their
# lower-case equivalents before this data is used.
if {[info exists tl_lookup_table($iCode)]} continue
# Check if this is an indirect mapping. If so, set bIndirect to true
# and change $iAscii to the indirectly mapped ASCII character.
set bIndirect 0
if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
set iAscii $mapping($iAscii)
set bIndirect 1
}
if { ($iAscii >= 97 && $iAscii <= 122)
|| ($iAscii >= 65 && $iAscii <= 90)
} {
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
set mapping($iCode) $iAscii
set dia($iDia) 1
}
}
foreach d [array names dia] {
lappend lRet [list $d ""]
lappend lRet [list $d "" 0]
}
set lRet [lsort -integer -index 0 $lRet]