diff --git a/ext/fts3/fts3Int.h b/ext/fts3/fts3Int.h index 097d4a3f0a..98e353bea5 100644 --- a/ext/fts3/fts3Int.h +++ b/ext/fts3/fts3Int.h @@ -542,8 +542,11 @@ int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr); int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *); /* fts3_unicode2.c (functions generated by parsing unicode text files) */ -int sqlite3FtsUnicodeTolower(int); +#ifndef SQLITE_DISABLE_FTS3_UNICODE +int sqlite3FtsUnicodeFold(int, int); int sqlite3FtsUnicodeIsalnum(int); +int sqlite3FtsUnicodeIsdiacritic(int); +#endif #endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */ #endif /* _FTSINT_H */ diff --git a/ext/fts3/fts3_unicode.c b/ext/fts3/fts3_unicode.c index 83b1c322b2..c5228273b3 100644 --- a/ext/fts3/fts3_unicode.c +++ b/ext/fts3/fts3_unicode.c @@ -82,6 +82,7 @@ typedef struct unicode_cursor unicode_cursor; struct unicode_tokenizer { sqlite3_tokenizer base; + int bRemoveDiacritic; }; struct unicode_cursor { @@ -103,11 +104,30 @@ static int unicodeCreate( sqlite3_tokenizer **pp /* OUT: New tokenizer handle */ ){ unicode_tokenizer *pNew; /* New tokenizer object */ + int i; pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer)); if( pNew==NULL ){ return SQLITE_NOMEM; } memset(pNew, 0, sizeof(unicode_tokenizer)); + pNew->bRemoveDiacritic = 1; + + for(i=0; ibRemoveDiacritic = 1; + } + else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){ + pNew->bRemoveDiacritic = 0; + } + else{ + /* Unrecognized argument */ + return SQLITE_ERROR; + } + } + *pp = &pNew->base; return SQLITE_OK; } @@ -197,6 +217,8 @@ static int unicodeNext( zOut = pCsr->zToken; do { + int iOut; + /* Grow the output buffer if required. 
*/ if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){ char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64); @@ -208,12 +230,19 @@ static int unicodeNext( /* Write the folded case of the last character read to the output */ zEnd = z; - WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode)); + iOut = sqlite3FtsUnicodeFold(iCode, + ((unicode_tokenizer *)pCsr->base.pTokenizer)->bRemoveDiacritic + ); + if( iOut ){ + WRITE_UTF8(zOut, iOut); + } /* If the cursor is not at EOF, read the next character */ if( z>=zTerm ) break; READ_UTF8(z, zTerm, iCode); - }while( sqlite3FtsUnicodeIsalnum(iCode) ); + }while( sqlite3FtsUnicodeIsalnum(iCode) + || sqlite3FtsUnicodeIsdiacritic(iCode) + ); /* Set the output variables and return. */ pCsr->iOff = (z - pCsr->aInput); diff --git a/ext/fts3/fts3_unicode2.c b/ext/fts3/fts3_unicode2.c index 6f053c1321..3bb7874b29 100644 --- a/ext/fts3/fts3_unicode2.c +++ b/ext/fts3/fts3_unicode2.c @@ -152,6 +152,74 @@ int sqlite3FtsUnicodeIsalnum(int c){ } +/* +** If the argument is a codepoint corresponding to a lowercase letter +** in the ASCII range with a diacritic added, return the codepoint +** of the ASCII letter only. For example, if passed 235 - "LATIN +** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER +** E"). The resuls of passing a codepoint that corresponds to an +** uppercase letter are undefined. 
/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). If the argument falls in a table range whose replacement is
** '\0' (the combining marks themselves), return 0 so that the caller
** can drop the character. Any codepoint not covered by the table is
** returned unchanged. The results of passing a codepoint that
** corresponds to an uppercase letter are undefined.
**
** NOTE: this function is emitted by ext/fts3/unicode/mkunicode.tcl
** (proc print_rd); keep the generator in sync with any change here.
*/
static int remove_diacritic(int c){
  /* Each aDia[] entry describes one range of codepoints that all map
  ** to the same replacement: the upper 13 bits hold the first codepoint
  ** of the range and the lower 3 bits hold (range length - 1). The
  ** array is sorted, so it can be binary-searched. aChar[i] is the
  ** replacement for range aDia[i]. Both tables are read-only, so they
  ** are static const rather than being rebuilt on the stack per call.
  */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928,
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234,
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504,
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529,
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
    62924, 63050, 63082, 63274, 63390,
  };
  static const char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',
    'e',  'i',  'o',  'u',  'y',
  };

  /* Setting the low 3 bits makes the key compare >= every entry whose
  ** first codepoint is <= c, whatever that entry's length bits are. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last entry that starts at or before c. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* (aDia[iRes]>>3) + (aDia[iRes]&0x07) is the last codepoint of the
  ** range found. If c lies beyond it, c is not in the table. */
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}


/*
** Return non-zero if the argument interpreted as a unicode codepoint
** is a diacritical modifier character, or zero otherwise.
**
** Only codepoints 768..817 are considered. Bit i of mask0 is set if
** codepoint 768+i is a diacritic; bit i of mask1 is set if codepoint
** 800+i is a diacritic.
*/
int sqlite3FtsUnicodeIsdiacritic(int c){
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;
  /* The shifted constant must be unsigned: (c-768) may be 31, and
  ** left-shifting a signed 1 into the sign bit is undefined behaviour. */
  return (c < 768+32) ?
      (mask0 & ((unsigned int)1 << (c-768))) :
      (mask1 & ((unsigned int)1 << (c-768-32)));
}
#
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the
# codepoint should simply be removed from the input. Examples:
#
#   { 224 a  }     (replace codepoint 224 with "a")
#   { 769 "" }     (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints (those with no
# entry in the global tl_lookup_table array). It is assumed that the input
# has already been folded to lower case. The returned list is sorted by
# codepoint.
#
proc rd_load_unicodedata_text {zName} {
  global tl_lookup_table

  # Names of the semicolon-separated fields of one UnicodeData.txt line.
  # Each line is unpacked into local variables with these names.
  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set nField [llength $lField]
  set lRet [list]

  set fd [open $zName]
  while { [gets $fd line] >= 0 } {
    if {$line eq ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != $nField} {
      # Do not leak the file handle on the error path.
      close $fd
      error "parse error: $line"
    }
    foreach $lField $fields {}

    # Only decompositions of the form "<base> <mark>" (two elements, the
    # first being a hex codepoint rather than a "<tag>") are of interest.
    if { [llength $character_decomposition_mapping]!=2
      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
    } {
      continue
    }

    # Parse the hex fields with [scan] rather than an unbraced [expr], so
    # that text read from the file is never evaluated as an expression.
    scan $code %x iCode
    scan [lindex $character_decomposition_mapping 0] %x iAscii
    scan [lindex $character_decomposition_mapping 1] %x iDia

    # Codepoints in tl_lookup_table are folded before diacritics are
    # removed, so they never reach the remove-diacritic step.
    if {[info exists tl_lookup_table($iCode)]} continue

    if { ($iAscii >= 97 && $iAscii <= 122)
      || ($iAscii >= 65 && $iAscii <= 90)
    } {
      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
      set dia($iDia) 1
    }
  }
  close $fd

  # Every combining mark seen above maps to "" (remove from input).
  foreach d [array names dia] {
    lappend lRet [list $d ""]
  }

  return [lsort -integer -index 0 $lRet]
}


#
# Parameter $map is a list of mappings as returned by
# rd_load_unicodedata_text. Write the C function remove_diacritic() to
# stdout.
#
# Adjacent mappings with the same replacement character are merged into a
# range of at most 8 codepoints, provided every codepoint in the gap has an
# entry in tl_lookup_table. Each range is emitted as
# (first codepoint << 3) + (range length - 1).
#
proc print_rd {map} {
  global tl_lookup_table
  set aChar [list]
  set lRange [list]

  set nRange 1
  set iFirst [lindex $map 0 0]
  set cPrev [lindex $map 0 1]

  foreach m [lrange $map 1 end] {
    foreach {i c} $m {}

    if {$cPrev eq $c} {
      for {set j [expr {$iFirst+$nRange}]} {$j<$i} {incr j} {
        if {[info exists tl_lookup_table($j)]==0} break
      }

      if {$j==$i} {
        set nNew [expr {(1 + $i - $iFirst)}]
        if {$nNew<=8} {
          set nRange $nNew
          continue
        }
      }
    }

    lappend lRange [list $iFirst $nRange]
    lappend aChar $cPrev

    set iFirst $i
    set cPrev $c
    set nRange 1
  }
  lappend lRange [list $iFirst $nRange]
  lappend aChar $cPrev

  puts "/*"
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 101 (\"LATIN SMALL LETTER"
  puts "** E\"). The results of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int remove_diacritic(int c)\{"
  puts "  static const unsigned short aDia\[\] = \{"
  puts -nonewline "        0, "
  set i 1
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
    incr i

    puts -nonewline [format "%5d" [expr {($iCode<<3) + $nRange-1}]]
    puts -nonewline ", "
  }
  puts ""
  puts "  \};"
  puts "  static const char aChar\[\] = \{"
  puts -nonewline "    '\\0', "
  set i 1
  foreach c $aChar {
    set str "'$c', "
    if {$c eq ""} { set str "'\\0', " }

    if {($i % 12)==0} {puts "" ; puts -nonewline "    " }
    incr i
    puts -nonewline "$str"
  }
  puts ""
  puts "  \};"
  puts {
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}

  # Close the function with a bare brace. A trailing ";" here would be an
  # empty declaration at file scope in the generated C.
  puts "\}"
}


#
# Parameter $map is a list of mappings as returned by
# rd_load_unicodedata_text. Write to stdout a C function named $zFunc that
# returns non-zero if its argument is one of the codepoints that $map says
# should be removed from the input (those mapped to an empty string).
#
# The codepoints are encoded as two 32-bit masks relative to the smallest
# such codepoint, so they must all lie within a span of 64 codepoints.
#
proc print_isdiacritic {zFunc map} {

  set lCode [list]
  foreach m $map {
    foreach {code char} $m {}
    if {$code && ($char eq "")} { lappend lCode $code }
  }
  if {[llength $lCode]==0} {
    error "print_isdiacritic: no diacritic codepoints in map"
  }
  set lCode [lsort -integer $lCode]
  set iFirst [lindex $lCode 0]
  set iLast [lindex $lCode end]
  if {($iLast - $iFirst) >= 64} {
    error "print_isdiacritic: codepoints $iFirst..$iLast do not fit in 64 bits"
  }

  set i1 0
  set i2 0

  foreach c $lCode {
    set i [expr {$c - $iFirst}]
    if {$i < 32} {
      set i1 [expr {$i1 | (1<<$i)}]
    } else {
      set i2 [expr {$i2 | (1<<($i-32))}]
    }
  }

  puts "/*"
  puts "** Return true if the argument interpreted as a unicode codepoint"
  puts "** is a diacritical modifier character."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  puts [format "  unsigned int mask0 = 0x%08X;" $i1]
  puts [format "  unsigned int mask1 = 0x%08X;" $i2]

  puts "  if( c<$iFirst || c>$iLast ) return 0;"
  puts "  return (c < $iFirst+32) ?"
  # The shifted constant is emitted as unsigned: the shift count can be 31,
  # and shifting a signed 1 into the sign bit is undefined behaviour in C.
  puts "      (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
  puts "      (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
  puts "\}"
}
puts "*/" - puts "int ${zFunc}\(int c)\{" + puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{" set liOff [tl_generate_ioff_table $lRecord] tl_print_table_header @@ -451,6 +655,8 @@ proc print_tolower {zFunc} { assert( ret>0 ); } } + + if( bRemoveDiacritic ) ret = remove_diacritic(ret); } } @@ -463,22 +669,38 @@ proc print_tolower {zFunc} { puts "\}" } -proc print_tolower_test {zFunc} { +proc print_fold_test {zFunc mappings} { global tl_lookup_table - puts "static int tolower_test(int *piCode)\{" + foreach m $mappings { + set c [lindex $m 1] + if {$c == ""} { + set extra([lindex $m 0]) 0 + } else { + scan $c %c i + set extra([lindex $m 0]) $i + } + } + + puts "static int fold_test(int *piCode)\{" puts -nonewline " static int aLookup\[\] = \{" for {set i 0} {$i < 70000} {incr i} { + set expected $i catch { set expected $tl_lookup_table($i) } - if {($i % 8)==0} { puts "" ; puts -nonewline " " } - puts -nonewline "$expected, " + set expected2 $expected + catch { set expected2 $extra($expected2) } + + if {($i % 4)==0} { puts "" ; puts -nonewline " " } + puts -nonewline "$expected, $expected2, " } puts " \};" puts " int i;" puts " for(i=0; i