mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
BUG#19580 - FULLTEXT search produces wrong results on UTF-8 columns
The problem was that MySQL hadn't true ctype implementation. As a result many multibyte punctuation/whitespace characters were treated as word characters. This fix uses recently added CTYPE table for unicode character sets (WL1386) to detect unicode punctuation/whitespace characters correctly. Note: this is incompatible change since it changes parser behavior. One will have to use REPAIR TABLE statement to rebuild fulltext indexes. mysql-test/r/fulltext2.result: Testcase for BUG#19580. mysql-test/t/fulltext2.test: Testcase for BUG#19580. storage/myisam/ft_parser.c: Use WL1386 "CTYPE table for unicode character sets" functionality. storage/myisam/ft_update.c: Use WL1386 "CTYPE table for unicode character sets" functionality. Reverse fix for BUG#16489 "utf8 + fulltext leads to corrupt index file.". It is not needed anymore, since we have true ctype implementation. storage/myisam/ftdefs.h: Use WL1386 "CTYPE table for unicode character sets" functionality. Rework true_word_char macro so it accepts ctype instead of charset as first param. It doesn't use my_isalnum anymore, but instead directly checks ctype. Obsolete word_char macro removed.
This commit is contained in:
@@ -114,6 +114,7 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
|
||||
FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
|
||||
{
|
||||
byte *doc=*start;
|
||||
int ctype;
|
||||
uint mwc, length, mbl;
|
||||
|
||||
param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
|
||||
@@ -122,9 +123,11 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
|
||||
|
||||
while (doc<end)
|
||||
{
|
||||
for (;doc<end;doc++)
|
||||
for (; doc < end; doc+= (mbl > 0 ? mbl : 1))
|
||||
{
|
||||
if (true_word_char(cs,*doc)) break;
|
||||
mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
|
||||
if (true_word_char(ctype, *doc))
|
||||
break;
|
||||
if (*doc == FTB_RQUOT && param->quot)
|
||||
{
|
||||
param->quot=doc;
|
||||
@@ -158,14 +161,16 @@ byte ft_get_word(CHARSET_INFO *cs, byte **start, byte *end,
|
||||
}
|
||||
|
||||
mwc=length=0;
|
||||
for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
|
||||
if (true_word_char(cs,*doc))
|
||||
for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
|
||||
{
|
||||
mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
|
||||
if (true_word_char(ctype, *doc))
|
||||
mwc=0;
|
||||
else if (!misc_word_char(*doc) || mwc)
|
||||
break;
|
||||
else
|
||||
mwc++;
|
||||
|
||||
}
|
||||
param->prev='A'; /* be sure *prev is true_word_char */
|
||||
word->len= (uint)(doc-word->pos) - mwc;
|
||||
if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
|
||||
@@ -200,24 +205,31 @@ byte ft_simple_get_word(CHARSET_INFO *cs, byte **start, const byte *end,
|
||||
{
|
||||
byte *doc= *start;
|
||||
uint mwc, length, mbl;
|
||||
int ctype;
|
||||
DBUG_ENTER("ft_simple_get_word");
|
||||
|
||||
do
|
||||
{
|
||||
for (;; doc++)
|
||||
for (;; doc+= (mbl > 0 ? mbl : 1))
|
||||
{
|
||||
if (doc >= end) DBUG_RETURN(0);
|
||||
if (true_word_char(cs, *doc)) break;
|
||||
if (doc >= end)
|
||||
DBUG_RETURN(0);
|
||||
mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
|
||||
if (true_word_char(ctype, *doc))
|
||||
break;
|
||||
}
|
||||
|
||||
mwc= length= 0;
|
||||
for (word->pos=doc; doc<end; length++, mbl=my_mbcharlen(cs, *(uchar *)doc), doc+=(mbl ? mbl : 1))
|
||||
if (true_word_char(cs,*doc))
|
||||
for (word->pos= doc; doc < end; length++, doc+= (mbl > 0 ? mbl : 1))
|
||||
{
|
||||
mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
|
||||
if (true_word_char(ctype, *doc))
|
||||
mwc= 0;
|
||||
else if (!misc_word_char(*doc) || mwc)
|
||||
break;
|
||||
else
|
||||
mwc++;
|
||||
}
|
||||
|
||||
word->len= (uint)(doc-word->pos) - mwc;
|
||||
|
||||
|
Reference in New Issue
Block a user