diff --git a/include/m_ctype.h b/include/m_ctype.h index 1f42b514a1b..ddc21070547 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -365,6 +365,11 @@ uint my_instr_mb(struct charset_info_st *, const char *s, uint s_length, my_match_t *match, uint nmatch); +int my_wildcmp_unicode(CHARSET_INFO *cs, + const char *str, const char *str_end, + const char *wildstr, const char *wildend, + int escape, int w_one, int w_many, + MY_UNICASE_INFO **weights); extern my_bool my_parse_charset_xml(const char *bug, uint len, int (*add)(CHARSET_INFO *cs)); diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result index 2e8bbc8fa92..e65eb96cb68 100644 --- a/mysql-test/r/ctype_utf8.result +++ b/mysql-test/r/ctype_utf8.result @@ -63,6 +63,15 @@ select 'A' like 'a' collate utf8_bin; select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%'); _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%') 1 +select convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8); +convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8) +1 +select CONVERT(_koi8r'×ÁÓÑ' USING utf8) LIKE CONVERT(_koi8r'÷áóñ' USING utf8); +CONVERT(_koi8r'×ÁÓÑ' USING utf8) LIKE CONVERT(_koi8r'÷áóñ' USING utf8) +1 +select CONVERT(_koi8r'÷áóñ' USING utf8) LIKE CONVERT(_koi8r'×ÁÓÑ' USING utf8); +CONVERT(_koi8r'÷áóñ' USING utf8) LIKE CONVERT(_koi8r'×ÁÓÑ' USING utf8) +1 SELECT 'a' = 'a '; 'a' = 'a ' 1 diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test index c74bb59ae6b..238cd6daef3 100644 --- a/mysql-test/t/ctype_utf8.test +++ b/mysql-test/t/ctype_utf8.test @@ -33,6 +33,14 @@ select 'A' like 'a'; select 'A' like 'a' collate utf8_bin; select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%'); +# Bug #6040: can't retrieve records with umlaut +# characters in case insensitive manner. 
+# Case insensitive search LIKE comparison +# was broken for multibyte characters: +select convert(_latin1'Günter André' using utf8) like CONVERT(_latin1'GÜNTER%' USING utf8); +select CONVERT(_koi8r'×ÁÓÑ' USING utf8) LIKE CONVERT(_koi8r'÷áóñ' USING utf8); +select CONVERT(_koi8r'÷áóñ' USING utf8) LIKE CONVERT(_koi8r'×ÁÓÑ' USING utf8); + # # Check the following: # "a" == "a " diff --git a/strings/CHARSET_INFO.txt b/strings/CHARSET_INFO.txt new file mode 100644 index 00000000000..e8c13996707 --- /dev/null +++ b/strings/CHARSET_INFO.txt @@ -0,0 +1,222 @@ + +CHARSET_INFO +============ +A structure containing data for charset+collation pair implementation. + +Virtual functions which use this data are collected +into separate structures MY_CHARSET_HANDLER and +MY_COLLATION_HANDLER. + + +typedef struct charset_info_st +{ + uint number; + uint primary_number; + uint binary_number; + uint state; + + const char *csname; + const char *name; + const char *comment; + + uchar *ctype; + uchar *to_lower; + uchar *to_upper; + uchar *sort_order; + + uint16 *tab_to_uni; + MY_UNI_IDX *tab_from_uni; + + uchar state_map[256]; + uchar ident_map[256]; + + uint strxfrm_multiply; + uint mbminlen; + uint mbmaxlen; + char max_sort_char; /* For LIKE optimization */ + + MY_CHARSET_HANDLER *cset; + MY_COLLATION_HANDLER *coll; + +} CHARSET_INFO; + + +CHARSET_INFO fields description: +=============================== + + +Numbers (identifiers) +--------------------- + +number - an ID uniquely identifying this charset+collation pair. + +primary_number - ID of a charset+collation pair, which consists +of the same character set and the default collation of this +character set. Not really used now. Intended to optimize some +parts of the code where we need to find the default collation +using its non-default counterpart for the given character set. + +binary_numner - ID of a charset+collation pair, which consists +of the same character set and the binary collation of this +character set. 
Not really used now. Intended to optimize +"SELECT BINARY x" in the future. + +Names +----- + + csname - name of the character set for this charset+collation pair. + name - name of the collation for this charset+collation pair. + comment - a text comment, displayed in "Description" column of + SHOW CHARACTER SET output. + +Conversion tables +----------------- + + ctype - pointer to array[257] of "type of characters" + bit mask for each character, e.g. if a + character is a digit or a letter or a separator, etc. + to_lower - pointer to array[256] used in LCASE() + to_upper - pointer to array[256] used in UCASE() + sort_order - pointer to array[256] used for string comparison + + + +Unicode conversion data +----------------------- +For 8bit character sets: + +tab_to_uni : array[256] of charset->Unicode translation +tab_from_uni: a structure for Unicode->charset translation + +Non-8 bit charsets have their own structures per charset +hidden in the corresponding ctype-xxx.c file and don't use +tab_to_uni and tab_from_uni tables. + + +Parser maps +----------- +state_map[] +ident_map[] + + These maps are used to quickly identify if a character is +an identifier part, a digit, a special character, +or a part of another SQL language lexical item. + +Probably can be combined with ctype array in the future. +But for some reason these two arrays are used in the parser, +while a separate ctype[] array is used in other parts of the +code, like fulltext, etc. + + +Misc fields +----------- + + strxfrm_multiply - how many times a sort key (i.e. a string + which can be passed into memcmp() for comparison) + can be longer than the original string. + Usually it is 1. For some complex + collations it can be bigger. For example + in latin1_german2_ci, a sort key is up to + twice as long as the original string. + e.g. Letter 'A' with two dots above is + substituted with 'AE'. + mbminlen - minimum multibyte sequence length. + Now always 1 except ucs2. For ucs2 + it is 2. 
+ mbmaxlen - maximum multibyte sequence length. + 1 for 8bit charsets. Can also be 2 or 3. + + + +MY_CHARSET_HANDLER +================== + +MY_CHARSET_HANDLER is a collection of character-set +related routines. Defined in m_ctype.h. It has the +following set of functions: + +Multibyte routines +------------------ +ismbchar() - detects if the given string is a multibyte sequence +mbcharlen() - returns length of multibyte sequence starting with + the given character +numchars() - returns number of characters in the given string, e.g. + in SQL function CHAR_LENGTH(). +charpos() - calculates the offset of the given position in the string. + Used in SQL functions LEFT(), RIGHT(), SUBSTRING(), + INSERT() + +well_formed_length() + - finds the length of correctly formed multibyte beginning. + Used in INSERTs to cut a beginning of the given string + which is + a) "well formed" according to the given character set. + b) can fit into the given data type + Terminates the string in the good position, taking into account + multibyte character boundaries. + +lengthsp() - returns the length of the given string without trailing spaces. + + +Unicode conversion routines +--------------------------- +mb_wc - converts the leftmost multibyte sequence into its Unicode code. +wc_mb - converts the given Unicode code into multibyte sequence. + + +Case and sort conversion +------------------------ +caseup_str - converts the given 0-terminated string into the upper case +casedn_str - converts the given 0-terminated string into the lower case +caseup - converts the given string into the upper case using length +casedn - converts the given string into the lower case using length + +Number-to-string conversion routines +------------------------------------ +snprintf() +long10_to_str() +longlong10_to_str() + +The names are pretty self-descriptive. + +String padding routines +----------------------- +fill() - writes the given Unicode value into the given string + with the given length. 
Used to pad the string, usually + with space character, according to the given charset. + +String-to-number conversion routines +------------------------------------ +strntol() +strntoul() +strntoll() +strntoull() +strntod() + +These functions do almost the same thing as their +STDLIB counterparts, but also: + - accept length instead of 0-terminator + - and are character set dependent + +Simple scanner routines +----------------------- +scan() - to skip leading spaces in the given string. + Used when a string value is inserted into a numeric field. + + + +MY_COLLATION_HANDLER +==================== +strnncoll() - compares two strings according to the given collation +strnncollsp() - like the above but ignores trailing spaces +strnxfrm() - makes a sort key suitable for memcmp() corresponding + to the given string +like_range() - creates a LIKE range, for optimizer +wildcmp() - wildcard comparison, for LIKE +strcasecmp() - 0-terminated string comparison +instr() - finds the first occurrence of a substring in the string +hash_sort() - calculates hash value taking into account + the collation rules, e.g. case-insensitivity, + accent sensitivity, etc. 
+ + \ No newline at end of file diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 3247e1d7424..851c2044f47 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1231,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)), } -/* -** Compare string against string with wildcard -** 0 if matched -** -1 if not matched with wildcard -** 1 if matched with wildcard -*/ - -static -int my_wildcmp_ucs2(CHARSET_INFO *cs, - const char *str,const char *str_end, - const char *wildstr,const char *wildend, - int escape, int w_one, int w_many, - MY_UNICASE_INFO **weights) -{ - int result= -1; /* Not found, using wildcards */ - my_wc_t s_wc, w_wc; - int scan, plane; - - while (wildstr != wildend) - { - - while (1) - { - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)escape) - { - wildstr+= scan; - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - } - - if (w_wc == (my_wc_t)w_many) - { - result= 1; /* Found an anchor char */ - break; - } - - wildstr+= scan; - scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, (const uchar*)str_end); - if (scan <=0) - return 1; - str+= scan; - - if (w_wc == (my_wc_t)w_one) - { - result= 1; /* Found an anchor char */ - } - else - { - if (weights) - { - plane=(s_wc>>8) & 0xFF; - s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; - plane=(w_wc>>8) & 0xFF; - w_wc = weights[plane] ? 
weights[plane][w_wc & 0xFF].sort : w_wc; - } - if (s_wc != w_wc) - return 1; /* No match */ - } - if (wildstr == wildend) - return (str != str_end); /* Match if both are at end */ - } - - - if (w_wc == (my_wc_t)w_many) - { /* Found w_many */ - - /* Remove any '%' and '_' from the wild search string */ - for ( ; wildstr != wildend ; ) - { - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)w_many) - { - wildstr+= scan; - continue; - } - - if (w_wc == (my_wc_t)w_one) - { - wildstr+= scan; - scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, - (const uchar*)str_end); - if (scan <=0) - return 1; - str+= scan; - continue; - } - break; /* Not a wild character */ - } - - if (wildstr == wildend) - return 0; /* Ok if w_many is last */ - - if (str == str_end) - return -1; - - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - - if (w_wc == (my_wc_t)escape) - { - wildstr+= scan; - scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr, - (const uchar*)wildend); - if (scan <= 0) - return 1; - } - - while (1) - { - /* Skip until the first character from wildstr is found */ - while (str != str_end) - { - scan= my_ucs2_uni(cs,&s_wc, (const uchar*)str, - (const uchar*)str_end); - if (scan <= 0) - return 1; - if (weights) - { - plane=(s_wc>>8) & 0xFF; - s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; - plane=(w_wc>>8) & 0xFF; - w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; - } - - if (s_wc == w_wc) - break; - str+= scan; - } - if (str == str_end) - return -1; - - result= my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,escape, - w_one,w_many,weights); - - if (result <= 0) - return result; - - str+= scan; - } - } - } - return (str != str_end ? 
1 : 0); -} - - static int my_wildcmp_ucs2_ci(CHARSET_INFO *cs, const char *str,const char *str_end, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend, - escape,w_one,w_many,uni_plane); + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,uni_plane); /* uni_plane is passed as the weights table */ } @@ -1406,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs, const char *wildstr,const char *wildend, int escape, int w_one, int w_many) { - return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend, - escape,w_one,w_many,NULL); + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,NULL); /* NULL weights: characters are compared without sort-weight mapping */ } diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index fd6610b72b1..f5d40fb8ded 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={ }; + +/* +** Compare string against string with wildcard: str..str_end is matched against the pattern wildstr..wildend +** This function is used in UTF8 and UCS2; characters are decoded with cs->cset->mb_wc and, when weights is not NULL, mapped through weights[plane][low byte].sort before being compared +** +** 0 if matched +** -1 if not matched and str ran out while the pattern after w_many still required a character +** 1 if not matched otherwise (also returned when mb_wc returns <= 0) +*/ + +int my_wildcmp_unicode(CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many, + MY_UNICASE_INFO **weights) +{ + int result= -1; /* Not found, using wildcards */ + my_wc_t s_wc, w_wc; + int scan, plane; + int (*mb_wc)(struct charset_info_st *cs, my_wc_t *wc, + const unsigned char *s,const unsigned char *e); + mb_wc= cs->cset->mb_wc; /* its return value is used below as the number of bytes to advance; <= 0 aborts with 1 */ + + while (wildstr != wildend) + { + while (1) /* literal/w_one prefix: runs until a w_many is seen or the pattern ends */ + { + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + + if (w_wc == (my_wc_t)escape) /* NOTE(review): the character after the escape is still tested against w_many and w_one below, so an escaped wildcard appears to act as a wildcard -- confirm */ + { + wildstr+= scan; + if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + } + + if (w_wc == (my_wc_t)w_many) + { + result= 1; /* Found an anchor char */ + break; + } + + wildstr+= scan; + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + 
return 1; + str+= scan; + + if (w_wc == (my_wc_t)w_one) /* w_one accepts whatever character was just read from str */ + { + result= 1; /* Found an anchor char */ + } + else + { + if (weights) /* planes without a table (NULL entry) keep the raw code */ + { + plane=(s_wc>>8) & 0xFF; + s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; + plane=(w_wc>>8) & 0xFF; + w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc; + } + if (s_wc != w_wc) + return 1; /* No match */ + } + if (wildstr == wildend) + return (str != str_end); /* Match if both are at end */ + } + + + if (w_wc == (my_wc_t)w_many) + { /* Found w_many */ + + /* Remove any '%' and '_' from the wild search string */ + for ( ; wildstr != wildend ; ) + { + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <= 0) + return 1; + + if (w_wc == (my_wc_t)w_many) + { + wildstr+= scan; + continue; + } + + if (w_wc == (my_wc_t)w_one) + { /* each w_one after w_many consumes one character of str */ + wildstr+= scan; + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + return 1; + str+= scan; + continue; + } + break; /* Not a wild character */ + } + + if (wildstr == wildend) + return 0; /* Ok if w_many is last */ + + if (str == str_end) + return -1; /* pattern still needs a character but str is exhausted */ + + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <=0) + return 1; + + if (w_wc == (my_wc_t)escape) /* skip the escape; w_wc becomes the character after it */ + { + wildstr+= scan; + if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr, + (const uchar*)wildend)) <=0) + return 1; + } + + while (1) + { + /* Skip until the first character from wildstr is found */ + while (str != str_end) + { + if ((scan= mb_wc(cs, &s_wc, (const uchar*)str, + (const uchar*)str_end)) <=0) + return 1; + if (weights) + { + plane=(s_wc>>8) & 0xFF; + s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc; + plane=(w_wc>>8) & 0xFF; + w_wc = weights[plane] ? 
weights[plane][w_wc & 0xFF].sort : w_wc; + } + + if (s_wc == w_wc) + break; + str+= scan; + } + if (str == str_end) + return -1; /* no candidate position left in str */ + + result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend, + escape, w_one, w_many, + weights); /* match the remaining pattern from this candidate position */ + + if (result <= 0) + return result; /* 0 and -1 are propagated; on 1 the next position is tried */ + + str+= scan; /* step over the candidate character */ + } + } + } + return (str != str_end ? 1 : 0); /* pattern consumed: match only if str is consumed too */ +} + #endif @@ -1992,6 +2147,17 @@ static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) return my_strncasecmp_utf8(cs, s, t, len); } +static +int my_wildcmp_utf8(CHARSET_INFO *cs, + const char *str,const char *str_end, + const char *wildstr,const char *wildend, + int escape, int w_one, int w_many) +{ + return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend, + escape,w_one,w_many,uni_plane); /* uni_plane is passed as the weights table */ +} + + static int my_strnxfrm_utf8(CHARSET_INFO *cs, uchar *dst, uint dstlen, const uchar *src, uint srclen) @@ -2060,7 +2226,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = my_strnncollsp_utf8, my_strnxfrm_utf8, my_like_range_mb, - my_wildcmp_mb, + my_wildcmp_utf8, my_strcasecmp_utf8, my_instr_mb, my_hash_sort_utf8