diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index d1441a4d3a5..90aa1a93bed 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1186,11 +1186,14 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), #define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00 #define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF -#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8) -#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC) -#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800) +#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8) +#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC) +/* Test if a byte is a leading byte of a high or low surrogate head: */ +#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8) +/* Test if a Unicode code point is a high or low surrogate head */ +#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800) -#define MY_UTF16_WC2(a, b) ((a << 8) + b) +#define MY_UTF16_WC2(a, b) ((a << 8) + b) /* a= 110110?? (<< 18) @@ -1201,6 +1204,30 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), #define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \ ((c & 3) << 8) + d + 0x10000) +#define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b0)) +#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2)) + +static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1) +{ + my_wc_t wc= MY_UTF16_WC2(b0, b1); + MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + return (int) (page ? 
page[wc & 0xFF].sort : wc); +} +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_general_ci +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b0,b1) +#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b0, b1, b2, b3)) +#include "strcoll.ic" + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_bin +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b0, b1)) +#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b0, b1, b2, b3)) +#include "strcoll.ic" + +#undef IS_MB2_CHAR +#undef IS_MB4_CHAR + static int my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t *pwc, const uchar *s, const uchar *e) @@ -1371,146 +1398,6 @@ my_casedn_utf16(CHARSET_INFO *cs, char *src, size_t srclen, } -static int -my_strnncoll_utf16(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - int s_res, t_res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; - const uchar *se= s + slen; - const uchar *te= t + tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - while (s < se && t < te) - { - s_res= mb_wc(cs, &s_wc, s, se); - t_res= mb_wc(cs, &t_wc, t, te); - - if (s_res <= 0 || t_res <= 0) - { - /* Incorrect string, compare by char value */ - return my_bincmp(s, se, t, te); - } - - my_tosort_utf16(uni_plane, &s_wc); - my_tosort_utf16(uni_plane, &t_wc); - - if (s_wc != t_wc) - { - return s_wc > t_wc ? 1 : -1; - } - - s+= s_res; - t+= t_res; - } - return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t))); -} - - -/** - Compare strings, discarding end space - - If one string is shorter as the other, then we space extend the other - so that the strings have equal length. - - This will ensure that the following things hold: - - "a" == "a " - "a\0" < "a" - "a\0" < "a " - - @param cs Character set pinter. - @param a First string to compare. - @param a_length Length of 'a'. 
- @param b Second string to compare. - @param b_length Length of 'b'. - - IMPLEMENTATION - - @return Comparison result. - @retval Negative number, if a less than b. - @retval 0, if a is equal to b - @retval Positive number, if a > b -*/ - -static int -my_strnncollsp_utf16(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference) -{ - int res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; - const uchar *se= s + slen, *te= t + tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - DBUG_ASSERT((slen % 2) == 0); - DBUG_ASSERT((tlen % 2) == 0); - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= FALSE; -#endif - - while (s < se && t < te) - { - int s_res= mb_wc(cs, &s_wc, s, se); - int t_res= mb_wc(cs, &t_wc, t, te); - - if (s_res <= 0 || t_res <= 0) - { - /* Incorrect string, compare bytewise */ - return my_bincmp(s, se, t, te); - } - - my_tosort_utf16(uni_plane, &s_wc); - my_tosort_utf16(uni_plane, &t_wc); - - if (s_wc != t_wc) - { - return s_wc > t_wc ? 1 : -1; - } - - s+= s_res; - t+= t_res; - } - - slen= (size_t) (se - s); - tlen= (size_t) (te - t); - res= 0; - - if (slen != tlen) - { - int s_res, swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 's' is bigger */ - if (slen < tlen) - { - slen= tlen; - s= t; - se= te; - swap= -1; - res= -res; - } - - for ( ; s < se; s+= s_res) - { - if ((s_res= mb_wc(cs, &s_wc, s, se)) < 0) - { - DBUG_ASSERT(0); - return 0; - } - if (s_wc != ' ') - return (s_wc < ' ') ? 
-swap : swap; - } - } - return res; -} - - static uint my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e) { @@ -1623,111 +1510,6 @@ my_wildcmp_utf16_bin(CHARSET_INFO *cs, } -static int -my_strnncoll_utf16_bin(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - int s_res,t_res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; - const uchar *se=s+slen; - const uchar *te=t+tlen; - - while ( s < se && t < te ) - { - s_res= mb_wc(cs, &s_wc, s, se); - t_res= mb_wc(cs, &t_wc, t, te); - - if (s_res <= 0 || t_res <= 0) - { - /* Incorrect string, compare by char value */ - return my_bincmp(s, se, t, te); - } - if (s_wc != t_wc) - { - return s_wc > t_wc ? 1 : -1; - } - - s+= s_res; - t+= t_res; - } - return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t))); -} - - -static int -my_strnncollsp_utf16_bin(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference) -{ - int res; - my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc); - my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc; - const uchar *se= s + slen, *te= t + tlen; - - DBUG_ASSERT((slen % 2) == 0); - DBUG_ASSERT((tlen % 2) == 0); - -#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE - diff_if_only_endspace_difference= FALSE; -#endif - - while (s < se && t < te) - { - int s_res= mb_wc(cs, &s_wc, s, se); - int t_res= mb_wc(cs, &t_wc, t, te); - - if (s_res <= 0 || t_res <= 0) - { - /* Incorrect string, compare bytewise */ - return my_bincmp(s, se, t, te); - } - - if (s_wc != t_wc) - { - return s_wc > t_wc ? 
1 : -1; - } - - s+= s_res; - t+= t_res; - } - - slen= (size_t) (se - s); - tlen= (size_t) (te - t); - res= 0; - - if (slen != tlen) - { - int s_res, swap= 1; - if (diff_if_only_endspace_difference) - res= 1; /* Assume 's' is bigger */ - if (slen < tlen) - { - slen= tlen; - s= t; - se= te; - swap= -1; - res= -res; - } - - for ( ; s < se; s+= s_res) - { - if ((s_res= mb_wc(cs, &s_wc, s, se)) < 0) - { - DBUG_ASSERT(0); - return 0; - } - if (s_wc != ' ') - return (s_wc < ' ') ? -swap : swap; - } - } - return res; -} - - static void my_hash_sort_utf16_bin(CHARSET_INFO *cs, const uchar *pos, size_t len, ulong *nr1, ulong *nr2) @@ -1747,8 +1529,8 @@ my_hash_sort_utf16_bin(CHARSET_INFO *cs, static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler = { NULL, /* init */ - my_strnncoll_utf16, - my_strnncollsp_utf16, + my_strnncoll_utf16_general_ci, + my_strnncollsp_utf16_general_ci, my_strnxfrm_unicode, my_strnxfrmlen_unicode, my_like_range_generic, @@ -1877,6 +1659,24 @@ struct charset_info_st my_charset_utf16_bin= }; +#define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b1)) +#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3)) + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_general_ci +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b1,b0) +#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b1, b0, b3, b2)) +#include "strcoll.ic" + +#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_bin +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b1, b0)) +#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b1, b0, b3, b2)) +#include "strcoll.ic" + +#undef IS_MB2_CHAR +#undef IS_MB4_CHAR + static int my_utf16le_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t *pwc, const uchar *s, const uchar *e) @@ -1948,6 +1748,38 @@ my_lengthsp_utf16le(CHARSET_INFO *cs __attribute__((unused)), } +static MY_COLLATION_HANDLER 
my_collation_utf16le_general_ci_handler = +{ + NULL, /* init */ + my_strnncoll_utf16le_general_ci, + my_strnncollsp_utf16le_general_ci, + my_strnxfrm_unicode, + my_strnxfrmlen_unicode, + my_like_range_generic, + my_wildcmp_utf16_ci, + my_strcasecmp_mb2_or_mb4, + my_instr_mb, + my_hash_sort_utf16, + my_propagate_simple +}; + + +static MY_COLLATION_HANDLER my_collation_utf16le_bin_handler = +{ + NULL, /* init */ + my_strnncoll_utf16le_bin, + my_strnncollsp_utf16le_bin, + my_strnxfrm_unicode_full_bin, + my_strnxfrmlen_unicode_full_bin, + my_like_range_generic, + my_wildcmp_utf16_bin, + my_strcasecmp_mb2_or_mb4, + my_instr_mb, + my_hash_sort_utf16_bin, + my_propagate_simple +}; + + static MY_CHARSET_HANDLER my_charset_utf16le_handler= { NULL, /* init */ @@ -2012,7 +1844,7 @@ struct charset_info_st my_charset_utf16le_general_ci= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16le_handler, - &my_collation_utf16_general_ci_handler + &my_collation_utf16le_general_ci_handler }; @@ -2045,7 +1877,7 @@ struct charset_info_st my_charset_utf16le_bin= 0, /* escape_with_backslash_is_dangerous */ 1, /* levels_for_order */ &my_charset_utf16le_handler, - &my_collation_utf16_bin_handler + &my_collation_utf16le_bin_handler }; @@ -3058,6 +2890,31 @@ static const uchar to_upper_ucs2[] = { }; +/* Definitions for strcoll.ic */ +#define IS_MB2_CHAR(x,y) (1) +#define UCS2_CODE(b0,b1) (((uchar) b0) << 8 | ((uchar) b1)) + + +static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1) +{ + my_wc_t wc= UCS2_CODE(b0, b1); + MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8]; + return (int) (page ? 
page[wc & 0xFF].sort : wc); +} + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1) +#include "strcoll.ic" + + +#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin +#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x)) +#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1) +#include "strcoll.ic" + + static int my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)), const uchar *s, const uchar *e) @@ -3208,120 +3065,6 @@ my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)), } -static int my_strnncoll_ucs2(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - int s_res,t_res; - my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc); - const uchar *se=s+slen; - const uchar *te=t+tlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - while ( s < se && t < te ) - { - s_res=my_ucs2_uni(cs,&s_wc, s, se); - t_res=my_ucs2_uni(cs,&t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare by char value */ - return ((int)s[0]-(int)t[0]); - } - - my_tosort_ucs2(uni_plane, &s_wc); - my_tosort_ucs2(uni_plane, &t_wc); - - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+=s_res; - t+=t_res; - } - return (int) (t_is_prefix ? t-te : ((se-s) - (te-t))); -} - -/* - Compare strings, discarding end space - - SYNOPSIS - my_strnncollsp_ucs2() - cs character set handler - a First string to compare - a_length Length of 'a' - b Second string to compare - b_length Length of 'b' - - IMPLEMENTATION - If one string is shorter as the other, then we space extend the other - so that the strings have equal length. 
- - This will ensure that the following things hold: - - "a" == "a " - "a\0" < "a" - "a\0" < "a " - - RETURN - < 0 a < b - = 0 a == b - > 0 a > b -*/ - -static int my_strnncollsp_ucs2(CHARSET_INFO *cs __attribute__((unused)), - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference - __attribute__((unused))) -{ - const uchar *se, *te; - size_t minlen; - MY_UNICASE_INFO *uni_plane= cs->caseinfo; - - /* extra safety to make sure the lengths are even numbers */ - slen&= ~1; - tlen&= ~1; - - se= s + slen; - te= t + tlen; - - for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 2) - { - int s_wc = uni_plane->page[s[0]] ? (int) uni_plane->page[s[0]][s[1]].sort : - (((int) s[0]) << 8) + (int) s[1]; - - int t_wc = uni_plane->page[t[0]] ? (int) uni_plane->page[t[0]][t[1]].sort : - (((int) t[0]) << 8) + (int) t[1]; - if ( s_wc != t_wc ) - return s_wc > t_wc ? 1 : -1; - - s+= 2; - t+= 2; - } - - if (slen != tlen) - { - int swap= 1; - if (slen < tlen) - { - s= t; - se= te; - swap= -1; - } - - for ( ; s < se ; s+= 2) - { - if (s[0] || s[1] != ' ') - return (s[0] == 0 && s[1] < ' ') ? -swap : swap; - } - } - return 0; -} - - static uint my_ismbchar_ucs2(CHARSET_INFO *cs __attribute__((unused)), const char *b, const char *e) @@ -3417,85 +3160,6 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs, } -static -int my_strnncoll_ucs2_bin(CHARSET_INFO *cs, - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool t_is_prefix) -{ - int s_res,t_res; - my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc); - const uchar *se=s+slen; - const uchar *te=t+tlen; - - while ( s < se && t < te ) - { - s_res=my_ucs2_uni(cs,&s_wc, s, se); - t_res=my_ucs2_uni(cs,&t_wc, t, te); - - if ( s_res <= 0 || t_res <= 0 ) - { - /* Incorrect string, compare by char value */ - return ((int)s[0]-(int)t[0]); - } - if ( s_wc != t_wc ) - { - return s_wc > t_wc ? 1 : -1; - } - - s+=s_res; - t+=t_res; - } - return (int) (t_is_prefix ? 
t-te : ((se-s) - (te-t))); -} - -static int my_strnncollsp_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)), - const uchar *s, size_t slen, - const uchar *t, size_t tlen, - my_bool diff_if_only_endspace_difference - __attribute__((unused))) -{ - const uchar *se, *te; - size_t minlen; - - /* extra safety to make sure the lengths are even numbers */ - slen= (slen >> 1) << 1; - tlen= (tlen >> 1) << 1; - - se= s + slen; - te= t + tlen; - - for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 2) - { - int s_wc= s[0] * 256 + s[1]; - int t_wc= t[0] * 256 + t[1]; - if ( s_wc != t_wc ) - return s_wc > t_wc ? 1 : -1; - - s+= 2; - t+= 2; - } - - if (slen != tlen) - { - int swap= 1; - if (slen < tlen) - { - s= t; - se= te; - swap= -1; - } - - for ( ; s < se ; s+= 2) - { - if (s[0] || s[1] != ' ') - return (s[0] == 0 && s[1] < ' ') ? -swap : swap; - } - } - return 0; -} - - static void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)), const uchar *key, size_t len,ulong *nr1, ulong *nr2) @@ -3518,8 +3182,8 @@ void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)), static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler = { NULL, /* init */ - my_strnncoll_ucs2, - my_strnncollsp_ucs2, + my_strnncoll_ucs2_general_ci, + my_strnncollsp_ucs2_general_ci, my_strnxfrm_unicode, my_strnxfrmlen_unicode, my_like_range_generic, diff --git a/strings/strcoll.ic b/strings/strcoll.ic index 5f4ee615d84..4bced593a23 100644 --- a/strings/strcoll.ic +++ b/strings/strcoll.ic @@ -64,13 +64,16 @@ @return - the number of bytes scanned The including source file must define the following macros: - IS_MB1_CHAR(x) - IS_MB1_MB2HEAD_GAP(x) - optional, for better performance - IS_MB2_CHAR(x,y) - IS_MB3_CHAR(x,y,z) - for character sets with mbmaxlen>2 + IS_MB1_CHAR(b0) - for character sets that have MB1 characters + IS_MB1_MB2HEAD_GAP(b0) - optional, for better performance + IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters + IS_MB3_CHAR(b0,b1,b2) - for character 
sets that have MB3 characters + IS_MB4_CHAR(b0,b1,b2,b3) - for character sets that have MB4 characters WEIGHT_PAD_SPACE - WEIGHT_MB1(x) - WEIGHT_MB2(x,y) + WEIGHT_MB1(b0) - for character sets that have MB1 characters + WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters + WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters + WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters WEIGHT_ILSEQ(x) */ static inline uint @@ -82,11 +85,13 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) return 0; } +#ifdef IS_MB1_CHAR if (IS_MB1_CHAR(*str)) { *weight= WEIGHT_MB1(*str); /* A valid single byte character*/ return 1; } +#endif #ifdef IS_MB1_MBHEAD_UNUSED_GAP /* @@ -98,6 +103,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) goto bad; #endif +#ifdef IS_MB2_CHAR if (str + 2 > end) /* The string ended unexpectedly */ goto bad; /* Treat as a bad byte */ @@ -106,6 +112,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) *weight= WEIGHT_MB2(str[0], str[1]); return 2; /* A valid two-byte character */ } +#endif #ifdef IS_MB3_CHAR if (str + 3 > end) /* Incomplete three-byte character */ diff --git a/unittest/strings/strings-t.c b/unittest/strings/strings-t.c index 4e9ca820981..51537e624f9 100644 --- a/unittest/strings/strings-t.c +++ b/unittest/strings/strings-t.c @@ -149,7 +149,7 @@ typedef struct A1A1 - MB2 or 8BIT+8BIT E0E0 - MB2 */ -STRNNCOLL_PARAM strcoll_mb2_common[]= +static STRNNCOLL_PARAM strcoll_mb2_common[]= { /* Compare two good sequences */ {CSTR(""), CSTR(""), 0}, @@ -210,7 +210,7 @@ STRNNCOLL_PARAM strcoll_mb2_common[]= /* For character sets that have good mb2 characters A1A1 and F9FE */ -STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= +static STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= { /* Compare two good characters */ {CSTR(""), CSTR("\xF9\xFE"), -1}, @@ -246,7 +246,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= A1A1 - a good 
mb2 character F9FE - a bad sequence */ -STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= +static STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= { /* Compare a good character to an illegal or an incomplete sequence */ {CSTR(""), CSTR("\xF9\xFE"), -1}, @@ -283,7 +283,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= F9 - ILSEQ or H2 F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ) */ -STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= +static STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= { /* Compare two good characters */ {CSTR(""), CSTR("\xA1"), -1}, @@ -323,7 +323,7 @@ STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= and sort in this order: 8181 < A1 < E0E0 */ -STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= +static STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= { {CSTR("\x81\x81"), CSTR("\xA1"), -1}, {CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1}, @@ -336,7 +336,7 @@ STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= /* A shared test for eucjpms and ujis. */ -STRNNCOLL_PARAM strcoll_ujis[]= +static STRNNCOLL_PARAM strcoll_ujis[]= { {CSTR("\x8E\xA1"), CSTR("\x8E"), -1}, /* Good MB2 vs incomplete MB2 */ {CSTR("\x8E\xA1"), CSTR("\x8F\xA1"), -1}, /* Good MB2 vs incomplete MB3 */ @@ -347,7 +347,7 @@ STRNNCOLL_PARAM strcoll_ujis[]= }; -STRNNCOLL_PARAM strcoll_utf8mb3_common[]= +static STRNNCOLL_PARAM strcoll_utf8mb3_common[]= { {CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */ {CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */ @@ -369,7 +369,7 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]= }; -STRNNCOLL_PARAM strcoll_utf8mb4_common[]= +static STRNNCOLL_PARAM strcoll_utf8mb4_common[]= { /* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */ {CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */ @@ -412,6 +412,101 @@ STRNNCOLL_PARAM strcoll_utf8mb4_common[]= }; +static STRNNCOLL_PARAM strcoll_ucs2_common[]= +{ + {CSTR("\xC0"), CSTR("\xC1"), -1}, /* Incomplete MB2 vs incomplete MB2 */ + {CSTR("\xC0"), CSTR("\xFF"), -1}, /* Incomplete MB2 vs incomplete MB2 */ + 
{CSTR("\xC2\xA1"), CSTR("\xC0"), -1}, /* MB2 vs incomplete MB2 */ + {CSTR("\xC2\xA1"), CSTR("\xC2"), -1}, /* MB2 vs incomplete MB2 */ + {CSTR("\xC2\xA0"), CSTR("\xC2\xA1"), -1}, /* MB2 vs MB2 */ + {CSTR("\xC2\xA1"), CSTR("\xC2\xA2"), -1}, /* MB2 vs MB2 */ + + {CSTR("\xFF\xFF"), CSTR("\x00"),-1}, /* MB2 vs incomplete */ + {CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00"),-1}, /* MB2+MB2 vs incomplete */ + {CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00\x00\x00"), 1},/* MB2+MB2 vs MB2+incomplete */ + + {NULL, 0, NULL, 0, 0} +}; + + +/* Tests that involve comparison to SPACE (explicit, or padded) */ +static STRNNCOLL_PARAM strcoll_ucs2_space[]= +{ + {CSTR("\x00\x1F"), CSTR("\x00\x20"), -1}, /* MB2 vs MB2 */ + {CSTR("\x00\x20"), CSTR("\x00\x21"), -1}, /* MB2 vs MB2 */ + {CSTR("\x00\x1F"), CSTR(""), -1}, /* MB2 vs empty */ + {CSTR("\x00\x20"), CSTR(""), 0}, /* MB2 vs empty */ + {CSTR("\x00\x21"), CSTR(""), 1}, /* MB2 vs empty */ + + {NULL, 0, NULL, 0, 0} +}; + + +/* Tests that involve comparison to SPACE (explicit, or padded) */ +static STRNNCOLL_PARAM strcoll_utf16le_space[]= +{ + {CSTR("\x1F\x00"), CSTR("\x20\x00"), -1}, /* MB2 vs MB2 */ + {CSTR("\x20\x00"), CSTR("\x21\x00"), -1}, /* MB2 vs MB2 */ + {CSTR("\x1F\x00"), CSTR(""), -1}, /* MB2 vs empty */ + {CSTR("\x20\x00"), CSTR(""), 0}, /* MB2 vs empty */ + {CSTR("\x21\x00"), CSTR(""), 1}, /* MB2 vs empty */ + + {NULL, 0, NULL, 0, 0} +}; + + +static STRNNCOLL_PARAM strcoll_utf16_common[]= +{ + /* Minimum four-byte character: U+10000 == _utf16 0xD800DC00 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC"), -1},/* MB4 vs incomplete MB4 */ + + /* Maximum 
four-byte character: U+10FFFF == _utf16 0xDBFFDFFF */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */ + {CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\xFF\xDF"), -1},/* MB4 vs incomplete MB4 */ + + /* MB4 vs MB4, then broken MB4 vs broken MB4 */ + {CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC\x01"),-1},/* MB4 vs MB4 */ + {CSTR("\xDB\xFF\xE0\xFE"), CSTR("\xDB\xFF\xE0\xFF"),-1},/* Broken MB4 vs broken MB4 */ + + {NULL, 0, NULL, 0, 0} +}; + + +static STRNNCOLL_PARAM strcoll_utf16le_common[]= +{ + /* Minimum four-byte character: U+10000 == _utf16le 0x00D800DC */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xD0"),-1},/* MB4 vs broken MB4 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00"), -1},/* MB4 vs incomplete MB4 */ + + /* Maximum four-byte character: U+10FFFF == _utf16le 0xFFDBFFDF */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xE0"),-1},/* MB4 vs broken MB4 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */ + {CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xFF\xDC\x00"), -1},/* MB4 vs incomplete MB4 */ + + /* MB4 vs MB4, then broken MB4 vs broken MB4 */ + {CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x01\xDC"),-1},/* 
MB4 vs MB4 */ + {CSTR("\xFF\xDB\xFE\xE0"), CSTR("\xFF\xDB\xFF\xE0"),-1},/* Broken MB4 vs broken MB4 */ + + {NULL, 0, NULL, 0, 0} +}; + + static void str2hex(char *dst, size_t dstlen, const char *src, size_t srclen) { @@ -528,6 +623,12 @@ test_strcollsp() failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0); failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0); #endif +#ifdef HAVE_CHARSET_ucs2 + failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_space); + failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_space); +#endif #ifdef HAVE_CHARSET_ujis failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_mb2_common); failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_common); @@ -536,6 +637,21 @@ test_strcollsp() failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis); failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis); #endif +#ifdef HAVE_CHARSET_utf16 + failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_space); + failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_utf16_common); + failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_space); + failed+= strcollsp(&my_charset_utf16_bin, strcoll_utf16_common); + + failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_ucs2_common); + failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_space); + failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_common); + failed+= strcollsp(&my_charset_utf16le_bin, strcoll_ucs2_common); + failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space); + failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common); +#endif #ifdef HAVE_CHARSET_utf8 failed+= 
strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);