mirror of
https://github.com/MariaDB/server.git
synced 2025-07-29 05:21:33 +03:00
MDEV-8416 ucs2: compare broken bytes as "greater than any non-broken character"
MDEV-8418 utf16: compare broken bytes as "greater than any non-broken character"
This commit is contained in:
@ -1188,6 +1188,9 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8)
|
||||
#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC)
|
||||
/* Test if a byte is a leading byte of a high or low surrogate head: */
|
||||
#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8)
|
||||
/* Test if a Unicode code point is a high or low surrogate head */
|
||||
#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800)
|
||||
|
||||
/*
  Combine two bytes into one 16-bit code unit: 'a' supplies the high
  8 bits and 'b' the low 8 bits. Both arguments are parenthesized so
  that arbitrary expressions can be passed without precedence surprises.
*/
#define MY_UTF16_WC2(a, b)       (((a) << 8) + (b))
|
||||
@ -1201,6 +1204,30 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
|
||||
/*
  Build a code point from the four bytes of a surrogate pair
  (a,b = first unit, c,d = second unit): the low two bits of 'a' and
  of 'c' plus all of 'b' and 'd' form a 20-bit value, to which 0x10000
  is added. Arguments are parenthesized so expressions are safe to pass.
*/
#define MY_UTF16_WC4(a, b, c, d) ((((a) & 3) << 18) + ((b) << 10) + \
                                  (((c) & 3) << 8) + (d) + 0x10000)
|
||||
|
||||
#define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b0))
|
||||
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2))
|
||||
|
||||
/*
  Sort weight of a two-byte character for the general_ci collations.
  b0 is the high byte and b1 the low byte of the 16-bit code. The weight
  is taken from my_unicase_default; a code whose page has no entry
  weighs as the code itself.
*/
static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
{
  my_wc_t code= MY_UTF16_WC2(b0, b1);
  MY_UNICASE_CHARACTER *plane= my_unicase_default.page[code >> 8];

  if (!plane)
    return (int) code;
  return (int) plane[code & 0xFF].sort;
}
|
||||
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_general_ci
|
||||
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
|
||||
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b0,b1)
|
||||
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b0, b1, b2, b3))
|
||||
#include "strcoll.ic"
|
||||
|
||||
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_bin
|
||||
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
|
||||
#define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b0, b1))
|
||||
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b0, b1, b2, b3))
|
||||
#include "strcoll.ic"
|
||||
|
||||
#undef IS_MB2_CHAR
|
||||
#undef IS_MB4_CHAR
|
||||
|
||||
static int
|
||||
my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
|
||||
my_wc_t *pwc, const uchar *s, const uchar *e)
|
||||
@ -1371,146 +1398,6 @@ my_casedn_utf16(CHARSET_INFO *cs, char *src, size_t srclen,
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
my_strnncoll_utf16(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool t_is_prefix)
|
||||
{
|
||||
int s_res, t_res;
|
||||
my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
|
||||
my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
|
||||
const uchar *se= s + slen;
|
||||
const uchar *te= t + tlen;
|
||||
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
|
||||
|
||||
while (s < se && t < te)
|
||||
{
|
||||
s_res= mb_wc(cs, &s_wc, s, se);
|
||||
t_res= mb_wc(cs, &t_wc, t, te);
|
||||
|
||||
if (s_res <= 0 || t_res <= 0)
|
||||
{
|
||||
/* Incorrect string, compare by char value */
|
||||
return my_bincmp(s, se, t, te);
|
||||
}
|
||||
|
||||
my_tosort_utf16(uni_plane, &s_wc);
|
||||
my_tosort_utf16(uni_plane, &t_wc);
|
||||
|
||||
if (s_wc != t_wc)
|
||||
{
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
}
|
||||
|
||||
s+= s_res;
|
||||
t+= t_res;
|
||||
}
|
||||
return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Compare strings, discarding end space
|
||||
|
||||
If one string is shorter as the other, then we space extend the other
|
||||
so that the strings have equal length.
|
||||
|
||||
This will ensure that the following things hold:
|
||||
|
||||
"a" == "a "
|
||||
"a\0" < "a"
|
||||
"a\0" < "a "
|
||||
|
||||
@param cs Character set pinter.
|
||||
@param a First string to compare.
|
||||
@param a_length Length of 'a'.
|
||||
@param b Second string to compare.
|
||||
@param b_length Length of 'b'.
|
||||
|
||||
IMPLEMENTATION
|
||||
|
||||
@return Comparison result.
|
||||
@retval Negative number, if a less than b.
|
||||
@retval 0, if a is equal to b
|
||||
@retval Positive number, if a > b
|
||||
*/
|
||||
|
||||
static int
|
||||
my_strnncollsp_utf16(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool diff_if_only_endspace_difference)
|
||||
{
|
||||
int res;
|
||||
my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
|
||||
my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
|
||||
const uchar *se= s + slen, *te= t + tlen;
|
||||
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
|
||||
|
||||
DBUG_ASSERT((slen % 2) == 0);
|
||||
DBUG_ASSERT((tlen % 2) == 0);
|
||||
|
||||
#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
|
||||
diff_if_only_endspace_difference= FALSE;
|
||||
#endif
|
||||
|
||||
while (s < se && t < te)
|
||||
{
|
||||
int s_res= mb_wc(cs, &s_wc, s, se);
|
||||
int t_res= mb_wc(cs, &t_wc, t, te);
|
||||
|
||||
if (s_res <= 0 || t_res <= 0)
|
||||
{
|
||||
/* Incorrect string, compare bytewise */
|
||||
return my_bincmp(s, se, t, te);
|
||||
}
|
||||
|
||||
my_tosort_utf16(uni_plane, &s_wc);
|
||||
my_tosort_utf16(uni_plane, &t_wc);
|
||||
|
||||
if (s_wc != t_wc)
|
||||
{
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
}
|
||||
|
||||
s+= s_res;
|
||||
t+= t_res;
|
||||
}
|
||||
|
||||
slen= (size_t) (se - s);
|
||||
tlen= (size_t) (te - t);
|
||||
res= 0;
|
||||
|
||||
if (slen != tlen)
|
||||
{
|
||||
int s_res, swap= 1;
|
||||
if (diff_if_only_endspace_difference)
|
||||
res= 1; /* Assume 's' is bigger */
|
||||
if (slen < tlen)
|
||||
{
|
||||
slen= tlen;
|
||||
s= t;
|
||||
se= te;
|
||||
swap= -1;
|
||||
res= -res;
|
||||
}
|
||||
|
||||
for ( ; s < se; s+= s_res)
|
||||
{
|
||||
if ((s_res= mb_wc(cs, &s_wc, s, se)) < 0)
|
||||
{
|
||||
DBUG_ASSERT(0);
|
||||
return 0;
|
||||
}
|
||||
if (s_wc != ' ')
|
||||
return (s_wc < ' ') ? -swap : swap;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
static uint
|
||||
my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e)
|
||||
{
|
||||
@ -1623,111 +1510,6 @@ my_wildcmp_utf16_bin(CHARSET_INFO *cs,
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
my_strnncoll_utf16_bin(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool t_is_prefix)
|
||||
{
|
||||
int s_res,t_res;
|
||||
my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
|
||||
my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
|
||||
const uchar *se=s+slen;
|
||||
const uchar *te=t+tlen;
|
||||
|
||||
while ( s < se && t < te )
|
||||
{
|
||||
s_res= mb_wc(cs, &s_wc, s, se);
|
||||
t_res= mb_wc(cs, &t_wc, t, te);
|
||||
|
||||
if (s_res <= 0 || t_res <= 0)
|
||||
{
|
||||
/* Incorrect string, compare by char value */
|
||||
return my_bincmp(s, se, t, te);
|
||||
}
|
||||
if (s_wc != t_wc)
|
||||
{
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
}
|
||||
|
||||
s+= s_res;
|
||||
t+= t_res;
|
||||
}
|
||||
return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool diff_if_only_endspace_difference)
|
||||
{
|
||||
int res;
|
||||
my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
|
||||
my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
|
||||
const uchar *se= s + slen, *te= t + tlen;
|
||||
|
||||
DBUG_ASSERT((slen % 2) == 0);
|
||||
DBUG_ASSERT((tlen % 2) == 0);
|
||||
|
||||
#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
|
||||
diff_if_only_endspace_difference= FALSE;
|
||||
#endif
|
||||
|
||||
while (s < se && t < te)
|
||||
{
|
||||
int s_res= mb_wc(cs, &s_wc, s, se);
|
||||
int t_res= mb_wc(cs, &t_wc, t, te);
|
||||
|
||||
if (s_res <= 0 || t_res <= 0)
|
||||
{
|
||||
/* Incorrect string, compare bytewise */
|
||||
return my_bincmp(s, se, t, te);
|
||||
}
|
||||
|
||||
if (s_wc != t_wc)
|
||||
{
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
}
|
||||
|
||||
s+= s_res;
|
||||
t+= t_res;
|
||||
}
|
||||
|
||||
slen= (size_t) (se - s);
|
||||
tlen= (size_t) (te - t);
|
||||
res= 0;
|
||||
|
||||
if (slen != tlen)
|
||||
{
|
||||
int s_res, swap= 1;
|
||||
if (diff_if_only_endspace_difference)
|
||||
res= 1; /* Assume 's' is bigger */
|
||||
if (slen < tlen)
|
||||
{
|
||||
slen= tlen;
|
||||
s= t;
|
||||
se= te;
|
||||
swap= -1;
|
||||
res= -res;
|
||||
}
|
||||
|
||||
for ( ; s < se; s+= s_res)
|
||||
{
|
||||
if ((s_res= mb_wc(cs, &s_wc, s, se)) < 0)
|
||||
{
|
||||
DBUG_ASSERT(0);
|
||||
return 0;
|
||||
}
|
||||
if (s_wc != ' ')
|
||||
return (s_wc < ' ') ? -swap : swap;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
my_hash_sort_utf16_bin(CHARSET_INFO *cs,
|
||||
const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
|
||||
@ -1747,8 +1529,8 @@ my_hash_sort_utf16_bin(CHARSET_INFO *cs,
|
||||
static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_utf16,
|
||||
my_strnncollsp_utf16,
|
||||
my_strnncoll_utf16_general_ci,
|
||||
my_strnncollsp_utf16_general_ci,
|
||||
my_strnxfrm_unicode,
|
||||
my_strnxfrmlen_unicode,
|
||||
my_like_range_generic,
|
||||
@ -1877,6 +1659,24 @@ struct charset_info_st my_charset_utf16_bin=
|
||||
};
|
||||
|
||||
|
||||
#define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b1))
|
||||
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3))
|
||||
|
||||
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_general_ci
|
||||
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
|
||||
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b1,b0)
|
||||
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b1, b0, b3, b2))
|
||||
#include "strcoll.ic"
|
||||
|
||||
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_bin
|
||||
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
|
||||
#define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b1, b0))
|
||||
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b1, b0, b3, b2))
|
||||
#include "strcoll.ic"
|
||||
|
||||
#undef IS_MB2_CHAR
|
||||
#undef IS_MB4_CHAR
|
||||
|
||||
static int
|
||||
my_utf16le_uni(CHARSET_INFO *cs __attribute__((unused)),
|
||||
my_wc_t *pwc, const uchar *s, const uchar *e)
|
||||
@ -1948,6 +1748,38 @@ my_lengthsp_utf16le(CHARSET_INFO *cs __attribute__((unused)),
|
||||
}
|
||||
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_utf16le_general_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_utf16le_general_ci,
|
||||
my_strnncollsp_utf16le_general_ci,
|
||||
my_strnxfrm_unicode,
|
||||
my_strnxfrmlen_unicode,
|
||||
my_like_range_generic,
|
||||
my_wildcmp_utf16_ci,
|
||||
my_strcasecmp_mb2_or_mb4,
|
||||
my_instr_mb,
|
||||
my_hash_sort_utf16,
|
||||
my_propagate_simple
|
||||
};
|
||||
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_utf16le_bin_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_utf16le_bin,
|
||||
my_strnncollsp_utf16le_bin,
|
||||
my_strnxfrm_unicode_full_bin,
|
||||
my_strnxfrmlen_unicode_full_bin,
|
||||
my_like_range_generic,
|
||||
my_wildcmp_utf16_bin,
|
||||
my_strcasecmp_mb2_or_mb4,
|
||||
my_instr_mb,
|
||||
my_hash_sort_utf16_bin,
|
||||
my_propagate_simple
|
||||
};
|
||||
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_utf16le_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
@ -2012,7 +1844,7 @@ struct charset_info_st my_charset_utf16le_general_ci=
|
||||
0, /* escape_with_backslash_is_dangerous */
|
||||
1, /* levels_for_order */
|
||||
&my_charset_utf16le_handler,
|
||||
&my_collation_utf16_general_ci_handler
|
||||
&my_collation_utf16le_general_ci_handler
|
||||
};
|
||||
|
||||
|
||||
@ -2045,7 +1877,7 @@ struct charset_info_st my_charset_utf16le_bin=
|
||||
0, /* escape_with_backslash_is_dangerous */
|
||||
1, /* levels_for_order */
|
||||
&my_charset_utf16le_handler,
|
||||
&my_collation_utf16_bin_handler
|
||||
&my_collation_utf16le_bin_handler
|
||||
};
|
||||
|
||||
|
||||
@ -3058,6 +2890,31 @@ static const uchar to_upper_ucs2[] = {
|
||||
};
|
||||
|
||||
|
||||
/* Definitions for strcoll.ic */
|
||||
#define IS_MB2_CHAR(x,y) (1)
|
||||
#define UCS2_CODE(b0,b1) (((uchar) b0) << 8 | ((uchar) b1))
|
||||
|
||||
|
||||
/*
  Sort weight of a UCS-2 character for ucs2_general_ci.
  b0 is the high byte and b1 the low byte of the code. The weight is
  taken from my_unicase_default; a code whose page has no entry weighs
  as the code itself.
*/
static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1)
{
  my_wc_t code= UCS2_CODE(b0, b1);
  MY_UNICASE_CHARACTER *plane= my_unicase_default.page[code >> 8];

  if (!plane)
    return (int) code;
  return (int) plane[code & 0xFF].sort;
}
|
||||
|
||||
|
||||
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci
|
||||
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
|
||||
#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1)
|
||||
#include "strcoll.ic"
|
||||
|
||||
|
||||
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin
|
||||
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
|
||||
#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1)
|
||||
#include "strcoll.ic"
|
||||
|
||||
|
||||
static int
|
||||
my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
|
||||
const uchar *s, const uchar *e)
|
||||
@ -3208,120 +3065,6 @@ my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)),
|
||||
}
|
||||
|
||||
|
||||
static int my_strnncoll_ucs2(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool t_is_prefix)
|
||||
{
|
||||
int s_res,t_res;
|
||||
my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc);
|
||||
const uchar *se=s+slen;
|
||||
const uchar *te=t+tlen;
|
||||
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
|
||||
|
||||
while ( s < se && t < te )
|
||||
{
|
||||
s_res=my_ucs2_uni(cs,&s_wc, s, se);
|
||||
t_res=my_ucs2_uni(cs,&t_wc, t, te);
|
||||
|
||||
if ( s_res <= 0 || t_res <= 0 )
|
||||
{
|
||||
/* Incorrect string, compare by char value */
|
||||
return ((int)s[0]-(int)t[0]);
|
||||
}
|
||||
|
||||
my_tosort_ucs2(uni_plane, &s_wc);
|
||||
my_tosort_ucs2(uni_plane, &t_wc);
|
||||
|
||||
if ( s_wc != t_wc )
|
||||
{
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
}
|
||||
|
||||
s+=s_res;
|
||||
t+=t_res;
|
||||
}
|
||||
return (int) (t_is_prefix ? t-te : ((se-s) - (te-t)));
|
||||
}
|
||||
|
||||
/*
|
||||
Compare strings, discarding end space
|
||||
|
||||
SYNOPSIS
|
||||
my_strnncollsp_ucs2()
|
||||
cs character set handler
|
||||
a First string to compare
|
||||
a_length Length of 'a'
|
||||
b Second string to compare
|
||||
b_length Length of 'b'
|
||||
|
||||
IMPLEMENTATION
|
||||
If one string is shorter as the other, then we space extend the other
|
||||
so that the strings have equal length.
|
||||
|
||||
This will ensure that the following things hold:
|
||||
|
||||
"a" == "a "
|
||||
"a\0" < "a"
|
||||
"a\0" < "a "
|
||||
|
||||
RETURN
|
||||
< 0 a < b
|
||||
= 0 a == b
|
||||
> 0 a > b
|
||||
*/
|
||||
|
||||
static int my_strnncollsp_ucs2(CHARSET_INFO *cs __attribute__((unused)),
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool diff_if_only_endspace_difference
|
||||
__attribute__((unused)))
|
||||
{
|
||||
const uchar *se, *te;
|
||||
size_t minlen;
|
||||
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
|
||||
|
||||
/* extra safety to make sure the lengths are even numbers */
|
||||
slen&= ~1;
|
||||
tlen&= ~1;
|
||||
|
||||
se= s + slen;
|
||||
te= t + tlen;
|
||||
|
||||
for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 2)
|
||||
{
|
||||
int s_wc = uni_plane->page[s[0]] ? (int) uni_plane->page[s[0]][s[1]].sort :
|
||||
(((int) s[0]) << 8) + (int) s[1];
|
||||
|
||||
int t_wc = uni_plane->page[t[0]] ? (int) uni_plane->page[t[0]][t[1]].sort :
|
||||
(((int) t[0]) << 8) + (int) t[1];
|
||||
if ( s_wc != t_wc )
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
|
||||
s+= 2;
|
||||
t+= 2;
|
||||
}
|
||||
|
||||
if (slen != tlen)
|
||||
{
|
||||
int swap= 1;
|
||||
if (slen < tlen)
|
||||
{
|
||||
s= t;
|
||||
se= te;
|
||||
swap= -1;
|
||||
}
|
||||
|
||||
for ( ; s < se ; s+= 2)
|
||||
{
|
||||
if (s[0] || s[1] != ' ')
|
||||
return (s[0] == 0 && s[1] < ' ') ? -swap : swap;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static uint my_ismbchar_ucs2(CHARSET_INFO *cs __attribute__((unused)),
|
||||
const char *b,
|
||||
const char *e)
|
||||
@ -3417,85 +3160,6 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
|
||||
}
|
||||
|
||||
|
||||
static
|
||||
int my_strnncoll_ucs2_bin(CHARSET_INFO *cs,
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool t_is_prefix)
|
||||
{
|
||||
int s_res,t_res;
|
||||
my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc);
|
||||
const uchar *se=s+slen;
|
||||
const uchar *te=t+tlen;
|
||||
|
||||
while ( s < se && t < te )
|
||||
{
|
||||
s_res=my_ucs2_uni(cs,&s_wc, s, se);
|
||||
t_res=my_ucs2_uni(cs,&t_wc, t, te);
|
||||
|
||||
if ( s_res <= 0 || t_res <= 0 )
|
||||
{
|
||||
/* Incorrect string, compare by char value */
|
||||
return ((int)s[0]-(int)t[0]);
|
||||
}
|
||||
if ( s_wc != t_wc )
|
||||
{
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
}
|
||||
|
||||
s+=s_res;
|
||||
t+=t_res;
|
||||
}
|
||||
return (int) (t_is_prefix ? t-te : ((se-s) - (te-t)));
|
||||
}
|
||||
|
||||
static int my_strnncollsp_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)),
|
||||
const uchar *s, size_t slen,
|
||||
const uchar *t, size_t tlen,
|
||||
my_bool diff_if_only_endspace_difference
|
||||
__attribute__((unused)))
|
||||
{
|
||||
const uchar *se, *te;
|
||||
size_t minlen;
|
||||
|
||||
/* extra safety to make sure the lengths are even numbers */
|
||||
slen= (slen >> 1) << 1;
|
||||
tlen= (tlen >> 1) << 1;
|
||||
|
||||
se= s + slen;
|
||||
te= t + tlen;
|
||||
|
||||
for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 2)
|
||||
{
|
||||
int s_wc= s[0] * 256 + s[1];
|
||||
int t_wc= t[0] * 256 + t[1];
|
||||
if ( s_wc != t_wc )
|
||||
return s_wc > t_wc ? 1 : -1;
|
||||
|
||||
s+= 2;
|
||||
t+= 2;
|
||||
}
|
||||
|
||||
if (slen != tlen)
|
||||
{
|
||||
int swap= 1;
|
||||
if (slen < tlen)
|
||||
{
|
||||
s= t;
|
||||
se= te;
|
||||
swap= -1;
|
||||
}
|
||||
|
||||
for ( ; s < se ; s+= 2)
|
||||
{
|
||||
if (s[0] || s[1] != ' ')
|
||||
return (s[0] == 0 && s[1] < ' ') ? -swap : swap;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static
|
||||
void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)),
|
||||
const uchar *key, size_t len,ulong *nr1, ulong *nr2)
|
||||
@ -3518,8 +3182,8 @@ void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)),
|
||||
static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_ucs2,
|
||||
my_strnncollsp_ucs2,
|
||||
my_strnncoll_ucs2_general_ci,
|
||||
my_strnncollsp_ucs2_general_ci,
|
||||
my_strnxfrm_unicode,
|
||||
my_strnxfrmlen_unicode,
|
||||
my_like_range_generic,
|
||||
|
@ -64,13 +64,16 @@
|
||||
@return - the number of bytes scanned
|
||||
|
||||
The including source file must define the following macros:
|
||||
IS_MB1_CHAR(x)
|
||||
IS_MB1_MB2HEAD_GAP(x) - optional, for better performance
|
||||
IS_MB2_CHAR(x,y)
|
||||
IS_MB3_CHAR(x,y,z) - for character sets with mbmaxlen>2
|
||||
IS_MB1_CHAR(b0) - for character sets that have MB1 characters
|
||||
IS_MB1_MBHEAD_UNUSED_GAP(b0) - optional, for better performance
|
||||
IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters
|
||||
IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters
|
||||
IS_MB4_CHAR(b0,b1,b2,b3) - for character sets that have MB4 characters
|
||||
WEIGHT_PAD_SPACE
|
||||
WEIGHT_MB1(x)
|
||||
WEIGHT_MB2(x,y)
|
||||
WEIGHT_MB1(b0) - for character sets that have MB1 characters
|
||||
WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters
|
||||
WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters
|
||||
WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters
|
||||
WEIGHT_ILSEQ(x)
|
||||
*/
|
||||
static inline uint
|
||||
@ -82,11 +85,13 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef IS_MB1_CHAR
|
||||
if (IS_MB1_CHAR(*str))
|
||||
{
|
||||
*weight= WEIGHT_MB1(*str); /* A valid single byte character*/
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IS_MB1_MBHEAD_UNUSED_GAP
|
||||
/*
|
||||
@ -98,6 +103,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
|
||||
goto bad;
|
||||
#endif
|
||||
|
||||
#ifdef IS_MB2_CHAR
|
||||
if (str + 2 > end) /* The string ended unexpectedly */
|
||||
goto bad; /* Treat as a bad byte */
|
||||
|
||||
@ -106,6 +112,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
|
||||
*weight= WEIGHT_MB2(str[0], str[1]);
|
||||
return 2; /* A valid two-byte character */
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IS_MB3_CHAR
|
||||
if (str + 3 > end) /* Incomplete three-byte character */
|
||||
|
@ -149,7 +149,7 @@ typedef struct
|
||||
A1A1 - MB2 or 8BIT+8BIT
|
||||
E0E0 - MB2
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_mb2_common[]=
|
||||
static STRNNCOLL_PARAM strcoll_mb2_common[]=
|
||||
{
|
||||
/* Compare two good sequences */
|
||||
{CSTR(""), CSTR(""), 0},
|
||||
@ -210,7 +210,7 @@ STRNNCOLL_PARAM strcoll_mb2_common[]=
|
||||
/*
|
||||
For character sets that have good mb2 characters A1A1 and F9FE
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
|
||||
static STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
|
||||
{
|
||||
/* Compare two good characters */
|
||||
{CSTR(""), CSTR("\xF9\xFE"), -1},
|
||||
@ -246,7 +246,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
|
||||
A1A1 - a good mb2 character
|
||||
F9FE - a bad sequence
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
|
||||
static STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
|
||||
{
|
||||
/* Compare a good character to an illegal or an incomplete sequence */
|
||||
{CSTR(""), CSTR("\xF9\xFE"), -1},
|
||||
@ -283,7 +283,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
|
||||
F9 - ILSEQ or H2
|
||||
F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
|
||||
static STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
|
||||
{
|
||||
/* Compare two good characters */
|
||||
{CSTR(""), CSTR("\xA1"), -1},
|
||||
@ -323,7 +323,7 @@ STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
|
||||
and sort in this order:
|
||||
8181 < A1 < E0E0
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
|
||||
static STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
|
||||
{
|
||||
{CSTR("\x81\x81"), CSTR("\xA1"), -1},
|
||||
{CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
|
||||
@ -336,7 +336,7 @@ STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
|
||||
/*
|
||||
A shared test for eucjpms and ujis.
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_ujis[]=
|
||||
static STRNNCOLL_PARAM strcoll_ujis[]=
|
||||
{
|
||||
{CSTR("\x8E\xA1"), CSTR("\x8E"), -1}, /* Good MB2 vs incomplete MB2 */
|
||||
{CSTR("\x8E\xA1"), CSTR("\x8F\xA1"), -1}, /* Good MB2 vs incomplete MB3 */
|
||||
@ -347,7 +347,7 @@ STRNNCOLL_PARAM strcoll_ujis[]=
|
||||
};
|
||||
|
||||
|
||||
STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
|
||||
static STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
|
||||
{
|
||||
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */
|
||||
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */
|
||||
@ -369,7 +369,7 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
|
||||
};
|
||||
|
||||
|
||||
STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
|
||||
static STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
|
||||
{
|
||||
/* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
|
||||
{CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */
|
||||
@ -412,6 +412,101 @@ STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
|
||||
};
|
||||
|
||||
|
||||
/*
  Comparison cases passed to strcollsp() for the ucs2, utf16 and utf16le
  collations in test_strcollsp(). Each entry holds two byte strings and
  a third value that appears to be the expected sign of the comparison
  (TODO confirm against STRNNCOLL_PARAM). An odd-length string ends in
  an incomplete two-byte character.
*/
static STRNNCOLL_PARAM strcoll_ucs2_common[]=
|
||||
{
|
||||
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Incomplete MB2 vs incomplete MB2 */
|
||||
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Incomplete MB2 vs incomplete MB2 */
|
||||
{CSTR("\xC2\xA1"), CSTR("\xC0"), -1}, /* MB2 vs incomplete MB2 */
|
||||
{CSTR("\xC2\xA1"), CSTR("\xC2"), -1}, /* MB2 vs incomplete MB2 */
|
||||
{CSTR("\xC2\xA0"), CSTR("\xC2\xA1"), -1}, /* MB2 vs MB2 */
|
||||
{CSTR("\xC2\xA1"), CSTR("\xC2\xA2"), -1}, /* MB2 vs MB2 */
|
||||
|
||||
{CSTR("\xFF\xFF"), CSTR("\x00"),-1}, /* MB2 vs incomplete */
|
||||
{CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00"),-1}, /* MB2+MB2 vs incomplete */
|
||||
{CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00\x00\x00"), 1},/* MB2+MB2 vs MB2+incomplete */
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/* Tests that involve comparison to SPACE (explicit, or padded) */
|
||||
static STRNNCOLL_PARAM strcoll_ucs2_space[]=
|
||||
{
|
||||
{CSTR("\x00\x1F"), CSTR("\x00\x20"), -1}, /* MB2 vs MB2 */
|
||||
{CSTR("\x00\x20"), CSTR("\x00\x21"), -1}, /* MB2 vs MB2 */
|
||||
{CSTR("\x00\x1F"), CSTR(""), -1}, /* MB2 vs empty */
|
||||
{CSTR("\x00\x20"), CSTR(""), 0}, /* MB2 vs empty */
|
||||
{CSTR("\x00\x21"), CSTR(""), 1}, /* MB2 vs empty */
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/* Tests that involve comparison to SPACE (explicit, or padded) */
|
||||
static STRNNCOLL_PARAM strcoll_utf16le_space[]=
|
||||
{
|
||||
{CSTR("\x1F\x00"), CSTR("\x20\x00"), -1}, /* MB2 vs MB2 */
|
||||
{CSTR("\x20\x00"), CSTR("\x21\x00"), -1}, /* MB2 vs MB2 */
|
||||
{CSTR("\x1F\x00"), CSTR(""), -1}, /* MB2 vs empty */
|
||||
{CSTR("\x20\x00"), CSTR(""), 0}, /* MB2 vs empty */
|
||||
{CSTR("\x21\x00"), CSTR(""), 1}, /* MB2 vs empty */
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/*
  Comparison cases for the big-endian utf16 collations (passed to
  strcollsp() in test_strcollsp()). Each entry holds two byte strings
  and a third value that appears to be the expected sign of the
  comparison (TODO confirm against STRNNCOLL_PARAM).
*/
static STRNNCOLL_PARAM strcoll_utf16_common[]=
|
||||
{
|
||||
/* Minimum four-byte character: U+10000 == _utf16 0xD800DC00 */
|
||||
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
|
||||
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
|
||||
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */
|
||||
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */
|
||||
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */
|
||||
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC"), -1},/* MB4 vs incomplete MB4 */
|
||||
|
||||
/* Maximum four-byte character: U+10FFFF == _utf16 0xDBFFDFFF */
|
||||
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
|
||||
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
|
||||
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */
|
||||
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */
|
||||
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */
|
||||
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\xFF\xDF"), -1},/* MB4 vs incomplete MB4; NOTE(review): the right side starts with a low-surrogate head 0xDC - 0xDB may have been intended */
|
||||
|
||||
/* Well-formed MB4 vs well-formed MB4, then broken MB4 vs broken MB4 */
|
||||
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC\x01"),-1},/* MB4 vs MB4 (both pairs are well-formed) */
|
||||
{CSTR("\xDB\xFF\xE0\xFE"), CSTR("\xDB\xFF\xE0\xFF"),-1},/* Broken MB4 vs broken MB4 */
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/*
  Comparison cases for the little-endian utf16le collations (passed to
  strcollsp() in test_strcollsp()). Each 16-bit unit is stored low byte
  first. Each entry holds two byte strings and a third value that
  appears to be the expected sign of the comparison (TODO confirm
  against STRNNCOLL_PARAM).
*/
static STRNNCOLL_PARAM strcoll_utf16le_common[]=
|
||||
{
|
||||
/* Minimum four-byte character: U+10000 == _utf16le 0x00D800DC */
|
||||
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
|
||||
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
|
||||
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */
|
||||
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xD0"),-1},/* MB4 vs broken MB4 */
|
||||
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */
|
||||
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00"), -1},/* MB4 vs incomplete MB4 */
|
||||
|
||||
/* Maximum four-byte character: U+10FFFF == _utf16le 0xFFDBFFDF */
|
||||
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
|
||||
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
|
||||
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */
|
||||
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xE0"),-1},/* MB4 vs broken MB4 */
|
||||
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */
|
||||
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xFF\xDC\x00"), -1},/* MB4 vs incomplete MB4; NOTE(review): the right side's first unit has a low-surrogate head 0xDC - 0xDB may have been intended */
|
||||
|
||||
/* Well-formed MB4 vs well-formed MB4, then broken MB4 vs broken MB4 */
|
||||
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x01\xDC"),-1},/* MB4 vs MB4 (both pairs are well-formed) */
|
||||
{CSTR("\xFF\xDB\xFE\xE0"), CSTR("\xFF\xDB\xFF\xE0"),-1},/* Broken MB4 vs broken MB4 */
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
static void
|
||||
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
|
||||
{
|
||||
@ -528,6 +623,12 @@ test_strcollsp()
|
||||
failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
|
||||
failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0);
|
||||
#endif
|
||||
#ifdef HAVE_CHARSET_ucs2
|
||||
failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_common);
|
||||
failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_space);
|
||||
failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_common);
|
||||
failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_space);
|
||||
#endif
|
||||
#ifdef HAVE_CHARSET_ujis
|
||||
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_mb2_common);
|
||||
failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_common);
|
||||
@ -536,6 +637,21 @@ test_strcollsp()
|
||||
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis);
|
||||
failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis);
|
||||
#endif
|
||||
#ifdef HAVE_CHARSET_utf16
|
||||
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_common);
|
||||
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_space);
|
||||
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_utf16_common);
|
||||
failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_common);
|
||||
failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_space);
|
||||
failed+= strcollsp(&my_charset_utf16_bin, strcoll_utf16_common);
|
||||
|
||||
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_ucs2_common);
|
||||
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_space);
|
||||
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_common);
|
||||
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_ucs2_common);
|
||||
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space);
|
||||
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common);
|
||||
#endif
|
||||
#ifdef HAVE_CHARSET_utf8
|
||||
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
|
||||
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
|
||||
|
Reference in New Issue
Block a user