mirror of
https://github.com/MariaDB/server.git
synced 2025-07-30 16:24:05 +03:00
MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
This commit is contained in:
@ -1802,5 +1802,28 @@ DROP TABLE t1;
|
|||||||
--echo #
|
--echo #
|
||||||
|
|
||||||
--echo #
|
--echo #
|
||||||
--echo # End of tests
|
--echo # ctype_utf8mb4.inc: Start of 10.1 tests
|
||||||
|
--echo #
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
|
||||||
|
--echo #
|
||||||
|
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
|
||||||
|
INSERT INTO t1 VALUES (0x61);
|
||||||
|
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
|
||||||
|
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
|
||||||
|
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a;
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a DESC;
|
||||||
|
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a;
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a DESC;
|
||||||
|
DROP TABLE t1;
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # ctype_utf8mb4.inc: End of 10.1 tests
|
||||||
|
--echo #
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # End of ctype_utf8mb4.inc
|
||||||
--echo #
|
--echo #
|
||||||
|
@ -2495,5 +2495,57 @@ DROP TABLE t1;
|
|||||||
# End of 5.5 tests
|
# End of 5.5 tests
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# End of tests
|
# ctype_utf8mb4.inc: Start of 10.1 tests
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
|
||||||
|
#
|
||||||
|
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
|
||||||
|
INSERT INTO t1 VALUES (0x61);
|
||||||
|
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
|
||||||
|
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
|
||||||
|
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a;
|
||||||
|
HEX(a)
|
||||||
|
61
|
||||||
|
C280
|
||||||
|
DFBF
|
||||||
|
E0A080
|
||||||
|
EFBFBF
|
||||||
|
F0908080
|
||||||
|
F48FBFBF
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a DESC;
|
||||||
|
HEX(a)
|
||||||
|
F48FBFBF
|
||||||
|
F0908080
|
||||||
|
EFBFBF
|
||||||
|
E0A080
|
||||||
|
DFBF
|
||||||
|
C280
|
||||||
|
61
|
||||||
|
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a;
|
||||||
|
HEX(a)
|
||||||
|
61
|
||||||
|
C280
|
||||||
|
DFBF
|
||||||
|
E0A080
|
||||||
|
EFBFBF
|
||||||
|
F0908080
|
||||||
|
F48FBFBF
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a DESC;
|
||||||
|
HEX(a)
|
||||||
|
F48FBFBF
|
||||||
|
F0908080
|
||||||
|
EFBFBF
|
||||||
|
E0A080
|
||||||
|
DFBF
|
||||||
|
C280
|
||||||
|
61
|
||||||
|
DROP TABLE t1;
|
||||||
|
#
|
||||||
|
# ctype_utf8mb4.inc: End of 10.1 tests
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# End of ctype_utf8mb4.inc
|
||||||
#
|
#
|
||||||
|
@ -2642,5 +2642,57 @@ DROP TABLE t1;
|
|||||||
# End of 5.5 tests
|
# End of 5.5 tests
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# End of tests
|
# ctype_utf8mb4.inc: Start of 10.1 tests
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
|
||||||
|
#
|
||||||
|
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
|
||||||
|
INSERT INTO t1 VALUES (0x61);
|
||||||
|
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
|
||||||
|
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
|
||||||
|
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a;
|
||||||
|
HEX(a)
|
||||||
|
61
|
||||||
|
C280
|
||||||
|
DFBF
|
||||||
|
E0A080
|
||||||
|
EFBFBF
|
||||||
|
F0908080
|
||||||
|
F48FBFBF
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a DESC;
|
||||||
|
HEX(a)
|
||||||
|
F48FBFBF
|
||||||
|
F0908080
|
||||||
|
EFBFBF
|
||||||
|
E0A080
|
||||||
|
DFBF
|
||||||
|
C280
|
||||||
|
61
|
||||||
|
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a;
|
||||||
|
HEX(a)
|
||||||
|
61
|
||||||
|
C280
|
||||||
|
DFBF
|
||||||
|
E0A080
|
||||||
|
EFBFBF
|
||||||
|
F0908080
|
||||||
|
F48FBFBF
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a DESC;
|
||||||
|
HEX(a)
|
||||||
|
F48FBFBF
|
||||||
|
F0908080
|
||||||
|
EFBFBF
|
||||||
|
E0A080
|
||||||
|
DFBF
|
||||||
|
C280
|
||||||
|
61
|
||||||
|
DROP TABLE t1;
|
||||||
|
#
|
||||||
|
# ctype_utf8mb4.inc: End of 10.1 tests
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# End of ctype_utf8mb4.inc
|
||||||
#
|
#
|
||||||
|
@ -2642,5 +2642,57 @@ DROP TABLE t1;
|
|||||||
# End of 5.5 tests
|
# End of 5.5 tests
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# End of tests
|
# ctype_utf8mb4.inc: Start of 10.1 tests
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
|
||||||
|
#
|
||||||
|
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
|
||||||
|
INSERT INTO t1 VALUES (0x61);
|
||||||
|
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
|
||||||
|
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
|
||||||
|
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a;
|
||||||
|
HEX(a)
|
||||||
|
61
|
||||||
|
C280
|
||||||
|
DFBF
|
||||||
|
E0A080
|
||||||
|
EFBFBF
|
||||||
|
F0908080
|
||||||
|
F48FBFBF
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a DESC;
|
||||||
|
HEX(a)
|
||||||
|
F48FBFBF
|
||||||
|
F0908080
|
||||||
|
EFBFBF
|
||||||
|
E0A080
|
||||||
|
DFBF
|
||||||
|
C280
|
||||||
|
61
|
||||||
|
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a;
|
||||||
|
HEX(a)
|
||||||
|
61
|
||||||
|
C280
|
||||||
|
DFBF
|
||||||
|
E0A080
|
||||||
|
EFBFBF
|
||||||
|
F0908080
|
||||||
|
F48FBFBF
|
||||||
|
SELECT HEX(a) FROM t1 ORDER BY a DESC;
|
||||||
|
HEX(a)
|
||||||
|
F48FBFBF
|
||||||
|
F0908080
|
||||||
|
EFBFBF
|
||||||
|
E0A080
|
||||||
|
DFBF
|
||||||
|
C280
|
||||||
|
61
|
||||||
|
DROP TABLE t1;
|
||||||
|
#
|
||||||
|
# ctype_utf8mb4.inc: End of 10.1 tests
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# End of ctype_utf8mb4.inc
|
||||||
#
|
#
|
||||||
|
@ -85,7 +85,8 @@
|
|||||||
IS_CONTINUATION_BYTE(b3) && \
|
IS_CONTINUATION_BYTE(b3) && \
|
||||||
(b0 >= 0xf1 || b1 >= 0x90) && \
|
(b0 >= 0xf1 || b1 >= 0x90) && \
|
||||||
(b0 <= 0xf3 || b1 <= 0x8F))
|
(b0 <= 0xf3 || b1 <= 0x8F))
|
||||||
|
#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
|
||||||
|
IS_UTF8MB4_STEP2(b0,b1,b2,b3))
|
||||||
|
|
||||||
/* Convert individual bytes to Unicode code points */
|
/* Convert individual bytes to Unicode code points */
|
||||||
#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\
|
#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\
|
||||||
@ -7622,146 +7623,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
|
||||||
my_strnncoll_utf8mb4(CHARSET_INFO *cs,
|
|
||||||
const uchar *s, size_t slen,
|
|
||||||
const uchar *t, size_t tlen,
|
|
||||||
my_bool t_is_prefix)
|
|
||||||
{
|
|
||||||
my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
|
|
||||||
const uchar *se= s + slen;
|
|
||||||
const uchar *te= t + tlen;
|
|
||||||
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
|
|
||||||
|
|
||||||
while ( s < se && t < te )
|
|
||||||
{
|
|
||||||
int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se);
|
|
||||||
int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te);
|
|
||||||
|
|
||||||
if ( s_res <= 0 || t_res <= 0 )
|
|
||||||
{
|
|
||||||
/* Incorrect string, compare bytewise */
|
|
||||||
return bincmp_utf8mb4(s, se, t, te);
|
|
||||||
}
|
|
||||||
|
|
||||||
my_tosort_unicode(uni_plane, &s_wc, cs->state);
|
|
||||||
my_tosort_unicode(uni_plane, &t_wc, cs->state);
|
|
||||||
|
|
||||||
if ( s_wc != t_wc )
|
|
||||||
{
|
|
||||||
return s_wc > t_wc ? 1 : -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
s+= s_res;
|
|
||||||
t+= t_res;
|
|
||||||
}
|
|
||||||
return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
|
|
||||||
Compare strings, discarding end space
|
|
||||||
|
|
||||||
If one string is shorter than the other, then we space extend the other
|
|
||||||
so that the strings have equal length.
|
|
||||||
|
|
||||||
This will ensure that the following things hold:
|
|
||||||
|
|
||||||
"a" == "a "
|
|
||||||
"a\0" < "a"
|
|
||||||
"a\0" < "a "
|
|
||||||
|
|
||||||
@param cs Character set pointer.
|
|
||||||
@param a First string to compare.
|
|
||||||
@param a_length Length of 'a'.
|
|
||||||
@param b Second string to compare.
|
|
||||||
@param b_length Length of 'b'.
|
|
||||||
@param diff_if_only_endspace_difference
|
|
||||||
Set to 1 if the strings should be regarded as different
|
|
||||||
if they differ only in end space
|
|
||||||
|
|
||||||
@return Comparison result.
|
|
||||||
@retval Negative number, if a less than b.
|
|
||||||
@retval 0, if a is equal to b
|
|
||||||
@retval Positive number, if a > b
|
|
||||||
*/
|
|
||||||
|
|
||||||
static int
|
|
||||||
my_strnncollsp_utf8mb4(CHARSET_INFO *cs,
|
|
||||||
const uchar *s, size_t slen,
|
|
||||||
const uchar *t, size_t tlen,
|
|
||||||
my_bool diff_if_only_endspace_difference)
|
|
||||||
{
|
|
||||||
int res;
|
|
||||||
my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
|
|
||||||
const uchar *se= s + slen, *te= t + tlen;
|
|
||||||
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
|
|
||||||
|
|
||||||
#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
|
|
||||||
diff_if_only_endspace_difference= FALSE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
while ( s < se && t < te )
|
|
||||||
{
|
|
||||||
int s_res= my_mb_wc_utf8mb4(cs, &s_wc, s, se);
|
|
||||||
int t_res= my_mb_wc_utf8mb4(cs, &t_wc, t, te);
|
|
||||||
|
|
||||||
if ( s_res <= 0 || t_res <= 0 )
|
|
||||||
{
|
|
||||||
/* Incorrect string, compare bytewise */
|
|
||||||
return bincmp_utf8mb4(s, se, t, te);
|
|
||||||
}
|
|
||||||
|
|
||||||
my_tosort_unicode(uni_plane, &s_wc, cs->state);
|
|
||||||
my_tosort_unicode(uni_plane, &t_wc, cs->state);
|
|
||||||
|
|
||||||
if ( s_wc != t_wc )
|
|
||||||
{
|
|
||||||
return s_wc > t_wc ? 1 : -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
s+=s_res;
|
|
||||||
t+=t_res;
|
|
||||||
}
|
|
||||||
|
|
||||||
slen= (size_t) (se-s);
|
|
||||||
tlen= (size_t) (te-t);
|
|
||||||
res= 0;
|
|
||||||
|
|
||||||
if (slen != tlen)
|
|
||||||
{
|
|
||||||
int swap= 1;
|
|
||||||
if (diff_if_only_endspace_difference)
|
|
||||||
res= 1; /* Assume 'a' is bigger */
|
|
||||||
if (slen < tlen)
|
|
||||||
{
|
|
||||||
slen= tlen;
|
|
||||||
s= t;
|
|
||||||
se= te;
|
|
||||||
swap= -1;
|
|
||||||
res= -res;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
The following loop uses the fact that in UTF-8
|
|
||||||
all multibyte characters are greater than space,
|
|
||||||
and all multibyte head characters are greater than
|
|
||||||
space. It means if we meet a character greater
|
|
||||||
than space, it always means that the longer string
|
|
||||||
is greater. So we can reuse the same loop from the
|
|
||||||
8bit version, without having to process full multibyte
|
|
||||||
sequences.
|
|
||||||
*/
|
|
||||||
for ( ; s < se; s++)
|
|
||||||
{
|
|
||||||
if (*s != ' ')
|
|
||||||
return (*s < ' ') ? -swap : swap;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Compare 0-terminated UTF8 strings.
|
Compare 0-terminated UTF8 strings.
|
||||||
|
|
||||||
@ -7906,6 +7767,30 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
|
|||||||
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
|
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
|
||||||
/* my_well_formed_char_length_utf8mb4 */
|
/* my_well_formed_char_length_utf8mb4 */
|
||||||
|
|
||||||
|
|
||||||
|
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci
|
||||||
|
#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3)
|
||||||
|
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
|
||||||
|
#define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0)
|
||||||
|
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf8_general_ci(b0,b1)
|
||||||
|
#define WEIGHT_MB3(b0,b1,b2) my_weight_mb3_utf8_general_ci(b0,b1,b2)
|
||||||
|
/*
|
||||||
|
There is no mapping between code point and weight for non-BMP characters
|
||||||
|
in utf8mb4_general_ci. Just using code point as weight.
|
||||||
|
*/
|
||||||
|
#define WEIGHT_MB4(b0,b1,b2,b3) UTF8MB4_CODE(b0,b1,b2,b3)
|
||||||
|
#include "strcoll.ic"
|
||||||
|
|
||||||
|
|
||||||
|
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_bin
|
||||||
|
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
|
||||||
|
#define WEIGHT_MB1(b0) ((int) (uchar) (b0))
|
||||||
|
#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1))
|
||||||
|
#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2))
|
||||||
|
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3))
|
||||||
|
#include "strcoll.ic"
|
||||||
|
|
||||||
|
|
||||||
static uint
|
static uint
|
||||||
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
|
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
|
||||||
{
|
{
|
||||||
@ -7934,8 +7819,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c)
|
|||||||
static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
|
static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
|
||||||
{
|
{
|
||||||
NULL, /* init */
|
NULL, /* init */
|
||||||
my_strnncoll_utf8mb4,
|
my_strnncoll_utf8mb4_general_ci,
|
||||||
my_strnncollsp_utf8mb4,
|
my_strnncollsp_utf8mb4_general_ci,
|
||||||
my_strnxfrm_unicode,
|
my_strnxfrm_unicode,
|
||||||
my_strnxfrmlen_unicode,
|
my_strnxfrmlen_unicode,
|
||||||
my_like_range_mb,
|
my_like_range_mb,
|
||||||
@ -7950,8 +7835,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
|
|||||||
static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
|
static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
|
||||||
{
|
{
|
||||||
NULL, /* init */
|
NULL, /* init */
|
||||||
my_strnncoll_mb_bin,
|
my_strnncoll_utf8mb4_bin,
|
||||||
my_strnncollsp_mb_bin,
|
my_strnncollsp_utf8mb4_bin,
|
||||||
my_strnxfrm_unicode_full_bin,
|
my_strnxfrm_unicode_full_bin,
|
||||||
my_strnxfrmlen_unicode_full_bin,
|
my_strnxfrmlen_unicode_full_bin,
|
||||||
my_like_range_mb,
|
my_like_range_mb,
|
||||||
|
@ -118,6 +118,18 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef IS_MB4_CHAR
|
||||||
|
if (str + 4 > end) /* Incomplete four-byte character */
|
||||||
|
goto bad;
|
||||||
|
|
||||||
|
if (IS_MB4_CHAR(str[0], str[1], str[2], str[3]))
|
||||||
|
{
|
||||||
|
*weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]);
|
||||||
|
return 4; /* A valid four-byte character */
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
bad:
|
bad:
|
||||||
*weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */
|
*weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */
|
||||||
return 1;
|
return 1;
|
||||||
@ -252,4 +264,5 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
|
|||||||
#undef WEIGHT_MB1
|
#undef WEIGHT_MB1
|
||||||
#undef WEIGHT_MB2
|
#undef WEIGHT_MB2
|
||||||
#undef WEIGHT_MB3
|
#undef WEIGHT_MB3
|
||||||
|
#undef WEIGHT_MB4
|
||||||
#undef WEIGHT_PAD_SPACE
|
#undef WEIGHT_PAD_SPACE
|
||||||
|
@ -369,6 +369,49 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
|
||||||
|
{
|
||||||
|
/* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
|
||||||
|
{CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */
|
||||||
|
{CSTR("\xF0\x90\x80\x80"), CSTR("\xC2"), -1}, /* MB4 vs incomplete MB2 */
|
||||||
|
{CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\x7F"),-1}, /* MB4 vs broken MB3 */
|
||||||
|
{CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\xC0"),-1}, /* MB4 vs broken MB3 */
|
||||||
|
{CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0"), -1}, /* MB4 vs incomplete MB3 */
|
||||||
|
{CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80"),-1}, /* MB4 vs incomplete MB4 */
|
||||||
|
{CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */
|
||||||
|
{CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */
|
||||||
|
|
||||||
|
/* Maximum four-byte character: U+10FFFF == _utf8 0xF48FBFBF */
|
||||||
|
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */
|
||||||
|
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC2"), -1}, /* MB4 vs incomplete MB2 */
|
||||||
|
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\x7F"),-1}, /* MB4 vs broken MB3 */
|
||||||
|
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\xC0"),-1}, /* MB4 vs broken MB3 */
|
||||||
|
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0"), -1}, /* MB4 vs incomplete MB3 */
|
||||||
|
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80"),-1}, /* MB4 vs incomplete MB4 */
|
||||||
|
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */
|
||||||
|
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */
|
||||||
|
|
||||||
|
/* Broken MB4 vs incomplete/broken MB3 */
|
||||||
|
{CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0"), 1}, /* Broken MB4 vs incomplete MB3 */
|
||||||
|
{CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\x7F"),1}, /* Broken MB4 vs broken MB3 */
|
||||||
|
{CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\xC0"),1}, /* Broken MB4 vs broken MB3 */
|
||||||
|
|
||||||
|
/*
|
||||||
|
Broken MB4 vs incomplete MB4:
|
||||||
|
The three leftmost bytes are compared binary, the fourth byte is compared
|
||||||
|
to auto-padded space.
|
||||||
|
*/
|
||||||
|
{CSTR("\xF0\x90\x80\x1F"), CSTR("\xF0\x90\x80"),-1}, /* Broken MB4 vs incomplete MB4 */
|
||||||
|
{CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80"),1}, /* Broken MB4 vs incomplete MB4 */
|
||||||
|
|
||||||
|
/* Broken MB4 vs broken MB4 */
|
||||||
|
{CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\x7F"),-1},/* Broken MB4 vs broken MB4 */
|
||||||
|
{CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\xC0"),-1},/* Broken MB4 vs broken MB4 */
|
||||||
|
|
||||||
|
{NULL, 0, NULL, 0, 0}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
|
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
|
||||||
{
|
{
|
||||||
@ -497,6 +540,12 @@ test_strcollsp()
|
|||||||
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
|
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
|
||||||
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
|
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
|
||||||
failed+= strcollsp(&my_charset_utf8_bin, strcoll_utf8mb3_common);
|
failed+= strcollsp(&my_charset_utf8_bin, strcoll_utf8mb3_common);
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_CHARSET_utf8mb4
|
||||||
|
failed+= strcollsp(&my_charset_utf8mb4_general_ci, strcoll_utf8mb3_common);
|
||||||
|
failed+= strcollsp(&my_charset_utf8mb4_bin, strcoll_utf8mb3_common);
|
||||||
|
failed+= strcollsp(&my_charset_utf8mb4_general_ci, strcoll_utf8mb4_common);
|
||||||
|
failed+= strcollsp(&my_charset_utf8mb4_bin, strcoll_utf8mb4_common);
|
||||||
#endif
|
#endif
|
||||||
return failed;
|
return failed;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user