mirror of
https://github.com/MariaDB/server.git
synced 2025-08-08 11:22:35 +03:00
MDEV-8214 Asian MB2 charsets: compare broken bytes as "greater than any non-broken character"
This commit is contained in:
@@ -95,11 +95,361 @@ static CHARSET_INFO *charset_list[]=
|
||||
};
|
||||
|
||||
|
||||
/*
  One test case for a collation comparison: two byte strings given
  as (pointer, length) pairs plus the expected result. Only the sign
  of "res" is compared against the actual result (see eqres()).
*/
typedef struct
{
  const char *a;   /* Left operand bytes                         */
  size_t alen;     /* Length of "a" in bytes                     */
  const char *b;   /* Right operand bytes                        */
  size_t blen;     /* Length of "b" in bytes                     */
  int res;         /* Expected sign: negative, zero or positive  */
} STRNNCOLL_PARAM;
|
||||
|
||||
|
||||
/*
  Expand a string literal into two comma separated items:
  the literal itself and its length without the trailing '\0'.
*/
#define CSTR(lit) (lit), (sizeof(lit) - 1)
|
||||
|
||||
/*
  Byte sequence types used in the tests:
    8BIT     - an 8-bit byte (>= 0x80) which makes a single-byte character
    MB2      - two bytes that make a valid character
    H2       - a byte which is a valid MB2 head byte
    T2       - a byte which is a valid MB2 tail byte
    ILSEQ    - a byte which makes an illegal sequence
    H2+ILSEQ - a sequence that starts with a valid H2 byte,
               but is not followed by a valid T2 byte.

  Charset  H2               T2                      8BIT
  -------  ---------------- ----------------------  --------
  big5     [A1..F9]         [40..7E,A1..FE]
  euckr    [81..FE]         [41..5A,61..7A,81..FE]
  gb2312   [A1..F7]         [A1..FE]
  gbk      [81..FE]         [40..7E,80..FE]

  cp932    [81..9F,E0..FC]  [40..7E,80..FC]         [A1..DF]
  sjis     [81..9F,E0..FC]  [40..7E,80..FC]         [A1..DF]


  Essential byte sequences in various character sets:

  Sequence  big5   cp932      euckr  gb2312    gbk    sjis
  --------  ----   -----      -----  ------    ---    ----
  80        ILSEQ  ILSEQ      ILSEQ  ILSEQ     ILSEQ  ILSEQ
  81        ILSEQ  H2         H2     ILSEQ     H2     H2
  A1        H2     8BIT       H2     H2        H2     8BIT
  A1A1      MB2    8BIT+8BIT  MB2    MB2       MB2    8BIT+8BIT
  E0E0      MB2    MB2        MB2    MB2       MB2    MB2
  F9FE      MB2    H2+ILSEQ   MB2    ILSEQ+T2  MB2    H2+ILSEQ
*/
|
||||
|
||||
|
||||
/*
|
||||
For character sets that have the following byte sequences:
|
||||
80 - ILSEQ
|
||||
81 - ILSEQ or H2
|
||||
F9 - ILSEQ or H2
|
||||
A1A1 - MB2 or 8BIT+8BIT
|
||||
E0E0 - MB2
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_mb2_common[]=
|
||||
{
|
||||
/* Compare two good sequences */
|
||||
{CSTR(""), CSTR(""), 0},
|
||||
{CSTR(""), CSTR(" "), 0},
|
||||
{CSTR(""), CSTR("A"), -1},
|
||||
{CSTR(""), CSTR("a"), -1},
|
||||
{CSTR(""), CSTR("\xA1\xA1"), -1},
|
||||
{CSTR(""), CSTR("\xE0\xE0"), -1},
|
||||
|
||||
{CSTR(" "), CSTR(""), 0},
|
||||
{CSTR(" "), CSTR(" "), 0},
|
||||
{CSTR(" "), CSTR("A"), -1},
|
||||
{CSTR(" "), CSTR("a"), -1},
|
||||
{CSTR(" "), CSTR("\xA1\xA1"), -1},
|
||||
{CSTR(" "), CSTR("\xE0\xE0"), -1},
|
||||
|
||||
{CSTR("a"), CSTR(""), 1},
|
||||
{CSTR("a"), CSTR(" "), 1},
|
||||
{CSTR("a"), CSTR("a"), 0},
|
||||
{CSTR("a"), CSTR("\xA1\xA1"), -1},
|
||||
{CSTR("a"), CSTR("\xE0\xE0"), -1},
|
||||
|
||||
{CSTR("\xA1\xA1"), CSTR("\xA1\xA1"), 0},
|
||||
{CSTR("\xA1\xA1"), CSTR("\xE0\xE0"), -1},
|
||||
|
||||
/* Compare a good character to an illegal or an incomplete sequence */
|
||||
{CSTR(""), CSTR("\x80"), -1},
|
||||
{CSTR(""), CSTR("\x81"), -1},
|
||||
{CSTR(""), CSTR("\xF9"), -1},
|
||||
|
||||
{CSTR(" "), CSTR("\x80"), -1},
|
||||
{CSTR(" "), CSTR("\x81"), -1},
|
||||
{CSTR(" "), CSTR("\xF9"), -1},
|
||||
|
||||
{CSTR("a"), CSTR("\x80"), -1},
|
||||
{CSTR("a"), CSTR("\x81"), -1},
|
||||
{CSTR("a"), CSTR("\xF9"), -1},
|
||||
|
||||
{CSTR("\xA1\xA1"), CSTR("\x80"), -1},
|
||||
{CSTR("\xA1\xA1"), CSTR("\x81"), -1},
|
||||
{CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
|
||||
|
||||
{CSTR("\xE0\xE0"), CSTR("\x80"), -1},
|
||||
{CSTR("\xE0\xE0"), CSTR("\x81"), -1},
|
||||
{CSTR("\xE0\xE0"), CSTR("\xF9"), -1},
|
||||
|
||||
/* Compare two bad/incomplete sequences */
|
||||
{CSTR("\x80"), CSTR("\x80"), 0},
|
||||
{CSTR("\x80"), CSTR("\x81"), -1},
|
||||
{CSTR("\x80"), CSTR("\xF9"), -1},
|
||||
{CSTR("\x81"), CSTR("\x81"), 0},
|
||||
{CSTR("\x81"), CSTR("\xF9"), -1},
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
For character sets that have good mb2 characters A1A1 and F9FE
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
|
||||
{
|
||||
/* Compare two good characters */
|
||||
{CSTR(""), CSTR("\xF9\xFE"), -1},
|
||||
{CSTR(" "), CSTR("\xF9\xFE"), -1},
|
||||
{CSTR("a") , CSTR("\xF9\xFE"), -1},
|
||||
{CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
|
||||
|
||||
/* Compare a good character to an illegal or an incomplete sequence */
|
||||
{CSTR(""), CSTR("\xA1"), -1},
|
||||
{CSTR(""), CSTR("\xF9"), -1},
|
||||
{CSTR("a"), CSTR("\xA1"), -1},
|
||||
{CSTR("a"), CSTR("\xF9"), -1},
|
||||
|
||||
{CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
|
||||
{CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
|
||||
|
||||
{CSTR("\xF9\xFE"), CSTR("\x80"), -1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\x81"), -1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xA1"), -1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xF9"), -1},
|
||||
|
||||
/* Compare two bad/incomplete sequences */
|
||||
{CSTR("\x80"), CSTR("\xA1"), -1},
|
||||
{CSTR("\x80"), CSTR("\xF9"), -1},
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
For character sets that have:
|
||||
A1A1 - a good mb2 character
|
||||
F9FE - a bad sequence
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
|
||||
{
|
||||
/* Compare a good character to an illegal or an incomplete sequence */
|
||||
{CSTR(""), CSTR("\xF9\xFE"), -1},
|
||||
{CSTR(" "), CSTR("\xF9\xFE"), -1},
|
||||
{CSTR("a") , CSTR("\xF9\xFE"), -1},
|
||||
{CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
|
||||
|
||||
{CSTR(""), CSTR("\xA1"), -1},
|
||||
{CSTR(""), CSTR("\xF9"), -1},
|
||||
{CSTR("a"), CSTR("\xA1"), -1},
|
||||
{CSTR("a"), CSTR("\xF9"), -1},
|
||||
|
||||
{CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
|
||||
{CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
|
||||
|
||||
/* Compare two bad/incomplete sequences */
|
||||
{CSTR("\xF9\xFE"), CSTR("\x80"), 1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\x81"), 1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
|
||||
{CSTR("\x80"), CSTR("\xA1"), -1},
|
||||
{CSTR("\x80"), CSTR("\xF9"), -1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
For character sets that have:
|
||||
80 - ILSEQ or H2
|
||||
81 - ILSEQ or H2
|
||||
A1 - 8BIT
|
||||
F9 - ILSEQ or H2
|
||||
F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
|
||||
{
|
||||
/* Compare two good characters */
|
||||
{CSTR(""), CSTR("\xA1"), -1},
|
||||
{CSTR("\xA1\xA1"), CSTR("\xA1"), 1},
|
||||
|
||||
/* Compare a good character to an illegal or an incomplete sequence */
|
||||
{CSTR(""), CSTR("\xF9"), -1},
|
||||
{CSTR(""), CSTR("\xF9\xFE"), -1},
|
||||
{CSTR(" "), CSTR("\xF9\xFE"), -1},
|
||||
{CSTR("a"), CSTR("\xF9\xFE"), -1},
|
||||
{CSTR("a"), CSTR("\xA1"), -1},
|
||||
{CSTR("a"), CSTR("\xF9"), -1},
|
||||
|
||||
{CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
|
||||
{CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
|
||||
|
||||
{CSTR("\xF9\xFE"), CSTR("\x80"), 1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\x81"), 1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
|
||||
|
||||
{CSTR("\x80"), CSTR("\xA1"), 1},
|
||||
|
||||
/* Compare two bad/incomplete sequences */
|
||||
{CSTR("\x80"), CSTR("\xF9"), -1},
|
||||
{CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
For character sets (e.g. cp932 and sjis) that have:
|
||||
8181 - a valid MB2 character
|
||||
A1 - a valid 8BIT character
|
||||
E0E0 - a valid MB2 character
|
||||
and sort in this order:
|
||||
8181 < A1 < E0E0
|
||||
*/
|
||||
STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
|
||||
{
|
||||
{CSTR("\x81\x81"), CSTR("\xA1"), -1},
|
||||
{CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
|
||||
{CSTR("\xA1"), CSTR("\xE0\xE0"), -1},
|
||||
|
||||
{NULL, 0, NULL, 0, 0}
|
||||
};
|
||||
|
||||
|
||||
/*
  Convert a byte string into upper-case hexadecimal text.

  @param dst     Destination buffer; receives a '\0'-terminated string
  @param dstlen  Size of the destination buffer in bytes
  @param src     Source bytes
  @param srclen  Number of source bytes

  Two hex digits are written per source byte. Conversion stops early
  when fewer than four bytes of room are left at the current output
  position, so long input is silently truncated.
  Nothing is written when dstlen is 0.
*/
static void
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
{
  static const char hex_digits[]= "0123456789ABCDEF";
  size_t pos= 0;
  size_t i;

  /* No room even for the terminator: do not touch the buffer */
  if (dstlen == 0)
    return;

  /*
    Work with offsets rather than "dst + 3 < dstend", so that no
    pointer beyond the end of a small buffer is ever formed.
  */
  for (i= 0; i < srclen && pos + 3 < dstlen; i++)
  {
    unsigned char byte= (unsigned char) src[i];
    dst[pos++]= hex_digits[byte >> 4];
    dst[pos++]= hex_digits[byte & 0x0F];
  }
  dst[pos]= '\0';
}
|
||||
|
||||
|
||||
/*
  Tell whether two comparison results mean the same thing, i.e.
  whether they have the same sign (both negative, both positive,
  or both zero).

  @return 1 if the signs match, 0 otherwise
*/
static int
eqres(int ares, int bres)
{
  /* Reduce each result to -1, 0 or +1 and compare */
  int asign= (ares > 0) - (ares < 0);
  int bsign= (bres > 0) - (bres < 0);
  return asign == bsign;
}
|
||||
|
||||
|
||||
static int
|
||||
strcollsp(CHARSET_INFO *cs, const STRNNCOLL_PARAM *param)
|
||||
{
|
||||
int failed= 0;
|
||||
const STRNNCOLL_PARAM *p;
|
||||
diag("%-20s %-10s %-10s %10s %10s", "Collation", "a", "b", "ExpectSign", "Actual");
|
||||
for (p= param; p->a; p++)
|
||||
{
|
||||
char ahex[64], bhex[64];
|
||||
int res= cs->coll->strnncollsp(cs, (uchar *) p->a, p->alen,
|
||||
(uchar *) p->b, p->blen, 0);
|
||||
str2hex(ahex, sizeof(ahex), p->a, p->alen);
|
||||
str2hex(bhex, sizeof(bhex), p->b, p->blen);
|
||||
diag("%-20s %-10s %-10s %10d %10d%s",
|
||||
cs->name, ahex, bhex, p->res, res,
|
||||
eqres(res, p->res) ? "" : " FAILED");
|
||||
if (!eqres(res, p->res))
|
||||
{
|
||||
failed++;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Test in reverse order */
|
||||
res= cs->coll->strnncollsp(cs, (uchar *) p->b, p->blen,
|
||||
(uchar *) p->a, p->alen, 0);
|
||||
if (!eqres(res, -p->res))
|
||||
{
|
||||
diag("Comparison in reverse order failed. Expected %d, got %d",
|
||||
-p->res, res);
|
||||
failed++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return failed;
|
||||
}
|
||||
|
||||
|
||||
/*
  Run the strnncollsp() test lists against the _ci and _bin
  collations of every Asian MB2 character set that is compiled in
  (big5, cp932, euckr, gb2312, gbk, sjis).

  @return Total number of failed checks
*/
static int
test_strcollsp()
{
  int nfailed= 0;

/* Run one list of test cases against one collation, adding up failures */
#define RUN_STRCOLLSP(cs, cases) \
  do { nfailed+= strcollsp(&(cs), (cases)); } while (0)

#ifdef HAVE_CHARSET_big5
  RUN_STRCOLLSP(my_charset_big5_chinese_ci, strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_big5_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
  RUN_STRCOLLSP(my_charset_big5_bin,        strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_big5_bin,        strcoll_mb2_A1A1_mb2_F9FE);
#endif

#ifdef HAVE_CHARSET_cp932
  RUN_STRCOLLSP(my_charset_cp932_japanese_ci, strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_cp932_japanese_ci, strcoll_mb1_A1_bad_F9FE);
  RUN_STRCOLLSP(my_charset_cp932_bin,         strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_cp932_bin,         strcoll_mb1_A1_bad_F9FE);
  RUN_STRCOLLSP(my_charset_cp932_japanese_ci, strcoll_8181_A1_E0E0);
  RUN_STRCOLLSP(my_charset_cp932_bin,         strcoll_8181_A1_E0E0);
#endif

#ifdef HAVE_CHARSET_euckr
  RUN_STRCOLLSP(my_charset_euckr_korean_ci, strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_euckr_korean_ci, strcoll_mb2_A1A1_mb2_F9FE);
  RUN_STRCOLLSP(my_charset_euckr_bin,       strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_euckr_bin,       strcoll_mb2_A1A1_mb2_F9FE);
#endif

#ifdef HAVE_CHARSET_gb2312
  RUN_STRCOLLSP(my_charset_gb2312_chinese_ci, strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_gb2312_chinese_ci, strcoll_mb2_A1A1_bad_F9FE);
  RUN_STRCOLLSP(my_charset_gb2312_bin,        strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_gb2312_bin,        strcoll_mb2_A1A1_bad_F9FE);
#endif

#ifdef HAVE_CHARSET_gbk
  RUN_STRCOLLSP(my_charset_gbk_chinese_ci, strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_gbk_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
  RUN_STRCOLLSP(my_charset_gbk_bin,        strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_gbk_bin,        strcoll_mb2_A1A1_mb2_F9FE);
#endif

#ifdef HAVE_CHARSET_sjis
  RUN_STRCOLLSP(my_charset_sjis_japanese_ci, strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_sjis_bin,         strcoll_mb2_common);
  RUN_STRCOLLSP(my_charset_sjis_japanese_ci, strcoll_mb1_A1_bad_F9FE);
  RUN_STRCOLLSP(my_charset_sjis_bin,         strcoll_mb1_A1_bad_F9FE);
  RUN_STRCOLLSP(my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
  RUN_STRCOLLSP(my_charset_sjis_bin,         strcoll_8181_A1_E0E0);
#endif

#undef RUN_STRCOLLSP

  return nfailed;
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
size_t i, failed= 0;
|
||||
|
||||
plan(1);
|
||||
plan(2);
|
||||
diag("Testing my_like_range_xxx() functions");
|
||||
|
||||
for (i= 0; i < array_elements(charset_list); i++)
|
||||
@@ -112,5 +462,10 @@ int main()
|
||||
}
|
||||
}
|
||||
ok(failed == 0, "Testing my_like_range_xxx() functions");
|
||||
|
||||
diag("Testing cs->coll->strnncollsp()");
|
||||
failed= test_strcollsp();
|
||||
ok(failed == 0, "Testing cs->coll->strnncollsp()");
|
||||
|
||||
return exit_status();
|
||||
}
|
||||
|
Reference in New Issue
Block a user