1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-08 11:22:35 +03:00

MDEV-8214 Asian MB2 charsets: compare broken bytes as "greater than any non-broken character"

This commit is contained in:
Alexander Barkov
2015-06-26 13:40:28 +04:00
parent d535728165
commit 4f828a1cac
9 changed files with 830 additions and 373 deletions

View File

@@ -95,11 +95,361 @@ static CHARSET_INFO *charset_list[]=
};
/*
One test case for cs->coll->strnncollsp(): two byte strings given with
explicit lengths, and the expected result of comparing "a" to "b".
Tables of these entries are terminated by an entry whose "a" is NULL.
*/
typedef struct
{
/* Left operand and its length in bytes */
const char *a;
size_t alen;
/* Right operand and its length in bytes */
const char *b;
size_t blen;
/* Expected result; the test only checks its sign (<0, 0 or >0) */
int res;
} STRNNCOLL_PARAM;
/*
Expand a string literal into two comma-separated initializers: the literal
itself and its length without the terminating NUL. The length comes from
sizeof, so the argument must be a literal (or an array), never a pointer.
*/
#define CSTR(x) (x),(sizeof(x)-1)
/*
Byte sequence types used in the tests:
8BIT     - an 8-bit byte (>= 0x80) which makes a single-byte character
MB2      - two bytes that make a valid character
H2       - a byte which is a valid MB2 head byte
T2       - a byte which is a valid MB2 tail byte
ILSEQ    - a byte which makes an illegal sequence
H2+ILSEQ - a sequence that starts with a valid H2 byte,
           but is not followed by a valid T2 byte.
Charset  H2               T2                      8BIT
-------  ---------------  ----------------------  --------
big5     [A1..F9]         [40..7E,A1..FE]
euckr    [81..FE]         [41..5A,61..7A,81..FE]
gb2312   [A1..F7]         [A1..FE]
gbk      [81..FE]         [40..7E,80..FE]
cp932    [81..9F,E0..FC]  [40..7E,80..FC]         [A1..DF]
sjis     [81..9F,E0..FC]  [40..7E,80..FC]         [A1..DF]
Essential byte sequences in various character sets:
Sequence  big5   cp932      euckr  gb2312    gbk    sjis
--------  -----  ---------  -----  --------  -----  ---------
80        ILSEQ  ILSEQ      ILSEQ  ILSEQ     ILSEQ  ILSEQ
81        ILSEQ  H2         H2     ILSEQ     H2     H2
A1        H2     8BIT       H2     H2        H2     8BIT
A1A1      MB2    8BIT+8BIT  MB2    MB2       MB2    8BIT+8BIT
E0E0      MB2    MB2        MB2    MB2       MB2    MB2
F9FE      MB2    H2+ILSEQ   MB2    ILSEQ+T2  MB2    H2+ILSEQ
*/
/*
Test cases that are run for every tested collation, i.e. for character sets
that have the following byte sequences:
80 - ILSEQ
81 - ILSEQ or H2
F9 - ILSEQ or H2
A1A1 - MB2 or 8BIT+8BIT
E0E0 - MB2
Each entry is {a, alen, b, blen, expected sign of comparing a to b}.
The expectations encode the rule under test: a broken or incomplete byte
compares greater than any good character, and broken bytes compare
between themselves by their byte value.
*/
STRNNCOLL_PARAM strcoll_mb2_common[]=
{
/* Compare two good sequences */
/* Note: an empty string and a single space are expected to be equal */
{CSTR(""), CSTR(""), 0},
{CSTR(""), CSTR(" "), 0},
{CSTR(""), CSTR("A"), -1},
{CSTR(""), CSTR("a"), -1},
{CSTR(""), CSTR("\xA1\xA1"), -1},
{CSTR(""), CSTR("\xE0\xE0"), -1},
{CSTR(" "), CSTR(""), 0},
{CSTR(" "), CSTR(" "), 0},
{CSTR(" "), CSTR("A"), -1},
{CSTR(" "), CSTR("a"), -1},
{CSTR(" "), CSTR("\xA1\xA1"), -1},
{CSTR(" "), CSTR("\xE0\xE0"), -1},
{CSTR("a"), CSTR(""), 1},
{CSTR("a"), CSTR(" "), 1},
{CSTR("a"), CSTR("a"), 0},
{CSTR("a"), CSTR("\xA1\xA1"), -1},
{CSTR("a"), CSTR("\xE0\xE0"), -1},
{CSTR("\xA1\xA1"), CSTR("\xA1\xA1"), 0},
{CSTR("\xA1\xA1"), CSTR("\xE0\xE0"), -1},
/* Compare a good character to an illegal or an incomplete sequence */
{CSTR(""), CSTR("\x80"), -1},
{CSTR(""), CSTR("\x81"), -1},
{CSTR(""), CSTR("\xF9"), -1},
{CSTR(" "), CSTR("\x80"), -1},
{CSTR(" "), CSTR("\x81"), -1},
{CSTR(" "), CSTR("\xF9"), -1},
{CSTR("a"), CSTR("\x80"), -1},
{CSTR("a"), CSTR("\x81"), -1},
{CSTR("a"), CSTR("\xF9"), -1},
{CSTR("\xA1\xA1"), CSTR("\x80"), -1},
{CSTR("\xA1\xA1"), CSTR("\x81"), -1},
{CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
{CSTR("\xE0\xE0"), CSTR("\x80"), -1},
{CSTR("\xE0\xE0"), CSTR("\x81"), -1},
{CSTR("\xE0\xE0"), CSTR("\xF9"), -1},
/* Compare two bad/incomplete sequences */
{CSTR("\x80"), CSTR("\x80"), 0},
{CSTR("\x80"), CSTR("\x81"), -1},
{CSTR("\x80"), CSTR("\xF9"), -1},
{CSTR("\x81"), CSTR("\x81"), 0},
{CSTR("\x81"), CSTR("\xF9"), -1},
/* End-of-table marker: the test loop stops at a == NULL */
{NULL, 0, NULL, 0, 0}
};
/*
For character sets that have good mb2 characters A1A1 and F9FE.
In this file the table is run for big5, euckr and gbk, where a lone A1 or
F9 byte is an MB2 head byte without a tail, i.e. an incomplete sequence.
Each entry is {a, alen, b, blen, expected sign of comparing a to b}.
*/
STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
{
/* Compare two good characters */
{CSTR(""), CSTR("\xF9\xFE"), -1},
{CSTR(" "), CSTR("\xF9\xFE"), -1},
{CSTR("a") , CSTR("\xF9\xFE"), -1},
{CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
{CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
/* Compare a good character to an illegal or an incomplete sequence */
{CSTR(""), CSTR("\xA1"), -1},
{CSTR(""), CSTR("\xF9"), -1},
{CSTR("a"), CSTR("\xA1"), -1},
{CSTR("a"), CSTR("\xF9"), -1},
{CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
{CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
/* The good character F9FE is still less than a lone bad/incomplete byte */
{CSTR("\xF9\xFE"), CSTR("\x80"), -1},
{CSTR("\xF9\xFE"), CSTR("\x81"), -1},
{CSTR("\xF9\xFE"), CSTR("\xA1"), -1},
{CSTR("\xF9\xFE"), CSTR("\xF9"), -1},
/* Compare two bad/incomplete sequences */
{CSTR("\x80"), CSTR("\xA1"), -1},
{CSTR("\x80"), CSTR("\xF9"), -1},
/* End-of-table marker: the test loop stops at a == NULL */
{NULL, 0, NULL, 0, 0}
};
/*
For character sets that have:
A1A1 - a good mb2 character
F9FE - a bad sequence
In this file the table is run for gb2312 only.
Each entry is {a, alen, b, blen, expected sign of comparing a to b}.
*/
STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
{
/* Compare a good character to an illegal or an incomplete sequence */
{CSTR(""), CSTR("\xF9\xFE"), -1},
{CSTR(" "), CSTR("\xF9\xFE"), -1},
{CSTR("a") , CSTR("\xF9\xFE"), -1},
{CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
{CSTR(""), CSTR("\xA1"), -1},
{CSTR(""), CSTR("\xF9"), -1},
{CSTR("a"), CSTR("\xA1"), -1},
{CSTR("a"), CSTR("\xF9"), -1},
{CSTR("\xA1\xA1"), CSTR("\xA1"), -1},
{CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
/* Compare two bad/incomplete sequences */
/* F9FE is greater than F9 alone: same first byte, but one more bad byte */
{CSTR("\xF9\xFE"), CSTR("\x80"), 1},
{CSTR("\xF9\xFE"), CSTR("\x81"), 1},
{CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
{CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
{CSTR("\x80"), CSTR("\xA1"), -1},
{CSTR("\x80"), CSTR("\xF9"), -1},
{CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
/* End-of-table marker: the test loop stops at a == NULL */
{NULL, 0, NULL, 0, 0}
};
/*
For character sets that have:
80 - ILSEQ or H2
81 - ILSEQ or H2
A1 - 8BIT
F9 - ILSEQ or H2
F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
In this file the table is run for cp932 and sjis.
Each entry is {a, alen, b, blen, expected sign of comparing a to b}.
Note that A1 is a good single-byte character here, so A1A1 is two
good characters rather than one MB2 character.
*/
STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
{
/* Compare two good characters */
{CSTR(""), CSTR("\xA1"), -1},
{CSTR("\xA1\xA1"), CSTR("\xA1"), 1},
/* Compare a good character to an illegal or an incomplete sequence */
{CSTR(""), CSTR("\xF9"), -1},
{CSTR(""), CSTR("\xF9\xFE"), -1},
{CSTR(" "), CSTR("\xF9\xFE"), -1},
{CSTR("a"), CSTR("\xF9\xFE"), -1},
/* Note: the next entry compares two good characters ("a" and 8BIT A1) */
{CSTR("a"), CSTR("\xA1"), -1},
{CSTR("a"), CSTR("\xF9"), -1},
{CSTR("\xA1\xA1"), CSTR("\xF9"), -1},
{CSTR("\xA1\xA1"), CSTR("\xF9\xFE"), -1},
/* Bad sequence F9FE against bad bytes 80, 81, F9 and the good 8BIT A1 */
{CSTR("\xF9\xFE"), CSTR("\x80"), 1},
{CSTR("\xF9\xFE"), CSTR("\x81"), 1},
{CSTR("\xF9\xFE"), CSTR("\xA1"), 1},
{CSTR("\xF9\xFE"), CSTR("\xF9"), 1},
/* Bad byte 80 against the good 8BIT A1: the bad byte is greater */
{CSTR("\x80"), CSTR("\xA1"), 1},
/* Compare two bad/incomplete sequences */
{CSTR("\x80"), CSTR("\xF9"), -1},
{CSTR("\xF9\xFE"), CSTR("\xF9\xFE"), 0},
/* End-of-table marker: the test loop stops at a == NULL */
{NULL, 0, NULL, 0, 0}
};
/*
For character sets (e.g. cp932 and sjis) that have:
8181 - a valid MB2 character
A1 - a valid 8BIT character
E0E0 - a valid MB2 character
and sort in this order:
8181 < A1 < E0E0
All three entries compare good characters only; they pin the relative
order of two-byte and single-byte characters.
*/
STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
{
{CSTR("\x81\x81"), CSTR("\xA1"), -1},
{CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
{CSTR("\xA1"), CSTR("\xE0\xE0"), -1},
/* End-of-table marker: the test loop stops at a == NULL */
{NULL, 0, NULL, 0, 0}
};
/*
  Convert a byte string to upper-case hexadecimal text, two digits per byte.

  @param dst     Destination buffer; receives a NUL-terminated string
  @param dstlen  Size of dst in bytes; when 0, dst is not touched at all
  @param src     Source bytes (any values, embedded NUL bytes included)
  @param srclen  Number of bytes in src

  The output is silently truncated at a whole-byte boundary: a source byte
  is converted only while more than 3 bytes of dst are still unused.
*/
static void
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
{
  static const char hexdigits[]= "0123456789ABCDEF";
  size_t used= 0;                     /* Bytes of dst holding hex digits */
  size_t i;

  if (dstlen == 0)
    return;                           /* No room even for the terminator */
  dst[0]= '\0';

  /*
    Bounds are checked on sizes rather than on "dst + 3", so no pointer
    beyond one-past-the-end of the buffer is ever formed.
  */
  for (i= 0; i < srclen && dstlen - used > 3; i++)
  {
    unsigned char byte= (unsigned char) src[i];
    dst[used++]= hexdigits[byte >> 4];
    dst[used++]= hexdigits[byte & 0x0F];
    dst[used]= '\0';
  }
}
/*
  Tell whether two comparison results mean the same thing, i.e. whether
  they have the same sign: both negative, both zero, or both positive.
  The magnitudes are ignored.

  @return 1 if the signs agree, 0 otherwise
*/
static int
eqres(int ares, int bres)
{
  /* Reduce each result to -1, 0 or +1 and compare the reduced values */
  int asign= (ares > 0) - (ares < 0);
  int bsign= (bres > 0) - (bres < 0);
  return asign == bsign;
}
/*
  Run one table of test cases through cs->coll->strnncollsp().

  For every entry the strings are compared as (a, b) and the sign of the
  result is checked against the expected one. If that passes, the strings
  are compared again as (b, a), which must give the opposite sign.
  One diagnostic line is printed per entry.

  @param cs     Collation to test
  @param param  Test cases, terminated by an entry with a == NULL
  @return       Number of failed entries
*/
static int
strcollsp(CHARSET_INFO *cs, const STRNNCOLL_PARAM *param)
{
  const STRNNCOLL_PARAM *tc= param;
  int nerrors= 0;

  diag("%-20s %-10s %-10s %10s %10s", "Collation", "a", "b", "ExpectSign", "Actual");

  while (tc->a)
  {
    char lhex[64], rhex[64];
    int expected= tc->res;
    int forward= cs->coll->strnncollsp(cs, (uchar *) tc->a, tc->alen,
                                       (uchar *) tc->b, tc->blen, 0);
    int forward_ok= eqres(forward, expected);

    str2hex(lhex, sizeof(lhex), tc->a, tc->alen);
    str2hex(rhex, sizeof(rhex), tc->b, tc->blen);
    diag("%-20s %-10s %-10s %10d %10d%s",
         cs->name, lhex, rhex, expected, forward,
         forward_ok ? "" : " FAILED");

    if (forward_ok)
    {
      /* Swap the operands: the result must have the opposite sign */
      int backward= cs->coll->strnncollsp(cs, (uchar *) tc->b, tc->blen,
                                          (uchar *) tc->a, tc->alen, 0);
      if (!eqres(backward, -expected))
      {
        diag("Comparison in reverse order failed. Expected %d, got %d",
             -expected, backward);
        nerrors++;
      }
    }
    else
    {
      /* The forward check already failed; the reverse one is not run */
      nerrors++;
    }
    tc++;
  }
  return nerrors;
}
static int
test_strcollsp()
{
int failed= 0;
#ifdef HAVE_CHARSET_big5
failed+= strcollsp(&my_charset_big5_chinese_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_big5_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
failed+= strcollsp(&my_charset_big5_bin, strcoll_mb2_common);
failed+= strcollsp(&my_charset_big5_bin, strcoll_mb2_A1A1_mb2_F9FE);
#endif
#ifdef HAVE_CHARSET_cp932
failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_mb1_A1_bad_F9FE);
failed+= strcollsp(&my_charset_cp932_bin, strcoll_mb2_common);
failed+= strcollsp(&my_charset_cp932_bin, strcoll_mb1_A1_bad_F9FE);
failed+= strcollsp(&my_charset_cp932_japanese_ci, strcoll_8181_A1_E0E0);
failed+= strcollsp(&my_charset_cp932_bin, strcoll_8181_A1_E0E0);
#endif
#ifdef HAVE_CHARSET_euckr
failed+= strcollsp(&my_charset_euckr_korean_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_euckr_korean_ci, strcoll_mb2_A1A1_mb2_F9FE);
failed+= strcollsp(&my_charset_euckr_bin, strcoll_mb2_common);
failed+= strcollsp(&my_charset_euckr_bin, strcoll_mb2_A1A1_mb2_F9FE);
#endif
#ifdef HAVE_CHARSET_gb2312
failed+= strcollsp(&my_charset_gb2312_chinese_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_gb2312_chinese_ci, strcoll_mb2_A1A1_bad_F9FE);
failed+= strcollsp(&my_charset_gb2312_bin, strcoll_mb2_common);
failed+= strcollsp(&my_charset_gb2312_bin, strcoll_mb2_A1A1_bad_F9FE);
#endif
#ifdef HAVE_CHARSET_gbk
failed+= strcollsp(&my_charset_gbk_chinese_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_gbk_chinese_ci, strcoll_mb2_A1A1_mb2_F9FE);
failed+= strcollsp(&my_charset_gbk_bin, strcoll_mb2_common);
failed+= strcollsp(&my_charset_gbk_bin, strcoll_mb2_A1A1_mb2_F9FE);
#endif
#ifdef HAVE_CHARSET_sjis
failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_sjis_bin, strcoll_mb2_common);
failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_mb1_A1_bad_F9FE);
failed+= strcollsp(&my_charset_sjis_bin, strcoll_mb1_A1_bad_F9FE);
failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0);
#endif
return failed;
}
int main()
{
size_t i, failed= 0;
plan(1);
plan(2);
diag("Testing my_like_range_xxx() functions");
for (i= 0; i < array_elements(charset_list); i++)
@@ -112,5 +462,10 @@ int main()
}
}
ok(failed == 0, "Testing my_like_range_xxx() functions");
diag("Testing cs->coll->strnncollsp()");
failed= test_strcollsp();
ok(failed == 0, "Testing cs->coll->strnncollsp()");
return exit_status();
}