mirror of
https://github.com/MariaDB/server.git
synced 2025-07-29 05:21:33 +03:00
Bug#22638 SOUNDEX broken for international characters
Problem: SOUNDEX returned an invalid string for international characters in multi-byte character sets. For example: for a Chinese/Japanese 3-byte long character _utf8 0xE99885 it took only the very first byte 0xE9, put it into the output string and then appended three DIGIT ZERO characters, so the result was 0xE9303030 - which is an invalid utf8 string. Fix: make SOUNDEX() multi-byte aware and put only complete characters into the result, thus returning only valid strings. This patch also makes SOUNDEX() compatible with UCS2. mysql-test/r/ctype_ucs.result: Adding tests mysql-test/r/ctype_utf8.result: Adding tests mysql-test/t/ctype_ucs.test: Adding tests mysql-test/t/ctype_utf8.test: Adding tests sql/item_strfunc.cc: Making soundex multi-byte aware.
This commit is contained in:
@ -839,6 +839,24 @@ lily
|
||||
river
|
||||
drop table t1;
|
||||
deallocate prepare stmt;
|
||||
set names latin1;
|
||||
set character_set_connection=ucs2;
|
||||
select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
|
||||
soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb')
|
||||
H000 H4142 I51231
|
||||
select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
|
||||
hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb'))
|
||||
0048003000300030 00480034003100340032 004900350031003200330031
|
||||
select 'mood' sounds like 'mud';
|
||||
'mood' sounds like 'mud'
|
||||
1
|
||||
select hex(soundex(_ucs2 0x041004110412));
|
||||
hex(soundex(_ucs2 0x041004110412))
|
||||
0410003000300030
|
||||
select hex(soundex(_ucs2 0x00BF00C0));
|
||||
hex(soundex(_ucs2 0x00BF00C0))
|
||||
00C0003000300030
|
||||
set names latin1;
|
||||
create table t1(a blob, b text charset utf8, c text charset ucs2);
|
||||
select data_type, character_octet_length, character_maximum_length
|
||||
from information_schema.columns where table_name='t1';
|
||||
|
@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test');
|
||||
id a
|
||||
1 Test
|
||||
drop table t1;
|
||||
select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
|
||||
soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)
|
||||
阅000
|
||||
select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
|
||||
hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB))
|
||||
E99885303030
|
||||
select soundex(_utf8 0xD091D092D093);
|
||||
soundex(_utf8 0xD091D092D093)
|
||||
Б000
|
||||
select hex(soundex(_utf8 0xD091D092D093));
|
||||
hex(soundex(_utf8 0xD091D092D093))
|
||||
D091303030
|
||||
SET collation_connection='utf8_general_ci';
|
||||
create table t1 select repeat('a',4000) a;
|
||||
delete from t1;
|
||||
|
@ -572,6 +572,20 @@ select utext from t1 where utext like '%%';
|
||||
drop table t1;
|
||||
deallocate prepare stmt;
|
||||
|
||||
#
|
||||
# Bug#22638 SOUNDEX broken for international characters
|
||||
#
|
||||
set names latin1;
|
||||
set character_set_connection=ucs2;
|
||||
select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
|
||||
select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
|
||||
select 'mood' sounds like 'mud';
|
||||
# Cyrillic A, BE, VE
|
||||
select hex(soundex(_ucs2 0x041004110412));
|
||||
# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered a letter
|
||||
select hex(soundex(_ucs2 0x00BF00C0));
|
||||
set names latin1;
|
||||
|
||||
#
|
||||
# Bug #14290: character_maximum_length for text fields
|
||||
#
|
||||
|
@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST');
|
||||
select * from t1 where soundex(a) = soundex('test');
|
||||
drop table t1;
|
||||
|
||||
#
|
||||
# Bug#22638 SOUNDEX broken for international characters
|
||||
#
|
||||
select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
|
||||
select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
|
||||
select soundex(_utf8 0xD091D092D093);
|
||||
select hex(soundex(_utf8 0xD091D092D093));
|
||||
|
||||
|
||||
SET collation_connection='utf8_general_ci';
|
||||
-- source include/ctype_filesort.inc
|
||||
|
Reference in New Issue
Block a user