1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-29 05:21:33 +03:00

Bug#22638 SOUNDEX broken for international characters

Problem: SOUNDEX returned an invalid string for international
characters in multi-byte character sets.
For example: for a Chinese/Japanese 3-byte long character
_utf8 0xE99885 it took only the very first byte 0xE9,
put it into the outout string and then appended with three 
DIGIT ZERO characters, so the result was 0xE9303030 - which
is an invalide utf8 string.
Fix: make SOUNDEX() multi-byte aware and - put only complete
characters into result, thus return only valid strings.
This patch also makes SOUNDEX() compatible with UCS2.


mysql-test/r/ctype_ucs.result:
  Adding tests
mysql-test/r/ctype_utf8.result:
  Adding tests
mysql-test/t/ctype_ucs.test:
  Adding tests
mysql-test/t/ctype_utf8.test:
  Adding tests
sql/item_strfunc.cc:
  Making soundex multi-byte aware.
This commit is contained in:
unknown
2007-03-28 18:57:30 +05:00
parent 685d21b72f
commit b5cc4fa61d
5 changed files with 157 additions and 28 deletions

View File

@ -839,6 +839,24 @@ lily
river
drop table t1;
deallocate prepare stmt;
set names latin1;
set character_set_connection=ucs2;
select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb')
H000 H4142 I51231
select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb'))
0048003000300030 00480034003100340032 004900350031003200330031
select 'mood' sounds like 'mud';
'mood' sounds like 'mud'
1
select hex(soundex(_ucs2 0x041004110412));
hex(soundex(_ucs2 0x041004110412))
0410003000300030
select hex(soundex(_ucs2 0x00BF00C0));
hex(soundex(_ucs2 0x00BF00C0))
00C0003000300030
set names latin1;
create table t1(a blob, b text charset utf8, c text charset ucs2);
select data_type, character_octet_length, character_maximum_length
from information_schema.columns where table_name='t1';

View File

@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test');
id a
1 Test
drop table t1;
select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)
阅000
select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB))
E99885303030
select soundex(_utf8 0xD091D092D093);
soundex(_utf8 0xD091D092D093)
Б000
select hex(soundex(_utf8 0xD091D092D093));
hex(soundex(_utf8 0xD091D092D093))
D091303030
SET collation_connection='utf8_general_ci';
create table t1 select repeat('a',4000) a;
delete from t1;

View File

@ -572,6 +572,20 @@ select utext from t1 where utext like '%%';
drop table t1;
deallocate prepare stmt;
#
# Bug#22638 SOUNDEX broken for international characters
#
set names latin1;
set character_set_connection=ucs2;
select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
select 'mood' sounds like 'mud';
# Cyrillic A, BE, VE
select hex(soundex(_ucs2 0x041004110412));
# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered as letter
select hex(soundex(_ucs2 0x00BF00C0));
set names latin1;
#
# Bug #14290: character_maximum_length for text fields
#

View File

@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST');
select * from t1 where soundex(a) = soundex('test');
drop table t1;
#
# Bug#22638 SOUNDEX broken for international characters
#
select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
select soundex(_utf8 0xD091D092D093);
select hex(soundex(_utf8 0xD091D092D093));
SET collation_connection='utf8_general_ci';
-- source include/ctype_filesort.inc