mirror of
https://github.com/MariaDB/server.git
synced 2025-08-01 03:47:19 +03:00
ctype-utf8.c:
A faster UTF8 null-terminated string implementation. It is used for identifier comparison, so it's quite critical.
This commit is contained in:
@ -2103,49 +2103,103 @@ static int my_strnncollsp_utf8(CHARSET_INFO *cs,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int my_strncasecmp_utf8(CHARSET_INFO *cs,
|
/*
|
||||||
const char *s, const char *t, uint len)
|
Compare 0-terminated UTF8 strings.
|
||||||
|
|
||||||
|
SYNOPSIS
|
||||||
|
my_strcasecmp_utf8()
|
||||||
|
cs character set handler
|
||||||
|
s First 0-terminated string to compare
|
||||||
|
t Second 0-terminated string to compare
|
||||||
|
|
||||||
|
IMPLEMENTATION
|
||||||
|
|
||||||
|
RETURN
|
||||||
|
- negative number if s < t
|
||||||
|
- positive number if s > t
|
||||||
|
- 0 if the strings are equal
|
||||||
|
*/
|
||||||
|
|
||||||
|
static
|
||||||
|
int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
|
||||||
{
|
{
|
||||||
int s_res,t_res;
|
while (s[0] && t[0])
|
||||||
my_wc_t s_wc,t_wc;
|
|
||||||
const char *se=s+len;
|
|
||||||
const char *te=t+len;
|
|
||||||
|
|
||||||
while ( s < se && t < te )
|
|
||||||
{
|
{
|
||||||
int plane;
|
my_wc_t s_wc,t_wc;
|
||||||
|
|
||||||
s_res=my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*)se);
|
if (s[0] >= 0)
|
||||||
t_res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*)te);
|
|
||||||
|
|
||||||
if ( s_res <= 0 || t_res <= 0 )
|
|
||||||
{
|
{
|
||||||
/* Incorrect string, compare byte by byte value */
|
/*
|
||||||
return bincmp(s, se, t, te);
|
s[0] is between 0 and 127.
|
||||||
|
It represents a single byte character.
|
||||||
|
Convert it into weight according to collation.
|
||||||
|
*/
|
||||||
|
s_wc= plane00[(uchar) s[0]].tolower;
|
||||||
|
s++;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int plane, res;
|
||||||
|
|
||||||
|
/*
|
||||||
|
Scan a multibyte character.
|
||||||
|
|
||||||
plane=(s_wc>>8) & 0xFF;
|
In the future it would be worth writing a special version of my_utf8_uni()
|
||||||
s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
|
for 0-terminated strings which will not take into account length. Now
|
||||||
|
we call the regular version of my_utf8_uni() with s+3 in the
|
||||||
plane=(t_wc>>8) & 0xFF;
|
last argument. s+3 is enough to scan any multibyte sequence.
|
||||||
t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
|
|
||||||
|
|
||||||
|
Calling the regular version of my_utf8_uni is safe for 0-terminated
|
||||||
|
strings: we will never lose the end of the string:
|
||||||
|
If we have 0 character in the middle of a multibyte sequence,
|
||||||
|
then my_utf8_uni will always return a negative number, so the
|
||||||
|
loop will finish.
|
||||||
|
*/
|
||||||
|
|
||||||
|
res= my_utf8_uni(cs,&s_wc, (const uchar*)s, (const uchar*) s + 3);
|
||||||
|
|
||||||
|
/*
|
||||||
|
In the case of a wrong multibyte sequence we will
|
||||||
|
call strcmp() for byte-to-byte comparison.
|
||||||
|
*/
|
||||||
|
if (res <= 0)
|
||||||
|
return strcmp(s, t);
|
||||||
|
s+= res;
|
||||||
|
|
||||||
|
/* Convert Unicode code into weight according to collation */
|
||||||
|
plane=(s_wc>>8) & 0xFF;
|
||||||
|
s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].tolower : s_wc;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Do the same for the second string */
|
||||||
|
|
||||||
|
if (t[0] >= 0)
|
||||||
|
{
|
||||||
|
/* Convert single byte character into weight */
|
||||||
|
t_wc= plane00[(uchar) t[0]].tolower;
|
||||||
|
t++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int plane;
|
||||||
|
int res=my_utf8_uni(cs,&t_wc, (const uchar*)t, (const uchar*) t + 3);
|
||||||
|
if (res <= 0)
|
||||||
|
return strcmp(s, t);
|
||||||
|
t+= res;
|
||||||
|
|
||||||
|
/* Convert code into weight */
|
||||||
|
plane=(t_wc>>8) & 0xFF;
|
||||||
|
t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].tolower : t_wc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Now we have two weights, let's compare them */
|
||||||
if ( s_wc != t_wc )
|
if ( s_wc != t_wc )
|
||||||
return ((int) s_wc) - ((int) t_wc);
|
return ((int) s_wc) - ((int) t_wc);
|
||||||
|
|
||||||
s+=s_res;
|
|
||||||
t+=t_res;
|
|
||||||
}
|
}
|
||||||
return ( (se-s) - (te-t) );
|
return ((int)(uchar)s[0]) - ((int) (uchar) t[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
|
|
||||||
{
|
|
||||||
uint s_len=strlen(s);
|
|
||||||
uint t_len=strlen(t);
|
|
||||||
uint len = (s_len > t_len) ? s_len : t_len;
|
|
||||||
return my_strncasecmp_utf8(cs, s, t, len);
|
|
||||||
}
|
|
||||||
|
|
||||||
static
|
static
|
||||||
int my_wildcmp_utf8(CHARSET_INFO *cs,
|
int my_wildcmp_utf8(CHARSET_INFO *cs,
|
||||||
|
Reference in New Issue
Block a user