mirror of
https://github.com/MariaDB/server.git
synced 2025-07-29 05:21:33 +03:00
Merging my_convert() from 10.0-serg
modified: include/m_ctype.h mysys/ma_dyncol.c mysys/string.c sql/sql_string.cc sql/sql_string.h strings/ctype.c
This commit is contained in:
@ -591,6 +591,10 @@ my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs);
|
|||||||
extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
|
extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
|
||||||
const char* fmt, va_list ap);
|
const char* fmt, va_list ap);
|
||||||
|
|
||||||
|
uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
||||||
|
const char *from, uint32 from_length,
|
||||||
|
CHARSET_INFO *from_cs, uint *errors);
|
||||||
|
|
||||||
#define _MY_U 01 /* Upper case */
|
#define _MY_U 01 /* Upper case */
|
||||||
#define _MY_L 02 /* Lower case */
|
#define _MY_L 02 /* Lower case */
|
||||||
#define _MY_NMR 04 /* Numeral (digit) */
|
#define _MY_NMR 04 /* Numeral (digit) */
|
||||||
|
@ -3853,7 +3853,7 @@ mariadb_dyncol_val_str(DYNAMIC_STRING *str, DYNAMIC_COLUMN_VALUE *val,
|
|||||||
if (!quote)
|
if (!quote)
|
||||||
{
|
{
|
||||||
/* convert to the destination */
|
/* convert to the destination */
|
||||||
str->length+= copy_and_convert_extended(str->str, bufflen,
|
str->length+= my_convert(str->str, bufflen,
|
||||||
cs,
|
cs,
|
||||||
from, (uint32)len,
|
from, (uint32)len,
|
||||||
val->x.string.charset,
|
val->x.string.charset,
|
||||||
@ -3862,8 +3862,7 @@ mariadb_dyncol_val_str(DYNAMIC_STRING *str, DYNAMIC_COLUMN_VALUE *val,
|
|||||||
}
|
}
|
||||||
if ((alloc= (char *)my_malloc(bufflen, MYF(0))))
|
if ((alloc= (char *)my_malloc(bufflen, MYF(0))))
|
||||||
{
|
{
|
||||||
len=
|
len= my_convert(alloc, bufflen, cs,
|
||||||
copy_and_convert_extended(alloc, bufflen, cs,
|
|
||||||
from, (uint32)len,
|
from, (uint32)len,
|
||||||
val->x.string.charset,
|
val->x.string.charset,
|
||||||
&dummy_errors);
|
&dummy_errors);
|
||||||
|
@ -223,77 +223,3 @@ void dynstr_reassociate(DYNAMIC_STRING *str, char **ptr, size_t *length,
|
|||||||
*alloc_length= str->max_length;
|
*alloc_length= str->max_length;
|
||||||
str->str=0;
|
str->str=0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
copy a string from one character set to another
|
|
||||||
|
|
||||||
SYNOPSIS
|
|
||||||
copy_and_convert()
|
|
||||||
to Store result here
|
|
||||||
to_cs Character set of result string
|
|
||||||
from Copy from here
|
|
||||||
from_length Length of from string
|
|
||||||
from_cs From character set
|
|
||||||
|
|
||||||
NOTES
|
|
||||||
'to' must be at least from_length * to_cs->mbmaxlen bytes long
|
|
||||||
|
|
||||||
RETURN
|
|
||||||
length of bytes copied to 'to'
|
|
||||||
*/
|
|
||||||
|
|
||||||
uint32
|
|
||||||
copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
|
||||||
const char *from, uint32 from_length,
|
|
||||||
CHARSET_INFO *from_cs,
|
|
||||||
uint *errors)
|
|
||||||
{
|
|
||||||
int cnvres;
|
|
||||||
my_wc_t wc;
|
|
||||||
const uchar *from_end= (const uchar*) from+from_length;
|
|
||||||
char *to_start= to;
|
|
||||||
uchar *to_end= (uchar*) to+to_length;
|
|
||||||
my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
|
|
||||||
my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
|
|
||||||
uint error_count= 0;
|
|
||||||
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from,
|
|
||||||
from_end)) > 0)
|
|
||||||
from+= cnvres;
|
|
||||||
else if (cnvres == MY_CS_ILSEQ)
|
|
||||||
{
|
|
||||||
error_count++;
|
|
||||||
from++;
|
|
||||||
wc= '?';
|
|
||||||
}
|
|
||||||
else if (cnvres > MY_CS_TOOSMALL)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
A correct multibyte sequence detected
|
|
||||||
But it doesn't have Unicode mapping.
|
|
||||||
*/
|
|
||||||
error_count++;
|
|
||||||
from+= (-cnvres);
|
|
||||||
wc= '?';
|
|
||||||
}
|
|
||||||
else
|
|
||||||
break; // Not enough characters
|
|
||||||
|
|
||||||
outp:
|
|
||||||
if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
|
|
||||||
to+= cnvres;
|
|
||||||
else if (cnvres == MY_CS_ILUNI && wc != '?')
|
|
||||||
{
|
|
||||||
error_count++;
|
|
||||||
wc= '?';
|
|
||||||
goto outp;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
*errors= error_count;
|
|
||||||
return (uint32) (to - to_start);
|
|
||||||
}
|
|
||||||
|
@ -776,67 +776,6 @@ String *copy_if_not_alloced(String *to,String *from,uint32 from_length)
|
|||||||
Help functions
|
Help functions
|
||||||
****************************************************************************/
|
****************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
|
|
||||||
*/
|
|
||||||
uint32
|
|
||||||
copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
|
||||||
const char *from, uint32 from_length, CHARSET_INFO *from_cs,
|
|
||||||
uint *errors)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
If any of the character sets is not ASCII compatible,
|
|
||||||
immediately switch to slow mb_wc->wc_mb method.
|
|
||||||
*/
|
|
||||||
if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
|
|
||||||
return copy_and_convert_extended(to, to_length, to_cs,
|
|
||||||
from, from_length, from_cs, errors);
|
|
||||||
|
|
||||||
uint32 length= min(to_length, from_length), length2= length;
|
|
||||||
|
|
||||||
#if defined(__i386__) || defined(__x86_64__)
|
|
||||||
/*
|
|
||||||
Special loop for i386/x86_64: it allows referring to a
|
|
||||||
non-aligned memory block as UINT32, which makes
|
|
||||||
it possible to copy four bytes at once. This
|
|
||||||
gives about a 10% performance improvement compared
|
|
||||||
to byte-by-byte loop.
|
|
||||||
*/
|
|
||||||
for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
|
|
||||||
{
|
|
||||||
if ((*(uint32*)from) & 0x80808080)
|
|
||||||
break;
|
|
||||||
*((uint32*) to)= *((const uint32*) from);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (; ; *to++= *from++, length--)
|
|
||||||
{
|
|
||||||
if (!length)
|
|
||||||
{
|
|
||||||
*errors= 0;
|
|
||||||
return length2;
|
|
||||||
}
|
|
||||||
if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
|
|
||||||
{
|
|
||||||
uint32 copied_length= length2 - length;
|
|
||||||
to_length-= copied_length;
|
|
||||||
from_length-= copied_length;
|
|
||||||
return copied_length + copy_and_convert_extended(to, to_length,
|
|
||||||
to_cs,
|
|
||||||
from, from_length,
|
|
||||||
from_cs,
|
|
||||||
errors);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
DBUG_ASSERT(FALSE); // Should never get to here
|
|
||||||
return 0; // Make compiler happy
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Copy string with HEX-encoding of "bad" characters.
|
Copy string with HEX-encoding of "bad" characters.
|
||||||
|
|
||||||
|
@ -34,9 +34,13 @@ typedef struct st_mem_root MEM_ROOT;
|
|||||||
|
|
||||||
int sortcmp(const String *a,const String *b, CHARSET_INFO *cs);
|
int sortcmp(const String *a,const String *b, CHARSET_INFO *cs);
|
||||||
String *copy_if_not_alloced(String *a,String *b,uint32 arg_length);
|
String *copy_if_not_alloced(String *a,String *b,uint32 arg_length);
|
||||||
uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
inline uint32 copy_and_convert(char *to, uint32 to_length,
|
||||||
|
const CHARSET_INFO *to_cs,
|
||||||
const char *from, uint32 from_length,
|
const char *from, uint32 from_length,
|
||||||
CHARSET_INFO *from_cs, uint *errors);
|
const CHARSET_INFO *from_cs, uint *errors)
|
||||||
|
{
|
||||||
|
return my_convert(to, to_length, to_cs, from, from_length, from_cs, errors);
|
||||||
|
}
|
||||||
uint32 well_formed_copy_nchars(CHARSET_INFO *to_cs,
|
uint32 well_formed_copy_nchars(CHARSET_INFO *to_cs,
|
||||||
char *to, uint to_length,
|
char *to, uint to_length,
|
||||||
CHARSET_INFO *from_cs,
|
CHARSET_INFO *from_cs,
|
||||||
|
141
strings/ctype.c
141
strings/ctype.c
@ -430,3 +430,144 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs)
|
|||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
Convert a string between two character sets.
|
||||||
|
'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.
|
||||||
|
|
||||||
|
@param to[OUT] Store result here
|
||||||
|
@param to_length Size of "to" buffer
|
||||||
|
@param to_cs Character set of result string
|
||||||
|
@param from Copy from here
|
||||||
|
@param from_length Length of the "from" string
|
||||||
|
@param from_cs Character set of the "from" string
|
||||||
|
@param errors[OUT] Number of conversion errors
|
||||||
|
|
||||||
|
@return Number of bytes copied to 'to' string
|
||||||
|
*/
|
||||||
|
|
||||||
|
static uint32
|
||||||
|
my_convert_internal(char *to, uint32 to_length,
|
||||||
|
CHARSET_INFO *to_cs,
|
||||||
|
const char *from, uint32 from_length,
|
||||||
|
CHARSET_INFO *from_cs, uint *errors)
|
||||||
|
{
|
||||||
|
int cnvres;
|
||||||
|
my_wc_t wc;
|
||||||
|
const uchar *from_end= (const uchar*) from + from_length;
|
||||||
|
char *to_start= to;
|
||||||
|
uchar *to_end= (uchar*) to + to_length;
|
||||||
|
my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
|
||||||
|
my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
|
||||||
|
uint error_count= 0;
|
||||||
|
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
|
||||||
|
from+= cnvres;
|
||||||
|
else if (cnvres == MY_CS_ILSEQ)
|
||||||
|
{
|
||||||
|
error_count++;
|
||||||
|
from++;
|
||||||
|
wc= '?';
|
||||||
|
}
|
||||||
|
else if (cnvres > MY_CS_TOOSMALL)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
A correct multibyte sequence detected
|
||||||
|
But it doesn't have Unicode mapping.
|
||||||
|
*/
|
||||||
|
error_count++;
|
||||||
|
from+= (-cnvres);
|
||||||
|
wc= '?';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break; // Not enough characters
|
||||||
|
|
||||||
|
outp:
|
||||||
|
if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
|
||||||
|
to+= cnvres;
|
||||||
|
else if (cnvres == MY_CS_ILUNI && wc != '?')
|
||||||
|
{
|
||||||
|
error_count++;
|
||||||
|
wc= '?';
|
||||||
|
goto outp;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
*errors= error_count;
|
||||||
|
return (uint32) (to - to_start);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
Convert a string between two character sets.
|
||||||
|
Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
|
||||||
|
'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.
|
||||||
|
|
||||||
|
@param to[OUT] Store result here
|
||||||
|
@param to_length Size of "to" buffer
|
||||||
|
@param to_cs Character set of result string
|
||||||
|
@param from Copy from here
|
||||||
|
@param from_length Length of the "from" string
|
||||||
|
@param from_cs Character set of the "from" string
|
||||||
|
@param errors[OUT] Number of conversion errors
|
||||||
|
|
||||||
|
@return Number of bytes copied to 'to' string
|
||||||
|
*/
|
||||||
|
|
||||||
|
uint32
|
||||||
|
my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
||||||
|
const char *from, uint32 from_length,
|
||||||
|
CHARSET_INFO *from_cs, uint *errors)
|
||||||
|
{
|
||||||
|
uint32 length, length2;
|
||||||
|
/*
|
||||||
|
If any of the character sets is not ASCII compatible,
|
||||||
|
immediately switch to slow mb_wc->wc_mb method.
|
||||||
|
*/
|
||||||
|
if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
|
||||||
|
return my_convert_internal(to, to_length, to_cs,
|
||||||
|
from, from_length, from_cs, errors);
|
||||||
|
|
||||||
|
length= length2= MY_MIN(to_length, from_length);
|
||||||
|
|
||||||
|
#if defined(__i386__) || defined(__x86_64__)
|
||||||
|
/*
|
||||||
|
Special loop for i386/x86_64: it allows referring to a
|
||||||
|
non-aligned memory block as UINT32, which makes
|
||||||
|
it possible to copy four bytes at once. This
|
||||||
|
gives about a 10% performance improvement compared
|
||||||
|
to byte-by-byte loop.
|
||||||
|
*/
|
||||||
|
for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
|
||||||
|
{
|
||||||
|
if ((*(uint32*)from) & 0x80808080)
|
||||||
|
break;
|
||||||
|
*((uint32*) to)= *((const uint32*) from);
|
||||||
|
}
|
||||||
|
#endif /* __i386__ */
|
||||||
|
|
||||||
|
for (; ; *to++= *from++, length--)
|
||||||
|
{
|
||||||
|
if (!length)
|
||||||
|
{
|
||||||
|
*errors= 0;
|
||||||
|
return length2;
|
||||||
|
}
|
||||||
|
if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
|
||||||
|
{
|
||||||
|
uint32 copied_length= length2 - length;
|
||||||
|
to_length-= copied_length;
|
||||||
|
from_length-= copied_length;
|
||||||
|
return copied_length + my_convert_internal(to, to_length, to_cs,
|
||||||
|
from, from_length, from_cs,
|
||||||
|
errors);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DBUG_ASSERT(FALSE); // Should never get to here
|
||||||
|
return 0; // Make compiler happy
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user