mirror of
https://github.com/MariaDB/server.git
synced 2025-07-29 05:21:33 +03:00
Merging my_convert() from 10.0-serg
modified: include/m_ctype.h mysys/ma_dyncol.c mysys/string.c sql/sql_string.cc sql/sql_string.h strings/ctype.c
This commit is contained in:
@ -591,6 +591,10 @@ my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs);
|
||||
extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
|
||||
const char* fmt, va_list ap);
|
||||
|
||||
uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
||||
const char *from, uint32 from_length,
|
||||
CHARSET_INFO *from_cs, uint *errors);
|
||||
|
||||
#define _MY_U 01 /* Upper case */
|
||||
#define _MY_L 02 /* Lower case */
|
||||
#define _MY_NMR 04 /* Numeral (digit) */
|
||||
|
@ -3853,20 +3853,19 @@ mariadb_dyncol_val_str(DYNAMIC_STRING *str, DYNAMIC_COLUMN_VALUE *val,
|
||||
if (!quote)
|
||||
{
|
||||
/* convert to the destination */
|
||||
str->length+= copy_and_convert_extended(str->str, bufflen,
|
||||
cs,
|
||||
from, (uint32)len,
|
||||
val->x.string.charset,
|
||||
&dummy_errors);
|
||||
str->length+= my_convert(str->str, bufflen,
|
||||
cs,
|
||||
from, (uint32)len,
|
||||
val->x.string.charset,
|
||||
&dummy_errors);
|
||||
return ER_DYNCOL_OK;
|
||||
}
|
||||
if ((alloc= (char *)my_malloc(bufflen, MYF(0))))
|
||||
{
|
||||
len=
|
||||
copy_and_convert_extended(alloc, bufflen, cs,
|
||||
from, (uint32)len,
|
||||
val->x.string.charset,
|
||||
&dummy_errors);
|
||||
len= my_convert(alloc, bufflen, cs,
|
||||
from, (uint32)len,
|
||||
val->x.string.charset,
|
||||
&dummy_errors);
|
||||
from= alloc;
|
||||
}
|
||||
else
|
||||
|
@ -223,77 +223,3 @@ void dynstr_reassociate(DYNAMIC_STRING *str, char **ptr, size_t *length,
|
||||
*alloc_length= str->max_length;
|
||||
str->str=0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
copy a string from one character set to another
|
||||
|
||||
SYNOPSIS
|
||||
copy_and_convert_extended()
|
||||
to Store result here
|
||||
to_cs Character set of result string
|
||||
from Copy from here
|
||||
from_length Length of from string
|
||||
from_cs From character set
|
||||
|
||||
NOTES
|
||||
'to' must be at least from_length * to_cs->mbmaxlen bytes long
|
||||
|
||||
RETURN
|
||||
length of bytes copied to 'to'
|
||||
*/
|
||||
|
||||
uint32
|
||||
copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
||||
const char *from, uint32 from_length,
|
||||
CHARSET_INFO *from_cs,
|
||||
uint *errors)
|
||||
{
|
||||
int cnvres;
|
||||
my_wc_t wc;
|
||||
const uchar *from_end= (const uchar*) from+from_length;
|
||||
char *to_start= to;
|
||||
uchar *to_end= (uchar*) to+to_length;
|
||||
my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
|
||||
my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
|
||||
uint error_count= 0;
|
||||
|
||||
while (1)
|
||||
{
|
||||
if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from,
|
||||
from_end)) > 0)
|
||||
from+= cnvres;
|
||||
else if (cnvres == MY_CS_ILSEQ)
|
||||
{
|
||||
error_count++;
|
||||
from++;
|
||||
wc= '?';
|
||||
}
|
||||
else if (cnvres > MY_CS_TOOSMALL)
|
||||
{
|
||||
/*
|
||||
A correct multibyte sequence detected
|
||||
But it doesn't have Unicode mapping.
|
||||
*/
|
||||
error_count++;
|
||||
from+= (-cnvres);
|
||||
wc= '?';
|
||||
}
|
||||
else
|
||||
break; // Not enough characters
|
||||
|
||||
outp:
|
||||
if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
|
||||
to+= cnvres;
|
||||
else if (cnvres == MY_CS_ILUNI && wc != '?')
|
||||
{
|
||||
error_count++;
|
||||
wc= '?';
|
||||
goto outp;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
*errors= error_count;
|
||||
return (uint32) (to - to_start);
|
||||
}
|
||||
|
@ -776,67 +776,6 @@ String *copy_if_not_alloced(String *to,String *from,uint32 from_length)
|
||||
Help functions
|
||||
****************************************************************************/
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
|
||||
*/
|
||||
uint32
|
||||
copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
||||
const char *from, uint32 from_length, CHARSET_INFO *from_cs,
|
||||
uint *errors)
|
||||
{
|
||||
/*
|
||||
If any of the character sets is not ASCII compatible,
|
||||
immediately switch to slow mb_wc->wc_mb method.
|
||||
*/
|
||||
if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
|
||||
return copy_and_convert_extended(to, to_length, to_cs,
|
||||
from, from_length, from_cs, errors);
|
||||
|
||||
uint32 length= min(to_length, from_length), length2= length;
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
/*
|
||||
Special loop for i386, it allows to refer to a
|
||||
non-aligned memory block as UINT32, which makes
|
||||
it possible to copy four bytes at once. This
|
||||
gives about 10% performance improvement comparing
|
||||
to byte-by-byte loop.
|
||||
*/
|
||||
for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
|
||||
{
|
||||
if ((*(uint32*)from) & 0x80808080)
|
||||
break;
|
||||
*((uint32*) to)= *((const uint32*) from);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; ; *to++= *from++, length--)
|
||||
{
|
||||
if (!length)
|
||||
{
|
||||
*errors= 0;
|
||||
return length2;
|
||||
}
|
||||
if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
|
||||
{
|
||||
uint32 copied_length= length2 - length;
|
||||
to_length-= copied_length;
|
||||
from_length-= copied_length;
|
||||
return copied_length + copy_and_convert_extended(to, to_length,
|
||||
to_cs,
|
||||
from, from_length,
|
||||
from_cs,
|
||||
errors);
|
||||
}
|
||||
}
|
||||
|
||||
DBUG_ASSERT(FALSE); // Should never get to here
|
||||
return 0; // Make compiler happy
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Copy string with HEX-encoding of "bad" characters.
|
||||
|
||||
|
@ -34,9 +34,13 @@ typedef struct st_mem_root MEM_ROOT;
|
||||
|
||||
int sortcmp(const String *a,const String *b, CHARSET_INFO *cs);
|
||||
String *copy_if_not_alloced(String *a,String *b,uint32 arg_length);
|
||||
uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
||||
const char *from, uint32 from_length,
|
||||
CHARSET_INFO *from_cs, uint *errors);
|
||||
/*
  Convert a string between two character sets.
  Passes every argument unchanged to my_convert() and returns its result.
*/
inline uint32 copy_and_convert(char *to, uint32 to_length,
                               const CHARSET_INFO *to_cs,
                               const char *from, uint32 from_length,
                               const CHARSET_INFO *from_cs, uint *errors)
{
  const uint32 converted_bytes= my_convert(to, to_length, to_cs,
                                           from, from_length, from_cs,
                                           errors);
  return converted_bytes;
}
|
||||
uint32 well_formed_copy_nchars(CHARSET_INFO *to_cs,
|
||||
char *to, uint to_length,
|
||||
CHARSET_INFO *from_cs,
|
||||
|
141
strings/ctype.c
141
strings/ctype.c
@ -430,3 +430,144 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs)
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Convert a string between two character sets.
|
||||
'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.
|
||||
|
||||
@param to[OUT] Store result here
|
||||
@param to_length Size of "to" buffer
|
||||
@param to_cs Character set of result string
|
||||
@param from Copy from here
|
||||
@param from_length Length of the "from" string
|
||||
@param from_cs Character set of the "from" string
|
||||
@param errors[OUT] Number of conversion errors
|
||||
|
||||
@return Number of bytes copied to 'to' string
|
||||
*/
|
||||
|
||||
static uint32
|
||||
my_convert_internal(char *to, uint32 to_length,
|
||||
CHARSET_INFO *to_cs,
|
||||
const char *from, uint32 from_length,
|
||||
CHARSET_INFO *from_cs, uint *errors)
|
||||
{
|
||||
int cnvres;
|
||||
my_wc_t wc;
|
||||
const uchar *from_end= (const uchar*) from + from_length;
|
||||
char *to_start= to;
|
||||
uchar *to_end= (uchar*) to + to_length;
|
||||
my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
|
||||
my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
|
||||
uint error_count= 0;
|
||||
|
||||
while (1)
|
||||
{
|
||||
if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
|
||||
from+= cnvres;
|
||||
else if (cnvres == MY_CS_ILSEQ)
|
||||
{
|
||||
error_count++;
|
||||
from++;
|
||||
wc= '?';
|
||||
}
|
||||
else if (cnvres > MY_CS_TOOSMALL)
|
||||
{
|
||||
/*
|
||||
A correct multibyte sequence detected
|
||||
But it doesn't have Unicode mapping.
|
||||
*/
|
||||
error_count++;
|
||||
from+= (-cnvres);
|
||||
wc= '?';
|
||||
}
|
||||
else
|
||||
break; // Not enough characters
|
||||
|
||||
outp:
|
||||
if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
|
||||
to+= cnvres;
|
||||
else if (cnvres == MY_CS_ILUNI && wc != '?')
|
||||
{
|
||||
error_count++;
|
||||
wc= '?';
|
||||
goto outp;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
*errors= error_count;
|
||||
return (uint32) (to - to_start);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Convert a string between two character sets.
|
||||
Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
|
||||
'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.
|
||||
|
||||
@param to[OUT] Store result here
|
||||
@param to_length Size of "to" buffer
|
||||
@param to_cs Character set of result string
|
||||
@param from Copy from here
|
||||
@param from_length Length of the "from" string
|
||||
@param from_cs Character set of the "from" string
|
||||
@param errors[OUT] Number of conversion errors
|
||||
|
||||
@return Number of bytes copied to 'to' string
|
||||
*/
|
||||
|
||||
uint32
|
||||
my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
||||
const char *from, uint32 from_length,
|
||||
CHARSET_INFO *from_cs, uint *errors)
|
||||
{
|
||||
uint32 length, length2;
|
||||
/*
|
||||
If any of the character sets is not ASCII compatible,
|
||||
immediately switch to slow mb_wc->wc_mb method.
|
||||
*/
|
||||
if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
|
||||
return my_convert_internal(to, to_length, to_cs,
|
||||
from, from_length, from_cs, errors);
|
||||
|
||||
length= length2= MY_MIN(to_length, from_length);
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
/*
|
||||
Special loop for i386, it allows to refer to a
|
||||
non-aligned memory block as UINT32, which makes
|
||||
it possible to copy four bytes at once. This
|
||||
gives about 10% performance improvement comparing
|
||||
to byte-by-byte loop.
|
||||
*/
|
||||
for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
|
||||
{
|
||||
if ((*(uint32*)from) & 0x80808080)
|
||||
break;
|
||||
*((uint32*) to)= *((const uint32*) from);
|
||||
}
|
||||
#endif /* __i386__ */
|
||||
|
||||
for (; ; *to++= *from++, length--)
|
||||
{
|
||||
if (!length)
|
||||
{
|
||||
*errors= 0;
|
||||
return length2;
|
||||
}
|
||||
if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
|
||||
{
|
||||
uint32 copied_length= length2 - length;
|
||||
to_length-= copied_length;
|
||||
from_length-= copied_length;
|
||||
return copied_length + my_convert_internal(to, to_length, to_cs,
|
||||
from, from_length, from_cs,
|
||||
errors);
|
||||
}
|
||||
}
|
||||
|
||||
DBUG_ASSERT(FALSE); // Should never get to here
|
||||
return 0; // Make compiler happy
|
||||
}
|
||||
|
Reference in New Issue
Block a user