diff --git a/include/m_ctype.h b/include/m_ctype.h index 95b520e4ee9..7483e8f53a6 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -591,6 +591,10 @@ my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs); extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n, const char* fmt, va_list ap); +uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, + const char *from, uint32 from_length, + CHARSET_INFO *from_cs, uint *errors); /* charset converter with an ASCII fast path, defined in strings/ctype.c */ + #define _MY_U 01 /* Upper case */ #define _MY_L 02 /* Lower case */ #define _MY_NMR 04 /* Numeral (digit) */ diff --git a/mysys/ma_dyncol.c b/mysys/ma_dyncol.c index 71ceceaf162..1d297d918da 100644 --- a/mysys/ma_dyncol.c +++ b/mysys/ma_dyncol.c @@ -3853,20 +3853,19 @@ mariadb_dyncol_val_str(DYNAMIC_STRING *str, DYNAMIC_COLUMN_VALUE *val, if (!quote) { /* convert to the destination */ - str->length+= copy_and_convert_extended(str->str, bufflen, - cs, - from, (uint32)len, - val->x.string.charset, - &dummy_errors); + str->length+= my_convert(str->str, bufflen, + cs, + from, (uint32)len, + val->x.string.charset, + &dummy_errors); /* error count is not examined before returning */ return ER_DYNCOL_OK; } if ((alloc= (char *)my_malloc(bufflen, MYF(0)))) { - len= - copy_and_convert_extended(alloc, bufflen, cs, - from, (uint32)len, - val->x.string.charset, - &dummy_errors); + len= my_convert(alloc, bufflen, cs, + from, (uint32)len, + val->x.string.charset, + &dummy_errors); from= alloc; } else diff --git a/mysys/string.c b/mysys/string.c index 1263e7824f9..42fe83ed4e1 100644 --- a/mysys/string.c +++ b/mysys/string.c @@ -223,77 +223,3 @@ void dynstr_reassociate(DYNAMIC_STRING *str, char **ptr, size_t *length, *alloc_length= str->max_length; str->str=0; } - - -/* - copy a string from one character set to another - - SYNOPSIS - copy_and_convert() - to Store result here - to_cs Character set of result string - from Copy from here - from_length Length of from string - from_cs From character set - - NOTES - 'to' must be big enough as form_length * to_cs->mbmaxlen - - RETURN 
- length of bytes copied to 'to' -*/ - -uint32 -copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs, - const char *from, uint32 from_length, - CHARSET_INFO *from_cs, - uint *errors) -{ - int cnvres; - my_wc_t wc; - const uchar *from_end= (const uchar*) from+from_length; - char *to_start= to; - uchar *to_end= (uchar*) to+to_length; - my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc; - my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb; - uint error_count= 0; - - while (1) - { - if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, - from_end)) > 0) - from+= cnvres; - else if (cnvres == MY_CS_ILSEQ) - { - error_count++; - from++; - wc= '?'; - } - else if (cnvres > MY_CS_TOOSMALL) - { - /* - A correct multibyte sequence detected - But it doesn't have Unicode mapping. - */ - error_count++; - from+= (-cnvres); - wc= '?'; - } - else - break; // Not enough characters - -outp: - if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) - to+= cnvres; - else if (cnvres == MY_CS_ILUNI && wc != '?') - { - error_count++; - wc= '?'; - goto outp; - } - else - break; - } - *errors= error_count; - return (uint32) (to - to_start); -} diff --git a/sql/sql_string.cc b/sql/sql_string.cc index f1cb5e07eca..014efd48065 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -776,67 +776,6 @@ String *copy_if_not_alloced(String *to,String *from,uint32 from_length) Help functions ****************************************************************************/ - - -/* - Optimized for quick copying of ASCII characters in the range 0x00..0x7F. -*/ -uint32 -copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, - const char *from, uint32 from_length, CHARSET_INFO *from_cs, - uint *errors) -{ - /* - If any of the character sets is not ASCII compatible, - immediately switch to slow mb_wc->wc_mb method. 
- */ - if ((to_cs->state | from_cs->state) & MY_CS_NONASCII) - return copy_and_convert_extended(to, to_length, to_cs, - from, from_length, from_cs, errors); - - uint32 length= min(to_length, from_length), length2= length; - -#if defined(__i386__) || defined(__x86_64__) - /* - Special loop for i386, it allows to refer to a - non-aligned memory block as UINT32, which makes - it possible to copy four bytes at once. This - gives about 10% performance improvement comparing - to byte-by-byte loop. - */ - for ( ; length >= 4; length-= 4, from+= 4, to+= 4) - { - if ((*(uint32*)from) & 0x80808080) - break; - *((uint32*) to)= *((const uint32*) from); - } -#endif - - for (; ; *to++= *from++, length--) - { - if (!length) - { - *errors= 0; - return length2; - } - if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */ - { - uint32 copied_length= length2 - length; - to_length-= copied_length; - from_length-= copied_length; - return copied_length + copy_and_convert_extended(to, to_length, - to_cs, - from, from_length, - from_cs, - errors); - } - } - - DBUG_ASSERT(FALSE); // Should never get to here - return 0; // Make compiler happy -} - - /** Copy string with HEX-encoding of "bad" characters. 
diff --git a/sql/sql_string.h b/sql/sql_string.h index 1979ac6e4af..352dfbe9fa3 100644 --- a/sql/sql_string.h +++ b/sql/sql_string.h @@ -34,9 +34,13 @@ typedef struct st_mem_root MEM_ROOT; int sortcmp(const String *a,const String *b, CHARSET_INFO *cs); String *copy_if_not_alloced(String *a,String *b,uint32 arg_length); -uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, - const char *from, uint32 from_length, - CHARSET_INFO *from_cs, uint *errors); +inline uint32 copy_and_convert(char *to, uint32 to_length, + const CHARSET_INFO *to_cs, + const char *from, uint32 from_length, + const CHARSET_INFO *from_cs, uint *errors) +{ /* forwards every argument unchanged to my_convert() */ + return my_convert(to, to_length, to_cs, from, from_length, from_cs, errors); +} uint32 well_formed_copy_nchars(CHARSET_INFO *to_cs, char *to, uint to_length, CHARSET_INFO *from_cs, diff --git a/strings/ctype.c b/strings/ctype.c index 23f18b6617b..b71d7dee4c4 100644 --- a/strings/ctype.c +++ b/strings/ctype.c @@ -430,3 +430,144 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs) } return 1; } + + +/* + Convert a string between two character sets. + 'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes. 
+ + @param to[OUT] Store result here + @param to_length Size of "to" buffer + @param to_cs Character set of result string + @param from Copy from here + @param from_length Length of the "from" string + @param from_cs Character set of the "from" string + @param errors[OUT] Number of conversion errors + + @return Number of bytes copied to 'to' string +*/ + +static uint32 +my_convert_internal(char *to, uint32 to_length, + CHARSET_INFO *to_cs, + const char *from, uint32 from_length, + CHARSET_INFO *from_cs, uint *errors) +{ + int cnvres; + my_wc_t wc; + const uchar *from_end= (const uchar*) from + from_length; + char *to_start= to; + uchar *to_end= (uchar*) to + to_length; + my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc; + my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb; + uint error_count= 0; + + while (1) + { + if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0) + from+= cnvres; + else if (cnvres == MY_CS_ILSEQ) + { + error_count++; + from++; /* step over a single input byte */ + wc= '?'; + } + else if (cnvres > MY_CS_TOOSMALL) + { + /* + A correct multibyte sequence detected + But it doesn't have Unicode mapping. + */ + error_count++; + from+= (-cnvres); + wc= '?'; + } + else + break; // Not enough characters + +outp: + if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) + to+= cnvres; + else if (cnvres == MY_CS_ILUNI && wc != '?') + { + error_count++; + wc= '?'; + goto outp; /* retry the output step with the replacement character */ + } + else + break; + } + *errors= error_count; + return (uint32) (to - to_start); +} + + +/* + Convert a string between two character sets. + Optimized for quick copying of ASCII characters in the range 0x00..0x7F. + 'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes. 
+ + @param to[OUT] Store result here + @param to_length Size of "to" buffer + @param to_cs Character set of result string + @param from Copy from here + @param from_length Length of the "from" string + @param from_cs Character set of the "from" string + @param errors[OUT] Number of conversion errors + + @return Number of bytes copied to 'to' string +*/ + +uint32 +my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, + const char *from, uint32 from_length, + CHARSET_INFO *from_cs, uint *errors) +{ + uint32 length, length2; + /* + If any of the character sets is not ASCII compatible, + immediately switch to slow mb_wc->wc_mb method. + */ + if ((to_cs->state | from_cs->state) & MY_CS_NONASCII) + return my_convert_internal(to, to_length, to_cs, + from, from_length, from_cs, errors); + + length= length2= MY_MIN(to_length, from_length); /* the fast path copies at most this many bytes */ + +#if defined(__i386__) || defined(__x86_64__) + /* + Special loop for i386 and x86_64, it allows to refer to a + non-aligned memory block as UINT32, which makes + it possible to copy four bytes at once. This + gives about 10% performance improvement comparing + to byte-by-byte loop. + */ + for ( ; length >= 4; length-= 4, from+= 4, to+= 4) + { + if ((*(uint32*)from) & 0x80808080) /* at least one of the four bytes has its high bit set */ + break; + *((uint32*) to)= *((const uint32*) from); + } +#endif /* __i386__ || __x86_64__ */ + + for (; ; *to++= *from++, length--) + { + if (!length) + { + *errors= 0; + return length2; + } + if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */ + { + uint32 copied_length= length2 - length; /* bytes already copied by the fast path */ + to_length-= copied_length; + from_length-= copied_length; + return copied_length + my_convert_internal(to, to_length, to_cs, + from, from_length, from_cs, + errors); + } + } + + DBUG_ASSERT(FALSE); // Should never get here + return 0; // Make compiler happy +}