Merging my_convert() from 10.0-serg

modified: include/m_ctype.h mysys/ma_dyncol.c mysys/string.c sql/sql_string.cc sql/sql_string.h strings/ctype.c
2025-07-29 05:21:33 +03:00 · 2013-08-01 17:03:15 +04:00
parent 5f6380adde
commit 2404456608
6 changed files with 161 additions and 148 deletions
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@ -591,6 +591,10 @@ my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs);
 extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
                              const char* fmt, va_list ap);
 uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
                  const char *from, uint32 from_length,
                  CHARSET_INFO *from_cs, uint *errors);
 #define	_MY_U	01	/* Upper case */
 #define	_MY_L	02	/* Lower case */
 #define	_MY_NMR	04	/* Numeral (digit) */
--- a/mysys/ma_dyncol.c
+++ b/mysys/ma_dyncol.c
@ -3853,7 +3853,7 @@ mariadb_dyncol_val_str(DYNAMIC_STRING *str, DYNAMIC_COLUMN_VALUE *val,
          if (!quote)
          {
            /* convert to the destination */
-            str->length+= copy_and_convert_extended(str->str, bufflen,
+            str->length+= my_convert(str->str, bufflen,
                                     cs,
                                     from, (uint32)len,
                                     val->x.string.charset,
@ -3862,8 +3862,7 @@ mariadb_dyncol_val_str(DYNAMIC_STRING *str, DYNAMIC_COLUMN_VALUE *val,
          }
          if ((alloc= (char *)my_malloc(bufflen, MYF(0))))
          {
-            len=
+            len= my_convert(alloc, bufflen, cs,
              copy_and_convert_extended(alloc, bufflen, cs,
                            from, (uint32)len,
                            val->x.string.charset,
                            &dummy_errors);
--- a/mysys/string.c
+++ b/mysys/string.c
@ -223,77 +223,3 @@ void dynstr_reassociate(DYNAMIC_STRING *str, char **ptr, size_t *length,
  *alloc_length= str->max_length;
  str->str=0;
 }
 /*
   Convert a string from one character set to another, one character
   at a time, going through the wide character (my_wc_t) value.

   SYNOPSIS
     copy_and_convert_extended()
     to             Store result here
     to_length      Size of the 'to' buffer, in bytes
     to_cs          Character set of result string
     from           Copy from here
     from_length    Length of from string, in bytes
     from_cs        From character set
     errors         OUT  Number of characters that were replaced by '?'

   NOTES
     'to' must be big enough as from_length * to_cs->mbmaxlen.

     A source byte that is reported as MY_CS_ILSEQ, a source sequence
     that has no Unicode mapping, and a character that the destination
     character set cannot store (MY_CS_ILUNI) are each written as '?'
     and counted in *errors.

     The loop ends when the source converter reports that no further
     character can be read, or when the destination converter cannot
     store the current character (and '?' either).

   RETURN
     length of bytes copied to 'to'
 */

 uint32
 copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
                           const char *from, uint32 from_length,
                           CHARSET_INFO *from_cs,
                           uint *errors)
 {
   const uchar *src_end= (const uchar*) from + from_length;
   uchar *dst_end= (uchar*) to + to_length;
   char *dst_begin= to;
   my_charset_conv_mb_wc decode= from_cs->cset->mb_wc;
   my_charset_conv_wc_mb encode= to_cs->cset->wc_mb;
   uint bad_chars= 0;
   my_wc_t wc;

   for (;;)
   {
     int consumed= (*decode)(from_cs, &wc, (uchar*) from, src_end);
     int produced;

     if (consumed <= 0)
     {
       if (consumed == MY_CS_ILSEQ)
         consumed= 1;                  /* Skip one offending byte */
       else if (consumed > MY_CS_TOOSMALL)
       {
         /*
           A correct multibyte sequence detected
           But it doesn't have Unicode mapping.
           Its length is returned negated.
         */
         consumed= -consumed;
       }
       else
         break;                        /* Not enough characters */
       bad_chars++;
       wc= '?';
     }
     from+= consumed;

     produced= (*encode)(to_cs, wc, (uchar*) to, dst_end);
     if (produced <= 0 && produced == MY_CS_ILUNI && wc != '?')
     {
       /* The character cannot be stored in to_cs: store '?' instead */
       bad_chars++;
       wc= '?';
       produced= (*encode)(to_cs, wc, (uchar*) to, dst_end);
     }
     if (produced <= 0)
       break;
     to+= produced;
   }

   *errors= bad_chars;
   return (uint32) (to - dst_begin);
 }
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@ -776,67 +776,6 @@ String *copy_if_not_alloced(String *to,String *from,uint32 from_length)
  Help functions
 ****************************************************************************/
 /*
   Convert a string between two character sets.

   Optimized for quick copying of ASCII characters in the range 0x00..0x7F:
   while both character sets are ASCII compatible, bytes up to 0x7F are
   copied as is. The first byte above 0x7F hands the rest of the string
   over to copy_and_convert_extended().

   At most min(to_length, from_length) bytes are handled by the quick copy.
   *errors is set to 0 when the whole string was copied that way, otherwise
   it is set by copy_and_convert_extended().

   Returns the number of bytes written to 'to'.
 */

 uint32
 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, 
                  const char *from, uint32 from_length, CHARSET_INFO *from_cs,
                  uint *errors)
 {
   /*
     The quick copy is only valid when both character sets are
     ASCII compatible; otherwise use the mb_wc->wc_mb method right away.
   */
   if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
     return copy_and_convert_extended(to, to_length, to_cs,
                                      from, from_length, from_cs, errors);

   const uint32 total= min(to_length, from_length);
   uint32 remaining= total;

 #if defined(__i386__) || defined(__x86_64__)
   /*
     On i386/x86_64 a non-aligned memory block can be referred to as
     UINT32, so four bytes are tested and copied at once as long as
     none of them has its high bit set. The original author reports
     about 10% improvement over the byte-by-byte loop.
   */
   while (remaining >= 4 && !((*(uint32*)from) & 0x80808080))
   {
     *((uint32*) to)= *((const uint32*) from);
     remaining-= 4;
     from+= 4;
     to+= 4;
   }
 #endif

   while (remaining)
   {
     if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
     {
       /* Convert the rest of the string character by character */
       uint32 done= total - remaining;
       return done + copy_and_convert_extended(to, to_length - done,
                                               to_cs,
                                               from, from_length - done,
                                               from_cs,
                                               errors);
     }
     *to++= *from++;
     remaining--;
   }

   *errors= 0;
   return total;
 }
 /**
  Copy string with HEX-encoding of "bad" characters.
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@ -34,9 +34,13 @@ typedef struct st_mem_root MEM_ROOT;
 int sortcmp(const String *a,const String *b, CHARSET_INFO *cs);
 String *copy_if_not_alloced(String *a,String *b,uint32 arg_length);
-uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
+inline uint32 copy_and_convert(char *to, uint32 to_length,
                               const CHARSET_INFO *to_cs,
                               const char *from, uint32 from_length,
-			CHARSET_INFO *from_cs, uint *errors);
+                               const CHARSET_INFO *from_cs, uint *errors)
 {
  return my_convert(to, to_length, to_cs, from, from_length, from_cs, errors);
 }
 uint32 well_formed_copy_nchars(CHARSET_INFO *to_cs,
                               char *to, uint to_length,
                               CHARSET_INFO *from_cs,
--- a/strings/ctype.c
+++ b/strings/ctype.c
@ -430,3 +430,144 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs)
  }
  return 1;
 }
 /*
   Convert a string between two character sets, character by character,
   using from_cs->cset->mb_wc to read and to_cs->cset->wc_mb to write.

   'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.

   Every source byte reported as MY_CS_ILSEQ, every source sequence without
   Unicode mapping and every character that cannot be stored in to_cs
   (MY_CS_ILUNI) is written as '?' and counted as one conversion error.

   @param  to[OUT]       Store result here
   @param  to_length     Size of "to" buffer
   @param  to_cs         Character set of result string
   @param  from          Copy from here
   @param  from_length   Length of the "from" string
   @param  from_cs       Character set of the "from" string
   @param  errors[OUT]   Number of conversion errors

   @return Number of bytes copied to 'to' string
 */

 static uint32
 my_convert_internal(char *to, uint32 to_length,
                     CHARSET_INFO *to_cs,
                     const char *from, uint32 from_length,
                     CHARSET_INFO *from_cs, uint *errors)
 {
   my_charset_conv_mb_wc read_char= from_cs->cset->mb_wc;
   my_charset_conv_wc_mb write_char= to_cs->cset->wc_mb;
   const uchar *src_limit= (const uchar*) from + from_length;
   uchar *dst_limit= (uchar*) to + to_length;
   char *dst_origin= to;
   uint nerrors= 0;
   my_wc_t wc;

   for (;;)
   {
     int nread= (*read_char)(from_cs, &wc, (uchar*) from, src_limit);
     int nwritten;

     if (nread <= 0)
     {
       if (nread == MY_CS_ILSEQ)
         nread= 1;                     /* Step over the bad byte */
       else if (nread > MY_CS_TOOSMALL)
       {
         /*
           A correct multibyte sequence detected
           But it doesn't have Unicode mapping.
           The negated result is the length to step over.
         */
         nread= -nread;
       }
       else
         break;                        /* Not enough characters */
       nerrors++;
       wc= '?';
     }
     from+= nread;

     nwritten= (*write_char)(to_cs, wc, (uchar*) to, dst_limit);
     if (nwritten <= 0 && nwritten == MY_CS_ILUNI && wc != '?')
     {
       /* No such character in to_cs: fall back to '?', tried only once */
       nerrors++;
       wc= '?';
       nwritten= (*write_char)(to_cs, wc, (uchar*) to, dst_limit);
     }
     if (nwritten <= 0)
       break;
     to+= nwritten;
   }

   *errors= nerrors;
   return (uint32) (to - dst_origin);
 }
 /*
   Convert a string between two character sets.
   Optimized for quick copying of ASCII characters in the range 0x00..0x7F.

   'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.

   While both character sets are ASCII compatible, bytes up to 0x7F are
   copied as is (at most MY_MIN(to_length, from_length) of them). The first
   byte above 0x7F makes the rest of the string go through
   my_convert_internal().

   @param  to[OUT]       Store result here
   @param  to_length     Size of "to" buffer
   @param  to_cs         Character set of result string
   @param  from          Copy from here
   @param  from_length   Length of the "from" string
   @param  from_cs       Character set of the "from" string
   @param  errors[OUT]   Number of conversion errors

   @return Number of bytes copied to 'to' string
 */

 uint32
 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
            const char *from, uint32 from_length,
            CHARSET_INFO *from_cs, uint *errors)
 {
   uint32 total, remaining;

   /*
     The quick copy is only valid when both character sets are
     ASCII compatible; otherwise use the mb_wc->wc_mb method right away.
   */
   if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
     return my_convert_internal(to, to_length, to_cs,
                                from, from_length, from_cs, errors);

   total= remaining= MY_MIN(to_length, from_length);

 #if defined(__i386__) || defined(__x86_64__)
   /*
     On i386/x86_64 a non-aligned memory block can be referred to as
     UINT32, so four bytes are tested and copied at once as long as
     none of them has its high bit set. The original author reports
     about 10% improvement over the byte-by-byte loop.
   */
   while (remaining >= 4 && !((*(uint32*)from) & 0x80808080))
   {
     *((uint32*) to)= *((const uint32*) from);
     remaining-= 4;
     from+= 4;
     to+= 4;
   }
 #endif /* __i386__ */

   while (remaining)
   {
     uint32 done;

     if (*((unsigned char*) from) <= 0x7F) /* An ASCII character */
     {
       *to++= *from++;
       remaining--;
       continue;
     }

     /* A non-ASCII character: convert the rest character by character */
     done= total - remaining;
     return done + my_convert_internal(to, to_length - done, to_cs,
                                       from, from_length - done, from_cs,
                                       errors);
   }

   *errors= 0;
   return total;
 }