mirror of
				https://github.com/MariaDB/server.git
				synced 2025-11-03 14:33:32 +03:00 
			
		
		
		
	use same (slightly unwieldy) name in all trees; fix before this version goes "public". bless ctype to avoid upmerge conflict, le sigh.
		
			
				
	
	
		
			408 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			408 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/* Copyright (C) 2000 MySQL AB
 | 
						|
 | 
						|
   This program is free software; you can redistribute it and/or modify
 | 
						|
   it under the terms of the GNU General Public License as published by
 | 
						|
   the Free Software Foundation; version 2 of the License.
 | 
						|
 | 
						|
   This program is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
   GNU General Public License for more details.
 | 
						|
 | 
						|
   You should have received a copy of the GNU General Public License
 | 
						|
   along with this program; if not, write to the Free Software
 | 
						|
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 | 
						|
 | 
						|
#include <my_global.h>
 | 
						|
#include <m_ctype.h>
 | 
						|
#include <my_xml.h>
 | 
						|
#ifndef SCO
 | 
						|
#include <m_string.h>
 | 
						|
#endif
 | 
						|
 | 
						|
 | 
						|
/*

  This file implements routines which parse XML based
  character set and collation description files.
  
  Unicode collations are encoded according to
  
    Unicode Technical Standard #35
    Locale Data Markup Language (LDML)
    http://www.unicode.org/reports/tr35/
  
  and converted into ICU string according to
  
    Collation Customization
    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
  
*/
 | 
						|
 | 
						|
/*
  Copy the first min(l1, l2) bytes of src into str and append a
  terminating NUL.

  str  destination; must have room for min(l1, l2) + 1 bytes
  src  source bytes
  l1   number of bytes available in src
  l2   maximum number of bytes to copy

  Returns str.
*/
static char *mstr(char *str,const char *src,size_t l1,size_t l2)
{
  size_t ncopy= l2;

  if (l1 < l2)
    ncopy= l1;
  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
 | 
						|
 | 
						|
/*
  One entry of the XML section lookup table: ties a full element
  path to the parser state that is handled for it.
*/
struct my_cs_file_section_st
{
  int        state;  /* one of the _CS_* state codes */
  const char *str;   /* element path, e.g. "charsets/charset/name" */
};
 | 
						|
 | 
						|
/*
  States attached to XML element paths in the section table.

  _CS_MISC marks elements that need no handling of their own.
  The value 0 is not used by any state, so it can stand for
  "path has no table entry".

  _CS_RESET, _CS_DIFF1, _CS_DIFF2 and _CS_DIFF3 must keep consecutive
  values: cs_value() uses (state - _CS_RESET) as an array index.
*/
enum
{
  _CS_MISC=        1,
  _CS_ID=          2,
  _CS_CSNAME=      3,
  _CS_FAMILY=      4,
  _CS_ORDER=       5,
  _CS_COLNAME=     6,
  _CS_FLAG=        7,
  _CS_CHARSET=     8,
  _CS_COLLATION=   9,
  _CS_UPPERMAP=    10,
  _CS_LOWERMAP=    11,
  _CS_UNIMAP=      12,
  _CS_COLLMAP=     13,
  _CS_CTYPEMAP=    14,
  _CS_PRIMARY_ID=  15,
  _CS_BINARY_ID=   16,
  _CS_CSDESCRIPT=  17,
  _CS_RESET=       18,
  _CS_DIFF1=       19,
  _CS_DIFF2=       20,
  _CS_DIFF3=       21
};
 | 
						|
 | 
						|
 | 
						|
static struct my_cs_file_section_st sec[] =
 | 
						|
{
 | 
						|
  {_CS_MISC,		"xml"},
 | 
						|
  {_CS_MISC,		"xml/version"},
 | 
						|
  {_CS_MISC,		"xml/encoding"},
 | 
						|
  {_CS_MISC,		"charsets"},
 | 
						|
  {_CS_MISC,		"charsets/max-id"},
 | 
						|
  {_CS_CHARSET,		"charsets/charset"},
 | 
						|
  {_CS_PRIMARY_ID,	"charsets/charset/primary-id"},
 | 
						|
  {_CS_BINARY_ID,	"charsets/charset/binary-id"},
 | 
						|
  {_CS_CSNAME,		"charsets/charset/name"},
 | 
						|
  {_CS_FAMILY,		"charsets/charset/family"},
 | 
						|
  {_CS_CSDESCRIPT,	"charsets/charset/description"},
 | 
						|
  {_CS_MISC,		"charsets/charset/alias"},
 | 
						|
  {_CS_MISC,		"charsets/charset/ctype"},
 | 
						|
  {_CS_CTYPEMAP,	"charsets/charset/ctype/map"},
 | 
						|
  {_CS_MISC,		"charsets/charset/upper"},
 | 
						|
  {_CS_UPPERMAP,	"charsets/charset/upper/map"},
 | 
						|
  {_CS_MISC,		"charsets/charset/lower"},
 | 
						|
  {_CS_LOWERMAP,	"charsets/charset/lower/map"},
 | 
						|
  {_CS_MISC,		"charsets/charset/unicode"},
 | 
						|
  {_CS_UNIMAP,		"charsets/charset/unicode/map"},
 | 
						|
  {_CS_COLLATION,	"charsets/charset/collation"},
 | 
						|
  {_CS_COLNAME,		"charsets/charset/collation/name"},
 | 
						|
  {_CS_ID,		"charsets/charset/collation/id"},
 | 
						|
  {_CS_ORDER,		"charsets/charset/collation/order"},
 | 
						|
  {_CS_FLAG,		"charsets/charset/collation/flag"},
 | 
						|
  {_CS_COLLMAP,		"charsets/charset/collation/map"},
 | 
						|
  {_CS_RESET,		"charsets/charset/collation/rules/reset"},
 | 
						|
  {_CS_DIFF1,		"charsets/charset/collation/rules/p"},
 | 
						|
  {_CS_DIFF2,		"charsets/charset/collation/rules/s"},
 | 
						|
  {_CS_DIFF3,		"charsets/charset/collation/rules/t"},
 | 
						|
  {0,	NULL}
 | 
						|
};
 | 
						|
 | 
						|
static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len)
 | 
						|
{
 | 
						|
  struct my_cs_file_section_st *s;
 | 
						|
  for (s=sec; s->str; s++)
 | 
						|
  {
 | 
						|
    if (!strncmp(attr,s->str,len))
 | 
						|
      return s;
 | 
						|
  }
 | 
						|
  return NULL;
 | 
						|
}
 | 
						|
 | 
						|
#define MY_CS_CSDESCR_SIZE	64
 | 
						|
#define MY_CS_TAILORING_SIZE	1024
 | 
						|
 | 
						|
typedef struct my_cs_file_info
 | 
						|
{
 | 
						|
  char   csname[MY_CS_NAME_SIZE];
 | 
						|
  char   name[MY_CS_NAME_SIZE];
 | 
						|
  uchar  ctype[MY_CS_CTYPE_TABLE_SIZE];
 | 
						|
  uchar  to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
 | 
						|
  uchar  to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
 | 
						|
  uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
 | 
						|
  uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
 | 
						|
  char   comment[MY_CS_CSDESCR_SIZE];
 | 
						|
  char   tailoring[MY_CS_TAILORING_SIZE];
 | 
						|
  size_t tailoring_length;
 | 
						|
  CHARSET_INFO cs;
 | 
						|
  int (*add_collation)(CHARSET_INFO *cs);
 | 
						|
} MY_CHARSET_LOADER;
 | 
						|
 | 
						|
 | 
						|
 | 
						|
/*
  Parse whitespace separated hexadecimal numbers into a byte array.

  a     destination array
  size  number of elements available in a
  str   text to parse
  len   length of str in bytes

  Tokens are separated by blanks, tabs, CR and LF. Parsing stops at
  the end of the text or as soon as size elements have been stored;
  further tokens are ignored. Each number is narrowed to uchar.

  Always returns 0.
*/
static int fill_uchar(uchar *a,uint size,const char *str, size_t len)
{
  uint i= 0;
  const char *s, *b, *e=str+len;

  for (s=str ; s < e ; i++)
  {
    /* skip separators, then find the end of the token that follows */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    /* ">=" and not ">": a[size] is already one past the last element */
    if (s == b || i >= size)
      break;
    a[i]= (uchar) strtoul(b,NULL,16);
  }
  return 0;
}
 | 
						|
 | 
						|
/*
  Parse whitespace separated hexadecimal numbers into a uint16 array.

  a     destination array
  size  number of elements available in a
  str   text to parse
  len   length of str in bytes

  Tokens are separated by blanks, tabs, CR and LF. Parsing stops at
  the end of the text or as soon as size elements have been stored;
  further tokens are ignored. Each number is narrowed to uint16.

  Always returns 0.
*/
static int fill_uint16(uint16 *a,uint size,const char *str, size_t len)
{
  uint i= 0;

  const char *s, *b, *e=str+len;
  for (s=str ; s < e ; i++)
  {
    /* skip separators, then find the end of the token that follows */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    /* ">=" and not ">": a[size] is already one past the last element */
    if (s == b || i >= size)
      break;
    a[i]= (uint16) strtol(b,NULL,16);
  }
  return 0;
}
 | 
						|
 | 
						|
 | 
						|
/*
  XML parser callback for entering an element.

  st    parser; its user_data points to the struct my_cs_file_info
  attr  path of the element being entered
  len   length of attr in bytes

  Entering a charset element clears the CHARSET_INFO under
  construction; entering a collation element discards the tailoring
  text collected so far.

  Always returns MY_XML_OK.
*/
static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr, len);

  if (section)
  {
    switch (section->state) {
    case _CS_CHARSET:
      bzero(&info->cs, sizeof(info->cs));
      break;
    case _CS_COLLATION:
      info->tailoring_length= 0;
      break;
    default:
      break;
    }
  }
  return MY_XML_OK;
}
 | 
						|
 | 
						|
 | 
						|
/*
  XML parser callback for leaving an element.

  st    parser; its user_data points to the struct my_cs_file_info
  attr  path of the element being left
  len   length of attr in bytes

  When a collation element ends and an add_collation callback was
  supplied, the collected CHARSET_INFO is handed to it and its result
  is returned. In every other case MY_XML_OK is returned.
*/
static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr, len);

  if (section && section->state == _CS_COLLATION && info->add_collation)
    return info->add_collation(&info->cs);
  return MY_XML_OK;
}
 | 
						|
 | 
						|
 | 
						|
/*
  XML parser callback for a text value.

  st    parser; st->attr holds the path of the current element and
        user_data points to the struct my_cs_file_info
  attr  the value text
  len   length of the value in bytes

  The state of the current element decides where the value goes:
  numeric ids and flags are stored in the CHARSET_INFO directly,
  names, the description and the conversion tables are copied into
  the buffers of the loader structure and then linked from the
  CHARSET_INFO, and LDML rule elements are appended to the tailoring
  string.

  Always returns MY_XML_OK.
*/
static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(st->attr,
                                                     strlen(st->attr));
  int state= section ? section->state : 0;

  switch (state) {
  /* numeric identifiers */
  case _CS_ID:
    info->cs.number= strtol(attr, (char**) NULL, 10);
    break;
  case _CS_BINARY_ID:
    info->cs.binary_number= strtol(attr, (char**) NULL, 10);
    break;
  case _CS_PRIMARY_ID:
    info->cs.primary_number= strtol(attr, (char**) NULL, 10);
    break;

  /* names and description, truncated to the size of their buffer */
  case _CS_COLNAME:
    info->cs.name= mstr(info->name, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSNAME:
    info->cs.csname= mstr(info->csname, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSDESCRIPT:
    info->cs.comment= mstr(info->comment, attr, len, MY_CS_CSDESCR_SIZE-1);
    break;

  /* collation flags */
  case _CS_FLAG:
    if (!strncmp("primary", attr, len))
      info->cs.state|= MY_CS_PRIMARY;
    else if (!strncmp("binary", attr, len))
      info->cs.state|= MY_CS_BINSORT;
    else if (!strncmp("compiled", attr, len))
      info->cs.state|= MY_CS_COMPILED;
    break;

  /* conversion tables */
  case _CS_UPPERMAP:
    fill_uchar(info->to_upper, MY_CS_TO_UPPER_TABLE_SIZE, attr, len);
    info->cs.to_upper= info->to_upper;
    break;
  case _CS_LOWERMAP:
    fill_uchar(info->to_lower, MY_CS_TO_LOWER_TABLE_SIZE, attr, len);
    info->cs.to_lower= info->to_lower;
    break;
  case _CS_UNIMAP:
    fill_uint16(info->tab_to_uni, MY_CS_TO_UNI_TABLE_SIZE, attr, len);
    info->cs.tab_to_uni= info->tab_to_uni;
    break;
  case _CS_COLLMAP:
    fill_uchar(info->sort_order, MY_CS_SORT_ORDER_TABLE_SIZE, attr, len);
    info->cs.sort_order= info->sort_order;
    break;
  case _CS_CTYPEMAP:
    fill_uchar(info->ctype, MY_CS_CTYPE_TABLE_SIZE, attr, len);
    info->cs.ctype= info->ctype;
    break;

  case _CS_RESET:
  case _CS_DIFF1:
  case _CS_DIFF2:
  case _CS_DIFF3:
  {
    /*
      Convert collation description from
      Locale Data Markup Language (LDML)
      into ICU Collation Customization expression.
    */
    char arg[16];
    const char *cmd[]= {"&","<","<<","<<<"};

    info->cs.tailoring= info->tailoring;
    mstr(arg, attr, len, sizeof(arg)-1);
    /*
      One rule takes at most 20 characters plus the NUL:
      blank, operator (up to 3), blank, argument (up to 15).
      Rules that would not fit are dropped.
    */
    if (info->tailoring_length + 20 < sizeof(info->tailoring))
    {
      char *dst= info->tailoring + info->tailoring_length;
      info->tailoring_length+= sprintf(dst, " %s %s",
                                       cmd[state-_CS_RESET], arg);
    }
    break;
  }

  default:
    break;
  }
  return MY_XML_OK;
}
 | 
						|
 | 
						|
 | 
						|
my_bool my_parse_charset_xml(const char *buf, size_t len,
 | 
						|
                             int (*add_collation)(CHARSET_INFO *cs))
 | 
						|
{
 | 
						|
  MY_XML_PARSER p;
 | 
						|
  struct my_cs_file_info i;
 | 
						|
  my_bool rc;
 | 
						|
  
 | 
						|
  my_xml_parser_create(&p);
 | 
						|
  my_xml_set_enter_handler(&p,cs_enter);
 | 
						|
  my_xml_set_value_handler(&p,cs_value);
 | 
						|
  my_xml_set_leave_handler(&p,cs_leave);
 | 
						|
  i.add_collation= add_collation;
 | 
						|
  my_xml_set_user_data(&p,(void*)&i);
 | 
						|
  rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
 | 
						|
  my_xml_parser_free(&p);
 | 
						|
  return rc;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/*
 | 
						|
  Check repertoire: detect pure ascii strings
 | 
						|
*/
 | 
						|
uint
 | 
						|
my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong length)
 | 
						|
{
 | 
						|
  const char *strend= str + length;
 | 
						|
  if (cs->mbminlen == 1)
 | 
						|
  {
 | 
						|
    for ( ; str < strend; str++)
 | 
						|
    {
 | 
						|
      if (((uchar) *str) > 0x7F)
 | 
						|
        return MY_REPERTOIRE_UNICODE30;
 | 
						|
    }
 | 
						|
  }
 | 
						|
  else
 | 
						|
  {
 | 
						|
    my_wc_t wc;
 | 
						|
    int chlen;
 | 
						|
    for (;
 | 
						|
         (chlen= cs->cset->mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0;
 | 
						|
         str+= chlen)
 | 
						|
    {
 | 
						|
      if (wc > 0x7F)
 | 
						|
        return MY_REPERTOIRE_UNICODE30;
 | 
						|
    }
 | 
						|
  }
 | 
						|
  return MY_REPERTOIRE_ASCII;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/*
 | 
						|
  Returns repertoire for charset
 | 
						|
*/
 | 
						|
uint my_charset_repertoire(CHARSET_INFO *cs)
 | 
						|
{
 | 
						|
  return cs->state & MY_CS_PUREASCII ?
 | 
						|
    MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/*
 | 
						|
  Detect whether a character set is ASCII compatible.
 | 
						|
 | 
						|
  Returns TRUE for:
 | 
						|
  
 | 
						|
  - all 8bit character sets whose Unicode mapping of 0x7B is '{'
 | 
						|
    (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
 | 
						|
  
 | 
						|
  - all multi-byte character sets having mbminlen == 1
 | 
						|
    (ignores ucs2 whose mbminlen is 2)
 | 
						|
  
 | 
						|
  TODO:
 | 
						|
  
 | 
						|
  When merging to 5.2, this function should be changed
 | 
						|
  to check a new flag MY_CS_NONASCII, 
 | 
						|
  
 | 
						|
     return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
 | 
						|
  
 | 
						|
  This flag was previously added into 5.2 under terms
 | 
						|
  of WL#3759 "Optimize identifier conversion in client-server protocol"
 | 
						|
  especially to mark character sets not compatible with ASCII.
 | 
						|
  
 | 
						|
  We won't backport this flag to 5.0 or 5.1.
 | 
						|
  This function is Ok for 5.0 and 5.1, because we're not going
 | 
						|
  to introduce new tricky character sets between 5.0 and 5.2.
 | 
						|
*/
 | 
						|
my_bool
 | 
						|
my_charset_is_ascii_based(CHARSET_INFO *cs)
 | 
						|
{
 | 
						|
  return 
 | 
						|
    (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
 | 
						|
    (cs->mbminlen == 1 && cs->mbmaxlen > 1);
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
/*
 | 
						|
  Detect if a character set is 8bit,
 | 
						|
  and it is pure ascii, i.e. doesn't have
 | 
						|
  characters outside U+0000..U+007F
 | 
						|
  This functions is shared between "conf_to_src"
 | 
						|
  and dynamic charsets loader in "mysqld".
 | 
						|
*/
 | 
						|
my_bool
 | 
						|
my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs)
 | 
						|
{
 | 
						|
  size_t code;
 | 
						|
  if (!cs->tab_to_uni)
 | 
						|
    return 0;
 | 
						|
  for (code= 0; code < 256; code++)
 | 
						|
  {
 | 
						|
    if (cs->tab_to_uni[code] > 0x7F)
 | 
						|
      return 0;
 | 
						|
  }
 | 
						|
  return 1;
 | 
						|
}
 |