Allocate memory when a character set is requested:

- For simple character sets: from_uni convertion table. - For UCA: alternative weight arrays. Use mbminlen instead of MY_CS_NONTEXT
2026-01-06 05:22:24 +03:00 · 2004-06-11 16:29:16 +05:00
parent 4047b5ade3
commit 5275eb21c2
22 changed files with 608 additions and 560 deletions
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -22,354 +22,6 @@
 #include <my_xml.h>


-/*
-  Collation language is implemented according to
-  subset of ICU Collation Customization (tailorings):
-  http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
-  
-  Collation language elements:
-  Delimiters:
-    space   - skipped
-  
-  <char> :=  A-Z | a-z | \uXXXX
-  
-  Shift command:
-    <shift>  := &       - reset at this letter. 
-  
-  Diff command:
-    <d1> :=  <     - Identifies a primary difference.
-    <d2> :=  <<    - Identifies a secondary difference.
-    <d3> := <<<    - Idenfifies a tertiary difference.
-  
-  
-  Collation rules:
-    <ruleset> :=  <rule>  { <ruleset> }
-    
-    <rule> :=   <d1>    <string>
-              | <d2>    <string>
-              | <d3>    <string>
-              | <shift> <char>
-    
-    <string> := <char> [ <string> ]
-
-  An example, Polish collation:
-  
-    &A < \u0105 <<< \u0104
-    &C < \u0107 <<< \u0106
-    &E < \u0119 <<< \u0118
-    &L < \u0142 <<< \u0141
-    &N < \u0144 <<< \u0143
-    &O < \u00F3 <<< \u00D3
-    &S < \u015B <<< \u015A
-    &Z < \u017A <<< \u017B    
-*/
-
-
-typedef enum my_coll_lexem_num_en
-{
-  MY_COLL_LEXEM_EOF	= 0,
-  MY_COLL_LEXEM_DIFF	= 1, 
-  MY_COLL_LEXEM_SHIFT	= 4,
-  MY_COLL_LEXEM_CHAR	= 5,
-  MY_COLL_LEXEM_ERROR	= 6
-} my_coll_lexem_num;
-
-
-typedef struct my_coll_lexem_st
-{
-  const char *beg;
-  const char *end;
-  const char *prev;
-  int   diff;
-  int   code;
-} MY_COLL_LEXEM;
-
-
-/*
-  Initialize collation rule lexical anilizer
-  
-  SYNOPSIS
-    my_coll_lexem_init
-    lexem                Lex analizer to init
-    str                  Const string to parse
-    strend               End of the string
-  USAGE
-  
-  RETURN VALUES
-    N/A
-*/
-
-static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
-                               const char *str, const char *strend)
-{
-  lexem->beg= str;
-  lexem->prev= str;
-  lexem->end= strend;
-  lexem->diff= 0;
-  lexem->code= 0;
-}
-
-
-/*
-  Print collation customization expression parse error, with context.
-  
-  SYNOPSIS
-    my_coll_lexem_print_error
-    lexem                Lex analizer to take context from
-    errstr               sting to write error to
-    errsize              errstr size
-    txt                  error message
-  USAGE
-  
-  RETURN VALUES
-    N/A
-*/
-
-static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
-                                      char *errstr, size_t errsize,
-                                      const char *txt)
-{
-  char tail[30];
-  size_t len= lexem->end - lexem->prev;
-  strmake (tail, lexem->prev, min(len, sizeof(tail)-1));
-  errstr[errsize-1]= '\0';
-  my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail);
-}
-
-
-/*
-  Convert a hex digit into its numeric value
-  
-  SYNOPSIS
-    ch2x
-    ch                   hex digit to convert
-  USAGE
-  
-  RETURN VALUES
-    an integer value in the range 0..15
-    -1 on error
-*/
-
-static int ch2x(int ch)
-{
-  if (ch >= '0' && ch <= '9')
-    return ch - '0';
-  
-  if (ch >= 'a' && ch <= 'f')
-    return 10 + ch - 'a';
-  
-  if (ch >= 'A' && ch <= 'F')
-    return 10 + ch - 'A';
-  
-  return -1;
-}
-
-
-/*
-  Collation language lexical parser:
-  Scans the next lexem.
-  
-  SYNOPSIS
-    my_coll_lexem_next
-    lexem                Lex analizer, previously initialized by 
-                         my_coll_lexem_init.
-  USAGE
-    Call this function in a loop
-    
-  RETURN VALUES
-    Lexem number: eof, diff, shift, char or error.
-*/
-
-static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
-{
-  for ( ;lexem->beg < lexem->end ; lexem->beg++)
-  {
-    lexem->prev= lexem->beg;
-    if (lexem->beg[0] == ' '  || lexem->beg[0] == '\t' || 
-        lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
-      continue;
-    
-    if (lexem->beg[0] == '&')
-    {
-      lexem->beg++;
-      return MY_COLL_LEXEM_SHIFT;
-    }
-    
-    if (lexem->beg[0] == '<')
-    {
-      for (lexem->beg++, lexem->diff=1; 
-           (lexem->beg < lexem->end) && 
-           (lexem->beg[0] == '<') && (lexem->diff<3);
-           lexem->beg++, lexem->diff++);
-        return MY_COLL_LEXEM_DIFF;
-    }
-    
-    if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
-        (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
-    {
-      lexem->code= lexem->beg[0];
-      lexem->beg++;
-      return MY_COLL_LEXEM_CHAR;
-    }
-    
-    if ((lexem->beg[0] == '\\') && 
-        (lexem->beg+2 < lexem->end) && 
-        (lexem->beg[1] == 'u'))
-    {
-      int ch;
-      
-      lexem->code= 0;
-      for (lexem->beg+=2; 
-           (lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ; 
-           lexem->beg++)
-      {
-        lexem->code= (lexem->code << 4) + ch;
-      }
-      return MY_COLL_LEXEM_CHAR;
-    }
-    
-    return MY_COLL_LEXEM_ERROR;
-  }
-  return MY_COLL_LEXEM_EOF;
-}
-
-
-/*
-  Collation rule item
-*/
-
-typedef struct my_coll_rule_item_st
-{
-  uint base;     /* Base character                             */
-  uint curr;     /* Current character                          */
-  int diff[3];   /* Primary, Secondary and Tertiary difference */
-} MY_COLL_RULE;
-
-
-/*
-  Collation language syntax parser.
-  Uses lexical parser.
-  
-  SYNOPSIS
-    my_coll_rule_parse
-    rule                 Collation rule list to load to.
-    str                  A string containin collation language expression.
-    strend               End of the string.
-  USAGE
-    
-  RETURN VALUES
-    0 - OK
-    1 - ERROR, e.g. too many items.
-*/
-
-static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
-                              const char *str, const char *strend,
-                              char *errstr, size_t errsize)
-{
-  MY_COLL_LEXEM lexem;
-  my_coll_lexem_num lexnum;
-  my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR;
-  MY_COLL_RULE item; 
-  int state= 0;
-  size_t nitems= 0;
-  
-  /* Init all variables */
-  errstr[0]= '\0';
-  bzero(&item, sizeof(item));
-  my_coll_lexem_init(&lexem, str, strend);
-  
-  while ((lexnum= my_coll_lexem_next(&lexem)))
-  {
-    if (lexnum == MY_COLL_LEXEM_ERROR)
-    {
-      my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character");
-      return -1;
-    }
-    
-    switch (state) {
-    case 0:
-      if (lexnum != MY_COLL_LEXEM_SHIFT)
-      {
-        my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected");
-        return -1;
-      }
-      prevlexnum= lexnum;
-      state= 2;
-      continue;
-      
-    case 1:
-      if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF)
-      {
-        my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected");
-        return -1;
-      }
-      prevlexnum= lexnum;
-      state= 2;
-      continue;
-      
-    case 2:
-      if (lexnum != MY_COLL_LEXEM_CHAR)
-      {
-        my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected");
-        return -1;
-      }
-      
-      if (prevlexnum == MY_COLL_LEXEM_SHIFT)
-      {
-        item.base= lexem.code;
-        item.diff[0]= 0;
-        item.diff[1]= 0;
-        item.diff[2]= 0;
-      }
-      else if (prevlexnum == MY_COLL_LEXEM_DIFF)
-      {
-        item.curr= lexem.code;
-        if (lexem.diff == 3)
-        {
-          item.diff[2]++;
-        }
-        else if (lexem.diff == 2)
-        {
-          item.diff[1]++;
-          item.diff[2]= 0;
-        }
-        else if (lexem.diff == 1)
-        {
-          item.diff[0]++;
-          item.diff[1]= 0;
-          item.diff[2]= 0;
-        }
-        if (nitems >= mitems)
-        {
-          my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules");
-          return -1;
-        }
-        rule[nitems++]= item;
-      }
-      else
-      {
-        my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen");
-        return -1;
-      }
-      state= 1;
-      continue;
-    }
-  }
-  return (size_t) nitems;
-}
-
-
-typedef struct
-{
-  int		nchars;
-  MY_UNI_IDX	uidx;
-} uni_idx;
-
-#define PLANE_SIZE	0x100
-#define PLANE_NUM	0x100
-#define PLANE_NUMBER(x)	(((x)>>8) % PLANE_NUM)
-
-
 /*
  The code below implements this functionality:
  
@@ -484,91 +136,6 @@ static void simple_cs_init_functions(CHARSET_INFO *cs)
 }


-static int pcmp(const void * f, const void * s)
-{
-  const uni_idx *F= (const uni_idx*) f;
-  const uni_idx *S= (const uni_idx*) s;
-  int res;
-
-  if (!(res=((S->nchars)-(F->nchars))))
-    res=((F->uidx.from)-(S->uidx.to));
-  return res;
-}
-
-
-static my_bool create_fromuni(CHARSET_INFO *cs)
-{
-  uni_idx	idx[PLANE_NUM];
-  int		i,n;
-  
-  /* Clear plane statistics */
-  bzero(idx,sizeof(idx));
-  
-  /* Count number of characters in each plane */
-  for (i=0; i< 0x100; i++)
-  {
-    uint16 wc=cs->tab_to_uni[i];
-    int pl= PLANE_NUMBER(wc);
-    
-    if (wc || !i)
-    {
-      if (!idx[pl].nchars)
-      {
-        idx[pl].uidx.from=wc;
-        idx[pl].uidx.to=wc;
-      }else
-      {
-        idx[pl].uidx.from=wc<idx[pl].uidx.from?wc:idx[pl].uidx.from;
-        idx[pl].uidx.to=wc>idx[pl].uidx.to?wc:idx[pl].uidx.to;
-      }
-      idx[pl].nchars++;
-    }
-  }
-  
-  /* Sort planes in descending order */
-  qsort(&idx,PLANE_NUM,sizeof(uni_idx),&pcmp);
-  
-  for (i=0; i < PLANE_NUM; i++)
-  {
-    int ch,numchars;
-    
-    /* Skip empty plane */
-    if (!idx[i].nchars)
-      break;
-    
-    numchars=idx[i].uidx.to-idx[i].uidx.from+1;
-    if (!(idx[i].uidx.tab=(uchar*) my_once_alloc(numchars *
-						 sizeof(*idx[i].uidx.tab),
-						 MYF(MY_WME))))
-      return TRUE;
-
-    bzero(idx[i].uidx.tab,numchars*sizeof(*idx[i].uidx.tab));
-    
-    for (ch=1; ch < PLANE_SIZE; ch++)
-    {
-      uint16 wc=cs->tab_to_uni[ch];
-      if (wc >= idx[i].uidx.from && wc <= idx[i].uidx.to && wc)
-      {
-        int ofs= wc - idx[i].uidx.from;
-        idx[i].uidx.tab[ofs]= ch;
-      }
-    }
-  }
-  
-  /* Allocate and fill reverse table for each plane */
-  n=i;
-  if (!(cs->tab_from_uni= (MY_UNI_IDX*) my_once_alloc(sizeof(MY_UNI_IDX)*(n+1),
-						      MYF(MY_WME))))
-    return TRUE;
-
-  for (i=0; i< n; i++)
-    cs->tab_from_uni[i]= idx[i].uidx;
-  
-  /* Set end-of-list marker */
-  bzero(&cs->tab_from_uni[i],sizeof(MY_UNI_IDX));
-  return FALSE;
-}
-

 static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
 {
@@ -622,8 +189,6 @@ static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
    if (!(to->tab_to_uni= (uint16*)  my_once_memdup((char*)from->tab_to_uni,
 						    sz, MYF(MY_WME))))
      goto err;
-    if (create_fromuni(to))
-      goto err;
  }
  to->mbminlen= 1;
  to->mbmaxlen= 1;
@@ -754,117 +319,6 @@ static my_tailoring tailoring[]=
  }
 };

-#define MY_MAX_COLL_RULE 64
-
-/*
-  This function copies an UCS2 collation from
-  the default Unicode Collation Algorithm (UCA)
-  weights applying tailorings, i.e. a set of
-  alternative weights for some characters. 
-  
-  The default UCA weights are stored in my_charset_ucs2_general_uca.
-  They consist of 256 pages, 256 character each.
-  
-  If a page is not overwritten by tailoring rules,
-  it is copies as is from UCA as is.
-  
-  If a page contains some overwritten characters, it is
-  allocated. Untouched characters are copied from the
-  default weights.
-*/
-
-static my_bool create_tailoring(CHARSET_INFO *cs)
-{
-  MY_COLL_RULE rule[MY_MAX_COLL_RULE];
-  char errstr[128];
-  uchar   *newlengths;
-  uint16 **newweights;
-  const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
-  uint16     **defweights= my_charset_ucs2_general_uca.sort_order_big;
-  int rc, i;
-
-  if (!cs->tailoring)
-    return 1;
-  
-  /* Parse ICU Collation Customization expression */
-  if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
-                              cs->tailoring,
-                              cs->tailoring + strlen(cs->tailoring),
-                              errstr, sizeof(errstr))) <= 0)
-  {
-    /* 
-      TODO: add error message reporting.
-      printf("Error: %d '%s'\n", rc, errstr);
-    */
-    return 1;
-  }
-  
-  if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME))))
-    return 1;
-  bzero(newweights, 256*sizeof(uint16*));
-  
-  if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME))))
-    return 1;
-  
-  /*
-    Calculate maximum lenghts for the pages
-    which will be overwritten.
-  */
-  for (i=0; i < rc; i++)
-  {
-    uint pageb= (rule[i].base >> 8) & 0xFF;
-    uint pagec= (rule[i].curr >> 8) & 0xFF;
-    
-    if (newlengths[pagec] < deflengths[pageb])
-      newlengths[pagec]= deflengths[pageb];
-  }
-  
-  for (i=0; i < rc;  i++)
-  {
-    uint pageb= (rule[i].base >> 8) & 0xFF;
-    uint pagec= (rule[i].curr >> 8) & 0xFF;
-    uint chb, chc;
-    
-    if (!newweights[pagec])
-    {
-      /* Alloc new page and copy the default UCA weights */
-      uint size= 256*newlengths[pagec]*sizeof(uint16);
-      
-      if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME))))
-        return 1;
-      bzero((void*) newweights[pagec], size);
-      
-      for (chc=0 ; chc < 256; chc++)
-      {
-        memcpy(newweights[pagec] + chc*newlengths[pagec],
-               defweights[pagec] + chc*deflengths[pagec],
-               deflengths[pagec]*sizeof(uint16));
-      }
-    }
-    
-    /* 
-      Aply the alternative rule:
-      shift to the base character and primary difference.
-    */
-    chc= rule[i].curr & 0xFF;
-    chb= rule[i].base & 0xFF;
-    memcpy(newweights[pagec] + chc*newlengths[pagec],
-           defweights[pageb] + chb*deflengths[pageb],
-           deflengths[pageb]*sizeof(uint16));
-    /* Apply primary difference */
-    newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0];
-  }
-  
-  /* Copy non-overwritten pages from the default UCA weights */
-  for (i= 0; i < 256 ; i++)
-    if (!newweights[i])
-      newweights[i]= defweights[i];
-  
-  cs->sort_order= newlengths;
-  cs->sort_order_big= newweights;
-  
-  return 0;
-}


 static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
@@ -894,7 +348,7 @@ static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
  to->mbminlen= 2;
  to->mbmaxlen= 2;
  
-  return create_tailoring(to);
+  return 0;
  
 err:
  return 1;
@@ -997,7 +451,7 @@ static my_bool init_uca_charsets()
  CHARSET_INFO cs= my_charset_ucs2_general_uca;
  char name[64];
  
-  cs.state= MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONTEXT;
+  cs.state= MY_CS_STRNXFRM|MY_CS_UNICODE;
  for (t= tailoring; t->tailoring; t++)
  {
    cs.number= 128 + t->number;
@@ -1083,6 +537,10 @@ void add_compiled_collation(CHARSET_INFO *cs)
  cs->state|= MY_CS_AVAILABLE;
 }

+static void *cs_alloc(uint size)
+{
+  return my_once_alloc(size, MYF(MY_WME));
+}


 #ifdef __NETWARE__
@@ -1207,6 +665,14 @@ static CHARSET_INFO *get_internal_charset(uint cs_number, myf flags)
    cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL;
  }
  pthread_mutex_unlock(&THR_LOCK_charset);
+  if (cs && !(cs->state & MY_CS_READY))
+  {
+    if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) ||
+        (cs->coll->init && cs->coll->init(cs, cs_alloc)))
+      cs= NULL;
+    else
+      cs->state|= MY_CS_READY;
+  }
  return cs;
 }