Merge tulin@bk-internal.mysql.com:/home/bk/mysql-4.1

into poseidon.ndb.mysql.com:/home/tomas/mysql-4.1-ndb
2025-08-01 03:47:19 +03:00 · 2004-10-18 13:39:19 +00:00
parent a8b7ac32d0 85828f4a1b
commit 012de9d742
6 changed files with 415 additions and 163 deletions
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@ -365,6 +365,11 @@ uint my_instr_mb(struct charset_info_st *,
                 const char *s, uint s_length,
                 my_match_t *match, uint nmatch);

+int my_wildcmp_unicode(CHARSET_INFO *cs,
+                       const char *str, const char *str_end,
+                       const char *wildstr, const char *wildend,
+                       int escape, int w_one, int w_many,
+                       MY_UNICASE_INFO **weights);

 extern my_bool my_parse_charset_xml(const char *bug, uint len,
 				    int (*add)(CHARSET_INFO *cs));
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@ -63,6 +63,15 @@ select 'A' like 'a' collate utf8_bin;
 select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');
 _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%')
 1
+select convert(_latin1'G<>nter Andr<64>' using utf8) like CONVERT(_latin1'G<>NTER%' USING utf8);
+convert(_latin1'G<>nter Andr<64>' using utf8) like CONVERT(_latin1'G<>NTER%' USING utf8)
+1
+select CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8) LIKE CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8);
+CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8) LIKE CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8)
+1
+select CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8) LIKE CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8);
+CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8) LIKE CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8)
+1
 SELECT 'a' = 'a ';
 'a' = 'a '
 1
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@ -33,6 +33,14 @@ select 'A' like 'a';
 select 'A' like 'a' collate utf8_bin;
 select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');

+# Bug #6040: can't retrieve records with umlaut
+# characters in case insensitive manner.
+# Case insensitive search LIKE comparison
+# was broken for multibyte characters:
+select convert(_latin1'G<>nter Andr<64>' using utf8) like CONVERT(_latin1'G<>NTER%' USING utf8);
+select CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8) LIKE CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8);
+select CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8) LIKE CONVERT(_koi8r'<27><><EFBFBD><EFBFBD>' USING utf8);
+
 #
 # Check the following:
 # "a"  == "a "
--- a/strings/CHARSET_INFO.txt
+++ b/strings/CHARSET_INFO.txt
@ -0,0 +1,222 @@
+
+CHARSET_INFO
+============
+A structure containing data for charset+collation pair implementation. 
+
+Virtual functions which use this data are collected
+into separate structures MY_CHARSET_HANDLER and
+MY_COLLATION_HANDLER.
+
+
+typedef struct charset_info_st
+{
+  uint      number;
+  uint      primary_number;
+  uint      binary_number;
+  uint      state;
+
+  const char *csname;
+  const char *name;
+  const char *comment;
+
+  uchar    *ctype;
+  uchar    *to_lower;
+  uchar    *to_upper;
+  uchar    *sort_order;
+
+  uint16      *tab_to_uni;
+  MY_UNI_IDX  *tab_from_uni;
+
+  uchar state_map[256];
+  uchar ident_map[256];
+
+  uint      strxfrm_multiply;
+  uint      mbminlen;
+  uint      mbmaxlen;
+  char      max_sort_char; /* For LIKE optimization */
+
+  MY_CHARSET_HANDLER *cset;
+  MY_COLLATION_HANDLER *coll;
+
+} CHARSET_INFO;
+
+
+CHARSET_INFO fields description:
+===============================
+
+
+Numbers (identifiers)
+---------------------
+
+number - an ID uniquely identifying this charset+collation pair.
+
+primary_number - ID of a charset+collation pair, which consists
+of the same character set and the default collation of this
+character set. Not really used now. Intended to optimize some
+parts of the code where we need to find the default collation
+using its non-default counterpart for the given character set.
+
+binary_number - ID of a charset+collation pair, which consists
+of the same character set and the binary collation of this
+character set. Not really used now. Intended to optimize
+"SELECT BINARY x" in the future.
+
+Names
+-----
+
+  csname  - name of the character set for this charset+collation pair.
+  name    - name of the collation for this charset+collation pair.
+  comment - a text comment, displayed in the "Description" column of
+            SHOW CHARACTER SET output.
+
+Conversion tables
+-----------------
+  
+  ctype      - pointer to array[257] of "type of characters"
+               bit mask for each character, e.g. if a
+               character is a digit or a letter or a separator, etc.
+  to_lower   - pointer to array[256] used in LCASE()
+  to_upper   - pointer to array[256] used in UCASE()
+  sort_order - pointer to array[256] used for strings comparison
+
+
+
+Unicode conversion data
+-----------------------
+For 8bit character sets:
+
+tab_to_uni  : array[256] of charset->Unicode translation
+tab_from_uni: a structure for Unicode->charset translation
+
+Non-8bit charsets have their own per-charset structures
+hidden in the corresponding ctype-xxx.c file and don't use
+the tab_to_uni and tab_from_uni tables.
+
+
+Parser maps
+-----------
+state_map[]
+ident_map[]
+
+ These maps are used to quickly identify whether a character is
+part of an identifier, a digit, a special character,
+or part of another SQL language lexical item.
+
+They can probably be combined with the ctype array in the future.
+For now, for some reason, these two arrays are used in the parser,
+while a separate ctype[] array is used in other parts of the
+code, such as fulltext, etc.
+
+
+Misc fields
+-----------
+
+  strxfrm_multiply - how many times a sort key (i.e. a string
+                     which can be passed into memcmp() for comparison)
+                     can be longer than the original string. 
+                     Usually it is 1. For some complex
+                     collations it can be bigger. For example
+                     in latin1_german2_ci, a sort key is up to
+                     twice as long as the original string.
+                     e.g. Letter 'A' with two dots above is
+                     substituted with 'AE'. 
+  mbminlen         - minimum multibyte sequence length.
+                     Now always 1 except for ucs2. For ucs2
+                     it is 2.
+  mbmaxlen         - maximum multibyte sequence length.
+                     1 for 8bit charsets. Can be also 2 or 3.
+
+
+
+MY_CHARSET_HANDLER
+==================
+
+MY_CHARSET_HANDLER is a collection of character-set
+related routines. It is defined in m_ctype.h and has the
+following set of functions:
+
+Multibyte routines
+------------------
+ismbchar()  - detects if the given string is a multibyte sequence
+mbcharlen() - returns length of multibyte sequence starting with
+              the given character
+numchars()  - returns number of characters in the given string, e.g.
+              in SQL function CHAR_LENGTH().
+charpos()   - calculates the offset of the given position in the string.
+              Used in SQL functions LEFT(), RIGHT(), SUBSTRING(), 
+              INSERT()
+
+well_formed_length()
+            - finds the length of correctly formed multibyte beginning.
+              Used in INSERTs to cut a beginning of the given string
+              which is
+              a) "well formed" according to the given character set.
+              b)  can fit into the given data type
+              Terminates the string at a good position, taking into account
+              multibyte character boundaries.
+
+lengthsp()  - returns the length of the given string without trailing spaces.
+
+
+Unicode conversion routines
+---------------------------
+mb_wc       - converts the leading multibyte sequence into its Unicode code.
+wc_mb       - converts the given Unicode code into a multibyte sequence.
+
+
+Case and sort conversion
+------------------------
+caseup_str  - converts the given 0-terminated string into the upper case
+casedn_str  - converts the given 0-terminated string into the lower case
+caseup      - converts the given string into the upper case using length
+casedn      - converts the given string into the lower case using length
+
+Number-to-string conversion routines
+------------------------------------
+snprintf()
+long10_to_str()
+longlong10_to_str()
+
+The names are pretty self-descriptive.
+
+String padding routines
+-----------------------
+fill()     - writes the given Unicode value into the given string
+             with the given length. Used to pad the string, usually
+             with space character, according to the given charset.
+
+String-to-number conversion routines
+------------------------------------
+strntol()
+strntoul()
+strntoll()
+strntoull()
+strntod()
+
+These functions do almost the same thing as their
+STDLIB counterparts, but also:
+  - accept a length instead of a 0-terminator
+  - and are character set dependent
+
+Simple scanner routines
+-----------------------
+scan()    - to skip leading spaces in the given string.
+            Used when a string value is inserted into a numeric field.
+
+
+
+MY_COLLATION_HANDLER
+====================
+strnncoll()   - compares two strings according to the given collation
+strnncollsp() - like the above but ignores trailing spaces
+strnxfrm()    - makes a sort key suitable for memcmp() corresponding
+                to the given string
+like_range()  - creates a LIKE range, for optimizer
+wildcmp()     - wildcard comparison, for LIKE
+strcasecmp()  - 0-terminated string comparison
+instr()       - finds the first occurrence of the substring in the string
+hash_sort()   - calculates hash value taking into account
+                the collation rules, e.g. case-insensitivity, 
+                accent sensitivity, etc.
+
+ 
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@ -1231,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)),
 }


-/*
-** Compare string against string with wildcard
-**	0 if matched
-**	-1 if not matched with wildcard
-**	 1 if matched with wildcard
-*/
-
-static
-int my_wildcmp_ucs2(CHARSET_INFO *cs,
-		    const char *str,const char *str_end,
-		    const char *wildstr,const char *wildend,
-		    int escape, int w_one, int w_many,
-		    MY_UNICASE_INFO **weights)
-{
-  int result= -1;			/* Not found, using wildcards */
-  my_wc_t s_wc, w_wc;
-  int scan, plane;
-  
-  while (wildstr != wildend)
-  {
-    
-    while (1)
-    {
-      scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			(const uchar*)wildend);
-      if (scan <= 0)
-        return 1;
-      
-      if (w_wc ==  (my_wc_t)escape)
-      {
-        wildstr+= scan;
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-      }
-      
-      if (w_wc == (my_wc_t)w_many)
-      {
-        result= 1;				/* Found an anchor char */
-        break;
-      }
-      
-      wildstr+= scan;
-      scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, (const uchar*)str_end);
-      if (scan <=0)
-        return 1;
-      str+= scan;
-      
-      if (w_wc == (my_wc_t)w_one)
-      {
-        result= 1;				/* Found an anchor char */
-      }
-      else
-      {
-        if (weights)
-        {
-          plane=(s_wc>>8) & 0xFF;
-          s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
-          plane=(w_wc>>8) & 0xFF;
-          w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
-        }
-        if (s_wc != w_wc)
-          return 1;				/* No match */
-      }
-      if (wildstr == wildend)
-	return (str != str_end);		/* Match if both are at end */
-    }
-    
-    
-    if (w_wc == (my_wc_t)w_many)
-    {						/* Found w_many */
-    
-      /* Remove any '%' and '_' from the wild search string */
-      for ( ; wildstr != wildend ; )
-      {
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-        
-	if (w_wc == (my_wc_t)w_many)
-	{
-	  wildstr+= scan;
-	  continue;
-	} 
-	
-	if (w_wc == (my_wc_t)w_one)
-	{
-	  wildstr+= scan;
-	  scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str,
-			    (const uchar*)str_end);
-          if (scan <=0)
-            return 1;
-          str+= scan;
-	  continue;
-	}
-	break;					/* Not a wild character */
-      }
-      
-      if (wildstr == wildend)
-	return 0;				/* Ok if w_many is last */
-      
-      if (str == str_end)
-	return -1;
-      
-      scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			(const uchar*)wildend);
-      if (scan <= 0)
-        return 1;
-      
-      if (w_wc ==  (my_wc_t)escape)
-      {
-        wildstr+= scan;
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-      }
-      
-      while (1)
-      {
-        /* Skip until the first character from wildstr is found */
-        while (str != str_end)
-        {
-          scan= my_ucs2_uni(cs,&s_wc, (const uchar*)str,
-			    (const uchar*)str_end);
-          if (scan <= 0)
-            return 1;
-          if (weights)
-          {
-            plane=(s_wc>>8) & 0xFF;
-            s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
-            plane=(w_wc>>8) & 0xFF;
-            w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
-          }
-          
-          if (s_wc == w_wc)
-            break;
-          str+= scan;
-        }
-        if (str == str_end)
-          return -1;
-        
-        result= my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,escape,
-                                w_one,w_many,weights);
-        
-        if (result <= 0)
-          return result;
-        
-        str+= scan;
-      } 
-    }
-  }
-  return (str != str_end ? 1 : 0);
-}
-
-
 static
 int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
 		    const char *str,const char *str_end,
 		    const char *wildstr,const char *wildend,
 		    int escape, int w_one, int w_many)
 {
-  return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
-                         escape,w_one,w_many,uni_plane); 
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,uni_plane); 
 }


@ -1406,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
 		    const char *wildstr,const char *wildend,
 		    int escape, int w_one, int w_many)
 {
-  return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
-                         escape,w_one,w_many,NULL); 
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,NULL); 
 }


--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={

 };

+
+/*
+** Compare string against string with wildcard
+** This function is used in UTF8 and UCS2
+**
+**	0 if matched
+**	-1 if not matched and str was exhausted after a w_many wildcard
+**	 1 if not matched otherwise
+*/
+
+int my_wildcmp_unicode(CHARSET_INFO *cs,
+		       const char *str,const char *str_end,
+		       const char *wildstr,const char *wildend,
+		       int escape, int w_one, int w_many,
+		       MY_UNICASE_INFO **weights)
+{
+  int result= -1;			/* Not found, using wildcards */
+  my_wc_t s_wc, w_wc;
+  int scan, plane;
+  int (*mb_wc)(struct charset_info_st *cs, my_wc_t *wc,
+               const unsigned char *s,const unsigned char *e);
+  mb_wc= cs->cset->mb_wc;
+  
+  while (wildstr != wildend)
+  {
+    while (1)
+    {
+      if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                       (const uchar*)wildend)) <= 0)
+        return 1;
+      
+      if (w_wc ==  (my_wc_t)escape)
+      {
+        wildstr+= scan;
+        if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <= 0)
+          return 1;
+      }
+      
+      if (w_wc == (my_wc_t)w_many)
+      {
+        result= 1;				/* Found an anchor char */
+        break;
+      }
+      
+      wildstr+= scan;
+      if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                       (const uchar*)str_end)) <=0)
+        return 1;
+      str+= scan;
+      
+      if (w_wc == (my_wc_t)w_one)
+      {
+        result= 1;				/* Found an anchor char */
+      }
+      else
+      {
+        if (weights)
+        {
+          plane=(s_wc>>8) & 0xFF;
+          s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
+          plane=(w_wc>>8) & 0xFF;
+          w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+        }
+        if (s_wc != w_wc)
+          return 1;				/* No match */
+      }
+      if (wildstr == wildend)
+	return (str != str_end);		/* Match if both are at end */
+    }
+    
+    
+    if (w_wc == (my_wc_t)w_many)
+    {						/* Found w_many */
+    
+      /* Remove any '%' and '_' from the wild search string */
+      for ( ; wildstr != wildend ; )
+      {
+        if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <= 0)
+          return 1;
+        
+	if (w_wc == (my_wc_t)w_many)
+	{
+	  wildstr+= scan;
+	  continue;
+	} 
+	
+	if (w_wc == (my_wc_t)w_one)
+	{
+	  wildstr+= scan;
+          if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                           (const uchar*)str_end)) <=0)
+            return 1;
+          str+= scan;
+	  continue;
+	}
+	break;					/* Not a wild character */
+      }
+      
+      if (wildstr == wildend)
+	return 0;				/* Ok if w_many is last */
+      
+      if (str == str_end)
+	return -1;
+      
+      if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                       (const uchar*)wildend)) <=0)
+        return 1;
+      
+      if (w_wc ==  (my_wc_t)escape)
+      {
+        wildstr+= scan;
+        if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <=0)
+          return 1;
+      }
+      
+      while (1)
+      {
+        /* Skip until the first character from wildstr is found */
+        while (str != str_end)
+        {
+          if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                           (const uchar*)str_end)) <=0)
+            return 1;
+          if (weights)
+          {
+            plane=(s_wc>>8) & 0xFF;
+            s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
+            plane=(w_wc>>8) & 0xFF;
+            w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+          }
+          
+          if (s_wc == w_wc)
+            break;
+          str+= scan;
+        }
+        if (str == str_end)
+          return -1;
+        
+        result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
+                                   escape, w_one, w_many,
+                                   weights);
+        
+        if (result <= 0)
+          return result;
+        
+        str+= scan;
+      } 
+    }
+  }
+  return (str != str_end ? 1 : 0);
+}
+
 #endif


@ -1992,6 +2147,17 @@ static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
  return  my_strncasecmp_utf8(cs, s, t, len);
 }

+static
+int my_wildcmp_utf8(CHARSET_INFO *cs,
+		    const char *str,const char *str_end,
+		    const char *wildstr,const char *wildend,
+		    int escape, int w_one, int w_many)
+{
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,uni_plane); 
+}
+
+
 static int my_strnxfrm_utf8(CHARSET_INFO *cs,
                            uchar *dst, uint dstlen,
                            const uchar *src, uint srclen)
@ -2060,7 +2226,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
    my_strnncollsp_utf8,
    my_strnxfrm_utf8,
    my_like_range_mb,
-    my_wildcmp_mb,
+    my_wildcmp_utf8,
    my_strcasecmp_utf8,
    my_instr_mb,
    my_hash_sort_utf8