From 2496e85b84aad64a273dcb6ee45bb4a706c4b87d Mon Sep 17 00:00:00 2001
From: "bar@mysql.com" <>
Date: Thu, 19 Aug 2004 15:15:10 +0500
Subject: [PATCH] Bug#4521: unique key prefix interacts poorly with utf8. Fix
 for binary collations for MyISAM and HEAP BTREE. This patch also changes
 trailing spaces behaviour for binary collations. Binary collations now have
 PAD characteristic too.

---
 myisam/mi_search.c             |  15 +++-
 mysql-test/r/binary.result     |   2 +
 mysql-test/r/ctype_utf8.result |  92 ++++++++++++++++++++++
 mysql-test/r/endspace.result   |   2 +-
 mysql-test/r/myisam.result     |   1 +
 mysql-test/t/ctype_utf8.test   | 106 ++++++++++++++++++++++++-
 sql/field.h                    |   2 +-
 sql/ha_berkeley.cc             |   6 +-
 sql/item_cmpfunc.cc            |   4 +-
 strings/ctype-bin.c            | 138 ++++++++++++++++++++++++++-------
 strings/ctype-mb.c             |  57 +++++++++++++-
 11 files changed, 384 insertions(+), 41 deletions(-)

diff --git a/myisam/mi_search.c b/myisam/mi_search.c
index 2f1c37e4f21..24f5db1401d 100644
--- a/myisam/mi_search.c
+++ b/myisam/mi_search.c
@@ -396,9 +396,18 @@ int _mi_prefix_search(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page,
 
       matched=prefix_len+left;
 
-      for (my_flag=0;left;left--)
-        if ((my_flag= (int) sort_order[*vseg++] - (int) sort_order[*k++]))
-          break;
+      if (sort_order)
+      {
+        for (my_flag=0;left;left--)
+          if ((my_flag= (int) sort_order[*vseg++] - (int) sort_order[*k++]))
+            break;
+      }
+      else
+      {
+        for (my_flag=0;left;left--)
+          if ((my_flag= (int) *vseg++ - (int) *k++))
+            break;
+      }
 
       if (my_flag>0)      /* mismatch */
         break;
diff --git a/mysql-test/r/binary.result b/mysql-test/r/binary.result
index 000c0c16d77..a4ced14bb12 100644
--- a/mysql-test/r/binary.result
+++ b/mysql-test/r/binary.result
@@ -59,8 +59,10 @@ concat("-",a,"-",b,"-")
 -hello-hello-
 select concat("-",a,"-",b,"-") from t1 where b="hello ";
 concat("-",a,"-",b,"-")
+-hello-hello-
 select concat("-",a,"-",b,"-") from t1 ignore index (b) where b="hello ";
 concat("-",a,"-",b,"-")
+-hello-hello-
 alter table t1 modify b tinytext not null, drop key b, add key (b(100));
 select concat("-",a,"-",b,"-") from t1;
 concat("-",a,"-",b,"-")
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 0cc3ea2cf17..cfad82fa053 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -397,3 +397,95 @@ select c as c_a from t1 where c='б';
 c_a
 б
 drop table t1;
+create table t1 (c varchar(30) character set utf8 collate utf8_bin, unique(c(10)));
+insert into t1 values ('1'),('2'),('3'),('x'),('y'),('z');
+insert into t1 values ('aaaaaaaaaa');
+insert into t1 values ('aaaaaaaaaaa');
+ERROR 23000: Duplicate entry 'aaaaaaaaaaa' for key 1
+insert into t1 values ('aaaaaaaaaaaa');
+ERROR 23000: Duplicate entry 'aaaaaaaaaaaa' for key 1
+insert into t1 values (repeat('b',20));
+select c c1 from t1 where c='1';
+c1
+1
+select c c2 from t1 where c='2';
+c2
+2
+select c c3 from t1 where c='3';
+c3
+3
+select c cx from t1 where c='x';
+cx
+x
+select c cy from t1 where c='y';
+cy
+y
+select c cz from t1 where c='z';
+cz
+z
+select c ca10 from t1 where c='aaaaaaaaaa';
+ca10
+aaaaaaaaaa
+select c cb20 from t1 where c=repeat('b',20);
+cb20
+bbbbbbbbbbbbbbbbbbbb
+drop table t1;
+create table t1 (c char(3) character set utf8 collate utf8_bin, unique (c(2)));
+insert into t1 values ('1'),('2'),('3'),('4'),('x'),('y'),('z');
+insert into t1 values ('a');
+insert into t1 values ('aa');
+insert into t1 values ('aaa');
+ERROR 23000: Duplicate entry 'aaa' for key 1
+insert into t1 values ('b');
+insert into t1 values ('bb');
+insert into t1 values ('bbb');
+ERROR 23000: Duplicate entry 'bbb' for key 1
+insert into t1 values ('а');
+insert into t1 values ('аа');
+insert into t1 values ('ааа');
+ERROR 23000: Duplicate entry 'ааа' for key 1
+insert into t1 values ('б');
+insert into t1 values ('бб');
+insert into t1 values ('ббб');
+ERROR 23000: Duplicate entry 'ббб' for key 1
+insert into t1 values ('ꪪ');
+insert into t1 values ('ꪪꪪ');
+insert into t1 values ('ꪪꪪꪪ');
+ERROR 23000: Duplicate entry 'ꪪꪪ' for key 1
+drop table t1;
+create table t1 (
+c char(10) character set utf8 collate utf8_bin,
+unique key a using btree (c(1))
+) engine=heap;
+show create table t1;
+Table	Create Table
+t1	CREATE TABLE `t1` (
+  `c` char(10) character set utf8 collate utf8_bin default NULL,
+  UNIQUE KEY `a` TYPE BTREE (`c`(1))
+) ENGINE=HEAP DEFAULT CHARSET=latin1
+insert into t1 values ('a'),('b'),('c'),('d'),('e'),('f');
+insert into t1 values ('aa');
+ERROR 23000: Duplicate entry 'aa' for key 1
+insert into t1 values ('aaa');
+ERROR 23000: Duplicate entry 'aaa' for key 1
+insert into t1 values ('б');
+insert into t1 values ('бб');
+ERROR 23000: Duplicate entry 'б�' for key 1
+insert into t1 values ('ббб');
+ERROR 23000: Duplicate entry 'б�' for key 1
+select c as c_all from t1 order by c;
+c_all
+a
+b
+c
+d
+e
+f
+б
+select c as c_a from t1 where c='a';
+c_a
+a
+select c as c_a from t1 where c='б';
+c_a
+б
+drop table t1;
diff --git a/mysql-test/r/endspace.result b/mysql-test/r/endspace.result
index 4800bbf4ecb..167adea6674 100644
--- a/mysql-test/r/endspace.result
+++ b/mysql-test/r/endspace.result
@@ -19,7 +19,7 @@ select 'a  a' > 'a', 'a  \0' < 'a';
 1	1
 select binary 'a  a' > 'a', binary 'a  \0' > 'a', binary 'a\0' > 'a';
 binary 'a  a' > 'a'	binary 'a  \0' > 'a'	binary 'a\0' > 'a'
-1	1	1
+1	0	0
 create table t1 (text1 varchar(32) not NULL, KEY key1 (text1));
 insert into t1 values ('teststring'), ('nothing'), ('teststring\t');
 check table t1;
diff --git a/mysql-test/r/myisam.result b/mysql-test/r/myisam.result
index 354675cd4d4..0109097d3a1 100644
--- a/mysql-test/r/myisam.result
+++ b/mysql-test/r/myisam.result
@@ -412,6 +412,7 @@ aaa.
 aaa   .
 select concat(a,'.') from t1 where binary a='aaa';
 concat(a,'.')
+aaa   .
 aaa.
 update t1 set a='bbb' where a='aaa';
 select concat(a,'.') from t1;
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 0d3bec258bc..a8a02118269 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -189,7 +189,7 @@ drop table t2;
 
 #
 # Bug 4521: unique key prefix interacts poorly with utf8
-# Check keys with prefix compression
+# MYISAM: keys with prefix compression, case insensitive collation.
 #
 create table t1 (c varchar(30) character set utf8, unique(c(10)));
 insert into t1 values ('1'),('2'),('3'),('x'),('y'),('z');
@@ -211,7 +211,8 @@ drop table t1;
 
 #
 # Bug 4521: unique key prefix interacts poorly with utf8
-# Check fixed length keys
+# MYISAM: fixed length keys, case insensitive collation
+#
 create table t1 (c char(3) character set utf8, unique (c(2)));
 insert into t1 values ('1'),('2'),('3'),('4'),('x'),('y'),('z');
 insert into t1 values ('a');
@@ -283,3 +284,104 @@ select c as c_all from t1 order by c;
 select c as c_a from t1 where c='a';
 select c as c_a from t1 where c='б';
 drop table t1;
+
+
+#
+# Bug 4521: unique key prefix interacts poorly with utf8
+# MYISAM: keys with prefix compression, binary collation.
+#
+create table t1 (c varchar(30) character set utf8 collate utf8_bin, unique(c(10)));
+insert into t1 values ('1'),('2'),('3'),('x'),('y'),('z');
+insert into t1 values ('aaaaaaaaaa');
+--error 1062
+insert into t1 values ('aaaaaaaaaaa');
+--error 1062
+insert into t1 values ('aaaaaaaaaaaa');
+insert into t1 values (repeat('b',20));
+select c c1 from t1 where c='1';
+select c c2 from t1 where c='2';
+select c c3 from t1 where c='3';
+select c cx from t1 where c='x';
+select c cy from t1 where c='y';
+select c cz from t1 where c='z';
+select c ca10 from t1 where c='aaaaaaaaaa';
+select c cb20 from t1 where c=repeat('b',20);
+drop table t1;
+
+#
+# Bug 4521: unique key prefix interacts poorly with utf8
+# MYISAM: fixed length keys, binary collation
+#
+create table t1 (c char(3) character set utf8 collate utf8_bin, unique (c(2)));
+insert into t1 values ('1'),('2'),('3'),('4'),('x'),('y'),('z');
+insert into t1 values ('a');
+insert into t1 values ('aa');
+--error 1062
+insert into t1 values ('aaa');
+insert into t1 values ('b');
+insert into t1 values ('bb');
+--error 1062
+insert into t1 values ('bbb');
+insert into t1 values ('а');
+insert into t1 values ('аа');
+--error 1062
+insert into t1 values ('ааа');
+insert into t1 values ('б');
+insert into t1 values ('бб');
+--error 1062
+insert into t1 values ('ббб');
+insert into t1 values ('ꪪ');
+insert into t1 values ('ꪪꪪ');
+--error 1062
+insert into t1 values ('ꪪꪪꪪ');
+drop table t1;
+
+#
+# Bug 4521: unique key prefix interacts poorly with utf8
+# Check HEAP+HASH, binary collation
+#
+# This doesn't work correctly yet.
+#
+#create table t1 (
+#c char(10) character set utf8 collate utf8_bin,
+#unique key a using hash (c(1))
+#) engine=heap;
+#show create table t1;
+#insert into t1 values ('a'),('b'),('c'),('d'),('e'),('f');
+#--error 1062
+#insert into t1 values ('aa');
+#--error 1062
+#insert into t1 values ('aaa');
+#insert into t1 values ('б');
+#--error 1062
+#insert into t1 values ('бб');
+#--error 1062
+#insert into t1 values ('ббб');
+#select c as c_all from t1 order by c;
+#select c as c_a from t1 where c='a';
+#select c as c_a from t1 where c='б';
+#drop table t1;
+
+#
+# Bug 4521: unique key prefix interacts poorly with utf8
+# Check HEAP+BTREE, binary collation
+#
+create table t1 (
+c char(10) character set utf8 collate utf8_bin,
+unique key a using btree (c(1))
+) engine=heap;
+show create table t1;
+insert into t1 values ('a'),('b'),('c'),('d'),('e'),('f');
+--error 1062
+insert into t1 values ('aa');
+--error 1062
+insert into t1 values ('aaa');
+insert into t1 values ('б');
+--error 1062
+insert into t1 values ('бб');
+--error 1062
+insert into t1 values ('ббб');
+select c as c_all from t1 order by c;
+select c as c_a from t1 where c='a';
+select c as c_a from t1 where c='б';
+drop table t1;
diff --git a/sql/field.h b/sql/field.h
index fe06cd96f1a..83c5a71f07f 100644
--- a/sql/field.h
+++ b/sql/field.h
@@ -357,7 +357,7 @@ public:
   uint size_of() const { return sizeof(*this); }
   CHARSET_INFO *charset(void) const { return field_charset; }
   void set_charset(CHARSET_INFO *charset) { field_charset=charset; }
-  bool binary() const { return field_charset->state & MY_CS_BINSORT ? 1 : 0; }
+  bool binary() const { return field_charset == &my_charset_bin; }
   uint32 max_length() { return field_length; }
   friend class create_field;
 };
diff --git a/sql/ha_berkeley.cc b/sql/ha_berkeley.cc
index 39ef6ca855a..7cd534d60b3 100644
--- a/sql/ha_berkeley.cc
+++ b/sql/ha_berkeley.cc
@@ -357,9 +357,11 @@ ulong ha_berkeley::index_flags(uint idx, uint part, bool all_parts) const
     case HA_KEYTYPE_VARTEXT:
       /*
         As BDB stores only one copy of equal strings, we can't use key read
-        on these
+        on these. Binary collations do support key read though.
       */
-      flags&= ~HA_KEYREAD_ONLY;
+      if (!(table->key_info[idx].key_part[i].field->charset()->state
+           & MY_CS_BINSORT))
+        flags&= ~HA_KEYREAD_ONLY;
       break;
     default:                                    // Keep compiler happy
       break;
diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc
index 23bdad1aae5..3c75dba42da 100644
--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@@ -303,10 +303,10 @@ int Arg_comparator::set_compare_func(Item_bool_func2 *item, Item_result type)
       my_coll_agg_error((*a)->collation, (*b)->collation, owner->func_name());
       return 1;
     }
-    if (my_binary_compare(cmp_collation.collation))
+    if (cmp_collation.collation == &my_charset_bin)
     {
       /*
-	We are using binary collation, change to compare byte by byte,
+	We are using BLOB/BINARY/VARBINARY, change to compare byte by byte,
 	without removing end space
       */
       if (func == &Arg_comparator::compare_string)
diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c
index cc83471f264..e759a5654f1 100644
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -68,31 +68,10 @@ static uchar bin_char_array[] =
 
 
 
-/*
-  Compare two strings. Result is sign(first_argument - second_argument)
-
-  SYNOPSIS
-    my_strnncoll_binary()
-    cs			Chararacter set
-    s			String to compare
-    slen		Length of 's'
-    t			String to compare
-    tlen		Length of 't'
-
-  NOTE
-   This is used also when comparing with end space removal, as end space
-   is significant for binary strings
-
-  RETURN
-  < 0	s < t
-  0	s == t
-  > 0	s > t
-*/
-
 static int my_strnncoll_binary(CHARSET_INFO * cs __attribute__((unused)),
-				const uchar *s, uint slen,
-				const uchar *t, uint tlen,
-                                my_bool t_is_prefix)
+                               const uchar *s, uint slen,
+                               const uchar *t, uint tlen,
+                               my_bool t_is_prefix)
 {
   uint len=min(slen,tlen);
   int cmp= memcmp(s,t,len);
@@ -100,14 +79,105 @@ static int my_strnncoll_binary(CHARSET_INFO * cs __attribute__((unused)),
 }
 
 
+/*
+  Compare two strings. Result is sign(first_argument - second_argument)
+
+  SYNOPSIS
+    my_strnncollsp_binary()
+    cs			Character set (unused here, passed through)
+    s			String to compare
+    slen		Length of 's'
+    t			String to compare
+    tlen		Length of 't'
+
+  NOTE
+   This function is used for real binary strings, i.e. for
+   BLOB, BINARY(N) and VARBINARY(N).
+   It does not ignore trailing spaces; it calls my_strnncoll_binary().
+
+  RETURN
+  < 0	s < t
+  0	s == t
+  > 0	s > t
+*/
+
 static int my_strnncollsp_binary(CHARSET_INFO * cs __attribute__((unused)),
-                                const uchar *s, uint slen,
-                                const uchar *t, uint tlen)
+                                 const uchar *s, uint slen,
+                                 const uchar *t, uint tlen)
 {
   return my_strnncoll_binary(cs,s,slen,t,tlen,0);
 }
 
 
+static int my_strnncoll_8bit_bin(CHARSET_INFO * cs __attribute__((unused)),
+                                 const uchar *s, uint slen,
+                                 const uchar *t, uint tlen, my_bool t_is_prefix)
+{
+  int diff= memcmp(s, t, min(slen, tlen));
+  if (!diff && !(t_is_prefix && slen > tlen))	/* prefix hit stays equal */
+    diff= (int) (slen - tlen);			/* same bytes: order by length */
+  return diff;
+}
+
+
+/*
+  Compare two strings, ignoring trailing spaces (PAD SPACE semantics).
+
+  SYNOPSIS
+    my_strnncollsp_8bit_bin()
+    cs			Character set (unused)
+    a			First string to compare
+    a_length		Length of 'a'
+    b			Second string to compare
+    b_length		Length of 'b'
+
+  NOTE
+   This function is used for character strings with binary collations.
+   The shorter string is compared as if it were padded with spaces.
+
+  RETURN
+  < 0	a < b
+  0	a == b
+  > 0	a > b
+*/
+
+static int my_strnncollsp_8bit_bin(CHARSET_INFO * cs __attribute__((unused)),
+                                   const uchar *a, uint a_length,
+                                   const uchar *b, uint b_length)
+{
+  const uchar *end;
+  uint length;
+
+  end= a + (length= min(a_length, b_length));
+  while (a < end)
+  {
+    if (*a++ != *b++)
+      return ((int) a[-1] - (int) b[-1]);
+  }
+  if (a_length != b_length)
+  {
+    int swap= 1;
+    /*
+      Find the first non-space byte in the tail of the longer key; its
+      difference from ' ' decides the order (negated if the keys swapped).
+    */
+    if (a_length < b_length)
+    {
+      /* 'b' is the longer key: scan its tail and negate the result */
+      a_length= b_length;
+      a= b;
+      swap= -1;					/* swap sign of result */
+    }
+    for (end= a + a_length-length; a < end ; a++)
+    {
+      if (*a != ' ')
+	return ((int) *a - (int) ' ') * swap;	/* not '^': -1 ^ -1 == 0 */
+    }
+  }
+  return 0;
+}
+
+
 /* This function is used for all conversion functions */
 
 static void my_case_str_bin(CHARSET_INFO *cs __attribute__((unused)),
@@ -342,6 +412,20 @@ skip:
 
 
 MY_COLLATION_HANDLER my_collation_8bit_bin_handler =
+{
+    NULL,			/* init */
+    my_strnncoll_8bit_bin,	/* bytewise compare, optional prefix match */
+    my_strnncollsp_8bit_bin,	/* compare ignoring trailing spaces */
+    my_strnxfrm_bin,
+    my_like_range_simple,
+    my_wildcmp_bin,
+    my_strcasecmp_bin,
+    my_instr_bin,
+    my_hash_sort_bin
+};
+
+
+static MY_COLLATION_HANDLER my_collation_binary_handler =
 {
     NULL,			/* init */
     my_strnncoll_binary,
@@ -407,5 +491,5 @@ CHARSET_INFO my_charset_bin =
     0,				/* min_sort_char */
     255,			/* max_sort_char */
     &my_charset_handler,
-    &my_collation_8bit_bin_handler
+    &my_collation_binary_handler
 };
diff --git a/strings/ctype-mb.c b/strings/ctype-mb.c
index 7b0dadcfa19..ecafa6356d5 100644
--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -360,11 +360,62 @@ static int my_strnncoll_mb_bin(CHARSET_INFO * cs __attribute__((unused)),
   return cmp ? cmp : (int) ((t_is_prefix ? len : slen) - tlen);
 }
 
+
+/*
+  Compare two strings, ignoring trailing spaces (PAD SPACE semantics).
+
+  SYNOPSIS
+    my_strnncollsp_mb_bin()
+    cs			Character set (unused)
+    a			First string to compare
+    a_length		Length of 'a'
+    b			Second string to compare
+    b_length		Length of 'b'
+
+  NOTE
+   This function is used for character strings with binary collations.
+   The shorter string is compared as if it were padded with spaces.
+
+  RETURN
+    A negative number if a < b
+    A positive number if a > b
+    0 if strings are equal
+*/
+
 static int my_strnncollsp_mb_bin(CHARSET_INFO * cs __attribute__((unused)),
-                                 const uchar *s, uint slen,
-                                 const uchar *t, uint tlen)
+                                 const uchar *a, uint a_length,
+                                 const uchar *b, uint b_length)
 {
-  return my_strnncoll_mb_bin(cs,s,slen,t,tlen,0);
+  const uchar *end;
+  uint length;
+
+  end= a + (length= min(a_length, b_length));
+  while (a < end)
+  {
+    if (*a++ != *b++)
+      return ((int) a[-1] - (int) b[-1]);
+  }
+  if (a_length != b_length)
+  {
+    int swap= 1;
+    /*
+      Find the first non-space byte in the tail of the longer key; its
+      difference from ' ' decides the order (negated if the keys swapped).
+    */
+    if (a_length < b_length)
+    {
+      /* 'b' is the longer key: scan its tail and negate the result */
+      a_length= b_length;
+      a= b;
+      swap= -1;					/* swap sign of result */
+    }
+    for (end= a + a_length-length; a < end ; a++)
+    {
+      if (*a != ' ')
+	return ((int) *a - (int) ' ') * swap;	/* not '^': -1 ^ -1 == 0 */
+    }
+  }
+  return 0;
 }