MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"

2025-07-30 16:24:05 +03:00 · 2015-07-06 10:47:39 +04:00
parent 7ab7f5327a
commit 35d8ac350d
7 changed files with 275 additions and 149 deletions
--- a/mysql-test/include/ctype_utf8mb4.inc
+++ b/mysql-test/include/ctype_utf8mb4.inc
@ -1802,5 +1802,28 @@ DROP TABLE t1;
 --echo #
 --echo #
--echo # End of tests
+--echo # ctype_utf8mb4.inc: Start of 10.1 tests
 --echo #
 --echo #
 --echo # MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
 --echo #
 CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
 INSERT INTO t1 VALUES (0x61);
 INSERT INTO t1 VALUES (0xC280),(0xDFBF);
 INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
 INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
 SELECT HEX(a) FROM t1 ORDER BY a;
 SELECT HEX(a) FROM t1 ORDER BY a DESC;
 ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
 SELECT HEX(a) FROM t1 ORDER BY a;
 SELECT HEX(a) FROM t1 ORDER BY a DESC;
 DROP TABLE t1;
 --echo #
 --echo # ctype_utf8mb4.inc: End of 10.1 tests
 --echo #
 --echo #
 --echo # End of ctype_utf8mb4.inc
 --echo #
--- a/mysql-test/r/ctype_utf8mb4_heap.result
+++ b/mysql-test/r/ctype_utf8mb4_heap.result
@ -2495,5 +2495,57 @@ DROP TABLE t1;
 # End of 5.5 tests
 #
 #
-# End of tests
+# ctype_utf8mb4.inc: Start of 10.1 tests
 #
 #
 # MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
 #
 CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
 INSERT INTO t1 VALUES (0x61);
 INSERT INTO t1 VALUES (0xC280),(0xDFBF);
 INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
 INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
 SELECT HEX(a) FROM t1 ORDER BY a;
 HEX(a)
 61
 C280
 DFBF
 E0A080
 EFBFBF
 F0908080
 F48FBFBF
 SELECT HEX(a) FROM t1 ORDER BY a DESC;
 HEX(a)
 F48FBFBF
 F0908080
 EFBFBF
 E0A080
 DFBF
 C280
 61
 ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
 SELECT HEX(a) FROM t1 ORDER BY a;
 HEX(a)
 61
 C280
 DFBF
 E0A080
 EFBFBF
 F0908080
 F48FBFBF
 SELECT HEX(a) FROM t1 ORDER BY a DESC;
 HEX(a)
 F48FBFBF
 F0908080
 EFBFBF
 E0A080
 DFBF
 C280
 61
 DROP TABLE t1;
 #
 # ctype_utf8mb4.inc: End of 10.1 tests
 #
 #
 # End of ctype_utf8mb4.inc
 #
--- a/mysql-test/r/ctype_utf8mb4_innodb.result
+++ b/mysql-test/r/ctype_utf8mb4_innodb.result
@ -2642,5 +2642,57 @@ DROP TABLE t1;
 # End of 5.5 tests
 #
 #
-# End of tests
+# ctype_utf8mb4.inc: Start of 10.1 tests
 #
 #
 # MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
 #
 CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
 INSERT INTO t1 VALUES (0x61);
 INSERT INTO t1 VALUES (0xC280),(0xDFBF);
 INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
 INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
 SELECT HEX(a) FROM t1 ORDER BY a;
 HEX(a)
 61
 C280
 DFBF
 E0A080
 EFBFBF
 F0908080
 F48FBFBF
 SELECT HEX(a) FROM t1 ORDER BY a DESC;
 HEX(a)
 F48FBFBF
 F0908080
 EFBFBF
 E0A080
 DFBF
 C280
 61
 ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
 SELECT HEX(a) FROM t1 ORDER BY a;
 HEX(a)
 61
 C280
 DFBF
 E0A080
 EFBFBF
 F0908080
 F48FBFBF
 SELECT HEX(a) FROM t1 ORDER BY a DESC;
 HEX(a)
 F48FBFBF
 F0908080
 EFBFBF
 E0A080
 DFBF
 C280
 61
 DROP TABLE t1;
 #
 # ctype_utf8mb4.inc: End of 10.1 tests
 #
 #
 # End of ctype_utf8mb4.inc
 #
--- a/mysql-test/r/ctype_utf8mb4_myisam.result
+++ b/mysql-test/r/ctype_utf8mb4_myisam.result
@ -2642,5 +2642,57 @@ DROP TABLE t1;
 # End of 5.5 tests
 #
 #
-# End of tests
+# ctype_utf8mb4.inc: Start of 10.1 tests
 #
 #
 # MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
 #
 CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
 INSERT INTO t1 VALUES (0x61);
 INSERT INTO t1 VALUES (0xC280),(0xDFBF);
 INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
 INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
 SELECT HEX(a) FROM t1 ORDER BY a;
 HEX(a)
 61
 C280
 DFBF
 E0A080
 EFBFBF
 F0908080
 F48FBFBF
 SELECT HEX(a) FROM t1 ORDER BY a DESC;
 HEX(a)
 F48FBFBF
 F0908080
 EFBFBF
 E0A080
 DFBF
 C280
 61
 ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
 SELECT HEX(a) FROM t1 ORDER BY a;
 HEX(a)
 61
 C280
 DFBF
 E0A080
 EFBFBF
 F0908080
 F48FBFBF
 SELECT HEX(a) FROM t1 ORDER BY a DESC;
 HEX(a)
 F48FBFBF
 F0908080
 EFBFBF
 E0A080
 DFBF
 C280
 61
 DROP TABLE t1;
 #
 # ctype_utf8mb4.inc: End of 10.1 tests
 #
 #
 # End of ctype_utf8mb4.inc
 #
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@ -85,7 +85,8 @@
                                       IS_CONTINUATION_BYTE(b3) && \
                                       (b0 >= 0xf1 || b1 >= 0x90) && \
                                       (b0 <= 0xf3 || b1 <= 0x8F))
-
+#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
                                       IS_UTF8MB4_STEP2(b0,b1,b2,b3))
 /* Convert individual bytes to Unicode code points */
 #define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) b0 & 0x1f) << 6)  |\
@ -7622,146 +7623,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src)
 }
 /**
   Compare two utf8mb4 strings character by character.
   Each step decodes one character from both strings with
   my_mb_wc_utf8mb4(), maps both code points through my_tosort_unicode()
   and compares the results. If either decode call returns a value <= 0,
   the comparison of the remaining bytes is delegated to bincmp_utf8mb4().
   @param  cs           Character set pointer (cs->caseinfo and cs->state
                        are passed to my_tosort_unicode()).
   @param  s            First string.
   @param  slen         Length of 's' in bytes.
   @param  t            Second string.
   @param  tlen         Length of 't' in bytes.
   @param  t_is_prefix  If non-zero, only the consumption of 't' decides
                        the result once one of the strings is exhausted.
   @return
     @retval -1 or 1 on the first pair of differing sort code points.
     @retval The result of bincmp_utf8mb4() if a character fails to decode.
     @retval Otherwise, when t_is_prefix is set: (t - te), i.e. 0 if 't'
             was fully consumed and negative if not; when it is not set:
             the difference between the unconsumed byte counts of 's'
             and 't'.
 */
 static int
 my_strnncoll_utf8mb4(CHARSET_INFO *cs,
                      const uchar *s, size_t slen,
                      const uchar *t, size_t tlen,
                      my_bool t_is_prefix)
 {
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  MY_UNICASE_INFO *folding= cs->caseinfo;
  my_wc_t UNINIT_VAR(s_code), UNINIT_VAR(t_code);
  for ( ; ; )
  {
    int s_len, t_len;
    if (s >= s_end || t >= t_end)
      break;                              /* One of the strings is exhausted */
    s_len= my_mb_wc_utf8mb4(cs, &s_code, s, s_end);
    t_len= my_mb_wc_utf8mb4(cs, &t_code, t, t_end);
    /* A character could not be decoded: compare the rest bytewise */
    if (s_len <= 0 || t_len <= 0)
      return bincmp_utf8mb4(s, s_end, t, t_end);
    my_tosort_unicode(folding, &s_code, cs->state);
    my_tosort_unicode(folding, &t_code, cs->state);
    if (s_code < t_code)
      return -1;
    if (s_code > t_code)
      return 1;
    s+= s_len;
    t+= t_len;
  }
  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
 }
 /**
   Compare two utf8mb4 strings, ignoring trailing spaces.
   If one string is shorter than the other, the shorter one is treated
   as if it were extended with spaces to the length of the longer one.
   This ensures that the following holds:
     "a"  == "a "
     "a\0" < "a"
     "a\0" < "a "
   @param  cs    Character set pointer.
   @param  s     First string to compare.
   @param  slen  Length of 's' in bytes.
   @param  t     Second string to compare.
   @param  tlen  Length of 't' in bytes.
   @param  diff_if_only_endspace_difference
                 Set to 1 if strings that differ only in trailing spaces
                 should be regarded as different. Forced to FALSE unless
                 VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
                 is defined.
   @return Comparison result.
     @retval Negative number, if s is less than t.
     @retval 0, if s is equal to t.
     @retval Positive number, if s is greater than t.
 */
 static int
 my_strnncollsp_utf8mb4(CHARSET_INFO *cs,
                        const uchar *s, size_t slen,
                        const uchar *t, size_t tlen,
                        my_bool diff_if_only_endspace_difference)
 {
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  const uchar *tail, *tail_end;
  MY_UNICASE_INFO *folding= cs->caseinfo;
  my_wc_t UNINIT_VAR(s_code), UNINIT_VAR(t_code);
  size_t s_left, t_left;
  int sign, result;
 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
  diff_if_only_endspace_difference= FALSE;
 #endif
  /* Compare the common part one character at a time */
  for ( ; ; )
  {
    int s_len, t_len;
    if (s >= s_end || t >= t_end)
      break;
    s_len= my_mb_wc_utf8mb4(cs, &s_code, s, s_end);
    t_len= my_mb_wc_utf8mb4(cs, &t_code, t, t_end);
    /* A character could not be decoded: compare the rest bytewise */
    if (s_len <= 0 || t_len <= 0)
      return bincmp_utf8mb4(s, s_end, t, t_end);
    my_tosort_unicode(folding, &s_code, cs->state);
    my_tosort_unicode(folding, &t_code, cs->state);
    if (s_code < t_code)
      return -1;
    if (s_code > t_code)
      return 1;
    s+= s_len;
    t+= t_len;
  }
  s_left= (size_t) (s_end - s);
  t_left= (size_t) (t_end - t);
  if (s_left == t_left)
    return 0;
  /* Result to report if the longer string has only spaces in its tail */
  result= diff_if_only_endspace_difference ? 1 : 0;
  if (s_left < t_left)
  {
    /* 't' is the longer string: scan its tail and invert the sign */
    tail= t;
    tail_end= t_end;
    sign= -1;
    result= -result;
  }
  else
  {
    tail= s;
    tail_end= s_end;
    sign= 1;
  }
  /*
    In UTF-8 every byte of a multibyte character, including the head
    byte, is greater than space. So the first non-space byte in the tail
    decides the result, and the tail can be scanned byte by byte, as in
    the 8bit version, without decoding full multibyte sequences.
  */
  while (tail < tail_end)
  {
    if (*tail != ' ')
      return (*tail < ' ') ? -sign : sign;
    tail++;
  }
  return result;
 }
 /**
  Compare 0-terminated UTF8 strings.
@ -7906,6 +7767,30 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
 #undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
 /* my_well_formed_char_length_utf8mb4 */
 /*
  Instantiate the comparison functions from strcoll.ic for
  utf8mb4_general_ci. MY_FUNCTION_NAME pastes the collation suffix onto
  each generated name, e.g. my_strnncoll_utf8mb4_general_ci and
  my_strnncollsp_utf8mb4_general_ci used by the collation handler.
 */
 #define MY_FUNCTION_NAME(x)      my_ ## x ## _utf8mb4_general_ci
 /* Test used by strcoll.ic to recognize a valid four-byte character */
 #define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3)
 /*
  Weight of a byte that is not part of a valid character. 0xFF0000 is
  above the maximum Unicode code point 0x10FFFF, so such bytes get a
  weight greater than any code point used as a weight below.
 */
 #define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
 #define WEIGHT_MB1(b0)           my_weight_mb1_utf8_general_ci(b0)
 #define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf8_general_ci(b0,b1)
 #define WEIGHT_MB3(b0,b1,b2)     my_weight_mb3_utf8_general_ci(b0,b1,b2)
 /*
  There is no mapping between code point and weight for non-BMP characters
  in utf8mb4_general_ci. Just using code point as weight.
 */
 #define WEIGHT_MB4(b0,b1,b2,b3)  UTF8MB4_CODE(b0,b1,b2,b3)
 #include "strcoll.ic"
 /*
  Instantiate the same template for utf8mb4_bin: the weight of every
  valid character is its code point, the weight of a bad byte is
  0xFF0000 plus the byte value.
  NOTE(review): IS_MB4_CHAR is not redefined here; presumably the
  definition above is still in effect for this second include --
  confirm that strcoll.ic does not #undef it.
 */
 #define MY_FUNCTION_NAME(x)      my_ ## x ## _utf8mb4_bin
 #define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
 #define WEIGHT_MB1(b0)           ((int) (uchar) (b0))
 #define WEIGHT_MB2(b0,b1)        ((int) UTF8MB2_CODE(b0,b1))
 #define WEIGHT_MB3(b0,b1,b2)     ((int) UTF8MB3_CODE(b0,b1,b2))
 #define WEIGHT_MB4(b0,b1,b2,b3)  ((int) UTF8MB4_CODE(b0,b1,b2,b3))
 #include "strcoll.ic"
 static uint
 my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
 {
@ -7934,8 +7819,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs  __attribute__((unused)), uint c)
 static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
 {
  NULL,               /* init */
-  my_strnncoll_utf8mb4,
+  my_strnncoll_utf8mb4_general_ci,
-  my_strnncollsp_utf8mb4,
+  my_strnncollsp_utf8mb4_general_ci,
  my_strnxfrm_unicode,
  my_strnxfrmlen_unicode,
  my_like_range_mb,
@ -7950,8 +7835,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
 static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
 {
    NULL,		/* init */
-    my_strnncoll_mb_bin,
+    my_strnncoll_utf8mb4_bin,
-    my_strnncollsp_mb_bin,
+    my_strnncollsp_utf8mb4_bin,
    my_strnxfrm_unicode_full_bin,
    my_strnxfrmlen_unicode_full_bin,
    my_like_range_mb,
--- a/strings/strcoll.ic
+++ b/strings/strcoll.ic
@ -118,6 +118,18 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
  }
 #endif
 #ifdef IS_MB4_CHAR
  if (str + 4 > end)                     /* Incomplete four-byte character */
    goto bad;
  if (IS_MB4_CHAR(str[0], str[1], str[2], str[3]))
  {
    *weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]);
    return 4;                            /* A valid four-byte character */
  }
 #endif
 bad:
  *weight= WEIGHT_ILSEQ(str[0]);         /* Bad byte */
  return 1;
@ -252,4 +264,5 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
 #undef WEIGHT_MB1
 #undef WEIGHT_MB2
 #undef WEIGHT_MB3
 #undef WEIGHT_MB4
 #undef WEIGHT_PAD_SPACE
--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@ -369,6 +369,49 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
 };
 /*
  Comparison cases that involve four-byte utf8mb4 sequences.
  Each entry gives two byte strings (through CSTR) followed by a number
  that, judging by the per-entry comments, is the expected sign of
  comparing the first string to the second. The table ends with an
  all-NULL/zero entry.
  In the first two groups a valid four-byte character is expected to
  compare as less than every unused, incomplete or broken sequence.
 */
 STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
 {
  /* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
  {CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"),        -1},  /* MB4 vs unused byte */
  {CSTR("\xF0\x90\x80\x80"), CSTR("\xC2"),        -1},  /* MB4 vs incomplete MB2 */
  {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\x7F"),-1},  /* MB4 vs broken MB3 */
  {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\xC0"),-1},  /* MB4 vs broken MB3 */
  {CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0"),     -1}, /* MB4 vs incomplete MB3 */
  {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80"),-1},  /* MB4 vs incomplete MB4 */
  {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */
  {CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */
  /* Maximum four-byte character: U+10FFFF == _utf8 0xF48FBFBF */
  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC0"),        -1},  /* MB4 vs unused byte */
  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC2"),        -1},  /* MB4 vs incomplete MB2 */
  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\x7F"),-1},  /* MB4 vs broken MB3 */
  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\xC0"),-1},  /* MB4 vs broken MB3 */
  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0"),     -1}, /* MB4 vs incomplete MB3 */
  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80"),-1},  /* MB4 vs incomplete MB4 */
  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */
  {CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */
  /* Broken MB4 vs incomplete/broken MB3 */
  {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0"),    1},  /* Broken MB4 vs incomplete MB3 */
  {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\x7F"),1},  /* Broken MB4 vs broken MB3 */
  {CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\xC0"),1},  /* Broken MB4 vs broken MB3 */
  /*
    Broken MB4 vs incomplete MB4:
    The three leftmost bytes are compared binary, the fourth byte is compared
    to auto-padded space.
  */
  {CSTR("\xF0\x90\x80\x1F"), CSTR("\xF0\x90\x80"),-1}, /* Broken MB4 vs incomplete MB4 */
  {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80"),1},  /* Broken MB4 vs incomplete MB4 */
  /* Broken MB4 vs broken MB4 */
  {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\x7F"),-1},/* Broken MB4 vs broken MB4 */
  {CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\xC0"),-1},/* Broken MB4 vs broken MB4 */
  {NULL, 0, NULL, 0, 0}
 };
 static void
 str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
 {
@ -497,6 +540,12 @@ test_strcollsp()
  failed+= strcollsp(&my_charset_utf8_general_ci,          strcoll_utf8mb3_common);
  failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
  failed+= strcollsp(&my_charset_utf8_bin,                 strcoll_utf8mb3_common);
 #endif
 #ifdef HAVE_CHARSET_utf8mb4
  failed+= strcollsp(&my_charset_utf8mb4_general_ci,          strcoll_utf8mb3_common);
  failed+= strcollsp(&my_charset_utf8mb4_bin,                 strcoll_utf8mb3_common);
  failed+= strcollsp(&my_charset_utf8mb4_general_ci,          strcoll_utf8mb4_common);
  failed+= strcollsp(&my_charset_utf8mb4_bin,                 strcoll_utf8mb4_common);
 #endif
  return failed;
 }