A joint patch for MDEV-19284 and MDEV-19285 (INSTANT ALTER)

This patch fixes:
- MDEV-19284 INSTANT ALTER with ucs2-to-utf16 conversion produces bad data
- MDEV-19285 INSTANT ALTER from ascii_general_ci to latin1_general_ci produces corrupt data

These regressions were introduced in 10.4.3 by:
- MDEV-15564 Avoid table rebuild in ALTER TABLE on collation or charset changes

Changes:
1. Cleanup: Adding a helper method Field_longstr::csinfo_change_allows_instant_alter(), to remove some duplicate code in field.cc.
2. Cleanup: Removing Type_handler::Charsets_are_compatible() and the static function charsets_are_compatible(), and introducing new methods in the recently added class Charset instead:
   - encoding_allows_reinterpret_as()
   - encoding_and_order_allow_reinterpret_as()
3. Bug fix: Removing the code that allowed instant conversion for ascii-to-8bit and ucs2-to-utf16. This actually fixes MDEV-19284 and MDEV-19285.
4. Bug fix: Adding a helper method Charset::collation_specific_name(). The old corresponding code in Type_handler::Charsets_are_compatible() was not safe against (badly named) user-defined collations whose character set name can be longer than the collation name.
2025-11-25 17:25:02 +03:00 · 2019-04-19 15:18:38 +04:00
parent 9aa80fcf46
commit c59d6395a6
8 changed files with 925 additions and 847 deletions
--- a/sql/sql_type.cc
+++ b/sql/sql_type.cc
@@ -8219,48 +8219,51 @@ Type_handler_timestamp_common::Item_param_val_native(THD *thd,
    TIME_to_native(thd, &ltime, to, item->datetime_precision(thd));
 }

-static bool charsets_are_compatible(const char *old_cs_name,
-                                    const CHARSET_INFO *new_ci)
+
+LEX_CSTRING Charset::collation_specific_name() const
 {
-  const char *new_cs_name= new_ci->csname;
+  /*
+    User defined collations can provide arbitrary names
+    for character sets and collations, so a collation
+    name does not necessarily start with the character set name.
+  */
+  size_t csname_length= strlen(m_charset->csname);
+  if (strncmp(m_charset->name, m_charset->csname, csname_length))
+    return {NULL, 0};
+  const char *ptr= m_charset->name + csname_length;
+  return {ptr, strlen(ptr) };
+}

-  if (!strcmp(old_cs_name, new_cs_name))
+
+bool
+Charset::encoding_allows_reinterpret_as(const CHARSET_INFO *cs) const
+{
+  if (!strcmp(m_charset->csname, cs->csname))
    return true;

-  if (!strcmp(old_cs_name, MY_UTF8MB3) && !strcmp(new_cs_name, MY_UTF8MB4))
-    return true;
-
-  if (!strcmp(old_cs_name, "ascii") && !(new_ci->state & MY_CS_NONASCII))
-    return true;
-
-  if (!strcmp(old_cs_name, "ucs2") && !strcmp(new_cs_name, "utf16"))
+  if (!strcmp(m_charset->csname, MY_UTF8MB3) &&
+      !strcmp(cs->csname, MY_UTF8MB4))
    return true;

+  /*
+    Originally we allowed instant ALTER here for ASCII-to-LATIN1
+    and UCS2-to-UTF16, but this was wrong:
+    - MariaDB's ascii is not a subset of 8-bit character sets
+      like latin1, because it allows storing bytes 0x80..0xFF as
+      "unassigned" characters (see MDEV-19285).
+    - MariaDB's ucs2 (as in Unicode-1.1) is not a subset of UTF16,
+      because they treat surrogate codes differently (MDEV-19284).
+  */
  return false;
 }

-bool Type_handler::Charsets_are_compatible(const CHARSET_INFO *old_ci,
-                                           const CHARSET_INFO *new_ci,
-                                           bool part_of_a_key)
+
+bool
+Charset::encoding_and_order_allow_reinterpret_as(CHARSET_INFO *cs) const
 {
-  const char *old_cs_name= old_ci->csname;
-  const char *new_cs_name= new_ci->csname;
-
-  if (!charsets_are_compatible(old_cs_name, new_ci))
-  {
+  if (!encoding_allows_reinterpret_as(cs))
    return false;
-  }
-
-  if (!part_of_a_key)
-  {
-    return true;
-  }
-
-  if (strcmp(old_ci->name + strlen(old_cs_name),
-             new_ci->name + strlen(new_cs_name)))
-  {
-    return false;
-  }
-
-  return true;
+  LEX_CSTRING name0= collation_specific_name();
+  LEX_CSTRING name1= Charset(cs).collation_specific_name();
+  return name0.length && !cmp(&name0, &name1);
 }