1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-18 23:03:28 +03:00

A joint patch for MDEV-19284 and MDEV-19285 (INSTANT ALTER)

This patch fixes:

- MDEV-19284 INSTANT ALTER with ucs2-to-utf16 conversion produces bad data
- MDEV-19285 INSTANT ALTER from ascii_general_ci to latin1_general_ci produces corrupt data

These regressions were introduced in 10.4.3 by:
- MDEV-15564 Avoid table rebuild in ALTER TABLE on collation or charset changes

Changes:

1. Cleanup: Adding a helper method
   Field_longstr::csinfo_change_allows_instant_alter(),
   to remove some duplicate code in field.cc.

2. Cleanup: removing Type_handler::Charsets_are_compatible() and the static
   function charsets_are_compatible(), and introducing new methods
   in the recently added class Charset instead:
   - encoding_allows_reinterpret_as()
   - encoding_and_order_allow_reinterpret_as()

3. Bug fix: Removing the code that allowed instant conversion for
   ascii-to-8bit and ucs2-to-utf16.
   This is the change that actually fixes MDEV-19284 and MDEV-19285.

4. Bug fix: Adding a helper method Charset::collation_specific_name().
   The corresponding old code in Type_handler::Charsets_are_compatible()
   skipped strlen(csname) bytes of the collation name unconditionally, so it
   was not safe against (badly named) user-defined collations whose
   character set name can be longer than the collation name.
This commit is contained in:
Alexander Barkov
2019-04-19 15:18:38 +04:00
parent 9aa80fcf46
commit c59d6395a6
8 changed files with 925 additions and 847 deletions

View File

@ -8219,48 +8219,51 @@ Type_handler_timestamp_common::Item_param_val_native(THD *thd,
TIME_to_native(thd, &ltime, to, item->datetime_precision(thd));
}
static bool charsets_are_compatible(const char *old_cs_name,
const CHARSET_INFO *new_ci)
LEX_CSTRING Charset::collation_specific_name() const
{
const char *new_cs_name= new_ci->csname;
/*
User-defined collations can provide arbitrary names
for character sets and collations, so a collation
name does not necessarily start with the character set name.
*/
size_t csname_length= strlen(m_charset->csname);
if (strncmp(m_charset->name, m_charset->csname, csname_length))
return {NULL, 0};
const char *ptr= m_charset->name + csname_length;
return {ptr, strlen(ptr) };
}
if (!strcmp(old_cs_name, new_cs_name))
bool
Charset::encoding_allows_reinterpret_as(const CHARSET_INFO *cs) const
{
if (!strcmp(m_charset->csname, cs->csname))
return true;
if (!strcmp(old_cs_name, MY_UTF8MB3) && !strcmp(new_cs_name, MY_UTF8MB4))
return true;
if (!strcmp(old_cs_name, "ascii") && !(new_ci->state & MY_CS_NONASCII))
return true;
if (!strcmp(old_cs_name, "ucs2") && !strcmp(new_cs_name, "utf16"))
if (!strcmp(m_charset->csname, MY_UTF8MB3) &&
!strcmp(cs->csname, MY_UTF8MB4))
return true;
/*
Originally we allowed instant ALTER here for ASCII-to-LATIN1
and UCS2-to-UTF16, but this was wrong:
- MariaDB's ascii is not a subset of 8-bit character sets
like latin1, because it allows storing bytes 0x80..0xFF as
"unassigned" characters (see MDEV-19285).
- MariaDB's ucs2 (as in Unicode-1.1) is not a subset of UTF16,
because they treat surrogate codes differently (MDEV-19284).
*/
return false;
}
bool Type_handler::Charsets_are_compatible(const CHARSET_INFO *old_ci,
const CHARSET_INFO *new_ci,
bool part_of_a_key)
bool
Charset::encoding_and_order_allow_reinterpret_as(CHARSET_INFO *cs) const
{
const char *old_cs_name= old_ci->csname;
const char *new_cs_name= new_ci->csname;
if (!charsets_are_compatible(old_cs_name, new_ci))
{
if (!encoding_allows_reinterpret_as(cs))
return false;
}
if (!part_of_a_key)
{
return true;
}
if (strcmp(old_ci->name + strlen(old_cs_name),
new_ci->name + strlen(new_cs_name)))
{
return false;
}
return true;
LEX_CSTRING name0= collation_specific_name();
LEX_CSTRING name1= Charset(cs).collation_specific_name();
return name0.length && !cmp(&name0, &name1);
}