MDEV-30164 System variable for default collations

This patch adds a way to override default collations (or "character set collations") for desired character sets. The SQL standard says: > Each collation known in an SQL-environment is applicable to one > or more character sets, and for each character set, one or more > collations are applicable to it, one of which is associated with > it as its character set collation. In MariaDB, character set collations have been hard-coded so far, e.g. utf8mb4_general_ci has been a hard-coded character set collation for utf8mb4. This patch allows overriding character set collations (globally per server, or per session), so for example, uca1400_ai_ci can be set as a character set collation for Unicode character sets (instead of the compiled-in xxx_general_ci). The array of overridden character set collations is stored in a new (session and global) system variable @@character_set_collations and can be set as a comma-separated list of charset=collation pairs, e.g.: SET @@character_set_collations='utf8mb3=uca1400_ai_ci,utf8mb4=uca1400_ai_ci'; The variable is empty by default, which means that the hard-coded character set collations are used (e.g. utf8mb4_general_ci for utf8mb4). The variable can also be set globally by passing it on the server startup command line and/or in my.cnf.
2025-08-08 11:22:35 +03:00 · 2022-12-14 18:46:27 +04:00
parent 584c2351de
commit 75f25e4ca7
59 changed files with 2228 additions and 111 deletions
--- a/sql/simple_tokenizer.h
+++ b/sql/simple_tokenizer.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2023, MariaDB Corporation.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
+
+#ifndef SIMPLE_TOKENIZER_INCLUDED
+#define SIMPLE_TOKENIZER_INCLUDED
+
+
+/**
+  A forward-only scanner over the bytes [str, str + length).
+  It skips ' ' characters and extracts ASCII identifiers and single
+  characters. The input is referenced, not copied.
+*/
+class Simple_tokenizer
+{
+  const char *m_ptr;  // Current read position
+  const char *m_end;  // One past the last readable byte
+public:
+  Simple_tokenizer(const char *str, size_t length)
+   :m_ptr(str), m_end(str + length)
+  { }
+  /// Current read position
+  const char *ptr() const { return m_ptr; }
+  /// True when the read position has reached (or passed) the end
+  bool eof() const { return m_ptr >= m_end; }
+  /// Advance over a run of ' ' characters; no other character is skipped
+  void get_spaces()
+  {
+    while (!eof() && m_ptr[0] == ' ')
+      m_ptr++;
+  }
+  /// True if ch can begin an identifier: an ASCII letter or '_'
+  bool is_ident_start(char ch) const
+  {
+    return ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+  }
+  /// True if ch can continue an identifier: a start character or a digit
+  bool is_ident_body(char ch) const
+  {
+    return (ch >= '0' && ch <= '9') || is_ident_start(ch);
+  }
+  /// True if input remains and the current character can begin an identifier
+  bool is_ident_start() const { return !eof() && is_ident_start(*m_ptr); }
+  /// True if input remains and the current character can continue one
+  bool is_ident_body() const { return !eof() && is_ident_body(*m_ptr); }
+  /**
+    Skip spaces, then consume the longest identifier at the read position.
+    @return  {start, length} within the input; length is 0 if none starts here
+  */
+  LEX_CSTRING get_ident()
+  {
+    get_spaces();
+    const char *const first= m_ptr;
+    if (!is_ident_start())
+      return {first, 0};
+    m_ptr++;                           // The start character is accepted
+    while (is_ident_body())
+      m_ptr++;
+    return {first, (size_t) (m_ptr - first)};
+  }
+  /// Skip spaces, then consume ch if it is the next character.
+  /// @return  false if ch was consumed, true if it was not found
+  bool get_char(char ch)
+  {
+    get_spaces();
+    const bool mismatch= eof() || *m_ptr != ch;
+    if (!mismatch)
+      m_ptr++;
+    return mismatch;
+  }
+};
+
+
+#endif // SIMPLE_TOKENIZER_INCLUDED