move convert to datatypes::Charset class

2025-10-15 14:06:45 +03:00 · 2024-03-13 12:34:06 +00:00
parent c024bb0be5
commit 38c9b51a13
4 changed files with 60 additions and 61 deletions
--- a/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result
+++ b/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result
@@ -543,97 +543,97 @@ INSERT INTO t2 values('こんにちは');
 INSERT INTO t2 values('привет');
 INSERT INTO t2 values('Γεια');
 INSERT INTO t2 values('სალამი');
-SELECT hello, hello regexp 'ん.ち' from t2;
+SELECT hello, hello regexp 'ん.ち' FROM t2;
 hello	hello regexp 'ん.ち'
 こんにちは	0
 привет	0
 Γεια	0
 სალამი	0
-SELECT hello, hello regexp 'и.е' from t2;
+SELECT hello, hello regexp 'и.е' FROM t2;
 hello	hello regexp 'и.е'
 こんにちは	0
 привет	0
 Γεια	0
 სალამი	0
-SELECT hello, hello regexp 'ε.α' from t2;
+SELECT hello, hello regexp 'ε.α' FROM t2;
 hello	hello regexp 'ε.α'
 こんにちは	0
 привет	0
 Γεια	0
 სალამი	0
-SELECT hello, hello regexp 'ა.ა' from t2;
+SELECT hello, hello regexp 'ა.ა' FROM t2;
 hello	hello regexp 'ა.ა'
 こんにちは	0
 привет	0
 Γεια	0
 სალამი	0
-SELECT hello, regexp_substr(hello, 'ん.ち') from t2;
+SELECT hello, regexp_substr(hello, 'ん.ち') FROM t2;
 hello	regexp_substr(hello, 'ん.ち')
 こんにちは	
 привет	
 Γεια	
 სალამი	
-SELECT hello, regexp_substr(hello, 'и.е') from t2;
+SELECT hello, regexp_substr(hello, 'и.е') FROM t2;
 hello	regexp_substr(hello, 'и.е')
 こんにちは	
 привет	
 Γεια	
 სალამი	
-SELECT hello, regexp_substr(hello, 'ε.α') from t2;
+SELECT hello, regexp_substr(hello, 'ε.α') FROM t2;
 hello	regexp_substr(hello, 'ε.α')
 こんにちは	
 привет	
 Γεια	
 სალამი	
-SELECT hello, regexp_substr(hello, 'ა.ა') from t2;
+SELECT hello, regexp_substr(hello, 'ა.ა') FROM t2;
 hello	regexp_substr(hello, 'ა.ა')
 こんにちは	
 привет	
 Γεια	
 სალამი	
-SELECT hello, regexp_instr(hello, 'ん.ち') from t2;
+SELECT hello, regexp_instr(hello, 'ん.ち') FROM t2;
 hello	regexp_instr(hello, 'ん.ち')
 こんにちは	0
 привет	0
 Γεια	0
 სალამი	0
-SELECT hello, regexp_instr(hello, 'и.е') from t2;
+SELECT hello, regexp_instr(hello, 'и.е') FROM t2;
 hello	regexp_instr(hello, 'и.е')
 こんにちは	0
 привет	0
 Γεια	0
 სალამი	0
-SELECT hello, regexp_instr(hello, 'ε.α') from t2;
+SELECT hello, regexp_instr(hello, 'ε.α') FROM t2;
 hello	regexp_instr(hello, 'ε.α')
 こんにちは	0
 привет	0
 Γεια	0
 სალამი	0
-SELECT hello, regexp_instr(hello, 'ა.ა') from t2;
+SELECT hello, regexp_instr(hello, 'ა.ა') FROM t2;
 hello	regexp_instr(hello, 'ა.ა')
 こんにちは	0
 привет	0
 Γεια	0
 სალამი	0
-SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2;
+SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') FROM t2;
 hello	regexp_replace(hello, 'ん.ち', 'Достоевский')
 こんにちは	こんにちは
 привет	привет
 Γεια	Γεια
 სალამი	სალამი
-SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2;
+SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') FROM t2;
 hello	regexp_replace(hello, 'и.е', 'Достоевский')
 こんにちは	こんにちは
 привет	привет
 Γεια	Γεια
 სალამი	სალამი
-SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2;
+SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') FROM t2;
 hello	regexp_replace(hello, 'ε.α', 'Достоевский')
 こんにちは	こんにちは
 привет	привет
 Γεια	Γεια
 სალამი	სალამი
-SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2;
+SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') FROM t2;
 hello	regexp_replace(hello, 'ა.ა', 'Достоевский')
 こんにちは	こんにちは
 привет	привет
--- a/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test
+++ b/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test
@@ -152,32 +152,31 @@ SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1;
 SET character_set_connection = 'utf8';
 CREATE TABLE t2 (hello text) engine columnstore;
 INSERT INTO t2 values('こんにちは');
 INSERT INTO t2 values('привет');
 INSERT INTO t2 values('Γεια');
 INSERT INTO t2 values('სალამი');
-SELECT hello, hello regexp 'ん.ち' from t2;
+SELECT hello, hello regexp 'ん.ち' FROM t2;
-SELECT hello, hello regexp 'и.е' from t2;
+SELECT hello, hello regexp 'и.е' FROM t2;
-SELECT hello, hello regexp 'ε.α' from t2;
+SELECT hello, hello regexp 'ε.α' FROM t2;
-SELECT hello, hello regexp 'ა.ა' from t2;
+SELECT hello, hello regexp 'ა.ა' FROM t2;
-SELECT hello, regexp_substr(hello, 'ん.ち') from t2;
+SELECT hello, regexp_substr(hello, 'ん.ち') FROM t2;
-SELECT hello, regexp_substr(hello, 'и.е') from t2;
+SELECT hello, regexp_substr(hello, 'и.е') FROM t2;
-SELECT hello, regexp_substr(hello, 'ε.α') from t2;
+SELECT hello, regexp_substr(hello, 'ε.α') FROM t2;
-SELECT hello, regexp_substr(hello, 'ა.ა') from t2;
+SELECT hello, regexp_substr(hello, 'ა.ა') FROM t2;
-SELECT hello, regexp_instr(hello, 'ん.ち') from t2;
+SELECT hello, regexp_instr(hello, 'ん.ち') FROM t2;
-SELECT hello, regexp_instr(hello, 'и.е') from t2;
+SELECT hello, regexp_instr(hello, 'и.е') FROM t2;
-SELECT hello, regexp_instr(hello, 'ε.α') from t2;
+SELECT hello, regexp_instr(hello, 'ε.α') FROM t2;
-SELECT hello, regexp_instr(hello, 'ა.ა') from t2;
+SELECT hello, regexp_instr(hello, 'ა.ა') FROM t2;
-SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2;
+SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') FROM t2;
-SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2;
+SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') FROM t2;
-SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2;
+SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') FROM t2;
-SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2;
+SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') FROM t2;
 # Clean UP
 DROP DATABASE mcs228_db;
--- a/utils/common/collation.h
+++ b/utils/common/collation.h
@@ -146,6 +146,23 @@ class Charset
  // Wraps the given charset descriptor. When no descriptor is supplied
  // (cs is null) the wrapper falls back to my_charset_bin.
  Charset(CHARSET_INFO* cs = nullptr) : mCharset(cs != nullptr ? cs : &my_charset_bin)
  {
  }
  bool operator==(const Charset& rhs)
  {
     return rhs.getCharset().cs_name.str == getCharset().cs_name.str;
  }
  // Converts `from`, which is encoded in `fromCs`, into this object's charset
  // and returns the converted bytes as a new string.
  //
  // The output buffer is sized for the worst case (input byte count times this
  // charset's mbmaxlen) and then trimmed to the size reported by my_convert.
  // The error count reported by my_convert is intentionally discarded.
  std::string convert(const std::string& from, const datatypes::Charset& fromCs) const
  {
     std::string result;
     uint dummy_errors = 0;
     result.resize(from.size() * getCharset().mbmaxlen);
     // Write through the mutable element access rather than casting away the
     // constness of c_str(): modifying the array returned by c_str() is not
     // permitted. &result[0] is valid even when the string is empty.
     size_t resultingSize = my_convert(&result[0], result.size(), &getCharset(), from.c_str(),
                                       from.size(), &fromCs.getCharset(), &dummy_errors);
     result.resize(resultingSize);
     return result;
  }
  Charset(uint32_t charsetNumber);
  void setCharset(uint32_t charsetNumber);
  CHARSET_INFO& getCharset() const
--- a/utils/funcexp/func_regexp.cpp
+++ b/utils/funcexp/func_regexp.cpp
@@ -39,42 +39,27 @@ using namespace execplan;
 #include "errorcodes.h"
 #include "idberrorinfo.h"
 #include "errorids.h"
 using namespace logging;
 namespace
 {
 // Converts `from`, which is encoded in `from_cs`, into `to_cs` and returns the
 // converted bytes as a new string.
 //
 // The output buffer is sized for the worst case (input byte count times
 // to_cs->mbmaxlen) and then trimmed to the size reported by my_convert.
 // The error count reported by my_convert is intentionally discarded.
 std::string csConvert(const std::string& from, CHARSET_INFO* to_cs, CHARSET_INFO* from_cs)
 {
  std::string result;
  uint dummy_errors = 0;
  result.resize(from.size() * to_cs->mbmaxlen);
  // Write through the mutable element access rather than casting away the
  // constness of c_str(): modifying the array returned by c_str() is not
  // permitted. &result[0] is valid even when the string is empty.
  size_t resultingSize = my_convert(&result[0], result.size(), to_cs, from.c_str(),
                                    from.size(), from_cs, &dummy_errors);
  result.resize(resultingSize);
  return result;
 }
 using jp = jpcre2::select<char>;
 struct PCREOptions
 {
  PCREOptions(execplan::CalpontSystemCatalog::ColType& ct);
-  CHARSET_INFO* dataCharset = &my_charset_utf8mb3_general_ci;
+  datatypes::Charset dataCharset = my_charset_utf8mb3_general_ci;
-  CHARSET_INFO* libraryCharset = &my_charset_utf8mb3_general_ci;
+  datatypes::Charset libraryCharset = my_charset_utf8mb3_general_ci;
  jpcre2::Uint flags = 0;
  bool conversionIsNeeded = false;
 };
 // Reports whether two charset descriptors carry the same name, judged by
 // comparing their cs_name.str members with ==.
 inline bool areSameCharsets(CHARSET_INFO* cs1, CHARSET_INFO* cs2)
 {
  const auto firstName = cs1->cs_name.str;
  const auto secondName = cs2->cs_name.str;
  return firstName == secondName;
 }
 PCREOptions::PCREOptions(execplan::CalpontSystemCatalog::ColType& ct)
 {
-  CHARSET_INFO* cs = ct.getCharset();
+  datatypes::Charset cs = ct.getCharset();
  // TODO use system variable instead of hardcode: default_regex_flags_pcre(_current_thd());
  // PCRE2_DOTALL | PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_EXTENDED_MORE | PCRE2_MULTILINE | PCRE2_UNGREEDY;
@@ -82,12 +67,11 @@ PCREOptions::PCREOptions(execplan::CalpontSystemCatalog::ColType& ct)
  jpcre2::Uint defaultFlags = 0;
  flags = (cs != &my_charset_bin ? (PCRE2_UTF | PCRE2_UCP) : 0) |
-          ((cs->state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags;
+          ((cs.getCharset().state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags;
  // Convert text data to utf-8.
  dataCharset = cs;
-  libraryCharset = cs == &my_charset_bin ? &my_charset_bin : &my_charset_utf8mb3_general_ci;
+  libraryCharset = cs == my_charset_bin ? my_charset_bin : my_charset_utf8mb3_general_ci;
  conversionIsNeeded = (cs != &my_charset_bin) && !areSameCharsets(cs, libraryCharset);
 }
 struct RegExpParams
@@ -99,8 +83,8 @@ struct RegExpParams
    if (options.conversionIsNeeded)
      return *this;
-    expression = csConvert(expression, options.libraryCharset, options.dataCharset);
+    expression = options.libraryCharset.convert(expression, options.dataCharset);
-    pattern = csConvert(pattern, options.libraryCharset, options.dataCharset);
+    pattern = options.libraryCharset.convert(pattern, options.dataCharset);
    return *this;
  }
 };
@@ -329,11 +313,10 @@ std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp,
  const auto& replaceWithStr = replaceWith.unsafeStringRef();
  if (options.conversionIsNeeded)
  {
-    const auto& convertedReplaceToken = csConvert(replaceWithStr, options.libraryCharset, options.dataCharset);
+    const auto& convertedReplaceToken = options.libraryCharset.convert(replaceWithStr, options.dataCharset);
    return re.replace(param.expression, convertedReplaceToken, "g");
  }
  return re.replace(param.expression, replaceWithStr, "g");
 }
@@ -391,8 +374,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b
    return "0";
  size_t offset = vec_soff[0];
-  size_t charNumber =
+  size_t charNumber = options.libraryCharset.getCharset().numchars(param.expression.c_str(),
-      options.libraryCharset->numchars(param.expression.c_str(), param.expression.c_str() + offset);
+                                                                   param.expression.c_str() + offset);
  return std::to_string(charNumber + 1);
 }