diff --git a/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result b/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result index 959fa64b5..96578b1e1 100644 --- a/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result +++ b/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result @@ -543,97 +543,97 @@ INSERT INTO t2 values('こんにちは'); INSERT INTO t2 values('привет'); INSERT INTO t2 values('Γεια'); INSERT INTO t2 values('სალამი'); -SELECT hello, hello regexp 'ん.ち' from t2; +SELECT hello, hello regexp 'ん.ち' FROM t2; hello hello regexp 'ん.ち' こんにちは 0 привет 0 Γεια 0 სალამი 0 -SELECT hello, hello regexp 'и.е' from t2; +SELECT hello, hello regexp 'и.е' FROM t2; hello hello regexp 'и.е' こんにちは 0 привет 0 Γεια 0 სალამი 0 -SELECT hello, hello regexp 'ε.α' from t2; +SELECT hello, hello regexp 'ε.α' FROM t2; hello hello regexp 'ε.α' こんにちは 0 привет 0 Γεια 0 სალამი 0 -SELECT hello, hello regexp 'ა.ა' from t2; +SELECT hello, hello regexp 'ა.ა' FROM t2; hello hello regexp 'ა.ა' こんにちは 0 привет 0 Γεια 0 სალამი 0 -SELECT hello, regexp_substr(hello, 'ん.ち') from t2; +SELECT hello, regexp_substr(hello, 'ん.ち') FROM t2; hello regexp_substr(hello, 'ん.ち') こんにちは привет Γεια სალამი -SELECT hello, regexp_substr(hello, 'и.е') from t2; +SELECT hello, regexp_substr(hello, 'и.е') FROM t2; hello regexp_substr(hello, 'и.е') こんにちは привет Γεια სალამი -SELECT hello, regexp_substr(hello, 'ε.α') from t2; +SELECT hello, regexp_substr(hello, 'ε.α') FROM t2; hello regexp_substr(hello, 'ε.α') こんにちは привет Γεια სალამი -SELECT hello, regexp_substr(hello, 'ა.ა') from t2; +SELECT hello, regexp_substr(hello, 'ა.ა') FROM t2; hello regexp_substr(hello, 'ა.ა') こんにちは привет Γεια სალამი -SELECT hello, regexp_instr(hello, 'ん.ち') from t2; +SELECT hello, regexp_instr(hello, 'ん.ち') FROM t2; hello regexp_instr(hello, 'ん.ち') こんにちは 0 привет 0 Γεια 0 სალამი 0 -SELECT hello, regexp_instr(hello, 'и.е') from t2; +SELECT hello, regexp_instr(hello, 'и.е') FROM t2; hello regexp_instr(hello, 'и.е') こんにちは 0 привет 0 Γεια 0 სალამი 0 -SELECT hello, regexp_instr(hello, 'ε.α') from t2; +SELECT hello, regexp_instr(hello, 'ε.α') FROM t2; hello regexp_instr(hello, 'ε.α') こんにちは 0 привет 0 Γεια 0 სალამი 0 -SELECT hello, regexp_instr(hello, 'ა.ა') from t2; +SELECT hello, regexp_instr(hello, 'ა.ა') FROM t2; hello regexp_instr(hello, 'ა.ა') こんにちは 0 привет 0 Γεια 0 სალამი 0 -SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2; +SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') FROM t2; hello regexp_replace(hello, 'ん.ち', 'Достоевский') こんにちは こんにちは привет привет Γεια Γεια სალამი სალამი -SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2; +SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') FROM t2; hello regexp_replace(hello, 'и.е', 'Достоевский') こんにちは こんにちは привет привет Γεια Γεια სალამი სალამი -SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2; +SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') FROM t2; hello regexp_replace(hello, 'ε.α', 'Достоевский') こんにちは こんにちは привет привет Γεια Γεια სალამი სალამი -SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2; +SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') FROM t2; hello regexp_replace(hello, 'ა.ა', 'Достоевский') こんにちは こんにちは привет привет diff --git a/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test b/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test index 4ba2a6e65..c281ecaaa 100644 --- a/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test +++ b/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test @@ -152,32 +152,31 @@ SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1; SET character_set_connection = 'utf8'; - CREATE TABLE t2 (hello text) engine columnstore; INSERT INTO t2 values('こんにちは'); INSERT INTO t2 values('привет'); INSERT INTO t2 values('Γεια'); INSERT INTO t2 values('სალამი'); -SELECT hello, hello regexp 'ん.ち' from t2; -SELECT hello, hello regexp 'и.е' from t2; -SELECT hello, hello regexp 'ε.α' from t2; -SELECT hello, hello regexp 'ა.ა' from t2; +SELECT hello, hello regexp 'ん.ち' FROM t2; +SELECT hello, hello regexp 'и.е' FROM t2; +SELECT hello, hello regexp 'ε.α' FROM t2; +SELECT hello, hello regexp 'ა.ა' FROM t2; -SELECT hello, regexp_substr(hello, 'ん.ち') from t2; -SELECT hello, regexp_substr(hello, 'и.е') from t2; -SELECT hello, regexp_substr(hello, 'ε.α') from t2; -SELECT hello, regexp_substr(hello, 'ა.ა') from t2; +SELECT hello, regexp_substr(hello, 'ん.ち') FROM t2; +SELECT hello, regexp_substr(hello, 'и.е') FROM t2; +SELECT hello, regexp_substr(hello, 'ε.α') FROM t2; +SELECT hello, regexp_substr(hello, 'ა.ა') FROM t2; -SELECT hello, regexp_instr(hello, 'ん.ち') from t2; -SELECT hello, regexp_instr(hello, 'и.е') from t2; -SELECT hello, regexp_instr(hello, 'ε.α') from t2; -SELECT hello, regexp_instr(hello, 'ა.ა') from t2; +SELECT hello, regexp_instr(hello, 'ん.ち') FROM t2; +SELECT hello, regexp_instr(hello, 'и.е') FROM t2; +SELECT hello, regexp_instr(hello, 'ε.α') FROM t2; +SELECT hello, regexp_instr(hello, 'ა.ა') FROM t2; -SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2; -SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2; -SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2; -SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2; +SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') FROM t2; +SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') FROM t2; +SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') FROM t2; +SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') FROM t2; # Clean UP DROP DATABASE mcs228_db; diff --git a/utils/common/collation.h b/utils/common/collation.h index 82e8e30b0..954141ba5 100644 --- a/utils/common/collation.h +++ b/utils/common/collation.h @@ -146,6 +146,23 @@ class Charset Charset(CHARSET_INFO* cs = nullptr) : mCharset(cs ? cs : &my_charset_bin) { } + + bool operator==(const Charset& rhs) + { + return rhs.getCharset().cs_name.str == getCharset().cs_name.str; + } + + std::string convert(const std::string& from, const datatypes::Charset& fromCs) const + { + std::string result; + uint dummy_errors; + result.resize(from.size() * getCharset().mbmaxlen); + size_t resultingSize = my_convert(const_cast(result.c_str()), result.size(), &getCharset(), from.c_str(), + from.size(), &fromCs.getCharset(), &dummy_errors); + result.resize(resultingSize); + return result; + } + Charset(uint32_t charsetNumber); void setCharset(uint32_t charsetNumber); CHARSET_INFO& getCharset() const diff --git a/utils/funcexp/func_regexp.cpp b/utils/funcexp/func_regexp.cpp index 3da9fc7dd..d21275c9e 100644 --- a/utils/funcexp/func_regexp.cpp +++ b/utils/funcexp/func_regexp.cpp @@ -39,42 +39,27 @@ using namespace execplan; #include "errorcodes.h" #include "idberrorinfo.h" #include "errorids.h" + using namespace logging; namespace { -std::string csConvert(const std::string& from, CHARSET_INFO* to_cs, CHARSET_INFO* from_cs) -{ - std::string result; - uint dummy_errors; - result.resize(from.size() * to_cs->mbmaxlen); - size_t resultingSize = my_convert(const_cast(result.c_str()), result.size(), to_cs, from.c_str(), - from.size(), from_cs, &dummy_errors); - result.resize(resultingSize); - return result; -} - using jp = jpcre2::select; struct PCREOptions { PCREOptions(execplan::CalpontSystemCatalog::ColType& ct); - CHARSET_INFO* dataCharset = &my_charset_utf8mb3_general_ci; - CHARSET_INFO* libraryCharset = &my_charset_utf8mb3_general_ci; + datatypes::Charset dataCharset = my_charset_utf8mb3_general_ci; + datatypes::Charset libraryCharset = my_charset_utf8mb3_general_ci; jpcre2::Uint flags = 0; bool conversionIsNeeded = false; }; -inline bool areSameCharsets(CHARSET_INFO* cs1, CHARSET_INFO* cs2) -{ - return (cs1->cs_name.str == cs2->cs_name.str); -} - PCREOptions::PCREOptions(execplan::CalpontSystemCatalog::ColType& ct) { - CHARSET_INFO* cs = ct.getCharset(); + datatypes::Charset cs = ct.getCharset(); // TODO use system variable instead if hardcode default_regex_flags_pcre(_current_thd()); // PCRE2_DOTALL | PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_EXTENDED_MORE | PCRE2_MULTILINE | PCRE2_UNGREEDY; @@ -82,12 +67,11 @@ PCREOptions::PCREOptions(execplan::CalpontSystemCatalog::ColType& ct) jpcre2::Uint defaultFlags = 0; flags = (cs != &my_charset_bin ? (PCRE2_UTF | PCRE2_UCP) : 0) | - ((cs->state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags; + ((cs.getCharset().state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags; // Convert text data to utf-8. dataCharset = cs; - libraryCharset = cs == &my_charset_bin ? &my_charset_bin : &my_charset_utf8mb3_general_ci; - conversionIsNeeded = (cs != &my_charset_bin) && !areSameCharsets(cs, libraryCharset); + libraryCharset = cs == my_charset_bin ? my_charset_bin : my_charset_utf8mb3_general_ci; } struct RegExpParams @@ -99,8 +83,8 @@ struct RegExpParams if (options.conversionIsNeeded) return *this; - expression = csConvert(expression, options.libraryCharset, options.dataCharset); - pattern = csConvert(pattern, options.libraryCharset, options.dataCharset); + expression = options.libraryCharset.convert(expression, options.dataCharset); + pattern = options.libraryCharset.convert(pattern, options.dataCharset); return *this; } }; @@ -329,11 +313,10 @@ std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp, const auto& replaceWithStr = replaceWith.unsafeStringRef(); if (options.conversionIsNeeded) { - const auto& convertedReplaceToken = csConvert(replaceWithStr, options.libraryCharset, options.dataCharset); + const auto& convertedReplaceToken = options.libraryCharset.convert(replaceWithStr, options.dataCharset); return re.replace(param.expression, convertedReplaceToken, "g"); } - return re.replace(param.expression, replaceWithStr, "g"); } @@ -391,8 +374,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b return "0"; size_t offset = vec_soff[0]; - size_t charNumber = - options.libraryCharset->numchars(param.expression.c_str(), param.expression.c_str() + offset); + size_t charNumber = options.libraryCharset.getCharset().numchars(param.expression.c_str(), + param.expression.c_str() + offset); return std::to_string(charNumber + 1); }