diff --git a/dbcon/mysql/ha_mcs_execplan.cpp b/dbcon/mysql/ha_mcs_execplan.cpp index 75e3acbbe..50ab865a8 100644 --- a/dbcon/mysql/ha_mcs_execplan.cpp +++ b/dbcon/mysql/ha_mcs_execplan.cpp @@ -4387,7 +4387,7 @@ ReturnedColumn* buildFunctionColumn(Item_func* ifp, gp_walk_info& gwi, bool& non // A few functions use a different collation than that found in // the base ifp class - if (funcName == "locate" || funcName == "find_in_set" || funcName == "strcmp") + if (funcName == "locate" || funcName == "find_in_set" || funcName == "strcmp" || funcName == "regexp_instr") { DTCollation dt; ifp->Type_std_attributes::agg_arg_charsets_for_comparison(dt, ifp->func_name_cstring(), diff --git a/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test b/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test index 88e6ffd08..67d54f434 100644 --- a/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test +++ b/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test @@ -150,5 +150,33 @@ SELECT t1_TIME, REGEXP_INSTR(t1_TIME, '22$') FROM t1 ORDER BY 1; SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '(59)+', 'KittyCat') FROM t1 ORDER BY 1; SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1; + +CREATE TABLE t2 (hello text) engine columnstore; +INSERT INTO t2 values('こんにちは'); +INSERT INTO t2 values('привет'); +INSERT INTO t2 values('Γεια'); +INSERT INTO t2 values('სალამი'); + +SELECT hello, hello regexp 'ん.ち' from t2; +SELECT hello, hello regexp 'и.е' from t2; +SELECT hello, hello regexp 'ε.α' from t2; +SELECT hello, hello regexp 'ა.ა' from t2; + +SELECT hello, regexp_substr(hello, 'ん.ち') from t2; +SELECT hello, regexp_substr(hello, 'и.е') from t2; +SELECT hello, regexp_substr(hello, 'ε.α') from t2; +SELECT hello, regexp_substr(hello, 'ა.ა') from t2; + +SELECT hello, regexp_instr(hello, 'ん.ち') from t2; +SELECT hello, regexp_instr(hello, 'и.е') from t2; +SELECT hello, regexp_instr(hello, 'ε.α') from t2; +SELECT hello, regexp_instr(hello, 'ა.ა') from t2; + 
+SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2; +SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2; +SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2; +SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2; + + # Clean UP DROP DATABASE mcs228_db; diff --git a/utils/funcexp/func_regexp.cpp b/utils/funcexp/func_regexp.cpp index ad8332371..dca50ae06 100644 --- a/utils/funcexp/func_regexp.cpp +++ b/utils/funcexp/func_regexp.cpp @@ -250,6 +250,37 @@ CalpontSystemCatalog::ColType Func_regexp::operationType(FunctionParm& fp, using jp = jpcre2::select; +struct PCREOptions // PCRE2 compile settings derived from a column's charset by pcreOptions(). +{ + jpcre2::Uint flags = 0; // PCRE2 compile flags passed to jp::Regex. + CHARSET_INFO* library_charset = &my_charset_utf8mb3_general_ci; // Target charset for matching: utf8mb3, or binary for binary data. + bool conversion_is_needed = false; // True when the column charset is not binary and differs from library_charset. +}; + +inline bool areSameCharsets(CHARSET_INFO* cs1, CHARSET_INFO* cs2) // Compares the cs_name.str pointers, not the string contents. +{ + return (cs1->cs_name.str == cs2->cs_name.str); +} + +PCREOptions pcreOptions(execplan::CalpontSystemCatalog::ColType& ct) // Builds PCRE2 options for ct's charset: UTF/UCP unless binary, CASELESS unless the collation is binary- or case-sensitive. +{ + CHARSET_INFO* cs = ct.getCharset(); + PCREOptions options; + + // TODO: take the defaults from the default_regex_flags system variable (default_regex_flags_pcre(_current_thd())) instead of hardcoding them. + + // That variable is empty by default, so enable no optional flags; forcing EXTENDED/UNGREEDY/etc. would change how patterns match. + jpcre2::Uint defaultFlags = 0; + + options.flags = (cs != &my_charset_bin ? (PCRE2_UTF | PCRE2_UCP) : 0) | + ((cs->state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags; + + // Convert text data to utf-8. + options.library_charset = cs == &my_charset_bin ? &my_charset_bin : &my_charset_utf8mb3_general_ci; + options.conversion_is_needed = (cs != &my_charset_bin) && !areSameCharsets(cs, options.library_charset); + return options; +} + /* returns the string subject with all occurrences of the regular expression pattern replaced by the string replace. If no occurrences are found, then subject is returned as is. 
@@ -269,7 +300,9 @@ std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp, if (replace_with.isNull()) return param.expression; - jp::Regex re(param.pattern); + const PCREOptions& options = pcreOptions(ct); + jp::Regex re(param.pattern, options.flags); + return re.replace(param.expression, replace_with.unsafeStringRef(), "g"); } @@ -286,7 +319,8 @@ std::string Func_regexp_substr::getStrVal(rowgroup::Row& row, FunctionParm& fp, if (isNull) return std::string{}; - jp::Regex re(param.pattern); + const PCREOptions& options = pcreOptions(ct); + jp::Regex re(param.pattern, options.flags); jp::RegexMatch rm(&re); jp::VecNum vec_num; @@ -311,7 +345,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b if (isNull) return std::string{}; - jp::Regex re(param.pattern); + const PCREOptions& options = pcreOptions(ct); + jp::Regex re(param.pattern, options.flags); jp::RegexMatch rm(&re); jpcre2::VecOff vec_soff; @@ -320,7 +355,10 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b if (count == 0) return "0"; - return std::to_string(vec_soff[0] + 1); + size_t offset = vec_soff[0]; + size_t charNumber = ct.getCharset()->numchars(param.expression.c_str(), param.expression.c_str() + offset); + + return std::to_string(charNumber + 1); } /* @@ -334,7 +372,8 @@ bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, if (isNull) return false; - jp::Regex re(param.pattern); + const PCREOptions& options = pcreOptions(ct); + jp::Regex re(param.pattern, options.flags); return re.match(param.expression); }