From 5104a7e1ba9e8a3e39bf5f0869ba694285220444 Mon Sep 17 00:00:00 2001 From: Leonid Fedorov Date: Mon, 11 Mar 2024 15:09:04 +0000 Subject: [PATCH] charsets support for regexp functions --- .../basic/r/mcs228_regexp_operator.result | 102 ++++++++++++++++++ .../basic/t/mcs228_regexp_operator.test | 3 +- utils/funcexp/func_regexp.cpp | 59 ++++++++-- 3 files changed, 153 insertions(+), 11 deletions(-) diff --git a/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result b/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result index f3e5a7d78..959fa64b5 100644 --- a/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result +++ b/mysql-test/columnstore/basic/r/mcs228_regexp_operator.result @@ -537,4 +537,106 @@ t1_TIME REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') 01:08:59 01:08:59 22:12:02 22:12:02 23:59:59 23:59:59 +SET character_set_connection = 'utf8'; +CREATE TABLE t2 (hello text) engine columnstore; +INSERT INTO t2 values('こんにちは'); +INSERT INTO t2 values('привет'); +INSERT INTO t2 values('Γεια'); +INSERT INTO t2 values('სალამი'); +SELECT hello, hello regexp 'ん.ち' from t2; +hello hello regexp 'ん.ち' +こんにちは 0 +привет 0 +Γεια 0 +სალამი 0 +SELECT hello, hello regexp 'и.е' from t2; +hello hello regexp 'и.е' +こんにちは 0 +привет 0 +Γεια 0 +სალამი 0 +SELECT hello, hello regexp 'ε.α' from t2; +hello hello regexp 'ε.α' +こんにちは 0 +привет 0 +Γεια 0 +სალამი 0 +SELECT hello, hello regexp 'ა.ა' from t2; +hello hello regexp 'ა.ა' +こんにちは 0 +привет 0 +Γεια 0 +სალამი 0 +SELECT hello, regexp_substr(hello, 'ん.ち') from t2; +hello regexp_substr(hello, 'ん.ち') +こんにちは +привет +Γεια +სალამი +SELECT hello, regexp_substr(hello, 'и.е') from t2; +hello regexp_substr(hello, 'и.е') +こんにちは +привет +Γεια +სალამი +SELECT hello, regexp_substr(hello, 'ε.α') from t2; +hello regexp_substr(hello, 'ε.α') +こんにちは +привет +Γεια +სალამი +SELECT hello, regexp_substr(hello, 'ა.ა') from t2; +hello regexp_substr(hello, 'ა.ა') +こんにちは +привет +Γεια +სალამი +SELECT hello, regexp_instr(hello, 'ん.ち') from t2; 
+hello regexp_instr(hello, 'ん.ち') +こんにちは 0 +привет 0 +Γεια 0 +სალამი 0 +SELECT hello, regexp_instr(hello, 'и.е') from t2; +hello regexp_instr(hello, 'и.е') +こんにちは 0 +привет 0 +Γεια 0 +სალამი 0 +SELECT hello, regexp_instr(hello, 'ε.α') from t2; +hello regexp_instr(hello, 'ε.α') +こんにちは 0 +привет 0 +Γεια 0 +სალამი 0 +SELECT hello, regexp_instr(hello, 'ა.ა') from t2; +hello regexp_instr(hello, 'ა.ა') +こんにちは 0 +привет 0 +Γεια 0 +სალამი 0 +SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2; +hello regexp_replace(hello, 'ん.ち', 'Достоевский') +こんにちは こんにちは +привет привет +Γεια Γεια +სალამი სალამი +SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2; +hello regexp_replace(hello, 'и.е', 'Достоевский') +こんにちは こんにちは +привет привет +Γεια Γεια +სალამი სალამი +SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2; +hello regexp_replace(hello, 'ε.α', 'Достоевский') +こんにちは こんにちは +привет привет +Γεια Γεια +სალამი სალამი +SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2; +hello regexp_replace(hello, 'ა.ა', 'Достоевский') +こんにちは こんにちは +привет привет +Γεια Γεια +სალამი სალამი DROP DATABASE mcs228_db; diff --git a/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test b/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test index 67d54f434..4ba2a6e65 100644 --- a/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test +++ b/mysql-test/columnstore/basic/t/mcs228_regexp_operator.test @@ -150,6 +150,8 @@ SELECT t1_TIME, REGEXP_INSTR(t1_TIME, '22$') FROM t1 ORDER BY 1; SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '(59)+', 'KittyCat') FROM t1 ORDER BY 1; SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1; +SET character_set_connection = 'utf8'; + CREATE TABLE t2 (hello text) engine columnstore; INSERT INTO t2 values('こんにちは'); @@ -177,6 +179,5 @@ SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2; SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2; SELECT hello, 
regexp_replace(hello, 'ა.ა', 'Достоевский') from t2; - # Clean UP DROP DATABASE mcs228_db; diff --git a/utils/funcexp/func_regexp.cpp b/utils/funcexp/func_regexp.cpp index dca50ae06..7f53d41c2 100644 --- a/utils/funcexp/func_regexp.cpp +++ b/utils/funcexp/func_regexp.cpp @@ -253,8 +253,9 @@ using jp = jpcre2::select<char>; struct PCREOptions { jpcre2::Uint flags = 0; - CHARSET_INFO* library_charset = &my_charset_utf8mb3_general_ci; - bool conversion_is_needed = false; + CHARSET_INFO* dataCharset = &my_charset_utf8mb3_general_ci; + CHARSET_INFO* libraryCharset = &my_charset_utf8mb3_general_ci; + bool conversionIsNeeded = false; }; inline bool areSameCharsets(CHARSET_INFO* cs1, CHARSET_INFO* cs2) @@ -268,19 +269,41 @@ PCREOptions pcreOptions(execplan::CalpontSystemCatalog::ColType& ct) PCREOptions options; // TODO use system variable instead if hardcode default_regex_flags_pcre(_current_thd()); + // PCRE2_DOTALL | PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_EXTENDED_MORE | PCRE2_MULTILINE | PCRE2_UNGREEDY; - jpcre2::Uint defaultFlags = - PCRE2_DOTALL | PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_EXTENDED_MORE | PCRE2_MULTILINE | PCRE2_UNGREEDY; + jpcre2::Uint defaultFlags = 0; options.flags = (cs != &my_charset_bin ? (PCRE2_UTF | PCRE2_UCP) : 0) | ((cs->state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags; // Convert text data to utf-8. - options.library_charset = cs == &my_charset_bin ? &my_charset_bin : &my_charset_utf8mb3_general_ci; - options.conversion_is_needed = (cs != &my_charset_bin) && !areSameCharsets(cs, options.library_charset); + options.dataCharset = cs; + options.libraryCharset = cs == &my_charset_bin ? 
&my_charset_bin : &my_charset_utf8mb3_general_ci; + options.conversionIsNeeded = (cs != &my_charset_bin) && !areSameCharsets(cs, options.libraryCharset); + return options; } +std::string csConvert(const std::string& from, CHARSET_INFO* to_cs, CHARSET_INFO* from_cs) +{ + std::string result; + uint dummy_errors; + result.resize(from.size() * to_cs->mbmaxlen); + size_t resultingSize = my_convert(const_cast<char*>(result.c_str()), result.size(), to_cs, from.c_str(), + from.size(), from_cs, &dummy_errors); + result.resize(resultingSize); + return result; +} + +void regexpParamCSfix(const PCREOptions options, RegExpParams& param) +{ + if (!options.conversionIsNeeded) + return; + + param.expression = csConvert(param.expression, options.libraryCharset, options.dataCharset); + param.pattern = csConvert(param.pattern, options.libraryCharset, options.dataCharset); +} + /* returns the string subject with all occurrences of the regular expression pattern replaced by the string replace. If no occurrences are found, then subject is returned as is. 
@@ -295,15 +318,24 @@ std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp, if (isNull) return std::string{}; - const auto& replace_with = fp[2]->data()->getStrVal(row, isNull); + const auto& replaceWith = fp[2]->data()->getStrVal(row, isNull); - if (replace_with.isNull()) + if (replaceWith.isNull()) return param.expression; const PCREOptions& options = pcreOptions(ct); + + auto replaceWithStr = replaceWith.unsafeStringRef(); + if (options.conversionIsNeeded) + { + replaceWithStr = csConvert(replaceWithStr, options.libraryCharset, options.dataCharset); + } + + regexpParamCSfix(options, param); + jp::Regex re(param.pattern, options.flags); - return re.replace(param.expression, replace_with.unsafeStringRef(), "g"); + return re.replace(param.expression, replaceWithStr, "g"); } /* @@ -320,6 +352,8 @@ std::string Func_regexp_substr::getStrVal(rowgroup::Row& row, FunctionParm& fp, return std::string{}; const PCREOptions& options = pcreOptions(ct); + regexpParamCSfix(options, param); + jp::Regex re(param.pattern, options.flags); jp::RegexMatch rm(&re); jp::VecNum vec_num; @@ -346,6 +380,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b return std::string{}; const PCREOptions& options = pcreOptions(ct); + regexpParamCSfix(options, param); + jp::Regex re(param.pattern, options.flags); jp::RegexMatch rm(&re); jpcre2::VecOff vec_soff; @@ -356,7 +392,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b return "0"; size_t offset = vec_soff[0]; - size_t charNumber = ct.getCharset()->numchars(param.expression.c_str(), param.expression.c_str() + offset); + size_t charNumber = + options.libraryCharset->numchars(param.expression.c_str(), param.expression.c_str() + offset); return std::to_string(charNumber + 1); } @@ -373,6 +410,8 @@ bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull, return false; const PCREOptions& options = pcreOptions(ct); + 
regexpParamCSfix(options, param); + jp::Regex re(param.pattern, options.flags); return re.match(param.expression); }