1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

utf-8 support for regexp functions

This commit is contained in:
Leonid Fedorov
2024-03-08 16:56:27 +00:00
parent 33b0fee5cd
commit 4c85b166ca
3 changed files with 73 additions and 6 deletions

View File

@ -4387,7 +4387,7 @@ ReturnedColumn* buildFunctionColumn(Item_func* ifp, gp_walk_info& gwi, bool& non
// A few functions use a different collation than that found in // A few functions use a different collation than that found in
// the base ifp class // the base ifp class
if (funcName == "locate" || funcName == "find_in_set" || funcName == "strcmp") if (funcName == "locate" || funcName == "find_in_set" || funcName == "strcmp" || funcName == "regexp_instr")
{ {
DTCollation dt; DTCollation dt;
ifp->Type_std_attributes::agg_arg_charsets_for_comparison(dt, ifp->func_name_cstring(), ifp->Type_std_attributes::agg_arg_charsets_for_comparison(dt, ifp->func_name_cstring(),

View File

@ -150,5 +150,33 @@ SELECT t1_TIME, REGEXP_INSTR(t1_TIME, '22$') FROM t1 ORDER BY 1;
SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '(59)+', 'KittyCat') FROM t1 ORDER BY 1; SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '(59)+', 'KittyCat') FROM t1 ORDER BY 1;
SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1; SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1;
# Multi-byte (UTF-8) coverage for the regexp functions.
# t2 holds one greeting per script: Japanese, Russian, Greek and Georgian.
CREATE TABLE t2 (hello text) engine columnstore;
INSERT INTO t2 values('こんにちは');
INSERT INTO t2 values('привет');
INSERT INTO t2 values('Γεια');
INSERT INTO t2 values('სალამი');
# Each pattern below is <char>.<char> built from the characters of exactly one
# of the rows above, so '.' has to consume one whole multi-byte character
# (not a single byte) for that row to match.
# REGEXP: boolean match result per row.
SELECT hello, hello regexp 'ん.ち' from t2;
SELECT hello, hello regexp 'и.е' from t2;
SELECT hello, hello regexp 'ε.α' from t2;
SELECT hello, hello regexp 'ა.ა' from t2;
# REGEXP_SUBSTR: the matched substring per row.
SELECT hello, regexp_substr(hello, 'ん.ち') from t2;
SELECT hello, regexp_substr(hello, 'и.е') from t2;
SELECT hello, regexp_substr(hello, 'ε.α') from t2;
SELECT hello, regexp_substr(hello, 'ა.ა') from t2;
# REGEXP_INSTR: position of the match; the preceding text is multi-byte, so
# this distinguishes character positions from byte offsets.
SELECT hello, regexp_instr(hello, 'ん.ち') from t2;
SELECT hello, regexp_instr(hello, 'и.е') from t2;
SELECT hello, regexp_instr(hello, 'ε.α') from t2;
SELECT hello, regexp_instr(hello, 'ა.ა') from t2;
# REGEXP_REPLACE: the replacement text is itself multi-byte (Cyrillic).
SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2;
SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2;
SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2;
SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2;
# Clean UP # Clean UP
DROP DATABASE mcs228_db; DROP DATABASE mcs228_db;

View File

@ -250,6 +250,37 @@ CalpontSystemCatalog::ColType Func_regexp::operationType(FunctionParm& fp,
using jp = jpcre2::select<char>; using jp = jpcre2::select<char>;
/*
  Bundle of settings used to run PCRE2 against data of a given character set.
  A default-constructed value carries no compile flags, targets utf8mb3 and
  requests no conversion.
*/
struct PCREOptions
{
  // PCRE2 compile-time option bits handed to jp::Regex.
  jpcre2::Uint flags{0};
  // Character set the subject/pattern are expected to be in when given to PCRE2.
  CHARSET_INFO* library_charset{&my_charset_utf8mb3_general_ci};
  // True when the data's character set differs from library_charset.
  // NOTE(review): the call sites visible in this change read only `flags`;
  // confirm whether the conversion itself is performed anywhere.
  bool conversion_is_needed{false};
};
/*
  Tells whether two collations belong to the same character set.
  The check is on the identity of the character set name pointer, not on the
  characters of the name.
  NOTE(review): this presumably relies on all collations of one character set
  sharing a single name string -- confirm against the server's charset tables.
*/
inline bool areSameCharsets(CHARSET_INFO* cs1, CHARSET_INFO* cs2)
{
  const char* const firstName = cs1->cs_name.str;
  const char* const secondName = cs2->cs_name.str;
  return firstName == secondName;
}
/*
  Derives the PCRE2 options for data described by the column type `ct`.

  - flags: PCRE2_UTF | PCRE2_UCP for every character set except binary, plus
    PCRE2_CASELESS unless the collation is binary-sorted or case-sensitive
    (MY_CS_BINSORT / MY_CS_CSSORT set in cs->state).
  - library_charset: binary data stays binary, everything else targets utf8mb3.
  - conversion_is_needed: set for non-binary data whose character set is not
    the library character set.
*/
PCREOptions pcreOptions(execplan::CalpontSystemCatalog::ColType& ct)
{
  CHARSET_INFO* cs = ct.getCharset();
  const bool isBinary = (cs == &my_charset_bin);

  PCREOptions options;

  // TODO: use the default_regex_flags system variable instead of a hardcoded
  // value: default_regex_flags_pcre(_current_thd()).
  // Until that is wired in, use no extra flags. Forcing PCRE2_EXTENDED,
  // PCRE2_EXTENDED_MORE, PCRE2_UNGREEDY, PCRE2_DOTALL, PCRE2_MULTILINE and
  // PCRE2_DUPNAMES on every pattern would silently ignore whitespace in
  // patterns, invert quantifier greediness and change '.', '^' and '$'.
  const jpcre2::Uint defaultFlags = 0;

  options.flags = (isBinary ? 0 : (PCRE2_UTF | PCRE2_UCP)) |
                  ((cs->state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags;

  // Text data is meant to reach PCRE2 as utf-8; binary data is passed through.
  options.library_charset = isBinary ? &my_charset_bin : &my_charset_utf8mb3_general_ci;
  options.conversion_is_needed = !isBinary && !areSameCharsets(cs, options.library_charset);

  return options;
}
/* /*
returns the string subject with all occurrences of the regular expression pattern replaced by returns the string subject with all occurrences of the regular expression pattern replaced by
the string replace. If no occurrences are found, then subject is returned as is. the string replace. If no occurrences are found, then subject is returned as is.
@ -269,7 +300,9 @@ std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp,
if (replace_with.isNull()) if (replace_with.isNull())
return param.expression; return param.expression;
jp::Regex re(param.pattern); const PCREOptions& options = pcreOptions(ct);
jp::Regex re(param.pattern, options.flags);
return re.replace(param.expression, replace_with.unsafeStringRef(), "g"); return re.replace(param.expression, replace_with.unsafeStringRef(), "g");
} }
@ -286,7 +319,8 @@ std::string Func_regexp_substr::getStrVal(rowgroup::Row& row, FunctionParm& fp,
if (isNull) if (isNull)
return std::string{}; return std::string{};
jp::Regex re(param.pattern); const PCREOptions& options = pcreOptions(ct);
jp::Regex re(param.pattern, options.flags);
jp::RegexMatch rm(&re); jp::RegexMatch rm(&re);
jp::VecNum vec_num; jp::VecNum vec_num;
@ -311,7 +345,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b
if (isNull) if (isNull)
return std::string{}; return std::string{};
jp::Regex re(param.pattern); const PCREOptions& options = pcreOptions(ct);
jp::Regex re(param.pattern, options.flags);
jp::RegexMatch rm(&re); jp::RegexMatch rm(&re);
jpcre2::VecOff vec_soff; jpcre2::VecOff vec_soff;
@ -320,7 +355,10 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b
if (count == 0) if (count == 0)
return "0"; return "0";
return std::to_string(vec_soff[0] + 1); size_t offset = vec_soff[0];
size_t charNumber = ct.getCharset()->numchars(param.expression.c_str(), param.expression.c_str() + offset);
return std::to_string(charNumber + 1);
} }
/* /*
@ -334,7 +372,8 @@ bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
if (isNull) if (isNull)
return false; return false;
jp::Regex re(param.pattern); const PCREOptions& options = pcreOptions(ct);
jp::Regex re(param.pattern, options.flags);
return re.match(param.expression); return re.match(param.expression);
} }