1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-29 08:21:15 +03:00

utf-8 support for regexp functions

This commit is contained in:
Leonid Fedorov
2024-03-08 16:56:27 +00:00
parent 33b0fee5cd
commit 4c85b166ca
3 changed files with 73 additions and 6 deletions

View File

@ -4387,7 +4387,7 @@ ReturnedColumn* buildFunctionColumn(Item_func* ifp, gp_walk_info& gwi, bool& non
// A few functions use a different collation than that found in
// the base ifp class
if (funcName == "locate" || funcName == "find_in_set" || funcName == "strcmp")
if (funcName == "locate" || funcName == "find_in_set" || funcName == "strcmp" || funcName == "regexp_instr")
{
DTCollation dt;
ifp->Type_std_attributes::agg_arg_charsets_for_comparison(dt, ifp->func_name_cstring(),

View File

@ -150,5 +150,33 @@ SELECT t1_TIME, REGEXP_INSTR(t1_TIME, '22$') FROM t1 ORDER BY 1;
SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '(59)+', 'KittyCat') FROM t1 ORDER BY 1;
SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1;
CREATE TABLE t2 (hello text) engine columnstore;
INSERT INTO t2 values('こんにちは');
INSERT INTO t2 values('привет');
INSERT INTO t2 values('Γεια');
INSERT INTO t2 values('სალამი');
SELECT hello, hello regexp 'ん.ち' from t2;
SELECT hello, hello regexp 'и.е' from t2;
SELECT hello, hello regexp 'ε.α' from t2;
SELECT hello, hello regexp 'ა.ა' from t2;
SELECT hello, regexp_substr(hello, 'ん.ち') from t2;
SELECT hello, regexp_substr(hello, 'и.е') from t2;
SELECT hello, regexp_substr(hello, 'ε.α') from t2;
SELECT hello, regexp_substr(hello, 'ა.ა') from t2;
SELECT hello, regexp_instr(hello, 'ん.ち') from t2;
SELECT hello, regexp_instr(hello, 'и.е') from t2;
SELECT hello, regexp_instr(hello, 'ε.α') from t2;
SELECT hello, regexp_instr(hello, 'ა.ა') from t2;
SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2;
SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2;
SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2;
SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2;
# Clean UP
DROP DATABASE mcs228_db;

View File

@ -250,6 +250,37 @@ CalpontSystemCatalog::ColType Func_regexp::operationType(FunctionParm& fp,
using jp = jpcre2::select<char>;
// Bundle of settings derived from a column's charset for running PCRE2
// (through jpcre2) on that column's data. Filled in by pcreOptions().
struct PCREOptions
{
  // PCRE2 compile option bits handed to the jp::Regex constructor.
  jpcre2::Uint flags{0};

  // Charset the regex library is expected to work in; defaults to utf8mb3.
  CHARSET_INFO* library_charset{&my_charset_utf8mb3_general_ci};

  // Whether the data charset differs from library_charset, i.e. whether the
  // text would have to be transcoded before matching.
  bool conversion_is_needed{false};
};
// Returns true when both charsets refer to the same charset name.
// The check is an identity comparison of the cs_name.str pointers, not a
// character-by-character comparison of the names.
inline bool areSameCharsets(CHARSET_INFO* cs1, CHARSET_INFO* cs2)
{
  const auto* firstName = cs1->cs_name.str;
  const auto* secondName = cs2->cs_name.str;
  return firstName == secondName;
}
// Builds the PCRE2 settings for a column based on its charset.
//
// @param ct  column type whose charset (ct.getCharset()) drives the options
// @return    PCREOptions with:
//            - flags: the hard-coded default set, plus PCRE2_UTF | PCRE2_UCP
//              for any non-binary charset, plus PCRE2_CASELESS unless the
//              charset state has MY_CS_BINSORT or MY_CS_CSSORT set;
//            - library_charset: my_charset_bin for binary data, utf8mb3
//              otherwise;
//            - conversion_is_needed: true for a non-binary charset that is
//              not the same charset as library_charset.
PCREOptions pcreOptions(execplan::CalpontSystemCatalog::ColType& ct)
{
  CHARSET_INFO* const dataCharset = ct.getCharset();
  const bool isBinary = (dataCharset == &my_charset_bin);
  const bool isCaseSensitive = (dataCharset->state & (MY_CS_BINSORT | MY_CS_CSSORT)) != 0;

  // TODO: use the system variable instead of hard-coding the defaults:
  // default_regex_flags_pcre(_current_thd());
  // NOTE(review): every optional flag is enabled here unconditionally
  // (including PCRE2_EXTENDED and PCRE2_UNGREEDY) - confirm this is the
  // intended default.
  jpcre2::Uint regexFlags =
      PCRE2_DOTALL | PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_EXTENDED_MORE | PCRE2_MULTILINE | PCRE2_UNGREEDY;

  // Text (non-binary) data is matched as Unicode.
  if (!isBinary)
  {
    regexFlags |= (PCRE2_UTF | PCRE2_UCP);
  }

  // Collations that are neither binary-sorted nor case-sensitive match
  // case-insensitively.
  if (!isCaseSensitive)
  {
    regexFlags |= PCRE2_CASELESS;
  }

  PCREOptions result;
  result.flags = regexFlags;

  // Text data is meant to be handled as utf-8; binary data stays binary.
  result.library_charset = isBinary ? &my_charset_bin : &my_charset_utf8mb3_general_ci;
  result.conversion_is_needed = !isBinary && !areSameCharsets(dataCharset, result.library_charset);

  return result;
}
/*
returns the string subject with all occurrences of the regular expression pattern replaced by
the string replace. If no occurrences are found, then subject is returned as is.
@ -269,7 +300,9 @@ std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp,
if (replace_with.isNull())
return param.expression;
jp::Regex re(param.pattern);
const PCREOptions& options = pcreOptions(ct);
jp::Regex re(param.pattern, options.flags);
return re.replace(param.expression, replace_with.unsafeStringRef(), "g");
}
@ -286,7 +319,8 @@ std::string Func_regexp_substr::getStrVal(rowgroup::Row& row, FunctionParm& fp,
if (isNull)
return std::string{};
jp::Regex re(param.pattern);
const PCREOptions& options = pcreOptions(ct);
jp::Regex re(param.pattern, options.flags);
jp::RegexMatch rm(&re);
jp::VecNum vec_num;
@ -311,7 +345,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b
if (isNull)
return std::string{};
jp::Regex re(param.pattern);
const PCREOptions& options = pcreOptions(ct);
jp::Regex re(param.pattern, options.flags);
jp::RegexMatch rm(&re);
jpcre2::VecOff vec_soff;
@ -320,7 +355,10 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b
if (count == 0)
return "0";
return std::to_string(vec_soff[0] + 1);
size_t offset = vec_soff[0];
size_t charNumber = ct.getCharset()->numchars(param.expression.c_str(), param.expression.c_str() + offset);
return std::to_string(charNumber + 1);
}
/*
@ -334,7 +372,8 @@ bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
if (isNull)
return false;
jp::Regex re(param.pattern);
const PCREOptions& options = pcreOptions(ct);
jp::Regex re(param.pattern, options.flags);
return re.match(param.expression);
}