1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-29 08:21:15 +03:00

charsets support for regexp functions

This commit is contained in:
Leonid Fedorov
2024-03-11 15:09:04 +00:00
parent 4c85b166ca
commit 5104a7e1ba
3 changed files with 153 additions and 11 deletions

View File

@ -537,4 +537,106 @@ t1_TIME REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat')
01:08:59 01:08:59 01:08:59 01:08:59
22:12:02 22:12:02 22:12:02 22:12:02
23:59:59 23:59:59 23:59:59 23:59:59
SET character_set_connection = 'utf8';
CREATE TABLE t2 (hello text) engine columnstore;
INSERT INTO t2 values('こんにちは');
INSERT INTO t2 values('привет');
INSERT INTO t2 values('Γεια');
INSERT INTO t2 values('სალამი');
SELECT hello, hello regexp 'ん.ち' from t2;
hello hello regexp 'ん.ち'
こんにちは 0
привет 0
Γεια 0
სალამი 0
SELECT hello, hello regexp 'и.е' from t2;
hello hello regexp 'и.е'
こんにちは 0
привет 0
Γεια 0
სალამი 0
SELECT hello, hello regexp 'ε.α' from t2;
hello hello regexp 'ε.α'
こんにちは 0
привет 0
Γεια 0
სალამი 0
SELECT hello, hello regexp 'ა.ა' from t2;
hello hello regexp 'ა.ა'
こんにちは 0
привет 0
Γεια 0
სალამი 0
SELECT hello, regexp_substr(hello, 'ん.ち') from t2;
hello regexp_substr(hello, 'ん.ち')
こんにちは
привет
Γεια
სალამი
SELECT hello, regexp_substr(hello, 'и.е') from t2;
hello regexp_substr(hello, 'и.е')
こんにちは
привет
Γεια
სალამი
SELECT hello, regexp_substr(hello, 'ε.α') from t2;
hello regexp_substr(hello, 'ε.α')
こんにちは
привет
Γεια
სალამი
SELECT hello, regexp_substr(hello, 'ა.ა') from t2;
hello regexp_substr(hello, 'ა.ა')
こんにちは
привет
Γεια
სალამი
SELECT hello, regexp_instr(hello, 'ん.ち') from t2;
hello regexp_instr(hello, 'ん.ち')
こんにちは 0
привет 0
Γεια 0
სალამი 0
SELECT hello, regexp_instr(hello, 'и.е') from t2;
hello regexp_instr(hello, 'и.е')
こんにちは 0
привет 0
Γεια 0
სალამი 0
SELECT hello, regexp_instr(hello, 'ε.α') from t2;
hello regexp_instr(hello, 'ε.α')
こんにちは 0
привет 0
Γεια 0
სალამი 0
SELECT hello, regexp_instr(hello, 'ა.ა') from t2;
hello regexp_instr(hello, 'ა.ა')
こんにちは 0
привет 0
Γεια 0
სალამი 0
SELECT hello, regexp_replace(hello, 'ん.ち', 'Достоевский') from t2;
hello regexp_replace(hello, 'ん.ち', 'Достоевский')
こんにちは こんにちは
привет привет
Γεια Γεια
სალამი სალამი
SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2;
hello regexp_replace(hello, 'и.е', 'Достоевский')
こんにちは こんにちは
привет привет
Γεια Γεια
სალამი სალამი
SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2;
hello regexp_replace(hello, 'ε.α', 'Достоевский')
こんにちは こんにちは
привет привет
Γεια Γεια
სალამი სალამი
SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2;
hello regexp_replace(hello, 'ა.ა', 'Достоевский')
こんにちは こんにちは
привет привет
Γεια Γεια
სალამი სალამი
DROP DATABASE mcs228_db; DROP DATABASE mcs228_db;

View File

@ -150,6 +150,8 @@ SELECT t1_TIME, REGEXP_INSTR(t1_TIME, '22$') FROM t1 ORDER BY 1;
SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '(59)+', 'KittyCat') FROM t1 ORDER BY 1; SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '(59)+', 'KittyCat') FROM t1 ORDER BY 1;
SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1; SELECT t1_TIME, REGEXP_REPLACE(t1_TIME, '22$', 'KittyCat') FROM t1 ORDER BY 1;
SET character_set_connection = 'utf8';
CREATE TABLE t2 (hello text) engine columnstore; CREATE TABLE t2 (hello text) engine columnstore;
INSERT INTO t2 values('こんにちは'); INSERT INTO t2 values('こんにちは');
@ -177,6 +179,5 @@ SELECT hello, regexp_replace(hello, 'и.е', 'Достоевский') from t2;
SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2; SELECT hello, regexp_replace(hello, 'ε.α', 'Достоевский') from t2;
SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2; SELECT hello, regexp_replace(hello, 'ა.ა', 'Достоевский') from t2;
# Clean UP # Clean UP
DROP DATABASE mcs228_db; DROP DATABASE mcs228_db;

View File

@ -253,8 +253,9 @@ using jp = jpcre2::select<char>;
struct PCREOptions struct PCREOptions
{ {
jpcre2::Uint flags = 0; jpcre2::Uint flags = 0;
CHARSET_INFO* library_charset = &my_charset_utf8mb3_general_ci; CHARSET_INFO* dataCharset = &my_charset_utf8mb3_general_ci;
bool conversion_is_needed = false; CHARSET_INFO* libraryCharset = &my_charset_utf8mb3_general_ci;
bool conversionIsNeeded = false;
}; };
inline bool areSameCharsets(CHARSET_INFO* cs1, CHARSET_INFO* cs2) inline bool areSameCharsets(CHARSET_INFO* cs1, CHARSET_INFO* cs2)
@ -268,19 +269,41 @@ PCREOptions pcreOptions(execplan::CalpontSystemCatalog::ColType& ct)
PCREOptions options; PCREOptions options;
// TODO use system variable instead if hardcode default_regex_flags_pcre(_current_thd()); // TODO use system variable instead if hardcode default_regex_flags_pcre(_current_thd());
// PCRE2_DOTALL | PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_EXTENDED_MORE | PCRE2_MULTILINE | PCRE2_UNGREEDY;
jpcre2::Uint defaultFlags = jpcre2::Uint defaultFlags = 0;
PCRE2_DOTALL | PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_EXTENDED_MORE | PCRE2_MULTILINE | PCRE2_UNGREEDY;
options.flags = (cs != &my_charset_bin ? (PCRE2_UTF | PCRE2_UCP) : 0) | options.flags = (cs != &my_charset_bin ? (PCRE2_UTF | PCRE2_UCP) : 0) |
((cs->state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags; ((cs->state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags;
// Convert text data to utf-8. // Convert text data to utf-8.
options.library_charset = cs == &my_charset_bin ? &my_charset_bin : &my_charset_utf8mb3_general_ci; options.dataCharset = cs;
options.conversion_is_needed = (cs != &my_charset_bin) && !areSameCharsets(cs, options.library_charset); options.libraryCharset = cs == &my_charset_bin ? &my_charset_bin : &my_charset_utf8mb3_general_ci;
options.conversionIsNeeded = (cs != &my_charset_bin) && !areSameCharsets(cs, options.libraryCharset);
return options; return options;
} }
// Converts `from`, encoded in the `from_cs` character set, into `to_cs` and returns the
// converted bytes. The error count reported by my_convert() is collected but deliberately
// ignored, as in the original implementation.
std::string csConvert(const std::string& from, CHARSET_INFO* to_cs, CHARSET_INFO* from_cs)
{
  std::string result;
  if (from.empty())
    return result;  // nothing to convert; avoids touching a zero-sized buffer

  // Output buffer budget: to_cs->mbmaxlen bytes for every input byte.
  result.resize(from.size() * to_cs->mbmaxlen);

  uint dummy_errors = 0;
  // Write through &result[0] instead of const_cast<char*>(c_str()): modifying the array
  // returned by c_str() is not allowed, while operator[] gives mutable contiguous storage.
  const size_t resultingSize =
      my_convert(&result[0], result.size(), to_cs, from.data(), from.size(), from_cs, &dummy_errors);

  // Shrink to the number of bytes actually produced by the conversion.
  result.resize(resultingSize);
  return result;
}
// Re-encodes both the subject expression and the pattern held in `param` from
// options.dataCharset to options.libraryCharset. Leaves `param` untouched when
// options.conversionIsNeeded is false.
void regexpParamCSfix(const PCREOptions options, RegExpParams& param)
{
  if (options.conversionIsNeeded)
  {
    auto* const targetCs = options.libraryCharset;
    auto* const sourceCs = options.dataCharset;

    param.expression = csConvert(param.expression, targetCs, sourceCs);
    param.pattern = csConvert(param.pattern, targetCs, sourceCs);
  }
}
/* /*
returns the string subject with all occurrences of the regular expression pattern replaced by returns the string subject with all occurrences of the regular expression pattern replaced by
the string replace. If no occurrences are found, then subject is returned as is. the string replace. If no occurrences are found, then subject is returned as is.
@ -295,15 +318,24 @@ std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp,
if (isNull) if (isNull)
return std::string{}; return std::string{};
const auto& replace_with = fp[2]->data()->getStrVal(row, isNull); const auto& replaceWith = fp[2]->data()->getStrVal(row, isNull);
if (replace_with.isNull()) if (replaceWith.isNull())
return param.expression; return param.expression;
const PCREOptions& options = pcreOptions(ct); const PCREOptions& options = pcreOptions(ct);
auto replaceWithStr = replaceWith.unsafeStringRef();
if (options.conversionIsNeeded)
{
replaceWithStr = csConvert(replaceWithStr, options.libraryCharset, options.dataCharset);
}
regexpParamCSfix(options, param);
jp::Regex re(param.pattern, options.flags); jp::Regex re(param.pattern, options.flags);
return re.replace(param.expression, replace_with.unsafeStringRef(), "g"); return re.replace(param.expression, replaceWithStr, "g");
} }
/* /*
@ -320,6 +352,8 @@ std::string Func_regexp_substr::getStrVal(rowgroup::Row& row, FunctionParm& fp,
return std::string{}; return std::string{};
const PCREOptions& options = pcreOptions(ct); const PCREOptions& options = pcreOptions(ct);
regexpParamCSfix(options, param);
jp::Regex re(param.pattern, options.flags); jp::Regex re(param.pattern, options.flags);
jp::RegexMatch rm(&re); jp::RegexMatch rm(&re);
jp::VecNum vec_num; jp::VecNum vec_num;
@ -346,6 +380,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b
return std::string{}; return std::string{};
const PCREOptions& options = pcreOptions(ct); const PCREOptions& options = pcreOptions(ct);
regexpParamCSfix(options, param);
jp::Regex re(param.pattern, options.flags); jp::Regex re(param.pattern, options.flags);
jp::RegexMatch rm(&re); jp::RegexMatch rm(&re);
jpcre2::VecOff vec_soff; jpcre2::VecOff vec_soff;
@ -356,7 +392,8 @@ std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, b
return "0"; return "0";
size_t offset = vec_soff[0]; size_t offset = vec_soff[0];
size_t charNumber = ct.getCharset()->numchars(param.expression.c_str(), param.expression.c_str() + offset); size_t charNumber =
options.libraryCharset->numchars(param.expression.c_str(), param.expression.c_str() + offset);
return std::to_string(charNumber + 1); return std::to_string(charNumber + 1);
} }
@ -373,6 +410,8 @@ bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
return false; return false;
const PCREOptions& options = pcreOptions(ct); const PCREOptions& options = pcreOptions(ct);
regexpParamCSfix(options, param);
jp::Regex re(param.pattern, options.flags); jp::Regex re(param.pattern, options.flags);
return re.match(param.expression); return re.match(param.expression);
} }