You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-05 15:41:14 +03:00
MCOL-5328: use PCRE2 and JPCRE wrapper (#3137)
PCRE2 for regexp functions in columnstore
This commit is contained in:
@ -25,14 +25,10 @@
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
#ifdef __linux__
|
||||
#include <regex.h>
|
||||
#else
|
||||
#include <regex>
|
||||
using namespace boost;
|
||||
#endif
|
||||
#include "utils/pcre2/jpcre2.hpp"
|
||||
|
||||
#include "functor_bool.h"
|
||||
#include "functor_str.h"
|
||||
#include "functioncolumn.h"
|
||||
#include "predicateoperator.h"
|
||||
#include "constantcolumn.h"
|
||||
@ -43,12 +39,58 @@ using namespace execplan;
|
||||
#include "errorcodes.h"
|
||||
#include "idberrorinfo.h"
|
||||
#include "errorids.h"
|
||||
|
||||
using namespace logging;
|
||||
|
||||
namespace
|
||||
{
|
||||
inline bool getBool(rowgroup::Row& row, funcexp::FunctionParm& pm, bool& isNull,
|
||||
CalpontSystemCatalog::ColType& ct, long timeZone)
|
||||
|
||||
using jp = jpcre2::select<char>;
|
||||
|
||||
struct PCREOptions
|
||||
{
|
||||
PCREOptions(execplan::CalpontSystemCatalog::ColType& ct);
|
||||
|
||||
datatypes::Charset dataCharset = my_charset_utf8mb3_general_ci;
|
||||
datatypes::Charset libraryCharset = my_charset_utf8mb3_general_ci;
|
||||
jpcre2::Uint flags = 0;
|
||||
bool conversionIsNeeded = false;
|
||||
};
|
||||
|
||||
PCREOptions::PCREOptions(execplan::CalpontSystemCatalog::ColType& ct)
|
||||
{
|
||||
datatypes::Charset cs = ct.getCharset();
|
||||
|
||||
// TODO use system variable instead if hardcode default_regex_flags_pcre(_current_thd());
|
||||
// PCRE2_DOTALL | PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_EXTENDED_MORE | PCRE2_MULTILINE | PCRE2_UNGREEDY;
|
||||
|
||||
jpcre2::Uint defaultFlags = 0;
|
||||
|
||||
flags = (cs != &my_charset_bin ? (PCRE2_UTF | PCRE2_UCP) : 0) |
|
||||
((cs.getCharset().state & (MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE2_CASELESS) | defaultFlags;
|
||||
|
||||
// Convert text data to utf-8.
|
||||
dataCharset = cs;
|
||||
libraryCharset = cs == my_charset_bin ? my_charset_bin : my_charset_utf8mb3_general_ci;
|
||||
}
|
||||
|
||||
struct RegExpParams
|
||||
{
|
||||
std::string expression;
|
||||
std::string pattern;
|
||||
RegExpParams& CharsetFix(const PCREOptions options)
|
||||
{
|
||||
if (options.conversionIsNeeded)
|
||||
return *this;
|
||||
|
||||
expression = options.libraryCharset.convert(expression, options.dataCharset);
|
||||
pattern = options.libraryCharset.convert(pattern, options.dataCharset);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
inline RegExpParams getEpressionAndPattern(rowgroup::Row& row, funcexp::FunctionParm& pm, bool& isNull,
|
||||
CalpontSystemCatalog::ColType& ct, long timeZone)
|
||||
{
|
||||
string expr;
|
||||
string pattern;
|
||||
@ -212,39 +254,148 @@ inline bool getBool(rowgroup::Row& row, funcexp::FunctionParm& pm, bool& isNull,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __linux__
|
||||
regex_t re;
|
||||
|
||||
regcomp(&re, pattern.c_str(), REG_EXTENDED | REG_NOSUB);
|
||||
|
||||
int res = regexec(&re, expr.c_str(), 0, NULL, 0);
|
||||
regfree(&re);
|
||||
|
||||
if (res == 0)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
|
||||
#else
|
||||
std::regex pat(pattern.c_str());
|
||||
return std::regex_search(expr.c_str(), pat);
|
||||
#endif
|
||||
return RegExpParams{expr, pattern};
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace funcexp
|
||||
{
|
||||
CalpontSystemCatalog::ColType Func_regexp_replace::operationType(FunctionParm& fp,
|
||||
CalpontSystemCatalog::ColType& resultType)
|
||||
{
|
||||
// operation type is not used by this functor
|
||||
return fp[0]->data()->resultType();
|
||||
}
|
||||
|
||||
CalpontSystemCatalog::ColType Func_regexp_substr::operationType(FunctionParm& fp,
|
||||
CalpontSystemCatalog::ColType& resultType)
|
||||
{
|
||||
// operation type is not used by this functor
|
||||
return fp[0]->data()->resultType();
|
||||
}
|
||||
|
||||
CalpontSystemCatalog::ColType Func_regexp_instr::operationType(FunctionParm& fp,
|
||||
CalpontSystemCatalog::ColType& resultType)
|
||||
{
|
||||
// operation type is not used by this functor
|
||||
return fp[0]->data()->resultType();
|
||||
}
|
||||
|
||||
CalpontSystemCatalog::ColType Func_regexp::operationType(FunctionParm& fp,
|
||||
CalpontSystemCatalog::ColType& resultType)
|
||||
{
|
||||
return resultType;
|
||||
}
|
||||
|
||||
bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& pm, bool& isNull,
|
||||
/*
|
||||
returns the string subject with all occurrences of the regular expression pattern replaced by
|
||||
the string replace. If no occurrences are found, then subject is returned as is.
|
||||
https://mariadb.com/kb/en/regexp_replace/
|
||||
*/
|
||||
std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
|
||||
execplan::CalpontSystemCatalog::ColType& ct)
|
||||
|
||||
{
|
||||
RegExpParams param = getEpressionAndPattern(row, fp, isNull, ct, ct.getTimeZone());
|
||||
|
||||
if (isNull)
|
||||
return std::string{};
|
||||
|
||||
const auto& replaceWith = fp[2]->data()->getStrVal(row, isNull);
|
||||
|
||||
if (replaceWith.isNull())
|
||||
return param.expression;
|
||||
|
||||
PCREOptions options(ct);
|
||||
param.CharsetFix(options);
|
||||
jp::Regex re(param.pattern, options.flags);
|
||||
|
||||
const auto& replaceWithStr = replaceWith.unsafeStringRef();
|
||||
if (options.conversionIsNeeded)
|
||||
{
|
||||
const auto& convertedReplaceToken = options.libraryCharset.convert(replaceWithStr, options.dataCharset);
|
||||
return re.replace(param.expression, convertedReplaceToken, "g");
|
||||
}
|
||||
|
||||
return re.replace(param.expression, replaceWithStr, "g");
|
||||
}
|
||||
|
||||
/*
|
||||
Returns the part of the string subject that matches the regular expression pattern, or an empty string if
|
||||
pattern was not found. https://mariadb.com/kb/en/regexp_substr/
|
||||
*/
|
||||
std::string Func_regexp_substr::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
|
||||
execplan::CalpontSystemCatalog::ColType& ct)
|
||||
|
||||
{
|
||||
RegExpParams param = getEpressionAndPattern(row, fp, isNull, ct, ct.getTimeZone());
|
||||
|
||||
if (isNull)
|
||||
return std::string{};
|
||||
|
||||
PCREOptions options(ct);
|
||||
param.CharsetFix(options);
|
||||
|
||||
jp::Regex re(param.pattern, options.flags);
|
||||
jp::RegexMatch rm(&re);
|
||||
jp::VecNum vec_num;
|
||||
|
||||
size_t count = rm.setSubject(param.expression).setNumberedSubstringVector(&vec_num).match();
|
||||
|
||||
if (count == 0)
|
||||
return std::string{};
|
||||
|
||||
return vec_num[0][0];
|
||||
}
|
||||
|
||||
/*
|
||||
Returns the position of the first occurrence of the regular expression pattern in the string subject, or 0
|
||||
if pattern was not found. https://mariadb.com/kb/en/regexp_instr/
|
||||
*/
|
||||
std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
|
||||
execplan::CalpontSystemCatalog::ColType& ct)
|
||||
|
||||
{
|
||||
RegExpParams param = getEpressionAndPattern(row, fp, isNull, ct, ct.getTimeZone());
|
||||
|
||||
if (isNull)
|
||||
return std::string{};
|
||||
|
||||
PCREOptions options(ct);
|
||||
param.CharsetFix(options);
|
||||
|
||||
jp::Regex re(param.pattern, options.flags);
|
||||
jp::RegexMatch rm(&re);
|
||||
jpcre2::VecOff vec_soff;
|
||||
|
||||
size_t count = rm.setSubject(param.expression).setMatchStartOffsetVector(&vec_soff).match();
|
||||
|
||||
if (count == 0)
|
||||
return "0";
|
||||
|
||||
size_t offset = vec_soff[0];
|
||||
size_t charNumber = options.libraryCharset.getCharset().numchars(param.expression.c_str(),
|
||||
param.expression.c_str() + offset);
|
||||
|
||||
return std::to_string(charNumber + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
https://mariadb.com/kb/en/regexp/
|
||||
*/
|
||||
bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
|
||||
CalpontSystemCatalog::ColType& ct)
|
||||
{
|
||||
return getBool(row, pm, isNull, ct, ct.getTimeZone()) && !isNull;
|
||||
RegExpParams param = getEpressionAndPattern(row, fp, isNull, ct, ct.getTimeZone());
|
||||
|
||||
if (isNull)
|
||||
return false;
|
||||
|
||||
PCREOptions options(ct);
|
||||
param.CharsetFix(options);
|
||||
|
||||
jp::Regex re(param.pattern, options.flags);
|
||||
return re.match(param.expression);
|
||||
}
|
||||
|
||||
} // namespace funcexp
|
||||
|
Reference in New Issue
Block a user