From 0c3d4004bf992052c87168686bb10ece31d63cd3 Mon Sep 17 00:00:00 2001
From: Leonid Fedorov
Date: Wed, 6 Mar 2024 15:51:43 +0000
Subject: [PATCH] regexp_replace, regexp_substr and regexp_instr functions

---
Review notes (ignored by git am):
 * func_regexp.cpp hunks amended by hand: isNull is re-checked after the arguments are
   evaluated, the helper is renamed to getExpressionAndPattern, and the jpcre2::select<char>
   template argument is spelled out.
 * The commit id above and the func_regexp.cpp post-image blob id in its index line still
   refer to the patch as originally generated.

 utils/funcexp/func_regexp.cpp | 144 ++++++++++++++++++++++++++++++++--
 utils/funcexp/funcexp.cpp     |   5 +-
 utils/funcexp/functor_str.h   |  55 +++++++++++++++
 3 files changed, 196 insertions(+), 8 deletions(-)

diff --git a/utils/funcexp/func_regexp.cpp b/utils/funcexp/func_regexp.cpp
index 7d79a76ac..8e00da94e 100644
--- a/utils/funcexp/func_regexp.cpp
+++ b/utils/funcexp/func_regexp.cpp
@@ -28,6 +28,7 @@ using namespace std;
 #include "utils/pcre2/jpcre2.hpp"
 
 #include "functor_bool.h"
+#include "functor_str.h"
 #include "functioncolumn.h"
 #include "predicateoperator.h"
 #include "constantcolumn.h"
@@ -42,10 +43,19 @@ using namespace logging;
 
 namespace
 {
-inline bool getBool(rowgroup::Row& row, funcexp::FunctionParm& pm, bool& isNull,
-                    CalpontSystemCatalog::ColType& ct, long timeZone)
+
+// Subject string and regular expression pattern handed to the jpcre2 matcher.
+struct RegExpParams
 {
-  string expr ;
+  std::string expression;
+  std::string pattern;
+};
+
+// Evaluates the function arguments for the current row and returns the subject and pattern as strings.
+inline RegExpParams getExpressionAndPattern(rowgroup::Row& row, funcexp::FunctionParm& pm, bool& isNull,
+                                            CalpontSystemCatalog::ColType& ct, long timeZone)
+{
+  string expr;
   string pattern;
 
   switch (pm[0]->data()->resultType().colDataType)
@@ -207,23 +217,143 @@ inline bool getBool(rowgroup::Row& row, funcexp::FunctionParm& pm, bool& isNull,
     }
   }
 
-  jpcre2::select<char>::Regex re(pattern);
-  return re.match(expr);
+  return RegExpParams{expr, pattern};
 }
 }  // namespace
 
 namespace funcexp
 {
+CalpontSystemCatalog::ColType Func_regexp_replace::operationType(FunctionParm& fp,
+                                                                 CalpontSystemCatalog::ColType& resultType)
+{
+  // operation type is not used by this functor
+  return fp[0]->data()->resultType();
+}
+
+CalpontSystemCatalog::ColType Func_regexp_substr::operationType(FunctionParm& fp,
+                                                                CalpontSystemCatalog::ColType& resultType)
+{
+  // operation type is not used by this functor
+  return fp[0]->data()->resultType();
+}
+
+CalpontSystemCatalog::ColType Func_regexp_instr::operationType(FunctionParm& fp,
+                                                               CalpontSystemCatalog::ColType& resultType)
+{
+  // operation type is not used by this functor
+  return fp[0]->data()->resultType();
+}
+
 CalpontSystemCatalog::ColType Func_regexp::operationType(FunctionParm& fp,
                                                          CalpontSystemCatalog::ColType& resultType)
 {
   return resultType;
 }
 
-bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& pm, bool& isNull,
+using jp = jpcre2::select<char>;
+
+/*
+  Returns the string subject with all occurrences of the regular expression pattern replaced by
+  the string replace. If no occurrences are found, then subject is returned as is.
+  https://mariadb.com/kb/en/regexp_replace/
+*/
+std::string Func_regexp_replace::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
+                                           execplan::CalpontSystemCatalog::ColType& ct)
+{
+  if (isNull)
+    return std::string{};
+
+  RegExpParams param = getExpressionAndPattern(row, fp, isNull, ct, ct.getTimeZone());
+
+  // Stop here if evaluating the subject or the pattern flagged NULL through isNull.
+  if (isNull)
+    return std::string{};
+
+  const auto& replace_with = fp[2]->data()->getStrVal(row, isNull);
+
+  if (isNull || replace_with.isNull())
+    return std::string{};
+
+  jp::Regex re(param.pattern);
+  return re.replace(param.expression, replace_with.unsafeStringRef(), "g");
+}
+
+/*
+  Returns the part of the string subject that matches the regular expression pattern, or an empty string if
+  pattern was not found. https://mariadb.com/kb/en/regexp_substr/
+*/
+std::string Func_regexp_substr::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
+                                          execplan::CalpontSystemCatalog::ColType& ct)
+{
+  if (isNull)
+    return std::string{};
+
+  RegExpParams param = getExpressionAndPattern(row, fp, isNull, ct, ct.getTimeZone());
+
+  // Stop here if evaluating the subject or the pattern flagged NULL through isNull.
+  if (isNull)
+    return std::string{};
+
+  jp::Regex re(param.pattern);
+  jp::RegexMatch rm(&re);
+  jp::VecNum vec_num;
+
+  size_t count = rm.setSubject(param.expression).setNumberedSubstringVector(&vec_num).match();
+
+  if (count == 0)
+    return std::string{};
+
+  return vec_num[0][0];
+}
+
+/*
+  Returns the position of the first occurrence of the regular expression pattern in the string subject, or 0
+  if pattern was not found. https://mariadb.com/kb/en/regexp_instr/
+  NOTE(review): the result is the jpcre2 match start offset + 1; presumably a byte offset, which would
+  differ from a character position for multi-byte subjects - confirm against MariaDB behaviour.
+*/
+std::string Func_regexp_instr::getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
+                                         execplan::CalpontSystemCatalog::ColType& ct)
+{
+  if (isNull)
+    return std::string{};
+
+  RegExpParams param = getExpressionAndPattern(row, fp, isNull, ct, ct.getTimeZone());
+
+  // Stop here if evaluating the subject or the pattern flagged NULL through isNull.
+  if (isNull)
+    return std::string{};
+
+  jp::Regex re(param.pattern);
+  jp::RegexMatch rm(&re);
+  jpcre2::VecOff vec_soff;
+
+  size_t count = rm.setSubject(param.expression).setMatchStartOffsetVector(&vec_soff).match();
+
+  if (count == 0)
+    return "0";
+
+  return std::to_string(vec_soff[0] + 1);
+}
+
+/*
+  Returns true if the subject matches the regular expression pattern, false otherwise.
+  https://mariadb.com/kb/en/regexp/
+*/
+bool Func_regexp::getBoolVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
                              CalpontSystemCatalog::ColType& ct)
 {
-  return getBool(row, pm, isNull, ct, ct.getTimeZone()) && !isNull;
+  if (isNull)
+    return false;
+
+  RegExpParams param = getExpressionAndPattern(row, fp, isNull, ct, ct.getTimeZone());
+
+  // A NULL subject or pattern (reported through isNull) must never count as a match.
+  if (isNull)
+    return false;
+
+  jp::Regex re(param.pattern);
+  return re.match(param.expression);
 }
 
 }  // namespace funcexp
diff --git a/utils/funcexp/funcexp.cpp b/utils/funcexp/funcexp.cpp
index b07fdf27c..82f6003d8 100644
--- a/utils/funcexp/funcexp.cpp
+++ b/utils/funcexp/funcexp.cpp
@@ -215,7 +215,10 @@ FuncExp::FuncExp()
   fFuncMap["quarter"] = new Func_quarter();
   fFuncMap["radians"] = new Func_radians();  // dlh
   fFuncMap["rand"] = new Func_rand();
-  fFuncMap["regexp"] = new Func_regexp();                  // dlh
+  fFuncMap["regexp"] = new Func_regexp();
+  fFuncMap["regexp_instr"] = new Func_regexp_instr();
+  fFuncMap["regexp_replace"] = new Func_regexp_replace();
+  fFuncMap["regexp_substr"] = new Func_regexp_substr();    // dlh
   fFuncMap["repeat"] = new Func_repeat();                  // dlh
   fFuncMap["replace"] = new Func_replace();                // dlh
   fFuncMap["replace_oracle"] = new Func_replace_oracle();  // dlh
diff --git a/utils/funcexp/functor_str.h b/utils/funcexp/functor_str.h
index b2fbe0933..17b8adea8 100644
--- a/utils/funcexp/functor_str.h
+++ b/utils/funcexp/functor_str.h
@@ -470,6 +470,61 @@ class Func_replace : public Func_Str
                         execplan::CalpontSystemCatalog::ColType& op_ct);
 };
 
+class Func_regexp_replace : public Func_Str
+{
+ public:
+  Func_regexp_replace() : Func_Str("regexp_replace")
+  {
+  }
+  virtual ~Func_regexp_replace()
+  {
+  }
+
+  execplan::CalpontSystemCatalog::ColType operationType(FunctionParm& fp,
+                                                        execplan::CalpontSystemCatalog::ColType& resultType);
+
+  std::string getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
+                        execplan::CalpontSystemCatalog::ColType& op_ct);
+};
+
+
+class Func_regexp_instr : public Func_Str
+{
+ public:
+  Func_regexp_instr() : Func_Str("regexp_instr")
+  {
+  }
+  virtual ~Func_regexp_instr()
+  {
+  }
+
+  execplan::CalpontSystemCatalog::ColType operationType(FunctionParm& fp,
+                                                        execplan::CalpontSystemCatalog::ColType& resultType);
+
+  std::string getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
+                        execplan::CalpontSystemCatalog::ColType& op_ct);
+};
+
+
+class Func_regexp_substr : public Func_Str
+{
+ public:
+  Func_regexp_substr() : Func_Str("regexp_substr")
+  {
+  }
+  virtual ~Func_regexp_substr()
+  {
+  }
+
+  execplan::CalpontSystemCatalog::ColType operationType(FunctionParm& fp,
+                                                        execplan::CalpontSystemCatalog::ColType& resultType);
+
+  std::string getStrVal(rowgroup::Row& row, FunctionParm& fp, bool& isNull,
+                        execplan::CalpontSystemCatalog::ColType& op_ct);
+};
+
+
+
 class Func_replace_oracle : public Func_Str
 {
  public: