From 7293ec522cfcf95a94713a0a7e3bbb06d9a41402 Mon Sep 17 00:00:00 2001 From: David Hall Date: Fri, 11 Aug 2017 12:46:45 -0500 Subject: [PATCH] MCOL-523 Add avg_mode function --- utils/udfsdk/avg_mode.cpp | 298 ++++++++++++++++++++++++++++++++++++++ utils/udfsdk/avg_mode.h | 294 +++++++++++++++++++++++++++++++++++++ 2 files changed, 592 insertions(+) create mode 100755 utils/udfsdk/avg_mode.cpp create mode 100755 utils/udfsdk/avg_mode.h diff --git a/utils/udfsdk/avg_mode.cpp b/utils/udfsdk/avg_mode.cpp new file mode 100755 index 000000000..b29bcd69c --- /dev/null +++ b/utils/udfsdk/avg_mode.cpp @@ -0,0 +1,298 @@ +/* Copyright (C) 2017 MariaDB Corporaton + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; version 2 of + the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + MA 02110-1301, USA. */ + +#include +#include +#include +#include "avg_mode.h" +#include "bytestream.h" +#include "objectreader.h" + +using namespace mcsv1sdk; + +mcsv1_UDAF::ReturnCode avg_mode::init(mcsv1Context* context, + COL_TYPES& colTypes) +{ + if (colTypes.size() < 1) + { + // The error message will be prepended with + // "The storage engine for the table doesn't support " + context->setErrorMessage("avg_mode() with 0 arguments"); + return mcsv1_UDAF::ERROR; + } + if (colTypes.size() > 1) + { + context->setErrorMessage("avg_mode() with more than 1 argument"); + return mcsv1_UDAF::ERROR; + } + + if (!(isNumeric(colTypes[0].second))) + { + // The error message will be prepended with + // "The storage engine for the table doesn't support " + context->setErrorMessage("avg_mode() with non-numeric argument"); + return mcsv1_UDAF::ERROR; + } + + context->setResultType(CalpontSystemCatalog::DOUBLE); + context->setColWidth(8); + context->setScale(context->getScale()*2); + context->setPrecision(19); + context->setRunFlag(mcsv1sdk::UDAF_IGNORE_NULLS); + return mcsv1_UDAF::SUCCESS; + +} + +mcsv1_UDAF::ReturnCode avg_mode::finish(mcsv1Context* context) +{ + return mcsv1_UDAF::SUCCESS; +} + +mcsv1_UDAF::ReturnCode avg_mode::reset(mcsv1Context* context) +{ + ModeData* data = static_cast(context->getUserData()); + data->mData.clear(); + return mcsv1_UDAF::SUCCESS; +} + +mcsv1_UDAF::ReturnCode avg_mode::nextValue(mcsv1Context* context, + std::vector& valsIn) +{ + static_any::any& valIn = valsIn[0].columnData; + MODE_DATA& data = static_cast(context->getUserData())->mData; + DATATYPE val = 0.0; + + if (valIn.empty()) + { + return mcsv1_UDAF::SUCCESS; // Ought not happen when UDAF_IGNORE_NULLS is on. + } + + if (valIn.compatible(charTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(scharTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(shortTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(intTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(longTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(llTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(ucharTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(ushortTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(uintTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(ulongTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(ullTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(floatTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(doubleTypeId)) + { + val = valIn.cast(); + } + + // For decimal types, we need to move the decimal point. + uint32_t scale = valsIn[0].scale; + if (val != 0 && scale > 0) + { + val /= pow(10.0, (double)scale); + } + data[val]++; + + return mcsv1_UDAF::SUCCESS; +} + +mcsv1_UDAF::ReturnCode avg_mode::subEvaluate(mcsv1Context* context, const UserData* userDataIn) +{ + if (!userDataIn) + { + return mcsv1_UDAF::SUCCESS; + } + MODE_DATA& outData = static_cast(context->getUserData())->mData; + const MODE_DATA& inData = static_cast(userDataIn)->mData; + MODE_DATA::const_iterator iter = inData.begin(); + for (; iter != inData.end(); ++iter) + { + outData[iter->first] += iter->second; + } + return mcsv1_UDAF::SUCCESS; +} + +mcsv1_UDAF::ReturnCode avg_mode::evaluate(mcsv1Context* context, static_any::any& valOut) +{ + uint64_t maxCnt=0; + MODE_DATA& data = static_cast(context->getUserData())->mData; + if (data.size() == 0) + { + valOut = (DATATYPE)0; + return mcsv1_UDAF::SUCCESS; + } + MODE_DATA::iterator iter(data.begin()); + for (; iter != data.end(); ++iter) + { + if (iter->second > maxCnt) + { + valOut = iter->first; + maxCnt = iter->second; + } + } + return mcsv1_UDAF::SUCCESS; +} + +mcsv1_UDAF::ReturnCode avg_mode::dropValue(mcsv1Context* context, + std::vector& valsDropped) +{ + static_any::any& valIn = valsDropped[0].columnData; + MODE_DATA& data = static_cast(context->getUserData())->mData; + DATATYPE val = 0.0; + + if (valIn.empty()) + { + return mcsv1_UDAF::SUCCESS; // Ought not happen when UDAF_IGNORE_NULLS is on. + } + + if (valIn.compatible(charTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(scharTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(shortTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(intTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(longTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(llTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(ucharTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(ushortTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(uintTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(ulongTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(ullTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(floatTypeId)) + { + val = valIn.cast(); + } + else if (valIn.compatible(doubleTypeId)) + { + val = valIn.cast(); + } + + // For decimal types, we need to move the decimal point. + uint32_t scale = valsDropped[0].scale; + if (val != 0 && scale > 0) + { + val /= pow(10.0, (double)scale); + } + + data[val]--; + + return mcsv1_UDAF::SUCCESS; +} + +mcsv1_UDAF::ReturnCode avg_mode::createUserData(UserData*& userData, int32_t& length) +{ + userData = new ModeData; + length = sizeof(ModeData); + return mcsv1_UDAF::SUCCESS; +} + +void ModeData::serialize(messageqcpp::ByteStream& bs) const +{ + MODE_DATA::const_iterator iter = mData.begin(); + DATATYPE num; + uint32_t cnt; + bs << (int32_t)mData.size(); + for (; iter != mData.end(); ++iter) + { + num = iter->first; + bs << num; + cnt = iter->second; + bs << cnt; + } +} + +void ModeData::unserialize(messageqcpp::ByteStream& bs) +{ + mData.clear(); + int32_t sz; + DATATYPE num; + uint32_t cnt; + bs >> sz; + for (int i = 0; i < sz; ++i) + { + bs >> num; + bs >> cnt; + mData[num] = cnt; + } +} + diff --git a/utils/udfsdk/avg_mode.h b/utils/udfsdk/avg_mode.h new file mode 100755 index 000000000..8bf375fa8 --- /dev/null +++ b/utils/udfsdk/avg_mode.h @@ -0,0 +1,294 @@ +/* Copyright (C) 2017 MariaDB Corporaton + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; version 2 of + the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + MA 02110-1301, USA. */ + +/*********************************************************************** +* $Id$ +* +* mcsv1_UDAF.h +***********************************************************************/ + +/** + * Columnstore interface for writing a User Defined Aggregate + * Functions (UDAF) and User Defined Analytic Functions (UDAnF) + * or a function that can act as either - UDA(n)F + * + * The basic steps are: + * + * 1. Create a the UDA(n)F function interface in some .h file. + * 2. Create the UDF function implementation in some .cpp file + * 3. Create the connector stub (MariaDB UDAF definition) for + * this UDF function. + * 4. build the dynamic library using all of the source. + * 5 Put the library in $COLUMNSTORE_INSTALL/lib of + * all modules + * 6. restart the Columnstore system. + * 7. notify mysqld about the new function: + * + * CREATE AGGREGATE FUNCTION avg_mode returns REAL soname + * 'libudf_mysql.so'; + * + * The UDAF functions may run distributed in the Columnstore + * engine. UDAnF do not run distributed. + * + * UDAF is User Defined Aggregate Function. + * UDAnF is User Defined Analytic Function. + * UDA(n)F is an acronym for a function that could be either. It + * is also used to describe the interface that is used for + * either. + */ +#ifndef HEADER_mode +#define HEADER_mode + +#include +#include +#include +#include +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include "mcsv1_udaf.h" +#include "calpontsystemcatalog.h" +#include "windowfunctioncolumn.h" +using namespace execplan; + +#if defined(_MSC_VER) && defined(xxxRGNODE_DLLEXPORT) +#define EXPORT __declspec(dllexport) +#else +#define EXPORT +#endif + +namespace mcsv1sdk +{ + +#define DATATYPE double +typedef std::tr1::unordered_map MODE_DATA; + +// Override UserData for data storage +struct ModeData : public UserData +{ + ModeData() {}; + + virtual ~ModeData(){} + + virtual void serialize(messageqcpp::ByteStream& bs) const; + virtual void unserialize(messageqcpp::ByteStream& bs); + + MODE_DATA mData; +private: + // For now, copy construction is unwanted + ModeData(UserData&); +}; + +// Override mcsv1_UDAF to build your User Defined Aggregate (UDAF) and/or +// User Defined Analytic Function (UDAnF). +// These will be singleton classes, so don't put any instance +// specific data in here. All instance data is stored in mcsv1Context +// passed to each user function and retrieved by the getUserData() method. +// +// Each API function returns a ReturnCode. If ERROR is returned at any time, +// the query is aborted, getInterrupted() will begin to return true and the +// message set in config->setErrorMessage() is returned to MariaDB. + +// Return the avg_mode value of the dataset + +class avg_mode : public mcsv1_UDAF +{ +public: + // Defaults OK + avg_mode() : mcsv1_UDAF(){}; + virtual ~avg_mode(){}; + + /** + * init() + * + * Mandatory. Implement this to initialize flags and instance + * data. Called once per SQL statement. You can do any sanity + * checks here. + * + * colTypes (in) - A vector of ColDataType defining the + * parameters of the UDA(n)F call. These can be used to decide + * to override the default return type. If desired, the new + * return type can be set by context->setReturnType() and + * decimal scale and precision can be set by context->setScale + * and context->setPrecision respectively. + * + * Return mcsv1_UDAF::ERROR on any error, such as non-compatible + * colTypes or wrong number of arguments. Else return + * mcsv1_UDAF::SUCCESS. + */ + virtual ReturnCode init(mcsv1Context* context, + COL_TYPES& colTypes); + + /** + * finish() + * + * Mandatory. Completes the UDA(n)F. Called once per SQL + * statement. Do not free any memory allocated by + * context->setUserDataSize(). The SDK Framework owns that memory + * and will handle that. Often, there is nothing to do here. + */ + virtual ReturnCode finish(mcsv1Context* context); + + /** + * reset() + * + * Mandatory. Reset the UDA(n)F for a new group, partition or, + * in some cases, new Window Frame. Do not free any memory + * allocated by context->setUserDataSize(). The SDK Framework owns + * that memory and will handle that. Use this opportunity to + * reset any variables in context->getUserData() needed for the + * next aggregation. May be called multiple times if running in + * a ditributed fashion. + * + * Use this opportunity to initialize the userData. + */ + virtual ReturnCode reset(mcsv1Context* context); + + /** + * nextValue() + * + * Mandatory. Handle a single row. + * + * colsIn - A vector of data structure describing the input + * data. + * + * This function is called once for every row in the filtered + * result set (before aggregation). It is very important that + * this function is efficient. + * + * If the UDAF is running in a distributed fashion, nextValue + * cannot depend on order, as it will only be called for each + * row found on the specific PM. + * + * valsIn (in) - a vector of the parameters from the row. + */ + virtual ReturnCode nextValue(mcsv1Context* context, + std::vector& valsIn); + + /** + * subEvaluate() + * + * Mandatory -- Called if the UDAF is running in a distributed + * fashion. Columnstore tries to run all aggregate functions + * distributed, depending on context. + * + * Perform an aggregation on rows partially aggregated by + * nextValue. Columnstore calls nextValue for each row on a + * given PM for a group (GROUP BY). subEvaluate is called on the + * UM to consolodate those values into a single instance of + * userData. Keep your aggregated totals in context's userData. + * The first time this is called for a group, reset() would have + * been called with this version of userData. + * + * Called for every partial data set in each group in GROUP BY. + * + * When subEvaluate has been called for all subAggregated data + * sets, Evaluate will be called with the same context as here. + * + * valIn (In) - This is a pointer to a memory block of the size + * set in setUserDataSize. It will contain the value of userData + * as seen in the last call to NextValue for a given PM. + * + */ + virtual ReturnCode subEvaluate(mcsv1Context* context, const UserData* valIn); + + /** + * evaluate() + * + * Mandatory. Get the aggregated value. + * + * Called for every new group if UDAF GROUP BY, UDAnF partition + * or, in some cases, new Window Frame. + * + * Set the aggregated value into valOut. The datatype is assumed + * to be the same as that set in the init() function; + * + * If the UDAF is running in a distributed fashion, evaluate is + * called after a series of subEvaluate calls. + * + * valOut (out) - Set the aggregated value here. The datatype is + * assumed to be the same as that set in the init() function; + * + * To return a NULL value, don't assign to valOut. + */ + virtual ReturnCode evaluate(mcsv1Context* context, static_any::any& valOut); + + /** + * dropValue() + * + * Optional -- If defined, the server will call this instead of + * reset for UDAnF. + * + * Don't implement if a UDAnF has one or more of the following: + * The UDAnF can't be used with a Window Frame + * The UDAnF is not reversable in some way + * The UDAnF is not interested in optimal performance + * + * If not implemented, reset() followed by a series of + * nextValue() will be called for each movement of the Window + * Frame. + * + * If implemented, then each movement of the Window Frame will + * result in dropValue() being called for each row falling out + * of the Frame and nextValue() being called for each new row + * coming into the Frame. + * + * valsDropped (in) - a vector of the parameters from the row + * leaving the Frame + * + * dropValue() will not be called for unbounded/current row type + * frames, as those are already optimized. + */ + virtual ReturnCode dropValue(mcsv1Context* context, + std::vector& valsDropped); + + /** + * createUserData() + * + * Optional -- If defined, the server will call this instead of + * createUserData on context. + * + * Create your variable length data structure via + * data = new + * + * The data structure may contain references to containers or + * pointers to other objects. Remember that for distributed + * processing, this may be called multiple times for variaous + * computing blocks. At the least, it will be called once per PM + * that processes the data, and once more for the UM. For UDAnF, + * it may only be called once. + * + * Set length to the length of the data structure you create. + * + * For each call to createUserData(), there will be a + * corresponding deleteUserData() where you must clean up. Any + * memory leaks are your fault. + * + */ + virtual ReturnCode createUserData(UserData*& data, int32_t& length); +protected: +}; + +}; // namespace + +#undef EXPORT + +#endif // HEADER_mode.h +