1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-01 06:46:55 +03:00

MCOL-523 Add avg_mode function

This commit is contained in:
David Hall
2017-08-11 12:46:45 -05:00
parent 4eafaa8682
commit 7293ec522c
2 changed files with 592 additions and 0 deletions

298
utils/udfsdk/avg_mode.cpp Executable file
View File

@ -0,0 +1,298 @@
/* Copyright (C) 2017 MariaDB Corporaton
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
#include <sstream>
#include <cstring>
#include <typeinfo>
#include "avg_mode.h"
#include "bytestream.h"
#include "objectreader.h"
using namespace mcsv1sdk;
mcsv1_UDAF::ReturnCode avg_mode::init(mcsv1Context* context,
COL_TYPES& colTypes)
{
if (colTypes.size() < 1)
{
// The error message will be prepended with
// "The storage engine for the table doesn't support "
context->setErrorMessage("avg_mode() with 0 arguments");
return mcsv1_UDAF::ERROR;
}
if (colTypes.size() > 1)
{
context->setErrorMessage("avg_mode() with more than 1 argument");
return mcsv1_UDAF::ERROR;
}
if (!(isNumeric(colTypes[0].second)))
{
// The error message will be prepended with
// "The storage engine for the table doesn't support "
context->setErrorMessage("avg_mode() with non-numeric argument");
return mcsv1_UDAF::ERROR;
}
context->setResultType(CalpontSystemCatalog::DOUBLE);
context->setColWidth(8);
context->setScale(context->getScale()*2);
context->setPrecision(19);
context->setRunFlag(mcsv1sdk::UDAF_IGNORE_NULLS);
return mcsv1_UDAF::SUCCESS;
}
mcsv1_UDAF::ReturnCode avg_mode::finish(mcsv1Context* context)
{
return mcsv1_UDAF::SUCCESS;
}
mcsv1_UDAF::ReturnCode avg_mode::reset(mcsv1Context* context)
{
ModeData* data = static_cast<ModeData*>(context->getUserData());
data->mData.clear();
return mcsv1_UDAF::SUCCESS;
}
mcsv1_UDAF::ReturnCode avg_mode::nextValue(mcsv1Context* context,
std::vector<ColumnDatum>& valsIn)
{
static_any::any& valIn = valsIn[0].columnData;
MODE_DATA& data = static_cast<ModeData*>(context->getUserData())->mData;
DATATYPE val = 0.0;
if (valIn.empty())
{
return mcsv1_UDAF::SUCCESS; // Ought not happen when UDAF_IGNORE_NULLS is on.
}
if (valIn.compatible(charTypeId))
{
val = valIn.cast<char>();
}
else if (valIn.compatible(scharTypeId))
{
val = valIn.cast<signed char>();
}
else if (valIn.compatible(shortTypeId))
{
val = valIn.cast<short>();
}
else if (valIn.compatible(intTypeId))
{
val = valIn.cast<int>();
}
else if (valIn.compatible(longTypeId))
{
val = valIn.cast<long>();
}
else if (valIn.compatible(llTypeId))
{
val = valIn.cast<long long>();
}
else if (valIn.compatible(ucharTypeId))
{
val = valIn.cast<unsigned char>();
}
else if (valIn.compatible(ushortTypeId))
{
val = valIn.cast<unsigned short>();
}
else if (valIn.compatible(uintTypeId))
{
val = valIn.cast<unsigned int>();
}
else if (valIn.compatible(ulongTypeId))
{
val = valIn.cast<unsigned long>();
}
else if (valIn.compatible(ullTypeId))
{
val = valIn.cast<unsigned long long>();
}
else if (valIn.compatible(floatTypeId))
{
val = valIn.cast<float>();
}
else if (valIn.compatible(doubleTypeId))
{
val = valIn.cast<double>();
}
// For decimal types, we need to move the decimal point.
uint32_t scale = valsIn[0].scale;
if (val != 0 && scale > 0)
{
val /= pow(10.0, (double)scale);
}
data[val]++;
return mcsv1_UDAF::SUCCESS;
}
mcsv1_UDAF::ReturnCode avg_mode::subEvaluate(mcsv1Context* context, const UserData* userDataIn)
{
if (!userDataIn)
{
return mcsv1_UDAF::SUCCESS;
}
MODE_DATA& outData = static_cast<ModeData*>(context->getUserData())->mData;
const MODE_DATA& inData = static_cast<const ModeData*>(userDataIn)->mData;
MODE_DATA::const_iterator iter = inData.begin();
for (; iter != inData.end(); ++iter)
{
outData[iter->first] += iter->second;
}
return mcsv1_UDAF::SUCCESS;
}
mcsv1_UDAF::ReturnCode avg_mode::evaluate(mcsv1Context* context, static_any::any& valOut)
{
uint64_t maxCnt=0;
MODE_DATA& data = static_cast<ModeData*>(context->getUserData())->mData;
if (data.size() == 0)
{
valOut = (DATATYPE)0;
return mcsv1_UDAF::SUCCESS;
}
MODE_DATA::iterator iter(data.begin());
for (; iter != data.end(); ++iter)
{
if (iter->second > maxCnt)
{
valOut = iter->first;
maxCnt = iter->second;
}
}
return mcsv1_UDAF::SUCCESS;
}
mcsv1_UDAF::ReturnCode avg_mode::dropValue(mcsv1Context* context,
std::vector<ColumnDatum>& valsDropped)
{
static_any::any& valIn = valsDropped[0].columnData;
MODE_DATA& data = static_cast<ModeData*>(context->getUserData())->mData;
DATATYPE val = 0.0;
if (valIn.empty())
{
return mcsv1_UDAF::SUCCESS; // Ought not happen when UDAF_IGNORE_NULLS is on.
}
if (valIn.compatible(charTypeId))
{
val = valIn.cast<char>();
}
else if (valIn.compatible(scharTypeId))
{
val = valIn.cast<signed char>();
}
else if (valIn.compatible(shortTypeId))
{
val = valIn.cast<short>();
}
else if (valIn.compatible(intTypeId))
{
val = valIn.cast<int>();
}
else if (valIn.compatible(longTypeId))
{
val = valIn.cast<long>();
}
else if (valIn.compatible(llTypeId))
{
val = valIn.cast<long long>();
}
else if (valIn.compatible(ucharTypeId))
{
val = valIn.cast<unsigned char>();
}
else if (valIn.compatible(ushortTypeId))
{
val = valIn.cast<unsigned short>();
}
else if (valIn.compatible(uintTypeId))
{
val = valIn.cast<unsigned int>();
}
else if (valIn.compatible(ulongTypeId))
{
val = valIn.cast<unsigned long>();
}
else if (valIn.compatible(ullTypeId))
{
val = valIn.cast<unsigned long long>();
}
else if (valIn.compatible(floatTypeId))
{
val = valIn.cast<float>();
}
else if (valIn.compatible(doubleTypeId))
{
val = valIn.cast<double>();
}
// For decimal types, we need to move the decimal point.
uint32_t scale = valsDropped[0].scale;
if (val != 0 && scale > 0)
{
val /= pow(10.0, (double)scale);
}
data[val]--;
return mcsv1_UDAF::SUCCESS;
}
mcsv1_UDAF::ReturnCode avg_mode::createUserData(UserData*& userData, int32_t& length)
{
userData = new ModeData;
length = sizeof(ModeData);
return mcsv1_UDAF::SUCCESS;
}
void ModeData::serialize(messageqcpp::ByteStream& bs) const
{
MODE_DATA::const_iterator iter = mData.begin();
DATATYPE num;
uint32_t cnt;
bs << (int32_t)mData.size();
for (; iter != mData.end(); ++iter)
{
num = iter->first;
bs << num;
cnt = iter->second;
bs << cnt;
}
}
void ModeData::unserialize(messageqcpp::ByteStream& bs)
{
mData.clear();
int32_t sz;
DATATYPE num;
uint32_t cnt;
bs >> sz;
for (int i = 0; i < sz; ++i)
{
bs >> num;
bs >> cnt;
mData[num] = cnt;
}
}

294
utils/udfsdk/avg_mode.h Executable file
View File

@ -0,0 +1,294 @@
/* Copyright (C) 2017 MariaDB Corporaton
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
/***********************************************************************
* $Id$
*
* mcsv1_UDAF.h
***********************************************************************/
/**
* Columnstore interface for writing a User Defined Aggregate
* Functions (UDAF) and User Defined Analytic Functions (UDAnF)
* or a function that can act as either - UDA(n)F
*
* The basic steps are:
*
* 1. Create a the UDA(n)F function interface in some .h file.
* 2. Create the UDF function implementation in some .cpp file
* 3. Create the connector stub (MariaDB UDAF definition) for
* this UDF function.
* 4. build the dynamic library using all of the source.
* 5 Put the library in $COLUMNSTORE_INSTALL/lib of
* all modules
* 6. restart the Columnstore system.
* 7. notify mysqld about the new function:
*
* CREATE AGGREGATE FUNCTION avg_mode returns REAL soname
* 'libudf_mysql.so';
*
* The UDAF functions may run distributed in the Columnstore
* engine. UDAnF do not run distributed.
*
* UDAF is User Defined Aggregate Function.
* UDAnF is User Defined Analytic Function.
* UDA(n)F is an acronym for a function that could be either. It
* is also used to describe the interface that is used for
* either.
*/
#ifndef HEADER_mode
#define HEADER_mode
#include <cstdlib>
#include <string>
#include <vector>
#include <boost/any.hpp>
#ifdef _MSC_VER
#include <unordered_map>
#else
#include <tr1/unordered_map>
#endif
#include "mcsv1_udaf.h"
#include "calpontsystemcatalog.h"
#include "windowfunctioncolumn.h"
using namespace execplan;
#if defined(_MSC_VER) && defined(xxxRGNODE_DLLEXPORT)
#define EXPORT __declspec(dllexport)
#else
#define EXPORT
#endif
namespace mcsv1sdk
{
#define DATATYPE double
typedef std::tr1::unordered_map<DATATYPE, uint32_t> MODE_DATA;
// Override UserData for data storage
struct ModeData : public UserData
{
ModeData() {};
virtual ~ModeData(){}
virtual void serialize(messageqcpp::ByteStream& bs) const;
virtual void unserialize(messageqcpp::ByteStream& bs);
MODE_DATA mData;
private:
// For now, copy construction is unwanted
ModeData(UserData&);
};
// Override mcsv1_UDAF to build your User Defined Aggregate (UDAF) and/or
// User Defined Analytic Function (UDAnF).
// These will be singleton classes, so don't put any instance
// specific data in here. All instance data is stored in mcsv1Context
// passed to each user function and retrieved by the getUserData() method.
//
// Each API function returns a ReturnCode. If ERROR is returned at any time,
// the query is aborted, getInterrupted() will begin to return true and the
// message set in config->setErrorMessage() is returned to MariaDB.
// Return the avg_mode value of the dataset
class avg_mode : public mcsv1_UDAF
{
public:
// Defaults OK
avg_mode() : mcsv1_UDAF(){};
virtual ~avg_mode(){};
/**
* init()
*
* Mandatory. Implement this to initialize flags and instance
* data. Called once per SQL statement. You can do any sanity
* checks here.
*
* colTypes (in) - A vector of ColDataType defining the
* parameters of the UDA(n)F call. These can be used to decide
* to override the default return type. If desired, the new
* return type can be set by context->setReturnType() and
* decimal scale and precision can be set by context->setScale
* and context->setPrecision respectively.
*
* Return mcsv1_UDAF::ERROR on any error, such as non-compatible
* colTypes or wrong number of arguments. Else return
* mcsv1_UDAF::SUCCESS.
*/
virtual ReturnCode init(mcsv1Context* context,
COL_TYPES& colTypes);
/**
* finish()
*
* Mandatory. Completes the UDA(n)F. Called once per SQL
* statement. Do not free any memory allocated by
* context->setUserDataSize(). The SDK Framework owns that memory
* and will handle that. Often, there is nothing to do here.
*/
virtual ReturnCode finish(mcsv1Context* context);
/**
* reset()
*
* Mandatory. Reset the UDA(n)F for a new group, partition or,
* in some cases, new Window Frame. Do not free any memory
* allocated by context->setUserDataSize(). The SDK Framework owns
* that memory and will handle that. Use this opportunity to
* reset any variables in context->getUserData() needed for the
* next aggregation. May be called multiple times if running in
* a ditributed fashion.
*
* Use this opportunity to initialize the userData.
*/
virtual ReturnCode reset(mcsv1Context* context);
/**
* nextValue()
*
* Mandatory. Handle a single row.
*
* colsIn - A vector of data structure describing the input
* data.
*
* This function is called once for every row in the filtered
* result set (before aggregation). It is very important that
* this function is efficient.
*
* If the UDAF is running in a distributed fashion, nextValue
* cannot depend on order, as it will only be called for each
* row found on the specific PM.
*
* valsIn (in) - a vector of the parameters from the row.
*/
virtual ReturnCode nextValue(mcsv1Context* context,
std::vector<ColumnDatum>& valsIn);
/**
* subEvaluate()
*
* Mandatory -- Called if the UDAF is running in a distributed
* fashion. Columnstore tries to run all aggregate functions
* distributed, depending on context.
*
* Perform an aggregation on rows partially aggregated by
* nextValue. Columnstore calls nextValue for each row on a
* given PM for a group (GROUP BY). subEvaluate is called on the
* UM to consolodate those values into a single instance of
* userData. Keep your aggregated totals in context's userData.
* The first time this is called for a group, reset() would have
* been called with this version of userData.
*
* Called for every partial data set in each group in GROUP BY.
*
* When subEvaluate has been called for all subAggregated data
* sets, Evaluate will be called with the same context as here.
*
* valIn (In) - This is a pointer to a memory block of the size
* set in setUserDataSize. It will contain the value of userData
* as seen in the last call to NextValue for a given PM.
*
*/
virtual ReturnCode subEvaluate(mcsv1Context* context, const UserData* valIn);
/**
* evaluate()
*
* Mandatory. Get the aggregated value.
*
* Called for every new group if UDAF GROUP BY, UDAnF partition
* or, in some cases, new Window Frame.
*
* Set the aggregated value into valOut. The datatype is assumed
* to be the same as that set in the init() function;
*
* If the UDAF is running in a distributed fashion, evaluate is
* called after a series of subEvaluate calls.
*
* valOut (out) - Set the aggregated value here. The datatype is
* assumed to be the same as that set in the init() function;
*
* To return a NULL value, don't assign to valOut.
*/
virtual ReturnCode evaluate(mcsv1Context* context, static_any::any& valOut);
/**
* dropValue()
*
* Optional -- If defined, the server will call this instead of
* reset for UDAnF.
*
* Don't implement if a UDAnF has one or more of the following:
* The UDAnF can't be used with a Window Frame
* The UDAnF is not reversable in some way
* The UDAnF is not interested in optimal performance
*
* If not implemented, reset() followed by a series of
* nextValue() will be called for each movement of the Window
* Frame.
*
* If implemented, then each movement of the Window Frame will
* result in dropValue() being called for each row falling out
* of the Frame and nextValue() being called for each new row
* coming into the Frame.
*
* valsDropped (in) - a vector of the parameters from the row
* leaving the Frame
*
* dropValue() will not be called for unbounded/current row type
* frames, as those are already optimized.
*/
virtual ReturnCode dropValue(mcsv1Context* context,
std::vector<ColumnDatum>& valsDropped);
/**
* createUserData()
*
* Optional -- If defined, the server will call this instead of
* createUserData on context.
*
* Create your variable length data structure via
* data = new <datatype>
*
* The data structure may contain references to containers or
* pointers to other objects. Remember that for distributed
* processing, this may be called multiple times for variaous
* computing blocks. At the least, it will be called once per PM
* that processes the data, and once more for the UM. For UDAnF,
* it may only be called once.
*
* Set length to the length of the data structure you create.
*
* For each call to createUserData(), there will be a
* corresponding deleteUserData() where you must clean up. Any
* memory leaks are your fault.
*
*/
virtual ReturnCode createUserData(UserData*& data, int32_t& length);
protected:
};
}; // namespace
#undef EXPORT
#endif // HEADER_mode.h