MCOL-523 Add avg_mode function

2025-08-01 06:46:55 +03:00 · 2017-08-11 12:46:45 -05:00
parent 4eafaa8682
commit 7293ec522c
2 changed files with 592 additions and 0 deletions
--- a/utils/udfsdk/avg_mode.cpp
+++ b/utils/udfsdk/avg_mode.cpp
@ -0,0 +1,298 @@
+/* Copyright (C) 2017 MariaDB Corporaton
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#include <sstream>
+#include <cstring>
+#include <typeinfo>
+#include "avg_mode.h"
+#include "bytestream.h"
+#include "objectreader.h"
+
+using namespace mcsv1sdk;
+
+mcsv1_UDAF::ReturnCode avg_mode::init(mcsv1Context* context,
+									 COL_TYPES& colTypes)
+{
+	if (colTypes.size() < 1)
+	{
+		// The error message will be prepended with
+		// "The storage engine for the table doesn't support "
+		context->setErrorMessage("avg_mode() with 0 arguments");
+		return mcsv1_UDAF::ERROR;
+	}
+	if (colTypes.size() > 1)
+	{
+		context->setErrorMessage("avg_mode() with more than 1 argument");
+		return mcsv1_UDAF::ERROR;
+	}
+
+	if (!(isNumeric(colTypes[0].second)))
+	{
+		// The error message will be prepended with
+		// "The storage engine for the table doesn't support "
+		context->setErrorMessage("avg_mode() with non-numeric argument");
+		return mcsv1_UDAF::ERROR;
+	}
+
+	context->setResultType(CalpontSystemCatalog::DOUBLE);
+	context->setColWidth(8);
+	context->setScale(context->getScale()*2);
+	context->setPrecision(19);
+	context->setRunFlag(mcsv1sdk::UDAF_IGNORE_NULLS);
+	return mcsv1_UDAF::SUCCESS;
+
+}
+
+mcsv1_UDAF::ReturnCode avg_mode::finish(mcsv1Context* context)
+{
+	return mcsv1_UDAF::SUCCESS;
+}
+
+mcsv1_UDAF::ReturnCode avg_mode::reset(mcsv1Context* context)
+{
+	ModeData* data = static_cast<ModeData*>(context->getUserData());
+	data->mData.clear();
+	return mcsv1_UDAF::SUCCESS;
+}
+
+mcsv1_UDAF::ReturnCode avg_mode::nextValue(mcsv1Context* context, 
+										  std::vector<ColumnDatum>& valsIn)
+{
+	static_any::any& valIn = valsIn[0].columnData;
+	MODE_DATA& data = static_cast<ModeData*>(context->getUserData())->mData;
+	DATATYPE val = 0.0;
+
+	if (valIn.empty())
+	{
+		return mcsv1_UDAF::SUCCESS; // Ought not happen when UDAF_IGNORE_NULLS is on.
+	}
+
+	if (valIn.compatible(charTypeId))
+	{
+		val = valIn.cast<char>();
+	}
+	else if (valIn.compatible(scharTypeId))
+	{
+		val = valIn.cast<signed char>();
+	}
+	else if (valIn.compatible(shortTypeId))
+	{
+		val = valIn.cast<short>();
+	}
+	else if (valIn.compatible(intTypeId))
+	{
+		val = valIn.cast<int>();
+	}
+	else if (valIn.compatible(longTypeId))
+	{
+		val = valIn.cast<long>();
+	}
+	else if (valIn.compatible(llTypeId))
+	{
+		val = valIn.cast<long long>();
+	}
+	else if (valIn.compatible(ucharTypeId))
+	{
+		val = valIn.cast<unsigned char>();
+	}
+	else if (valIn.compatible(ushortTypeId))
+	{
+		val = valIn.cast<unsigned short>();
+	}
+	else if (valIn.compatible(uintTypeId))
+	{
+		val = valIn.cast<unsigned int>();
+	}
+	else if (valIn.compatible(ulongTypeId))
+	{
+		val = valIn.cast<unsigned long>();
+	}
+	else if (valIn.compatible(ullTypeId))
+	{
+		val = valIn.cast<unsigned long long>();
+	}
+	else if (valIn.compatible(floatTypeId))
+	{
+		val = valIn.cast<float>();
+	}
+	else if (valIn.compatible(doubleTypeId))
+	{
+		val = valIn.cast<double>();
+	}
+
+	// For decimal types, we need to move the decimal point.
+	uint32_t scale = valsIn[0].scale;
+	if (val != 0 && scale > 0)
+	{
+		val /= pow(10.0, (double)scale);
+	}
+	data[val]++;
+
+	return mcsv1_UDAF::SUCCESS;
+}
+
+mcsv1_UDAF::ReturnCode avg_mode::subEvaluate(mcsv1Context* context, const UserData* userDataIn)
+{
+	if (!userDataIn)
+	{
+		return mcsv1_UDAF::SUCCESS;
+	}
+	MODE_DATA& outData = static_cast<ModeData*>(context->getUserData())->mData;
+	const MODE_DATA& inData = static_cast<const ModeData*>(userDataIn)->mData;
+	MODE_DATA::const_iterator iter = inData.begin();
+	for (; iter != inData.end(); ++iter)
+	{
+		outData[iter->first] += iter->second;
+	}
+	return mcsv1_UDAF::SUCCESS;
+}
+
+mcsv1_UDAF::ReturnCode avg_mode::evaluate(mcsv1Context* context, static_any::any& valOut)
+{
+	uint64_t maxCnt=0;
+	MODE_DATA& data = static_cast<ModeData*>(context->getUserData())->mData;
+	if (data.size() == 0)
+	{
+		valOut = (DATATYPE)0;
+		return mcsv1_UDAF::SUCCESS;
+	}
+	MODE_DATA::iterator iter(data.begin());
+	for (; iter != data.end(); ++iter)
+	{
+		if (iter->second > maxCnt)
+		{
+			valOut = iter->first;
+			maxCnt = iter->second;
+		}
+	}
+	return mcsv1_UDAF::SUCCESS;
+}
+
+mcsv1_UDAF::ReturnCode avg_mode::dropValue(mcsv1Context* context, 
+													std::vector<ColumnDatum>& valsDropped)
+{
+	static_any::any& valIn = valsDropped[0].columnData;
+	MODE_DATA& data = static_cast<ModeData*>(context->getUserData())->mData;
+	DATATYPE val = 0.0;
+
+	if (valIn.empty())
+	{
+		return mcsv1_UDAF::SUCCESS; // Ought not happen when UDAF_IGNORE_NULLS is on.
+	}
+
+	if (valIn.compatible(charTypeId))
+	{
+		val = valIn.cast<char>();
+	}
+	else if (valIn.compatible(scharTypeId))
+	{
+		val = valIn.cast<signed char>();
+	}
+	else if (valIn.compatible(shortTypeId))
+	{
+		val = valIn.cast<short>();
+	}
+	else if (valIn.compatible(intTypeId))
+	{
+		val = valIn.cast<int>();
+	}
+	else if (valIn.compatible(longTypeId))
+	{
+		val = valIn.cast<long>();
+	}
+	else if (valIn.compatible(llTypeId))
+	{
+		val = valIn.cast<long long>();
+	}
+	else if (valIn.compatible(ucharTypeId))
+	{
+		val = valIn.cast<unsigned char>();
+	}
+	else if (valIn.compatible(ushortTypeId))
+	{
+		val = valIn.cast<unsigned short>();
+	}
+	else if (valIn.compatible(uintTypeId))
+	{
+		val = valIn.cast<unsigned int>();
+	}
+	else if (valIn.compatible(ulongTypeId))
+	{
+		val = valIn.cast<unsigned long>();
+	}
+	else if (valIn.compatible(ullTypeId))
+	{
+		val = valIn.cast<unsigned long long>();
+	}
+	else if (valIn.compatible(floatTypeId))
+	{
+		val = valIn.cast<float>();
+	}
+	else if (valIn.compatible(doubleTypeId))
+	{
+		val = valIn.cast<double>();
+	}
+
+	// For decimal types, we need to move the decimal point.
+	uint32_t scale = valsDropped[0].scale;
+	if (val != 0 && scale > 0)
+	{
+		val /= pow(10.0, (double)scale);
+	}
+
+	data[val]--;
+
+	return mcsv1_UDAF::SUCCESS;
+}
+
+mcsv1_UDAF::ReturnCode avg_mode::createUserData(UserData*& userData, int32_t& length)
+{
+	userData = new ModeData;
+	length = sizeof(ModeData);
+	return mcsv1_UDAF::SUCCESS;
+}
+
+void ModeData::serialize(messageqcpp::ByteStream& bs) const
+{
+	MODE_DATA::const_iterator iter = mData.begin();
+	DATATYPE num;
+	uint32_t cnt;
+	bs << (int32_t)mData.size();
+	for (; iter != mData.end(); ++iter)
+	{
+		num = iter->first;
+		bs << num;
+		cnt = iter->second;
+		bs << cnt;
+	}
+}
+
+void ModeData::unserialize(messageqcpp::ByteStream& bs)
+{
+	mData.clear();
+	int32_t sz;
+	DATATYPE num;
+	uint32_t cnt;
+	bs >> sz;
+	for (int i = 0; i < sz; ++i)
+	{
+		bs >> num;
+		bs >> cnt;
+		mData[num] = cnt;
+	}
+}
+
--- a/utils/udfsdk/avg_mode.h
+++ b/utils/udfsdk/avg_mode.h
@ -0,0 +1,294 @@
+/* Copyright (C) 2017 MariaDB Corporaton
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+/***********************************************************************
+*   $Id$
+*
+*   mcsv1_UDAF.h
+***********************************************************************/
+
+/** 
+ * Columnstore interface for writing a User Defined Aggregate 
+ * Functions (UDAF) and User Defined Analytic Functions (UDAnF) 
+ * or a function that can act as either - UDA(n)F 
+ *  
+ * The basic steps are:
+ *
+ * 1. Create a the UDA(n)F function interface in some .h file. 
+ * 2. Create the UDF function implementation in some .cpp file 
+ * 3. Create the connector stub (MariaDB UDAF definition) for 
+ * this UDF function.  
+ * 4. build the dynamic library using all of the source. 
+ * 5  Put the library in $COLUMNSTORE_INSTALL/lib of 
+ * all modules 
+ * 6. restart the Columnstore system. 
+ * 7. notify mysqld about the new function:
+ *  
+ *    CREATE AGGREGATE FUNCTION avg_mode returns REAL soname
+ *    'libudf_mysql.so';
+ *  
+ * The UDAF functions may run distributed in the Columnstore 
+ * engine. UDAnF do not run distributed. 
+ *  
+ * UDAF is User Defined Aggregate Function. 
+ * UDAnF is User Defined Analytic Function. 
+ * UDA(n)F is an acronym for a function that could be either. It 
+ * is also used to describe the interface that is used for 
+ * either. 
+ */
+#ifndef HEADER_mode
+#define HEADER_mode
+
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include <boost/any.hpp>
+#ifdef _MSC_VER
+#include <unordered_map>
+#else
+#include <tr1/unordered_map>
+#endif
+
+#include "mcsv1_udaf.h"
+#include "calpontsystemcatalog.h"
+#include "windowfunctioncolumn.h"
+using namespace execplan;
+
+#if defined(_MSC_VER) && defined(xxxRGNODE_DLLEXPORT)
+#define EXPORT __declspec(dllexport)
+#else
+#define EXPORT
+#endif
+
+namespace mcsv1sdk
+{
+
+#define DATATYPE double
+typedef std::tr1::unordered_map<DATATYPE, uint32_t> MODE_DATA;
+
+// Override UserData for data storage
+struct ModeData : public UserData
+{
+	ModeData() {};
+
+	virtual ~ModeData(){}
+
+	virtual void serialize(messageqcpp::ByteStream& bs) const;
+	virtual void unserialize(messageqcpp::ByteStream& bs);
+
+	MODE_DATA mData;
+private:
+	// For now, copy construction is unwanted
+	ModeData(UserData&);
+};
+
+// Override mcsv1_UDAF to build your User Defined Aggregate (UDAF) and/or 
+// User Defined Analytic Function (UDAnF).
+// These will be singleton classes, so don't put any instance
+// specific data in here. All instance data is stored in mcsv1Context
+// passed to each user function and retrieved by the getUserData() method.
+// 
+// Each API function returns a ReturnCode. If ERROR is returned at any time, 
+// the query is aborted, getInterrupted() will begin to return true and the 
+// message set in config->setErrorMessage() is returned to MariaDB. 
+
+// Return the avg_mode value of the dataset
+
+class avg_mode : public  mcsv1_UDAF
+{
+public:
+	// Defaults OK
+	avg_mode() : mcsv1_UDAF(){};
+	virtual ~avg_mode(){};
+
+	/** 
+	 * init() 
+	 *  
+	 * Mandatory. Implement this to initialize flags and instance 
+	 * data. Called once per SQL statement. You can do any sanity 
+	 * checks here. 
+	 *  
+	 * colTypes (in) - A vector of ColDataType defining the 
+	 * parameters of the UDA(n)F call. These can be used to decide 
+	 * to override the default return type. If desired, the new 
+	 * return type can be set by context->setReturnType() and 
+	 * decimal scale and precision can be set by context->setScale 
+	 * and context->setPrecision respectively. 
+	 *  
+	 * Return mcsv1_UDAF::ERROR on any error, such as non-compatible
+	 * colTypes or wrong number of arguments. Else return 
+	 * mcsv1_UDAF::SUCCESS. 
+	 */
+	virtual ReturnCode init(mcsv1Context* context,
+							COL_TYPES& colTypes);
+
+	/** 
+	 * finish() 
+	 *  
+	 * Mandatory. Completes the UDA(n)F. Called once per SQL 
+	 * statement. Do not free any memory allocated by 
+	 * context->setUserDataSize(). The SDK Framework owns that memory 
+	 * and will handle that. Often, there is nothing to do here. 
+	 */
+	virtual ReturnCode finish(mcsv1Context* context);
+
+	/** 
+	 * reset() 
+	 *  
+	 * Mandatory. Reset the UDA(n)F for a new group, partition or, 
+	 * in some cases, new Window Frame. Do not free any memory 
+	 * allocated by context->setUserDataSize(). The SDK Framework owns 
+	 * that memory and will handle that. Use this opportunity to 
+	 * reset any variables in context->getUserData() needed for the 
+	 * next aggregation. May be called multiple times if running in 
+	 * a ditributed fashion. 
+	 *  
+	 * Use this opportunity to initialize the userData.
+	 */
+	virtual ReturnCode reset(mcsv1Context* context);
+
+	/** 
+	 * nextValue() 
+	 *  
+	 * Mandatory. Handle a single row. 
+	 *  
+	 * colsIn - A vector of data structure describing the input 
+	 * data. 
+	 *  
+	 * This function is called once for every row in the filtered 
+	 * result set (before aggregation). It is very important that 
+	 * this function is efficient. 
+	 *  
+	 * If the UDAF is running in a distributed fashion, nextValue 
+	 * cannot depend on order, as it will only be called for each 
+	 * row found on the specific PM. 
+	 *  
+	 * valsIn (in) - a vector of the parameters from the row.
+	 */
+	virtual ReturnCode nextValue(mcsv1Context* context, 
+								 std::vector<ColumnDatum>& valsIn);
+
+	 /** 
+	  * subEvaluate() 
+	  *  
+	  * Mandatory -- Called if the UDAF is running in a distributed 
+	  * fashion. Columnstore tries to run all aggregate functions 
+	  * distributed, depending on context. 
+	  *  
+	  * Perform an aggregation on rows partially aggregated by 
+	  * nextValue. Columnstore calls nextValue for each row on a 
+	  * given PM for a group (GROUP BY). subEvaluate is called on the
+	  * UM to consolodate those values into a single instance of 
+	  * userData. Keep your aggregated totals in context's userData. 
+	  * The first time this is called for a group, reset() would have 
+	  * been called with this version of userData. 
+	  *  
+	  * Called for every partial data set in each group in GROUP BY.
+	  *  
+	  * When subEvaluate has been called for all subAggregated data 
+	  * sets, Evaluate will be called with the same context as here.
+	  *  
+	  * valIn (In) - This is a pointer to a memory block of the size 
+	  * set in setUserDataSize. It will contain the value of userData 
+	  * as seen in the last call to NextValue for a given PM.
+	  *  
+	  */
+	 virtual ReturnCode subEvaluate(mcsv1Context* context, const UserData* valIn);
+
+	/** 
+	 * evaluate() 
+	 *  
+	 * Mandatory. Get the aggregated value.
+	 *  
+	 * Called for every new group if UDAF GROUP BY, UDAnF partition 
+	 * or, in some cases, new Window Frame. 
+	 *  
+	 * Set the aggregated value into valOut. The datatype is assumed
+	 * to be the same as that set in the init() function; 
+	 *  
+	 * If the UDAF is running in a distributed fashion, evaluate is 
+	 * called after a series of subEvaluate calls. 
+	 *  
+	 * valOut (out) - Set the aggregated value here. The datatype is
+	 * assumed to be the same as that set in the init() function; 
+	 *  
+	 * To return a NULL value, don't assign to valOut.
+	 */
+	virtual ReturnCode evaluate(mcsv1Context* context, static_any::any& valOut);
+
+	/** 
+	 * dropValue() 
+	 *  
+	 * Optional -- If defined, the server will call this instead of 
+	 * reset for UDAnF. 
+	 *  
+	 * Don't implement if a UDAnF has one or more of the following: 
+	 * The UDAnF can't be used with a Window Frame
+	 * The UDAnF is not reversable in some way 
+	 * The UDAnF is not interested in optimal performance 
+	 *  
+	 * If not implemented, reset() followed by a series of 
+	 * nextValue() will be called for each movement of the Window 
+	 * Frame. 
+	 *  
+	 * If implemented, then each movement of the Window Frame will 
+	 * result in dropValue() being called for each row falling out 
+	 * of the Frame and nextValue() being called for each new row 
+	 * coming into the Frame. 
+	 *  
+	 * valsDropped (in) - a vector of the parameters from the row 
+	 * leaving the Frame 
+	 *  
+	 * dropValue() will not be called for unbounded/current row type
+	 * frames, as those are already optimized. 
+	 */
+	virtual ReturnCode dropValue(mcsv1Context* context, 
+								 std::vector<ColumnDatum>& valsDropped);
+
+	/** 
+	 * createUserData()
+	 *  
+	 * Optional -- If defined, the server will call this instead of 
+	 * createUserData on context. 
+	 *  
+	 * Create your variable length data structure via 
+	 * data = new <datatype> 
+	 *  
+	 * The data structure may contain references to containers or 
+	 * pointers to other objects. Remember that for distributed 
+	 * processing, this may be called multiple times for variaous 
+	 * computing blocks. At the least, it will be called once per PM 
+	 * that processes the data, and once more for the UM. For UDAnF, 
+	 * it may only be called once. 
+	 *  
+	 * Set length to the length of the data structure you create.
+	 *  
+	 * For each call to createUserData(), there will be a 
+	 * corresponding deleteUserData() where you must clean up. Any 
+	 * memory leaks are your fault. 
+	 *  
+	 */ 
+	virtual ReturnCode createUserData(UserData*& data, int32_t& length);
+protected:
+};
+
+};  // namespace
+
+#undef EXPORT
+
+#endif // HEADER_mode.h
+