1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

Mcol 5092 MODA uses wrong column width for some types (#2450)

* MCOL-5092 Ensure column width is correct for datatype
                       Change MODA return type to STRING
                       Modify MODA to handle every numeric type
* MCOL-5162 MODA to support char and varchar with collation support

Fixes to the aggregate bit functions
When we fixed the storage sign issue for MCOL-5092, it uncovered a problem in the bit aggregates (bit_and, bit_or and bit_xor). These aggregates should always return UBIGINT, but they relied on the type of the argument column, which gave bad results.
This commit is contained in:
David.Hall
2022-08-11 15:16:11 -05:00
committed by GitHub
parent c906172bf5
commit 2020f35e88
16 changed files with 594 additions and 111 deletions

View File

@ -45,6 +45,7 @@
#include "calpontsystemcatalog.h"
#include "windowfunctioncolumn.h"
#include "hasher.h"
#include "collation.h"
#if defined(_MSC_VER) && defined(xxxRGNODE_DLLEXPORT)
#define EXPORT __declspec(dllexport)
@ -58,6 +59,8 @@ namespace mcsv1sdk
template <class T>
struct hasher
{
hasher(uint32_t cs_num){}
inline size_t operator()(T val) const
{
return fHasher((char*)&val, sizeof(T));
@ -67,34 +70,74 @@ struct hasher
utils::Hasher fHasher;
};
// A special hasher for double that may only have 10 bytes
template <>
struct hasher<long double>
{
hasher(uint32_t cs_num){}
inline size_t operator()(long double val) const
{
if (sizeof(long double) == 8) // Probably just MSC, but you never know.
{
return fHasher((char*)&val, sizeof(long double));
}
else
{
// For Linux x86_64, long double is stored in 128 bits, but only 80 are significant
return fHasher((char*)&val, 10);
}
#ifdef MASK_LONGDOUBLE
// For Linux x86_64, long double is stored in 128 bits, but only 80 are significant
return fHasher((char*)&val, 10);
#else
return fHasher((char*)&val, sizeof(long double));
#endif
}
private:
utils::Hasher fHasher;
};
// A collation aware hasher for strings
template<>
struct hasher<string>
{
hasher(uint32_t cs_num) : fHasher(cs_num){}
inline size_t operator()(string val) const
{
return fHasher(val.c_str(), val.size());
}
private:
datatypes::CollationAwareHasher fHasher;
};
template<class T>
struct comparator
{
comparator(uint32_t cs_num){}
bool operator()(const T& lhs, const T& rhs) const
{
return lhs == rhs;
}
};
// A collation aware string comparator
template <>
struct comparator<std::string>
{
comparator(uint32_t cs_num) : fCs(cs_num) {}
bool operator()(const std::string lhs, const std::string rhs) const
{
return fCs.eq(lhs, rhs);
}
private:
datatypes::Charset fCs;
};
// Override UserData for data storage
struct ModaData : public UserData
{
ModaData()
ModaData(uint32_t cs_num = 8)
: fMap(NULL)
, fReturnType((uint32_t)execplan::CalpontSystemCatalog::UNDEFINED)
, fColWidth(0)
, modaImpl(NULL){};
, modaImpl(NULL)
, fCs_num(cs_num){}
virtual ~ModaData()
{
@ -105,22 +148,23 @@ struct ModaData : public UserData
virtual void unserialize(messageqcpp::ByteStream& bs);
template <class T>
std::unordered_map<T, uint32_t, hasher<T> >* getMap()
std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >* getMap()
{
if (!fMap)
{
// Just in time creation
fMap = new std::unordered_map<T, uint32_t, hasher<T> >;
fMap = new std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >(
10, hasher<T>(fCs_num), comparator<T>(fCs_num));
}
return (std::unordered_map<T, uint32_t, hasher<T> >*)fMap;
return (std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >*)fMap;
}
// The const version is only called by serialize()
// It shouldn't (and can't) create a new map.
template <class T>
std::unordered_map<T, uint32_t, hasher<T> >* getMap() const
std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >* getMap() const
{
return (std::unordered_map<T, uint32_t, hasher<T> >*)fMap;
return (std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >*)fMap;
}
template <class T>
@ -128,7 +172,7 @@ struct ModaData : public UserData
{
if (fMap)
{
delete (std::unordered_map<T, uint32_t, hasher<T> >*)fMap;
delete (std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >*)fMap;
fMap = NULL;
}
}
@ -148,6 +192,7 @@ struct ModaData : public UserData
uint32_t fReturnType;
uint32_t fColWidth;
mcsv1_UDAF* modaImpl; // A pointer to one of the Moda_impl_T concrete classes
uint32_t fCs_num;
private:
// For now, copy construction is unwanted
@ -159,10 +204,11 @@ struct ModaData : public UserData
template <class T>
void serializeMap(messageqcpp::ByteStream& bs) const
{
std::unordered_map<T, uint32_t, hasher<T> >* map = getMap<T>();
bs << fCs_num;
std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >* map = getMap<T>();
if (map)
{
typename std::unordered_map<T, uint32_t, hasher<T> >::const_iterator iter;
typename std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >::const_iterator iter;
bs << (uint64_t)map->size();
for (iter = map->begin(); iter != map->end(); ++iter)
{
@ -179,11 +225,13 @@ struct ModaData : public UserData
template <class T>
void unserializeMap(messageqcpp::ByteStream& bs)
{
bs >> fCs_num;
uint32_t cnt;
T num;
uint64_t sz;
bs >> sz;
std::unordered_map<T, uint32_t, hasher<T> >* map = getMap<T>();
std::unordered_map<T, uint32_t, hasher<T>, comparator<T> >* map = getMap<T>();
map->clear();
for (uint64_t i = 0; i < sz; ++i)
{
@ -217,6 +265,31 @@ class Moda_impl_T : public mcsv1_UDAF
}
};
template<> // string specialization
class Moda_impl_T<string> : public mcsv1_UDAF
{
public:
// Defaults OK
Moda_impl_T() : cs(8) {};
virtual ~Moda_impl_T() {};
virtual mcsv1_UDAF::ReturnCode init(mcsv1Context* context, ColumnDatum* colTypes);
virtual mcsv1_UDAF::ReturnCode reset(mcsv1Context* context);
virtual mcsv1_UDAF::ReturnCode nextValue(mcsv1Context* context, ColumnDatum* valsIn);
virtual mcsv1_UDAF::ReturnCode subEvaluate(mcsv1Context* context, const UserData* valIn);
virtual mcsv1_UDAF::ReturnCode evaluate(mcsv1Context* context, static_any::any& valOut);
virtual mcsv1_UDAF::ReturnCode dropValue(mcsv1Context* context, ColumnDatum* valsDropped);
// Dummy: not used
virtual mcsv1_UDAF::ReturnCode createUserData(UserData*& userData, int32_t& length)
{
return mcsv1_UDAF::SUCCESS;
}
private:
datatypes::Charset cs;
};
// moda returns the modal value of the dataset. If more than one value
// have the same maximum number of occurances, then the one closest to
// AVG wins. If two are the same distance from AVG, then the smaller wins.
@ -276,6 +349,7 @@ class moda : public mcsv1_UDAF
Moda_impl_T<float> moda_impl_float;
Moda_impl_T<double> moda_impl_double;
Moda_impl_T<long double> moda_impl_longdouble;
Moda_impl_T<string> moda_impl_string;
};
}; // namespace mcsv1sdk