1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-01 06:46:55 +03:00

MCOL-641 Refactor initial extent elimination support.

This commit also adds support in TupleHashJoinStep::forwardCPData,
although we currently do not support wide decimals as join keys.

Row estimation used to determine the large side of the join is also updated.
This commit is contained in:
Gagan Goel
2020-07-24 19:04:25 -04:00
committed by Roman Nozdrin
parent ca53b6348a
commit d3bc68b02f
32 changed files with 1221 additions and 386 deletions

View File

@ -30,6 +30,8 @@
#include <stdint.h>
#include <string.h>
using int128_t = __int128;
namespace utils
{
/** @brief class Hasher
@ -346,6 +348,25 @@ public:
}
};
// TODO a copy of these classes also exists in primitiveprocessor.h; consolidate
/** @brief Hash functor for 128-bit integers.
 *
 * Both 64-bit halves of the value contribute to the hash, so keys that
 * differ only in their upper half no longer collide systematically.
 * The halves are extracted with shifts instead of reinterpreting the
 * object's storage through a uint64_t pointer, which keeps the result
 * independent of host byte order and avoids pointer type-punning.
 *
 * For any value whose upper 64 bits are zero the result is simply the
 * low 64 bits of the value.
 *
 * NOTE(review): the TODO above says a copy of this class lives in
 * primitiveprocessor.h; confirm nothing relies on both copies producing
 * identical hash values before/while consolidating.
 */
class Hash128
{
public:
    /** @param i the 128-bit key
     *  @return a hash derived from all 128 bits of i
     */
    inline size_t operator()(const int128_t i) const
    {
        const unsigned __int128 bits = static_cast<unsigned __int128>(i);
        const uint64_t lo = static_cast<uint64_t>(bits);
        const uint64_t hi = static_cast<uint64_t>(bits >> 64);
        // Multiply the upper half by an odd 64-bit constant before folding
        // it in, so that hi and lo do not cancel when they are equal.
        return static_cast<size_t>(lo ^ (hi * 0x9E3779B97F4A7C15ULL));
    }
};
/** @brief Equality predicate for 128-bit integers.
 *
 * Intended as the key-equality functor paired with Hash128 in hashed
 * containers keyed on int128_t.
 */
class Equal128
{
public:
    /** @return true when both operands hold the same 128-bit value. */
    inline bool operator()(const int128_t f1, const int128_t f2) const
    {
        const bool differ = (f1 != f2);
        return !differ;
    }
};
//------------------------------------------------------------------------------
/** @brief class TupleHasher
*

View File

@ -32,6 +32,9 @@ namespace utils
const uint8_t MAXLENGTH16BYTES = 42;
const uint8_t MAXLENGTH8BYTES = 23;
// Smallest signed 128-bit value, -(2^127): negate the all-ones-but-sign
// pattern (2^127 - 1) and subtract one.
const int128_t minInt128 = -static_cast<int128_t>(~static_cast<unsigned __int128>(0) >> 1) - 1;
// Largest signed 128-bit value, 2^127 - 1.
const int128_t maxInt128 = -(minInt128 + 1);
inline bool isWideDecimalNullValue(const int128_t& val)
{
const uint64_t* ptr = reinterpret_cast<const uint64_t*>(&val);

View File

@ -161,21 +161,10 @@ const int32_t MIN_TIMESTAMP_VALUE = 0;
namespace dataconvert
{
// Decimal has maximum 38 digits with 3 extra chars for dot(.), minus(-), null character(\0)
const int MAX_DECIMAL_STRING_LENGTH = 41;
// WIP MCOL-641
using int128_t = __int128;
using uint128_t = unsigned __int128;
// Plain two-word layout for a 128-bit integer: `lo` is stored first and
// `hi` immediately after it.
// NOTE(review): which word is numerically "low" when this is overlaid on an
// __int128 depends on host byte order - presumably little-endian; confirm.
struct Int128Pod_struct
{
    uint64_t lo;  // first 64-bit word
    uint64_t hi;  // second 64-bit word
};
using Int128Pod_t = Int128Pod_struct;
enum CalpontDateTimeFormat
{
CALPONTDATE_ENUM = 1, // date format is: "YYYY-MM-DD"
@ -1067,27 +1056,6 @@ public:
static size_t writeFractionalPart(int128_t* dec, char* p, const unsigned int buflen,
const uint8_t scale);
/** @brief Stores the largest signed 128-bit value (2^127 - 1) into i.
 *
 * The value is computed arithmetically rather than written through a
 * reinterpret_cast to a two-word struct, so it does not depend on host
 * byte order and does not access the integer through an unrelated type.
 *
 * @param i receives the maximum value; its previous content is ignored.
 */
static inline void int128Max(int128_t& i)
{
    // All 128 bits set, shifted right once: every bit except the sign bit.
    i = static_cast<int128_t>(~static_cast<uint128_t>(0) >> 1);
}
/** @brief Stores the smallest signed 128-bit value (-(2^127)) into i.
 *
 * The value is computed arithmetically rather than written through a
 * reinterpret_cast to a two-word struct, so it does not depend on host
 * byte order and does not access the integer through an unrelated type.
 *
 * @param i receives the minimum value; its previous content is ignored.
 */
static inline void int128Min(int128_t& i)
{
    // -(2^127 - 1) - 1 reaches the minimum without any out-of-range
    // intermediate value.
    i = -static_cast<int128_t>(~static_cast<uint128_t>(0) >> 1) - 1;
}
/** @brief Stores the largest unsigned 128-bit value (2^128 - 1) into i.
 *
 * The value is computed arithmetically rather than written through a
 * reinterpret_cast to a two-word struct, so it does not depend on host
 * byte order and does not access the integer through an unrelated type.
 *
 * @param i receives the maximum value; its previous content is ignored.
 */
static inline void uint128Max(uint128_t& i)
{
    // Complement of zero sets all 128 bits.
    i = ~static_cast<uint128_t>(0);
}
static inline std::string constructRegexp(const std::string& str);
static inline void trimWhitespace(int64_t& charData);
static inline bool isEscapedChar(char c)

View File

@ -20,16 +20,19 @@
#include <algorithm>
#include <vector>
#include <limits>
#ifdef _MSC_VER
#include <unordered_set>
#else
#ifndef _MSC_VER
#include <tr1/unordered_set>
#else
#include <unordered_set>
#endif
#include "hasher.h"
#include "lbidlist.h"
#include "spinlock.h"
#include "vlarray.h"
#include "widedecimalutils.h"
using namespace std;
using namespace rowgroup;
using namespace utils;
@ -102,18 +105,38 @@ TupleJoiner::TupleJoiner(
smallKeyColumns.push_back(smallJoinColumn);
largeKeyColumns.push_back(largeJoinColumn);
discreteValues.reset(new bool[1]);
cpValues.reset(new vector<int64_t>[1]);
cpValues.reset(new vector<int128_t>[1]);
discreteValues[0] = false;
if (smallRG.isUnsigned(smallKeyColumns[0]))
{
cpValues[0].push_back(numeric_limits<uint64_t>::max());
cpValues[0].push_back(0);
if (datatypes::Decimal::isWideDecimalType(
smallRG.getColType(smallKeyColumns[0]),
smallRG.getColumnWidth(smallKeyColumns[0])))
{
cpValues[0].push_back((int128_t) -1);
cpValues[0].push_back(0);
}
else
{
cpValues[0].push_back((int128_t) numeric_limits<uint64_t>::max());
cpValues[0].push_back(0);
}
}
else
{
cpValues[0].push_back(numeric_limits<int64_t>::max());
cpValues[0].push_back(numeric_limits<int64_t>::min());
if (datatypes::Decimal::isWideDecimalType(
smallRG.getColType(smallKeyColumns[0]),
smallRG.getColumnWidth(smallKeyColumns[0])))
{
cpValues[0].push_back(utils::maxInt128);
cpValues[0].push_back(utils::minInt128);
}
else
{
cpValues[0].push_back((int128_t) numeric_limits<int64_t>::max());
cpValues[0].push_back((int128_t) numeric_limits<int64_t>::min());
}
}
if (smallRG.isUnsigned(smallJoinColumn) != largeRG.isUnsigned(largeJoinColumn))
@ -195,20 +218,40 @@ TupleJoiner::TupleJoiner(
storedKeyAlloc[i].setAllocSize(keyLength);
discreteValues.reset(new bool[smallKeyColumns.size()]);
cpValues.reset(new vector<int64_t>[smallKeyColumns.size()]);
cpValues.reset(new vector<int128_t>[smallKeyColumns.size()]);
for (i = 0; i < smallKeyColumns.size(); i++)
{
discreteValues[i] = false;
if (isUnsigned(smallRG.getColTypes()[smallKeyColumns[i]]))
{
cpValues[i].push_back(static_cast<int64_t>(numeric_limits<uint64_t>::max()));
cpValues[i].push_back(0);
if (datatypes::Decimal::isWideDecimalType(
smallRG.getColType(smallKeyColumns[i]),
smallRG.getColumnWidth(smallKeyColumns[i])))
{
cpValues[i].push_back((int128_t) -1);
cpValues[i].push_back(0);
}
else
{
cpValues[i].push_back((int128_t) numeric_limits<uint64_t>::max());
cpValues[i].push_back(0);
}
}
else
{
cpValues[i].push_back(numeric_limits<int64_t>::max());
cpValues[i].push_back(numeric_limits<int64_t>::min());
if (datatypes::Decimal::isWideDecimalType(
smallRG.getColType(smallKeyColumns[i]),
smallRG.getColumnWidth(smallKeyColumns[i])))
{
cpValues[i].push_back(utils::maxInt128);
cpValues[i].push_back(utils::minInt128);
}
else
{
cpValues[i].push_back(numeric_limits<int64_t>::max());
cpValues[i].push_back(numeric_limits<int64_t>::min());
}
}
}
}
@ -678,8 +721,9 @@ void TupleJoiner::doneInserting()
for (col = 0; col < smallKeyColumns.size(); col++)
{
tr1::unordered_set<int64_t> uniquer;
tr1::unordered_set<int64_t>::iterator uit;
typedef std::tr1::unordered_set<int128_t, utils::Hash128, utils::Equal128> unordered_set_int128;
unordered_set_int128 uniquer;
unordered_set_int128::iterator uit;
sthash_t::iterator sthit;
hash_t::iterator hit;
ldhash_t::iterator ldit;
@ -758,6 +802,12 @@ void TupleJoiner::doneInserting()
}
}
}
else if (datatypes::Decimal::isWideDecimalType(
smallRow.getColType(smallKeyColumns[col]),
smallRow.getColumnWidth(smallKeyColumns[col])))
{
uniquer.insert(*((int128_t*)smallRow.getBinaryField<int128_t>(smallKeyColumns[col])));
}
else if (smallRow.isUnsigned(smallKeyColumns[col]))
{
uniquer.insert((int64_t)smallRow.getUintField(smallKeyColumns[col]));
@ -1080,21 +1130,22 @@ void TupleJoiner::updateCPData(const Row& r)
{
int64_t val = r.getIntField(colIdx);
if (order_swap(val) < order_swap(min) ||
min == numeric_limits<int64_t>::max())
if (order_swap(val) < order_swap((int64_t) min) ||
((int64_t) min) == numeric_limits<int64_t>::max())
{
min = val;
}
if (order_swap(val) > order_swap(max) ||
max == numeric_limits<int64_t>::min())
if (order_swap(val) > order_swap((int64_t) max) ||
((int64_t) max) == numeric_limits<int64_t>::min())
{
max = val;
}
}
else if (r.isUnsigned(colIdx))
{
uint64_t uval;
uint128_t uval;
if (r.getColType(colIdx) == CalpontSystemCatalog::LONGDOUBLE)
{
double dval = (double)roundl(r.getLongDoubleField(smallKeyColumns[col]));
@ -1114,20 +1165,27 @@ void TupleJoiner::updateCPData(const Row& r)
}
}
}
else if (datatypes::Decimal::isWideDecimalType(
r.getColType(colIdx),
r.getColumnWidth(colIdx)))
{
uval = *((int128_t*)r.getBinaryField<int128_t>(colIdx));
}
else
{
uval = r.getUintField(colIdx);
}
if (uval > static_cast<uint64_t>(max))
max = static_cast<int64_t>(uval);
if (uval > static_cast<uint128_t>(max))
max = static_cast<int128_t>(uval);
if (uval < static_cast<uint64_t>(min))
min = static_cast<int64_t>(uval);
if (uval < static_cast<uint128_t>(min))
min = static_cast<int128_t>(uval);
}
else
{
int64_t val = 0;
int128_t val = 0;
if (r.getColType(colIdx) == CalpontSystemCatalog::LONGDOUBLE)
{
double dval = (double)roundl(r.getLongDoubleField(colIdx));
@ -1147,13 +1205,12 @@ void TupleJoiner::updateCPData(const Row& r)
}
}
}
else if (r.getColumnWidth(colIdx) == datatypes::MAXDECIMALWIDTH
&& (r.getColType(colIdx) == CalpontSystemCatalog::DECIMAL
|| r.getColType(colIdx) == CalpontSystemCatalog::UDECIMAL))
else if (datatypes::Decimal::isWideDecimalType(
r.getColType(colIdx),
r.getColumnWidth(colIdx)))
{
// WIP MCOL-641
val = *((int128_t*)r.getBinaryField<int128_t>(colIdx));
}
else
{
val = r.getIntField(colIdx);
@ -1681,20 +1738,40 @@ boost::shared_ptr<TupleJoiner> TupleJoiner::copyForDiskJoin()
ret->uniqueLimit = uniqueLimit;
ret->discreteValues.reset(new bool[smallKeyColumns.size()]);
ret->cpValues.reset(new vector<int64_t>[smallKeyColumns.size()]);
ret->cpValues.reset(new vector<int128_t>[smallKeyColumns.size()]);
for (uint32_t i = 0; i < smallKeyColumns.size(); i++)
{
ret->discreteValues[i] = false;
if (isUnsigned(smallRG.getColTypes()[smallKeyColumns[i]]))
{
ret->cpValues[i].push_back(static_cast<int64_t>(numeric_limits<uint64_t>::max()));
ret->cpValues[i].push_back(0);
if (datatypes::Decimal::isWideDecimalType(
smallRG.getColType(smallKeyColumns[i]),
smallRG.getColumnWidth(smallKeyColumns[i])))
{
ret->cpValues[i].push_back((int128_t) -1);
ret->cpValues[i].push_back(0);
}
else
{
ret->cpValues[i].push_back((int128_t) numeric_limits<uint64_t>::max());
ret->cpValues[i].push_back(0);
}
}
else
{
ret->cpValues[i].push_back(numeric_limits<int64_t>::max());
ret->cpValues[i].push_back(numeric_limits<int64_t>::min());
if (datatypes::Decimal::isWideDecimalType(
smallRG.getColType(smallKeyColumns[i]),
smallRG.getColumnWidth(smallKeyColumns[i])))
{
ret->cpValues[i].push_back(utils::maxInt128);
ret->cpValues[i].push_back(utils::minInt128);
}
else
{
ret->cpValues[i].push_back(numeric_limits<int64_t>::max());
ret->cpValues[i].push_back(numeric_limits<int64_t>::min());
}
}
}

View File

@ -287,7 +287,7 @@ public:
{
return discreteValues;
}
inline const boost::scoped_array<std::vector<int64_t> >& getCPData()
// Returns, by const reference, the array of per-key-column value vectors
// held in cpValues. Per the comment on the cpValues member: if
// !discreteValues, [0] has min, [1] has max.
inline const boost::scoped_array<std::vector<int128_t> >& getCPData()
{
return cpValues;
}
@ -413,7 +413,7 @@ private:
/* Runtime casual partitioning support */
void updateCPData(const rowgroup::Row& r);
boost::scoped_array<bool> discreteValues;
boost::scoped_array<std::vector<int64_t> > cpValues; // if !discreteValues, [0] has min, [1] has max
boost::scoped_array<std::vector<int128_t> > cpValues; // if !discreteValues, [0] has min, [1] has max
uint32_t uniqueLimit;
bool finished;

View File

@ -66,6 +66,7 @@ typedef const struct charset_info_st CHARSET_INFO;
// Workaround for my_global.h #define of isnan(X) causing a std::std namespace
using int128_t = __int128;
using uint128_t = unsigned __int128;
namespace rowgroup
{