1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-05 15:41:14 +03:00

MCOL-5153 This patch replaces the MDB collation-aware hash function with (#2488)
exact functionality that does not use the MDB hash function.
This patch also takes a bit from the Robin Hood hash map implementation
that reduces the hash function collision rate.
This commit is contained in:
Roman Nozdrin
2022-08-07 02:36:03 +03:00
committed by GitHub
parent af9caf8d6e
commit dd96e686c0
4 changed files with 59 additions and 15 deletions

View File

@ -136,8 +136,7 @@ class Charset
protected: protected:
const struct charset_info_st* mCharset; const struct charset_info_st* mCharset;
private: private:
static constexpr uint flags_ = MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN; static constexpr const uint flags_ = MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN;
public: public:
Charset(CHARSET_INFO& cs) : mCharset(&cs) Charset(CHARSET_INFO& cs) : mCharset(&cs)
{ {
@ -209,6 +208,10 @@ class Charset
assert(len <= sizeof(T)); assert(len <= sizeof(T));
return ret; return ret;
} }
static uint getDefaultFlags()
{
return flags_;
}
}; };
class CollationAwareHasher : public Charset class CollationAwareHasher : public Charset

View File

@ -103,6 +103,8 @@
2055 ERR_DISKAGG_TOO_BIG Not enough memory to make disk-based aggregation. Raise TotalUmMemory if possible. 2055 ERR_DISKAGG_TOO_BIG Not enough memory to make disk-based aggregation. Raise TotalUmMemory if possible.
2056 ERR_DISKAGG_FILEIO_ERROR There was an IO error during a disk-based aggregation: %1% 2056 ERR_DISKAGG_FILEIO_ERROR There was an IO error during a disk-based aggregation: %1%
2057 ERR_JOIN_RESULT_TOO_BIG Not enough memory to consolidate join results. Estimated %1% MB needed. TotalUmMemory is %2% MB. 2057 ERR_JOIN_RESULT_TOO_BIG Not enough memory to consolidate join results. Estimated %1% MB needed. TotalUmMemory is %2% MB.
2058 ERR_DISKAGG_OVERFLOW1 The hash function used produces a lot of hash collisions (1).
2059 ERR_DISKAGG_OVERFLOW2 The hash function used produces a lot of hash collisions (2).
# Sub-query errors # Sub-query errors
3001 ERR_NON_SUPPORT_SUB_QUERY_TYPE This subquery type is not supported yet. 3001 ERR_NON_SUPPORT_SUB_QUERY_TYPE This subquery type is not supported yet.

View File

@ -166,10 +166,24 @@ uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol)
case execplan::CalpontSystemCatalog::VARCHAR: case execplan::CalpontSystemCatalog::VARCHAR:
case execplan::CalpontSystemCatalog::BLOB: case execplan::CalpontSystemCatalog::BLOB:
case execplan::CalpontSystemCatalog::TEXT: case execplan::CalpontSystemCatalog::TEXT:
h.add(r.getCharset(i), r.getConstString(i)); {
strHashUsed = true; auto strColValue = r.getConstString(i);
if (strColValue.length() > MaxConstStrSize)
{
h.add(r.getCharset(i), strColValue);
strHashUsed = true;
}
else
{
auto cs = r.getCharset(i);
uchar buf[MaxConstStrBufSize];
uint nActualWeights = cs->strnxfrm(buf, MaxConstStrBufSize, MaxConstStrBufSize,
reinterpret_cast<const uchar*>(strColValue.str()), strColValue.length(),
datatypes::Charset::getDefaultFlags());
ret = hashData(buf, nActualWeights, ret);
}
break; break;
}
default: ret = hashData(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break; default: ret = hashData(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break;
} }
} }
@ -1820,15 +1834,26 @@ void RowAggStorage::increaseSize()
if (fCurData->fSize * maxMaskMultiplierWoRehashing < calcMaxSize(fCurData->fMask + 1)) if (fCurData->fSize * maxMaskMultiplierWoRehashing < calcMaxSize(fCurData->fMask + 1))
{ {
// something strange happens... // something strange happens...
throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_ERROR), throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_OVERFLOW2),
logging::ERR_DISKAGG_ERROR); logging::ERR_DISKAGG_OVERFLOW2);
} }
auto freeMem = fMM->getFree(); auto freeMem = fMM->getFree();
if (fEnabledDiskAggregation || if (fEnabledDiskAggregation ||
freeMem > (fMM->getUsed() + fCurData->fHashes->memUsage() + fStorage->getAproxRGSize()) * 2) freeMem > (fMM->getUsed() + fCurData->fHashes->memUsage() + fStorage->getAproxRGSize()) * 2)
{ {
rehashPowerOfTwo((fCurData->fMask + 1) * 2); if (fCurData->fSize * 2 < maxSize)
{
// we have to resize, even though there would still be plenty of space left!
// Try to rehash instead. Delete freed memory so we don't steadyily increase mem in case
// we have to rehash a few times
nextHashMultiplier();
rehashPowerOfTwo(fCurData->fMask + 1);
}
else
{
rehashPowerOfTwo((fCurData->fMask + 1) * 2);
}
} }
else if (fGeneration < MAX_INMEMORY_GENS - 1) else if (fGeneration < MAX_INMEMORY_GENS - 1)
{ {
@ -1888,8 +1913,8 @@ void RowAggStorage::insertSwap(size_t oldIdx, RowPosHashStorage* oldHashes)
{ {
if (fCurData->fMaxSize == 0 && !tryIncreaseInfo()) if (fCurData->fMaxSize == 0 && !tryIncreaseInfo())
{ {
throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_ERROR), throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_OVERFLOW1),
logging::ERR_DISKAGG_ERROR); logging::ERR_DISKAGG_OVERFLOW1);
} }
size_t idx{}; size_t idx{};

View File

@ -36,6 +36,9 @@ class RowGroupStorage;
uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol); uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol);
constexpr const size_t MaxConstStrSize = 2048ULL;
constexpr const size_t MaxConstStrBufSize = MaxConstStrSize << 1;
class RowAggStorage class RowAggStorage
{ {
public: public:
@ -161,6 +164,9 @@ class RowAggStorage
*/ */
inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx, const Data* curData) const inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx, const Data* curData) const
{ {
// An addition from the original robin hood HM.
h *= fCurData->hashMultiplier_;
h ^= h >> 33U;
info = curData->fInfoInc + static_cast<uint32_t>((h & INFO_MASK) >> curData->fInfoHashShift); info = curData->fInfoInc + static_cast<uint32_t>((h & INFO_MASK) >> curData->fInfoHashShift);
idx = (h >> INIT_INFO_BITS) & curData->fMask; idx = (h >> INIT_INFO_BITS) & curData->fMask;
} }
@ -230,6 +236,13 @@ class RowAggStorage
info = fCurData->fInfo[idx]; info = fCurData->fInfo[idx];
} }
void nextHashMultiplier()
{
// adding an *even* number, so that the multiplier will always stay odd. This is necessary
// so that the hash stays a mixing function (and thus doesn't have any information loss).
fCurData->hashMultiplier_ += 0xc4ceb9fe1a85ec54;
}
/** @brief Increase internal data size if needed /** @brief Increase internal data size if needed
*/ */
void increaseSize(); void increaseSize();
@ -325,6 +338,7 @@ class RowAggStorage
size_t fSize{0}; size_t fSize{0};
size_t fMask{0}; size_t fMask{0};
size_t fMaxSize{0}; size_t fMaxSize{0};
uint64_t hashMultiplier_{0xc4ceb9fe1a85ec53ULL};
uint32_t fInfoInc{INIT_INFO_INC}; uint32_t fInfoInc{INIT_INFO_INC};
uint32_t fInfoHashShift{INIT_INFO_HASH_SHIFT}; uint32_t fInfoHashShift{INIT_INFO_HASH_SHIFT};
}; };