You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-29 08:21:15 +03:00
MCOL-5153 This patch replaces the MDB collation-aware hash function with the (#2488)
exact same functionality implemented without the MDB hash function. This patch also restores a piece of the original Robin Hood hash map implementation that had been left out; it reduces the hash function collision rate.
This commit is contained in:
@ -136,8 +136,7 @@ class Charset
|
||||
protected:
|
||||
const struct charset_info_st* mCharset;
|
||||
private:
|
||||
static constexpr uint flags_ = MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN;
|
||||
|
||||
static constexpr const uint flags_ = MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN;
|
||||
public:
|
||||
Charset(CHARSET_INFO& cs) : mCharset(&cs)
|
||||
{
|
||||
@ -209,6 +208,10 @@ class Charset
|
||||
assert(len <= sizeof(T));
|
||||
return ret;
|
||||
}
|
||||
// Returns the class-wide strnxfrm flag set stored in flags_
// (MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN), so that code outside
// Charset can pass the same flags when calling strnxfrm() directly.
static uint getDefaultFlags()
|
||||
{
|
||||
return flags_;
|
||||
}
|
||||
};
|
||||
|
||||
class CollationAwareHasher : public Charset
|
||||
|
@ -10,14 +10,14 @@
|
||||
# token 1: error ID
|
||||
# token 2: internal error name
|
||||
# token 3: error message text to describe the error
|
||||
#
|
||||
#
|
||||
# The tokens should be separated by one tab character. The error message text may contain
|
||||
# any character(s) except tab. The line must end with a single '\n'.
|
||||
|
||||
# The id to treat all messages as CRITICAL
|
||||
444 ERR_ALWAYS_CRITICAL
|
||||
|
||||
# Non support errors 1000 ~ 2000.
|
||||
# Non support errors 1000 ~ 2000.
|
||||
# The query will go through the optimizer again with some optimization turned off
|
||||
1000 ERR_MISS_JOIN %1% not joined.
|
||||
1001 ERR_NON_SUPPORTED_FUNCTION Function '%1%' isn't supported.
|
||||
@ -72,7 +72,7 @@
|
||||
2031 ERR_BRM_LOOKUP Blocks are missing. Alter or drop table in progress?
|
||||
2032 ERR_INCORRECT_VALUE Incorrect %1% value: '%2%'.
|
||||
2033 ERR_SYSTEM_CATALOG Error occurred when calling system catalog.
|
||||
2034 ERR_DATA_OFFLINE At least one DBRoot required for that query is offline.
|
||||
2034 ERR_DATA_OFFLINE At least one DBRoot required for that query is offline.
|
||||
2035 ERR_ASSERTION_FAILURE An internal error occurred. Check the error log file & contact support.
|
||||
2036 ERR_PARTITION_NO_SCHEMA No schema is specified for this partition function.
|
||||
2037 ERR_INVALID_FUNC_ARGUMENT %1% in function arguments.
|
||||
@ -103,6 +103,8 @@
|
||||
2055 ERR_DISKAGG_TOO_BIG Not enough memory to make disk-based aggregation. Raise TotalUmMemory if possible.
|
||||
2056 ERR_DISKAGG_FILEIO_ERROR There was an IO error during a disk-based aggregation: %1%
|
||||
2057 ERR_JOIN_RESULT_TOO_BIG Not enough memory to consolidate join results. Estimated %1% MB needed. TotalUmMemory is %2% MB.
|
||||
2058 ERR_DISKAGG_OVERFLOW1 The hash function used produces a lot of hash collisions (1).
|
||||
2059 ERR_DISKAGG_OVERFLOW2 The hash function used produces a lot of hash collisions (2).
|
||||
|
||||
# Sub-query errors
|
||||
3001 ERR_NON_SUPPORT_SUB_QUERY_TYPE This subquery type is not supported yet.
|
||||
@ -162,7 +164,7 @@
|
||||
6001 ERR_NETWORK DBRM encountered a network error, check the controllernode.
|
||||
6002 ERR_BRM_MUTEX A process crashed while holding the BRM mutex. The lock state is unreliable. Please restart Columnstore.
|
||||
6003 ERR_UNRECOVERABLE_LOCK_STATE Unrecoverable BRM lock state detected. Diagnostic values: r=%1% rwt=%2 w=%3% wwt=%4%. Please restart Columnstore.
|
||||
6004 ERR_RECOVERABLE_LOCK_STATE Attempting to fix the BRM lock state. Diagnostic values: r=%1% rwt=%2 w=%3% wwt=%4%.
|
||||
6004 ERR_RECOVERABLE_LOCK_STATE Attempting to fix the BRM lock state. Diagnostic values: r=%1% rwt=%2 w=%3% wwt=%4%.
|
||||
6005 ERR_SUCCESSFUL_RECOVERY BRM lock state appears to be functional again.
|
||||
6006 ERR_HARD_FAILURE DBRM encountered, most likely, a network or disk problem performing that operation.
|
||||
|
||||
@ -200,7 +202,7 @@
|
||||
9023 ERR_WF_OVERFLOW '%1%' overflow.
|
||||
9024 ERR_WF_COLUMN_MISSING '%1%' is not in tuple.
|
||||
9025 ERR_WF_UNKNOWN_BOUND Unknown window frame start/bound type: '%1%'.
|
||||
9026 ERR_WF_NOT_IN_COL_MAP Returned column not in intermediate result set.
|
||||
9026 ERR_WF_NOT_IN_COL_MAP Returned column not in intermediate result set.
|
||||
9027 ERR_WF_ARG_OUT_OF_RANGE Argument '%1%' is out of range.
|
||||
9028 ERR_WF_NOT_ALLOWED Window functions are not allowed in %1%.
|
||||
9029 ERR_WF_IDB_ONLY Window function are only supported for Columnstore tables.
|
||||
|
@ -166,10 +166,24 @@ uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol)
|
||||
case execplan::CalpontSystemCatalog::VARCHAR:
|
||||
case execplan::CalpontSystemCatalog::BLOB:
|
||||
case execplan::CalpontSystemCatalog::TEXT:
|
||||
h.add(r.getCharset(i), r.getConstString(i));
|
||||
strHashUsed = true;
|
||||
{
|
||||
auto strColValue = r.getConstString(i);
|
||||
if (strColValue.length() > MaxConstStrSize)
|
||||
{
|
||||
h.add(r.getCharset(i), strColValue);
|
||||
strHashUsed = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto cs = r.getCharset(i);
|
||||
uchar buf[MaxConstStrBufSize];
|
||||
uint nActualWeights = cs->strnxfrm(buf, MaxConstStrBufSize, MaxConstStrBufSize,
|
||||
reinterpret_cast<const uchar*>(strColValue.str()), strColValue.length(),
|
||||
datatypes::Charset::getDefaultFlags());
|
||||
ret = hashData(buf, nActualWeights, ret);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
default: ret = hashData(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break;
|
||||
}
|
||||
}
|
||||
@ -1820,15 +1834,26 @@ void RowAggStorage::increaseSize()
|
||||
if (fCurData->fSize * maxMaskMultiplierWoRehashing < calcMaxSize(fCurData->fMask + 1))
|
||||
{
|
||||
// something strange happens...
|
||||
throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_ERROR),
|
||||
logging::ERR_DISKAGG_ERROR);
|
||||
throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_OVERFLOW2),
|
||||
logging::ERR_DISKAGG_OVERFLOW2);
|
||||
}
|
||||
|
||||
auto freeMem = fMM->getFree();
|
||||
if (fEnabledDiskAggregation ||
|
||||
freeMem > (fMM->getUsed() + fCurData->fHashes->memUsage() + fStorage->getAproxRGSize()) * 2)
|
||||
{
|
||||
rehashPowerOfTwo((fCurData->fMask + 1) * 2);
|
||||
if (fCurData->fSize * 2 < maxSize)
|
||||
{
|
||||
// we have to resize, even though there would still be plenty of space left!
|
||||
// Try to rehash instead. Delete freed memory so we don't steadily increase mem in case
|
||||
// we have to rehash a few times
|
||||
nextHashMultiplier();
|
||||
rehashPowerOfTwo(fCurData->fMask + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
rehashPowerOfTwo((fCurData->fMask + 1) * 2);
|
||||
}
|
||||
}
|
||||
else if (fGeneration < MAX_INMEMORY_GENS - 1)
|
||||
{
|
||||
@ -1888,8 +1913,8 @@ void RowAggStorage::insertSwap(size_t oldIdx, RowPosHashStorage* oldHashes)
|
||||
{
|
||||
if (fCurData->fMaxSize == 0 && !tryIncreaseInfo())
|
||||
{
|
||||
throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_ERROR),
|
||||
logging::ERR_DISKAGG_ERROR);
|
||||
throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_OVERFLOW1),
|
||||
logging::ERR_DISKAGG_OVERFLOW1);
|
||||
}
|
||||
|
||||
size_t idx{};
|
||||
|
@ -36,6 +36,9 @@ class RowGroupStorage;
|
||||
|
||||
uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol);
|
||||
|
||||
// Length threshold used by hashRow() for string columns: values whose length()
// is greater than this are hashed through h.add(); values up to this length are
// first transformed with strnxfrm() into a local buffer and then hashed with hashData().
constexpr const size_t MaxConstStrSize = 2048ULL;
|
||||
// Size, in bytes, of the local strnxfrm() output buffer in hashRow(): twice MaxConstStrSize.
// NOTE(review): presumably the factor of two leaves room for the collation weights
// being wider than the source bytes - confirm for multi-byte collations.
constexpr const size_t MaxConstStrBufSize = MaxConstStrSize << 1;
|
||||
|
||||
class RowAggStorage
|
||||
{
|
||||
public:
|
||||
@ -161,6 +164,9 @@ class RowAggStorage
|
||||
*/
|
||||
inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx, const Data* curData) const
|
||||
{
|
||||
// An addition from the original robin hood HM.
// The raw hash is multiplied by the current multiplier and its upper bits are
// folded into the lower ones before the info byte and the bucket index are derived.
// NOTE(review): the multiplier is read from fCurData, while fInfoInc, fInfoHashShift
// and fMask are read from the curData argument. If curData can differ from fCurData
// (e.g. another generation's Data), confirm that using fCurData's multiplier is intended.
|
||||
h *= fCurData->hashMultiplier_;
|
||||
h ^= h >> 33U;
|
||||
// info: the masked low hash bits shifted by fInfoHashShift, offset by fInfoInc.
info = curData->fInfoInc + static_cast<uint32_t>((h & INFO_MASK) >> curData->fInfoHashShift);
|
||||
// idx: the hash bits above the lowest INIT_INFO_BITS, limited to the table by fMask.
idx = (h >> INIT_INFO_BITS) & curData->fMask;
|
||||
}
|
||||
@ -230,6 +236,13 @@ class RowAggStorage
|
||||
info = fCurData->fInfo[idx];
|
||||
}
|
||||
|
||||
// Switches fCurData to a different hash multiplier by adding a fixed constant to
// hashMultiplier_. increaseSize() calls this right before rehashing the table at its
// current size, so the same stored hashes map to different slots after the rehash.
void nextHashMultiplier()
|
||||
{
|
||||
// adding an *even* number, so that the multiplier will always stay odd. This is necessary
|
||||
// so that the hash stays a mixing function (and thus doesn't have any information loss).
|
||||
// The initial multiplier (0xc4ceb9fe1a85ec53) is odd; the increment below is that value plus one.
fCurData->hashMultiplier_ += 0xc4ceb9fe1a85ec54;
|
||||
}
|
||||
|
||||
/** @brief Increase internal data size if needed
|
||||
*/
|
||||
void increaseSize();
|
||||
@ -325,6 +338,7 @@ class RowAggStorage
|
||||
size_t fSize{0};
|
||||
size_t fMask{0};
|
||||
size_t fMaxSize{0};
|
||||
uint64_t hashMultiplier_{0xc4ceb9fe1a85ec53ULL};
|
||||
uint32_t fInfoInc{INIT_INFO_INC};
|
||||
uint32_t fInfoHashShift{INIT_INFO_HASH_SHIFT};
|
||||
};
|
||||
|
Reference in New Issue
Block a user