1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

MCOL-5451 This resolves external GROUP BY result inconsistency issues (#2791)

Given that idx is an RH (robin hood) hashmap bucket number and info is an intra-bucket idx,
    the root cause is triggered by a difference in the idx/hash pair
    calculation between a certain GROUP BY generation and the merging of
    generation aggregations that takes place in RowAggStorage::finalize.
    This patch generalizes rowHashToIdx so that it can be leveraged in both
    cases mentioned above.
This commit is contained in:
Roman Nozdrin
2023-03-28 17:10:41 +01:00
committed by GitHub
parent f6cfac2e80
commit a1d20d82d5
2 changed files with 33 additions and 65 deletions

View File

@ -147,34 +147,22 @@ class RowAggStorage
*/
void shiftUp(size_t startIdx, size_t insIdx);
/** @brief Find best position of row and save its hash.
*
* @param row(in) input row
* @param info(out) info data
* @param idx(out) index computed from row hash
* @param hash(out) row hash value
*/
void rowToIdx(const Row& row, uint32_t& info, size_t& idx, uint64_t& hash) const;
void rowToIdx(const Row& row, uint32_t& info, size_t& idx, uint64_t& hash, const Data* curData) const;
/** @brief Find best position using precomputed hash
*
* @param h(in) row hash
* @param info(out) info data
* @param idx(out) index
*/
inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx, const Data* curData) const
using InfoIdxType = std::pair<uint32_t, size_t>;
inline InfoIdxType rowHashToIdx(uint64_t h, const size_t mask, const uint64_t hashMultiplier,
const uint32_t infoInc, const uint32_t infoHashShift) const
{
// An addition from the original robin hood HM.
h *= fCurData->hashMultiplier_;
h *= hashMultiplier;
h ^= h >> 33U;
info = curData->fInfoInc + static_cast<uint32_t>((h & INFO_MASK) >> curData->fInfoHashShift);
idx = (h >> INIT_INFO_BITS) & curData->fMask;
uint32_t info = infoInc + static_cast<uint32_t>((h & INFO_MASK) >> infoHashShift);
size_t idx = (h >> INIT_INFO_BITS) & mask;
return {info, idx};
}
inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx) const
inline InfoIdxType rowHashToIdx(uint64_t h) const
{
return rowHashToIdx(h, info, idx, fCurData);
return rowHashToIdx(h, fCurData->fMask, fCurData->hashMultiplier_, fCurData->fInfoInc,
fCurData->fInfoHashShift);
}
/** @brief Iterate over internal info until info with less-or-equal distance
@ -237,13 +225,6 @@ class RowAggStorage
info = fCurData->fInfo[idx];
}
/** @brief Advance the hash multiplier of the current data to its next value. */
void nextHashMultiplier()
{
  // The step is deliberately an *even* number, so the multiplier always stays odd. That is
  // required for the hash to remain a mixing function (i.e. to avoid any information loss).
  constexpr auto evenStep = 0xc4ceb9fe1a85ec54;
  fCurData->hashMultiplier_ += evenStep;
}
/** @brief Increase internal data size if needed
*/
void increaseSize();
@ -310,8 +291,8 @@ class RowAggStorage
*/
void loadGeneration(uint16_t gen);
/** @brief Load previously dumped data into the tmp storage */
void loadGeneration(uint16_t gen, size_t& size, size_t& mask, size_t& maxSize, uint32_t& infoInc,
uint32_t& infoHashShift, std::unique_ptr<uint8_t[]>& info);
void loadGeneration(uint16_t gen, size_t& size, size_t& mask, size_t& maxSize, size_t& hashMultiplier,
uint32_t& infoInc, uint32_t& infoHashShift, std::unique_ptr<uint8_t[]>& info);
/** @brief Remove temporary data files */
void cleanup();