diff --git a/dbcon/mysql/ha_mcs_sysvars.cpp b/dbcon/mysql/ha_mcs_sysvars.cpp index 1a129819e..4596bf1cf 100644 --- a/dbcon/mysql/ha_mcs_sysvars.cpp +++ b/dbcon/mysql/ha_mcs_sysvars.cpp @@ -39,6 +39,8 @@ static MYSQL_THDVAR_ENUM(compression_type, PLUGIN_VAR_RQCMDARG, "SNAPPY segment files are Snappy compressed (default);" #ifdef HAVE_LZ4 "LZ4 segment files are LZ4 compressed;", +# else + , #endif NULL, // check NULL, // update diff --git a/utils/common/collation.h b/utils/common/collation.h index 1e540d0fe..280bcd366 100644 --- a/utils/common/collation.h +++ b/utils/common/collation.h @@ -183,6 +183,10 @@ class Charset { return flags_; } + size_t strnxfrm(uchar* dst, size_t dstlen, uint nweights, const uchar* src, size_t srclen, uint flags) + { + return mCharset->coll->strnxfrm(mCharset, dst, dstlen, nweights, src, srclen, flags); + } }; class CollationAwareHasher : public Charset diff --git a/utils/common/conststring.h b/utils/common/conststring.h index 14dc63101..451733538 100644 --- a/utils/common/conststring.h +++ b/utils/common/conststring.h @@ -64,6 +64,13 @@ class ConstString } return *this; } + ConstString& rtrimSpaces() + { + for (; mLength && mStr[mLength - 1] == ' '; --mLength) + { + } + return *this; + } }; } // namespace utils diff --git a/utils/common/hasher.h b/utils/common/hasher.h index c13ab7633..08ccf02e4 100644 --- a/utils/common/hasher.h +++ b/utils/common/hasher.h @@ -27,8 +27,10 @@ #ifndef UTILS_HASHER_H #define UTILS_HASHER_H +#include #include #include +#include #include "mcs_basic_types.h" namespace utils @@ -204,6 +206,81 @@ class Hasher_r } }; +// This stream hasher was borrowed from RobinHood +class Hasher64_r +{ + public: + inline uint64_t operator()(const void* ptr, uint32_t len, uint64_t x = 0ULL) + { + auto const* const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + std::size_t const n_blocks = len / 8; + if (x) + { + x *= m; + x ^= x >> r; + x *= m; + h ^= x; + h *= m; + } + for (std::size_t i = 0; i < n_blocks; 
++i) + { + uint64_t k; + memcpy(&k, data64 + i, sizeof(k)); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const* const data8 = reinterpret_cast(data64 + n_blocks); + switch (len & 7U) + { + case 7: + h ^= static_cast(data8[6]) << 48U; + // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + // FALLTHROUGH + default: break; + } + return h; + } + + inline uint64_t finalize(uint64_t h, uint64_t len) const + { + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } + + private: + static constexpr uint64_t m = 0xc6a4a7935bd1e995ULL; + static constexpr uint64_t seed = 0xe17a1465ULL; + static constexpr unsigned int r = 47; +}; + class Hasher128 { public: diff --git a/utils/rowgroup/rowstorage.cpp b/utils/rowgroup/rowstorage.cpp index 28152122d..bfa60dcea 100644 --- a/utils/rowgroup/rowstorage.cpp +++ b/utils/rowgroup/rowstorage.cpp @@ -79,73 +79,6 @@ std::string errorString(int errNo) auto* buf = strerror_r(errNo, tmp, sizeof(tmp)); return {buf}; } - -inline uint64_t hashData(const void* ptr, uint32_t len, uint64_t x = 0ULL) -{ - static constexpr uint64_t m = 0xc6a4a7935bd1e995ULL; - static constexpr uint64_t seed = 0xe17a1465ULL; - static constexpr unsigned int r = 47; - - auto const* const data64 = static_cast(ptr); - uint64_t h = seed ^ (len * m); - - std::size_t const n_blocks = len / 8; - if (x) - { - x *= m; - x ^= x >> r; - x *= m; - h ^= x; - h *= m; - } - for (std::size_t i = 0; i < n_blocks; ++i) - { - uint64_t k; - memcpy(&k, data64 + i, sizeof(k)); - - k *= m; - k ^= k >> r; - k *= m; - - h ^= k; - h *= m; - } - - auto const* const data8 = reinterpret_cast(data64 + n_blocks); - switch (len & 7U) - { - case 7: 
- h ^= static_cast(data8[6]) << 48U; - // FALLTHROUGH - case 6: - h ^= static_cast(data8[5]) << 40U; - // FALLTHROUGH - case 5: - h ^= static_cast(data8[4]) << 32U; - // FALLTHROUGH - case 4: - h ^= static_cast(data8[3]) << 24U; - // FALLTHROUGH - case 3: - h ^= static_cast(data8[2]) << 16U; - // FALLTHROUGH - case 2: - h ^= static_cast(data8[1]) << 8U; - // FALLTHROUGH - case 1: - h ^= static_cast(data8[0]); - h *= m; - // FALLTHROUGH - default: break; - } - - h ^= h >> r; - h *= m; - h ^= h >> r; - - return h; -} - } // anonymous namespace namespace rowgroup @@ -157,7 +90,10 @@ uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol) return 0; datatypes::MariaDBHasher h; + utils::Hasher64_r columnHasher; + bool strHashUsed = false; + for (uint32_t i = 0; i <= lastCol; ++i) { switch (r.getColType(i)) @@ -167,34 +103,47 @@ uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol) case execplan::CalpontSystemCatalog::BLOB: case execplan::CalpontSystemCatalog::TEXT: { + auto cs = r.getCharset(i); auto strColValue = r.getConstString(i); - if (strColValue.length() > MaxConstStrSize) + auto strColValueLen = strColValue.length(); + if (strColValueLen > MaxConstStrSize) { - h.add(r.getCharset(i), strColValue); + h.add(cs, strColValue); strHashUsed = true; } else { - auto cs = r.getCharset(i); - uchar buf[MaxConstStrBufSize]; - uint nActualWeights = cs->strnxfrm(buf, MaxConstStrBufSize, MaxConstStrBufSize, - reinterpret_cast(strColValue.str()), strColValue.length(), - datatypes::Charset::getDefaultFlags()); - ret = hashData(buf, nActualWeights, ret); + // This is a relatively big stack allocation. + // It is aligned for future vectorization of hash calculation. + uchar buf[MaxConstStrBufSize] __attribute__((aligned(64))); + // Pay attention to the last strnxfrm argument value. + // It is called flags and in many cases it has padding + // enabled (MY_STRXFRM_PAD_WITH_SPACE bit). 
With padding enabled + // strnxfrm returns MaxConstStrBufSize bytes and not the actual + // weights array length. Padding is disabled here (flags = 0). + auto charset = datatypes::Charset(cs); + auto trimStrColValue = strColValue.rtrimSpaces(); + // Padding is disabled because trailing spaces were already removed by rtrimSpaces(). + // strColValueLen (the length before trimming) is passed as nweights. + size_t nActualWeights = charset.strnxfrm(buf, MaxConstStrBufSize, strColValueLen, + reinterpret_cast(trimStrColValue.str()), + trimStrColValue.length(), 0); + ret = columnHasher(reinterpret_cast(buf), nActualWeights, ret); } break; } - default: ret = hashData(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break; + default: ret = columnHasher(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break; } } + // The properties of the hash produced are worse if MDB hasher results are incorporated + // so late but these results must be used very infrequently. if (strHashUsed) { uint64_t strhash = h.finalize(); - ret = hashData(&strhash, sizeof(strhash), ret); + ret = columnHasher(&strhash, sizeof(strhash), ret); } - - return ret; + return columnHasher.finalize(ret, lastCol << 2); } /** @brief NoOP interface to LRU-cache used by RowGroupStorage & HashStorage