diff --git a/dbcon/mysql/ha_mcs_sysvars.cpp b/dbcon/mysql/ha_mcs_sysvars.cpp index f372d9ec8..415257c5f 100644 --- a/dbcon/mysql/ha_mcs_sysvars.cpp +++ b/dbcon/mysql/ha_mcs_sysvars.cpp @@ -36,9 +36,12 @@ static TYPELIB mcs_compression_type_names_lib = {array_elements(mcs_compression_ // compression type static MYSQL_THDVAR_ENUM(compression_type, PLUGIN_VAR_RQCMDARG, "Controls compression algorithm for create tables. Possible values are: " - "NO_COMPRESSION segment files aren't compressed; " "SNAPPY segment files are Snappy compressed (default);" +#ifdef HAVE_LZ4 "LZ4 segment files are LZ4 compressed;", +# else + , +#endif NULL, // check NULL, // update 1, // default diff --git a/utils/common/collation.h b/utils/common/collation.h index 424c74e90..6e2cf58cc 100644 --- a/utils/common/collation.h +++ b/utils/common/collation.h @@ -135,8 +135,10 @@ class Charset { protected: const struct charset_info_st* mCharset; + private: static constexpr const uint flags_ = MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN; + public: Charset(CHARSET_INFO& cs) : mCharset(&cs) { @@ -187,7 +189,7 @@ class Charset } size_t strnxfrm(uchar* dst, size_t dstlen, uint nweights, const uchar* src, size_t srclen, uint flags) { - idbassert(mCharset->coll); + assert(mCharset->coll); return mCharset->coll->strnxfrm(mCharset, dst, dstlen, nweights, src, srclen, flags); } // The magic check that tells that bytes are mapped to weights as 1:1 @@ -195,21 +197,21 @@ class Charset { return (mCharset->state & MY_CS_NON1TO1) == 0; } - template + template T strnxfrm(const char* src) const { T ret = 0; - size_t len __attribute__((unused)) = mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T), - src, sizeof(T), flags_); + size_t len __attribute__((unused)) = + mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T), src, sizeof(T), flags_); assert(len <= sizeof(T)); return ret; } - template - T strnxfrm(const utils::ConstString &src) const + template + T strnxfrm(const utils::ConstString& src) const { T ret = 0; - size_t len __attribute__((unused)) = mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T), - (char*)src.str(), src.length(), flags_); + size_t len __attribute__((unused)) = + mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T), (char*)src.str(), src.length(), flags_); assert(len <= sizeof(T)); return ret; } diff --git a/utils/common/conststring.h b/utils/common/conststring.h index b38be5cbe..c1024c0a7 100644 --- a/utils/common/conststring.h +++ b/utils/common/conststring.h @@ -66,6 +66,13 @@ class ConstString } return *this; } + ConstString& rtrimSpaces() + { + for (; mLength && mStr[mLength - 1] == ' '; --mLength) + { + } + return *this; + } }; } // namespace utils diff --git a/utils/common/hasher.h b/utils/common/hasher.h index 1b5fb14ca..f55d3c971 100644 --- a/utils/common/hasher.h +++ b/utils/common/hasher.h @@ -26,8 +26,10 @@ #pragma once +#include #include #include +#include #include "mcs_basic_types.h" namespace utils @@ -203,6 +205,81 @@ class Hasher_r } }; +// This stream hasher was borrowed from RobinHood +class Hasher64_r +{ + public: + inline uint64_t operator()(const void* ptr, uint32_t len, uint64_t x = 0ULL) + { + auto const* const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + std::size_t const n_blocks = len / 8; + if (x) + { + x *= m; + x ^= x >> r; + x *= m; + h ^= x; + h *= m; + } + for (std::size_t i = 0; i < n_blocks; ++i) + { + uint64_t k; + memcpy(&k, data64 + i, sizeof(k)); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const* const data8 = reinterpret_cast(data64 + n_blocks); + switch (len & 7U) + { + case 7: + h ^= static_cast(data8[6]) << 48U; + // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + // FALLTHROUGH + default: break; + } + return h; + } + + inline uint64_t finalize(uint64_t h, uint64_t len) const + { + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } + + private: + static constexpr uint64_t m = 0xc6a4a7935bd1e995ULL; + static constexpr uint64_t seed = 0xe17a1465ULL; + static constexpr unsigned int r = 47; +}; + class Hasher128 { public: diff --git a/utils/rowgroup/rowstorage.cpp b/utils/rowgroup/rowstorage.cpp index e3f0ed191..750c11207 100644 --- a/utils/rowgroup/rowstorage.cpp +++ b/utils/rowgroup/rowstorage.cpp @@ -79,73 +79,6 @@ std::string errorString(int errNo) auto* buf = strerror_r(errNo, tmp, sizeof(tmp)); return {buf}; } - -inline uint64_t hashData(const void* ptr, uint32_t len, uint64_t x = 0ULL) -{ - static constexpr uint64_t m = 0xc6a4a7935bd1e995ULL; - static constexpr uint64_t seed = 0xe17a1465ULL; - static constexpr unsigned int r = 47; - - auto const* const data64 = static_cast(ptr); - uint64_t h = seed ^ (len * m); - - std::size_t const n_blocks = len / 8; - if (x) - { - x *= m; - x ^= x >> r; - x *= m; - h ^= x; - h *= m; - } - for (std::size_t i = 0; i < n_blocks; ++i) - { - uint64_t k; - memcpy(&k, data64 + i, sizeof(k)); - - k *= m; - k ^= k >> r; - k *= m; - - h ^= k; - h *= m; - } - - auto const* const data8 = reinterpret_cast(data64 + n_blocks); - switch (len & 7U) - { - case 7: - h ^= static_cast(data8[6]) << 48U; - // FALLTHROUGH - case 6: - h ^= static_cast(data8[5]) << 40U; - // FALLTHROUGH - case 5: - h ^= static_cast(data8[4]) << 32U; - // FALLTHROUGH - case 4: - h ^= static_cast(data8[3]) << 24U; - // FALLTHROUGH - case 3: - h ^= static_cast(data8[2]) << 16U; - // FALLTHROUGH - case 2: - h ^= static_cast(data8[1]) << 8U; - // FALLTHROUGH - case 1: - h ^= static_cast(data8[0]); - h *= m; - // FALLTHROUGH - default: break; - } - - h ^= h >> r; - h *= m; - h ^= h >> r; - - return h; -} - } // anonymous namespace namespace rowgroup @@ -157,7 +90,10 @@ uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol) return 0; datatypes::MariaDBHasher h; + utils::Hasher64_r columnHasher; + bool strHashUsed = false; + for (uint32_t i = 0; i <= lastCol; ++i) { switch (r.getColType(i)) @@ -167,34 +103,47 @@ uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol) case execplan::CalpontSystemCatalog::BLOB: case execplan::CalpontSystemCatalog::TEXT: { + auto cs = r.getCharset(i); auto strColValue = r.getConstString(i); - if (strColValue.length() > MaxConstStrSize) + auto strColValueLen = strColValue.length(); + if (strColValueLen > MaxConstStrSize) { - h.add(r.getCharset(i), strColValue); + h.add(cs, strColValue); strHashUsed = true; } else { - auto cs = r.getCharset(i); - uchar buf[MaxConstStrBufSize]; - uint nActualWeights = cs->strnxfrm(buf, MaxConstStrBufSize, MaxConstStrBufSize, - reinterpret_cast(strColValue.str()), strColValue.length(), - datatypes::Charset::getDefaultFlags()); - ret = hashData(buf, nActualWeights, ret); + // This is relatively big stack allocation. + // It is aligned for future vectorization of hash calculation. + uchar buf[MaxConstStrBufSize] __attribute__((aligned(64))); + // Pay attention to the last strxfrm argument value. + // It is called flags and in many cases it has padding + // enabled(MY_STRXFRM_PAD_WITH_SPACE bit). With padding enabled + // strxfrm returns MaxConstStrBufSize bytes and not the actual + // weights array length. Here I disable padding. + auto charset = datatypes::Charset(cs); + auto trimStrColValue = strColValue.rtrimSpaces(); + // The padding is disabled b/c we previously use rtrimSpaces(). + // strColValueLen is used here. + size_t nActualWeights = charset.strnxfrm(buf, MaxConstStrBufSize, strColValueLen, + reinterpret_cast(trimStrColValue.str()), + trimStrColValue.length(), 0); + ret = columnHasher(reinterpret_cast(buf), nActualWeights, ret); } break; } - default: ret = hashData(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break; + default: ret = columnHasher(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break; } } + // The properties of the hash produced are worse if MDB hasher results are incorporated + // so late but these results must be used very infrequently. if (strHashUsed) { uint64_t strhash = h.finalize(); - ret = hashData(&strhash, sizeof(strhash), ret); + ret = columnHasher(&strhash, sizeof(strhash), ret); } - - return ret; + return columnHasher.finalize(ret, lastCol << 2); } /** @brief NoOP interface to LRU-cache used by RowGroupStorage & HashStorage