You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-29 08:21:15 +03:00
MCOL-5199 This patch fixes the overall performance degradation introduced by the new way of hashing char columns
in the aggregation code. The patch disables the padding that forces the hasher to calculate over the whole 2k buffer. This patch also moves the hashing code into the common place where it belongs.
This commit is contained in:
@ -36,9 +36,12 @@ static TYPELIB mcs_compression_type_names_lib = {array_elements(mcs_compression_
|
|||||||
// compression type
|
// compression type
|
||||||
static MYSQL_THDVAR_ENUM(compression_type, PLUGIN_VAR_RQCMDARG,
|
static MYSQL_THDVAR_ENUM(compression_type, PLUGIN_VAR_RQCMDARG,
|
||||||
"Controls compression algorithm for create tables. Possible values are: "
|
"Controls compression algorithm for create tables. Possible values are: "
|
||||||
"NO_COMPRESSION segment files aren't compressed; "
|
|
||||||
"SNAPPY segment files are Snappy compressed (default);"
|
"SNAPPY segment files are Snappy compressed (default);"
|
||||||
|
#ifdef HAVE_LZ4
|
||||||
"LZ4 segment files are LZ4 compressed;",
|
"LZ4 segment files are LZ4 compressed;",
|
||||||
|
# else
|
||||||
|
,
|
||||||
|
#endif
|
||||||
NULL, // check
|
NULL, // check
|
||||||
NULL, // update
|
NULL, // update
|
||||||
1, // default
|
1, // default
|
||||||
|
@ -135,8 +135,10 @@ class Charset
|
|||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
const struct charset_info_st* mCharset;
|
const struct charset_info_st* mCharset;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static constexpr const uint flags_ = MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN;
|
static constexpr const uint flags_ = MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Charset(CHARSET_INFO& cs) : mCharset(&cs)
|
Charset(CHARSET_INFO& cs) : mCharset(&cs)
|
||||||
{
|
{
|
||||||
@ -187,7 +189,7 @@ class Charset
|
|||||||
}
|
}
|
||||||
size_t strnxfrm(uchar* dst, size_t dstlen, uint nweights, const uchar* src, size_t srclen, uint flags)
|
size_t strnxfrm(uchar* dst, size_t dstlen, uint nweights, const uchar* src, size_t srclen, uint flags)
|
||||||
{
|
{
|
||||||
idbassert(mCharset->coll);
|
assert(mCharset->coll);
|
||||||
return mCharset->coll->strnxfrm(mCharset, dst, dstlen, nweights, src, srclen, flags);
|
return mCharset->coll->strnxfrm(mCharset, dst, dstlen, nweights, src, srclen, flags);
|
||||||
}
|
}
|
||||||
// The magic check that tells that bytes are mapped to weights as 1:1
|
// The magic check that tells that bytes are mapped to weights as 1:1
|
||||||
@ -195,21 +197,21 @@ class Charset
|
|||||||
{
|
{
|
||||||
return (mCharset->state & MY_CS_NON1TO1) == 0;
|
return (mCharset->state & MY_CS_NON1TO1) == 0;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template <typename T>
|
||||||
T strnxfrm(const char* src) const
|
T strnxfrm(const char* src) const
|
||||||
{
|
{
|
||||||
T ret = 0;
|
T ret = 0;
|
||||||
size_t len __attribute__((unused)) = mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T),
|
size_t len __attribute__((unused)) =
|
||||||
src, sizeof(T), flags_);
|
mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T), src, sizeof(T), flags_);
|
||||||
assert(len <= sizeof(T));
|
assert(len <= sizeof(T));
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
template<typename T>
|
template <typename T>
|
||||||
T strnxfrm(const utils::ConstString &src) const
|
T strnxfrm(const utils::ConstString& src) const
|
||||||
{
|
{
|
||||||
T ret = 0;
|
T ret = 0;
|
||||||
size_t len __attribute__((unused)) = mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T),
|
size_t len __attribute__((unused)) =
|
||||||
(char*)src.str(), src.length(), flags_);
|
mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T), (char*)src.str(), src.length(), flags_);
|
||||||
assert(len <= sizeof(T));
|
assert(len <= sizeof(T));
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -66,6 +66,13 @@ class ConstString
|
|||||||
}
|
}
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
// Strips trailing ' ' characters by shrinking the stored length; the
// characters themselves are not modified. Returns this object so that
// calls can be chained.
ConstString& rtrimSpaces()
{
  while (mLength != 0 && mStr[mLength - 1] == ' ')
  {
    --mLength;
  }
  return *this;
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace utils
|
} // namespace utils
|
||||||
|
@ -26,8 +26,10 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <string>
|
||||||
#include "mcs_basic_types.h"
|
#include "mcs_basic_types.h"
|
||||||
|
|
||||||
namespace utils
|
namespace utils
|
||||||
@ -203,6 +205,81 @@ class Hasher_r
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// This stream hasher was borrowed from RobinHood
|
||||||
|
/// Stateless streaming 64-bit hasher (MurmurHash2-style multiply/xor-shift mixing).
///
/// operator() folds one buffer into a running hash value and finalize()
/// applies the closing avalanche once every buffer has been folded in.
class Hasher64_r
{
 public:
  /// Folds `len` bytes starting at `ptr` into the running hash `x`.
  ///
  /// @param ptr  start of the data; no particular alignment is required.
  /// @param len  number of bytes to hash.
  /// @param x    hash accumulated so far; when it is 0 the chaining step is skipped.
  /// @return the updated, not yet finalized, hash value.
  inline uint64_t operator()(const void* ptr, uint32_t len, uint64_t x = 0ULL) const
  {
    // Address the input as bytes so that no possibly misaligned uint64_t
    // pointer is ever formed; whole words are fetched with memcpy below.
    auto const* const bytes = static_cast<uint8_t const*>(ptr);
    uint64_t h = seed ^ (len * m);

    std::size_t const n_blocks = len / 8;
    if (x)
    {
      // Mix the previously accumulated hash in before the new data.
      x *= m;
      x ^= x >> r;
      x *= m;
      h ^= x;
      h *= m;
    }
    for (std::size_t i = 0; i < n_blocks; ++i)
    {
      uint64_t k;
      memcpy(&k, bytes + i * sizeof(k), sizeof(k));

      k *= m;
      k ^= k >> r;
      k *= m;

      h ^= k;
      h *= m;
    }

    // Fold in the 0..7 bytes left over after the last full 8-byte block.
    auto const* const data8 = bytes + n_blocks * sizeof(uint64_t);
    switch (len & 7U)
    {
      case 7:
        h ^= static_cast<uint64_t>(data8[6]) << 48U;
        // FALLTHROUGH
      case 6:
        h ^= static_cast<uint64_t>(data8[5]) << 40U;
        // FALLTHROUGH
      case 5:
        h ^= static_cast<uint64_t>(data8[4]) << 32U;
        // FALLTHROUGH
      case 4:
        h ^= static_cast<uint64_t>(data8[3]) << 24U;
        // FALLTHROUGH
      case 3:
        h ^= static_cast<uint64_t>(data8[2]) << 16U;
        // FALLTHROUGH
      case 2:
        h ^= static_cast<uint64_t>(data8[1]) << 8U;
        // FALLTHROUGH
      case 1:
        h ^= static_cast<uint64_t>(data8[0]);
        h *= m;
        // FALLTHROUGH
      default: break;
    }
    return h;
  }

  /// Applies the final avalanche to the running hash `h`.
  ///
  /// The second argument (a length) is accepted for interface compatibility
  /// but does not take part in the computation.
  inline uint64_t finalize(uint64_t h, uint64_t /*len*/) const
  {
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
  }

 private:
  static constexpr uint64_t m = 0xc6a4a7935bd1e995ULL;
  static constexpr uint64_t seed = 0xe17a1465ULL;
  static constexpr unsigned int r = 47;
};
|
||||||
|
|
||||||
class Hasher128
|
class Hasher128
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
@ -79,73 +79,6 @@ std::string errorString(int errNo)
|
|||||||
auto* buf = strerror_r(errNo, tmp, sizeof(tmp));
|
auto* buf = strerror_r(errNo, tmp, sizeof(tmp));
|
||||||
return {buf};
|
return {buf};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Hashes `len` bytes starting at `ptr` and chains the result onto `x`.
//
// ptr - start of the data; no particular alignment is required.
// len - number of bytes to hash.
// x   - hash of the preceding data; when it is 0 the chaining step is skipped.
//
// Returns the fully mixed 64-bit hash (the closing avalanche is applied here).
inline uint64_t hashData(const void* ptr, uint32_t len, uint64_t x = 0ULL)
{
  static constexpr uint64_t m = 0xc6a4a7935bd1e995ULL;
  static constexpr uint64_t seed = 0xe17a1465ULL;
  static constexpr unsigned int r = 47;

  // Address the input as bytes so that no possibly misaligned uint64_t
  // pointer is ever formed; whole words are fetched with memcpy below.
  auto const* const bytes = static_cast<uint8_t const*>(ptr);
  uint64_t h = seed ^ (len * m);

  std::size_t const n_blocks = len / 8;
  if (x)
  {
    // Mix the previously accumulated hash in before the new data.
    x *= m;
    x ^= x >> r;
    x *= m;
    h ^= x;
    h *= m;
  }
  for (std::size_t i = 0; i < n_blocks; ++i)
  {
    uint64_t k;
    memcpy(&k, bytes + i * sizeof(k), sizeof(k));

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  // Fold in the 0..7 bytes left over after the last full 8-byte block.
  auto const* const data8 = bytes + n_blocks * sizeof(uint64_t);
  switch (len & 7U)
  {
    case 7:
      h ^= static_cast<uint64_t>(data8[6]) << 48U;
      // FALLTHROUGH
    case 6:
      h ^= static_cast<uint64_t>(data8[5]) << 40U;
      // FALLTHROUGH
    case 5:
      h ^= static_cast<uint64_t>(data8[4]) << 32U;
      // FALLTHROUGH
    case 4:
      h ^= static_cast<uint64_t>(data8[3]) << 24U;
      // FALLTHROUGH
    case 3:
      h ^= static_cast<uint64_t>(data8[2]) << 16U;
      // FALLTHROUGH
    case 2:
      h ^= static_cast<uint64_t>(data8[1]) << 8U;
      // FALLTHROUGH
    case 1:
      h ^= static_cast<uint64_t>(data8[0]);
      h *= m;
      // FALLTHROUGH
    default: break;
  }

  // Closing avalanche.
  h ^= h >> r;
  h *= m;
  h ^= h >> r;

  return h;
}
|
|
||||||
|
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
|
|
||||||
namespace rowgroup
|
namespace rowgroup
|
||||||
@ -157,7 +90,10 @@ uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol)
|
|||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
datatypes::MariaDBHasher h;
|
datatypes::MariaDBHasher h;
|
||||||
|
utils::Hasher64_r columnHasher;
|
||||||
|
|
||||||
bool strHashUsed = false;
|
bool strHashUsed = false;
|
||||||
|
|
||||||
for (uint32_t i = 0; i <= lastCol; ++i)
|
for (uint32_t i = 0; i <= lastCol; ++i)
|
||||||
{
|
{
|
||||||
switch (r.getColType(i))
|
switch (r.getColType(i))
|
||||||
@ -167,34 +103,47 @@ uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol)
|
|||||||
case execplan::CalpontSystemCatalog::BLOB:
|
case execplan::CalpontSystemCatalog::BLOB:
|
||||||
case execplan::CalpontSystemCatalog::TEXT:
|
case execplan::CalpontSystemCatalog::TEXT:
|
||||||
{
|
{
|
||||||
|
auto cs = r.getCharset(i);
|
||||||
auto strColValue = r.getConstString(i);
|
auto strColValue = r.getConstString(i);
|
||||||
if (strColValue.length() > MaxConstStrSize)
|
auto strColValueLen = strColValue.length();
|
||||||
|
if (strColValueLen > MaxConstStrSize)
|
||||||
{
|
{
|
||||||
h.add(r.getCharset(i), strColValue);
|
h.add(cs, strColValue);
|
||||||
strHashUsed = true;
|
strHashUsed = true;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
auto cs = r.getCharset(i);
|
// This is relatively big stack allocation.
|
||||||
uchar buf[MaxConstStrBufSize];
|
// It is aligned for future vectorization of hash calculation.
|
||||||
uint nActualWeights = cs->strnxfrm(buf, MaxConstStrBufSize, MaxConstStrBufSize,
|
uchar buf[MaxConstStrBufSize] __attribute__((aligned(64)));
|
||||||
reinterpret_cast<const uchar*>(strColValue.str()), strColValue.length(),
|
// Pay attention to the last strxfrm argument value.
|
||||||
datatypes::Charset::getDefaultFlags());
|
// It is called flags and in many cases it has padding
|
||||||
ret = hashData(buf, nActualWeights, ret);
|
// enabled(MY_STRXFRM_PAD_WITH_SPACE bit). With padding enabled
|
||||||
|
// strxfrm returns MaxConstStrBufSize bytes and not the actual
|
||||||
|
// weights array length. Here I disable padding.
|
||||||
|
auto charset = datatypes::Charset(cs);
|
||||||
|
auto trimStrColValue = strColValue.rtrimSpaces();
|
||||||
|
// The padding is disabled b/c we previously use rtrimSpaces().
|
||||||
|
// strColValueLen is used here.
|
||||||
|
size_t nActualWeights = charset.strnxfrm(buf, MaxConstStrBufSize, strColValueLen,
|
||||||
|
reinterpret_cast<const uchar*>(trimStrColValue.str()),
|
||||||
|
trimStrColValue.length(), 0);
|
||||||
|
ret = columnHasher(reinterpret_cast<const void*>(buf), nActualWeights, ret);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default: ret = hashData(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break;
|
default: ret = columnHasher(r.getData() + r.getOffset(i), r.getColumnWidth(i), ret); break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The properties of the hash produced are worse if MDB hasher results are incorporated
|
||||||
|
// so late but these results must be used very infrequently.
|
||||||
if (strHashUsed)
|
if (strHashUsed)
|
||||||
{
|
{
|
||||||
uint64_t strhash = h.finalize();
|
uint64_t strhash = h.finalize();
|
||||||
ret = hashData(&strhash, sizeof(strhash), ret);
|
ret = columnHasher(&strhash, sizeof(strhash), ret);
|
||||||
}
|
}
|
||||||
|
return columnHasher.finalize(ret, lastCol << 2);
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @brief NoOP interface to LRU-cache used by RowGroupStorage & HashStorage
|
/** @brief NoOP interface to LRU-cache used by RowGroupStorage & HashStorage
|
||||||
|
Reference in New Issue
Block a user