1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

Merge pull request #2004 from drrtuy/MCOL-4759

MCOL-4759 Upmerge for MCOL-4564 code that implements hash merging fam…
This commit is contained in:
Roman Nozdrin
2021-06-28 14:05:16 +03:00
committed by GitHub
2 changed files with 122 additions and 52 deletions

57
utils/common/hashfamily.h Normal file
View File

@ -0,0 +1,57 @@
/* Copyright (C) 2021 Mariadb Corporation.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
#ifndef UTILS_HASHFAMILY_H
#define UTILS_HASHFAMILY_H
#include "hasher.h"
#include "collation.h"
namespace utils
{
// Merges the results of two hashers into a single 64-bit hash value:
//  - a utils::Hasher_r, finalized from an intermediate hash and a length
//    supplied by the caller;
//  - a datatypes::MariaDBHasher, finalized as-is.
//
// Both hashers are held by reference (no copy is made), so they must stay
// alive for as long as this object is used.
class HashFamily
{
 public:
  // h                - Hasher_r used to finalize intermediateHash.
  // intermediateHash - running Hasher_r value accumulated by the caller.
  // len              - length forwarded to Hasher_r::finalize(). It is kept
  //                    in a 32-bit member, so only the low 32 bits survive.
  // hM               - MariaDBHasher whose finalize() result is mixed in last.
  HashFamily(const utils::Hasher_r& h,
             const uint64_t intermediateHash,
             const uint64_t len,
             const datatypes::MariaDBHasher& hM)
   : mHasher(h)
   , mMariaDBHasher(hM)
   , mHasher_rHash(intermediateHash)
   , mHasher_rLen(len)
  {
  }

  // Algorithm, seed and factor are taken from this discussion
  // https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations
  //
  // Returns ((seed * factor + <Hasher_r result>) * factor) + <MariaDBHasher
  // result>, evaluated in unsigned 64-bit arithmetic (wraps on overflow).
  uint64_t finalize() const
  {
    const auto hasherPart = mHasher.finalize(mHasher_rHash, mHasher_rLen);
    const auto mariadbPart = mMariaDBHasher.finalize();
    return (seed * factor + hasherPart) * factor + mariadbPart;
  }

 private:
  // Constants of the hash-combination formula used by finalize().
  constexpr static uint64_t seed = 1009ULL;
  constexpr static uint64_t factor = 9176ULL;

  // Non-owning references to the caller's hashers.
  const utils::Hasher_r& mHasher;
  const datatypes::MariaDBHasher& mMariaDBHasher;

  // Arguments replayed into Hasher_r::finalize().
  const uint64_t mHasher_rHash;
  const uint32_t mHasher_rLen;
};
}
#endif
// vim:ts=2 sw=2:

View File

@ -60,7 +60,7 @@
#include "../winport/winport.h" #include "../winport/winport.h"
#include "collation.h" #include "collation.h"
#include "common/hashfamily.h"
// Workaround for my_global.h #define of isnan(X) causing a std::std namespace // Workaround for my_global.h #define of isnan(X) causing a std::std namespace
@ -70,57 +70,57 @@ namespace rowgroup
const int16_t rgCommonSize = 8192; const int16_t rgCommonSize = 8192;
/* /*
The RowGroup family of classes encapsulate the data moved through the The RowGroup family of classes encapsulate the data moved through the
system. system.
- RowGroup specifies the format of the data primarily (+ some other metadata), - RowGroup specifies the format of the data primarily (+ some other metadata),
- RGData (aka RowGroup Data) encapsulates the data, - RGData (aka RowGroup Data) encapsulates the data,
- Row is used to extract fields from the data and iterate. - Row is used to extract fields from the data and iterate.
JobListFactory instantiates the RowGroups to be used by each stage of processing. JobListFactory instantiates the RowGroups to be used by each stage of processing.
RGDatas are passed between stages, and their RowGroup instances are used RGDatas are passed between stages, and their RowGroup instances are used
to interpret them. to interpret them.
Historically, row data was just a chunk of contiguous memory, a uint8_t *. Historically, row data was just a chunk of contiguous memory, a uint8_t *.
Every field had a fixed width, which allowed for quick offset Every field had a fixed width, which allowed for quick offset
calculation when assigning or retrieving individual fields. That worked calculation when assigning or retrieving individual fields. That worked
well for a few years, but at some point it became common to declare well for a few years, but at some point it became common to declare
all strings as max-length, and to manipulate them in queries. all strings as max-length, and to manipulate them in queries.
Having fixed-width fields, even for strings, required an unreasonable Having fixed-width fields, even for strings, required an unreasonable
amount of memory. RGData & StringStore were introduced to handle strings amount of memory. RGData & StringStore were introduced to handle strings
more efficiently, at least with respect to memory. The row data would more efficiently, at least with respect to memory. The row data would
still be a uint8_t *, and columns would be fixed-width, but string fields still be a uint8_t *, and columns would be fixed-width, but string fields
above a certain width would contain a 'Pointer' that referenced a string in above a certain width would contain a 'Pointer' that referenced a string in
StringStore. Strings are stored efficiently in StringStore, so there is StringStore. Strings are stored efficiently in StringStore, so there is
no longer wasted space. no longer wasted space.
StringStore comes with a different inefficiency however. When a value StringStore comes with a different inefficiency however. When a value
is overwritten, the original string cannot be freed independently of the is overwritten, the original string cannot be freed independently of the
others, so it continues to use space. If values are only set once, as is others, so it continues to use space. If values are only set once, as is
the typical case, then StringStore is efficient. When it is necessary the typical case, then StringStore is efficient. When it is necessary
to overwrite string fields, it is possible to configure these classes to overwrite string fields, it is possible to configure these classes
to use the original data format so that old string fields do not accumulate to use the original data format so that old string fields do not accumulate
in memory. Of course, be careful, because blobs and text fields in CS are in memory. Of course, be careful, because blobs and text fields in CS are
declared as 2GB strings! declared as 2GB strings!
A single RGData contains up to one 'logical block' worth of data, A single RGData contains up to one 'logical block' worth of data,
which is 8192 rows. One RGData is usually treated as one unit of work by which is 8192 rows. One RGData is usually treated as one unit of work by
PrimProc and the JobSteps, but the rows an RGData contains and how many are PrimProc and the JobSteps, but the rows an RGData contains and how many are
treated as a work unit depend on the operation being done. treated as a work unit depend on the operation being done.
For example, PrimProc works in units of 8192 contiguous rows For example, PrimProc works in units of 8192 contiguous rows
that come from disk. If half of the rows were filtered out, then the that come from disk. If half of the rows were filtered out, then the
RGData it passes to the next stage would only contain 4096 rows. RGData it passes to the next stage would only contain 4096 rows.
Others build results incrementally before passing them along, such as Others build results incrementally before passing them along, such as
group-by. If one group contains 11111 values, then group-by will group-by. If one group contains 11111 values, then group-by will
return 2 RGDatas for that group, one with 8192 rows, and one with 2919. return 2 RGDatas for that group, one with 8192 rows, and one with 2919.
Note: There is no synchronization in any of these classes for obvious Note: There is no synchronization in any of these classes for obvious
performance reasons. Likewise, although it's technically safe for many performance reasons. Likewise, although it's technically safe for many
readers to access an RGData simultaneously, that would not be an readers to access an RGData simultaneously, that would not be an
efficient thing to do. Try to stick to designs where a single RGData efficient thing to do. Try to stick to designs where a single RGData
is used by a single thread at a time. is used by a single thread at a time.
*/ */
@ -138,7 +138,7 @@ inline T derefFromTwoVectorPtrs(const std::vector<T>* outer,
const T innerIdx) const T innerIdx)
{ {
auto outerIdx = inner->operator[](innerIdx); auto outerIdx = inner->operator[](innerIdx);
return outer->operator[](outerIdx); return outer->operator[](outerIdx);
} }
class StringStore class StringStore
@ -375,7 +375,7 @@ public:
inline execplan::CalpontSystemCatalog::ColDataType* getColTypes(); inline execplan::CalpontSystemCatalog::ColDataType* getColTypes();
inline const execplan::CalpontSystemCatalog::ColDataType* getColTypes() const; inline const execplan::CalpontSystemCatalog::ColDataType* getColTypes() const;
inline uint32_t getCharsetNumber(uint32_t colIndex) const; inline uint32_t getCharsetNumber(uint32_t colIndex) const;
// this returns true if the type is not CHAR or VARCHAR // this returns true if the type is not CHAR or VARCHAR
inline bool isCharType(uint32_t colIndex) const; inline bool isCharType(uint32_t colIndex) const;
inline bool isUnsigned(uint32_t colIndex) const; inline bool isUnsigned(uint32_t colIndex) const;
@ -429,7 +429,7 @@ public:
inline bool equals(long double val, uint32_t colIndex) const; inline bool equals(long double val, uint32_t colIndex) const;
bool equals(const std::string& val, uint32_t colIndex) const; bool equals(const std::string& val, uint32_t colIndex) const;
inline bool equals(const int128_t& val, uint32_t colIndex) const; inline bool equals(const int128_t& val, uint32_t colIndex) const;
inline double getDoubleField(uint32_t colIndex) const; inline double getDoubleField(uint32_t colIndex) const;
inline float getFloatField(uint32_t colIndex) const; inline float getFloatField(uint32_t colIndex) const;
inline datatypes::Decimal getDecimalField(uint32_t colIndex) const inline datatypes::Decimal getDecimalField(uint32_t colIndex) const
@ -513,7 +513,7 @@ public:
inline T* getBinaryField(T* argtype, uint32_t colIndex) const; inline T* getBinaryField(T* argtype, uint32_t colIndex) const;
template <typename T> template <typename T>
inline T* getBinaryField_offset(uint32_t offset) const; inline T* getBinaryField_offset(uint32_t offset) const;
inline boost::shared_ptr<mcsv1sdk::UserData> getUserData(uint32_t colIndex) const; inline boost::shared_ptr<mcsv1sdk::UserData> getUserData(uint32_t colIndex) const;
inline void setUserData(mcsv1sdk::mcsv1Context& context, inline void setUserData(mcsv1sdk::mcsv1Context& context,
boost::shared_ptr<mcsv1sdk::UserData> userData, boost::shared_ptr<mcsv1sdk::UserData> userData,
@ -569,18 +569,21 @@ public:
// a fcn to check the type defs separately doesn't exist yet. No normalization. // a fcn to check the type defs separately doesn't exist yet. No normalization.
inline uint64_t hash(uint32_t lastCol) const; // generates a hash for cols [0-lastCol] inline uint64_t hash(uint32_t lastCol) const; // generates a hash for cols [0-lastCol]
inline uint64_t hash() const; // generates a hash for all cols inline uint64_t hash() const; // generates a hash for all cols
inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const; inline void colUpdateHasher(datatypes::MariaDBHasher& hM,
inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t keyColsIdx, const utils::Hasher_r& h,
const std::vector<uint32_t>& keyCols, const uint32_t col,
const std::vector<uint32_t>* smallSideKeyColumnsIds, uint32_t& intermediateHash) const;
const std::vector<uint32_t>* smallSideColumnsWidths) const; inline void colUpdateHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t keyColsIdx,
const std::vector<uint32_t>& keyCols,
const std::vector<uint32_t>* smallSideKeyColumnsIds,
const std::vector<uint32_t>* smallSideColumnsWidths) const;
inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols, inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols,
const std::vector<uint32_t>* smallSideKeyColumnsIds, const std::vector<uint32_t>* smallSideKeyColumnsIds,
const std::vector<uint32_t>* smallSideColumnsWidths) const const std::vector<uint32_t>* smallSideColumnsWidths) const
{ {
datatypes::MariaDBHasher h; datatypes::MariaDBHasher h;
for (uint32_t i = 0; i < keyCols.size(); i++) for (uint32_t i = 0; i < keyCols.size(); i++)
colUpdateMariaDBHasherTypeless(h, i, keyCols, smallSideKeyColumnsIds, smallSideColumnsWidths); colUpdateHasherTypeless(h, i, keyCols, smallSideKeyColumnsIds, smallSideColumnsWidths);
return h.finalize(); return h.finalize();
} }
@ -591,7 +594,7 @@ public:
{ {
userDataStore = u; userDataStore = u;
} }
const CHARSET_INFO* getCharset(uint32_t col) const; const CHARSET_INFO* getCharset(uint32_t col) const;
private: private:
@ -946,7 +949,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const
} }
inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const inline void Row::colUpdateHasher(datatypes::MariaDBHasher& hM,
const utils::Hasher_r& h,
const uint32_t col,
uint32_t& intermediateHash) const
{ {
switch (getColType(col)) switch (getColType(col))
{ {
@ -956,17 +962,19 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co
case execplan::CalpontSystemCatalog::TEXT: case execplan::CalpontSystemCatalog::TEXT:
{ {
CHARSET_INFO *cs = getCharset(col); CHARSET_INFO *cs = getCharset(col);
h.add(cs, getConstString(col)); hM.add(cs, getConstString(col));
break; break;
} }
default: default:
h.add(&my_charset_bin, getShortConstString(col)); {
intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash);
break; break;
}
} }
} }
inline void Row::colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &h, uint32_t keyColsIdx, inline void Row::colUpdateHasherTypeless(datatypes::MariaDBHasher &h, uint32_t keyColsIdx,
const std::vector<uint32_t>& keyCols, const std::vector<uint32_t>& keyCols,
const std::vector<uint32_t>* smallSideKeyColumnsIds, const std::vector<uint32_t>* smallSideKeyColumnsIds,
const std::vector<uint32_t>* smallSideColumnsWidths) const const std::vector<uint32_t>* smallSideColumnsWidths) const
@ -1472,7 +1480,12 @@ inline uint64_t Row::hash() const
inline uint64_t Row::hash(uint32_t lastCol) const inline uint64_t Row::hash(uint32_t lastCol) const
{ {
datatypes::MariaDBHasher h; // Use two hash classes. MariaDBHasher for text-based
// collation-aware data types and Hasher_r for all other data types.
// We deliver a hash that is a combination of both hashers' results.
utils::Hasher_r h;
datatypes::MariaDBHasher hM;
uint32_t intermediateHash = 0;
// Sometimes we ask this to hash 0 bytes, and it comes through looking like // Sometimes we ask this to hash 0 bytes, and it comes through looking like
// lastCol = -1. Return 0. // lastCol = -1. Return 0.
@ -1480,9 +1493,9 @@ inline uint64_t Row::hash(uint32_t lastCol) const
return 0; return 0;
for (uint32_t i = 0; i <= lastCol; i++) for (uint32_t i = 0; i <= lastCol; i++)
colUpdateMariaDBHasher(h, i); colUpdateHasher(hM, h, i, intermediateHash);
return h.finalize(); return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize();
} }
inline bool Row::equals(const Row& r2) const inline bool Row::equals(const Row& r2) const
@ -1661,7 +1674,7 @@ public:
uint16_t* blockNum); uint16_t* blockNum);
inline void setStringStore(boost::shared_ptr<StringStore>); inline void setStringStore(boost::shared_ptr<StringStore>);
const CHARSET_INFO* getCharset(uint32_t col); const CHARSET_INFO* getCharset(uint32_t col);
private: private:
@ -1682,7 +1695,7 @@ private:
// For string collation // For string collation
std::vector<uint32_t> charsetNumbers; std::vector<uint32_t> charsetNumbers;
std::vector<CHARSET_INFO*> charsets; std::vector<CHARSET_INFO*> charsets;
// DECIMAL support. For non-decimal fields, the values are 0. // DECIMAL support. For non-decimal fields, the values are 0.
std::vector<uint32_t> scale; std::vector<uint32_t> scale;
std::vector<uint32_t> precision; std::vector<uint32_t> precision;