You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-30 19:23:07 +03:00
Merge pull request #2004 from drrtuy/MCOL-4759
MCOL-4759 Upmerge for MCOL-4564 code that implements hash merging fam…
This commit is contained in:
57
utils/common/hashfamily.h
Normal file
57
utils/common/hashfamily.h
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
/* Copyright (C) 2021 Mariadb Corporation.
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU General Public License
|
||||||
|
as published by the Free Software Foundation; version 2 of
|
||||||
|
the License.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||||
|
MA 02110-1301, USA. */
|
||||||
|
|
||||||
|
#ifndef UTILS_HASHFAMILY_H
|
||||||
|
#define UTILS_HASHFAMILY_H
|
||||||
|
|
||||||
|
#include "hasher.h"
|
||||||
|
#include "collation.h"
|
||||||
|
|
||||||
|
namespace utils
|
||||||
|
{
|
||||||
|
|
||||||
|
class HashFamily
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
HashFamily(const utils::Hasher_r& h,
|
||||||
|
const uint64_t intermediateHash,
|
||||||
|
const uint64_t len,
|
||||||
|
const datatypes::MariaDBHasher& hM) : mHasher(h),
|
||||||
|
mMariaDBHasher(hM),
|
||||||
|
mHasher_rHash(intermediateHash),
|
||||||
|
mHasher_rLen(len)
|
||||||
|
{ }
|
||||||
|
|
||||||
|
// Algorithm, seed and factor are taken from this discussion
|
||||||
|
// https://stackoverflow.com/questions/1646807/quick-and-simple-hash-code-combinations
|
||||||
|
inline uint64_t finalize() const
|
||||||
|
{
|
||||||
|
return (seed * factor + mHasher.finalize(mHasher_rHash, mHasher_rLen)) * factor + mMariaDBHasher.finalize();
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
constexpr static uint64_t seed = 1009ULL;
|
||||||
|
constexpr static uint64_t factor = 9176ULL;
|
||||||
|
|
||||||
|
const utils::Hasher_r& mHasher;
|
||||||
|
const datatypes::MariaDBHasher& mMariaDBHasher;
|
||||||
|
const uint64_t mHasher_rHash;
|
||||||
|
const uint32_t mHasher_rLen;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// vim:ts=2 sw=2:
|
@ -60,7 +60,7 @@
|
|||||||
#include "../winport/winport.h"
|
#include "../winport/winport.h"
|
||||||
|
|
||||||
#include "collation.h"
|
#include "collation.h"
|
||||||
|
#include "common/hashfamily.h"
|
||||||
|
|
||||||
// Workaround for my_global.h #define of isnan(X) causing a std::std namespace
|
// Workaround for my_global.h #define of isnan(X) causing a std::std namespace
|
||||||
|
|
||||||
@ -70,57 +70,57 @@ namespace rowgroup
|
|||||||
const int16_t rgCommonSize = 8192;
|
const int16_t rgCommonSize = 8192;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
The RowGroup family of classes encapsulate the data moved through the
|
The RowGroup family of classes encapsulate the data moved through the
|
||||||
system.
|
system.
|
||||||
|
|
||||||
- RowGroup specifies the format of the data primarily (+ some other metadata),
|
- RowGroup specifies the format of the data primarily (+ some other metadata),
|
||||||
- RGData (aka RowGroup Data) encapsulates the data,
|
- RGData (aka RowGroup Data) encapsulates the data,
|
||||||
- Row is used to extract fields from the data and iterate.
|
- Row is used to extract fields from the data and iterate.
|
||||||
|
|
||||||
JobListFactory instantiates the RowGroups to be used by each stage of processing.
|
JobListFactory instantiates the RowGroups to be used by each stage of processing.
|
||||||
RGDatas are passed between stages, and their RowGroup instances are used
|
RGDatas are passed between stages, and their RowGroup instances are used
|
||||||
to interpret them.
|
to interpret them.
|
||||||
|
|
||||||
Historically, row data was just a chunk of contiguous memory, a uint8_t *.
|
Historically, row data was just a chunk of contiguous memory, a uint8_t *.
|
||||||
Every field had a fixed width, which allowed for quick offset
|
Every field had a fixed width, which allowed for quick offset
|
||||||
calculation when assigning or retrieving individual fields. That worked
|
calculation when assigning or retrieving individual fields. That worked
|
||||||
well for a few years, but at some point it became common to declare
|
well for a few years, but at some point it became common to declare
|
||||||
all strings as max-length, and to manipulate them in queries.
|
all strings as max-length, and to manipulate them in queries.
|
||||||
|
|
||||||
Having fixed-width fields, even for strings, required an unreasonable
|
Having fixed-width fields, even for strings, required an unreasonable
|
||||||
amount of memory. RGData & StringStore were introduced to handle strings
|
amount of memory. RGData & StringStore were introduced to handle strings
|
||||||
more efficiently, at least with respect to memory. The row data would
|
more efficiently, at least with respect to memory. The row data would
|
||||||
still be a uint8_t *, and columns would be fixed-width, but string fields
|
still be a uint8_t *, and columns would be fixed-width, but string fields
|
||||||
above a certain width would contain a 'Pointer' that referenced a string in
|
above a certain width would contain a 'Pointer' that referenced a string in
|
||||||
StringStore. Strings are stored efficiently in StringStore, so there is
|
StringStore. Strings are stored efficiently in StringStore, so there is
|
||||||
no longer wasted space.
|
no longer wasted space.
|
||||||
|
|
||||||
StringStore comes with a different inefficiency however. When a value
|
StringStore comes with a different inefficiency however. When a value
|
||||||
is overwritten, the original string cannot be freed independently of the
|
is overwritten, the original string cannot be freed independently of the
|
||||||
others, so it continues to use space. If values are only set once, as is
|
others, so it continues to use space. If values are only set once, as is
|
||||||
the typical case, then StringStore is efficient. When it is necessary
|
the typical case, then StringStore is efficient. When it is necessary
|
||||||
to overwrite string fields, it is possible to configure these classes
|
to overwrite string fields, it is possible to configure these classes
|
||||||
to use the original data format so that old string fields do not accumulate
|
to use the original data format so that old string fields do not accumulate
|
||||||
in memory. Of course, be careful, because blobs and text fields in CS are
|
in memory. Of course, be careful, because blobs and text fields in CS are
|
||||||
declared as 2GB strings!
|
declared as 2GB strings!
|
||||||
|
|
||||||
A single RGData contains up to one 'logical block' worth of data,
|
A single RGData contains up to one 'logical block' worth of data,
|
||||||
which is 8192 rows. One RGData is usually treated as one unit of work by
|
which is 8192 rows. One RGData is usually treated as one unit of work by
|
||||||
PrimProc and the JobSteps, but the rows an RGData contains and how many are
|
PrimProc and the JobSteps, but the rows an RGData contains and how many are
|
||||||
treated as a work unit depend on the operation being done.
|
treated as a work unit depend on the operation being done.
|
||||||
|
|
||||||
For example, PrimProc works in units of 8192 contiguous rows
|
For example, PrimProc works in units of 8192 contiguous rows
|
||||||
that come from disk. If half of the rows were filtered out, then the
|
that come from disk. If half of the rows were filtered out, then the
|
||||||
RGData it passes to the next stage would only contain 4096 rows.
|
RGData it passes to the next stage would only contain 4096 rows.
|
||||||
|
|
||||||
Others build results incrementally before passing them along, such as
|
Others build results incrementally before passing them along, such as
|
||||||
group-by. If one group contains 11111 values, then group-by will
|
group-by. If one group contains 11111 values, then group-by will
|
||||||
return 2 RGDatas for that group, one with 8192 rows, and one with 2919.
|
return 2 RGDatas for that group, one with 8192 rows, and one with 2919.
|
||||||
|
|
||||||
Note: There is no synchronization in any of these classes for obvious
|
Note: There is no synchronization in any of these classes for obvious
|
||||||
performance reasons. Likewise, although it's technically safe for many
|
performance reasons. Likewise, although it's technically safe for many
|
||||||
readers to access an RGData simultaneously, that would not be an
|
readers to access an RGData simultaneously, that would not be an
|
||||||
efficient thing to do. Try to stick to designs where a single RGData
|
efficient thing to do. Try to stick to designs where a single RGData
|
||||||
is used by a single thread at a time.
|
is used by a single thread at a time.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -138,7 +138,7 @@ inline T derefFromTwoVectorPtrs(const std::vector<T>* outer,
|
|||||||
const T innerIdx)
|
const T innerIdx)
|
||||||
{
|
{
|
||||||
auto outerIdx = inner->operator[](innerIdx);
|
auto outerIdx = inner->operator[](innerIdx);
|
||||||
return outer->operator[](outerIdx);
|
return outer->operator[](outerIdx);
|
||||||
}
|
}
|
||||||
|
|
||||||
class StringStore
|
class StringStore
|
||||||
@ -375,7 +375,7 @@ public:
|
|||||||
inline execplan::CalpontSystemCatalog::ColDataType* getColTypes();
|
inline execplan::CalpontSystemCatalog::ColDataType* getColTypes();
|
||||||
inline const execplan::CalpontSystemCatalog::ColDataType* getColTypes() const;
|
inline const execplan::CalpontSystemCatalog::ColDataType* getColTypes() const;
|
||||||
inline uint32_t getCharsetNumber(uint32_t colIndex) const;
|
inline uint32_t getCharsetNumber(uint32_t colIndex) const;
|
||||||
|
|
||||||
// this returns true if the type is not CHAR or VARCHAR
|
// this returns true if the type is not CHAR or VARCHAR
|
||||||
inline bool isCharType(uint32_t colIndex) const;
|
inline bool isCharType(uint32_t colIndex) const;
|
||||||
inline bool isUnsigned(uint32_t colIndex) const;
|
inline bool isUnsigned(uint32_t colIndex) const;
|
||||||
@ -429,7 +429,7 @@ public:
|
|||||||
inline bool equals(long double val, uint32_t colIndex) const;
|
inline bool equals(long double val, uint32_t colIndex) const;
|
||||||
bool equals(const std::string& val, uint32_t colIndex) const;
|
bool equals(const std::string& val, uint32_t colIndex) const;
|
||||||
inline bool equals(const int128_t& val, uint32_t colIndex) const;
|
inline bool equals(const int128_t& val, uint32_t colIndex) const;
|
||||||
|
|
||||||
inline double getDoubleField(uint32_t colIndex) const;
|
inline double getDoubleField(uint32_t colIndex) const;
|
||||||
inline float getFloatField(uint32_t colIndex) const;
|
inline float getFloatField(uint32_t colIndex) const;
|
||||||
inline datatypes::Decimal getDecimalField(uint32_t colIndex) const
|
inline datatypes::Decimal getDecimalField(uint32_t colIndex) const
|
||||||
@ -513,7 +513,7 @@ public:
|
|||||||
inline T* getBinaryField(T* argtype, uint32_t colIndex) const;
|
inline T* getBinaryField(T* argtype, uint32_t colIndex) const;
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline T* getBinaryField_offset(uint32_t offset) const;
|
inline T* getBinaryField_offset(uint32_t offset) const;
|
||||||
|
|
||||||
inline boost::shared_ptr<mcsv1sdk::UserData> getUserData(uint32_t colIndex) const;
|
inline boost::shared_ptr<mcsv1sdk::UserData> getUserData(uint32_t colIndex) const;
|
||||||
inline void setUserData(mcsv1sdk::mcsv1Context& context,
|
inline void setUserData(mcsv1sdk::mcsv1Context& context,
|
||||||
boost::shared_ptr<mcsv1sdk::UserData> userData,
|
boost::shared_ptr<mcsv1sdk::UserData> userData,
|
||||||
@ -569,18 +569,21 @@ public:
|
|||||||
// a fcn to check the type defs seperately doesn't exist yet. No normalization.
|
// a fcn to check the type defs seperately doesn't exist yet. No normalization.
|
||||||
inline uint64_t hash(uint32_t lastCol) const; // generates a hash for cols [0-lastCol]
|
inline uint64_t hash(uint32_t lastCol) const; // generates a hash for cols [0-lastCol]
|
||||||
inline uint64_t hash() const; // generates a hash for all cols
|
inline uint64_t hash() const; // generates a hash for all cols
|
||||||
inline void colUpdateMariaDBHasher(datatypes::MariaDBHasher &hasher, uint32_t col) const;
|
inline void colUpdateHasher(datatypes::MariaDBHasher& hM,
|
||||||
inline void colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t keyColsIdx,
|
const utils::Hasher_r& h,
|
||||||
const std::vector<uint32_t>& keyCols,
|
const uint32_t col,
|
||||||
const std::vector<uint32_t>* smallSideKeyColumnsIds,
|
uint32_t& intermediateHash) const;
|
||||||
const std::vector<uint32_t>* smallSideColumnsWidths) const;
|
inline void colUpdateHasherTypeless(datatypes::MariaDBHasher &hasher, uint32_t keyColsIdx,
|
||||||
|
const std::vector<uint32_t>& keyCols,
|
||||||
|
const std::vector<uint32_t>* smallSideKeyColumnsIds,
|
||||||
|
const std::vector<uint32_t>* smallSideColumnsWidths) const;
|
||||||
inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols,
|
inline uint64_t hashTypeless(const std::vector<uint32_t>& keyCols,
|
||||||
const std::vector<uint32_t>* smallSideKeyColumnsIds,
|
const std::vector<uint32_t>* smallSideKeyColumnsIds,
|
||||||
const std::vector<uint32_t>* smallSideColumnsWidths) const
|
const std::vector<uint32_t>* smallSideColumnsWidths) const
|
||||||
{
|
{
|
||||||
datatypes::MariaDBHasher h;
|
datatypes::MariaDBHasher h;
|
||||||
for (uint32_t i = 0; i < keyCols.size(); i++)
|
for (uint32_t i = 0; i < keyCols.size(); i++)
|
||||||
colUpdateMariaDBHasherTypeless(h, i, keyCols, smallSideKeyColumnsIds, smallSideColumnsWidths);
|
colUpdateHasherTypeless(h, i, keyCols, smallSideKeyColumnsIds, smallSideColumnsWidths);
|
||||||
return h.finalize();
|
return h.finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -591,7 +594,7 @@ public:
|
|||||||
{
|
{
|
||||||
userDataStore = u;
|
userDataStore = u;
|
||||||
}
|
}
|
||||||
|
|
||||||
const CHARSET_INFO* getCharset(uint32_t col) const;
|
const CHARSET_INFO* getCharset(uint32_t col) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -946,7 +949,10 @@ inline utils::ConstString Row::getConstString(uint32_t colIndex) const
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t col) const
|
inline void Row::colUpdateHasher(datatypes::MariaDBHasher& hM,
|
||||||
|
const utils::Hasher_r& h,
|
||||||
|
const uint32_t col,
|
||||||
|
uint32_t& intermediateHash) const
|
||||||
{
|
{
|
||||||
switch (getColType(col))
|
switch (getColType(col))
|
||||||
{
|
{
|
||||||
@ -956,17 +962,19 @@ inline void Row::colUpdateMariaDBHasher(datatypes::MariaDBHasher &h, uint32_t co
|
|||||||
case execplan::CalpontSystemCatalog::TEXT:
|
case execplan::CalpontSystemCatalog::TEXT:
|
||||||
{
|
{
|
||||||
CHARSET_INFO *cs = getCharset(col);
|
CHARSET_INFO *cs = getCharset(col);
|
||||||
h.add(cs, getConstString(col));
|
hM.add(cs, getConstString(col));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
h.add(&my_charset_bin, getShortConstString(col));
|
{
|
||||||
|
intermediateHash = h((const char*) &data[offsets[col]], colWidths[col], intermediateHash);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline void Row::colUpdateMariaDBHasherTypeless(datatypes::MariaDBHasher &h, uint32_t keyColsIdx,
|
inline void Row::colUpdateHasherTypeless(datatypes::MariaDBHasher &h, uint32_t keyColsIdx,
|
||||||
const std::vector<uint32_t>& keyCols,
|
const std::vector<uint32_t>& keyCols,
|
||||||
const std::vector<uint32_t>* smallSideKeyColumnsIds,
|
const std::vector<uint32_t>* smallSideKeyColumnsIds,
|
||||||
const std::vector<uint32_t>* smallSideColumnsWidths) const
|
const std::vector<uint32_t>* smallSideColumnsWidths) const
|
||||||
@ -1472,7 +1480,12 @@ inline uint64_t Row::hash() const
|
|||||||
|
|
||||||
inline uint64_t Row::hash(uint32_t lastCol) const
|
inline uint64_t Row::hash(uint32_t lastCol) const
|
||||||
{
|
{
|
||||||
datatypes::MariaDBHasher h;
|
// Use two hash classes. MariaDBHasher for text-based
|
||||||
|
// collation-aware data types and Hasher_r for all other data types.
|
||||||
|
// We deliver a hash that is a combination of both hashers' results.
|
||||||
|
utils::Hasher_r h;
|
||||||
|
datatypes::MariaDBHasher hM;
|
||||||
|
uint32_t intermediateHash = 0;
|
||||||
|
|
||||||
// Sometimes we ask this to hash 0 bytes, and it comes through looking like
|
// Sometimes we ask this to hash 0 bytes, and it comes through looking like
|
||||||
// lastCol = -1. Return 0.
|
// lastCol = -1. Return 0.
|
||||||
@ -1480,9 +1493,9 @@ inline uint64_t Row::hash(uint32_t lastCol) const
|
|||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
for (uint32_t i = 0; i <= lastCol; i++)
|
for (uint32_t i = 0; i <= lastCol; i++)
|
||||||
colUpdateMariaDBHasher(h, i);
|
colUpdateHasher(hM, h, i, intermediateHash);
|
||||||
|
|
||||||
return h.finalize();
|
return utils::HashFamily(h, intermediateHash, lastCol << 2, hM).finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool Row::equals(const Row& r2) const
|
inline bool Row::equals(const Row& r2) const
|
||||||
@ -1661,7 +1674,7 @@ public:
|
|||||||
uint16_t* blockNum);
|
uint16_t* blockNum);
|
||||||
|
|
||||||
inline void setStringStore(boost::shared_ptr<StringStore>);
|
inline void setStringStore(boost::shared_ptr<StringStore>);
|
||||||
|
|
||||||
const CHARSET_INFO* getCharset(uint32_t col);
|
const CHARSET_INFO* getCharset(uint32_t col);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -1682,7 +1695,7 @@ private:
|
|||||||
// For string collation
|
// For string collation
|
||||||
std::vector<uint32_t> charsetNumbers;
|
std::vector<uint32_t> charsetNumbers;
|
||||||
std::vector<CHARSET_INFO*> charsets;
|
std::vector<CHARSET_INFO*> charsets;
|
||||||
|
|
||||||
// DECIMAL support. For non-decimal fields, the values are 0.
|
// DECIMAL support. For non-decimal fields, the values are 0.
|
||||||
std::vector<uint32_t> scale;
|
std::vector<uint32_t> scale;
|
||||||
std::vector<uint32_t> precision;
|
std::vector<uint32_t> precision;
|
||||||
|
Reference in New Issue
Block a user