
[MCOL-4849] Parallelize the processing of the bytestream vector.

This patch changes the logic of the `receiveMultiPrimitiveMessages`
function in the following way:

1. There is now a single aggregation thread that reads data from the queue
   (which is populated with messages from the BPPs).
2. Processing of the received `bytestream vector` can run in parallel,
   depending on the type of `TupleBPS` operation (join, fe2, ...) and the
   actual thread pool workload.

The motivation is to reduce the number of context switches.
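As a rough illustration of the pattern (a minimal sketch; `aggregationThread`, `processRange`, `batchQueue`, and the plain `std::thread` fan-out are made-up stand-ins, not the actual TupleBPS code):

#include <algorithm>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

using ByteStream = std::vector<uint8_t>;        // stand-in for messageqcpp::ByteStream

std::queue<std::vector<ByteStream>> batchQueue; // filled with BPP responses by a producer
std::mutex mtx;
std::condition_variable cv;
bool finished = false;                          // set by the producer when no more batches come

// [begin, end) -- `end` is non-inclusive, mirroring processByteStreamVector.
void processRange(const std::vector<ByteStream>& bsv, uint32_t begin, uint32_t end)
{
    for (uint32_t i = begin; i < end; ++i)
        std::cout << "processed " << bsv[i].size() << " bytes\n";
}

// The single aggregation thread: drain the queue; fan a batch out to worker
// threads only when the step is parallelizable and spare threads exist.
void aggregationThread(uint32_t numProcessorThreads)
{
    for (;;)
    {
        std::vector<ByteStream> bsv;
        {
            std::unique_lock<std::mutex> lk(mtx);
            cv.wait(lk, [] { return finished || !batchQueue.empty(); });
            if (batchQueue.empty())
                return;                         // producer signalled completion
            bsv = std::move(batchQueue.front());
            batchQueue.pop();
        }
        const uint32_t n = static_cast<uint32_t>(bsv.size());
        if (numProcessorThreads <= 1)
        {
            processRange(bsv, 0, n);            // cheap step: stay on this thread
            continue;
        }
        // Split the batch into near-equal chunks and process them in parallel.
        std::vector<std::thread> workers;
        const uint32_t chunk = (n + numProcessorThreads - 1) / numProcessorThreads;
        for (uint32_t begin = 0; begin < n; begin += chunk)
        {
            const uint32_t end = std::min(begin + chunk, n);
            workers.emplace_back(processRange, std::cref(bsv), begin, end);
        }
        for (auto& w : workers)
            w.join();
    }
}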
Denis Khalikov
2021-10-13 15:31:05 +03:00
parent 650d45fcc1
commit b382f681a1
4 changed files with 786 additions and 665 deletions

View File

@@ -1075,6 +1075,24 @@ public:
virtual void setFE23Output(const rowgroup::RowGroup& rg) = 0;
};
struct _CPInfo
{
_CPInfo(int64_t MIN, int64_t MAX, uint64_t l, bool val) : min(MIN), max(MAX), LBID(l), valid(val){};
_CPInfo(int128_t BIGMIN, int128_t BIGMAX, uint64_t l, bool val)
: bigMin(BIGMIN), bigMax(BIGMAX), LBID(l), valid(val){};
union
{
int128_t bigMin;
int64_t min;
};
union
{
int128_t bigMax;
int64_t max;
};
uint64_t LBID;
bool valid;
};
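A short usage sketch of the unions above (illustrative only; `collectCP` is a made-up helper, and `int128_t` is assumed to be ColumnStore's 128-bit integer type):

// Casual-partitioning info carries either a 64-bit or a 128-bit extent
// range in the same footprint; the constructor overload picks the member.
void collectCP(std::vector<_CPInfo>& cpv, bool wideDecimal, uint64_t lbid)
{
    if (wideDecimal)
    {
        int128_t lo = 0, hi = 0;              // wide-decimal extent min/max
        cpv.emplace_back(lo, hi, lbid, true); // fills bigMin/bigMax
    }
    else
    {
        int64_t lo = 0, hi = 0;               // ordinary 64-bit extent min/max
        cpv.emplace_back(lo, hi, lbid, true); // fills min/max
    }
}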
/** @brief class TupleBPS
*
@@ -1113,7 +1131,12 @@ public:
*
* The main loop for the receive-side thread. Don't call it directly.
*/
void receiveMultiPrimitiveMessages(uint32_t threadID);
void receiveMultiPrimitiveMessages();
// Processes the vector of `bytestream`s from index `begin` to index `end`, non-inclusive.
void processByteStreamVector(vector<boost::shared_ptr<messageqcpp::ByteStream>>& bsv,
const uint32_t begin, const uint32_t end, vector<_CPInfo>& cpv,
RowGroupDL* dlp, const uint32_t threadID);
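For clarity, the `[begin, end)` convention can be expressed as a small partitioning helper (an illustration, not code from the patch):

#include <algorithm>
#include <cstdint>
#include <utility>

// Slice handled by thread `tid` out of `nThreads`; slice sizes differ by at most 1.
std::pair<uint32_t, uint32_t> sliceFor(uint32_t tid, uint32_t nThreads, uint32_t vecSize)
{
    const uint32_t chunk = vecSize / nThreads;
    const uint32_t rem = vecSize % nThreads;
    const uint32_t begin = tid * chunk + std::min(tid, rem);
    const uint32_t end = begin + chunk + (tid < rem ? 1 : 0);
    return {begin, end};                      // `end` is non-inclusive
}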
/** @brief Add a filter when the column is anything but a 4-byte float type.
*
@@ -1352,6 +1375,10 @@ private:
void startPrimitiveThread();
void startAggregationThread();
// Starts a thread that processes the `bytestream` vector from index `begin` to index `end`, non-inclusive.
void startProcessingThread(TupleBPS* tbps, vector<boost::shared_ptr<messageqcpp::ByteStream>>& bsv,
const uint32_t begin, const uint32_t end, vector<_CPInfo>& cpv, RowGroupDL* dlp,
const uint32_t threadID);
void initializeConfigParms();
uint64_t getFBO(uint64_t lbid);
void checkDupOutputColumns(const rowgroup::RowGroup& rg);
@@ -1359,7 +1386,6 @@ private:
void dupOutputColumns(rowgroup::RGData&, rowgroup::RowGroup&);
void rgDataToDl(rowgroup::RGData&, rowgroup::RowGroup&, RowGroupDL*);
void rgDataVecToDl(std::vector<rowgroup::RGData>&, rowgroup::RowGroup&, RowGroupDL*);
DistributedEngineComm* fDec;
boost::shared_ptr<BatchPrimitiveProcessorJL> fBPP;
uint16_t fNumSteps;
@@ -1373,6 +1399,7 @@ private:
PrimitiveStepType ffirstStepType;
bool isFilterFeeder;
std::vector<uint64_t> fProducerThreads; // thread pool handles
std::vector<uint64_t> fProcessorThreads;
messageqcpp::ByteStream fFilterString;
uint32_t fFilterCount;
execplan::CalpontSystemCatalog::ColType fColType;
@@ -1470,13 +1497,10 @@ private:
rowgroup::RGData fe2Data;
rowgroup::Row fe2InRow, fe2OutRow;
void processFE2(rowgroup::RowGroup& input, rowgroup::RowGroup& output,
rowgroup::Row& inRow, rowgroup::Row& outRow,
std::vector<rowgroup::RGData>* rgData,
funcexp::FuncExpWrapper* localFE2);
void processFE2_oneRG(rowgroup::RowGroup& input, rowgroup::RowGroup& output,
rowgroup::Row& inRow, rowgroup::Row& outRow,
funcexp::FuncExpWrapper* localFE2);
void processFE2(rowgroup::RowGroup& input, rowgroup::RowGroup& output, rowgroup::Row& inRow, rowgroup::Row& outRow,
std::vector<rowgroup::RGData>* rgData, funcexp::FuncExpWrapper* localFE2);
void processFE2_oneRG(rowgroup::RowGroup& input, rowgroup::RowGroup& output, rowgroup::Row& inRow,
rowgroup::Row& outRow, funcexp::FuncExpWrapper* localFE2);
/* Runtime Casual Partitioning adjustments. The CP code is needlessly complicated;
* to avoid making it worse, decided to designate 'scanFlags' as the static
@@ -1491,11 +1515,98 @@ private:
boost::shared_ptr<RowGroupDL> deliveryDL;
uint32_t deliveryIt;
struct JoinLocalData
{
JoinLocalData() = delete;
JoinLocalData(const JoinLocalData&) = delete;
JoinLocalData(JoinLocalData&&) = delete;
JoinLocalData& operator=(const JoinLocalData&) = delete;
JoinLocalData& operator=(JoinLocalData&&) = delete;
~JoinLocalData() = default;
JoinLocalData(rowgroup::RowGroup& primRowGroup, rowgroup::RowGroup& outputRowGroup,
boost::shared_ptr<funcexp::FuncExpWrapper>& fe2, rowgroup::RowGroup& fe2Output,
std::vector<rowgroup::RowGroup>& joinerMatchesRGs, rowgroup::RowGroup& joinFERG,
std::vector<boost::shared_ptr<joiner::TupleJoiner>>& tjoiners, uint32_t smallSideCount,
bool doJoin);
rowgroup::RowGroup local_primRG;
rowgroup::RowGroup local_outputRG;
uint32_t cachedIO_Thread = 0;
uint32_t physIO_Thread = 0;
uint32_t touchedBlocks_Thread = 0;
int64_t ridsReturned_Thread = 0;
// On init.
bool doJoin;
boost::shared_ptr<funcexp::FuncExpWrapper> fe2;
rowgroup::RowGroup fe2Output;
uint32_t smallSideCount;
std::vector<rowgroup::RowGroup> joinerMatchesRGs;
rowgroup::RowGroup joinFERG;
std::vector<boost::shared_ptr<joiner::TupleJoiner>> tjoiners;
// Join vars.
vector<vector<rowgroup::Row::Pointer>> joinerOutput;
rowgroup::Row largeSideRow;
rowgroup::Row joinedBaseRow;
rowgroup::Row largeNull;
rowgroup::Row joinFERow; // LSR clean
boost::scoped_array<rowgroup::Row> smallSideRows;
boost::scoped_array<rowgroup::Row> smallNulls;
boost::scoped_array<uint8_t> joinedBaseRowData;
boost::scoped_array<uint8_t> joinFERowData;
boost::shared_array<int> largeMapping;
vector<boost::shared_array<int>> smallMappings;
vector<boost::shared_array<int>> fergMappings;
rowgroup::RGData joinedData;
boost::scoped_array<uint8_t> largeNullMemory;
boost::scoped_array<boost::shared_array<uint8_t>> smallNullMemory;
uint32_t matchCount;
rowgroup::Row postJoinRow;
rowgroup::RowGroup local_fe2Output;
rowgroup::RGData local_fe2Data;
rowgroup::Row local_fe2OutRow;
funcexp::FuncExpWrapper local_fe2;
};
// The actual value is initialized in TupleBPS::initializeConfigParms().
uint32_t fMaxNumProcessorThreads = 16;
// Set at runtime for each `TupleBPS` operation, based on the `ThreadPool` workload.
uint32_t fNumProcessorThreads = 1;
std::shared_ptr<JoinLocalData> getJoinLocalDataByIndex(uint32_t index)
{
idbassert(index < fNumProcessorThreads && joinLocalDataPool.size() == fNumProcessorThreads);
return joinLocalDataPool[index];
}
void initializeJoinLocalDataPool(uint32_t numThreads)
{
idbassert(numThreads <= fMaxNumProcessorThreads);
for (uint32_t i = 0; i < numThreads; ++i)
{
joinLocalDataPool.push_back(std::shared_ptr<JoinLocalData>(
new JoinLocalData(primRowGroup, outputRowGroup, fe2, fe2Output, joinerMatchesRGs, joinFERG,
tjoiners, smallSideCount, doJoin)));
}
fNumProcessorThreads = numThreads;
}
// Join local data vector.
std::vector<std::shared_ptr<JoinLocalData>> joinLocalDataPool;
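The point of the pool is that each processing thread owns one `JoinLocalData` slot, so the join scratch state needs no locking. A hypothetical worker body (the method name `processSlice` is made up for illustration):

void TupleBPS::processSlice(uint32_t threadID)    // illustrative, not a real method
{
    std::shared_ptr<JoinLocalData> local = getJoinLocalDataByIndex(threadID);
    local->cachedIO_Thread = 0;                   // per-thread I/O counters
    local->physIO_Thread = 0;
    local->touchedBlocks_Thread = 0;
    // ... perform joins through local->tjoiners into local->joinedData,
    // then fold the per-thread counters back under a single lock at the end.
}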
/* shared nothing support */
struct Job
{
Job(uint32_t d, uint32_t n, uint32_t b, boost::shared_ptr<messageqcpp::ByteStream>& bs) :
dbroot(d), connectionNum(n), expectedResponses(b), msg(bs) { }
Job(uint32_t d, uint32_t n, uint32_t b, boost::shared_ptr<messageqcpp::ByteStream>& bs)
: dbroot(d), connectionNum(n), expectedResponses(b), msg(bs)
{
}
uint32_t dbroot;
uint32_t connectionNum;
uint32_t expectedResponses;
@@ -1523,7 +1634,6 @@ private:
bool compareRange(uint8_t COP, int64_t min, int64_t max, int64_t val) const;
bool hasPCFilter, hasPMFilter, hasRIDFilter, hasSegmentFilter, hasDBRootFilter, hasSegmentDirFilter,
hasPartitionFilter, hasMaxFilter, hasMinFilter, hasLBIDFilter, hasExtentIDFilter;
};
/** @brief class FilterStep

File diff suppressed because it is too large.

View File

@@ -908,8 +908,7 @@ void TupleJoiner::setInUM(vector<RGData> &rgs)
}
}
void TupleJoiner::setPMJoinResults(boost::shared_array<vector<uint32_t> > jr,
uint32_t threadID)
void TupleJoiner::setPMJoinResults(boost::shared_array<vector<uint32_t>> jr, uint32_t threadID)
{
pmJoinResults[threadID] = jr;
}

View File

@@ -225,6 +225,10 @@ public:
return fMaxThreads;
}
/** @brief get the number of issued threads
*/
inline size_t getIssuedThreads() { return fIssued; }
/** @brief queue size accessor
*
*/
@@ -292,6 +296,7 @@ public:
fDebug = d;
}
friend class ThreadPoolMonitor;
protected:
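The new accessor is what lets TupleBPS size `fNumProcessorThreads` from the pool's current load. One plausible sizing rule (an assumption for illustration: the accessor returning `fMaxThreads` is assumed to be named `getMaxThreads()`, and the formula is not taken from the patch):

#include <algorithm>
#include <cstddef>
#include <cstdint>

uint32_t pickProcessorThreads(threadpool::ThreadPool& pool, uint32_t maxNumProcessorThreads)
{
    const size_t issued = pool.getIssuedThreads();   // accessor added by this patch
    const size_t capacity = pool.getMaxThreads();    // assumed name of the fMaxThreads accessor
    const size_t spare = capacity > issued ? capacity - issued : 0;
    // Always keep at least one thread so the TupleBPS can make progress.
    return std::max<uint32_t>(1, std::min<uint32_t>(maxNumProcessorThreads,
                                                    static_cast<uint32_t>(spare)));
}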