You've already forked mariadb-columnstore-engine
mirror of
https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
synced 2025-07-27 21:01:50 +03:00
fix(aggregation, disk-based) MCOL-5691 distinct aggregate disk based (#3145)
* fix(aggregation, disk-based): MCOL-5689 Fix disk-based distinct aggregation functions. Previously, disk-based distinct aggregation functions produced incorrect results because no finalization was applied to previous generations stored on disk.
* fix(aggregation, disk-based): Fix disk-based COUNT(DISTINCT ...) queries. (Case 2). (Distinct & Multi-Distinct, Single- & Multi-Threaded).
* fix(aggregation, disk-based): Fix disk-based DISTINCT & GROUP BY queries. (Case 1). (Distinct & Multi-Distinct, Single- & Multi-Threaded).
---------
Co-authored-by: Theresa Hradilak <theresa.hradilak@gmail.com>
Co-authored-by: Roman Nozdrin <rnozdrin@mariadb.com>
This commit is contained in:
@ -289,6 +289,7 @@ TupleAggregateStep::TupleAggregateStep(const SP_ROWAGG_UM_t& agg, const RowGroup
|
||||
fNumOfBuckets =
|
||||
calcNumberOfBuckets(memLimit, fNumOfThreads, fNumOfBuckets, fNumOfRowGroups, fRowGroupIn.getRowSize(),
|
||||
fRowGroupOut.getRowSize(), fRm->getAllowDiskAggregation());
|
||||
|
||||
fNumOfThreads = std::min(fNumOfThreads, fNumOfBuckets);
|
||||
|
||||
fMemUsage.reset(new uint64_t[fNumOfThreads]);
|
||||
@ -392,7 +393,7 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
|
||||
rowGroupIn->initRow(&rowIn);
|
||||
auto* subDistAgg = dynamic_cast<RowAggregationUM*>(multiDist->subAggregators()[j].get());
|
||||
|
||||
while (subDistAgg->nextRowGroup())
|
||||
while (subDistAgg->nextOutputRowGroup())
|
||||
{
|
||||
rowGroupIn = (multiDist->subAggregators()[j]->getOutputRowGroup());
|
||||
rgDataVec.emplace_back(subDistAgg->moveCurrentRGData());
|
||||
@ -416,7 +417,7 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
|
||||
rowGroupIn->initRow(&rowIn);
|
||||
auto* subAgg = dynamic_cast<RowAggregationUM*>(aggDist->aggregator().get());
|
||||
|
||||
while (subAgg->nextRowGroup())
|
||||
while (subAgg->nextOutputRowGroup())
|
||||
{
|
||||
rowGroupIn->setData(aggDist->aggregator()->getOutputRowGroup()->getRGData());
|
||||
rgDataVec.emplace_back(subAgg->moveCurrentRGData());
|
||||
@ -571,7 +572,7 @@ bool TupleAggregateStep::nextDeliveredRowGroup()
|
||||
{
|
||||
for (; fBucketNum < fNumOfBuckets; fBucketNum++)
|
||||
{
|
||||
while (fAggregators[fBucketNum]->nextRowGroup())
|
||||
while (fAggregators[fBucketNum]->nextOutputRowGroup())
|
||||
{
|
||||
fAggregators[fBucketNum]->finalize();
|
||||
fRowGroupDelivered.setData(fAggregators[fBucketNum]->getOutputRowGroup()->getRGData());
|
||||
@ -5708,14 +5709,27 @@ void TupleAggregateStep::doAggregate()
|
||||
return;
|
||||
}
|
||||
|
||||
/** @brief Aggregate input row groups in two-phase multi-threaded aggregation.
|
||||
* In second phase handle three different aggregation cases differently:
|
||||
* 1. Query contains at least one aggregation on a DISTINCT column, e.g. SUM (DISTINCT col1) AND at least one
|
||||
* GROUP BY column
|
||||
* 2. Query contains at least one aggregation on a DISTINCT column but no GROUP BY column
|
||||
* 3. Query contains no aggregation on a DISTINCT column, but at least one GROUP BY column
|
||||
* DISTINCT selects (e.g. SELECT DISTINCT col1 FROM ...) are handled in tupleannexstep.cpp.
|
||||
*/
|
||||
uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp)
|
||||
{
|
||||
uint32_t i;
|
||||
RGData rgData;
|
||||
// initialize return value variable
|
||||
uint64_t rowCount = 0;
|
||||
|
||||
try
|
||||
{
|
||||
/*
|
||||
* Phase 1: Distribute input rows to different buckets depending on the hash value of the group by columns
|
||||
* per row. Then distribute buckets equally on aggregators in fAggregators. (Number of fAggregators ==
|
||||
* fNumOfBuckets). Each previously created hash bucket is represented as one RowGroup in a fAggregator.
|
||||
*/
|
||||
|
||||
if (!fDoneAggregate)
|
||||
{
|
||||
initializeMultiThread();
|
||||
@ -5724,9 +5738,9 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
|
||||
runners.reserve(fNumOfThreads); // to prevent a resize during use
|
||||
|
||||
// Start the aggregator threads
|
||||
for (i = 0; i < fNumOfThreads; i++)
|
||||
for (uint32_t threadNum = 0; threadNum < fNumOfThreads; threadNum++)
|
||||
{
|
||||
runners.push_back(jobstepThreadPool.invoke(ThreadedAggregator(this, i)));
|
||||
runners.push_back(jobstepThreadPool.invoke(ThreadedAggregator(this, threadNum)));
|
||||
}
|
||||
|
||||
// Now wait for all those threads
|
||||
@ -5740,18 +5754,28 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
|
||||
// much memory on average
|
||||
uint32_t threads = std::max(1U, fNumOfThreads / 2);
|
||||
runners.reserve(threads);
|
||||
for (i = 0; i < threads; ++i)
|
||||
for (uint32_t threadNum = 0; threadNum < threads; ++threadNum)
|
||||
{
|
||||
runners.push_back(jobstepThreadPool.invoke(ThreadedAggregateFinalizer(this, i)));
|
||||
runners.push_back(jobstepThreadPool.invoke(ThreadedAggregateFinalizer(this, threadNum)));
|
||||
}
|
||||
jobstepThreadPool.join(runners);
|
||||
}
|
||||
|
||||
if (dynamic_cast<RowAggregationDistinct*>(fAggregator.get()) && fAggregator->aggMapKeyLength() > 0)
|
||||
/*
|
||||
* Phase 2: Depending on query type (see below) do aggregation per previously created RowGroup of rows
|
||||
* that need to be aggregated, and output the results.
|
||||
*/
|
||||
|
||||
auto* distinctAggregator = dynamic_cast<RowAggregationDistinct*>(fAggregator.get());
|
||||
const bool hasGroupByColumns = fAggregator->aggMapKeyLength() > 0;
|
||||
|
||||
// Case 1: Query contains at least one aggregation on a DISTINCT column AND at least one GROUP BY column
|
||||
// e.g. SELECT SUM(DISTINCT col1) FROM test GROUP BY col2;
|
||||
if (distinctAggregator && hasGroupByColumns)
|
||||
{
|
||||
// 2nd phase multi-threaded aggregate
|
||||
if (!fEndOfResult)
|
||||
{
|
||||
// Do multi-threaded second phase aggregation (per row group created for GROUP BY statement)
|
||||
if (!fDoneAggregate)
|
||||
{
|
||||
vector<uint64_t> runners; // thread pool handles
|
||||
@ -5759,97 +5783,78 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
|
||||
|
||||
uint32_t bucketsPerThread = fNumOfBuckets / fNumOfThreads;
|
||||
uint32_t numThreads = ((fNumOfBuckets % fNumOfThreads) == 0 ? fNumOfThreads : fNumOfThreads + 1);
|
||||
// uint32_t bucketsPerThread = 1;
|
||||
// uint32_t numThreads = fNumOfBuckets;
|
||||
|
||||
runners.reserve(numThreads);
|
||||
|
||||
for (i = 0; i < numThreads; i++)
|
||||
for (uint32_t threadNum = 0; threadNum < numThreads; threadNum++)
|
||||
{
|
||||
runners.push_back(jobstepThreadPool.invoke(
|
||||
ThreadedSecondPhaseAggregator(this, i * bucketsPerThread, bucketsPerThread)));
|
||||
ThreadedSecondPhaseAggregator(this, threadNum * bucketsPerThread, bucketsPerThread)));
|
||||
}
|
||||
|
||||
jobstepThreadPool.join(runners);
|
||||
}
|
||||
|
||||
// Deliver results
|
||||
fDoneAggregate = true;
|
||||
bool done = true;
|
||||
|
||||
while (nextDeliveredRowGroup())
|
||||
while (nextDeliveredRowGroup() && !cancelled())
|
||||
{
|
||||
done = false;
|
||||
rowCount = fRowGroupOut.getRowCount();
|
||||
|
||||
if (rowCount != 0)
|
||||
{
|
||||
if (fRowGroupOut.getColumnCount() != fRowGroupDelivered.getColumnCount())
|
||||
pruneAuxColumns();
|
||||
|
||||
if (dlp)
|
||||
{
|
||||
rgData = fRowGroupDelivered.duplicate();
|
||||
dlp->insert(rgData);
|
||||
}
|
||||
else
|
||||
{
|
||||
bs.restart();
|
||||
fRowGroupDelivered.serializeRGData(bs);
|
||||
if (!cleanUpAndOutputRowGroup(bs, dlp))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
done = true;
|
||||
}
|
||||
|
||||
if (done)
|
||||
{
|
||||
fEndOfResult = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
}
|
||||
// Case 2. Query contains at least one aggregation on a DISTINCT column but no GROUP BY column
|
||||
// e.g. SELECT SUM(DISTINCT col1) FROM test;
|
||||
else if (distinctAggregator)
|
||||
{
|
||||
auto* agg = dynamic_cast<RowAggregationDistinct*>(fAggregator.get());
|
||||
|
||||
if (!fEndOfResult)
|
||||
{
|
||||
if (!fDoneAggregate)
|
||||
{
|
||||
for (i = 0; i < fNumOfBuckets; i++)
|
||||
// Do aggregation over all row groups. As all row groups need to be aggregated together there is no
|
||||
// easy way of multi-threading this and it's done in a single thread for now.
|
||||
for (uint32_t bucketNum = 0; bucketNum < fNumOfBuckets; bucketNum++)
|
||||
{
|
||||
if (fEndOfResult == false)
|
||||
{
|
||||
// do the final aggregtion and deliver the results
|
||||
// at least one RowGroup for aggregate results
|
||||
// for "distinct without group by" case
|
||||
if (agg != nullptr)
|
||||
{
|
||||
auto* aggMultiDist = dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[i].get());
|
||||
auto* aggDist = dynamic_cast<RowAggregationDistinct*>(fAggregators[i].get());
|
||||
agg->aggregator(aggDist->aggregator());
|
||||
// The distinctAggregator accumulates the aggregation results of all row groups: it is handed the
|
||||
// row groups of each bucket aggregator in turn and performs an aggregation step after each hand-over.
|
||||
auto* bucketMultiDistinctAggregator =
|
||||
dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[bucketNum].get());
|
||||
auto* bucketDistinctAggregator =
|
||||
dynamic_cast<RowAggregationDistinct*>(fAggregators[bucketNum].get());
|
||||
distinctAggregator->aggregator(bucketDistinctAggregator->aggregator());
|
||||
|
||||
if (aggMultiDist)
|
||||
if (bucketMultiDistinctAggregator)
|
||||
{
|
||||
(dynamic_cast<RowAggregationMultiDistinct*>(agg))
|
||||
->subAggregators(aggMultiDist->subAggregators());
|
||||
(dynamic_cast<RowAggregationMultiDistinct*>(distinctAggregator))
|
||||
->subAggregators(bucketMultiDistinctAggregator->subAggregators());
|
||||
}
|
||||
|
||||
agg->doDistinctAggregation();
|
||||
}
|
||||
// for "group by without distinct" case
|
||||
else
|
||||
{
|
||||
fAggregator->append(fAggregators[i].get());
|
||||
}
|
||||
distinctAggregator->aggregator()->finalAggregation();
|
||||
distinctAggregator->doDistinctAggregation();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Deliver results
|
||||
fDoneAggregate = true;
|
||||
}
|
||||
|
||||
bool done = true;
|
||||
|
||||
//@bug4459
|
||||
while (fAggregator->nextRowGroup() && !cancelled())
|
||||
{
|
||||
done = false;
|
||||
@ -5860,22 +5865,45 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
|
||||
|
||||
if (rowCount != 0)
|
||||
{
|
||||
if (fRowGroupOut.getColumnCount() != fRowGroupDelivered.getColumnCount())
|
||||
pruneAuxColumns();
|
||||
|
||||
if (dlp)
|
||||
{
|
||||
rgData = fRowGroupDelivered.duplicate();
|
||||
dlp->insert(rgData);
|
||||
}
|
||||
else
|
||||
{
|
||||
bs.restart();
|
||||
fRowGroupDelivered.serializeRGData(bs);
|
||||
if (!cleanUpAndOutputRowGroup(bs, dlp))
|
||||
break;
|
||||
}
|
||||
done = true;
|
||||
}
|
||||
if (done)
|
||||
fEndOfResult = true;
|
||||
}
|
||||
}
|
||||
// CASE 3: Query contains no aggregation on a DISTINCT column, but at least one GROUP BY column
|
||||
// e.g. SELECT SUM(col1) FROM test GROUP BY col2;
|
||||
// Do aggregation over all row groups. As all row groups need to be aggregated together there is no
|
||||
// easy way of multi-threading this and it's done in a single thread for now.
|
||||
else if (hasGroupByColumns)
|
||||
{
|
||||
if (!fEndOfResult && !fDoneAggregate)
|
||||
{
|
||||
for (uint32_t bucketNum = 0; bucketNum < fNumOfBuckets; ++bucketNum)
|
||||
{
|
||||
fAggregator->append(fAggregators[bucketNum].get());
|
||||
}
|
||||
}
|
||||
|
||||
fDoneAggregate = true;
|
||||
bool done = true;
|
||||
|
||||
while (fAggregator->nextRowGroup() && !cancelled())
|
||||
{
|
||||
done = false;
|
||||
fAggregator->finalize();
|
||||
rowCount = fRowGroupOut.getRowCount();
|
||||
fRowsReturned += rowCount;
|
||||
fRowGroupDelivered.setData(fRowGroupOut.getRGData());
|
||||
|
||||
if (rowCount != 0)
|
||||
{
|
||||
if (!cleanUpAndOutputRowGroup(bs, dlp))
|
||||
break;
|
||||
}
|
||||
done = true;
|
||||
}
|
||||
|
||||
@ -5884,7 +5912,14 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
|
||||
fEndOfResult = true;
|
||||
}
|
||||
}
|
||||
} // try
|
||||
else
|
||||
{
|
||||
throw logic_error(
|
||||
"TupleAggregateStep::doThreadedAggregate: No DISTINCT columns nested into aggregation function "
|
||||
"or "
|
||||
"GROUP BY columns found. Should not reach here.");
|
||||
}
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
handleException(std::current_exception(), logging::tupleAggregateStepErr,
|
||||
@ -5924,6 +5959,23 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
|
||||
return rowCount;
|
||||
}
|
||||
|
||||
/** @brief Hand the current delivered RowGroup over to its consumer.
 *
 * Calls pruneAuxColumns() first when fRowGroupOut and fRowGroupDelivered disagree on their column
 * count. With a datalist, a duplicate of the delivered RGData is inserted into it; without one, the
 * ByteStream is restarted and the delivered RGData is serialized into it.
 *
 * @param bs  ByteStream that receives the serialized RowGroup when dlp is null.
 * @param dlp Output datalist; may be null.
 * @returns true if the RowGroup was inserted into dlp, false if it was serialized into bs.
 *          NOTE: false is not an error indication; the delivery loops in doThreadedAggregate()
 *          stop iterating when false is returned.
 */
bool TupleAggregateStep::cleanUpAndOutputRowGroup(ByteStream& bs, RowGroupDL* dlp)
{
  const bool columnCountsDiffer = fRowGroupOut.getColumnCount() != fRowGroupDelivered.getColumnCount();

  if (columnCountsDiffer)
  {
    pruneAuxColumns();
  }

  // No datalist: serialize into the byte stream.
  if (dlp == nullptr)
  {
    bs.restart();
    fRowGroupDelivered.serializeRGData(bs);
    return false;
  }

  // Datalist present: pass on a copy of the delivered data.
  RGData duplicated = fRowGroupDelivered.duplicate();
  dlp->insert(duplicated);
  return true;
}
|
||||
|
||||
void TupleAggregateStep::pruneAuxColumns()
|
||||
{
|
||||
uint64_t rowCount = fRowGroupOut.getRowCount();
|
||||
|
@ -161,6 +161,7 @@ class TupleAggregateStep : public JobStep, public TupleDeliveryStep
|
||||
void doThreadedSecondPhaseAggregate(uint32_t threadID);
|
||||
bool nextDeliveredRowGroup();
|
||||
void pruneAuxColumns();
|
||||
bool cleanUpAndOutputRowGroup(messageqcpp::ByteStream& bs, RowGroupDL* dlp);
|
||||
void formatMiniStats();
|
||||
void printCalTrace();
|
||||
template <class GroupByMap>
|
||||
|
@ -4099,6 +4099,18 @@ bool RowAggregationUM::nextRowGroup()
|
||||
return more;
|
||||
}
|
||||
|
||||
bool RowAggregationUM::nextOutputRowGroup()
|
||||
{
|
||||
bool more = fRowAggStorage->getNextOutputRGData(fCurRGData);
|
||||
|
||||
if (more)
|
||||
{
|
||||
fRowGroupOut->setData(fCurRGData.get());
|
||||
}
|
||||
|
||||
return more;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Row Aggregation constructor used on UM
|
||||
// For 2nd phase of two-phase case, from partial RG to final aggregated RG
|
||||
@ -4558,7 +4570,10 @@ void RowAggregationDistinct::addRowGroup(const RowGroup* pRows,
|
||||
//------------------------------------------------------------------------------
|
||||
void RowAggregationDistinct::doDistinctAggregation()
|
||||
{
|
||||
while (dynamic_cast<RowAggregationUM*>(fAggregator.get())->nextRowGroup())
|
||||
auto* umAggregator = dynamic_cast<RowAggregationUM*>(fAggregator.get());
|
||||
if (umAggregator)
|
||||
{
|
||||
while (umAggregator->nextOutputRowGroup())
|
||||
{
|
||||
fRowGroupIn.setData(fAggregator->getOutputRowGroup()->getRGData());
|
||||
|
||||
@ -4572,6 +4587,13 @@ void RowAggregationDistinct::doDistinctAggregation()
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::ostringstream errmsg;
|
||||
errmsg << "RowAggregationDistinct: incorrect fAggregator class.";
|
||||
cerr << errmsg.str() << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void RowAggregationDistinct::doDistinctAggregation_rowVec(vector<std::pair<Row::Pointer, uint64_t>>& inRows)
|
||||
{
|
||||
|
@ -679,6 +679,17 @@ class RowAggregationUM : public RowAggregation
|
||||
*/
|
||||
bool nextRowGroup();
|
||||
|
||||
/** @brief Returns aggregated rows in a RowGroup as long as there are result RowGroups that have not been returned yet.
|
||||
*
|
||||
* This function should be called repeatedly until false is returned (meaning end of data).
|
||||
* Returns data from in-memory storage, as well as spilled data from disk. If disk-based aggregation is
|
||||
* happening, finalAggregation() should be called before returning result RowGroups to finalize the used
|
||||
* RowAggStorages, merge different spilled generations and obtain correct aggregation results.
|
||||
*
|
||||
* @returns True if there are more result RowGroups, else false if all results have been returned.
|
||||
*/
|
||||
bool nextOutputRowGroup();
|
||||
|
||||
/** @brief Add an aggregator for DISTINCT aggregation
|
||||
*/
|
||||
void distinctAggregator(const boost::shared_ptr<RowAggregation>& da)
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <cstdint>
|
||||
#include "rowgroup.h"
|
||||
#include <resourcemanager.h>
|
||||
#include <fcntl.h>
|
||||
@ -79,6 +80,11 @@ std::string errorString(int errNo)
|
||||
auto* buf = strerror_r(errNo, tmp, sizeof(tmp));
|
||||
return {buf};
|
||||
}
|
||||
|
||||
// Returns the 1-based index of the least significant set bit of mask, or 0 when mask has no bit
// set (the semantics of the GCC/Clang builtin __builtin_ffsll).
size_t findFirstSetBit(const uint64_t mask)
{
  const int oneBasedIndex = __builtin_ffsll(mask);
  return oneBasedIndex;
}
|
||||
} // anonymous namespace
|
||||
|
||||
namespace rowgroup
|
||||
@ -552,7 +558,7 @@ class Dumper
|
||||
class RowGroupStorage
|
||||
{
|
||||
public:
|
||||
using RGDataStorage = std::vector<std::unique_ptr<RGData>>;
|
||||
using RGDataStorage = std::vector<RGDataUnPtr>;
|
||||
|
||||
public:
|
||||
/** @brief Default constructor
|
||||
@ -613,6 +619,54 @@ class RowGroupStorage
|
||||
return fRowGroupOut->getSizeWithStrings(fMaxRows);
|
||||
}
|
||||
|
||||
// This shifts data within RGData such that it compacts the non finalized rows
|
||||
PosOpos shiftRowsInRowGroup(RGDataUnPtr& rgdata, uint64_t fgid, uint64_t tgid)
|
||||
{
|
||||
uint64_t pos = 0;
|
||||
uint64_t opos = 0;
|
||||
|
||||
fRowGroupOut->setData(rgdata.get());
|
||||
for (auto i = fgid; i < tgid; ++i)
|
||||
{
|
||||
if ((i - fgid) * HashMaskElements >= fRowGroupOut->getRowCount())
|
||||
break;
|
||||
uint64_t mask = ~fFinalizedRows[i];
|
||||
if ((i - fgid + 1) * HashMaskElements > fRowGroupOut->getRowCount())
|
||||
{
|
||||
mask &= (~0ULL) >> ((i - fgid + 1) * HashMaskElements - fRowGroupOut->getRowCount());
|
||||
}
|
||||
opos = (i - fgid) * HashMaskElements;
|
||||
|
||||
if (mask == ~0ULL)
|
||||
{
|
||||
if (LIKELY(pos != opos))
|
||||
moveRows(rgdata.get(), pos, opos, HashMaskElements);
|
||||
pos += HashMaskElements;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mask == 0)
|
||||
continue;
|
||||
|
||||
while (mask != 0)
|
||||
{
|
||||
// find position until block full of not finalized rows.
|
||||
size_t b = findFirstSetBit(mask);
|
||||
size_t e = findFirstSetBit(~(mask >> b)) + b;
|
||||
if (UNLIKELY(e >= HashMaskElements))
|
||||
mask = 0;
|
||||
else
|
||||
mask >>= e;
|
||||
if (LIKELY(pos != opos + b - 1))
|
||||
moveRows(rgdata.get(), pos, opos + b - 1, e - b);
|
||||
pos += e - b;
|
||||
opos += e;
|
||||
}
|
||||
--opos;
|
||||
}
|
||||
return {pos, opos};
|
||||
}
|
||||
|
||||
/** @brief Take away RGDatas from another RowGroupStorage
|
||||
*
|
||||
* If some of the RGDatas is not in the memory do not load them,
|
||||
@ -626,7 +680,7 @@ class RowGroupStorage
|
||||
}
|
||||
void append(RowGroupStorage* o)
|
||||
{
|
||||
std::unique_ptr<RGData> rgd;
|
||||
RGDataUnPtr rgd;
|
||||
std::string ofname;
|
||||
while (o->getNextRGData(rgd, ofname))
|
||||
{
|
||||
@ -666,11 +720,130 @@ class RowGroupStorage
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Get the last RGData from fRGDatas, remove it from the vector and return its id.
|
||||
*
|
||||
* @param rgdata The RGData to be retrieved
|
||||
*/
|
||||
uint64_t getLastRGData(RGDataUnPtr& rgdata)
|
||||
{
|
||||
assert(!fRGDatas.empty());
|
||||
uint64_t rgid = fRGDatas.size() - 1;
|
||||
rgdata = std::move(fRGDatas[rgid]);
|
||||
fRGDatas.pop_back();
|
||||
return rgid;
|
||||
}
|
||||
|
||||
// Computes the half-open range [fgid, tgid) of uint64_t words in the fFinalizedRows bitmap that
// describe the rows of the RGData with id rgid, given fMaxRows rows per RGData and
// HashMaskElements rows per bitmap word.
// NOTE(review): integer division is used, so this presumably expects fMaxRows to be a multiple of
// HashMaskElements - confirm against the callers.
static FgidTgid calculateGids(const uint64_t rgid, const uint64_t fMaxRows)
{
  const uint64_t firstWord = rgid * fMaxRows / HashMaskElements;
  const uint64_t wordsPerRGData = fMaxRows / HashMaskElements;
  return {firstWord, firstWord + wordsPerRGData};
}
|
||||
|
||||
/** @brief Used to output aggregation results from memory and disk in the current generation in the form of
|
||||
* RGData. Returns next RGData, loads from disk if necessary. Skips finalized rows as they would contain
|
||||
* duplicate results, compacts actual rows into start of RGData and adapts number of rows transmitted in
|
||||
* RGData.
|
||||
* @returns A pointer to the next RGData or an empty pointer if there are no more RGDatas in this
|
||||
* generation.
|
||||
*/
|
||||
bool getNextOutputRGData(RGDataUnPtr& rgdata)
|
||||
{
|
||||
if (UNLIKELY(fRGDatas.empty()))
|
||||
{
|
||||
fMM->release();
|
||||
return false;
|
||||
}
|
||||
|
||||
while (!fRGDatas.empty())
|
||||
{
|
||||
auto rgid = getLastRGData(rgdata);
|
||||
auto [fgid, tgid] = calculateGids(rgid, fMaxRows);
|
||||
|
||||
if (fFinalizedRows.size() <= fgid)
|
||||
{
|
||||
// There are no finalized rows in this RGData. We can just return it.
|
||||
// Load from disk if necessary and unlink DumpFile.
|
||||
if (!rgdata)
|
||||
{
|
||||
loadRG(rgid, rgdata, true);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
if (tgid >= fFinalizedRows.size())
|
||||
fFinalizedRows.resize(tgid + 1, 0ULL);
|
||||
|
||||
// Check if there are rows to process
|
||||
bool hasReturnRows = false;
|
||||
for (auto i = fgid; i < tgid; ++i)
|
||||
{
|
||||
if (fFinalizedRows[i] != ~0ULL)
|
||||
{
|
||||
// Not all rows are finalized, we have to return at least parts of this RGData
|
||||
hasReturnRows = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (rgdata)
|
||||
{
|
||||
// RGData is currently in memory
|
||||
if (!hasReturnRows)
|
||||
{
|
||||
// All rows are finalized, don't return this RGData
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (hasReturnRows)
|
||||
{
|
||||
// Load RGData from disk, unlink dump file and continue processing
|
||||
loadRG(rgid, rgdata, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
// All rows are finalized. Unlink dump file and continue search for return RGData
|
||||
unlink(makeRGFilename(rgid).c_str());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
auto [pos, opos] = shiftRowsInRowGroup(rgdata, fgid, tgid);
|
||||
|
||||
// Nothing got shifted at all -> all rows must be finalized. If all rows finalized remove
|
||||
// RGData and file and don't give it out.
|
||||
if (pos == 0)
|
||||
{
|
||||
fLRU->remove(rgid);
|
||||
unlink(makeRGFilename(rgid).c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
// set RGData with number of not finalized rows which have been compacted at front of RGData
|
||||
fRowGroupOut->setData(rgdata.get());
|
||||
fRowGroupOut->setRowCount(pos);
|
||||
int64_t memSz = fRowGroupOut->getSizeWithStrings(fMaxRows);
|
||||
|
||||
// Release the memory used by the current rgdata from this MemoryManager.
|
||||
fMM->release(memSz);
|
||||
unlink(makeRGFilename(rgid).c_str());
|
||||
|
||||
// to periodically clean up freed memory so it can be used by other threads.
|
||||
fLRU->remove(rgid);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** @brief Returns next RGData, load it from disk if necessary.
|
||||
*
|
||||
* @returns pointer to the next RGData or empty pointer if there is nothing
|
||||
*/
|
||||
std::unique_ptr<RGData> getNextRGData()
|
||||
RGDataUnPtr getNextRGData()
|
||||
{
|
||||
while (!fRGDatas.empty())
|
||||
{
|
||||
@ -1030,7 +1203,7 @@ class RowGroupStorage
|
||||
* @param fname(out) Filename of the dump if it's not in the memory
|
||||
* @returns true if there is available RGData
|
||||
*/
|
||||
bool getNextRGData(std::unique_ptr<RGData>& rgdata, std::string& fname)
|
||||
bool getNextRGData(RGDataUnPtr& rgdata, std::string& fname)
|
||||
{
|
||||
if (UNLIKELY(fRGDatas.empty()))
|
||||
{
|
||||
@ -1039,12 +1212,9 @@ class RowGroupStorage
|
||||
}
|
||||
while (!fRGDatas.empty())
|
||||
{
|
||||
uint64_t rgid = fRGDatas.size() - 1;
|
||||
rgdata = std::move(fRGDatas[rgid]);
|
||||
fRGDatas.pop_back();
|
||||
auto rgid = getLastRGData(rgdata);
|
||||
auto [fgid, tgid] = calculateGids(rgid, fMaxRows);
|
||||
|
||||
uint64_t fgid = rgid * fMaxRows / 64;
|
||||
uint64_t tgid = fgid + fMaxRows / 64;
|
||||
if (fFinalizedRows.size() > fgid)
|
||||
{
|
||||
if (tgid >= fFinalizedRows.size())
|
||||
@ -1068,45 +1238,7 @@ class RowGroupStorage
|
||||
continue;
|
||||
}
|
||||
|
||||
uint64_t pos = 0;
|
||||
uint64_t opos = 0;
|
||||
fRowGroupOut->setData(rgdata.get());
|
||||
for (auto i = fgid; i < tgid; ++i)
|
||||
{
|
||||
if ((i - fgid) * 64 >= fRowGroupOut->getRowCount())
|
||||
break;
|
||||
uint64_t mask = ~fFinalizedRows[i];
|
||||
if ((i - fgid + 1) * 64 > fRowGroupOut->getRowCount())
|
||||
{
|
||||
mask &= (~0ULL) >> ((i - fgid + 1) * 64 - fRowGroupOut->getRowCount());
|
||||
}
|
||||
opos = (i - fgid) * 64;
|
||||
if (mask == ~0ULL)
|
||||
{
|
||||
if (LIKELY(pos != opos))
|
||||
moveRows(rgdata.get(), pos, opos, 64);
|
||||
pos += 64;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mask == 0)
|
||||
continue;
|
||||
|
||||
while (mask != 0)
|
||||
{
|
||||
size_t b = __builtin_ffsll(mask);
|
||||
size_t e = __builtin_ffsll(~(mask >> b)) + b;
|
||||
if (UNLIKELY(e >= 64))
|
||||
mask = 0;
|
||||
else
|
||||
mask >>= e;
|
||||
if (LIKELY(pos != opos + b - 1))
|
||||
moveRows(rgdata.get(), pos, opos + b - 1, e - b);
|
||||
pos += e - b;
|
||||
opos += e;
|
||||
}
|
||||
--opos;
|
||||
}
|
||||
auto [pos, opos] = shiftRowsInRowGroup(rgdata, fgid, tgid);
|
||||
|
||||
if (pos == 0)
|
||||
{
|
||||
@ -1119,6 +1251,7 @@ class RowGroupStorage
|
||||
fRowGroupOut->setRowCount(pos);
|
||||
}
|
||||
|
||||
// Release the memory used by the current rgdata.
|
||||
if (rgdata)
|
||||
{
|
||||
fRowGroupOut->setData(rgdata.get());
|
||||
@ -1130,6 +1263,7 @@ class RowGroupStorage
|
||||
{
|
||||
fname = makeRGFilename(rgid);
|
||||
}
|
||||
// to periodically clean up freed memory so it can be used by other threads.
|
||||
fLRU->remove(rgid);
|
||||
return true;
|
||||
}
|
||||
@ -1169,7 +1303,7 @@ class RowGroupStorage
|
||||
loadRG(rgid, fRGDatas[rgid]);
|
||||
}
|
||||
|
||||
void loadRG(uint64_t rgid, std::unique_ptr<RGData>& rgdata, bool unlinkDump = false)
|
||||
void loadRG(uint64_t rgid, RGDataUnPtr& rgdata, bool unlinkDump = false)
|
||||
{
|
||||
auto fname = makeRGFilename(rgid);
|
||||
|
||||
@ -1736,7 +1870,7 @@ void RowAggStorage::append(RowAggStorage& other)
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<RGData> RowAggStorage::getNextRGData()
|
||||
RGDataUnPtr RowAggStorage::getNextRGData()
|
||||
{
|
||||
if (!fStorage)
|
||||
{
|
||||
@ -1747,6 +1881,43 @@ std::unique_ptr<RGData> RowAggStorage::getNextRGData()
|
||||
return fStorage->getNextRGData();
|
||||
}
|
||||
|
||||
// Returns the next output RGData through rgdata, walking from the current generation down to
// generation 0. Whenever the current storage has nothing left, fGeneration is decremented and
// fStorage is replaced by a clone for that generation. On success fRowGroupOut is pointed at the
// returned data. Returns false when there is no storage or all generations are exhausted.
bool RowAggStorage::getNextOutputRGData(RGDataUnPtr& rgdata)
{
  if (!fStorage)
  {
    return false;
  }

  cleanup();
  freeData();

  // fGeneration is unsigned; a signed copy is used so the loop condition >= 0 is meaningful.
  for (int32_t gen = fGeneration; gen >= 0; --gen)
  {
    if (fStorage->getNextOutputRGData(rgdata))
    {
      fRowGroupOut->setData(rgdata.get());
      return true;
    }

    // Generation 0 was the last one to drain: nothing more to return.
    if (fGeneration == 0)
    {
      break;
    }

    // The current generation is exhausted: switch to the previous one and keep going.
    fGeneration--;
    fStorage.reset(fStorage->clone(fGeneration));
  }

  return false;
}
|
||||
|
||||
void RowAggStorage::freeData()
|
||||
{
|
||||
for (auto& data : fGens)
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "resourcemanager.h"
|
||||
#include "rowgroup.h"
|
||||
#include "idbcompress.h"
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
@ -35,10 +36,15 @@ class RowPosHashStorage;
|
||||
using RowPosHashStoragePtr = std::unique_ptr<RowPosHashStorage>;
|
||||
class RowGroupStorage;
|
||||
|
||||
using RGDataUnPtr = std::unique_ptr<RGData>;
|
||||
using PosOpos = std::pair<uint64_t, uint64_t>;
|
||||
using FgidTgid = std::pair<uint64_t, uint64_t>;
|
||||
|
||||
uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol);
|
||||
|
||||
constexpr const size_t MaxConstStrSize = 2048ULL;
|
||||
constexpr const size_t MaxConstStrBufSize = MaxConstStrSize << 1;
|
||||
constexpr const uint64_t HashMaskElements = 64ULL;
|
||||
|
||||
class RowAggStorage
|
||||
{
|
||||
@ -97,6 +103,12 @@ class RowAggStorage
|
||||
*/
|
||||
std::unique_ptr<RGData> getNextRGData();
|
||||
|
||||
/** @brief Remove last RGData from in-memory storage or disk.
|
||||
* Iterates over all generations on disk if available.
|
||||
* @returns True if RGData is returned in parameter or false if no more RGDatas can be returned.
|
||||
*/
|
||||
bool getNextOutputRGData(std::unique_ptr<RGData>& rgdata);
|
||||
|
||||
/** @brief TODO
|
||||
*
|
||||
* @param mergeFunc
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include <sys/stat.h>
|
||||
#include <cerrno>
|
||||
#include <fcntl.h>
|
||||
#include <mutex>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <iostream>
|
||||
|
Reference in New Issue
Block a user