
MCOL4841 dev port run large join without OOM

David Hall
2022-02-09 17:33:55 -06:00
parent d30e140dc3
commit 27dea733c5
34 changed files with 821 additions and 518 deletions
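
The core technique of this change is an account-then-flush pattern: before holding onto another row group of join results, the worker asks the resource manager for a non-blocking memory grant, and when the grant is refused it sends the buffered results straight on to the delivery list and returns whatever memory it had accounted. A minimal standalone sketch of that pattern, with invented names (MockResourceManager, the 50 MB budget, the 8 MB row-group size); the real code uses ResourceManager::getMemory()/returnMemory() as the diff below shows:

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for ColumnStore's ResourceManager; only the grant/return
// calls used by the sketch are modeled.
struct MockResourceManager
{
  uint64_t available = 50000000;  // arbitrary budget for the sketch

  // Non-blocking grant, like getMemory(amount, false) in the diff:
  // refuse immediately instead of waiting for memory to free up.
  bool getMemory(uint64_t amount)
  {
    if (amount > available)
      return false;
    available -= amount;
    return true;
  }

  void returnMemory(uint64_t amount) { available += amount; }
};

int main()
{
  MockResourceManager rm;
  std::vector<int> bufferedRowGroups;             // stands in for vector<RGData>
  uint64_t accounted = 0;                         // memSizeForOutputRG analogue
  const uint64_t rowGroupSize = 8 * 1024 * 1024;  // assumed max RGData size

  for (int rg = 0; rg < 16; ++rg)
  {
    bufferedRowGroups.push_back(rg);
    if (rm.getMemory(rowGroupSize))
    {
      accounted += rowGroupSize;  // grant succeeded: keep buffering
    }
    else
    {
      // Grant refused: flush downstream instead of waiting, then hand
      // back everything accounted so far.
      std::cout << "flushing " << bufferedRowGroups.size() << " row groups\n";
      bufferedRowGroups.clear();
      rm.returnMemory(accounted);
      accounted = 0;
    }
  }
  rm.returnMemory(accounted);  // final settle-up, as the caller does
  return 0;
}

Refusing to block on the grant keeps the consumer threads moving while still bounding the join buffer, which is what lets a large join complete without running the process out of memory.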


@@ -155,14 +155,15 @@ struct TupleBPSAggregators
}
};
-TupleBPS::JoinLocalData::JoinLocalData(RowGroup& primRowGroup, RowGroup& outputRowGroup,
+TupleBPS::JoinLocalData::JoinLocalData(TupleBPS* pTupleBPS, RowGroup& primRowGroup, RowGroup& outputRowGroup,
boost::shared_ptr<funcexp::FuncExpWrapper>& fe2,
rowgroup::RowGroup& fe2Output,
std::vector<rowgroup::RowGroup>& joinerMatchesRGs,
rowgroup::RowGroup& joinFERG,
std::vector<boost::shared_ptr<joiner::TupleJoiner>>& tjoiners,
uint32_t smallSideCount, bool doJoin)
-: local_primRG(primRowGroup)
+: tbps(pTupleBPS)
+, local_primRG(primRowGroup)
, local_outputRG(outputRowGroup)
, fe2(fe2)
, fe2Output(fe2Output)
@@ -239,6 +240,130 @@ TupleBPS::JoinLocalData::JoinLocalData(RowGroup& primRowGroup, RowGroup& outputR
}
}
+uint64_t TupleBPS::JoinLocalData::generateJoinResultSet(const uint32_t depth,
+std::vector<rowgroup::RGData>& outputData,
+RowGroupDL* dlp)
+{
+uint32_t i;
+Row& smallRow = smallSideRows[depth];
+uint64_t memSizeForOutputRG = 0;
+if (depth < smallSideCount - 1)
+{
+for (i = 0; i < joinerOutput[depth].size() && !tbps->cancelled(); i++)
+{
+smallRow.setPointer(joinerOutput[depth][i]);
+applyMapping(smallMappings[depth], smallRow, &joinedBaseRow);
+memSizeForOutputRG += generateJoinResultSet(depth + 1, outputData, dlp);
+}
+}
+else
+{
+local_outputRG.getRow(local_outputRG.getRowCount(), &postJoinRow);
+for (i = 0; i < joinerOutput[depth].size() && !tbps->cancelled();
+i++, postJoinRow.nextRow(), local_outputRG.incRowCount())
+{
+smallRow.setPointer(joinerOutput[depth][i]);
+if (UNLIKELY(local_outputRG.getRowCount() == 8192))
+{
+uint32_t dbRoot = local_outputRG.getDBRoot();
+uint64_t baseRid = local_outputRG.getBaseRid();
+outputData.push_back(joinedData);
+// Don't let the join results buffer get out of control.
+if (tbps->resourceManager()->getMemory(local_outputRG.getMaxDataSize(), false))
+{
+memSizeForOutputRG += local_outputRG.getMaxDataSize();
+}
+else
+{
+// Don't wait for memory, just send the data on to DL.
+RowGroup out(local_outputRG);
+if (fe2 && tbps->runFEonPM())
+{
+processFE2(outputData);
+tbps->rgDataVecToDl(outputData, local_fe2Output, dlp);
+}
+else
+{
+tbps->rgDataVecToDl(outputData, out, dlp);
+}
+tbps->resourceManager()->returnMemory(memSizeForOutputRG);
+memSizeForOutputRG = 0;
+}
+joinedData.reinit(local_outputRG);
+local_outputRG.setData(&joinedData);
+local_outputRG.resetRowGroup(baseRid);
+local_outputRG.setDBRoot(dbRoot);
+local_outputRG.getRow(0, &postJoinRow);
+}
+applyMapping(smallMappings[depth], smallRow, &joinedBaseRow);
+copyRow(joinedBaseRow, &postJoinRow);
+}
+}
+return memSizeForOutputRG;
+}
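
Stripped of the RowGroup bookkeeping, generateJoinResultSet() is a depth-first walk over the matches from each small side: every depth but the last applies its small-side mapping and recurses, and the leaf depth emits one joined row per match. A toy sketch with ints standing in for Rows (all names invented for illustration):

#include <cstddef>
#include <iostream>
#include <vector>

using Matches = std::vector<int>;

// Depth-first expansion: one joined row per combination of matches.
void expand(const std::vector<Matches>& matchesPerSide, std::size_t depth,
            std::vector<int>& joinedRow, std::vector<std::vector<int>>& out)
{
  if (depth < matchesPerSide.size() - 1)
  {
    for (int m : matchesPerSide[depth])
    {
      joinedRow[depth] = m;  // applyMapping() analogue
      expand(matchesPerSide, depth + 1, joinedRow, out);
    }
  }
  else
  {
    for (int m : matchesPerSide[depth])  // leaf depth: emit rows
    {
      joinedRow[depth] = m;
      out.push_back(joinedRow);  // copyRow() analogue
    }
  }
}

int main()
{
  std::vector<Matches> matchesPerSide = {{1, 2}, {10, 20, 30}};
  std::vector<int> joinedRow(matchesPerSide.size());
  std::vector<std::vector<int>> out;
  expand(matchesPerSide, 0, joinedRow, out);
  std::cout << out.size() << " joined rows\n";  // prints: 6 joined rows (2 x 3)
  return 0;
}

The real method additionally checks tbps->cancelled() on every iteration and, at the leaf, rolls to a fresh RGData whenever the output row group reaches 8192 rows, which is where the memory grant and flush above come in.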
+void TupleBPS::JoinLocalData::processFE2(vector<rowgroup::RGData>& rgData)
+{
+vector<RGData> results;
+RGData result;
+uint32_t i, j;
+bool ret;
+result = RGData(local_fe2Output);
+local_fe2Output.setData(&result);
+local_fe2Output.resetRowGroup(-1);
+local_fe2Output.getRow(0, &local_fe2OutRow);
+for (i = 0; i < rgData.size(); i++)
+{
+local_outputRG.setData(&(rgData)[i]);
+if (local_fe2Output.getRowCount() == 0)
+{
+local_fe2Output.resetRowGroup(local_outputRG.getBaseRid());
+local_fe2Output.setDBRoot(local_outputRG.getDBRoot());
+}
+local_outputRG.getRow(0, &postJoinRow);
+for (j = 0; j < local_outputRG.getRowCount(); j++, postJoinRow.nextRow())
+{
+ret = local_fe2.evaluate(&postJoinRow);
+if (ret)
+{
+applyMapping(tbps->fe2Mapping, postJoinRow, &local_fe2OutRow);
+local_fe2OutRow.setRid(postJoinRow.getRelRid());
+local_fe2Output.incRowCount();
+local_fe2OutRow.nextRow();
+if (local_fe2Output.getRowCount() == 8192 ||
+local_fe2Output.getDBRoot() != local_outputRG.getDBRoot() ||
+local_fe2Output.getBaseRid() != local_outputRG.getBaseRid())
+{
+results.push_back(result);
+result = RGData(local_fe2Output);
+local_fe2Output.setData(&result);
+local_fe2Output.resetRowGroup(local_outputRG.getBaseRid());
+local_fe2Output.setDBRoot(local_outputRG.getDBRoot());
+local_fe2Output.getRow(0, &local_fe2OutRow);
+}
+}
+}
+}
+if (local_fe2Output.getRowCount() > 0)
+{
+results.push_back(result);
+}
+rgData.swap(results);
+}
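
processFE2() above is a filter-and-rebatch loop: evaluate the function-expression filter on each joined row, copy survivors into the FE2 output row group, and push the batch and start a fresh RGData when it fills or when the DBRoot or baseRid changes. A toy version of that loop, assuming a batch capacity of 4 so the rollover is visible (the real row-group capacity is 8192):

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  const std::size_t kBatchRows = 4;  // 8192 in the real code
  std::vector<int> input = {5, -1, 7, 3, -9, 8, 2, 6};
  std::vector<std::vector<int>> batches;  // results analogue
  std::vector<int> current;               // current RGData analogue

  for (int row : input)
  {
    if (row > 0)  // stands in for local_fe2.evaluate(&postJoinRow)
    {
      current.push_back(row);  // applyMapping + incRowCount analogue
      if (current.size() == kBatchRows)
      {
        batches.push_back(current);  // results.push_back(result)
        current.clear();             // fresh RGData for the next batch
      }
    }
  }
  if (!current.empty())  // flush the partially filled final batch
    batches.push_back(current);

  std::cout << batches.size() << " batches\n";  // 6 survivors -> 2 batches
  return 0;
}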
struct ByteStreamProcessor
{
ByteStreamProcessor(TupleBPS* tbps, vector<boost::shared_ptr<messageqcpp::ByteStream>>& bsv,
@@ -1265,7 +1390,7 @@ void TupleBPS::run()
if (fe1)
fBPP->setFEGroup1(fe1, fe1Input);
-if (fe2 && runFEonPM)
+if (fe2 && bRunFEonPM)
fBPP->setFEGroup2(fe2, fe2Output);
if (fe2)
@@ -1970,6 +2095,7 @@ void TupleBPS::processByteStreamVector(vector<boost::shared_ptr<messageqcpp::Byt
uint32_t cachedIO;
uint32_t physIO;
uint32_t touchedBlocks;
+int32_t memAmount = 0;
for (uint32_t i = begin; i < end; ++i)
{
@@ -2131,26 +2257,7 @@ void TupleBPS::processByteStreamVector(vector<boost::shared_ptr<messageqcpp::Byt
{
applyMapping(data->largeMapping, data->largeSideRow, &data->joinedBaseRow);
data->joinedBaseRow.setRid(data->largeSideRow.getRelRid());
-generateJoinResultSet(data->joinerOutput, data->joinedBaseRow, data->smallMappings, 0,
-data->local_outputRG, data->joinedData, &rgDatav, data->smallSideRows,
-data->postJoinRow);
-// Bug 3510: Don't let the join results buffer get out of control. Need
-// to refactor this. All post-join processing needs to go here AND below
-// for now.
-if (rgDatav.size() * data->local_outputRG.getMaxDataSize() > 50000000)
-{
-RowGroup out(data->local_outputRG);
-if (fe2 && !runFEonPM)
-{
-processFE2(out, data->local_fe2Output, data->postJoinRow, data->local_fe2OutRow, &rgDatav,
-&data->local_fe2);
-rgDataVecToDl(rgDatav, data->local_fe2Output, dlp);
-}
-else
-rgDataVecToDl(rgDatav, out, dlp);
-}
+memAmount += data->generateJoinResultSet(0, rgDatav, dlp);
}
} // end of the for-loop in the join code
@@ -2163,12 +2270,16 @@ void TupleBPS::processByteStreamVector(vector<boost::shared_ptr<messageqcpp::Byt
{
rgDatav.push_back(rgData);
}
+if (memAmount)
+{
+resourceManager()->returnMemory(memAmount);
+memAmount = 0;
+}
// Execute UM F & E group 2 on rgDatav
-if (fe2 && !runFEonPM && rgDatav.size() > 0 && !cancelled())
+if (fe2 && !bRunFEonPM && rgDatav.size() > 0 && !cancelled())
{
-processFE2(data->local_outputRG, data->local_fe2Output, data->postJoinRow, data->local_fe2OutRow,
-&rgDatav, &data->local_fe2);
+data->processFE2(rgDatav);
rgDataVecToDl(rgDatav, data->local_fe2Output, dlp);
}
@@ -2192,7 +2303,7 @@ void TupleBPS::processByteStreamVector(vector<boost::shared_ptr<messageqcpp::Byt
// insert the resulting rowgroup data from a single bytestream into dlp
if (rgDatav.size() > 0)
{
-if (fe2 && runFEonPM)
+if (fe2 && bRunFEonPM)
rgDataVecToDl(rgDatav, data->local_fe2Output, dlp);
else
rgDataVecToDl(rgDatav, data->local_outputRG, dlp);
@@ -2339,10 +2450,7 @@ void TupleBPS::receiveMultiPrimitiveMessages()
start = end;
}
// Join threads.
-for (uint32_t i = 0, e = fProcessorThreads.size(); i < e; ++i)
-jobstepThreadPool.join(fProcessorThreads[i]);
+jobstepThreadPool.join(fProcessorThreads);
// Clear all.
fProcessorThreads.clear();
bsv.clear();
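
The thread-join cleanup above collapses a caller-side loop over handles into one join on the whole vector, presumably a ThreadPool overload that loops internally. The same idea sketched with std::thread; joinAll() is a stand-in, not ColumnStore's ThreadPool API:

#include <iostream>
#include <thread>
#include <vector>

// Join every handle in one call, as the vector overload presumably does.
void joinAll(std::vector<std::thread>& threads)
{
  for (auto& t : threads)
    if (t.joinable())
      t.join();
}

int main()
{
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i)
    workers.emplace_back([] { /* process one bytestream slice */ });

  joinAll(workers);  // one call replaces the caller-side loop
  workers.clear();   // mirrors fProcessorThreads.clear() above
  std::cout << "all workers joined\n";
  return 0;
}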
@@ -2399,7 +2507,7 @@ void TupleBPS::receiveMultiPrimitiveMessages()
abort_nolock();
}
-// We have on thread here and do not need to notify any waiting producer threads, because we are done of
+// We have one thread here and do not need to notify any waiting producer threads, because we are done with
// consuming messages from queue.
tplLock.unlock();
@@ -2447,8 +2555,7 @@ void TupleBPS::receiveMultiPrimitiveMessages()
if (fe2)
{
rgDatav.push_back(data->joinedData);
-processFE2(data->local_outputRG, data->local_fe2Output, data->postJoinRow, data->local_fe2OutRow,
-&rgDatav, &data->local_fe2);
+data->processFE2(rgDatav);
if (rgDatav.size() > 0)
rgDataToDl(rgDatav[0], data->local_fe2Output, dlp);
@@ -2470,8 +2577,7 @@ void TupleBPS::receiveMultiPrimitiveMessages()
if (fe2)
{
rgDatav.push_back(data->joinedData);
-processFE2(data->local_outputRG, data->local_fe2Output, data->postJoinRow, data->local_fe2OutRow,
-&rgDatav, &data->local_fe2);
+data->processFE2(rgDatav);
if (rgDatav.size() > 0)
rgDataToDl(rgDatav[0], data->local_fe2Output, dlp);
@@ -2811,51 +2917,6 @@ void TupleBPS::setJoinedResultRG(const rowgroup::RowGroup& rg)
fe2Mapping = makeMapping(outputRowGroup, fe2Output);
}
-/* probably worthwhile to make some of these class vars */
-void TupleBPS::generateJoinResultSet(const vector<vector<Row::Pointer>>& joinerOutput, Row& baseRow,
-const vector<shared_array<int>>& mappings, const uint32_t depth,
-RowGroup& outputRG, RGData& rgData, vector<RGData>* outputData,
-const scoped_array<Row>& smallRows, Row& joinedRow)
-{
-uint32_t i;
-Row& smallRow = smallRows[depth];
-if (depth < smallSideCount - 1)
-{
-for (i = 0; i < joinerOutput[depth].size(); i++)
-{
-smallRow.setPointer(joinerOutput[depth][i]);
-applyMapping(mappings[depth], smallRow, &baseRow);
-generateJoinResultSet(joinerOutput, baseRow, mappings, depth + 1, outputRG, rgData, outputData,
-smallRows, joinedRow);
-}
-}
-else
-{
-outputRG.getRow(outputRG.getRowCount(), &joinedRow);
-for (i = 0; i < joinerOutput[depth].size(); i++, joinedRow.nextRow(), outputRG.incRowCount())
-{
-smallRow.setPointer(joinerOutput[depth][i]);
-if (UNLIKELY(outputRG.getRowCount() == 8192))
-{
-uint32_t dbRoot = outputRG.getDBRoot();
-uint64_t baseRid = outputRG.getBaseRid();
-outputData->push_back(rgData);
-rgData = RGData(outputRG);
-outputRG.setData(&rgData);
-outputRG.resetRowGroup(baseRid);
-outputRG.setDBRoot(dbRoot);
-outputRG.getRow(0, &joinedRow);
-}
-applyMapping(mappings[depth], smallRow, &baseRow);
-copyRow(baseRow, &joinedRow);
-}
-}
-}
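
For contrast, this removed version buffered join output with no resource-manager accounting; the only guard was the caller-side 50 MB size check deleted in the earlier hunk. A sketch of that older threshold policy (the 50000000 constant is from the removed code; the row-group size is assumed):

#include <cstdint>
#include <vector>

int main()
{
  const uint64_t kMaxDataSize = 8 * 1024 * 1024;  // assumed per-rowgroup max
  std::vector<int> rgDatav;  // buffered join results

  for (int i = 0; i < 100; ++i)
  {
    rgDatav.push_back(i);
    // Removed Bug 3510 guard: flush once the buffer could exceed ~50 MB.
    if (rgDatav.size() * kMaxDataSize > 50000000)
      rgDatav.clear();  // stands in for rgDataVecToDl(rgDatav, out, dlp)
  }
  return 0;
}

The threshold bounded each consumer's buffer in isolation but was blind to overall memory pressure; routing the decision through the shared resource manager is what the new JoinLocalData::generateJoinResultSet() changes.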
const rowgroup::RowGroup& TupleBPS::getOutputRowGroup() const
{
return outputRowGroup;
@@ -2970,9 +3031,9 @@ void TupleBPS::setFcnExpGroup2(const boost::shared_ptr<funcexp::FuncExpWrapper>&
fe2Output = rg;
checkDupOutputColumns(rg);
fe2Mapping = makeMapping(outputRowGroup, fe2Output);
-runFEonPM = runFE2onPM;
+bRunFEonPM = runFE2onPM;
-if (runFEonPM)
+if (bRunFEonPM)
fBPP->setFEGroup2(fe2, fe2Output);
}
@@ -2985,7 +3046,7 @@ void TupleBPS::setFcnExpGroup3(const vector<execplan::SRCP>& fe)
fe2->addReturnedColumn(fe[i]);
// if this is called, there's no join, so it can always run on the PM
-runFEonPM = true;
+bRunFEonPM = true;
fBPP->setFEGroup2(fe2, fe2Output);
}
@@ -2995,93 +3056,10 @@ void TupleBPS::setFE23Output(const rowgroup::RowGroup& feOutput)
checkDupOutputColumns(feOutput);
fe2Mapping = makeMapping(outputRowGroup, fe2Output);
-if (fe2 && runFEonPM)
+if (fe2 && bRunFEonPM)
fBPP->setFEGroup2(fe2, fe2Output);
}
-void TupleBPS::processFE2_oneRG(RowGroup& input, RowGroup& output, Row& inRow, Row& outRow,
-funcexp::FuncExpWrapper* local_fe)
-{
-bool ret;
-uint32_t i;
-output.resetRowGroup(input.getBaseRid());
-output.setDBRoot(input.getDBRoot());
-output.getRow(0, &outRow);
-input.getRow(0, &inRow);
-for (i = 0; i < input.getRowCount(); i++, inRow.nextRow())
-{
-ret = local_fe->evaluate(&inRow);
-if (ret)
-{
-applyMapping(fe2Mapping, inRow, &outRow);
-outRow.setRid(inRow.getRelRid());
-output.incRowCount();
-outRow.nextRow();
-}
-}
-}
-void TupleBPS::processFE2(RowGroup& input, RowGroup& output, Row& inRow, Row& outRow, vector<RGData>* rgData,
-funcexp::FuncExpWrapper* local_fe)
-{
-vector<RGData> results;
-RGData result;
-uint32_t i, j;
-bool ret;
-result = RGData(output);
-output.setData(&result);
-output.resetRowGroup(-1);
-output.getRow(0, &outRow);
-for (i = 0; i < rgData->size(); i++)
-{
-input.setData(&(*rgData)[i]);
-if (output.getRowCount() == 0)
-{
-output.resetRowGroup(input.getBaseRid());
-output.setDBRoot(input.getDBRoot());
-}
-input.getRow(0, &inRow);
-for (j = 0; j < input.getRowCount(); j++, inRow.nextRow())
-{
-ret = local_fe->evaluate(&inRow);
-if (ret)
-{
-applyMapping(fe2Mapping, inRow, &outRow);
-outRow.setRid(inRow.getRelRid());
-output.incRowCount();
-outRow.nextRow();
-if (output.getRowCount() == 8192 || output.getDBRoot() != input.getDBRoot() ||
-output.getBaseRid() != input.getBaseRid())
-{
-results.push_back(result);
-result = RGData(output);
-output.setData(&result);
-output.resetRowGroup(input.getBaseRid());
-output.setDBRoot(input.getDBRoot());
-output.getRow(0, &outRow);
-}
-}
-}
-}
-if (output.getRowCount() > 0)
-{
-results.push_back(result);
-}
-rgData->swap(results);
-}
const rowgroup::RowGroup& TupleBPS::getDeliveredRowGroup() const
{
if (fe2)