1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

Reapply "fix(aggregation, disk-based) MCOL-5691 distinct aggregate disk based (#3145)"

This reverts commit a5c12b98d7.
This commit is contained in:
Aleksei Antipovskii
2024-12-06 11:32:36 +01:00
committed by drrtuy
parent 5e5d328269
commit e0a01c6cf4
7 changed files with 417 additions and 144 deletions

View File

@ -57,7 +57,7 @@
#include "rowstorage.h"
//..comment out NDEBUG to enable assertions, uncomment NDEBUG to disable
//#define NDEBUG
// #define NDEBUG
#include "mcs_decimal.h"
using namespace std;
@ -315,7 +315,7 @@ void RowAggregation::updateStringMinMax(utils::NullString val1, utils::NullStrin
if (val1.isNull())
{
// as any comparison with NULL is false, it should not affect min/max ranges.
return ; // do nothing.
return; // do nothing.
}
CHARSET_INFO* cs = fRow.getCharset(col);
int tmp = cs->strnncoll(val1.str(), val1.length(), val2.str(), val2.length());
@ -810,8 +810,9 @@ void RowAggregation::aggregateRow(Row& row, const uint64_t* hash,
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl)
{
uint32_t cnt = fRollupFlag ? fGroupByCols.size() : 1;
for (uint32_t z = 0; z < cnt; z++) {
// groupby column list is not empty, find the entry.
for (uint32_t z = 0; z < cnt; z++)
{
// groupby column list is not empty, find the entry.
if (!fGroupByCols.empty())
{
bool is_new_row;
@ -856,7 +857,8 @@ void RowAggregation::aggregateRow(Row& row, const uint64_t* hash,
updateEntry(row, rgContextColl);
// these quantities are unsigned and comparing z and cnt - 1 can be incorrect
// because cnt can be zero.
if ((z + 1 < cnt)) {
if ((z + 1 < cnt))
{
// if we are rolling up, we mark appropriate field as NULL and also increment
// value in the "mark" column, so that we can differentiate between data and
// various rollups.
@ -1169,8 +1171,8 @@ void RowAggregation::doMinMax(const Row& rowIn, int64_t colIn, int64_t colOut, i
{
if (LIKELY(rowIn.getColumnWidth(colIn) == datatypes::MAXDECIMALWIDTH))
{
updateIntMinMax(rowIn.getTSInt128Field(colIn).getValue(), fRow.getTSInt128Field(colOut).getValue(), colOut,
funcType);
updateIntMinMax(rowIn.getTSInt128Field(colIn).getValue(), fRow.getTSInt128Field(colOut).getValue(),
colOut, funcType);
}
else if (rowIn.getColumnWidth(colIn) <= datatypes::MAXLEGACYWIDTH)
{
@ -2120,10 +2122,9 @@ void RowAggregation::doStatistics(const Row& rowIn, int64_t colIn, int64_t colOu
long double mean = fRow.getLongDoubleField(colAux);
long double scaledMomentum2 = fRow.getLongDoubleField(colAux + 1);
volatile long double delta = valIn - mean;
mean += delta/count;
mean += delta / count;
scaledMomentum2 += delta * (valIn - mean);
fRow.setDoubleField(count, colOut);
fRow.setLongDoubleField(mean, colAux);
fRow.setLongDoubleField(scaledMomentum2, colAux + 1);
@ -2173,8 +2174,7 @@ void RowAggregation::doUDAF(const Row& rowIn, int64_t colIn, int64_t colOut, int
cc = dynamic_cast<execplan::ConstantColumn*>(fFunctionCols[funcColsIdx]->fpConstCol.get());
}
if ((cc && cc->isNull()) ||
(!cc && isNull(&fRowGroupIn, rowIn, colIn) == true))
if ((cc && cc->isNull()) || (!cc && isNull(&fRowGroupIn, rowIn, colIn) == true))
{
if (udafContextsColl[origFuncColsIdx].getRunFlag(mcsv1sdk::UDAF_IGNORE_NULLS))
{
@ -2500,7 +2500,8 @@ void RowAggregation::loadEmptySet(messageqcpp::ByteStream& bs)
//------------------------------------------------------------------------------
RowAggregationUM::RowAggregationUM(const vector<SP_ROWAGG_GRPBY_t>& rowAggGroupByCols,
const vector<SP_ROWAGG_FUNC_t>& rowAggFunctionCols,
joblist::ResourceManager* r, boost::shared_ptr<int64_t> sessionLimit, bool withRollup)
joblist::ResourceManager* r, boost::shared_ptr<int64_t> sessionLimit,
bool withRollup)
: RowAggregation(rowAggGroupByCols, rowAggFunctionCols, r, sessionLimit, withRollup)
, fHasAvg(false)
, fHasStatsFunc(false)
@ -3228,7 +3229,7 @@ void RowAggregationUM::SetUDAFAnyValue(static_any::any& valOut, int64_t colOut)
case execplan::CalpontSystemCatalog::CHAR:
case execplan::CalpontSystemCatalog::VARCHAR:
case execplan::CalpontSystemCatalog::TEXT: fRow.setStringField(strOut, colOut); break;
case execplan::CalpontSystemCatalog::TEXT: fRow.setStringField(strOut, colOut); break;
case execplan::CalpontSystemCatalog::VARBINARY:
case execplan::CalpontSystemCatalog::CLOB:
@ -4220,13 +4221,26 @@ bool RowAggregationUM::nextRowGroup()
return more;
}
// Asks fRowAggStorage for its next output RGData (stored into fCurRGData).
// When one is available, fRowGroupOut is pointed at that data.
// Returns true if a row group was fetched, false otherwise.
bool RowAggregationUM::nextOutputRowGroup()
{
  if (!fRowAggStorage->getNextOutputRGData(fCurRGData))
  {
    // Nothing fetched: fRowGroupOut is left untouched.
    return false;
  }

  fRowGroupOut->setData(fCurRGData.get());
  return true;
}
//------------------------------------------------------------------------------
// Row Aggregation constructor used on UM
// For 2nd phase of two-phase case, from partial RG to final aggregated RG
//
// Every argument is forwarded unchanged to the RowAggregationUM base
// constructor; the constructor body itself is empty.
// NOTE(review): the trailing parameters below appear twice (once on a single
// line, once wrapped) -- presumably the before/after sides of a formatting-only
// diff hunk; confirm against the real source file.
//------------------------------------------------------------------------------
RowAggregationUMP2::RowAggregationUMP2(const vector<SP_ROWAGG_GRPBY_t>& rowAggGroupByCols,
const vector<SP_ROWAGG_FUNC_t>& rowAggFunctionCols,
joblist::ResourceManager* r, boost::shared_ptr<int64_t> sessionLimit, bool withRollup)
joblist::ResourceManager* r, boost::shared_ptr<int64_t> sessionLimit,
bool withRollup)
: RowAggregationUM(rowAggGroupByCols, rowAggFunctionCols, r, sessionLimit, withRollup)
{
}
@ -4450,7 +4464,7 @@ void RowAggregationUMP2::doAvg(const Row& rowIn, int64_t colIn, int64_t colOut,
{
if (LIKELY(cnt > 0))
{
int128_t valOut = fRow.getTSInt128Field(colOut).getValue();;
int128_t valOut = fRow.getTSInt128Field(colOut).getValue();
int128_t sum = valOut + wideValue;
fRow.setInt128Field(sum, colOut);
fRow.setUintField(rowIn.getUintField(colAuxIn) + cnt, colAux);
@ -4509,7 +4523,8 @@ void RowAggregationUMP2::doStatistics(const Row& rowIn, int64_t colIn, int64_t c
{
volatile long double delta = mean - blockMean;
nextMean = (mean * count + blockMean * blockCount) / nextCount;
nextScaledMomentum2 = scaledMomentum2 + blockScaledMomentum2 + delta * delta * (count * blockCount / nextCount);
nextScaledMomentum2 =
scaledMomentum2 + blockScaledMomentum2 + delta * delta * (count * blockCount / nextCount);
}
fRow.setDoubleField(nextCount, colOut);
fRow.setLongDoubleField(nextMean, colAux);
@ -4682,19 +4697,29 @@ void RowAggregationDistinct::addRowGroup(const RowGroup* pRows,
//------------------------------------------------------------------------------
// Walks the output row groups of the wrapped aggregator (fAggregator) and passes
// every row of each group to aggregateRow() on this object.
// NOTE(review): this span interleaves two variants of the body -- a loop driven
// by nextRowGroup() and a null-checked loop driven by nextOutputRowGroup() --
// presumably the removed and added sides of a diff hunk; confirm which lines are
// live against the real source file.
void RowAggregationDistinct::doDistinctAggregation()
{
while (dynamic_cast<RowAggregationUM*>(fAggregator.get())->nextRowGroup())
// dynamic_cast returns nullptr when fAggregator does not point at a
// RowAggregationUM, so the result is tested before it is dereferenced.
auto* umAggregator = dynamic_cast<RowAggregationUM*>(fAggregator.get());
if (umAggregator)
{
fRowGroupIn.setData(fAggregator->getOutputRowGroup()->getRGData());
Row rowIn;
fRowGroupIn.initRow(&rowIn);
fRowGroupIn.getRow(0, &rowIn);
for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i, rowIn.nextRow())
// Repeat for as long as nextOutputRowGroup() reports another row group.
while (umAggregator->nextOutputRowGroup())
{
aggregateRow(rowIn);
// Use the sub-aggregator's current output data as this object's input row group.
fRowGroupIn.setData(fAggregator->getOutputRowGroup()->getRGData());
Row rowIn;
fRowGroupIn.initRow(&rowIn);
// Start at row 0; rowIn.nextRow() in the loop header advances it.
fRowGroupIn.getRow(0, &rowIn);
for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i, rowIn.nextRow())
{
aggregateRow(rowIn);
}
}
}
else
{
// The cast failed: the problem is only written to cerr. No exception is thrown
// here and no rows are aggregated on this path.
std::ostringstream errmsg;
errmsg << "RowAggregationDistinct: incorrect fAggregator class.";
cerr << errmsg.str() << endl;
}
}
void RowAggregationDistinct::doDistinctAggregation_rowVec(vector<std::pair<Row::Pointer, uint64_t>>& inRows)