Reapply "fix(aggregation, disk-based) MCOL-5691 distinct aggregate disk based (#3145)"

This reverts commit a5c12b98d7.
2025-07-30 19:23:07 +03:00 · 2024-12-06 11:32:36 +01:00
parent 5e5d328269
commit e0a01c6cf4
7 changed files with 417 additions and 144 deletions
--- a/utils/rowgroup/rowaggregation.cpp
+++ b/utils/rowgroup/rowaggregation.cpp
@ -57,7 +57,7 @@
 #include "rowstorage.h"

 //..comment out NDEBUG to enable assertions, uncomment NDEBUG to disable
-//#define NDEBUG
+// #define NDEBUG
 #include "mcs_decimal.h"

 using namespace std;
@ -315,7 +315,7 @@ void RowAggregation::updateStringMinMax(utils::NullString val1, utils::NullStrin
  if (val1.isNull())
  {
    // as any comparison with NULL is false, it should not affect min/max ranges.
-    return ; // do nothing.
+    return;  // do nothing.
  }
  CHARSET_INFO* cs = fRow.getCharset(col);
  int tmp = cs->strnncoll(val1.str(), val1.length(), val2.str(), val2.length());
@ -810,8 +810,9 @@ void RowAggregation::aggregateRow(Row& row, const uint64_t* hash,
                                  std::vector<mcsv1sdk::mcsv1Context>* rgContextColl)
 {
  uint32_t cnt = fRollupFlag ? fGroupByCols.size() : 1;
-  for (uint32_t z = 0; z < cnt; z++) {
-  // groupby column list is not empty, find the entry.
+  for (uint32_t z = 0; z < cnt; z++)
+  {
+    // groupby column list is not empty, find the entry.
    if (!fGroupByCols.empty())
    {
      bool is_new_row;
@ -856,7 +857,8 @@ void RowAggregation::aggregateRow(Row& row, const uint64_t* hash,
    updateEntry(row, rgContextColl);
    // these quantities are unsigned and comparing z and cnt - 1 can be incorrect
    // because cnt can be zero.
-    if ((z + 1 < cnt)) {
+    if ((z + 1 < cnt))
+    {
      // if we are rolling up, we mark appropriate field as NULL and also increment
      // value in the "mark" column, so that we can differentiate between data and
      // various rollups.
@ -1169,8 +1171,8 @@ void RowAggregation::doMinMax(const Row& rowIn, int64_t colIn, int64_t colOut, i
    {
      if (LIKELY(rowIn.getColumnWidth(colIn) == datatypes::MAXDECIMALWIDTH))
      {
-        updateIntMinMax(rowIn.getTSInt128Field(colIn).getValue(), fRow.getTSInt128Field(colOut).getValue(), colOut,
-                        funcType);
+        updateIntMinMax(rowIn.getTSInt128Field(colIn).getValue(), fRow.getTSInt128Field(colOut).getValue(),
+                        colOut, funcType);
      }
      else if (rowIn.getColumnWidth(colIn) <= datatypes::MAXLEGACYWIDTH)
      {
@ -2120,10 +2122,9 @@ void RowAggregation::doStatistics(const Row& rowIn, int64_t colIn, int64_t colOu
  long double mean = fRow.getLongDoubleField(colAux);
  long double scaledMomentum2 = fRow.getLongDoubleField(colAux + 1);
  volatile long double delta = valIn - mean;
-  mean += delta/count;
+  mean += delta / count;
  scaledMomentum2 += delta * (valIn - mean);

-
  fRow.setDoubleField(count, colOut);
  fRow.setLongDoubleField(mean, colAux);
  fRow.setLongDoubleField(scaledMomentum2, colAux + 1);
@ -2173,8 +2174,7 @@ void RowAggregation::doUDAF(const Row& rowIn, int64_t colIn, int64_t colOut, int
      cc = dynamic_cast<execplan::ConstantColumn*>(fFunctionCols[funcColsIdx]->fpConstCol.get());
    }

-    if ((cc && cc->isNull()) ||
-        (!cc && isNull(&fRowGroupIn, rowIn, colIn) == true))
+    if ((cc && cc->isNull()) || (!cc && isNull(&fRowGroupIn, rowIn, colIn) == true))
    {
      if (udafContextsColl[origFuncColsIdx].getRunFlag(mcsv1sdk::UDAF_IGNORE_NULLS))
      {
@ -2500,7 +2500,8 @@ void RowAggregation::loadEmptySet(messageqcpp::ByteStream& bs)
 //------------------------------------------------------------------------------
 RowAggregationUM::RowAggregationUM(const vector<SP_ROWAGG_GRPBY_t>& rowAggGroupByCols,
                                   const vector<SP_ROWAGG_FUNC_t>& rowAggFunctionCols,
-                                   joblist::ResourceManager* r, boost::shared_ptr<int64_t> sessionLimit, bool withRollup)
+                                   joblist::ResourceManager* r, boost::shared_ptr<int64_t> sessionLimit,
+                                   bool withRollup)
 : RowAggregation(rowAggGroupByCols, rowAggFunctionCols, r, sessionLimit, withRollup)
 , fHasAvg(false)
 , fHasStatsFunc(false)
@ -3228,7 +3229,7 @@ void RowAggregationUM::SetUDAFAnyValue(static_any::any& valOut, int64_t colOut)

    case execplan::CalpontSystemCatalog::CHAR:
    case execplan::CalpontSystemCatalog::VARCHAR:
-    case execplan::CalpontSystemCatalog::TEXT:    fRow.setStringField(strOut, colOut); break;
+    case execplan::CalpontSystemCatalog::TEXT: fRow.setStringField(strOut, colOut); break;

    case execplan::CalpontSystemCatalog::VARBINARY:
    case execplan::CalpontSystemCatalog::CLOB:
@ -4220,13 +4221,26 @@ bool RowAggregationUM::nextRowGroup()
  return more;
 }

+bool RowAggregationUM::nextOutputRowGroup()  // Fetches the next output RGData from fRowAggStorage.
+{
+  bool more = fRowAggStorage->getNextOutputRGData(fCurRGData);  // result lands in fCurRGData
+
+  if (more)
+  {
+    fRowGroupOut->setData(fCurRGData.get());  // repoint the output row group at the fetched data
+  }
+
+  return more;  // presumably false once the storage has nothing left to hand out - TODO confirm
+}
+
 //------------------------------------------------------------------------------
 // Row Aggregation constructor used on UM
 // For 2nd phase of two-phase case, from partial RG to final aggregated RG
 //------------------------------------------------------------------------------
 RowAggregationUMP2::RowAggregationUMP2(const vector<SP_ROWAGG_GRPBY_t>& rowAggGroupByCols,
                                       const vector<SP_ROWAGG_FUNC_t>& rowAggFunctionCols,
-                                       joblist::ResourceManager* r, boost::shared_ptr<int64_t> sessionLimit, bool withRollup)
+                                       joblist::ResourceManager* r, boost::shared_ptr<int64_t> sessionLimit,
+                                       bool withRollup)  // all five arguments are forwarded as-is to RowAggregationUM
 : RowAggregationUM(rowAggGroupByCols, rowAggFunctionCols, r, sessionLimit, withRollup)
 {
 }
@ -4450,7 +4464,7 @@ void RowAggregationUMP2::doAvg(const Row& rowIn, int64_t colIn, int64_t colOut,
  {
    if (LIKELY(cnt > 0))
    {
-      int128_t valOut = fRow.getTSInt128Field(colOut).getValue();;
+      int128_t valOut = fRow.getTSInt128Field(colOut).getValue();
      int128_t sum = valOut + wideValue;
      fRow.setInt128Field(sum, colOut);
      fRow.setUintField(rowIn.getUintField(colAuxIn) + cnt, colAux);
@ -4509,7 +4523,8 @@ void RowAggregationUMP2::doStatistics(const Row& rowIn, int64_t colIn, int64_t c
  {
    volatile long double delta = mean - blockMean;
    nextMean = (mean * count + blockMean * blockCount) / nextCount;
-    nextScaledMomentum2 = scaledMomentum2 + blockScaledMomentum2 + delta * delta * (count * blockCount / nextCount);
+    nextScaledMomentum2 =
+        scaledMomentum2 + blockScaledMomentum2 + delta * delta * (count * blockCount / nextCount);
  }
  fRow.setDoubleField(nextCount, colOut);
  fRow.setLongDoubleField(nextMean, colAux);
@ -4682,19 +4697,29 @@ void RowAggregationDistinct::addRowGroup(const RowGroup* pRows,
 //------------------------------------------------------------------------------
 void RowAggregationDistinct::doDistinctAggregation()
 {
-  while (dynamic_cast<RowAggregationUM*>(fAggregator.get())->nextRowGroup())
+  auto* umAggregator = dynamic_cast<RowAggregationUM*>(fAggregator.get());  // checked downcast, tested below
+  if (umAggregator)
   {
-    fRowGroupIn.setData(fAggregator->getOutputRowGroup()->getRGData());
-
-    Row rowIn;
-    fRowGroupIn.initRow(&rowIn);
-    fRowGroupIn.getRow(0, &rowIn);
-
-    for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i, rowIn.nextRow())
+    while (umAggregator->nextOutputRowGroup())  // loop while the sub-aggregator reports another output row group
     {
-      aggregateRow(rowIn);
+      fRowGroupIn.setData(fAggregator->getOutputRowGroup()->getRGData());  // sub-aggregator output is this pass's input
+
+      Row rowIn;
+      fRowGroupIn.initRow(&rowIn);
+      fRowGroupIn.getRow(0, &rowIn);  // start at row 0 of the current row group
+
+      for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i, rowIn.nextRow())
+      {
+        aggregateRow(rowIn);  // feed every row of the group into this aggregator
+      }
     }
   }
+  else  // the cast produced a null pointer: fAggregator is not a RowAggregationUM
+  {
+    std::ostringstream errmsg;
+    errmsg << "RowAggregationDistinct: incorrect fAggregator class.";
+    cerr << errmsg.str() << endl;  // NOTE(review): only written to stderr; nothing is thrown and no rows are aggregated
+  }
 }

 void RowAggregationDistinct::doDistinctAggregation_rowVec(vector<std::pair<Row::Pointer, uint64_t>>& inRows)