mariadb-columnstore-engine
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
fix(aggregation, RAM): MCOL-5715 Changes the second phase aggregation. (#3171)
This patch changes the second-phase aggregation pipeline so that it takes current memory consumption into account.

Co-authored-by: Leonid Fedorov <79837786+mariadb-LeonidFedorov@users.noreply.github.com>
Co-authored-by: drrtuy <roman.nozdrin@mariadb.com>
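In outline, the change wraps the whole second phase in a retry loop: each pass reserves memory from the resource manager as output row groups are dispatched to buckets, aggregates whatever was dispatched, returns the reservation, and repeats until a pass completes without exhausting the limit. Below is a minimal standalone sketch of that reserve/aggregate/return pattern. ResourceManager here is a simplified stand-in, not the real joblist resource manager (the real fRm->getMemory() also takes fSessionMemLimit and a flag derived from diskAggAllowed), and the aggregation itself is elided.

#include <cstddef>
#include <stdexcept>
#include <vector>

// Simplified stand-in for the resource manager: models only the two calls the
// patched code path relies on, not ColumnStore's actual API.
struct ResourceManager
{
  std::size_t budget;

  bool getMemory(std::size_t n)  // try to reserve n bytes against the budget
  {
    if (n > budget)
      return false;
    budget -= n;
    return true;
  }

  void returnMemory(std::size_t n)  // hand a reservation back
  {
    budget += n;
  }
};

// Drain the pending row groups in passes sized to the memory budget, the way
// the patched loop does: each pass reserves what fits, aggregates it, then
// returns the whole reservation so the next pass can take the next chunk.
void runSecondPhase(ResourceManager& rm, std::vector<std::size_t> pendingRowGroupSizes)
{
  bool finishedSecondPhase = false;

  while (!finishedSecondPhase)
  {
    std::size_t totalMemSizeConsumed = 0;
    std::size_t dispatched = 0;

    // Dispatch row groups until one no longer fits; the patch sets
    // outOfMemory and breaks out of its dispatch loop at this point.
    while (dispatched < pendingRowGroupSizes.size() &&
           rm.getMemory(pendingRowGroupSizes[dispatched]))
      totalMemSizeConsumed += pendingRowGroupSizes[dispatched++];

    if (dispatched == 0 && !pendingRowGroupSizes.empty())
      throw std::runtime_error("aggregation too big");  // cf. ERR_AGGREGATION_TOO_BIG

    // ... aggregate the `dispatched` row groups into the per-bucket maps ...

    pendingRowGroupSizes.erase(pendingRowGroupSizes.begin(),
                               pendingRowGroupSizes.begin() + dispatched);
    rm.returnMemory(totalMemSizeConsumed);  // always released after the pass
    finishedSecondPhase = pendingRowGroupSizes.empty();
  }
}

The important property is that the reservation is returned on every exit path; in the patch this is why fRm->returnMemory(totalMemSizeConsumed, fSessionMemLimit) also appears in the catch (...) handler, as the diff below shows.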
@@ -360,45 +360,86 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
   if (threadID >= fNumOfBuckets)
     return;
 
-  scoped_array<RowBucketVec> rowBucketVecs(new RowBucketVec[fNumOfBuckets]);
-  scoped_array<bool> bucketDone(new bool[fNumOfBuckets]);
-  uint32_t hashlen = fAggregator->aggMapKeyLength();
-
-  try
-  {
-    RowAggregationDistinct* aggDist = dynamic_cast<RowAggregationDistinct*>(fAggregators[threadID].get());
-    RowAggregationMultiDistinct* multiDist =
-        dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[threadID].get());
-    Row rowIn;
-    RowGroup* rowGroupIn = nullptr;
-    rowGroupIn = (aggDist->aggregator()->getOutputRowGroup());
-    uint32_t bucketID;
-    std::vector<std::unique_ptr<RGData>> rgDataVec;
-
-    if (multiDist)
-    {
-      for (uint32_t i = 0; i < fNumOfBuckets; i++)
-        rowBucketVecs[i].resize(multiDist->subAggregators().size());
-    }
-    else
-    {
-      for (uint32_t i = 0; i < fNumOfBuckets; i++)
-        rowBucketVecs[i].resize(1);
-    }
-
-    // dispatch rows to bucket
-    if (multiDist)
-    {
-      for (uint32_t j = 0; j < multiDist->subAggregators().size(); j++)
-      {
-        rowGroupIn = (multiDist->subAggregators()[j]->getOutputRowGroup());
-        rowGroupIn->initRow(&rowIn);
-        auto* subDistAgg = dynamic_cast<RowAggregationUM*>(multiDist->subAggregators()[j].get());
-
-        while (subDistAgg->nextOutputRowGroup())
-        {
-          rowGroupIn = (multiDist->subAggregators()[j]->getOutputRowGroup());
-          rgDataVec.emplace_back(subDistAgg->moveCurrentRGData());
+  bool finishedSecondPhase = false;
+  bool diskAggAllowed = fRm->getAllowDiskAggregation();
+  const uint32_t maxRowsSize = rowgroup::rgCommonSize;
+
+  while (!finishedSecondPhase && !fEndOfResult)
+  {
+    scoped_array<RowBucketVec> rowBucketVecs(new RowBucketVec[fNumOfBuckets]);
+    scoped_array<bool> bucketDone(new bool[fNumOfBuckets]);
+    uint32_t hashlen = fAggregator->aggMapKeyLength();
+    bool outOfMemory = false;
+    size_t totalMemSizeConsumed = 0;
+
+    try
+    {
+      RowAggregationDistinct* aggDist = dynamic_cast<RowAggregationDistinct*>(fAggregators[threadID].get());
+      RowAggregationMultiDistinct* multiDist =
+          dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[threadID].get());
+      Row rowIn;
+      RowGroup* rowGroupIn = nullptr;
+      rowGroupIn = (aggDist->aggregator()->getOutputRowGroup());
+      uint32_t bucketID;
+      std::vector<std::unique_ptr<RGData>> rgDataVec;
+
+      if (multiDist)
+      {
+        for (uint32_t i = 0; i < fNumOfBuckets; i++)
+          rowBucketVecs[i].resize(multiDist->subAggregators().size());
+      }
+      else
+      {
+        for (uint32_t i = 0; i < fNumOfBuckets; i++)
+          rowBucketVecs[i].resize(1);
+      }
+
+      // dispatch rows to bucket
+      if (multiDist)
+      {
+        for (uint32_t j = 0; j < multiDist->subAggregators().size(); j++)
+        {
+          rowGroupIn = (multiDist->subAggregators()[j]->getOutputRowGroup());
+          rowGroupIn->initRow(&rowIn);
+          auto* subDistAgg = dynamic_cast<RowAggregationUM*>(multiDist->subAggregators()[j].get());
+
+          while (subDistAgg->nextOutputRowGroup())
+          {
+            rowGroupIn = (multiDist->subAggregators()[j]->getOutputRowGroup());
+            rgDataVec.emplace_back(subDistAgg->moveCurrentRGData());
+            rowGroupIn->getRow(0, &rowIn);
+
+            for (uint64_t i = 0; i < rowGroupIn->getRowCount(); ++i)
+            {
+              // The key is the groupby columns, which are the leading columns.
+              // uint8_t* hashMapKey = rowIn.getData() + 2;
+              // bucketID = hash.operator()(hashMapKey) & fBucketMask;
+              uint64_t hash = rowgroup::hashRow(rowIn, hashlen - 1);
+              bucketID = hash % fNumOfBuckets;
+              rowBucketVecs[bucketID][j].emplace_back(rowIn.getPointer(), hash);
+              rowIn.nextRow();
+            }
+
+            const auto rgSize = diskAggAllowed ? rowGroupIn->getSizeWithStrings(maxRowsSize)
+                                               : rowGroupIn->getSizeWithStrings();
+            totalMemSizeConsumed += rgSize;
+            if (!fRm->getMemory(rgSize, fSessionMemLimit, !diskAggAllowed))
+            {
+              outOfMemory = true;
+              break;
+            }
+          }
+        }
+      }
+      else
+      {
+        rowGroupIn->initRow(&rowIn);
+        auto* subAgg = dynamic_cast<RowAggregationUM*>(aggDist->aggregator().get());
+
+        while (subAgg->nextOutputRowGroup())
+        {
+          rowGroupIn->setData(aggDist->aggregator()->getOutputRowGroup()->getRGData());
+          rgDataVec.emplace_back(subAgg->moveCurrentRGData());
           rowGroupIn->getRow(0, &rowIn);
 
           for (uint64_t i = 0; i < rowGroupIn->getRowCount(); ++i)
@@ -408,87 +449,80 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
             // bucketID = hash.operator()(hashMapKey) & fBucketMask;
             uint64_t hash = rowgroup::hashRow(rowIn, hashlen - 1);
             bucketID = hash % fNumOfBuckets;
-            rowBucketVecs[bucketID][j].emplace_back(rowIn.getPointer(), hash);
+            rowBucketVecs[bucketID][0].emplace_back(rowIn.getPointer(), hash);
             rowIn.nextRow();
           }
-        }
-      }
-    }
-    else
-    {
-      rowGroupIn->initRow(&rowIn);
-      auto* subAgg = dynamic_cast<RowAggregationUM*>(aggDist->aggregator().get());
-
-      while (subAgg->nextOutputRowGroup())
-      {
-        rowGroupIn->setData(aggDist->aggregator()->getOutputRowGroup()->getRGData());
-        rgDataVec.emplace_back(subAgg->moveCurrentRGData());
-        rowGroupIn->getRow(0, &rowIn);
-
-        for (uint64_t i = 0; i < rowGroupIn->getRowCount(); ++i)
-        {
-          // The key is the groupby columns, which are the leading columns.
-          // uint8_t* hashMapKey = rowIn.getData() + 2;
-          // bucketID = hash.operator()(hashMapKey) & fBucketMask;
-          uint64_t hash = rowgroup::hashRow(rowIn, hashlen - 1);
-          bucketID = hash % fNumOfBuckets;
-          rowBucketVecs[bucketID][0].emplace_back(rowIn.getPointer(), hash);
-          rowIn.nextRow();
-        }
-      }
-    }
-
-    bool done = false;
-
-    // reset bucketDone[] to be false
-    // memset(bucketDone, 0, sizeof(bucketDone));
-    fill(&bucketDone[0], &bucketDone[fNumOfBuckets], false);
-
-    while (!done && !cancelled())
-    {
-      done = true;
-
-      for (uint32_t c = 0; c < fNumOfBuckets && !cancelled(); c++)
-      {
-        if (!bucketDone[c] && fAgg_mutex[c]->try_lock())
-        {
-          try
-          {
-            if (multiDist)
-              dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[c].get())
-                  ->doDistinctAggregation_rowVec(rowBucketVecs[c]);
-            else
-              dynamic_cast<RowAggregationDistinct*>(fAggregators[c].get())
-                  ->doDistinctAggregation_rowVec(rowBucketVecs[c][0]);
-          }
-          catch (...)
-          {
-            fAgg_mutex[c]->unlock();
-            throw;
-          }
-
-          fAgg_mutex[c]->unlock();
-          bucketDone[c] = true;
-          rowBucketVecs[c][0].clear();
-        }
-        else if (!bucketDone[c])
-        {
-          done = false;
-        }
-      }
-    }
-
-    if (cancelled())
-    {
-      fEndOfResult = true;
-    }
-
-  } // try
-  catch (...)
-  {
-    handleException(std::current_exception(), logging::tupleAggregateStepErr,
-                    logging::ERR_AGGREGATION_TOO_BIG, "TupleAggregateStep::doThreadedSecondPhaseAggregate()");
-    fEndOfResult = true;
-  }
+
+          const auto rgSize =
+              diskAggAllowed ? rowGroupIn->getSizeWithStrings(maxRowsSize) : rowGroupIn->getSizeWithStrings();
+          totalMemSizeConsumed += rgSize;
+          if (!fRm->getMemory(rgSize, fSessionMemLimit, !diskAggAllowed))
+          {
+            outOfMemory = true;
+            break;
+          }
+        }
+      }
+
+      if (!outOfMemory)
+        finishedSecondPhase = true;
+
+      bool done = false;
+      // reset bucketDone[] to be false
+      // memset(bucketDone, 0, sizeof(bucketDone));
+      fill(&bucketDone[0], &bucketDone[fNumOfBuckets], false);
+
+      while (!done && !cancelled())
+      {
+        done = true;
+
+        for (uint32_t c = 0; c < fNumOfBuckets && !cancelled(); c++)
+        {
+          if (!bucketDone[c] && fAgg_mutex[c]->try_lock())
+          {
+            try
+            {
+              if (multiDist)
+                dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[c].get())
+                    ->doDistinctAggregation_rowVec(rowBucketVecs[c]);
+              else
+                dynamic_cast<RowAggregationDistinct*>(fAggregators[c].get())
+                    ->doDistinctAggregation_rowVec(rowBucketVecs[c][0]);
+            }
+            catch (...)
+            {
+              fAgg_mutex[c]->unlock();
+              throw;
+            }
+
+            fAgg_mutex[c]->unlock();
+            bucketDone[c] = true;
+            rowBucketVecs[c][0].clear();
+          }
+          else if (!bucketDone[c])
+          {
+            done = false;
+          }
+        }
+      }
+
+      fRm->returnMemory(totalMemSizeConsumed, fSessionMemLimit);
+      if (cancelled())
+      {
+        fRm->returnMemory(totalMemSizeConsumed, fSessionMemLimit);
+        finishedSecondPhase = true;
+        fEndOfResult = true;
+      }
+    } // try
+    catch (...)
+    {
+      fRm->returnMemory(totalMemSizeConsumed, fSessionMemLimit);
+      handleException(std::current_exception(), logging::tupleAggregateStepErr,
+                      logging::ERR_AGGREGATION_TOO_BIG,
+                      "TupleAggregateStep::doThreadedSecondPhaseAggregate()");
+      fEndOfResult = true;
+      finishedSecondPhase = true;
+    }
+  }
 
   fDoneAggregate = true;
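As context for the dispatch lines both hunks touch: a row is hashed over its leading group-by columns (via rowgroup::hashRow(rowIn, hashlen - 1)) and the hash modulo fNumOfBuckets picks its bucket, so equal keys always land in the same bucket and each bucket can later be aggregated by whichever worker wins fAgg_mutex[c], with no cross-bucket coordination. A toy illustration of that invariant, where hashRowKey is a hypothetical stand-in for rowgroup::hashRow():

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Hypothetical stand-in for rowgroup::hashRow(): hashes a row's group-by key.
static uint64_t hashRowKey(const std::string& groupByKey)
{
  return std::hash<std::string>{}(groupByKey);
}

int main()
{
  const uint32_t numOfBuckets = 4;  // plays the role of fNumOfBuckets
  std::vector<std::vector<std::string>> buckets(numOfBuckets);

  // Same modulo dispatch as the rowBucketVecs fill in the patch.
  for (const std::string key : {"a", "b", "a", "c", "b", "a"})
    buckets[hashRowKey(key) % numOfBuckets].push_back(key);

  // The invariant that makes per-bucket aggregation safe: every copy of a
  // given key sits in exactly one bucket.
  uint32_t bucketsHoldingA = 0;
  for (const auto& bucket : buckets)
    if (std::count(bucket.begin(), bucket.end(), std::string("a")) > 0)
      ++bucketsHoldingA;
  assert(bucketsHoldingA == 1);
  return 0;
}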