
fix(aggregation, disk-based) MCOL-5691 distinct aggregate disk based (#3145)

* fix(aggregation, disk-based): MCOL-5689 This fixes disk-based distinct aggregation functions.
Previously, disk-based distinct aggregation functions produced incorrect results because no finalization was applied to previous generations stored on disk.

* fix(aggregation, disk-based): Fix disk-based COUNT(DISTINCT ...) queries. (Case 2). (Distinct & Multi-Distinct, Single- & Multi-Threaded).

* fix(aggregation, disk-based): Fix disk-based DISTINCT & GROUP BY queries. (Case 1). (Distinct & Multi-Distinct, Single- & Multi-Threaded).

---------

Co-authored-by: Theresa Hradilak <theresa.hradilak@gmail.com>
Co-authored-by: Roman Nozdrin <rnozdrin@mariadb.com>
Author: drrtuy
Date: 2024-03-24 17:04:37 +02:00
Committed by: Leonid Fedorov
Parent: 8cb7bc8e54
Commit: 444cf4c65e
7 changed files with 398 additions and 128 deletions
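The central pattern of the fix, visible across the hunks below, is to replace nextRowGroup() with nextOutputRowGroup() on the consuming side and to finalize each delivered row group. A minimal sketch of that delivery loop, lifted from the diff (the assumption, per the commit message, is that nextOutputRowGroup() also iterates generations previously spilled to disk, so each generation is finalized before delivery):

while (fAggregators[fBucketNum]->nextOutputRowGroup())
{
  // finalize() applies the final aggregation step to the current generation;
  // previously, generations read back from disk skipped this step.
  fAggregators[fBucketNum]->finalize();
  fRowGroupDelivered.setData(fAggregators[fBucketNum]->getOutputRowGroup()->getRGData());
  // ... deliver fRowGroupDelivered to the ByteStream or DataList ...
}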

@@ -289,6 +289,7 @@ TupleAggregateStep::TupleAggregateStep(const SP_ROWAGG_UM_t& agg, const RowGroup
fNumOfBuckets =
calcNumberOfBuckets(memLimit, fNumOfThreads, fNumOfBuckets, fNumOfRowGroups, fRowGroupIn.getRowSize(),
fRowGroupOut.getRowSize(), fRm->getAllowDiskAggregation());
fNumOfThreads = std::min(fNumOfThreads, fNumOfBuckets);
fMemUsage.reset(new uint64_t[fNumOfThreads]);
@@ -392,7 +393,7 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
rowGroupIn->initRow(&rowIn);
auto* subDistAgg = dynamic_cast<RowAggregationUM*>(multiDist->subAggregators()[j].get());
while (subDistAgg->nextRowGroup())
while (subDistAgg->nextOutputRowGroup())
{
rowGroupIn = (multiDist->subAggregators()[j]->getOutputRowGroup());
rgDataVec.emplace_back(subDistAgg->moveCurrentRGData());
@@ -416,7 +417,7 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
rowGroupIn->initRow(&rowIn);
auto* subAgg = dynamic_cast<RowAggregationUM*>(aggDist->aggregator().get());
while (subAgg->nextRowGroup())
while (subAgg->nextOutputRowGroup())
{
rowGroupIn->setData(aggDist->aggregator()->getOutputRowGroup()->getRGData());
rgDataVec.emplace_back(subAgg->moveCurrentRGData());
@@ -571,7 +572,7 @@ bool TupleAggregateStep::nextDeliveredRowGroup()
{
for (; fBucketNum < fNumOfBuckets; fBucketNum++)
{
while (fAggregators[fBucketNum]->nextRowGroup())
while (fAggregators[fBucketNum]->nextOutputRowGroup())
{
fAggregators[fBucketNum]->finalize();
fRowGroupDelivered.setData(fAggregators[fBucketNum]->getOutputRowGroup()->getRGData());
@@ -5708,14 +5709,27 @@ void TupleAggregateStep::doAggregate()
return;
}
/** @brief Aggregate input row groups in two-phase multi-threaded aggregation.
* In the second phase, handle three different aggregation cases:
* 1. The query contains at least one aggregation on a DISTINCT column, e.g. SUM(DISTINCT col1), AND at
* least one GROUP BY column
* 2. The query contains at least one aggregation on a DISTINCT column but no GROUP BY column
* 3. The query contains no aggregation on a DISTINCT column, but at least one GROUP BY column
* DISTINCT selects (e.g. SELECT DISTINCT col1 FROM ...) are handled in tupleannexstep.cpp.
*/
uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp)
{
uint32_t i;
RGData rgData;
// initialize return value variable
uint64_t rowCount = 0;
try
{
/*
* Phase 1: Distribute the input rows into buckets based on the hash value of each row's GROUP BY
* columns, then distribute the buckets evenly across the aggregators in fAggregators (number of
* fAggregators == fNumOfBuckets). Each hash bucket is represented as one RowGroup in its aggregator
* (sketched below, after this hunk).
*/
if (!fDoneAggregate)
{
initializeMultiThread();
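To make the Phase 1 comment above concrete, a hedged sketch of the hash-bucket distribution (hashGroupByKey and buckets are hypothetical names for illustration, not ColumnStore API):

// Hypothetical illustration only: rows with equal GROUP BY keys hash to the
// same bucket, so each bucket can later be aggregated independently.
uint64_t h = hashGroupByKey(row);       // hash over the row's GROUP BY columns
uint32_t bucket = h % fNumOfBuckets;    // stable key-to-bucket mapping
buckets[bucket].append(row);            // each bucket becomes RowGroups in one fAggregator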
@@ -5724,9 +5738,9 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
runners.reserve(fNumOfThreads); // to prevent a resize during use
// Start the aggregator threads
for (i = 0; i < fNumOfThreads; i++)
for (uint32_t threadNum = 0; threadNum < fNumOfThreads; threadNum++)
{
runners.push_back(jobstepThreadPool.invoke(ThreadedAggregator(this, i)));
runners.push_back(jobstepThreadPool.invoke(ThreadedAggregator(this, threadNum)));
}
// Now wait for all those threads
@@ -5740,18 +5754,28 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
// much memory on average
uint32_t threads = std::max(1U, fNumOfThreads / 2);
runners.reserve(threads);
for (i = 0; i < threads; ++i)
for (uint32_t threadNum = 0; threadNum < threads; ++threadNum)
{
runners.push_back(jobstepThreadPool.invoke(ThreadedAggregateFinalizer(this, i)));
runners.push_back(jobstepThreadPool.invoke(ThreadedAggregateFinalizer(this, threadNum)));
}
jobstepThreadPool.join(runners);
}
if (dynamic_cast<RowAggregationDistinct*>(fAggregator.get()) && fAggregator->aggMapKeyLength() > 0)
/*
* Phase 2: Depending on the query type (see the three cases below), aggregate the previously created
* RowGroups of rows that need to be aggregated, and output the results.
*/
auto* distinctAggregator = dynamic_cast<RowAggregationDistinct*>(fAggregator.get());
const bool hasGroupByColumns = fAggregator->aggMapKeyLength() > 0;
// Case 1: Query contains at least one aggregation on a DISTINCT column AND at least one GROUP BY column
// e.g. SELECT SUM(DISTINCT col1) FROM test GROUP BY col2;
if (distinctAggregator && hasGroupByColumns)
{
// 2nd phase multi-threaded aggregate
if (!fEndOfResult)
{
// Do multi-threaded second phase aggregation (per row group created for GROUP BY statement)
if (!fDoneAggregate)
{
vector<uint64_t> runners; // thread pool handles
@@ -5759,97 +5783,114 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
uint32_t bucketsPerThread = fNumOfBuckets / fNumOfThreads;
uint32_t numThreads = ((fNumOfBuckets % fNumOfThreads) == 0 ? fNumOfThreads : fNumOfThreads + 1);
// uint32_t bucketsPerThread = 1;
// uint32_t numThreads = fNumOfBuckets;
runners.reserve(numThreads);
for (i = 0; i < numThreads; i++)
for (uint32_t threadNum = 0; threadNum < numThreads; threadNum++)
{
runners.push_back(jobstepThreadPool.invoke(
ThreadedSecondPhaseAggregator(this, i * bucketsPerThread, bucketsPerThread)));
ThreadedSecondPhaseAggregator(this, threadNum * bucketsPerThread, bucketsPerThread)));
}
jobstepThreadPool.join(runners);
}
// Deliver results
fDoneAggregate = true;
bool done = true;
while (nextDeliveredRowGroup())
while (nextDeliveredRowGroup() && !cancelled())
{
done = false;
rowCount = fRowGroupOut.getRowCount();
if (rowCount != 0)
{
if (fRowGroupOut.getColumnCount() != fRowGroupDelivered.getColumnCount())
pruneAuxColumns();
if (dlp)
{
rgData = fRowGroupDelivered.duplicate();
dlp->insert(rgData);
}
else
{
bs.restart();
fRowGroupDelivered.serializeRGData(bs);
if (!cleanUpAndOutputRowGroup(bs, dlp))
break;
}
}
done = true;
}
if (done)
{
fEndOfResult = true;
}
}
}
else
// Case 2: Query contains at least one aggregation on a DISTINCT column but no GROUP BY column
// e.g. SELECT SUM(DISTINCT col1) FROM test;
else if (distinctAggregator)
{
auto* agg = dynamic_cast<RowAggregationDistinct*>(fAggregator.get());
if (!fEndOfResult)
{
if (!fDoneAggregate)
{
for (i = 0; i < fNumOfBuckets; i++)
// Do aggregation over all row groups. As all row groups need to be aggregated together, there is no
// easy way to multi-thread this, so it is done in a single thread for now.
for (uint32_t bucketNum = 0; bucketNum < fNumOfBuckets; bucketNum++)
{
if (fEndOfResult == false)
{
// do the final aggregation and deliver the results
// at least one RowGroup for aggregate results
// for "distinct without group by" case
if (agg != nullptr)
{
auto* aggMultiDist = dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[i].get());
auto* aggDist = dynamic_cast<RowAggregationDistinct*>(fAggregators[i].get());
agg->aggregator(aggDist->aggregator());
// The distinctAggregator accumulates the aggregation results of all row groups: the row groups of
// each bucket aggregator are added to it, with an aggregation step performed after each addition.
auto* bucketMultiDistinctAggregator =
dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[bucketNum].get());
auto* bucketDistinctAggregator =
dynamic_cast<RowAggregationDistinct*>(fAggregators[bucketNum].get());
distinctAggregator->aggregator(bucketDistinctAggregator->aggregator());
if (aggMultiDist)
{
(dynamic_cast<RowAggregationMultiDistinct*>(agg))
->subAggregators(aggMultiDist->subAggregators());
}
agg->doDistinctAggregation();
}
// for "group by without distinct" case
else
if (bucketMultiDistinctAggregator)
{
fAggregator->append(fAggregators[i].get());
(dynamic_cast<RowAggregationMultiDistinct*>(distinctAggregator))
->subAggregators(bucketMultiDistinctAggregator->subAggregators());
}
distinctAggregator->aggregator()->finalAggregation();
distinctAggregator->doDistinctAggregation();
}
}
}
// Deliver results
fDoneAggregate = true;
bool done = true;
while (fAggregator->nextRowGroup() && !cancelled())
{
done = false;
fAggregator->finalize();
rowCount = fRowGroupOut.getRowCount();
fRowsReturned += rowCount;
fRowGroupDelivered.setData(fRowGroupOut.getRGData());
if (rowCount != 0)
{
if (!cleanUpAndOutputRowGroup(bs, dlp))
break;
}
done = true;
}
if (done)
fEndOfResult = true;
}
}
// Case 3: Query contains no aggregation on a DISTINCT column, but at least one GROUP BY column
// e.g. SELECT SUM(col1) FROM test GROUP BY col2;
// Do aggregation over all row groups. As all row groups need to be aggregated together, there is no
// easy way to multi-thread this, so it is done in a single thread for now.
else if (hasGroupByColumns)
{
if (!fEndOfResult && !fDoneAggregate)
{
for (uint32_t bucketNum = 0; bucketNum < fNumOfBuckets; ++bucketNum)
{
fAggregator->append(fAggregators[bucketNum].get());
}
}
fDoneAggregate = true;
bool done = true;
//@bug4459
while (fAggregator->nextRowGroup() && !cancelled())
{
done = false;
@@ -5860,22 +5901,9 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
if (rowCount != 0)
{
if (fRowGroupOut.getColumnCount() != fRowGroupDelivered.getColumnCount())
pruneAuxColumns();
if (dlp)
{
rgData = fRowGroupDelivered.duplicate();
dlp->insert(rgData);
}
else
{
bs.restart();
fRowGroupDelivered.serializeRGData(bs);
if (!cleanUpAndOutputRowGroup(bs, dlp))
break;
}
}
done = true;
}
@@ -5884,7 +5912,14 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
fEndOfResult = true;
}
}
} // try
else
{
throw logic_error(
"TupleAggregateStep::doThreadedAggregate: No DISTINCT columns nested into aggregation function "
"or "
"GROUP BY columns found. Should not reach here.");
}
}
catch (...)
{
handleException(std::current_exception(), logging::tupleAggregateStepErr,
@@ -5924,6 +5959,23 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
return rowCount;
}
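Taken together, Phase 2 dispatches on the query shape documented in the doc comment above. A condensed sketch of that control flow (member names as in the diff, bodies reduced to comments; not the verbatim code):

auto* distinctAggregator = dynamic_cast<RowAggregationDistinct*>(fAggregator.get());
const bool hasGroupByColumns = fAggregator->aggMapKeyLength() > 0;
if (distinctAggregator && hasGroupByColumns)
{
  // Case 1, e.g. SELECT SUM(DISTINCT col1) FROM test GROUP BY col2;
  // second-phase aggregation runs multi-threaded over ranges of buckets.
}
else if (distinctAggregator)
{
  // Case 2, e.g. SELECT SUM(DISTINCT col1) FROM test; single-threaded:
  // each bucket's aggregator is adopted, finalAggregation() is run on it,
  // and doDistinctAggregation() folds it into the running result.
}
else if (hasGroupByColumns)
{
  // Case 3, e.g. SELECT SUM(col1) FROM test GROUP BY col2; single-threaded:
  // every bucket aggregator is append()ed to fAggregator.
}
else
{
  throw logic_error("no DISTINCT aggregate and no GROUP BY column: unreachable");
}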
bool TupleAggregateStep::cleanUpAndOutputRowGroup(ByteStream& bs, RowGroupDL* dlp)
{
if (fRowGroupOut.getColumnCount() != fRowGroupDelivered.getColumnCount())
pruneAuxColumns();
if (dlp)
{
RGData rgData = fRowGroupDelivered.duplicate();
dlp->insert(rgData);
return true;
}
bs.restart();
fRowGroupDelivered.serializeRGData(bs);
return false;
}
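A note on the helper's return convention, inferred from its call sites in this commit: true means the row group went to the DataList and the caller may keep looping; false means it was serialized into the ByteStream, so callers break to deliver one row group per call:

// Call-site pattern shared by all three cases above:
if (!cleanUpAndOutputRowGroup(bs, dlp))
  break;  // a RowGroup is now serialized in bs; deliver it before producing more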
void TupleAggregateStep::pruneAuxColumns()
{
uint64_t rowCount = fRowGroupOut.getRowCount();

@@ -161,6 +161,7 @@ class TupleAggregateStep : public JobStep, public TupleDeliveryStep
void doThreadedSecondPhaseAggregate(uint32_t threadID);
bool nextDeliveredRowGroup();
void pruneAuxColumns();
bool cleanUpAndOutputRowGroup(messageqcpp::ByteStream& bs, RowGroupDL* dlp);
void formatMiniStats();
void printCalTrace();
template <class GroupByMap>