mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
[MCOL-4709] Disk-based aggregation

* Introduce multi-generation aggregation
* Do not save unused parts of RGDatas to disk
* Add an IO error explanation (strerror)
* Reduce memory usage while aggregating
* Introduce in-memory generations for better memory utilization
* Try to keep the number of buckets low
* Refactor disk aggregation a bit
* Pass the calculated hash into RowAggregation
* Try to keep some RGDatas with free space in memory
* If generations are allowed, do not dump more than half of the rowgroups to disk; start a new generation instead
* For each thread, shift the first processed bucket on every iteration so that generations start more evenly
* Unify the temp data location
* Explicitly create the temp subdirectories whether disk aggregation/join is enabled or not
@@ -55,6 +55,7 @@ const string ResourceManager::fExtentMapStr("ExtentMap");
 //const string ResourceManager::fDMLProcStr("DMLProc");
 //const string ResourceManager::fBatchInsertStr("BatchInsert");
 const string ResourceManager::fOrderByLimitStr("OrderByLimit");
+const string ResourceManager::fRowAggregationStr("RowAggregation");

 ResourceManager* ResourceManager::fInstance = NULL;
 boost::mutex mx;
@@ -254,6 +255,10 @@ ResourceManager::ResourceManager(bool runningInExeMgr) :
        fUseHdfs = true;
    else
        fUseHdfs = false;

+   fAllowedDiskAggregation = getBoolVal(fRowAggregationStr,
+                                        "AllowDiskBasedAggregation",
+                                        defaultAllowDiskAggregation);
 }

 int ResourceManager::getEmPriority() const
@@ -126,6 +126,8 @@ const uint64_t defaultOrderByLimitMaxMemory = 1 * 1024 * 1024 * 1024ULL;
 const uint64_t defaultDECThrottleThreshold = 200000000; // ~200 MB

 const uint8_t defaultUseCpimport = 1;

+const bool defaultAllowDiskAggregation = false;
+
 /** @brief ResourceManager
  * Returns requested values from Config
  *
@@ -149,7 +151,7 @@ public:

    /** @brief dtor
     */
-   virtual ~ResourceManager() { }
+   virtual ~ResourceManager() {}

    typedef std::map <uint32_t, uint64_t> MemMap;

@@ -177,6 +179,11 @@ public:
        return getIntVal(fExeMgrStr, "ExecQueueSize", defaultEMExecQueueSize);
    }

+   bool getAllowDiskAggregation() const
+   {
+       return fAllowedDiskAggregation;
+   }
+
    int getHjMaxBuckets() const
    {
        return getUintVal(fHashJoinStr, "MaxBuckets", defaultHJMaxBuckets);
@@ -391,7 +398,7 @@ public:
        atomicops::atomicAdd(&totalUmMemLimit, amount);
        atomicops::atomicAdd(sessionLimit.get(), amount);
    }
-   inline int64_t availableMemory()
+   inline int64_t availableMemory() const
    {
        return totalUmMemLimit;
    }
@@ -559,6 +566,8 @@ private:
    template<typename IntType>
    IntType getIntVal(const std::string& section, const std::string& name, IntType defval) const;

+   bool getBoolVal(const std::string& section, const std::string& name, bool defval) const;
+
    void logMessage(logging::LOG_TYPE logLevel, logging::Message::MessageID mid, uint64_t value = 0, uint32_t sessionId = 0);

    /*static const*/ std::string fExeMgrStr;
@@ -573,6 +582,7 @@ private:
    /*static const*/ std::string fDMLProcStr;
    /*static const*/ std::string fBatchInsertStr;
    static const std::string fOrderByLimitStr;
+   static const std::string fRowAggregationStr;
    config::Config* fConfig;
    static ResourceManager* fInstance;
    uint32_t fTraceFlags;
@@ -604,6 +614,7 @@ private:

    bool isExeMgr;
    bool fUseHdfs;
+   bool fAllowedDiskAggregation{false};
 };

@@ -644,7 +655,11 @@ inline IntType ResourceManager::getIntVal(const std::string& section, const std:
    return ( 0 == retStr.length() ? defval : fConfig->fromText(retStr) );
 }

+inline bool ResourceManager::getBoolVal(const std::string& section, const std::string& name, bool defval) const
+{
+   auto retStr = fConfig->getConfig(section, name);
+   return ( 0 == retStr.length() ? defval : (retStr == "y" || retStr == "Y") );
+}
+
 }

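A note on the new getBoolVal(): an empty (unset) value falls back to the compiled-in default, only the exact strings "y" or "Y" enable the option, and any other value disables it. Below is a standalone sketch of just that parsing rule; the Config lookup itself is left out and the helper name is mine, not part of the patch.

    #include <iostream>
    #include <string>

    // Mirrors the rule in ResourceManager::getBoolVal(): empty string means
    // "key not set, keep the default"; only the exact strings "y"/"Y" enable.
    static bool parseBoolSetting(const std::string& retStr, bool defval)
    {
        return retStr.empty() ? defval : (retStr == "y" || retStr == "Y");
    }

    int main()
    {
        std::cout << parseBoolSetting("", false) << '\n';  // 0 -> default kept
        std::cout << parseBoolSetting("Y", false) << '\n'; // 1 -> enabled
        std::cout << parseBoolSetting("N", true) << '\n';  // 0 -> anything else disables
        return 0;
    }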
@@ -129,7 +129,7 @@ struct cmpTuple
    }
 };

-typedef vector<Row::Pointer> RowBucket;
+typedef vector<std::pair<Row::Pointer, uint64_t>> RowBucket;
 typedef vector<RowBucket> RowBucketVec;

 // The AGG_MAP type is used to maintain a list of aggregate functions in order to
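Each bucket entry now carries the row hash next to the row pointer, so the hash computed once for bucket selection can be handed down to RowAggregation instead of being recomputed (see the commit message). A minimal sketch of the new bucket shape, with hypothetical stand-ins for the rowgroup types:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Hypothetical stand-ins for Row::Pointer and rowgroup::hashRow(); the real
    // types live in utils/rowgroup. The point is the shape of the change: each
    // bucket entry keeps the hash that was already computed for bucketing, so
    // downstream aggregation does not have to hash the row again.
    using RowPointer = const void*;
    using RowBucket  = std::vector<std::pair<RowPointer, uint64_t>>;

    void addToBucket(std::vector<RowBucket>& buckets, RowPointer row, uint64_t hash)
    {
        // buckets must be non-empty; same modulo rule as the patch uses
        uint64_t bucketID = hash % buckets.size();
        buckets[bucketID].emplace_back(row, hash);  // keep the hash alongside the row
    }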
@@ -402,6 +402,17 @@ TupleAggregateStep::TupleAggregateStep(
    fNumOfThreads = fRm->aggNumThreads();
    fNumOfBuckets = fRm->aggNumBuckets();
    fNumOfRowGroups = fRm->aggNumRowGroups();

+   auto memLimit = std::min(fRm->availableMemory(), *fSessionMemLimit);
+   fNumOfBuckets = calcNumberOfBuckets(memLimit,
+                                       fNumOfThreads,
+                                       fNumOfBuckets,
+                                       fNumOfRowGroups,
+                                       fRowGroupIn.getRowSize(),
+                                       fRowGroupOut.getRowSize(),
+                                       fRm->getAllowDiskAggregation());
+   fNumOfThreads = std::min(fNumOfThreads, fNumOfBuckets);

    fMemUsage.reset(new uint64_t[fNumOfThreads]);
    memset(fMemUsage.get(), 0, fNumOfThreads * sizeof(uint64_t));

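calcNumberOfBuckets() itself is not part of this excerpt, so the following is only a sketch of the kind of sizing its parameter list implies: cap the bucket count so a working set per bucket still fits the memory budget when disk aggregation is allowed. Every heuristic here (including the 8192 rows-per-rowgroup assumption) is mine, not the real formula.

    #include <algorithm>
    #include <cstdint>

    // NOT the real calcNumberOfBuckets() -- its body is outside this diff.
    uint32_t calcNumberOfBucketsSketch(int64_t memLimit,
                                       uint32_t numThreads,
                                       uint32_t numBuckets,
                                       uint32_t numRowGroups,
                                       uint32_t rowSizeIn,
                                       uint32_t rowSizeOut,
                                       bool allowDiskAggregation)
    {
        if (!allowDiskAggregation)
            return numBuckets;  // keep the configured count; overflow errors out later

        // Assumed per-bucket footprint: numRowGroups input rowgroups plus one
        // output rowgroup, at 8192 rows per rowgroup.
        const uint64_t perBucket = uint64_t(numRowGroups) * 8192 * rowSizeIn
                                 + uint64_t(8192) * rowSizeOut;
        const uint64_t budget  = memLimit > 0 ? uint64_t(memLimit) : 0;
        const uint64_t fitting = perBucket ? budget / perBucket : numBuckets;

        uint64_t result = std::min<uint64_t>(numBuckets, std::max<uint64_t>(fitting, 1));
        (void)numThreads;  // the real function may also consider the thread count
        return uint32_t(result);
    }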
@@ -440,7 +451,7 @@ void TupleAggregateStep::initializeMultiThread()

    for (i = 0; i < fNumOfBuckets; i++)
    {
        boost::mutex* lock = new boost::mutex();
        fAgg_mutex.push_back(lock);
        fRowGroupOuts[i] = fRowGroupOut;
        rgData.reinit(fRowGroupOut);
@@ -481,9 +492,10 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
    RowAggregationDistinct* aggDist = dynamic_cast<RowAggregationDistinct*>(fAggregators[threadID].get());
    RowAggregationMultiDistinct* multiDist = dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[threadID].get());
    Row rowIn;
-   RowGroup* rowGroupIn = 0;
+   RowGroup* rowGroupIn = nullptr;
    rowGroupIn = (aggDist->aggregator()->getOutputRowGroup());
    uint32_t bucketID;
+   std::vector<std::unique_ptr<RGData>> rgDataVec;

    if (multiDist)
    {
@@ -503,10 +515,12 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
        {
            rowGroupIn = (multiDist->subAggregators()[j]->getOutputRowGroup());
            rowGroupIn->initRow(&rowIn);
+           auto* subDistAgg = dynamic_cast<RowAggregationUM*>(multiDist->subAggregators()[j].get());

-           while (dynamic_cast<RowAggregationUM*>(multiDist->subAggregators()[j].get())->nextRowGroup())
+           while (subDistAgg->nextRowGroup())
            {
                rowGroupIn = (multiDist->subAggregators()[j]->getOutputRowGroup());
+               rgDataVec.emplace_back(subDistAgg->moveCurrentRGData());
                rowGroupIn->getRow(0, &rowIn);

                for (uint64_t i = 0; i < rowGroupIn->getRowCount(); ++i)
@@ -514,8 +528,9 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
                    // The key is the groupby columns, which are the leading columns.
                    //uint8_t* hashMapKey = rowIn.getData() + 2;
                    //bucketID = hash.operator()(hashMapKey) & fBucketMask;
-                   bucketID = rowIn.hash(hashlen - 1) % fNumOfBuckets;
-                   rowBucketVecs[bucketID][j].push_back(rowIn.getPointer());
+                   uint64_t hash = rowgroup::hashRow(rowIn, hashlen - 1);
+                   bucketID = hash % fNumOfBuckets;
+                   rowBucketVecs[bucketID][j].emplace_back(rowIn.getPointer(), hash);
                    rowIn.nextRow();
                }
            }
@@ -524,10 +539,12 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
        else
        {
            rowGroupIn->initRow(&rowIn);
+           auto* subAgg = dynamic_cast<RowAggregationUM*>(aggDist->aggregator().get());

-           while (dynamic_cast<RowAggregationUM*>(aggDist->aggregator().get())->nextRowGroup())
+           while (subAgg->nextRowGroup())
            {
                rowGroupIn->setData(aggDist->aggregator()->getOutputRowGroup()->getRGData());
+               rgDataVec.emplace_back(subAgg->moveCurrentRGData());
                rowGroupIn->getRow(0, &rowIn);

                for (uint64_t i = 0; i < rowGroupIn->getRowCount(); ++i)
@@ -535,8 +552,9 @@ void TupleAggregateStep::doThreadedSecondPhaseAggregate(uint32_t threadID)
                    // The key is the groupby columns, which are the leading columns.
                    //uint8_t* hashMapKey = rowIn.getData() + 2;
                    //bucketID = hash.operator()(hashMapKey) & fBucketMask;
-                   bucketID = rowIn.hash(hashlen - 1) % fNumOfBuckets;
-                   rowBucketVecs[bucketID][0].push_back(rowIn.getPointer());
+                   uint64_t hash = rowgroup::hashRow(rowIn, hashlen - 1);
+                   bucketID = hash % fNumOfBuckets;
+                   rowBucketVecs[bucketID][0].emplace_back(rowIn.getPointer(), hash);
                    rowIn.nextRow();
                }
            }
@@ -971,7 +989,7 @@ SJSTEP TupleAggregateStep::prepAggregate(SJSTEP& step, JobInfo& jobInfo)
        if (doUMOnly)
            rgs.push_back(rgs[0]);
    }

    if (!doUMOnly)
    {
        if (distinctAgg == true)
@@ -1013,7 +1031,7 @@ SJSTEP TupleAggregateStep::prepAggregate(SJSTEP& step, JobInfo& jobInfo)

    // Setup the input JobstepAssoctiation -- the mechanism
    // whereby the previous step feeds data to this step.
    // Otherwise, we need to create one and hook to the
    // previous step as well as this aggregate step.
    spjs->stepId(step->stepId() + 1);

@@ -1299,7 +1317,7 @@ void TupleAggregateStep::prep1PhaseAggregate(
                if (pUDAFFunc && udafc->getContext().getParamKeys()->size() == 0)
                {
                    for (uint64_t k = i+1;
                         k < returnedColVec.size() && returnedColVec[k].second == AggregateColumn::MULTI_PARM;
                         ++k)
                    {
                        udafc->getContext().getParamKeys()->push_back(returnedColVec[k].first);
@@ -1333,7 +1351,7 @@ void TupleAggregateStep::prep1PhaseAggregate(
                precisionAgg.push_back(precisionProj[colProj]);
                typeAgg.push_back(typeProj[colProj]);
                csNumAgg.push_back(csNumProj[colProj]);
                widthAgg.push_back(width[colProj]);
            }
            break;

@@ -1836,7 +1854,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
                if (pUDAFFunc && udafc->getContext().getParamKeys()->size() == 0)
                {
                    for (uint64_t k = i+1;
                         k < aggColVec.size() && aggColVec[k].second == AggregateColumn::MULTI_PARM;
                         ++k)
                    {
                        udafc->getContext().getParamKeys()->push_back(aggColVec[k].first);
@@ -2140,7 +2158,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
            groupByNoDist.push_back(groupby);
            aggFuncMap.insert(make_pair(boost::make_tuple(keysAgg[i], 0, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL), i));
        }

        // locate the return column position in aggregated rowgroup
        uint64_t outIdx = 0;
        for (uint64_t i = 0; i < returnedColVec.size(); i++)
@@ -2198,7 +2216,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
                if (pUDAFFunc && udafc->getContext().getParamKeys()->size() == 0)
                {
                    for (uint64_t k = i+1;
                         k < returnedColVec.size() && returnedColVec[k].second == AggregateColumn::MULTI_PARM;
                         ++k)
                    {
                        udafc->getContext().getParamKeys()->push_back(returnedColVec[k].first);
@@ -3120,7 +3138,7 @@ void TupleAggregateStep::prep2PhasesAggregate(
                if (pUDAFFunc && udafc->getContext().getParamKeys()->size() == 0)
                {
                    for (uint64_t k = i+1;
                         k < aggColVec.size() && aggColVec[k].second == AggregateColumn::MULTI_PARM;
                         ++k)
                    {
                        udafc->getContext().getParamKeys()->push_back(aggColVec[k].first);
@@ -3430,7 +3448,7 @@ void TupleAggregateStep::prep2PhasesAggregate(
                if (pUDAFFunc && udafc->getContext().getParamKeys()->size() == 0)
                {
                    for (uint64_t k = i+1;
                         k < returnedColVec.size() && returnedColVec[k].second == AggregateColumn::MULTI_PARM;
                         ++k)
                    {
                        udafc->getContext().getParamKeys()->push_back(returnedColVec[k].first);
@@ -3732,7 +3750,7 @@ void TupleAggregateStep::prep2PhasesAggregate(
    for (uint64_t i = 0; i < oidsAggUm.size(); i++)
        posAggUm.push_back(posAggUm[i] + widthAggUm[i]);

    RowGroup aggRgUm(oidsAggUm.size(), posAggUm, oidsAggUm, keysAggUm, typeAggUm,
                     csNumAggUm, scaleAggUm, precisionAggUm, jobInfo.stringTableThreshold);
    SP_ROWAGG_UM_t rowAggUm(new RowAggregationUMP2(groupByUm, functionVecUm, jobInfo.rm, jobInfo.umMemLimit));
    rowAggUm->timeZone(jobInfo.timeZone);
@@ -3744,7 +3762,7 @@ void TupleAggregateStep::prep2PhasesAggregate(
    for (uint64_t i = 0; i < oidsAggPm.size(); i++)
        posAggPm.push_back(posAggPm[i] + widthAggPm[i]);

    RowGroup aggRgPm(oidsAggPm.size(), posAggPm, oidsAggPm, keysAggPm, typeAggPm,
                     csNumAggPm, scaleAggPm, precisionAggPm, jobInfo.stringTableThreshold);
    SP_ROWAGG_PM_t rowAggPm(new RowAggregation(groupByPm, functionVecPm));
    rowAggPm->timeZone(jobInfo.timeZone);
@@ -4005,7 +4023,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
                if (pUDAFFunc && udafc->getContext().getParamKeys()->size() == 0)
                {
                    for (uint64_t k = i+1;
                         k < aggColVec.size() && aggColVec[k].second == AggregateColumn::MULTI_PARM;
                         ++k)
                    {
                        udafc->getContext().getParamKeys()->push_back(aggColVec[k].first);
@@ -4401,7 +4419,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
                if (pUDAFFunc && udafc->getContext().getParamKeys()->size() == 0)
                {
                    for (uint64_t k = i+1;
                         k < returnedColVec.size() && returnedColVec[k].second == AggregateColumn::MULTI_PARM;
                         ++k)
                    {
                        udafc->getContext().getParamKeys()->push_back(returnedColVec[k].first);
@@ -4808,7 +4826,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
    for (uint64_t i = 0; i < oidsAggUm.size(); i++)
        posAggUm.push_back(posAggUm[i] + widthAggUm[i]);

    RowGroup aggRgUm(oidsAggUm.size(), posAggUm, oidsAggUm, keysAggUm, typeAggUm,
                     csNumAggUm, scaleAggUm, precisionAggUm, jobInfo.stringTableThreshold);
    SP_ROWAGG_UM_t rowAggUm(new RowAggregationUMP2(groupByUm, functionNoDistVec, jobInfo.rm, jobInfo.umMemLimit));
    rowAggUm->timeZone(jobInfo.timeZone);
@@ -4818,8 +4836,8 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
    for (uint64_t i = 0; i < oidsAggDist.size(); i++)
        posAggDist.push_back(posAggDist[i] + widthAggDist[i]);

    RowGroup aggRgDist(oidsAggDist.size(), posAggDist, oidsAggDist, keysAggDist,
                       typeAggDist, csNumAggDist, scaleAggDist,
                       precisionAggDist, jobInfo.stringTableThreshold);
    SP_ROWAGG_DIST rowAggDist(new RowAggregationDistinct(groupByNoDist, functionVecUm, jobInfo.rm, jobInfo.umMemLimit));
    rowAggDist->timeZone(jobInfo.timeZone);
@@ -5058,7 +5076,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
    for (uint64_t i = 0; i < oidsAggPm.size(); i++)
        posAggPm.push_back(posAggPm[i] + widthAggPm[i]);

    RowGroup aggRgPm(oidsAggPm.size(), posAggPm, oidsAggPm, keysAggPm, typeAggPm,
                     csNumAggPm, scaleAggPm, precisionAggPm, jobInfo.stringTableThreshold);
    SP_ROWAGG_PM_t rowAggPm(new RowAggregation(groupByPm, functionVecPm));
    rowAggPm->timeZone(jobInfo.timeZone);
@@ -5100,7 +5118,7 @@ void TupleAggregateStep::prepExpressionOnAggregate(SP_ROWAGG_UM_t& aggUM, JobInf
        uint64_t eid = -1;

        if (((ac = dynamic_cast<ArithmeticColumn*>(it->get())) != NULL) &&
            (ac->aggColumnList().size() > 0) &&
            (ac->windowfunctionColumnList().size() == 0))
        {
            const vector<SimpleColumn*>& scols = ac->simpleColumnList();
@@ -5256,6 +5274,26 @@ void TupleAggregateStep::aggregateRowGroups()
    }
 }

+void TupleAggregateStep::threadedAggregateFinalize(uint32_t threadID)
+{
+   for (uint32_t i = 0; i < fNumOfBuckets; ++i)
+   {
+       if (fAgg_mutex[i]->try_lock())
+       {
+           try
+           {
+               if (fAggregators[i])
+                   fAggregators[i]->finalAggregation();
+           }
+           catch (...)
+           {
+               fAgg_mutex[i]->unlock();
+               throw;
+           }
+           fAgg_mutex[i]->unlock();
+       }
+   }
+}

 void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)
 {
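Since several finalizer threads sweep the same bucket array (see the dispatch added in doThreadedAggregate below), the try_lock() here presumably lets a thread skip any bucket a sibling finalizer is already holding rather than block on it; under that reading each bucket's finalAggregation() still runs once, under its own mutex.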
@@ -5268,9 +5306,10 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)
    vector<uint32_t> hashLens;
    bool locked = false;
    bool more = true;
-   RowGroupDL* dlIn = NULL;
-   RowAggregationMultiDistinct* multiDist = NULL;
+   RowGroupDL* dlIn = nullptr;
+   uint32_t rgVecShift = float(fNumOfBuckets) / fNumOfThreads * threadID;
+
+   RowAggregationMultiDistinct* multiDist = nullptr;

    if (!fDoneAggregate)
    {
@@ -5279,7 +5318,7 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)

        dlIn = fInputJobStepAssociation.outAt(0)->rowGroupDL();

-       if (dlIn == NULL)
+       if (dlIn == nullptr)
            throw logic_error("Input is not RowGroup data list in delivery step.");

        vector<RGData> rgDatas;
@@ -5358,29 +5397,35 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)

                if (more)
                {
                    fRowGroupIns[threadID].setData(&rgData);
-                   fMemUsage[threadID] += fRowGroupIns[threadID].getSizeWithStrings();
-
-                   if (!fRm->getMemory(fRowGroupIns[threadID].getSizeWithStrings(), fSessionMemLimit))
-                   {
-                       rgDatas.clear(); // to short-cut the rest of processing
-                       abort();
-                       more = false;
-                       fEndOfResult = true;
-
-                       if (status() == 0)
-                       {
-                           errorMessage(IDBErrorInfo::instance()->errorMsg(
-                               ERR_AGGREGATION_TOO_BIG));
-                           status(ERR_AGGREGATION_TOO_BIG);
-                       }
-
-                       break;
-                   }
-                   else
-                   {
-                       rgDatas.push_back(rgData);
-                   }
+                   fMemUsage[threadID] +=
+                       fRowGroupIns[threadID].getSizeWithStrings();
+
+                   bool diskAggAllowed = fRm->getAllowDiskAggregation();
+                   if (!fRm->getMemory(
+                           fRowGroupIns[threadID].getSizeWithStrings(),
+                           fSessionMemLimit, !diskAggAllowed))
+                   {
+                       if (!diskAggAllowed)
+                       {
+                           rgDatas.clear(); // to short-cut the rest of processing
+                           more = false;
+                           fEndOfResult = true;
+
+                           if (status() == 0)
+                           {
+                               errorMessage(IDBErrorInfo::instance()->errorMsg(
+                                   ERR_AGGREGATION_TOO_BIG));
+                               status(ERR_AGGREGATION_TOO_BIG);
+                           }
+                       }
+                       else
+                       {
+                           rgDatas.push_back(rgData);
+                       }
+                       break;
+                   }
+                   rgDatas.push_back(rgData);
                }
                else
                {
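The extra argument handed to fRm->getMemory() is !diskAggAllowed; judging from the surrounding control flow it makes the request strict when spilling is not an option and lenient when it is, but treat that reading of the ResourceManager parameter as an assumption. The decision the patch adds around the call can be summarised in a self-contained sketch:

    // Self-contained sketch of the new behaviour on memory pressure; the names
    // here are mine, not ColumnStore API.
    enum class Outcome { Consume, SpillLater, FailQuery };

    Outcome onRowGroupArrival(bool memoryGranted, bool diskAggAllowed)
    {
        if (memoryGranted)
            return Outcome::Consume;     // normal in-memory path
        if (!diskAggAllowed)
            return Outcome::FailQuery;   // old behaviour: ERR_AGGREGATION_TOO_BIG
        return Outcome::SpillLater;      // new behaviour: keep the rowgroup, stop
                                         // pulling more for now, and let the
                                         // disk-based generations absorb the overflow
    }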
@@ -5429,8 +5474,9 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)

                            // TBD This approach could potentiall
                            // put all values in on bucket.
-                           bucketID = distRow[j].hash(hashLens[j] - 1) % fNumOfBuckets;
-                           rowBucketVecs[bucketID][j].push_back(rowIn.getPointer());
+                           uint64_t hash = rowgroup::hashRow(distRow[j], hashLens[j] - 1);
+                           bucketID = hash % fNumOfBuckets;
+                           rowBucketVecs[bucketID][j].emplace_back(rowIn.getPointer(), hash);
                            rowIn.nextRow();
                        }
                    }
@@ -5447,10 +5493,11 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)
                    for (uint64_t i = 0; i < fRowGroupIns[threadID].getRowCount(); ++i)
                    {
                        // The key is the groupby columns, which are the leading columns.
-                       // TBD This approach could potentiall
+                       // TBD This approach could potential
                        // put all values in on bucket.
-                       int bucketID = rowIn.hash(hashLens[0] - 1) % fNumOfBuckets;
-                       rowBucketVecs[bucketID][0].push_back(rowIn.getPointer());
+                       uint64_t hash = rowgroup::hashRow(rowIn, hashLens[0] - 1);
+                       int bucketID = hash % fNumOfBuckets;
+                       rowBucketVecs[bucketID][0].emplace_back(rowIn.getPointer(), hash);
                        rowIn.nextRow();
                    }
                }
@@ -5465,8 +5512,11 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)
        bool didWork = false;
        done = true;

-       for (uint32_t c = 0; c < fNumOfBuckets && !cancelled(); c++)
+       // each thread starts from its own bucket for better distribution
+       uint32_t shift = (rgVecShift++) % fNumOfBuckets;
+       for (uint32_t ci = 0; ci < fNumOfBuckets && !cancelled(); ci++)
        {
+           uint32_t c = (ci + shift) % fNumOfBuckets;
            if (!fEndOfResult && !bucketDone[c] && fAgg_mutex[c]->try_lock())
            {
                try
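The shift arithmetic mirrors the initialisation of rgVecShift earlier in this function: each worker starts its sweep a few buckets further along and advances the offset on every pass, so concurrent workers tend to hit different try_lock() calls first and the generations fill more evenly. A small standalone program showing the visiting order this produces:

    #include <cstdint>
    #include <iostream>

    // Prints the bucket order one worker visits under the patch's scheme.
    void printVisitOrder(uint32_t numBuckets, uint32_t numThreads, uint32_t threadID)
    {
        uint32_t rgVecShift = uint32_t(float(numBuckets) / numThreads * threadID);
        for (uint32_t sweep = 0; sweep < 2; ++sweep)          // two sweeps for illustration
        {
            uint32_t shift = (rgVecShift++) % numBuckets;
            std::cout << "thread " << threadID << " sweep " << sweep << ":";
            for (uint32_t ci = 0; ci < numBuckets; ++ci)
                std::cout << ' ' << (ci + shift) % numBuckets;
            std::cout << '\n';
        }
    }

    int main()
    {
        printVisitOrder(8, 4, 0);
        printVisitOrder(8, 4, 1);   // starts two buckets later than thread 0
        return 0;
    }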
@@ -5484,9 +5534,9 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)
                    throw;
                }

-               fAgg_mutex[c]->unlock();
                rowBucketVecs[c][0].clear();
                bucketDone[c] = true;
+               fAgg_mutex[c]->unlock();
            }
            else if (!bucketDone[c])
            {
@@ -5519,7 +5569,7 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)
        handleException(std::current_exception(),
                        logging::tupleAggregateStepErr,
                        logging::ERR_AGGREGATION_TOO_BIG,
-                       "TupleAggregateStep::threadedAggregateRowGroups()");
+                       "TupleAggregateStep::threadedAggregateRowGroups()[" + std::to_string(threadID) + "]");
        fEndOfResult = true;
        fDoneAggregate = true;
    }
@@ -5527,7 +5577,8 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)

    if (!locked) fMutex.lock();

-   while (more) more = dlIn->next(fInputIter, &rgData);
+   while (more)
+       more = dlIn->next(fInputIter, &rgData);

    fMutex.unlock();
    locked = false;
@@ -5639,6 +5690,20 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
        jobstepThreadPool.join(runners);
    }

+   if (!cancelled())
+   {
+       vector<uint64_t> runners;
+       // use half of the threads because finalizing requires twice as
+       // much memory on average
+       uint32_t threads = std::max(1U, fNumOfThreads / 2);
+       runners.reserve(threads);
+       for (i = 0; i < threads; ++i)
+       {
+           runners.push_back(jobstepThreadPool.invoke(ThreadedAggregateFinalizer(this, i)));
+       }
+       jobstepThreadPool.join(runners);
+   }

    if (dynamic_cast<RowAggregationDistinct*>(fAggregator.get()) && fAggregator->aggMapKeyLength() > 0)
    {
        // 2nd phase multi-threaded aggregate
@@ -5700,7 +5765,7 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
    }
    else
    {
-       RowAggregationDistinct* agg = dynamic_cast<RowAggregationDistinct*>(fAggregator.get());
+       auto* agg = dynamic_cast<RowAggregationDistinct*>(fAggregator.get());

        if (!fEndOfResult)
        {
@@ -5713,27 +5778,26 @@ uint64_t TupleAggregateStep::doThreadedAggregate(ByteStream& bs, RowGroupDL* dlp
                // do the final aggregtion and deliver the results
                // at least one RowGroup for aggregate results
                // for "distinct without group by" case
-               if (agg != NULL)
+               if (agg != nullptr)
                {
-                   RowAggregationMultiDistinct* aggMultiDist =
+                   auto* aggMultiDist =
                        dynamic_cast<RowAggregationMultiDistinct*>(fAggregators[i].get());
-                   RowAggregationDistinct* aggDist =
+                   auto* aggDist =
                        dynamic_cast<RowAggregationDistinct*>(fAggregators[i].get());
                    agg->aggregator(aggDist->aggregator());

                    if (aggMultiDist)
+                   {
                        (dynamic_cast<RowAggregationMultiDistinct*>(agg))
                            ->subAggregators(aggMultiDist->subAggregators());
+                   }

                    agg->doDistinctAggregation();
                }
                // for "group by without distinct" case
                else
                {
-                   fAggregator->resultDataVec().insert(
-                       fAggregator->resultDataVec().end(),
-                       fAggregators[i]->resultDataVec().begin(),
-                       fAggregators[i]->resultDataVec().end());
+                   fAggregator->append(fAggregators[i].get());
                }
            }
        }
@@ -105,6 +105,7 @@ private:
    uint64_t doThreadedAggregate(messageqcpp::ByteStream& bs, RowGroupDL* dlp);
    void aggregateRowGroups();
    void threadedAggregateRowGroups(uint32_t threadID);
+   void threadedAggregateFinalize(uint32_t threadID);
    void doThreadedSecondPhaseAggregate(uint32_t threadID);
    bool nextDeliveredRowGroup();
    void pruneAuxColumns();
@@ -156,7 +157,9 @@ private:
        {}
        void operator()()
        {
-           utils::setThreadName("TASThrAggr");
+           std::string t{"TASThrAggr"};
+           t.append(std::to_string(fThreadID));
+           utils::setThreadName(t.c_str());
            fStep->threadedAggregateRowGroups(fThreadID);
        }

@@ -164,6 +167,26 @@ private:
        uint32_t fThreadID;
    };

+   class ThreadedAggregateFinalizer
+   {
+   public:
+       ThreadedAggregateFinalizer(TupleAggregateStep* step, uint32_t threadID) :
+           fStep(step),
+           fThreadID(threadID)
+       {}
+
+       void operator()()
+       {
+           std::string t{"TASThrFin"};
+           t.append(std::to_string(fThreadID));
+           utils::setThreadName(t.c_str());
+           fStep->threadedAggregateFinalize(fThreadID);
+       }
+
+       TupleAggregateStep* fStep;
+       uint32_t fThreadID;
+   };
+
    class ThreadedSecondPhaseAggregator
    {
    public:
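The finalizer is dispatched through the same jobstepThreadPool as the aggregation workers (see the doThreadedAggregate hunk above). A reduced illustration of the functor pattern, with std::thread standing in for the thread pool and the step callback stubbed out:

    #include <string>
    #include <thread>
    #include <vector>

    // Reduced sketch of the ThreadedAggregateFinalizer pattern: a small callable
    // captures its owner and a thread id, derives a name like "TASThrFin0" and
    // calls back into the step. std::thread replaces jobstepThreadPool here.
    struct Step
    {
        void threadedAggregateFinalize(uint32_t /*threadID*/) { /* finalAggregation() per bucket */ }
    };

    struct Finalizer
    {
        Step* fStep;
        uint32_t fThreadID;
        void operator()()
        {
            std::string name{"TASThrFin"};
            name.append(std::to_string(fThreadID));  // thread name as in the patch
            fStep->threadedAggregateFinalize(fThreadID);
        }
    };

    int main()
    {
        Step step;
        uint32_t threads = 2;                        // patch uses max(1, fNumOfThreads / 2)
        std::vector<std::thread> runners;
        for (uint32_t i = 0; i < threads; ++i)
            runners.emplace_back(Finalizer{&step, i});
        for (auto& t : runners)
            t.join();
        return 0;
    }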
@@ -1453,49 +1453,85 @@ int setupResources()

 void cleanTempDir()
 {
-    const auto config = config::Config::makeConfig();
-    std::string allowDJS = config->getConfig("HashJoin", "AllowDiskBasedJoin");
-    std::string tmpPrefix = config->getConfig("HashJoin", "TempFilePath");
-
-    if (allowDJS == "N" || allowDJS == "n")
-        return;
-
-    if (tmpPrefix.empty())
-        tmpPrefix = "/tmp/cs-diskjoin";
-
-    tmpPrefix += "/";
-
-    assert(tmpPrefix != "/");
-
-    /* This is quite scary as ExeMgr usually runs as root */
-    try
-    {
-        boost::filesystem::remove_all(tmpPrefix);
-        boost::filesystem::create_directories(tmpPrefix);
-    }
-    catch (const std::exception& ex)
-    {
-        std::cerr << ex.what() << std::endl;
-        logging::LoggingID logid(16, 0, 0);
-        logging::Message::Args args;
-        logging::Message message(8);
-        args.add("Execption whilst cleaning tmpdir: ");
-        args.add(ex.what());
-        message.format( args );
-        logging::Logger logger(logid.fSubsysID);
-        logger.logMessage(logging::LOG_TYPE_WARNING, message, logid);
-    }
-    catch (...)
-    {
-        std::cerr << "Caught unknown exception during tmpdir cleanup" << std::endl;
-        logging::LoggingID logid(16, 0, 0);
-        logging::Message::Args args;
-        logging::Message message(8);
-        args.add("Unknown execption whilst cleaning tmpdir");
-        message.format( args );
-        logging::Logger logger(logid.fSubsysID);
-        logger.logMessage(logging::LOG_TYPE_WARNING, message, logid);
-    }
+    using TempDirPurpose = config::Config::TempDirPurpose;
+    struct Dirs
+    {
+        std::string section;
+        std::string allowed;
+        TempDirPurpose purpose;
+    };
+    std::vector<Dirs> dirs{
+        {
+            "HashJoin",
+            "AllowDiskBasedJoin",
+            TempDirPurpose::Joins
+        },
+        {
+            "RowAggregation",
+            "AllowDiskBasedAggregation",
+            TempDirPurpose::Aggregates
+        }
+    };
+    const auto config = config::Config::makeConfig();
+
+    for (const auto& dir : dirs)
+    {
+        std::string allowStr = config->getConfig(dir.section, dir.allowed);
+        bool allow = (allowStr == "Y" || allowStr == "y");
+
+        std::string tmpPrefix = config->getTempFileDir(dir.purpose);
+
+        if (allow && tmpPrefix.empty())
+        {
+            std::cerr << "Empty tmp directory name for " << dir.section << std::endl;
+            logging::LoggingID logid(16, 0, 0);
+            logging::Message::Args args;
+            logging::Message message(8);
+            args.add("Empty tmp directory name for:");
+            args.add(dir.section);
+            message.format(args);
+            logging::Logger logger(logid.fSubsysID);
+            logger.logMessage(logging::LOG_TYPE_CRITICAL, message, logid);
+        }

+        tmpPrefix += "/";
+
+        idbassert(tmpPrefix != "/");
+
+        /* This is quite scary as ExeMgr usually runs as root */
+        try
+        {
+            if (allow)
+            {
+                boost::filesystem::remove_all(tmpPrefix);
+            }
+            boost::filesystem::create_directories(tmpPrefix);
+        }
+        catch (const std::exception &ex)
+        {
+            std::cerr << ex.what() << std::endl;
+            logging::LoggingID logid(16, 0, 0);
+            logging::Message::Args args;
+            logging::Message message(8);
+            args.add("Exception whilst cleaning tmpdir: ");
+            args.add(ex.what());
+            message.format(args);
+            logging::Logger logger(logid.fSubsysID);
+            logger.logMessage(logging::LOG_TYPE_WARNING, message, logid);
+        }
+        catch (...)
+        {
+            std::cerr << "Caught unknown exception during tmpdir cleanup"
+                      << std::endl;
+            logging::LoggingID logid(16, 0, 0);
+            logging::Message::Args args;
+            logging::Message message(8);
+            args.add("Unknown exception whilst cleaning tmpdir");
+            message.format(args);
+            logging::Logger logger(logid.fSubsysID);
+            logger.logMessage(logging::LOG_TYPE_WARNING, message, logid);
+        }
+    }
 }

@@ -264,7 +264,10 @@
    -->
    <hdfsRdwrScratch>/rdwrscratch</hdfsRdwrScratch> <!-- Do not set to an hdfs file path -->
    <TempFileDir>/columnstore_tmp_files</TempFileDir>
-   <SystemTempFileDir>/tmp/columnstore_tmp_files</SystemTempFileDir>
+   <!-- Be careful modifying SystemTempFileDir! On start, ExeMgr deletes
+        the entire subdirectories "joins" & "aggregates" and recreates it to make sure no
+        files are left behind. -->
+   <SystemTempFileDir>/tmp/columnstore_tmp_files</SystemTempFileDir>
  </SystemConfig>
  <SystemModuleConfig>
    <ModuleType1>dm</ModuleType1>
@@ -489,10 +492,6 @@
    <TotalUmMemory>25%</TotalUmMemory>
    <CPUniqueLimit>100</CPUniqueLimit>
    <AllowDiskBasedJoin>N</AllowDiskBasedJoin>
-   <!-- Be careful modifying TempFilePath! On start, ExeMgr deletes
-        the entire directory and recreates it to make sure no
-        files are left behind.
-        <TempFilePath>/tmp/cs-diskjoin</TempFilePath> -->
    <TempFileCompression>Y</TempFileCompression>
  </HashJoin>
  <JobList>
@@ -519,9 +518,10 @@
    <MaxBuckets>512</MaxBuckets> <!-- Number of buckets -->
  </TupleWSDL>
  <RowAggregation>
    <!-- <RowAggrThreads>4</RowAggrThreads> --> <!-- Default value is the number of cores -->
    <!-- <RowAggrBuckets>32</RowAggrBuckets> --> <!-- Default value is number of cores * 4 -->
    <!-- <RowAggrRowGroupsPerThread>20</RowAggrRowGroupsPerThread> --> <!-- Default value is 20 -->
+   <!-- <AllowDiskBasedAggregation>N</AllowDiskBasedAggregation> --> <!-- Default value is N -->
  </RowAggregation>
  <CrossEngineSupport>
    <Host>127.0.0.1</Host>
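As the commented-out defaults above show, disk-based aggregation ships disabled; enabling it amounts to uncommenting AllowDiskBasedAggregation in the RowAggregation section and setting it to Y, with the spill data expected under the joins/aggregates subdirectories of SystemTempFileDir that this commit introduces.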
@@ -253,6 +253,10 @@
    <hdfsRdwrBufferMaxSize>8G</hdfsRdwrBufferMaxSize>
    -->
    <hdfsRdwrScratch>/tmp/rdwrscratch</hdfsRdwrScratch> <!-- Do not set to an hdfs file path -->
+   <!-- Be careful modifying SystemTempFileDir! On start, ExeMgr deletes
+        the entire subdirectories "joins" & "aggregates" and recreates it to make sure no
+        files are left behind. -->
+   <SystemTempFileDir>/tmp/columnstore_tmp_files</SystemTempFileDir>
  </SystemConfig>
  <SystemModuleConfig>
    <ModuleType1>dm</ModuleType1>
@@ -483,10 +487,6 @@
    <TotalPmUmMemory>10%</TotalPmUmMemory>
    <CPUniqueLimit>100</CPUniqueLimit>
    <AllowDiskBasedJoin>N</AllowDiskBasedJoin>
-   <!-- Be careful modifying TempFilePath! On start, ExeMgr deletes
-        the entire directory and recreates it to make sure no
-        files are left behind. -->
-   <TempFilePath>/var/lib/columnstore/tmp/cs-diskjoin</TempFilePath>
    <TempFileCompression>Y</TempFileCompression>
  </HashJoin>
  <JobList>
@@ -513,9 +513,10 @@
    <MaxBuckets>512</MaxBuckets> <!-- Number of buckets -->
  </TupleWSDL>
  <RowAggregation>
    <!-- <RowAggrThreads>4</RowAggrThreads> --> <!-- Default value is the number of cores -->
    <!-- <RowAggrBuckets>32</RowAggrBuckets> --> <!-- Default value is number of cores * 4 -->
    <!-- <RowAggrRowGroupsPerThread>20</RowAggrRowGroupsPerThread> --> <!-- Default value is 20 -->
+   <!-- <AllowDiskBasedAggregation>N</AllowDiskBasedAggregation> --> <!-- Default value is N -->
  </RowAggregation>
  <CrossEngineSupport>
    <Host>127.0.0.1</Host>

tools/rgprint/CMakeLists.txt (new file, 14 lines)
@@ -0,0 +1,14 @@
+
+include_directories( ${ENGINE_COMMON_INCLUDES} )
+
+
+########### next target ###############
+
+set(rgprint_SRCS rgprint.cpp)
+
+add_executable(rgprint ${rgprint_SRCS})
+
+target_link_libraries(rgprint ${ENGINE_LDFLAGS} ${NETSNMP_LIBRARIES} ${MARIADB_CLIENT_LIBS} ${ENGINE_WRITE_LIBS})
+
+install(TARGETS rgprint DESTINATION ${ENGINE_BINDIR} COMPONENT columnstore-engine)
+

tools/rgprint/rgprint.cpp (new file, 94 lines)
@@ -0,0 +1,94 @@
+/* Copyright (C) 2021 MariaDB Corporation
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <iostream>
+#include <utils/rowgroup/rowgroup.h>
+
+int main(int argc, char* argv[])
+{
+    if (argc < 2)
+    {
+        std::cerr << "Usage: " << argv[0] << " <dump file>" << std::endl;
+        return 0;
+    }
+    rowgroup::RowGroup rg;
+    char* p = strrchr(argv[1], '/');
+    int rfd = -1;
+    if (p == nullptr)
+        p = argv[1];
+    unsigned pid;
+    void* agg;
+    auto c = sscanf(p, "Agg-p%u-t%p-", &pid, &agg);
+    if (c == 2) {
+        char fname[1024];
+        snprintf(fname, sizeof(fname), "META-p%u-t%p", pid, agg);
+        rfd = open(fname, O_RDONLY);
+    }
+    if (rfd < 0)
+        rfd = open("./META", O_RDONLY);
+    if (rfd >= 0) {
+        struct stat rst;
+        fstat(rfd, &rst);
+        messageqcpp::ByteStream rbs;
+        rbs.needAtLeast(rst.st_size);
+        rbs.restart();
+        auto r = read(rfd, rbs.getInputPtr(), rst.st_size);
+        if (r != rst.st_size)
+            abort();
+        rbs.advanceInputPtr(r);
+        rg.deserialize(rbs);
+        close(rfd);
+    } else {
+        std::vector<uint32_t> pos{2, 6, 22, 30, 46, 54}; // ?
+        std::vector<uint32_t> oids{3011, 3011, 3011, 3011, 3011}; // ?
+        std::vector<uint32_t> keys{1, 1, 1, 1, 1}; // ?
+        std::vector<execplan::CalpontSystemCatalog::ColDataType> col_t{
+            execplan::CalpontSystemCatalog::INT,
+            execplan::CalpontSystemCatalog::LONGDOUBLE,
+            execplan::CalpontSystemCatalog::UBIGINT,
+            execplan::CalpontSystemCatalog::LONGDOUBLE,
+            execplan::CalpontSystemCatalog::UBIGINT
+        };
+        std::vector<uint32_t> csN{8, 8, 8, 8, 8};
+        std::vector<uint32_t> scale{0, 0, 0, 0, 0};
+        std::vector<uint32_t> prec{10, 4294967295, 9999, 4294967295, 19};
+        rg = rowgroup::RowGroup(5, pos, oids, keys, col_t, csN, scale, prec, 20, false, std::vector<bool>{});
+    }
+
+    int fd = open(argv[1], O_RDONLY);
+    struct stat st;
+    fstat(fd, &st);
+
+    messageqcpp::ByteStream bs;
+    bs.needAtLeast(st.st_size);
+    bs.restart();
+    auto r = read(fd, bs.getInputPtr(), st.st_size);
+    if (r != st.st_size)
+        abort();
+    bs.advanceInputPtr(r);
+    rowgroup::RGData rst;
+    rst.deserialize(bs);
+
+    rg.setData(&rst);
+    close(fd);
+    std::cout << "RowGroup data:\n" << rg.toString() << std::endl;
+    return 0;
+}

utils/common/robin_hood.h (new file, 2454 lines; diff suppressed because it is too large)
@@ -16,6 +16,7 @@
   MA 02110-1301, USA. */

 #include <sys/prctl.h>
+#include "threadnaming.h"

 namespace utils
 {
@@ -23,4 +24,11 @@ namespace utils
 {
    prctl(PR_SET_NAME, threadName, 0, 0, 0);
 }

+std::string getThreadName()
+{
+   char buf[32];
+   prctl(PR_GET_NAME, buf, 0, 0, 0);
+   return std::string(buf);
+}
 } // end of namespace
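A usage sketch of the pair inside the ColumnStore tree (setThreadName already existed; getThreadName is new). Both wrap prctl(), so names are limited to the kernel's 15 visible characters:

    #include <iostream>
    #include "threadnaming.h"   // the header patched below

    int main()
    {
        utils::setThreadName("TASThrAggr3");
        std::cout << utils::getThreadName() << std::endl;  // prints "TASThrAggr3"
        return 0;
    }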
@@ -17,8 +17,11 @@
 #ifndef H_SETTHREADNAME
 #define H_SETTHREADNAME

+#include <string>
+
 namespace utils
 {
 void setThreadName(const char *threadName);
+std::string getThreadName();
 } // end of namespace
 #endif
@@ -59,6 +59,9 @@ namespace fs = boost::filesystem;
 #include "installdir.h"
 #ifdef _MSC_VER
 #include "idbregistry.h"
+#include <unordered_map>
+#else
+#include <tr1/unordered_map>
 #endif

 #include "bytestream.h"
@@ -673,6 +676,24 @@ const vector<string> Config::enumSection(const string& section)

    return fParser.enumSection(fDoc, section);
 }
+std::string Config::getTempFileDir(Config::TempDirPurpose what)
+{
+   std::string prefix = getConfig("SystemConfig", "SystemTempFileDir");
+   if (prefix.empty())
+   {
+       prefix.assign("/tmp/columnstore_tmp_files");
+   }
+   prefix.append("/");
+   switch (what)
+   {
+   case TempDirPurpose::Joins:
+       return prefix.append("joins/");
+   case TempDirPurpose::Aggregates:
+       return prefix.append("aggregates/");
+   }
+   // NOTREACHED
+   return {};
+}

 } //namespace config
 // vim:ts=4 sw=4:
@@ -203,6 +203,14 @@ public:
    */
   EXPORT const std::vector<std::string> enumSection(const std::string& section);

+  enum class TempDirPurpose
+  {
+      Joins,      ///< disk joins
+      Aggregates  ///< disk-based aggregation
+  };
+  /** @brief Return temporary directory path for the specified purpose */
+  EXPORT std::string getTempFileDir(TempDirPurpose what);
+
 protected:
   /** @brief parse the XML file
    *
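With the default SystemTempFileDir, getTempFileDir() resolves to /tmp/columnstore_tmp_files/joins/ or .../aggregates/. A small usage sketch inside the tree (the header name "configcpp.h" is assumed here):

    #include <iostream>
    #include "configcpp.h"   // config::Config, assumed header name

    int main()
    {
        config::Config* cfg = config::Config::makeConfig();
        // prints "/tmp/columnstore_tmp_files/aggregates/" with the default config
        std::cout << cfg->getTempFileDir(config::Config::TempDirPurpose::Aggregates)
                  << std::endl;
        return 0;
    }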
@@ -129,7 +129,8 @@ JoinPartition::JoinPartition(const JoinPartition& jp, bool splitMode) :
    // Instead, each will double in size, giving a capacity of 8GB -> 16 -> 32, and so on.
    // bucketCount = jp.bucketCount;
    bucketCount = 2;
-   filenamePrefix = startup::StartUp::tmpDir();
+   config::Config* config = config::Config::makeConfig();
+   filenamePrefix = config->getTempFileDir(config::Config::TempDirPurpose::Joins);

    filenamePrefix += "/Columnstore-join-data-";

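With this change the disk-join spill location is no longer taken from the removed HashJoin TempFilePath setting; the files now land under <SystemTempFileDir>/joins/ with the Columnstore-join-data- prefix, mirroring the aggregates/ subdirectory used by disk-based aggregation.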
@@ -100,6 +100,10 @@

 2053 ERR_FUNC_OUT_OF_RANGE_RESULT   The result is out of range for function %1% using value(s): %2% %3%

+2054 ERR_DISKAGG_ERROR  Unknown error while aggregation.
+2055 ERR_DISKAGG_TOO_BIG    Not enough memory to make disk-based aggregation. Raise TotalUmMemory if possible.
+2056 ERR_DISKAGG_FILEIO_ERROR   There was an IO error during a disk-based aggregation: %1%
+
 # Sub-query errors
 3001 ERR_NON_SUPPORT_SUB_QUERY_TYPE This subquery type is not supported yet.
 3002 ERR_MORE_THAN_1_ROW    Subquery returns more than 1 row.
@@ -4,7 +4,7 @@ include_directories( ${ENGINE_COMMON_INCLUDES} )

 ########### next target ###############

-set(rowgroup_LIB_SRCS rowaggregation.cpp rowgroup.cpp)
+set(rowgroup_LIB_SRCS rowaggregation.cpp rowgroup.cpp rowstorage.cpp)

 #librowgroup_la_CXXFLAGS = $(march_flags) $(AM_CXXFLAGS)

(file diff suppressed because it is too large)
@ -30,7 +30,8 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <stdint.h>
|
#include <cstdint>
|
||||||
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
@ -54,6 +55,9 @@
|
|||||||
#include "constantcolumn.h"
|
#include "constantcolumn.h"
|
||||||
|
|
||||||
|
|
||||||
|
#include "resourcemanager.h"
|
||||||
|
#include "rowstorage.h"
|
||||||
|
|
||||||
// To do: move code that depends on joblist to a proper subsystem.
|
// To do: move code that depends on joblist to a proper subsystem.
|
||||||
namespace joblist
|
namespace joblist
|
||||||
{
|
{
|
||||||
@ -63,17 +67,6 @@ class ResourceManager;
|
|||||||
namespace rowgroup
|
namespace rowgroup
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
||||||
struct RowPosition
|
|
||||||
{
|
|
||||||
uint64_t group: 48;
|
|
||||||
uint64_t row: 16;
|
|
||||||
|
|
||||||
static const uint64_t MSB = 0x800000000000ULL; //48th bit is set
|
|
||||||
inline RowPosition(uint64_t g, uint64_t r) : group(g), row(r) { }
|
|
||||||
inline RowPosition() { }
|
|
||||||
};
|
|
||||||
|
|
||||||
/** @brief Enumerates aggregate functions supported by RowAggregation
|
/** @brief Enumerates aggregate functions supported by RowAggregation
|
||||||
*/
|
*/
|
||||||
enum RowAggFunctionType
|
enum RowAggFunctionType
|
||||||
@ -143,9 +136,9 @@ struct RowAggGroupByCol
|
|||||||
* outputColIndex argument should be omitted if this GroupBy
|
* outputColIndex argument should be omitted if this GroupBy
|
||||||
* column is not to be included in the output.
|
* column is not to be included in the output.
|
||||||
*/
|
*/
|
||||||
RowAggGroupByCol(int32_t inputColIndex, int32_t outputColIndex = -1) :
|
explicit RowAggGroupByCol(int32_t inputColIndex, int32_t outputColIndex = -1) :
|
||||||
fInputColumnIndex(inputColIndex), fOutputColumnIndex(outputColIndex) {}
|
fInputColumnIndex(inputColIndex), fOutputColumnIndex(outputColIndex) {}
|
||||||
~RowAggGroupByCol() {}
|
~RowAggGroupByCol() = default;
|
||||||
|
|
||||||
uint32_t fInputColumnIndex;
|
uint32_t fInputColumnIndex;
|
||||||
uint32_t fOutputColumnIndex;
|
uint32_t fOutputColumnIndex;
|
||||||
@ -184,7 +177,7 @@ struct RowAggFunctionCol
|
|||||||
int32_t inputColIndex, int32_t outputColIndex, int32_t auxColIndex = -1) :
|
int32_t inputColIndex, int32_t outputColIndex, int32_t auxColIndex = -1) :
|
||||||
fAggFunction(aggFunction), fStatsFunction(stats), fInputColumnIndex(inputColIndex),
|
fAggFunction(aggFunction), fStatsFunction(stats), fInputColumnIndex(inputColIndex),
|
||||||
fOutputColumnIndex(outputColIndex), fAuxColumnIndex(auxColIndex) {}
|
fOutputColumnIndex(outputColIndex), fAuxColumnIndex(auxColIndex) {}
|
||||||
virtual ~RowAggFunctionCol() {}
|
virtual ~RowAggFunctionCol() = default;
|
||||||
|
|
||||||
virtual void serialize(messageqcpp::ByteStream& bs) const;
|
virtual void serialize(messageqcpp::ByteStream& bs) const;
|
||||||
virtual void deserialize(messageqcpp::ByteStream& bs);
|
virtual void deserialize(messageqcpp::ByteStream& bs);
|
||||||
@ -237,10 +230,10 @@ struct RowUDAFFunctionCol : public RowAggFunctionCol
|
|||||||
bInterrupted(false)
|
bInterrupted(false)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
virtual ~RowUDAFFunctionCol() {}
|
~RowUDAFFunctionCol() override = default;
|
||||||
|
|
||||||
virtual void serialize(messageqcpp::ByteStream& bs) const;
|
void serialize(messageqcpp::ByteStream& bs) const override;
|
||||||
virtual void deserialize(messageqcpp::ByteStream& bs);
|
void deserialize(messageqcpp::ByteStream& bs) override;
|
||||||
|
|
||||||
mcsv1sdk::mcsv1Context fUDAFContext; // The UDAF context
|
mcsv1sdk::mcsv1Context fUDAFContext; // The UDAF context
|
||||||
bool bInterrupted; // Shared by all the threads
|
bool bInterrupted; // Shared by all the threads
|
||||||
@ -312,104 +305,18 @@ struct ConstantAggData
|
|||||||
ConstantAggData() : fOp(ROWAGG_FUNCT_UNDEFINE), fIsNull(false)
|
ConstantAggData() : fOp(ROWAGG_FUNCT_UNDEFINE), fIsNull(false)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
ConstantAggData(const std::string& v, RowAggFunctionType f, bool n) :
|
ConstantAggData(std::string v, RowAggFunctionType f, bool n) :
|
||||||
fConstValue(v), fOp(f), fIsNull(n)
|
fConstValue(std::move(v)), fOp(f), fIsNull(n)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
ConstantAggData(const std::string& v, const std::string u, RowAggFunctionType f, bool n) :
|
ConstantAggData(std::string v, std::string u, RowAggFunctionType f, bool n) :
|
||||||
fConstValue(v), fUDAFName(u), fOp(f), fIsNull(n)
|
fConstValue(std::move(v)), fUDAFName(std::move(u)), fOp(f), fIsNull(n)
|
||||||
{}
|
{}
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef boost::shared_ptr<RowAggGroupByCol> SP_ROWAGG_GRPBY_t;
|
typedef boost::shared_ptr<RowAggGroupByCol> SP_ROWAGG_GRPBY_t;
|
||||||
typedef boost::shared_ptr<RowAggFunctionCol> SP_ROWAGG_FUNC_t;
|
typedef boost::shared_ptr<RowAggFunctionCol> SP_ROWAGG_FUNC_t;
|
||||||
|
|
||||||
class RowAggregation;
|
|
||||||
|
|
||||||
class AggHasher
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
AggHasher(const Row& row, Row** tRow, uint32_t keyCount, RowAggregation* ra);
|
|
||||||
inline uint64_t operator()(const RowPosition& p) const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
explicit AggHasher();
|
|
||||||
RowAggregation* agg;
|
|
||||||
Row** tmpRow;
|
|
||||||
mutable Row r;
|
|
||||||
uint32_t lastKeyCol;
|
|
||||||
};
|
|
||||||
|
|
||||||
class AggComparator
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
AggComparator(const Row& row, Row** tRow, uint32_t keyCount, RowAggregation* ra);
|
|
||||||
inline bool operator()(const RowPosition&, const RowPosition&) const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
explicit AggComparator();
|
|
||||||
RowAggregation* agg;
|
|
||||||
Row** tmpRow;
|
|
||||||
mutable Row r1, r2;
|
|
||||||
uint32_t lastKeyCol;
|
|
||||||
};
|
|
||||||
|
|
||||||
class KeyStorage
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
KeyStorage(const RowGroup& keyRG, Row** tRow);
|
|
||||||
|
|
||||||
inline RowPosition addKey();
|
|
||||||
inline uint64_t getMemUsage();
|
|
||||||
|
|
||||||
private:
|
|
||||||
Row row;
|
|
||||||
Row** tmpRow;
|
|
||||||
RowGroup rg;
|
|
||||||
std::vector<RGData> storage;
|
|
||||||
uint64_t memUsage;
|
|
||||||
|
|
||||||
friend class ExternalKeyEq;
|
|
||||||
friend class ExternalKeyHasher;
|
|
||||||
};
|
|
||||||
|
|
||||||
class ExternalKeyHasher
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
ExternalKeyHasher(const RowGroup& keyRG, KeyStorage* ks, uint32_t keyColCount, Row** tRow);
|
|
||||||
inline uint64_t operator()(const RowPosition& pos) const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
mutable Row row;
|
|
||||||
mutable Row** tmpRow;
|
|
||||||
uint32_t lastKeyCol;
|
|
||||||
KeyStorage* ks;
|
|
||||||
};
|
|
||||||
|
|
||||||
class ExternalKeyEq
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
ExternalKeyEq(const RowGroup& keyRG, KeyStorage* ks, uint32_t keyColCount, Row** tRow);
|
|
||||||
inline bool operator()(const RowPosition& pos1, const RowPosition& pos2) const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
mutable Row row1, row2;
|
|
||||||
mutable Row** tmpRow;
|
|
||||||
uint32_t lastKeyCol;
|
|
||||||
KeyStorage* ks;
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef std::tr1::unordered_set<RowPosition, AggHasher, AggComparator, utils::STLPoolAllocator<RowPosition> >
|
|
||||||
RowAggMap_t;
|
|
||||||
|
|
||||||
#if defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ < 5)
|
|
||||||
typedef std::tr1::unordered_map<RowPosition, RowPosition, ExternalKeyHasher, ExternalKeyEq,
|
|
||||||
utils::STLPoolAllocator<std::pair<const RowPosition, RowPosition> > > ExtKeyMap_t;
|
|
||||||
#else
|
|
||||||
typedef std::tr1::unordered_map<RowPosition, RowPosition, ExternalKeyHasher, ExternalKeyEq,
|
|
||||||
utils::STLPoolAllocator<std::pair<RowPosition, RowPosition> > > ExtKeyMap_t;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct GroupConcat
|
struct GroupConcat
|
||||||
{
|
{
|
||||||
// GROUP_CONCAT(DISTINCT col1, 'const', col2 ORDER BY col3 desc SEPARATOR 'sep')
|
// GROUP_CONCAT(DISTINCT col1, 'const', col2 ORDER BY col3 desc SEPARATOR 'sep')
|
||||||
@ -427,7 +334,7 @@ struct GroupConcat
|
|||||||
boost::shared_ptr<int64_t> fSessionMemLimit;
|
boost::shared_ptr<int64_t> fSessionMemLimit;
|
||||||
std::string fTimeZone;
|
std::string fTimeZone;
|
||||||
|
|
||||||
GroupConcat() : fRm(NULL) {}
|
GroupConcat() : fRm(nullptr) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef boost::shared_ptr<GroupConcat> SP_GroupConcat;
|
typedef boost::shared_ptr<GroupConcat> SP_GroupConcat;
|
||||||
@ -436,7 +343,7 @@ typedef boost::shared_ptr<GroupConcat> SP_GroupConcat;
|
|||||||
class GroupConcatAg
|
class GroupConcatAg
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
GroupConcatAg(SP_GroupConcat&);
|
explicit GroupConcatAg(SP_GroupConcat&);
|
||||||
virtual ~GroupConcatAg();
|
virtual ~GroupConcatAg();
|
||||||
|
|
||||||
virtual void initialize() {};
|
virtual void initialize() {};
|
||||||
@ -446,7 +353,7 @@ public:
|
|||||||
void getResult(uint8_t*) {};
|
void getResult(uint8_t*) {};
|
||||||
uint8_t* getResult()
|
uint8_t* getResult()
|
||||||
{
|
{
|
||||||
return NULL;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
@ -478,12 +385,14 @@ public:
|
|||||||
*/
|
*/
|
||||||
RowAggregation();
|
RowAggregation();
|
||||||
RowAggregation(const std::vector<SP_ROWAGG_GRPBY_t>& rowAggGroupByCols,
|
RowAggregation(const std::vector<SP_ROWAGG_GRPBY_t>& rowAggGroupByCols,
|
||||||
const std::vector<SP_ROWAGG_FUNC_t>& rowAggFunctionCols);
|
const std::vector<SP_ROWAGG_FUNC_t>& rowAggFunctionCols,
|
||||||
|
joblist::ResourceManager* rm = nullptr,
|
||||||
|
boost::shared_ptr<int64_t> sessMemLimit = {});
|
||||||
RowAggregation(const RowAggregation& rhs);
|
RowAggregation(const RowAggregation& rhs);
|
||||||
|
|
||||||
/** @brief RowAggregation default destructor
|
/** @brief RowAggregation default destructor
|
||||||
*/
|
*/
|
||||||
virtual ~RowAggregation();
|
~RowAggregation() override;
|
||||||
|
|
||||||
/** @brief clone this object for multi-thread use
|
/** @brief clone this object for multi-thread use
|
||||||
*/
|
*/
|
||||||
@ -551,28 +460,19 @@ public:
|
|||||||
* @parm pRowGroupIn(in) RowGroup to be added to aggregation.
|
* @parm pRowGroupIn(in) RowGroup to be added to aggregation.
|
||||||
*/
|
*/
|
||||||
virtual void addRowGroup(const RowGroup* pRowGroupIn);
|
virtual void addRowGroup(const RowGroup* pRowGroupIn);
|
||||||
virtual void addRowGroup(const RowGroup* pRowGroupIn, std::vector<Row::Pointer>& inRows);
|
virtual void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::pair<Row::Pointer, uint64_t>>& inRows);
|
||||||
|
|
||||||
/** @brief Serialize RowAggregation object into a ByteStream.
|
/** @brief Serialize RowAggregation object into a ByteStream.
|
||||||
*
|
*
|
||||||
* @parm bs(out) BytesStream that is to be written to.
|
* @parm bs(out) BytesStream that is to be written to.
|
||||||
*/
|
*/
|
||||||
void serialize(messageqcpp::ByteStream& bs) const;
|
void serialize(messageqcpp::ByteStream& bs) const override;
|
||||||
|
|
||||||
/** @brief Unserialize RowAggregation object from a ByteStream.
|
/** @brief Unserialize RowAggregation object from a ByteStream.
|
||||||
*
|
*
|
||||||
* @parm bs(in) BytesStream that is to be read from.
|
* @parm bs(in) BytesStream that is to be read from.
|
||||||
*/
|
*/
|
||||||
void deserialize(messageqcpp::ByteStream& bs);
|
void deserialize(messageqcpp::ByteStream& bs) override;
|
||||||
|
|
||||||
/** @brief set the memory limit for RowAggregation
|
|
||||||
*
|
|
||||||
* @parm limit(in) memory limit for both Map and secondary RowGroups
|
|
||||||
*/
|
|
||||||
void setMaxMemory(uint64_t limit)
|
|
||||||
{
|
|
||||||
fMaxMemory = limit;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** @brief load result set into byte stream
|
/** @brief load result set into byte stream
|
||||||
*
|
*
|
||||||
@ -594,18 +494,12 @@ public:
|
|||||||
return fRowGroupOut;
|
return fRowGroupOut;
|
||||||
}
|
}
|
||||||
|
|
||||||
RowAggMap_t* mapPtr()
|
void append(RowAggregation* other);
|
||||||
{
|
|
||||||
return fAggMapPtr;
|
|
||||||
}
|
|
||||||
std::vector<RGData*>& resultDataVec()
|
|
||||||
{
|
|
||||||
return fResultDataVec;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void aggregateRow(Row& row,
|
virtual void aggregateRow(Row& row,
|
||||||
|
const uint64_t* hash = nullptr,
|
||||||
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr);
|
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr);
|
||||||
inline uint32_t aggMapKeyLength()
|
inline uint32_t aggMapKeyLength() const
|
||||||
{
|
{
|
||||||
return fAggMapKeyCount;
|
return fAggMapKeyCount;
|
||||||
}
|
}
|
||||||
@ -623,6 +517,16 @@ public:
|
|||||||
return &fRGContextColl;
|
return &fRGContextColl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void finalAggregation()
|
||||||
|
{
|
||||||
|
return fRowAggStorage->finalize([this](Row& row) { mergeEntries(row);}, fRow);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<RGData> moveCurrentRGData()
|
||||||
|
{
|
||||||
|
return std::move(fCurRGData);
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual void initialize();
|
virtual void initialize();
|
||||||
virtual void initMapData(const Row& row);
|
virtual void initMapData(const Row& row);
|
||||||
@ -630,10 +534,12 @@ protected:
|
|||||||
|
|
||||||
virtual void updateEntry(const Row& row,
|
virtual void updateEntry(const Row& row,
|
||||||
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr);
|
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr);
|
||||||
|
void mergeEntries(const Row& row);
|
||||||
virtual void doMinMax(const Row&, int64_t, int64_t, int);
|
virtual void doMinMax(const Row&, int64_t, int64_t, int);
|
||||||
virtual void doSum(const Row&, int64_t, int64_t, int);
|
virtual void doSum(const Row&, int64_t, int64_t, int);
|
||||||
virtual void doAvg(const Row&, int64_t, int64_t, int64_t);
|
virtual void doAvg(const Row&, int64_t, int64_t, int64_t, bool merge = false);
|
||||||
virtual void doStatistics(const Row&, int64_t, int64_t, int64_t);
|
virtual void doStatistics(const Row&, int64_t, int64_t, int64_t);
|
||||||
|
void mergeStatistics(const Row&, uint64_t colOut, uint64_t colAux);
|
||||||
virtual void doBitOp(const Row&, int64_t, int64_t, int);
|
virtual void doBitOp(const Row&, int64_t, int64_t, int);
|
||||||
virtual void doUDAF(const Row&,
|
virtual void doUDAF(const Row&,
|
||||||
int64_t,
|
int64_t,
|
||||||
@ -647,12 +553,6 @@ protected:
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool newRowGroup();
|
|
||||||
virtual void clearAggMap()
|
|
||||||
{
|
|
||||||
if (fAggMapPtr) fAggMapPtr->clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
void resetUDAF(RowUDAFFunctionCol* rowUDAF);
|
void resetUDAF(RowUDAFFunctionCol* rowUDAF);
|
||||||
void resetUDAF(RowUDAFFunctionCol* rowUDAF, uint64_t funcColIdx);
|
void resetUDAF(RowUDAFFunctionCol* rowUDAF, uint64_t funcColIdx);
|
||||||
|
|
||||||
@ -673,24 +573,19 @@ protected:
|
|||||||
inline void updateStringMinMax(std::string val1, std::string val2, int64_t col, int func);
|
inline void updateStringMinMax(std::string val1, std::string val2, int64_t col, int func);
|
||||||
std::vector<SP_ROWAGG_GRPBY_t> fGroupByCols;
|
std::vector<SP_ROWAGG_GRPBY_t> fGroupByCols;
|
||||||
std::vector<SP_ROWAGG_FUNC_t> fFunctionCols;
|
std::vector<SP_ROWAGG_FUNC_t> fFunctionCols;
|
||||||
RowAggMap_t* fAggMapPtr;
|
|
||||||
uint32_t fAggMapKeyCount; // the number of columns that make up the key
|
uint32_t fAggMapKeyCount; // the number of columns that make up the key
|
||||||
RowGroup fRowGroupIn;
|
RowGroup fRowGroupIn;
|
||||||
RowGroup* fRowGroupOut;
|
RowGroup* fRowGroupOut;
|
||||||
|
|
||||||
|
// for when the group by & distinct keys are not stored in the output rows
|
||||||
|
rowgroup::RowGroup fKeyRG;
|
||||||
|
|
||||||
Row fRow;
|
Row fRow;
|
||||||
Row fNullRow;
|
Row fNullRow;
|
||||||
Row* tmpRow; // used by the hashers & eq functors
|
Row* tmpRow; // used by the hashers & eq functors
|
||||||
boost::scoped_array<uint8_t> fNullRowData;
|
boost::scoped_array<uint8_t> fNullRowData;
|
||||||
std::vector<RGData*> fResultDataVec;
|
|
||||||
|
|
||||||
uint64_t fTotalRowCount;
|
std::unique_ptr<RowAggStorage> fRowAggStorage;
|
||||||
uint64_t fMaxTotalRowCount;
|
|
||||||
uint64_t fMaxMemory;
|
|
||||||
|
|
||||||
RGData* fPrimaryRowData;
|
|
||||||
|
|
||||||
std::vector<boost::shared_ptr<RGData> > fSecondaryRowDataVec;
|
|
||||||
|
|
||||||
// for support PM aggregation after PM hashjoin
|
// for support PM aggregation after PM hashjoin
|
||||||
std::vector<RowGroup>* fSmallSideRGs;
|
std::vector<RowGroup>* fSmallSideRGs;
|
||||||
@ -700,28 +595,19 @@ protected:
|
|||||||
uint32_t fSmallSideCount;
|
uint32_t fSmallSideCount;
|
||||||
boost::scoped_array<Row> rowSmalls;
|
boost::scoped_array<Row> rowSmalls;
|
||||||
|
|
||||||
// for hashmap
|
|
||||||
boost::shared_ptr<utils::STLPoolAllocator<RowPosition> > fAlloc;
|
|
||||||
|
|
||||||
// for 8k poc
|
// for 8k poc
|
||||||
RowGroup fEmptyRowGroup;
|
RowGroup fEmptyRowGroup;
|
||||||
RGData fEmptyRowData;
|
RGData fEmptyRowData;
|
||||||
Row fEmptyRow;
|
Row fEmptyRow;
|
||||||
|
|
||||||
boost::scoped_ptr<AggHasher> fHasher;
|
bool fKeyOnHeap = false;
|
||||||
boost::scoped_ptr<AggComparator> fEq;
|
|
||||||
|
|
||||||
std::string fTimeZone;
|
std::string fTimeZone;
|
||||||
|
|
||||||
//TODO: try to get rid of these friend decl's. AggHasher & Comparator
|
|
||||||
//need access to rowgroup storage holding the rows to hash & ==.
|
|
||||||
friend class AggHasher;
|
|
||||||
friend class AggComparator;
|
|
||||||
|
|
||||||
// We need a separate copy for each thread.
|
// We need a separate copy for each thread.
|
||||||
mcsv1sdk::mcsv1Context fRGContext;
|
mcsv1sdk::mcsv1Context fRGContext;
|
||||||
std::vector<mcsv1sdk::mcsv1Context> fRGContextColl;
|
std::vector<mcsv1sdk::mcsv1Context> fRGContextColl;
|
||||||
|
|
||||||
// These are handy for testing the actual type of static_any for UDAF
|
// These are handy for testing the actual type of static_any for UDAF
|
||||||
static const static_any::any& charTypeId;
|
static const static_any::any& charTypeId;
|
||||||
static const static_any::any& scharTypeId;
|
static const static_any::any& scharTypeId;
|
||||||
@ -742,6 +628,10 @@ protected:
|
|||||||
|
|
||||||
// For UDAF along with with multiple distinct columns
|
// For UDAF along with with multiple distinct columns
|
||||||
std::vector<SP_ROWAGG_FUNC_t>* fOrigFunctionCols;
|
std::vector<SP_ROWAGG_FUNC_t>* fOrigFunctionCols;
|
||||||
|
|
||||||
|
joblist::ResourceManager* fRm = nullptr;
|
||||||
|
boost::shared_ptr<int64_t> fSessionMemLimit;
|
||||||
|
std::unique_ptr<RGData> fCurRGData;
|
||||||
};
|
};
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
@ -764,11 +654,11 @@ public:
|
|||||||
|
|
||||||
/** @brief RowAggregationUM default destructor
|
/** @brief RowAggregationUM default destructor
|
||||||
*/
|
*/
|
||||||
~RowAggregationUM();
|
~RowAggregationUM() override;
|
||||||
|
|
||||||
/** @brief Denotes end of data insertion following multiple calls to addRowGroup().
|
/** @brief Denotes end of data insertion following multiple calls to addRowGroup().
|
||||||
*/
|
*/
|
||||||
void endOfInput();
|
void endOfInput() override;
|
||||||
|
|
||||||
/** @brief Finializes the result set before sending back to the front end.
|
/** @brief Finializes the result set before sending back to the front end.
|
||||||
*/
|
*/
|
||||||
@ -805,7 +695,7 @@ public:
|
|||||||
{
|
{
|
||||||
return fRm;
|
return fRm;
|
||||||
}
|
}
|
||||||
inline virtual RowAggregationUM* clone() const
|
inline RowAggregationUM* clone() const override
|
||||||
{
|
{
|
||||||
return new RowAggregationUM (*this);
|
return new RowAggregationUM (*this);
|
||||||
}
|
}
|
||||||
@ -832,22 +722,18 @@ public:
|
|||||||
return fGroupConcat;
|
return fGroupConcat;
|
||||||
}
|
}
|
||||||
|
|
||||||
void aggregateRow(Row&,
|
void aggReset() override;
|
||||||
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
|
|
||||||
virtual void aggReset();
|
|
||||||
|
|
||||||
void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut);
|
void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut) override;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
// virtual methods from base
|
||||||
void initialize() override;
|
void initialize() override;
|
||||||
|
|
||||||
|
void attachGroupConcatAg() override;
|
||||||
void updateEntry(const Row& row,
|
void updateEntry(const Row& row,
|
||||||
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
|
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
|
||||||
|
bool countSpecial(const RowGroup* pRG) override
|
||||||
void aggregateRowWithRemap(Row&,
|
|
||||||
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr);
|
|
||||||
|
|
||||||
void attachGroupConcatAg();
|
|
||||||
bool countSpecial(const RowGroup* pRG)
|
|
||||||
{
|
{
|
||||||
fRow.setIntField<8>(
|
fRow.setIntField<8>(
|
||||||
fRow.getIntField<8>(
|
fRow.getIntField<8>(
|
||||||
@ -856,8 +742,6 @@ protected:
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool newRowGroup();
|
|
||||||
|
|
||||||
// calculate the average after all rows received. UM only function.
|
// calculate the average after all rows received. UM only function.
|
||||||
void calculateAvgColumns();
|
void calculateAvgColumns();
|
||||||
|
|
||||||
@ -889,7 +773,6 @@ protected:
|
|||||||
virtual void setGroupConcatString();
|
virtual void setGroupConcatString();
|
||||||
|
|
||||||
bool fHasAvg;
|
bool fHasAvg;
|
||||||
bool fKeyOnHeap;
|
|
||||||
bool fHasStatsFunc;
|
bool fHasStatsFunc;
|
||||||
bool fHasUDAF;
|
bool fHasUDAF;
|
||||||
|
|
||||||
@ -902,8 +785,6 @@ protected:
|
|||||||
* the memory from rm in that order. */
|
* the memory from rm in that order. */
|
||||||
uint64_t fTotalMemUsage;
|
uint64_t fTotalMemUsage;
|
||||||
|
|
||||||
joblist::ResourceManager* fRm;
|
|
||||||
|
|
||||||
// @bug3475, aggregate(constant), sum(0), count(null), etc
|
// @bug3475, aggregate(constant), sum(0), count(null), etc
|
||||||
std::vector<ConstantAggData> fConstantAggregate;
|
std::vector<ConstantAggData> fConstantAggregate;
|
||||||
|
|
||||||
@ -912,18 +793,8 @@ protected:
|
|||||||
std::vector<SP_GroupConcatAg> fGroupConcatAg;
|
std::vector<SP_GroupConcatAg> fGroupConcatAg;
|
||||||
std::vector<SP_ROWAGG_FUNC_t> fFunctionColGc;
|
std::vector<SP_ROWAGG_FUNC_t> fFunctionColGc;
|
||||||
|
|
||||||
// for when the group by & distinct keys are not stored in the output rows
|
|
||||||
rowgroup::RowGroup fKeyRG;
|
|
||||||
boost::scoped_ptr<ExternalKeyEq> fExtEq;
|
|
||||||
boost::scoped_ptr<ExternalKeyHasher> fExtHash;
|
|
||||||
boost::scoped_ptr<KeyStorage> fKeyStore;
|
|
||||||
boost::scoped_ptr<utils::STLPoolAllocator<std::pair<RowPosition, RowPosition> > > fExtKeyMapAlloc;
|
|
||||||
boost::scoped_ptr<ExtKeyMap_t> fExtKeyMap;
|
|
||||||
|
|
||||||
boost::shared_ptr<int64_t> fSessionMemLimit;
|
|
||||||
private:
|
private:
|
||||||
uint64_t fLastMemUsage;
|
uint64_t fLastMemUsage;
|
||||||
uint32_t fNextRGIndex;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -951,8 +822,8 @@ public:
|
|||||||
|
|
||||||
/** @brief RowAggregationUMP2 default destructor
|
/** @brief RowAggregationUMP2 default destructor
|
||||||
*/
|
*/
|
||||||
~RowAggregationUMP2();
|
~RowAggregationUMP2() override;
|
||||||
inline virtual RowAggregationUMP2* clone() const
|
inline RowAggregationUMP2* clone() const override
|
||||||
{
|
{
|
||||||
return new RowAggregationUMP2 (*this);
|
return new RowAggregationUMP2 (*this);
|
||||||
}
|
}
|
||||||
@ -961,17 +832,17 @@ protected:
|
|||||||
// virtual methods from base
|
// virtual methods from base
|
||||||
void updateEntry(const Row& row,
|
void updateEntry(const Row& row,
|
||||||
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
|
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
|
||||||
void doAvg(const Row&, int64_t, int64_t, int64_t);
|
void doAvg(const Row&, int64_t, int64_t, int64_t, bool merge = false) override;
|
||||||
void doStatistics(const Row&, int64_t, int64_t, int64_t);
|
void doStatistics(const Row&, int64_t, int64_t, int64_t) override;
|
||||||
void doGroupConcat(const Row&, int64_t, int64_t);
|
void doGroupConcat(const Row&, int64_t, int64_t) override;
|
||||||
void doBitOp(const Row&, int64_t, int64_t, int);
|
void doBitOp(const Row&, int64_t, int64_t, int) override;
|
||||||
void doUDAF(const Row&,
|
void doUDAF(const Row&,
|
||||||
int64_t,
|
int64_t,
|
||||||
int64_t,
|
int64_t,
|
||||||
int64_t,
|
int64_t,
|
||||||
uint64_t& funcColsIdx,
|
uint64_t& funcColsIdx,
|
||||||
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
|
std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
|
||||||
bool countSpecial(const RowGroup* pRG)
|
bool countSpecial(const RowGroup* pRG) override
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1002,18 +873,18 @@ public:
|
|||||||
|
|
||||||
/** @brief RowAggregationDistinct default destructor
|
/** @brief RowAggregationDistinct default destructor
|
||||||
*/
|
*/
|
||||||
~RowAggregationDistinct();
|
~RowAggregationDistinct() override;
|
||||||
|
|
||||||
/** @brief Add an aggregator for pre-DISTINCT aggregation
|
/** @brief Add an aggregator for pre-DISTINCT aggregation
|
||||||
*/
|
*/
|
||||||
void addAggregator(const boost::shared_ptr<RowAggregation>& agg, const RowGroup& rg);
|
void addAggregator(const boost::shared_ptr<RowAggregation>& agg, const RowGroup& rg);
|
||||||
|
|
||||||
void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut);
|
void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut) override;
|
||||||
|
|
||||||
virtual void doDistinctAggregation();
|
virtual void doDistinctAggregation();
|
||||||
virtual void doDistinctAggregation_rowVec(std::vector<Row::Pointer>& inRows);
|
virtual void doDistinctAggregation_rowVec(std::vector<std::pair<Row::Pointer, uint64_t>>& inRows);
|
||||||
void addRowGroup(const RowGroup* pRowGroupIn);
|
void addRowGroup(const RowGroup* pRowGroupIn) override;
|
||||||
void addRowGroup(const RowGroup* pRowGroupIn, std::vector<Row::Pointer>& inRows);
|
void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::pair<Row::Pointer, uint64_t>>& inRows) override;
|
||||||
|
|
||||||
// multi-threade debug
|
// multi-threade debug
|
||||||
boost::shared_ptr<RowAggregation>& aggregator()
|
boost::shared_ptr<RowAggregation>& aggregator()
|
||||||
@ -1022,7 +893,7 @@ public:
|
|||||||
}
|
}
|
||||||
void aggregator(boost::shared_ptr<RowAggregation> aggregator)
|
void aggregator(boost::shared_ptr<RowAggregation> aggregator)
|
||||||
{
|
{
|
||||||
fAggregator = aggregator;
|
fAggregator = std::move(aggregator);
|
||||||
}
|
}
|
||||||
RowGroup& rowGroupDist()
|
RowGroup& rowGroupDist()
|
||||||
{
|
{
|
||||||
@ -1032,7 +903,7 @@ public:
|
|||||||
{
|
{
|
||||||
fRowGroupDist = rowGroupDist;
|
fRowGroupDist = rowGroupDist;
|
||||||
}
|
}
|
||||||
inline virtual RowAggregationDistinct* clone() const
|
inline RowAggregationDistinct* clone() const override
|
||||||
{
|
{
|
||||||
return new RowAggregationDistinct (*this);
|
return new RowAggregationDistinct (*this);
|
||||||
}
|
}
|
||||||
@ -1067,20 +938,20 @@ public:
|
|||||||
|
|
||||||
/** @brief RowAggregationSubDistinct default destructor
|
/** @brief RowAggregationSubDistinct default destructor
|
||||||
*/
|
*/
|
||||||
~RowAggregationSubDistinct();
|
~RowAggregationSubDistinct() override;
|
||||||
|
|
||||||
void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut);
|
void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut) override;
|
||||||
void addRowGroup(const RowGroup* pRowGroupIn);
|
void addRowGroup(const RowGroup* pRowGroupIn) override;
|
||||||
inline virtual RowAggregationSubDistinct* clone() const
|
inline RowAggregationSubDistinct* clone() const override
|
||||||
{
|
{
|
||||||
return new RowAggregationSubDistinct (*this);
|
return new RowAggregationSubDistinct (*this);
|
||||||
}
|
}
|
||||||
|
|
||||||
void addRowGroup(const RowGroup* pRowGroupIn, std::vector<Row::Pointer>& inRow);
|
void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::pair<Row::Pointer, uint64_t>>& inRow) override;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
// virtual methods from RowAggregationUM
|
// virtual methods from RowAggregationUM
|
||||||
void doGroupConcat(const Row&, int64_t, int64_t);
|
void doGroupConcat(const Row&, int64_t, int64_t) override;
|
||||||
|
|
||||||
// for groupby columns and the aggregated distinct column
|
// for groupby columns and the aggregated distinct column
|
||||||
Row fDistRow;
|
Row fDistRow;
|
||||||
@ -1108,7 +979,7 @@ public:
|
|||||||
|
|
||||||
/** @brief RowAggregationMultiDistinct default destructor
|
/** @brief RowAggregationMultiDistinct default destructor
|
||||||
*/
|
*/
|
||||||
~RowAggregationMultiDistinct();
|
~RowAggregationMultiDistinct() override;
|
||||||
|
|
||||||
/** @brief Add sub aggregators
|
/** @brief Add sub aggregators
|
||||||
*/
|
*/
|
||||||
@ -1116,21 +987,21 @@ public:
|
|||||||
const RowGroup& rg,
|
const RowGroup& rg,
|
||||||
const std::vector<SP_ROWAGG_FUNC_t>& funct);
|
const std::vector<SP_ROWAGG_FUNC_t>& funct);
|
||||||
|
|
||||||
void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut);
|
void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut) override;
|
||||||
using RowAggregationDistinct::addRowGroup;
|
using RowAggregationDistinct::addRowGroup;
|
||||||
void addRowGroup(const RowGroup* pRowGroupIn);
|
void addRowGroup(const RowGroup* pRowGroupIn) override;
|
||||||
|
|
||||||
using RowAggregationDistinct::doDistinctAggregation;
|
using RowAggregationDistinct::doDistinctAggregation;
|
||||||
virtual void doDistinctAggregation();
|
void doDistinctAggregation() override;
|
||||||
using RowAggregationDistinct::doDistinctAggregation_rowVec;
|
using RowAggregationDistinct::doDistinctAggregation_rowVec;
|
||||||
virtual void doDistinctAggregation_rowVec(std::vector<std::vector<Row::Pointer> >& inRows);
|
virtual void doDistinctAggregation_rowVec(std::vector<std::vector<std::pair<Row::Pointer, uint64_t>> >& inRows);
|
||||||
|
|
||||||
inline virtual RowAggregationMultiDistinct* clone() const
|
inline RowAggregationMultiDistinct* clone() const override
|
||||||
{
|
{
|
||||||
return new RowAggregationMultiDistinct (*this);
|
return new RowAggregationMultiDistinct (*this);
|
||||||
}
|
}
|
||||||
|
|
||||||
void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::vector<Row::Pointer> >& inRows);
|
void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::vector<std::pair<Row::Pointer, uint64_t>>>& inRows);
|
||||||
|
|
||||||
std::vector<boost::shared_ptr<RowAggregationUM> >& subAggregators()
|
std::vector<boost::shared_ptr<RowAggregationUM> >& subAggregators()
|
||||||
{
|
{
|
||||||
|
@ -32,6 +32,7 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
#include <boost/shared_array.hpp>
|
#include <boost/shared_array.hpp>
|
||||||
|
#include <numeric>
|
||||||
using namespace boost;
|
using namespace boost;
|
||||||
|
|
||||||
#include "bytestream.h"
|
#include "bytestream.h"
|
||||||
@ -405,6 +406,7 @@ RGData::RGData(const RowGroup& rg, uint32_t rowCount)
|
|||||||
*/
|
*/
|
||||||
memset(rowData.get(), 0, rg.getDataSize(rowCount)); // XXXPAT: make valgrind happy temporarily
|
memset(rowData.get(), 0, rg.getDataSize(rowCount)); // XXXPAT: make valgrind happy temporarily
|
||||||
#endif
|
#endif
|
||||||
|
memset(rowData.get(), 0, rg.getDataSize(rowCount)); // XXXPAT: make valgrind happy temporarily
|
||||||
}
|
}
|
||||||
|
|
||||||
RGData::RGData(const RowGroup& rg)
|
RGData::RGData(const RowGroup& rg)
|
||||||
@ -481,7 +483,7 @@ void RGData::serialize(ByteStream& bs, uint32_t amount) const
|
|||||||
bs << (uint8_t) 0;
|
bs << (uint8_t) 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void RGData::deserialize(ByteStream& bs, bool hasLenField)
|
void RGData::deserialize(ByteStream& bs, uint32_t defAmount)
|
||||||
{
|
{
|
||||||
uint32_t amount, sig;
|
uint32_t amount, sig;
|
||||||
uint8_t* buf;
|
uint8_t* buf;
|
||||||
@ -493,7 +495,7 @@ void RGData::deserialize(ByteStream& bs, bool hasLenField)
|
|||||||
{
|
{
|
||||||
bs >> sig;
|
bs >> sig;
|
||||||
bs >> amount;
|
bs >> amount;
|
||||||
rowData.reset(new uint8_t[amount]);
|
rowData.reset(new uint8_t[std::max(amount, defAmount)]);
|
||||||
buf = bs.buf();
|
buf = bs.buf();
|
||||||
memcpy(rowData.get(), buf, amount);
|
memcpy(rowData.get(), buf, amount);
|
||||||
bs.advance(amount);
|
bs.advance(amount);
|
||||||
@ -577,12 +579,13 @@ Row& Row::operator=(const Row& r)
|
|||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
string Row::toString() const
|
string Row::toString(uint32_t rownum) const
|
||||||
{
|
{
|
||||||
ostringstream os;
|
ostringstream os;
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
|
|
||||||
//os << getRid() << ": ";
|
//os << getRid() << ": ";
|
||||||
|
os << "[" << std::setw(5) << rownum << std::setw(0) << "]: ";
|
||||||
os << (int) useStringTable << ": ";
|
os << (int) useStringTable << ": ";
|
||||||
|
|
||||||
for (i = 0; i < columnCount; i++)
|
for (i = 0; i < columnCount; i++)
|
||||||
@ -1447,7 +1450,7 @@ uint32_t RowGroup::getColumnCount() const
|
|||||||
return columnCount;
|
return columnCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
string RowGroup::toString() const
|
string RowGroup::toString(const std::vector<uint64_t>& used) const
|
||||||
{
|
{
|
||||||
ostringstream os;
|
ostringstream os;
|
||||||
ostream_iterator<int> oIter1(os, "\t");
|
ostream_iterator<int> oIter1(os, "\t");
|
||||||
@ -1479,6 +1482,8 @@ string RowGroup::toString() const
|
|||||||
os << "uses a string table\n";
|
os << "uses a string table\n";
|
||||||
else
|
else
|
||||||
os << "doesn't use a string table\n";
|
os << "doesn't use a string table\n";
|
||||||
|
if (!used.empty())
|
||||||
|
os << "sparse\n";
|
||||||
|
|
||||||
//os << "strings = " << hex << (int64_t) strings << "\n";
|
//os << "strings = " << hex << (int64_t) strings << "\n";
|
||||||
//os << "data = " << (int64_t) data << "\n" << dec;
|
//os << "data = " << (int64_t) data << "\n" << dec;
|
||||||
@ -1488,14 +1493,25 @@ string RowGroup::toString() const
|
|||||||
initRow(&r);
|
initRow(&r);
|
||||||
getRow(0, &r);
|
getRow(0, &r);
|
||||||
os << "rowcount = " << getRowCount() << endl;
|
os << "rowcount = " << getRowCount() << endl;
|
||||||
|
if (!used.empty())
|
||||||
|
{
|
||||||
|
uint64_t cnt = std::accumulate(used.begin(), used.end(), 0ULL,
|
||||||
|
[](uint64_t a, uint64_t bits) {
|
||||||
|
return a + __builtin_popcountll(bits);
|
||||||
|
});
|
||||||
|
os << "sparse row count = " << cnt << endl;
|
||||||
|
}
|
||||||
os << "base rid = " << getBaseRid() << endl;
|
os << "base rid = " << getBaseRid() << endl;
|
||||||
os << "status = " << getStatus() << endl;
|
os << "status = " << getStatus() << endl;
|
||||||
os << "dbroot = " << getDBRoot() << endl;
|
os << "dbroot = " << getDBRoot() << endl;
|
||||||
os << "row data...\n";
|
os << "row data...\n";
|
||||||
|
|
||||||
for (uint32_t i = 0; i < getRowCount(); i++)
|
uint32_t max_cnt = used.empty() ? getRowCount() : (used.size() * 64);
|
||||||
|
for (uint32_t i = 0; i < max_cnt; i++)
|
||||||
{
|
{
|
||||||
os << r.toString() << endl;
|
if (!used.empty() && !(used[i/64] & (1ULL << (i%64))))
|
||||||
|
continue;
|
||||||
|
os << r.toString(i) << endl;
|
||||||
r.nextRow();
|
r.nextRow();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -270,7 +270,7 @@ public:
|
|||||||
// the 'hasLengthField' is there b/c PM aggregation (and possibly others) currently sends
|
// the 'hasLengthField' is there b/c PM aggregation (and possibly others) currently sends
|
||||||
// inline data with a length field. Once that's converted to string table format, that
|
// inline data with a length field. Once that's converted to string table format, that
|
||||||
// option can go away.
|
// option can go away.
|
||||||
void deserialize(messageqcpp::ByteStream&, bool hasLengthField = false); // returns the # of bytes read
|
void deserialize(messageqcpp::ByteStream&, uint32_t amount = 0); // returns the # of bytes read
|
||||||
|
|
||||||
inline uint64_t getStringTableMemUsage();
|
inline uint64_t getStringTableMemUsage();
|
||||||
void clear();
|
void clear();
|
||||||
@ -531,7 +531,7 @@ public:
|
|||||||
template<typename T>
|
template<typename T>
|
||||||
inline void copyBinaryField(Row& dest, uint32_t destIndex, uint32_t srcIndex) const;
|
inline void copyBinaryField(Row& dest, uint32_t destIndex, uint32_t srcIndex) const;
|
||||||
|
|
||||||
std::string toString() const;
|
std::string toString(uint32_t rownum = 0) const;
|
||||||
std::string toCSV() const;
|
std::string toCSV() const;
|
||||||
|
|
||||||
/* These fcns are used only in joins. The RID doesn't matter on the side that
|
/* These fcns are used only in joins. The RID doesn't matter on the side that
|
||||||
@ -1537,7 +1537,7 @@ public:
|
|||||||
|
|
||||||
RGData duplicate(); // returns a copy of the attached RGData
|
RGData duplicate(); // returns a copy of the attached RGData
|
||||||
|
|
||||||
std::string toString() const;
|
std::string toString(const std::vector<uint64_t>& used = {}) const;
|
||||||
|
|
||||||
/** operator+=
|
/** operator+=
|
||||||
*
|
*
|
||||||
|
2237
utils/rowgroup/rowstorage.cpp
Normal file
2237
utils/rowgroup/rowstorage.cpp
Normal file
File diff suppressed because it is too large
Load Diff
366
utils/rowgroup/rowstorage.h
Normal file
366
utils/rowgroup/rowstorage.h
Normal file
@ -0,0 +1,366 @@
|
|||||||
|
/* Copyright (C) 2021 MariaDB Corporation
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU General Public License
|
||||||
|
as published by the Free Software Foundation; version 2 of
|
||||||
|
the License.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||||
|
MA 02110-1301, USA. */
|
||||||
|
|
||||||
|
#ifndef ROWSTORAGE_H
|
||||||
|
#define ROWSTORAGE_H
|
||||||
|
|
||||||
|
#include "rowgroup.h"
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
namespace rowgroup
|
||||||
|
{
|
||||||
|
|
||||||
|
uint32_t calcNumberOfBuckets(ssize_t availMem,
|
||||||
|
uint32_t numOfThreads,
|
||||||
|
uint32_t numOfBuckets,
|
||||||
|
uint32_t groupsPerThread,
|
||||||
|
uint32_t inRowSize,
|
||||||
|
uint32_t outRowSize,
|
||||||
|
bool enabledDiskAggr);
|
||||||
|
|
||||||
|
class MemManager;
|
||||||
|
class RowPosHashStorage;
|
||||||
|
using RowPosHashStoragePtr = std::unique_ptr<RowPosHashStorage>;
|
||||||
|
class RowGroupStorage;
|
||||||
|
|
||||||
|
uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol);
|
||||||
|
|
||||||
|
class RowAggStorage
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
RowAggStorage(const std::string& tmpDir,
|
||||||
|
RowGroup* rowGroupOut,
|
||||||
|
RowGroup* keysRowGroup,
|
||||||
|
uint32_t keyCount,
|
||||||
|
joblist::ResourceManager* rm = nullptr,
|
||||||
|
boost::shared_ptr<int64_t> sessLimit = {},
|
||||||
|
bool enabledDiskAgg = false,
|
||||||
|
bool allowGenerations = false);
|
||||||
|
|
||||||
|
RowAggStorage(const std::string& tmpDir,
|
||||||
|
RowGroup* rowGroupOut,
|
||||||
|
uint32_t keyCount,
|
||||||
|
joblist::ResourceManager* rm = nullptr,
|
||||||
|
boost::shared_ptr<int64_t> sessLimit = {},
|
||||||
|
bool enabledDiskAgg = false,
|
||||||
|
bool allowGenerations = false)
|
||||||
|
: RowAggStorage(tmpDir, rowGroupOut, rowGroupOut, keyCount,
|
||||||
|
rm, std::move(sessLimit),
|
||||||
|
enabledDiskAgg, allowGenerations)
|
||||||
|
{}
|
||||||
|
|
||||||
|
~RowAggStorage();
|
||||||
|
|
||||||
|
static uint16_t getMaxRows(bool enabledDiskAgg)
|
||||||
|
{
|
||||||
|
return (enabledDiskAgg ? 8192 : 256);
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t getBucketSize();
|
||||||
|
|
||||||
|
/** @brief Find or create resulting row.
|
||||||
|
*
|
||||||
|
* Create "aggregation key" row if necessary.
|
||||||
|
* NB! Using getTargetRow() after append() is UB!
|
||||||
|
*
|
||||||
|
* @param row(in) input row
|
||||||
|
* @param rowOut() row to aggregate data from input row
|
||||||
|
*
|
||||||
|
* @returns true if new row created, false otherwise
|
||||||
|
*/
|
||||||
|
bool getTargetRow(const Row& row, Row& rowOut);
|
||||||
|
bool getTargetRow(const Row& row, uint64_t row_hash, Row& rowOut);
|
||||||
|
|
||||||
|
/** @brief Dump some RGDatas to disk and release memory for further use.
|
||||||
|
*/
|
||||||
|
void dump();
|
||||||
|
|
||||||
|
/** @brief Append RGData from other RowAggStorage and clear it.
|
||||||
|
*
|
||||||
|
* NB! Any operation except getNextRGData() or append() is UB!
|
||||||
|
*
|
||||||
|
* @param other(in) donor storage
|
||||||
|
*/
|
||||||
|
void append(RowAggStorage& other);
|
||||||
|
|
||||||
|
/** @brief Remove last RGData from internal RGData storage and return it.
|
||||||
|
*
|
||||||
|
* @returns pointer to the next RGData or nullptr if empty
|
||||||
|
*/
|
||||||
|
std::unique_ptr<RGData> getNextRGData();
|
||||||
|
|
||||||
|
/** @brief TODO
|
||||||
|
*
|
||||||
|
* @param mergeFunc
|
||||||
|
* @param rowOut
|
||||||
|
*/
|
||||||
|
void finalize(std::function<void(Row &)> mergeFunc, Row &rowOut);
|
||||||
|
|
||||||
|
/** @brief Calculate maximum size of hash assuming 80% fullness.
|
||||||
|
*
|
||||||
|
* @param elems(in) number of elements
|
||||||
|
* @returns calculated size
|
||||||
|
*/
|
||||||
|
inline static size_t calcMaxSize(size_t elems) noexcept
|
||||||
|
{
|
||||||
|
if (LIKELY(elems <= std::numeric_limits<size_t>::max() / 100))
|
||||||
|
return elems * 80 / 100;
|
||||||
|
|
||||||
|
return (elems / 100) * 80;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static size_t calcSizeWithBuffer(size_t elems, size_t maxSize) noexcept
|
||||||
|
{
|
||||||
|
return elems + std::min(maxSize, 0xFFUL);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static size_t calcSizeWithBuffer(size_t elems) noexcept
|
||||||
|
{
|
||||||
|
return calcSizeWithBuffer(elems, calcMaxSize(elems));
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
struct Data;
|
||||||
|
/** @brief Create new RowAggStorage with the same params and load dumped data
|
||||||
|
*
|
||||||
|
* @param gen(in) generation number
|
||||||
|
* @return pointer to a new RowAggStorage
|
||||||
|
*/
|
||||||
|
RowAggStorage* clone(uint16_t gen) const;
|
||||||
|
|
||||||
|
/** @brief Free any internal data
|
||||||
|
*/
|
||||||
|
void freeData();
|
||||||
|
|
||||||
|
/** @brief Move internal data & row position inside [insIdx, startIdx] up by 1.
|
||||||
|
*
|
||||||
|
* @param startIdx(in) last element's index to move
|
||||||
|
* @param insIdx(in) first element's index to move
|
||||||
|
*/
|
||||||
|
void shiftUp(size_t startIdx, size_t insIdx);
|
||||||
|
|
||||||
|
/** @brief Find best position of row and save it's hash.
|
||||||
|
*
|
||||||
|
* @param row(in) input row
|
||||||
|
* @param info(out) info data
|
||||||
|
* @param idx(out) index computed from row hash
|
||||||
|
* @param hash(out) row hash value
|
||||||
|
*/
|
||||||
|
void rowToIdx(const Row& row, uint32_t& info, size_t& idx, uint64_t& hash) const;
|
||||||
|
void rowToIdx(const Row& row, uint32_t& info, size_t& idx, uint64_t& hash, const Data* curData) const;
|
||||||
|
|
||||||
|
/** @brief Find best position using precomputed hash
|
||||||
|
*
|
||||||
|
* @param h(in) row hash
|
||||||
|
* @param info(out) info data
|
||||||
|
* @param idx(out) index
|
||||||
|
*/
|
||||||
|
inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx, const Data* curData) const
|
||||||
|
{
|
||||||
|
info = curData->fInfoInc + static_cast<uint32_t>((h & INFO_MASK) >> curData->fInfoHashShift);
|
||||||
|
idx = (h >> INIT_INFO_BITS) & curData->fMask;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx) const
|
||||||
|
{
|
||||||
|
return rowHashToIdx(h, info, idx, fCurData);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @brief Iterate over internal info until info with less-or-equal distance
|
||||||
|
* from the best position was found.
|
||||||
|
*
|
||||||
|
* @param info(in,out) info data
|
||||||
|
* @param idx(in,out) index
|
||||||
|
*/
|
||||||
|
inline void nextWhileLess(uint32_t& info, size_t& idx, const Data* curData) const noexcept
|
||||||
|
{
|
||||||
|
while (info < curData->fInfo[idx])
|
||||||
|
{
|
||||||
|
next(info, idx, curData);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void nextWhileLess(uint32_t& info, size_t& idx) const noexcept
|
||||||
|
{
|
||||||
|
return nextWhileLess(info, idx, fCurData);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @brief Get next index and corresponding info
|
||||||
|
*/
|
||||||
|
inline void next(uint32_t& info, size_t& idx, const Data* curData) const noexcept
|
||||||
|
{
|
||||||
|
++(idx);
|
||||||
|
info += curData->fInfoInc;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void next(uint32_t& info, size_t& idx) const noexcept
|
||||||
|
{
|
||||||
|
return next(info, idx, fCurData);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @brief Get index and info of the next non-empty entry
|
||||||
|
*/
|
||||||
|
inline void nextExisting(uint32_t& info, size_t& idx) const noexcept
|
||||||
|
{
|
||||||
|
uint64_t n = 0;
|
||||||
|
uint64_t data;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
memcpy(&data, fCurData->fInfo + idx, sizeof(data));
|
||||||
|
if (data == 0)
|
||||||
|
{
|
||||||
|
idx += sizeof(n);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if BYTE_ORDER == BIG_ENDIAN
|
||||||
|
n = __builtin_clzll(data) / sizeof(data);
|
||||||
|
#else
|
||||||
|
n = __builtin_ctzll(data) / sizeof(data);
|
||||||
|
#endif
|
||||||
|
idx += n;
|
||||||
|
info = fCurData->fInfo[idx];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @brief Increase internal data size if needed
|
||||||
|
*/
|
||||||
|
void increaseSize();
|
||||||
|
|
||||||
|
/** @brief Increase distance capacity of info removing 1 bit of the hash.
|
||||||
|
*
|
||||||
|
* @returns success
|
||||||
|
*/
|
||||||
|
bool tryIncreaseInfo();
|
||||||
|
|
||||||
|
/** @brief Reserve space for number of elements (power of two)
|
||||||
|
*
|
||||||
|
* This function performs re-insert all data
|
||||||
|
*
|
||||||
|
* @param elems(in) new size
|
||||||
|
*/
|
||||||
|
void rehashPowerOfTwo(size_t elems);
|
||||||
|
|
||||||
|
/** @brief Move elements from old one into rehashed data.
|
||||||
|
*
|
||||||
|
* It's mostly the same algo as in getTargetRow(), but returns nothing
|
||||||
|
* and skips some checks because it's guaranteed that there is no dups.
|
||||||
|
*
|
||||||
|
* @param oldIdx(in) index of "old" data
|
||||||
|
* @param oldHashes(in) old storage of row positions and hashes
|
||||||
|
*/
|
||||||
|
void insertSwap(size_t oldIdx, RowPosHashStorage* oldHashes);
|
||||||
|
|
||||||
|
/** @brief (Re)Initialize internal data of specified size.
|
||||||
|
*
|
||||||
|
* @param elems(in) number of elements
|
||||||
|
*/
|
||||||
|
void initData(size_t elems, const RowPosHashStorage* oldHashes);
|
||||||
|
|
||||||
|
/** @brief Calculate memory size of info data
|
||||||
|
*
|
||||||
|
* @param elems(in) number of elements
|
||||||
|
* @returns size in bytes
|
||||||
|
*/
|
||||||
|
inline static size_t calcBytes(size_t elems) noexcept
|
||||||
|
{
|
||||||
|
return elems + sizeof(uint64_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @brief Reserve place sufficient for elems
|
||||||
|
*
|
||||||
|
* @param elems(in) number of elements
|
||||||
|
*/
|
||||||
|
void reserve(size_t elems);
|
||||||
|
|
||||||
|
/** @brief Start new aggregation generation
|
||||||
|
*
|
||||||
|
* Dump all the data on disk, including internal info data, positions & row
|
||||||
|
* hashes, and the rowgroups itself.
|
||||||
|
*/
|
||||||
|
void startNewGeneration();
|
||||||
|
|
||||||
|
/** @brief Save internal info data on disk */
|
||||||
|
void dumpInternalData() const;
|
||||||
|
|
||||||
|
/** @brief Load previously dumped data from disk
|
||||||
|
*
|
||||||
|
* @param gen(in) generation number
|
||||||
|
*/
|
||||||
|
void loadGeneration(uint16_t gen);
|
||||||
|
/** @brief Load previously dumped data into the tmp storage */
|
||||||
|
void loadGeneration(uint16_t gen, size_t& size, size_t& mask, size_t& maxSize, uint32_t& infoInc, uint32_t& infoHashShift, uint8_t*& info);
|
||||||
|
|
||||||
|
/** @brief Remove temporary data files */
|
||||||
|
void cleanup();
|
||||||
|
void cleanup(uint16_t gen);
|
||||||
|
|
||||||
|
/** @brief Remove all temporary data files */
|
||||||
|
void cleanupAll() noexcept;
|
||||||
|
|
||||||
|
std::string makeDumpFilename(int32_t gen = -1) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
static constexpr size_t INIT_SIZE{sizeof(uint64_t)};
|
||||||
|
static constexpr uint32_t INIT_INFO_BITS{5};
|
||||||
|
static constexpr uint8_t INIT_INFO_INC{1U << INIT_INFO_BITS};
|
||||||
|
static constexpr size_t INFO_MASK{INIT_INFO_INC - 1U};
|
||||||
|
static constexpr uint8_t INIT_INFO_HASH_SHIFT{0};
|
||||||
|
static constexpr uint16_t MAX_INMEMORY_GENS{4};
|
||||||
|
|
||||||
|
struct Data
|
||||||
|
{
|
||||||
|
RowPosHashStoragePtr fHashes;
|
||||||
|
uint8_t *fInfo{nullptr};
|
||||||
|
size_t fSize{0};
|
||||||
|
size_t fMask{0};
|
||||||
|
size_t fMaxSize{0};
|
||||||
|
uint32_t fInfoInc{INIT_INFO_INC};
|
||||||
|
uint32_t fInfoHashShift{INIT_INFO_HASH_SHIFT};
|
||||||
|
};
|
||||||
|
std::vector<std::unique_ptr<Data>> fGens;
|
||||||
|
Data* fCurData;
|
||||||
|
uint32_t fMaxRows;
|
||||||
|
const bool fExtKeys;
|
||||||
|
|
||||||
|
std::unique_ptr<RowGroupStorage> fStorage;
|
||||||
|
RowGroupStorage* fKeysStorage;
|
||||||
|
uint32_t fLastKeyCol;
|
||||||
|
|
||||||
|
uint16_t fGeneration{0};
|
||||||
|
void* fUniqId;
|
||||||
|
|
||||||
|
Row fKeyRow;
|
||||||
|
|
||||||
|
std::unique_ptr<MemManager> fMM;
|
||||||
|
uint32_t fNumOfInputRGPerThread;
|
||||||
|
bool fAggregated = true;
|
||||||
|
bool fAllowGenerations;
|
||||||
|
bool fEnabledDiskAggregation;
|
||||||
|
std::string fTmpDir;
|
||||||
|
bool fInitialized{false};
|
||||||
|
rowgroup::RowGroup* fRowGroupOut;
|
||||||
|
rowgroup::RowGroup* fKeysRowGroup;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace rowgroup
|
||||||
|
|
||||||
|
#endif // MYSQL_ROWSTORAGE_H
|
@ -452,7 +452,7 @@ std::string RBMetaWriter::openMetaFile ( uint16_t dbRoot )
|
|||||||
throw WeException( oss.str(), ERR_FILE_OPEN );
|
throw WeException( oss.str(), ERR_FILE_OPEN );
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
std::ostringstream ossChown;
|
std::ostringstream ossChown;
|
||||||
idbdatafile::IDBFileSystem& fs = IDBPolicy::getFs(tmpMetaFileName.c_str());
|
idbdatafile::IDBFileSystem& fs = IDBPolicy::getFs(tmpMetaFileName.c_str());
|
||||||
if (chownPath(ossChown, tmpMetaFileName, fs)
|
if (chownPath(ossChown, tmpMetaFileName, fs)
|
||||||
@ -1338,7 +1338,7 @@ int RBMetaWriter::writeHWMChunk(
|
|||||||
return ERR_METADATABKUP_COMP_RENAME;
|
return ERR_METADATABKUP_COMP_RENAME;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
std::ostringstream ossChown;
|
std::ostringstream ossChown;
|
||||||
idbdatafile::IDBFileSystem& fs = IDBPolicy::getFs(fileName.c_str());
|
idbdatafile::IDBFileSystem& fs = IDBPolicy::getFs(fileName.c_str());
|
||||||
if (chownPath(ossChown, fileName, fs)
|
if (chownPath(ossChown, fileName, fs)
|
||||||
|
Reference in New Issue
Block a user