[MCOL-4709] Disk-based aggregation

* Introduce multigeneration aggregation * Do not save unused part of RGDatas to disk * Add IO error explanation (strerror) * Reduce memory usage while aggregating * introduce in-memory generations for better memory utilization * Try to keep the number of buckets low * Refactor disk aggregation a bit * pass calculated hash into RowAggregation * try to keep some RGData with free space in memory * do not dump more than half of rowgroups to disk if generations are allowed, instead start a new generation * for each thread shift the first processed bucket at each iteration, so the generations start more evenly * Unify temp data location * Explicitly create temp subdirectories whether disk aggregation/join are enabled or not
2025-07-29 08:21:15 +03:00 · 2021-01-15 18:52:13 +03:00
parent 3537c0d635
commit 475104e4d3
24 changed files with 5932 additions and 906 deletions
--- a/utils/common/robin_hood.h
+++ b/utils/common/robin_hood.h
--- a/utils/common/threadnaming.cpp
+++ b/utils/common/threadnaming.cpp
@ -16,6 +16,7 @@
   MA 02110-1301, USA. */

 #include <sys/prctl.h>
+#include "threadnaming.h"

 namespace utils
 {
@ -23,4 +24,11 @@ namespace utils
    {
        prctl(PR_SET_NAME, threadName, 0, 0, 0);
    }
+
+    /** @brief Return the name of the calling thread.
+     *
+     * The name is read with prctl(PR_GET_NAME). If that call fails, an empty
+     * string is returned.
+     */
+    std::string getThreadName()
+    {
+      // Zero-filled so the buffer is always a valid C string, even if prctl()
+      // fails and writes nothing into it.
+      char buf[32] = {};
+      if (prctl(PR_GET_NAME, buf, 0, 0, 0) != 0)
+        return {};
+      return std::string(buf);
+    }
 } // end of namespace
--- a/utils/common/threadnaming.h
+++ b/utils/common/threadnaming.h
@ -17,8 +17,11 @@
 #ifndef H_SETTHREADNAME
 #define H_SETTHREADNAME

+#include <string>
+
 namespace utils
 {
    void setThreadName(const char *threadName);
+    std::string getThreadName();
 } // end of namespace
 #endif
--- a/utils/configcpp/configcpp.cpp
+++ b/utils/configcpp/configcpp.cpp
@ -59,6 +59,9 @@ namespace fs = boost::filesystem;
 #include "installdir.h"
 #ifdef _MSC_VER
 #include "idbregistry.h"
+#include <unordered_map>
+#else
+#include <tr1/unordered_map>
 #endif

 #include "bytestream.h"
@ -673,6 +676,24 @@ const vector<string> Config::enumSection(const string& section)

    return fParser.enumSection(fDoc, section);
 }
+/** @brief Build the temporary-file directory path for the given purpose.
+ *
+ * The base directory is read from SystemConfig/SystemTempFileDir; when that
+ * setting is empty, "/tmp/columnstore_tmp_files" is used instead. A "/" and a
+ * purpose-specific subdirectory ("joins/" or "aggregates/") are appended.
+ *
+ * @param what the purpose the directory is needed for
+ * @return the directory path, or an empty string for a value that is not one
+ *         of the enumerators
+ */
+std::string Config::getTempFileDir(Config::TempDirPurpose what)
+{
+  std::string dir = getConfig("SystemConfig", "SystemTempFileDir");
+  if (dir.empty())
+    dir = "/tmp/columnstore_tmp_files";
+  dir += "/";
+
+  if (what == TempDirPurpose::Joins)
+    return dir + "joins/";
+  if (what == TempDirPurpose::Aggregates)
+    return dir + "aggregates/";
+
+  // Only reached when `what` holds a value outside the enumeration.
+  return {};
+}

 } //namespace config
 // vim:ts=4 sw=4:
--- a/utils/configcpp/configcpp.h
+++ b/utils/configcpp/configcpp.h
@ -203,6 +203,14 @@ public:
     */
    EXPORT const std::vector<std::string> enumSection(const std::string& section);

+    /** @brief Purposes for which a temporary-file directory can be requested */
+    enum class TempDirPurpose
+    {
+      Joins,      ///< disk joins
+      Aggregates  ///< disk-based aggregation
+    };
+    /** @brief Return temporary directory path for the specified purpose
+     *
+     * @param what the purpose the directory is needed for
+     * @return the directory path for that purpose
+     */
+    EXPORT std::string getTempFileDir(TempDirPurpose what);
+
 protected:
    /** @brief parse the XML file
    *
--- a/utils/joiner/joinpartition.cpp
+++ b/utils/joiner/joinpartition.cpp
@ -129,7 +129,8 @@ JoinPartition::JoinPartition(const JoinPartition& jp, bool splitMode) :
    // Instead, each will double in size, giving a capacity of 8GB -> 16 -> 32, and so on.
 //	bucketCount = jp.bucketCount;
    bucketCount = 2;
-    filenamePrefix = startup::StartUp::tmpDir();
+    config::Config* config = config::Config::makeConfig();
+    filenamePrefix = config->getTempFileDir(config::Config::TempDirPurpose::Joins);

    filenamePrefix += "/Columnstore-join-data-";

--- a/utils/loggingcpp/ErrorMessage.txt
+++ b/utils/loggingcpp/ErrorMessage.txt
@ -100,6 +100,10 @@

 2053	ERR_FUNC_OUT_OF_RANGE_RESULT	The result is out of range for function %1% using value(s): %2% %3%

+2054	ERR_DISKAGG_ERROR	Unknown error while aggregation.
+2055	ERR_DISKAGG_TOO_BIG	Not enough memory to make disk-based aggregation. Raise TotalUmMemory if possible.
+2056	ERR_DISKAGG_FILEIO_ERROR	There was an IO error during a disk-based aggregation: %1%
+
 # Sub-query errors
 3001	ERR_NON_SUPPORT_SUB_QUERY_TYPE	This subquery type is not supported yet.
 3002	ERR_MORE_THAN_1_ROW	Subquery returns more than 1 row.
--- a/utils/rowgroup/CMakeLists.txt
+++ b/utils/rowgroup/CMakeLists.txt
@ -4,7 +4,7 @@ include_directories( ${ENGINE_COMMON_INCLUDES} )

 ########### next target ###############

-set(rowgroup_LIB_SRCS rowaggregation.cpp rowgroup.cpp)
+set(rowgroup_LIB_SRCS rowaggregation.cpp rowgroup.cpp rowstorage.cpp)

 #librowgroup_la_CXXFLAGS = $(march_flags) $(AM_CXXFLAGS)

--- a/utils/rowgroup/rowaggregation.cpp
+++ b/utils/rowgroup/rowaggregation.cpp
--- a/utils/rowgroup/rowaggregation.h
+++ b/utils/rowgroup/rowaggregation.h
@ -30,7 +30,8 @@
 */

 #include <cstring>
-#include <stdint.h>
+#include <cstdint>
+#include <utility>
 #include <vector>
 #ifdef _MSC_VER
 #include <unordered_map>
@ -54,6 +55,9 @@
 #include "constantcolumn.h"


+#include "resourcemanager.h"
+#include "rowstorage.h"
+
 // To do: move code that depends on joblist to a proper subsystem.
 namespace joblist
 {
@ -63,17 +67,6 @@ class ResourceManager;
 namespace rowgroup
 {

-
-struct RowPosition
-{
-    uint64_t group: 48;
-    uint64_t row: 16;
-
-    static const uint64_t MSB = 0x800000000000ULL;   //48th bit is set
-    inline RowPosition(uint64_t g, uint64_t r) : group(g), row(r) { }
-    inline RowPosition() { }
-};
-
 /** @brief Enumerates aggregate functions supported by RowAggregation
 */
 enum RowAggFunctionType
@ -143,9 +136,9 @@ struct RowAggGroupByCol
     *    outputColIndex argument should be omitted if this GroupBy
     *    column is not to be included in the output.
     */
-    RowAggGroupByCol(int32_t inputColIndex, int32_t outputColIndex = -1) :
+    explicit RowAggGroupByCol(int32_t inputColIndex, int32_t outputColIndex = -1) :
        fInputColumnIndex(inputColIndex), fOutputColumnIndex(outputColIndex) {}
-    ~RowAggGroupByCol() {}
+    ~RowAggGroupByCol() = default;

    uint32_t	fInputColumnIndex;
    uint32_t	fOutputColumnIndex;
@ -184,7 +177,7 @@ struct RowAggFunctionCol
                      int32_t inputColIndex, int32_t outputColIndex, int32_t auxColIndex = -1) :
        fAggFunction(aggFunction), fStatsFunction(stats), fInputColumnIndex(inputColIndex),
        fOutputColumnIndex(outputColIndex), fAuxColumnIndex(auxColIndex) {}
-    virtual ~RowAggFunctionCol() {}
+    virtual ~RowAggFunctionCol() = default;

    virtual void serialize(messageqcpp::ByteStream& bs) const;
    virtual void deserialize(messageqcpp::ByteStream& bs);
@ -237,10 +230,10 @@ struct RowUDAFFunctionCol : public RowAggFunctionCol
        bInterrupted(false)
    {}

-    virtual ~RowUDAFFunctionCol() {}
+    ~RowUDAFFunctionCol() override = default;

-    virtual void serialize(messageqcpp::ByteStream& bs) const;
-    virtual void deserialize(messageqcpp::ByteStream& bs);
+    void serialize(messageqcpp::ByteStream& bs) const override;
+    void deserialize(messageqcpp::ByteStream& bs) override;

    mcsv1sdk::mcsv1Context fUDAFContext;  // The UDAF context
    bool bInterrupted;                    // Shared by all the threads
@ -312,104 +305,18 @@ struct ConstantAggData
    ConstantAggData() : fOp(ROWAGG_FUNCT_UNDEFINE), fIsNull(false)
    {}

-    ConstantAggData(const std::string& v, RowAggFunctionType f, bool n) :
-        fConstValue(v), fOp(f), fIsNull(n)
+    ConstantAggData(std::string v, RowAggFunctionType f, bool n) :
+        fConstValue(std::move(v)), fOp(f), fIsNull(n)
    {}

-    ConstantAggData(const std::string& v, const std::string u, RowAggFunctionType f, bool n) :
-        fConstValue(v), fUDAFName(u), fOp(f), fIsNull(n)
+    ConstantAggData(std::string v, std::string u, RowAggFunctionType f, bool n) :
+        fConstValue(std::move(v)), fUDAFName(std::move(u)), fOp(f), fIsNull(n)
    {}
 };

 typedef boost::shared_ptr<RowAggGroupByCol>  SP_ROWAGG_GRPBY_t;
 typedef boost::shared_ptr<RowAggFunctionCol> SP_ROWAGG_FUNC_t;

-class RowAggregation;
-
-class AggHasher
-{
-public:
-    AggHasher(const Row& row, Row** tRow, uint32_t keyCount, RowAggregation* ra);
-    inline uint64_t operator()(const RowPosition& p) const;
-
-private:
-    explicit AggHasher();
-    RowAggregation* agg;
-    Row** tmpRow;
-    mutable Row r;
-    uint32_t lastKeyCol;
-};
-
-class AggComparator
-{
-public:
-    AggComparator(const Row& row, Row** tRow, uint32_t keyCount, RowAggregation* ra);
-    inline bool operator()(const RowPosition&, const RowPosition&) const;
-
-private:
-    explicit AggComparator();
-    RowAggregation* agg;
-    Row** tmpRow;
-    mutable Row r1, r2;
-    uint32_t lastKeyCol;
-};
-
-class KeyStorage
-{
-public:
-    KeyStorage(const RowGroup& keyRG, Row** tRow);
-
-    inline RowPosition addKey();
-    inline uint64_t getMemUsage();
-
-private:
-    Row row;
-    Row** tmpRow;
-    RowGroup rg;
-    std::vector<RGData> storage;
-    uint64_t memUsage;
-
-    friend class ExternalKeyEq;
-    friend class ExternalKeyHasher;
-};
-
-class ExternalKeyHasher
-{
-public:
-    ExternalKeyHasher(const RowGroup& keyRG, KeyStorage* ks, uint32_t keyColCount, Row** tRow);
-    inline uint64_t operator()(const RowPosition& pos) const;
-
-private:
-    mutable Row row;
-    mutable Row** tmpRow;
-    uint32_t lastKeyCol;
-    KeyStorage* ks;
-};
-
-class ExternalKeyEq
-{
-public:
-    ExternalKeyEq(const RowGroup& keyRG, KeyStorage* ks, uint32_t keyColCount, Row** tRow);
-    inline bool operator()(const RowPosition& pos1, const RowPosition& pos2) const;
-
-private:
-    mutable Row row1, row2;
-    mutable Row** tmpRow;
-    uint32_t lastKeyCol;
-    KeyStorage* ks;
-};
-
-typedef std::tr1::unordered_set<RowPosition, AggHasher, AggComparator, utils::STLPoolAllocator<RowPosition> >
-RowAggMap_t;
-
-#if defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ < 5)
-typedef std::tr1::unordered_map<RowPosition, RowPosition, ExternalKeyHasher, ExternalKeyEq,
-        utils::STLPoolAllocator<std::pair<const RowPosition, RowPosition> > > ExtKeyMap_t;
-#else
-typedef std::tr1::unordered_map<RowPosition, RowPosition, ExternalKeyHasher, ExternalKeyEq,
-        utils::STLPoolAllocator<std::pair<RowPosition, RowPosition> > > ExtKeyMap_t;
-#endif
-
 struct GroupConcat
 {
    // GROUP_CONCAT(DISTINCT col1, 'const', col2 ORDER BY col3 desc SEPARATOR 'sep')
@ -427,7 +334,7 @@ struct GroupConcat
    boost::shared_ptr<int64_t>			fSessionMemLimit;
    std::string fTimeZone;

-    GroupConcat() : fRm(NULL) {}
+    GroupConcat() : fRm(nullptr) {}
 };

 typedef boost::shared_ptr<GroupConcat>  SP_GroupConcat;
@ -436,7 +343,7 @@ typedef boost::shared_ptr<GroupConcat>  SP_GroupConcat;
 class GroupConcatAg
 {
 public:
-    GroupConcatAg(SP_GroupConcat&);
+    explicit GroupConcatAg(SP_GroupConcat&);
    virtual ~GroupConcatAg();

    virtual void initialize() {};
@ -446,7 +353,7 @@ public:
    void getResult(uint8_t*) {};
    uint8_t* getResult()
    {
-        return NULL;
+        return nullptr;
    }

 protected:
@ -478,12 +385,14 @@ public:
     */
    RowAggregation();
    RowAggregation(const std::vector<SP_ROWAGG_GRPBY_t>& rowAggGroupByCols,
-                   const std::vector<SP_ROWAGG_FUNC_t>&  rowAggFunctionCols);
+                   const std::vector<SP_ROWAGG_FUNC_t>&  rowAggFunctionCols,
+                   joblist::ResourceManager* rm = nullptr,
+                   boost::shared_ptr<int64_t> sessMemLimit = {});
    RowAggregation(const RowAggregation& rhs);

    /** @brief RowAggregation default destructor
     */
-    virtual ~RowAggregation();
+    ~RowAggregation() override;

    /** @brief clone this object for multi-thread use
     */
@ -551,28 +460,19 @@ public:
     * @parm pRowGroupIn(in) RowGroup to be added to aggregation.
     */
    virtual void addRowGroup(const RowGroup* pRowGroupIn);
-    virtual void addRowGroup(const RowGroup* pRowGroupIn, std::vector<Row::Pointer>& inRows);
+    virtual void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::pair<Row::Pointer, uint64_t>>& inRows);

    /** @brief Serialize RowAggregation object into a ByteStream.
     *
     * @parm bs(out) BytesStream that is to be written to.
     */
-    void serialize(messageqcpp::ByteStream& bs) const;
+    void serialize(messageqcpp::ByteStream& bs) const override;

    /** @brief Unserialize RowAggregation object from a ByteStream.
     *
     * @parm bs(in) BytesStream that is to be read from.
     */
-    void deserialize(messageqcpp::ByteStream& bs);
-
-    /** @brief set the memory limit for RowAggregation
-     *
-     * @parm limit(in) memory limit for both Map and secondary RowGroups
-     */
-    void setMaxMemory(uint64_t limit)
-    {
-        fMaxMemory = limit;
-    }
+    void deserialize(messageqcpp::ByteStream& bs) override;

    /** @brief load result set into byte stream
     *
@ -594,18 +494,12 @@ public:
        return fRowGroupOut;
    }

-    RowAggMap_t* mapPtr()
-    {
-        return fAggMapPtr;
-    }
-    std::vector<RGData*>& resultDataVec()
-    {
-        return fResultDataVec;
-    }
+    void append(RowAggregation* other);

    virtual void aggregateRow(Row& row,
+                              const uint64_t* hash = nullptr,
                              std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr);
-    inline uint32_t aggMapKeyLength()
+    inline uint32_t aggMapKeyLength() const
    {
        return fAggMapKeyCount;
    }
@ -623,6 +517,16 @@ public:
        return &fRGContextColl;
    }

+    /** @brief Delegate to fRowAggStorage->finalize().
+     *
+     * The storage is given a callback that forwards each row it receives to
+     * mergeEntries(), and fRow as the second argument.
+     */
+    void finalAggregation()
+    {
+        return fRowAggStorage->finalize([this](Row& row) { mergeEntries(row);}, fRow);
+    }
+
+    /** @brief Hand ownership of fCurRGData over to the caller.
+     *
+     * fCurRGData is moved from, so it is null after this call.
+     */
+    std::unique_ptr<RGData> moveCurrentRGData()
+    {
+        return std::move(fCurRGData);
+    }
+
 protected:
    virtual void initialize();
    virtual void initMapData(const Row& row);
@ -630,10 +534,12 @@ protected:

    virtual void updateEntry(const Row& row,
                             std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr);
+    void mergeEntries(const Row& row);
    virtual void doMinMax(const Row&, int64_t, int64_t, int);
    virtual void doSum(const Row&, int64_t, int64_t, int);
-    virtual void doAvg(const Row&, int64_t, int64_t, int64_t);
+    virtual void doAvg(const Row&, int64_t, int64_t, int64_t, bool merge = false);
    virtual void doStatistics(const Row&, int64_t, int64_t, int64_t);
+    void mergeStatistics(const Row&, uint64_t colOut, uint64_t colAux);
    virtual void doBitOp(const Row&, int64_t, int64_t, int);
    virtual void doUDAF(const Row&,
                        int64_t,
@ -647,12 +553,6 @@ protected:
        return true;
    }

-    virtual bool newRowGroup();
-    virtual void clearAggMap()
-    {
-        if (fAggMapPtr) fAggMapPtr->clear();
-    }
-
    void resetUDAF(RowUDAFFunctionCol* rowUDAF);
    void resetUDAF(RowUDAFFunctionCol* rowUDAF, uint64_t funcColIdx);

@ -673,24 +573,19 @@ protected:
    inline void updateStringMinMax(std::string val1, std::string val2, int64_t col, int func);
    std::vector<SP_ROWAGG_GRPBY_t>                  fGroupByCols;
    std::vector<SP_ROWAGG_FUNC_t>                   fFunctionCols;
-    RowAggMap_t*                                    fAggMapPtr;
    uint32_t                                        fAggMapKeyCount;   // the number of columns that make up the key
    RowGroup                                        fRowGroupIn;
    RowGroup*                                       fRowGroupOut;

+    // for when the group by & distinct keys are not stored in the output rows
+    rowgroup::RowGroup fKeyRG;
+
    Row                                             fRow;
    Row                                             fNullRow;
    Row*											 tmpRow;   // used by the hashers & eq functors
    boost::scoped_array<uint8_t>                    fNullRowData;
-    std::vector<RGData*>                           fResultDataVec;

-    uint64_t                                        fTotalRowCount;
-    uint64_t                                        fMaxTotalRowCount;
-    uint64_t                                        fMaxMemory;
-
-    RGData*                                         fPrimaryRowData;
-
-    std::vector<boost::shared_ptr<RGData> >         fSecondaryRowDataVec;
+    std::unique_ptr<RowAggStorage> fRowAggStorage;

    // for support PM aggregation after PM hashjoin
    std::vector<RowGroup>*                          fSmallSideRGs;
@ -700,28 +595,19 @@ protected:
    uint32_t                                            fSmallSideCount;
    boost::scoped_array<Row> rowSmalls;

-    // for hashmap
-    boost::shared_ptr<utils::STLPoolAllocator<RowPosition> > fAlloc;
-
    // for 8k poc
    RowGroup                                        fEmptyRowGroup;
    RGData                                          fEmptyRowData;
    Row                                             fEmptyRow;

-    boost::scoped_ptr<AggHasher> fHasher;
-    boost::scoped_ptr<AggComparator> fEq;
+    bool fKeyOnHeap = false;

    std::string fTimeZone;

-    //TODO: try to get rid of these friend decl's.  AggHasher & Comparator
-    //need access to rowgroup storage holding the rows to hash & ==.
-    friend class AggHasher;
-    friend class AggComparator;
-
    // We need a separate copy for each thread.
    mcsv1sdk::mcsv1Context fRGContext;
    std::vector<mcsv1sdk::mcsv1Context> fRGContextColl;
-    
+
    // These are handy for testing the actual type of static_any for UDAF
    static const static_any::any& charTypeId;
    static const static_any::any& scharTypeId;
@ -742,6 +628,10 @@ protected:

    // For UDAF along with with multiple distinct columns
    std::vector<SP_ROWAGG_FUNC_t>* fOrigFunctionCols;
+
+    joblist::ResourceManager*  fRm = nullptr;
+    boost::shared_ptr<int64_t> fSessionMemLimit;
+    std::unique_ptr<RGData> fCurRGData;
 };

 //------------------------------------------------------------------------------
@ -764,11 +654,11 @@ public:

    /** @brief RowAggregationUM default destructor
     */
-    ~RowAggregationUM();
+    ~RowAggregationUM() override;

    /** @brief Denotes end of data insertion following multiple calls to addRowGroup().
     */
-    void endOfInput();
+    void endOfInput() override;

    /** @brief Finializes the result set before sending back to the front end.
     */
@ -805,7 +695,7 @@ public:
    {
        return fRm;
    }
-    inline virtual RowAggregationUM* clone() const
+    inline RowAggregationUM* clone() const override
    {
        return new RowAggregationUM (*this);
    }
@ -832,22 +722,18 @@ public:
        return fGroupConcat;
    }

-    void aggregateRow(Row&,
-                      std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
-    virtual void aggReset();
+    void aggReset() override;

-    void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut);
+    void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut) override;

 protected:
+    // virtual methods from base
    void initialize() override;
+
+    void attachGroupConcatAg() override;
    void updateEntry(const Row& row,
                     std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
-
-    void aggregateRowWithRemap(Row&,
-                               std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr);
-
-    void attachGroupConcatAg();
-    bool countSpecial(const RowGroup* pRG)
+    bool countSpecial(const RowGroup* pRG) override
    {
        fRow.setIntField<8>(
            fRow.getIntField<8>(
@ -856,8 +742,6 @@ protected:
        return true;
    }

-    bool newRowGroup();
-
    // calculate the average after all rows received. UM only function.
    void calculateAvgColumns();

@ -889,7 +773,6 @@ protected:
    virtual void setGroupConcatString();

    bool fHasAvg;
-    bool fKeyOnHeap;
    bool fHasStatsFunc;
    bool fHasUDAF;

@ -902,8 +785,6 @@ protected:
     * the memory from rm in that order. */
    uint64_t                          fTotalMemUsage;

-    joblist::ResourceManager*         fRm;
-
    // @bug3475, aggregate(constant), sum(0), count(null), etc
    std::vector<ConstantAggData>      fConstantAggregate;

@ -912,18 +793,8 @@ protected:
    std::vector<SP_GroupConcatAg>     fGroupConcatAg;
    std::vector<SP_ROWAGG_FUNC_t>     fFunctionColGc;

-    // for when the group by & distinct keys are not stored in the output rows
-    rowgroup::RowGroup fKeyRG;
-    boost::scoped_ptr<ExternalKeyEq> fExtEq;
-    boost::scoped_ptr<ExternalKeyHasher> fExtHash;
-    boost::scoped_ptr<KeyStorage> fKeyStore;
-    boost::scoped_ptr<utils::STLPoolAllocator<std::pair<RowPosition, RowPosition> > > fExtKeyMapAlloc;
-    boost::scoped_ptr<ExtKeyMap_t> fExtKeyMap;
-
-    boost::shared_ptr<int64_t> fSessionMemLimit;
 private:
    uint64_t fLastMemUsage;
-    uint32_t fNextRGIndex;
 };


@ -951,8 +822,8 @@ public:

    /** @brief RowAggregationUMP2 default destructor
     */
-    ~RowAggregationUMP2();
-    inline virtual RowAggregationUMP2* clone() const
+    ~RowAggregationUMP2() override;
+    inline RowAggregationUMP2* clone() const override
    {
        return new RowAggregationUMP2 (*this);
    }
@ -961,17 +832,17 @@ protected:
    // virtual methods from base
    void updateEntry(const Row& row,
                     std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
-    void doAvg(const Row&, int64_t, int64_t, int64_t);
-    void doStatistics(const Row&, int64_t, int64_t, int64_t);
-    void doGroupConcat(const Row&, int64_t, int64_t);
-    void doBitOp(const Row&, int64_t, int64_t, int);
+    void doAvg(const Row&, int64_t, int64_t, int64_t, bool merge = false) override;
+    void doStatistics(const Row&, int64_t, int64_t, int64_t) override;
+    void doGroupConcat(const Row&, int64_t, int64_t) override;
+    void doBitOp(const Row&, int64_t, int64_t, int) override;
    void doUDAF(const Row&,
                int64_t,
                int64_t,
                int64_t,
                uint64_t& funcColsIdx,
                std::vector<mcsv1sdk::mcsv1Context>* rgContextColl = nullptr) override;
-    bool countSpecial(const RowGroup* pRG)
+    bool countSpecial(const RowGroup* pRG) override
    {
        return false;
    }
@ -1002,18 +873,18 @@ public:

    /** @brief RowAggregationDistinct default destructor
     */
-    ~RowAggregationDistinct();
+    ~RowAggregationDistinct() override;

    /** @brief Add an aggregator for pre-DISTINCT aggregation
     */
    void addAggregator(const boost::shared_ptr<RowAggregation>& agg, const RowGroup& rg);

-    void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut);
+    void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut) override;

    virtual void doDistinctAggregation();
-    virtual void doDistinctAggregation_rowVec(std::vector<Row::Pointer>& inRows);
-    void addRowGroup(const RowGroup* pRowGroupIn);
-    void addRowGroup(const RowGroup* pRowGroupIn, std::vector<Row::Pointer>& inRows);
+    virtual void doDistinctAggregation_rowVec(std::vector<std::pair<Row::Pointer, uint64_t>>& inRows);
+    void addRowGroup(const RowGroup* pRowGroupIn) override;
+    void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::pair<Row::Pointer, uint64_t>>& inRows) override;

    // multi-threade debug
    boost::shared_ptr<RowAggregation>& aggregator()
@ -1022,7 +893,7 @@ public:
    }
    void aggregator(boost::shared_ptr<RowAggregation> aggregator)
    {
-        fAggregator = aggregator;
+        fAggregator = std::move(aggregator);
    }
    RowGroup& rowGroupDist()
    {
@ -1032,7 +903,7 @@ public:
    {
        fRowGroupDist = rowGroupDist;
    }
-    inline virtual RowAggregationDistinct* clone() const
+    inline RowAggregationDistinct* clone() const override
    {
        return new RowAggregationDistinct (*this);
    }
@ -1067,20 +938,20 @@ public:

    /** @brief RowAggregationSubDistinct default destructor
     */
-    ~RowAggregationSubDistinct();
+    ~RowAggregationSubDistinct() override;

-    void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut);
-    void addRowGroup(const RowGroup* pRowGroupIn);
-    inline virtual RowAggregationSubDistinct* clone() const
+    void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut) override;
+    void addRowGroup(const RowGroup* pRowGroupIn) override;
+    inline RowAggregationSubDistinct* clone() const override
    {
        return new RowAggregationSubDistinct (*this);
    }

-    void addRowGroup(const RowGroup* pRowGroupIn, std::vector<Row::Pointer>& inRow);
+    void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::pair<Row::Pointer, uint64_t>>& inRow) override;

 protected:
    // virtual methods from RowAggregationUM
-    void doGroupConcat(const Row&, int64_t, int64_t);
+    void doGroupConcat(const Row&, int64_t, int64_t) override;

    // for groupby columns and the aggregated distinct column
    Row                                             fDistRow;
@ -1108,7 +979,7 @@ public:

    /** @brief RowAggregationMultiDistinct default destructor
     */
-    ~RowAggregationMultiDistinct();
+    ~RowAggregationMultiDistinct() override;

    /** @brief Add sub aggregators
     */
@ -1116,21 +987,21 @@ public:
                          const RowGroup& rg,
                          const std::vector<SP_ROWAGG_FUNC_t>& funct);

-    void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut);
+    void setInputOutput(const RowGroup& pRowGroupIn, RowGroup* pRowGroupOut) override;
    using RowAggregationDistinct::addRowGroup;
-    void addRowGroup(const RowGroup* pRowGroupIn);
+    void addRowGroup(const RowGroup* pRowGroupIn) override;

    using RowAggregationDistinct::doDistinctAggregation;
-    virtual void doDistinctAggregation();
+    void doDistinctAggregation() override;
    using RowAggregationDistinct::doDistinctAggregation_rowVec;
-    virtual void doDistinctAggregation_rowVec(std::vector<std::vector<Row::Pointer> >& inRows);
+    virtual void doDistinctAggregation_rowVec(std::vector<std::vector<std::pair<Row::Pointer, uint64_t>> >& inRows);

-    inline virtual RowAggregationMultiDistinct* clone() const
+    inline RowAggregationMultiDistinct* clone() const override
    {
        return new RowAggregationMultiDistinct (*this);
    }

-    void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::vector<Row::Pointer> >& inRows);
+    void addRowGroup(const RowGroup* pRowGroupIn, std::vector<std::vector<std::pair<Row::Pointer, uint64_t>>>& inRows);

    std::vector<boost::shared_ptr<RowAggregationUM> >& subAggregators()
    {
--- a/utils/rowgroup/rowgroup.cpp
+++ b/utils/rowgroup/rowgroup.cpp
@ -32,6 +32,7 @@
 using namespace std;

 #include <boost/shared_array.hpp>
+#include <numeric>
 using namespace boost;

 #include "bytestream.h"
@ -405,6 +406,7 @@ RGData::RGData(const RowGroup& rg, uint32_t rowCount)
     */
    memset(rowData.get(), 0, rg.getDataSize(rowCount));   // XXXPAT: make valgrind happy temporarily
 #endif
+  memset(rowData.get(), 0, rg.getDataSize(rowCount));   // XXXPAT: make valgrind happy temporarily
 }

 RGData::RGData(const RowGroup& rg)
@ -481,7 +483,7 @@ void RGData::serialize(ByteStream& bs, uint32_t amount) const
        bs << (uint8_t) 0;
 }

-void RGData::deserialize(ByteStream& bs, bool hasLenField)
+void RGData::deserialize(ByteStream& bs, uint32_t defAmount)
 {
    uint32_t amount, sig;
    uint8_t* buf;
@ -493,7 +495,7 @@ void RGData::deserialize(ByteStream& bs, bool hasLenField)
    {
        bs >> sig;
        bs >> amount;
-        rowData.reset(new uint8_t[amount]);
+        rowData.reset(new uint8_t[std::max(amount, defAmount)]);
        buf = bs.buf();
        memcpy(rowData.get(), buf, amount);
        bs.advance(amount);
@ -577,12 +579,13 @@ Row& Row::operator=(const Row& r)
    return *this;
 }

-string Row::toString() const
+string Row::toString(uint32_t rownum) const
 {
    ostringstream os;
    uint32_t i;

    //os << getRid() << ": ";
+    os << "[" << std::setw(5) << rownum << std::setw(0) << "]: ";
    os << (int) useStringTable << ": ";

    for (i = 0; i < columnCount; i++)
@ -1447,7 +1450,7 @@ uint32_t RowGroup::getColumnCount() const
    return columnCount;
 }

-string RowGroup::toString() const
+string RowGroup::toString(const std::vector<uint64_t>& used) const
 {
    ostringstream os;
    ostream_iterator<int> oIter1(os, "\t");
@ -1479,6 +1482,8 @@ string RowGroup::toString() const
        os << "uses a string table\n";
    else
        os << "doesn't use a string table\n";
+    if (!used.empty())
+      os << "sparse\n";

    //os << "strings = " << hex << (int64_t) strings << "\n";
    //os << "data = " << (int64_t) data << "\n" << dec;
@ -1488,14 +1493,25 @@ string RowGroup::toString() const
        initRow(&r);
        getRow(0, &r);
        os << "rowcount = " << getRowCount() << endl;
+        if (!used.empty())
+        {
+          uint64_t cnt = std::accumulate(used.begin(), used.end(), 0ULL,
+                                         [](uint64_t a, uint64_t bits) {
+                                           return a + __builtin_popcountll(bits);
+                                         });
+          os << "sparse row count = " << cnt << endl;
+        }
        os << "base rid = " << getBaseRid() << endl;
        os << "status = " << getStatus() << endl;
        os << "dbroot = " << getDBRoot() << endl;
        os << "row data...\n";

-        for (uint32_t i = 0; i < getRowCount(); i++)
+        uint32_t max_cnt = used.empty() ? getRowCount() : (used.size() * 64);
+        for (uint32_t i = 0; i < max_cnt; i++)
        {
-            os << r.toString() << endl;
+            if (!used.empty() && !(used[i/64] & (1ULL << (i%64))))
+              continue;
+            os << r.toString(i) << endl;
            r.nextRow();
        }
    }
--- a/utils/rowgroup/rowgroup.h
+++ b/utils/rowgroup/rowgroup.h
@ -270,7 +270,7 @@ public:
    // the 'hasLengthField' is there b/c PM aggregation (and possibly others) currently sends
    // inline data with a length field.  Once that's converted to string table format, that
    // option can go away.
-    void deserialize(messageqcpp::ByteStream&, bool hasLengthField = false); // returns the # of bytes read
+    void deserialize(messageqcpp::ByteStream&, uint32_t amount = 0); // NOTE(review): 'amount' looks like a lower bound for the allocated row data buffer size - confirm against the definition

    inline uint64_t getStringTableMemUsage();
    void clear();
@ -531,7 +531,7 @@ public:
    template<typename T>
    inline void copyBinaryField(Row& dest, uint32_t destIndex, uint32_t srcIndex) const;

-    std::string toString() const;
+    std::string toString(uint32_t rownum = 0) const;
    std::string toCSV() const;

    /* These fcns are used only in joins.  The RID doesn't matter on the side that
@ -1537,7 +1537,7 @@ public:

    RGData duplicate();   // returns a copy of the attached RGData

-    std::string toString() const;
+    std::string toString(const std::vector<uint64_t>& used = {}) const;

    /** operator+=
    *
--- a/utils/rowgroup/rowstorage.cpp
+++ b/utils/rowgroup/rowstorage.cpp
--- a/utils/rowgroup/rowstorage.h
+++ b/utils/rowgroup/rowstorage.h
@ -0,0 +1,366 @@
+/* Copyright (C) 2021 MariaDB Corporation
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#ifndef ROWSTORAGE_H
+#define ROWSTORAGE_H
+
+#include "rowgroup.h"
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace rowgroup
+{
+
+uint32_t calcNumberOfBuckets(ssize_t availMem,
+                             uint32_t numOfThreads,
+                             uint32_t numOfBuckets,
+                             uint32_t groupsPerThread,
+                             uint32_t inRowSize,
+                             uint32_t outRowSize,
+                             bool enabledDiskAggr);
+
+class MemManager;
+class RowPosHashStorage;
+using RowPosHashStoragePtr = std::unique_ptr<RowPosHashStorage>;
+class RowGroupStorage;
+
+uint64_t hashRow(const rowgroup::Row& r, std::size_t lastCol);
+
+class RowAggStorage
+{
+public:
+  RowAggStorage(const std::string& tmpDir,
+                RowGroup* rowGroupOut,
+                RowGroup* keysRowGroup,
+                uint32_t keyCount,
+                joblist::ResourceManager* rm = nullptr,
+                boost::shared_ptr<int64_t> sessLimit = {},
+                bool enabledDiskAgg = false,
+                bool allowGenerations = false);
+
+  /** @brief Convenience constructor without a separate keys row group.
+   *
+   *    Delegates to the primary constructor, passing rowGroupOut as both the
+   *    output row group and the keys row group.
+   */
+  RowAggStorage(const std::string& tmpDir,
+                RowGroup* rowGroupOut,
+                uint32_t keyCount,
+                joblist::ResourceManager* rm = nullptr,
+                boost::shared_ptr<int64_t> sessLimit = {},
+                bool enabledDiskAgg = false,
+                bool allowGenerations = false)
+      : RowAggStorage(tmpDir, rowGroupOut, rowGroupOut, keyCount,
+                      rm, std::move(sessLimit),
+                      enabledDiskAgg, allowGenerations)
+  {}
+
+  ~RowAggStorage();
+
+  // Returns 8192 when disk aggregation is enabled and 256 otherwise.
+  // NOTE(review): presumably the per-RGData row capacity -- confirm against callers.
+  static uint16_t getMaxRows(bool enabledDiskAgg)
+  {
+    if (enabledDiskAgg)
+    {
+      return 8192;
+    }
+    return 256;
+  }
+
+  static size_t getBucketSize();
+
+  /** @brief Find or create resulting row.
+   *
+   *    Create "aggregation key" row if necessary.
+   *    NB! Using getTargetRow() after append() is UB!
+   *
+   *  @param row(in)  input row
+   *  @param rowOut(out) target row into which the input row's data is aggregated
+   *
+   *  @returns true if new row created, false otherwise
+   */
+  bool getTargetRow(const Row& row, Row& rowOut);
+  bool getTargetRow(const Row& row, uint64_t row_hash, Row& rowOut);
+
+  /** @brief Dump some RGDatas to disk and release memory for further use.
+   */
+  void dump();
+
+  /** @brief Append RGData from other RowAggStorage and clear it.
+   *
+   *    NB! Any operation except getNextRGData() or append() is UB!
+   *
+   * @param other(in) donor storage
+   */
+  void append(RowAggStorage& other);
+
+  /** @brief Remove last RGData from internal RGData storage and return it.
+   *
+   * @returns pointer to the next RGData or nullptr if empty
+   */
+  std::unique_ptr<RGData> getNextRGData();
+
+  /** @brief TODO
+   *
+   * @param mergeFunc
+   * @param rowOut
+   */
+  void finalize(std::function<void(Row &)> mergeFunc, Row &rowOut);
+
+  /** @brief Calculate maximum size of hash assuming 80% fullness.
+   *
+   * @param elems(in) number of elements
+   * @returns calculated size
+   */
+  inline static size_t calcMaxSize(size_t elems) noexcept
+  {
+    // Above this bound the code divides before multiplying, trading a little
+    // precision for not overflowing the intermediate product.
+    constexpr size_t multiplyFirstLimit = std::numeric_limits<size_t>::max() / 100;
+
+    return LIKELY(elems <= multiplyFirstLimit) ? (elems * 80 / 100)
+                                               : ((elems / 100) * 80);
+  }
+
+  /** @brief Add the overflow buffer to the number of elements.
+   *
+   *    The buffer is maxSize capped at 0xFF entries.
+   *
+   * @param elems(in)   number of elements
+   * @param maxSize(in) maximum number of stored elements (see calcMaxSize())
+   * @returns elems plus min(maxSize, 0xFF)
+   */
+  inline static size_t calcSizeWithBuffer(size_t elems, size_t maxSize) noexcept
+  {
+    // Name the template argument explicitly: size_t is not `unsigned long` on
+    // every platform, and std::min cannot deduce a type from mixed arguments.
+    return elems + std::min<size_t>(maxSize, 0xFF);
+  }
+
+  // Same as the two-argument overload, with maxSize derived from calcMaxSize(elems).
+  inline static size_t calcSizeWithBuffer(size_t elems) noexcept
+  {
+    const size_t maxSize = calcMaxSize(elems);
+    return calcSizeWithBuffer(elems, maxSize);
+  }
+
+private:
+  struct Data;
+  /** @brief Create new RowAggStorage with the same params and load dumped data
+   *
+   * @param gen(in) generation number
+   * @return pointer to a new RowAggStorage
+   */
+  RowAggStorage* clone(uint16_t gen) const;
+
+  /** @brief Free any internal data
+   */
+  void freeData();
+
+  /** @brief Move internal data & row position inside [insIdx, startIdx] up by 1.
+   *
+   * @param startIdx(in) last element's index to move
+   * @param insIdx(in)   first element's index to move
+   */
+  void shiftUp(size_t startIdx, size_t insIdx);
+
+  /** @brief Find best position of row and save its hash.
+   *
+   * @param row(in)   input row
+   * @param info(out) info data
+   * @param idx(out)  index computed from row hash
+   * @param hash(out) row hash value
+   */
+  void rowToIdx(const Row& row, uint32_t& info, size_t& idx, uint64_t& hash) const;
+  void rowToIdx(const Row& row, uint32_t& info, size_t& idx, uint64_t& hash, const Data* curData) const;
+
+  /** @brief Find best position using precomputed hash
+   *
+   * @param h(in)       row hash
+   * @param info(out)   info data
+   * @param idx(out)    index
+   * @param curData(in) table data supplying the mask, info increment and hash shift
+   */
+  inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx, const Data* curData) const
+  {
+    // Low INIT_INFO_BITS bits of the hash feed the info value ...
+    const auto lowHashBits = (h & INFO_MASK) >> curData->fInfoHashShift;
+    info = curData->fInfoInc + static_cast<uint32_t>(lowHashBits);
+
+    // ... and the remaining bits, masked, select the slot.
+    const auto highHashBits = h >> INIT_INFO_BITS;
+    idx = highHashBits & curData->fMask;
+  }
+
+  // Overload operating on the current data (fCurData).
+  inline void rowHashToIdx(uint64_t h, uint32_t& info, size_t& idx) const
+  {
+    rowHashToIdx(h, info, idx, fCurData);
+  }
+
+  /** @brief Iterate over internal info until info with less-or-equal distance
+   *         from the best position was found.
+   *
+   * @param info(in,out) info data
+   * @param idx(in,out)  index
+   */
+  inline void nextWhileLess(uint32_t& info, size_t& idx, const Data* curData) const noexcept
+  {
+    // Keep stepping while the stored info byte is still greater than ours.
+    for (; info < curData->fInfo[idx]; next(info, idx, curData))
+    {
+    }
+  }
+
+  // Overload operating on the current data (fCurData).
+  inline void nextWhileLess(uint32_t& info, size_t& idx) const noexcept
+  {
+    nextWhileLess(info, idx, fCurData);
+  }
+
+  /** @brief Get next index and corresponding info
+   */
+  inline void next(uint32_t& info, size_t& idx, const Data* curData) const noexcept
+  {
+    // One probe step: advance the slot and grow info by the current increment.
+    idx += 1;
+    info = info + curData->fInfoInc;
+  }
+
+  // Overload operating on the current data (fCurData).
+  inline void next(uint32_t& info, size_t& idx) const noexcept
+  {
+    next(info, idx, fCurData);
+  }
+
+  /** @brief Get index and info of the next non-empty entry
+   */
+  inline void nextExisting(uint32_t& info, size_t& idx) const noexcept
+  {
+    // Scan the info bytes one 64-bit word at a time; an all-zero word means
+    // eight consecutive empty entries, so the whole word is skipped.
+    // NOTE(review): termination presumably relies on a non-zero byte being
+    // present at or after idx -- confirm against initData().
+    uint64_t data;
+    memcpy(&data, fCurData->fInfo + idx, sizeof(data));
+    while (data == 0)
+    {
+      idx += sizeof(data);
+      memcpy(&data, fCurData->fInfo + idx, sizeof(data));
+    }
+
+    // Find the first non-zero byte inside the word (bit offset / 8 bits per
+    // byte).  Use the compiler-predefined byte-order macros: an undefined
+    // BYTE_ORDER/BIG_ENDIAN pair would compare as 0 == 0 and silently select
+    // the big-endian branch.
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    idx += static_cast<size_t>(__builtin_clzll(data)) / 8;
+#else
+    idx += static_cast<size_t>(__builtin_ctzll(data)) / 8;
+#endif
+    info = fCurData->fInfo[idx];
+  }
+
+  /** @brief Increase internal data size if needed
+   */
+  void increaseSize();
+
+  /** @brief Increase distance capacity of info removing 1 bit of the hash.
+   *
+   * @returns success
+   */
+  bool tryIncreaseInfo();
+
+  /** @brief Reserve space for number of elements (power of two)
+   *
+   *    This function re-inserts all the data.
+   *
+   * @param elems(in)   new size
+   */
+  void rehashPowerOfTwo(size_t elems);
+
+  /** @brief Move elements from old one into rehashed data.
+   *
+   *    It's mostly the same algo as in getTargetRow(), but returns nothing
+   *    and skips some checks because it's guaranteed that there are no duplicates.
+   *
+   * @param oldIdx(in)    index of "old" data
+   * @param oldHashes(in) old storage of row positions and hashes
+   */
+  void insertSwap(size_t oldIdx, RowPosHashStorage* oldHashes);
+
+  /** @brief (Re)Initialize internal data of specified size.
+   *
+   * @param elems(in) number of elements
+   */
+  void initData(size_t elems, const RowPosHashStorage* oldHashes);
+
+  /** @brief Calculate memory size of info data
+   *
+   * @param elems(in) number of elements
+   * @returns size in bytes
+   */
+  inline static size_t calcBytes(size_t elems) noexcept
+  {
+    // One byte per element plus one extra 64-bit word.
+    // NOTE(review): the extra word presumably lets nextExisting() read whole
+    // 64-bit words near the end of the info array -- confirm.
+    constexpr size_t extraBytes = sizeof(uint64_t);
+    return elems + extraBytes;
+  }
+
+  /** @brief Reserve place sufficient for elems
+   *
+   * @param elems(in) number of elements
+   */
+  void reserve(size_t elems);
+
+  /** @brief Start new aggregation generation
+   *
+   * Dump all the data to disk, including internal info data, positions & row
+   * hashes, and the rowgroups themselves.
+   */
+  void startNewGeneration();
+
+  /** @brief Save internal info data on disk */
+  void dumpInternalData() const;
+
+  /** @brief Load previously dumped data from disk
+   *
+   * @param gen(in) generation number
+   */
+  void loadGeneration(uint16_t gen);
+  /** @brief Load previously dumped data into the tmp storage */
+  void loadGeneration(uint16_t gen, size_t& size, size_t& mask, size_t& maxSize, uint32_t& infoInc, uint32_t& infoHashShift, uint8_t*& info);
+
+  /** @brief Remove temporary data files */
+  void cleanup();
+  void cleanup(uint16_t gen);
+
+  /** @brief Remove all temporary data files */
+  void cleanupAll() noexcept;
+
+  std::string makeDumpFilename(int32_t gen = -1) const;
+
+private:
+  // NOTE(review): presumably the initial number of hash slots -- confirm in rowstorage.cpp.
+  static constexpr size_t   INIT_SIZE{sizeof(uint64_t)};
+  // Number of low hash bits skipped when deriving the slot index (see rowHashToIdx()).
+  static constexpr uint32_t INIT_INFO_BITS{5};
+  // 1 << INIT_INFO_BITS; default value of Data::fInfoInc.
+  static constexpr uint8_t  INIT_INFO_INC{1U << INIT_INFO_BITS};
+  // Mask selecting the low INIT_INFO_BITS bits of a hash.
+  static constexpr size_t   INFO_MASK{INIT_INFO_INC - 1U};
+  // Default value of Data::fInfoHashShift.
+  static constexpr uint8_t  INIT_INFO_HASH_SHIFT{0};
+  // NOTE(review): presumably the cap on generations kept in memory -- confirm in rowstorage.cpp.
+  static constexpr uint16_t MAX_INMEMORY_GENS{4};
+
+  // Hash-table state; instances are held in fGens and addressed via fCurData.
+  struct Data
+  {
+    // Storage of row positions and hashes.
+    RowPosHashStoragePtr fHashes;
+    // Per-entry info bytes; nextExisting() skips zero bytes as empty entries.
+    uint8_t* fInfo = nullptr;
+    size_t fSize = 0;
+    // Applied to the shifted hash to obtain the slot index.
+    size_t fMask = 0;
+    size_t fMaxSize = 0;
+    // Added to `info` on every probe step.
+    uint32_t fInfoInc = INIT_INFO_INC;
+    // Right shift applied to the low hash bits when building `info`.
+    uint32_t fInfoHashShift = INIT_INFO_HASH_SHIFT;
+  };
+  std::vector<std::unique_ptr<Data>> fGens;
+  Data* fCurData;
+  uint32_t fMaxRows;
+  const bool fExtKeys;
+
+  std::unique_ptr<RowGroupStorage> fStorage;
+  RowGroupStorage* fKeysStorage;
+  uint32_t fLastKeyCol;
+
+  uint16_t fGeneration{0};
+  void* fUniqId;
+
+  Row fKeyRow;
+
+  std::unique_ptr<MemManager> fMM;
+  uint32_t fNumOfInputRGPerThread;
+  bool fAggregated = true;
+  bool fAllowGenerations;
+  bool fEnabledDiskAggregation;
+  std::string fTmpDir;
+  bool fInitialized{false};
+  rowgroup::RowGroup* fRowGroupOut;
+  rowgroup::RowGroup* fKeysRowGroup;
+};
+
+} // namespace rowgroup
+
+#endif // ROWSTORAGE_H