
[MCOL-4849] Parallelize the processing of the bytestream vector.

This patch changes the logic of the `receiveMultiPrimitiveMessages`
function in the following way:

1. There is now a single aggregation thread that reads data from the queue
   (which is populated with messages from the BPPs).
2. Processing of the received `bytestream vector` can run in parallel,
   depending on the type of `TupleBPS` operation (join, fe2, ...) and the
   actual thread pool workload.

The motivation is to reduce the number of context switches.
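As a rough illustration of the pattern (a minimal sketch; `aggregationThread`, `processRange`, `batchQueue`, and the plain `std::thread` fan-out are made-up stand-ins, not the actual TupleBPS code):

#include <algorithm>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

using ByteStream = std::vector<uint8_t>;        // stand-in for messageqcpp::ByteStream

std::queue<std::vector<ByteStream>> batchQueue; // filled with BPP responses by a producer
std::mutex mtx;
std::condition_variable cv;
bool finished = false;                          // set by the producer when no more batches come

// [begin, end) -- `end` is non-inclusive, mirroring processByteStreamVector.
void processRange(const std::vector<ByteStream>& bsv, uint32_t begin, uint32_t end)
{
    for (uint32_t i = begin; i < end; ++i)
        std::cout << "processed " << bsv[i].size() << " bytes\n";
}

// The single aggregation thread: drain the queue; fan a batch out to worker
// threads only when the step is parallelizable and spare threads exist.
void aggregationThread(uint32_t numProcessorThreads)
{
    for (;;)
    {
        std::vector<ByteStream> bsv;
        {
            std::unique_lock<std::mutex> lk(mtx);
            cv.wait(lk, [] { return finished || !batchQueue.empty(); });
            if (batchQueue.empty())
                return;                         // producer signalled completion
            bsv = std::move(batchQueue.front());
            batchQueue.pop();
        }
        const uint32_t n = static_cast<uint32_t>(bsv.size());
        if (numProcessorThreads <= 1)
        {
            processRange(bsv, 0, n);            // cheap step: stay on this thread
            continue;
        }
        // Split the batch into near-equal chunks and process them in parallel.
        std::vector<std::thread> workers;
        const uint32_t chunk = (n + numProcessorThreads - 1) / numProcessorThreads;
        for (uint32_t begin = 0; begin < n; begin += chunk)
        {
            const uint32_t end = std::min(begin + chunk, n);
            workers.emplace_back(processRange, std::cref(bsv), begin, end);
        }
        for (auto& w : workers)
            w.join();
    }
}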
Denis Khalikov
2021-10-13 15:31:05 +03:00
parent 650d45fcc1
commit b382f681a1
4 changed files with 786 additions and 665 deletions

View File

@@ -1075,6 +1075,24 @@ public:
virtual void setFE23Output(const rowgroup::RowGroup& rg) = 0;
};
struct _CPInfo
{
_CPInfo(int64_t MIN, int64_t MAX, uint64_t l, bool val) : min(MIN), max(MAX), LBID(l), valid(val){};
_CPInfo(int128_t BIGMIN, int128_t BIGMAX, uint64_t l, bool val)
: bigMin(BIGMIN), bigMax(BIGMAX), LBID(l), valid(val){};
union
{
int128_t bigMin;
int64_t min;
};
union
{
int128_t bigMax;
int64_t max;
};
uint64_t LBID;
bool valid;
};
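A short usage sketch of the unions above (illustrative only; `collectCP` is a made-up helper, and `int128_t` is assumed to be ColumnStore's 128-bit integer type):

// Casual-partitioning info carries either a 64-bit or a 128-bit extent
// range in the same footprint; the constructor overload picks the member.
void collectCP(std::vector<_CPInfo>& cpv, bool wideDecimal, uint64_t lbid)
{
    if (wideDecimal)
    {
        int128_t lo = 0, hi = 0;              // wide-decimal extent min/max
        cpv.emplace_back(lo, hi, lbid, true); // fills bigMin/bigMax
    }
    else
    {
        int64_t lo = 0, hi = 0;               // ordinary 64-bit extent min/max
        cpv.emplace_back(lo, hi, lbid, true); // fills min/max
    }
}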
/** @brief class TupleBPS
*
@@ -1113,7 +1131,12 @@ public:
*
* The main loop for the receive-side thread. Don't call it directly.
*/
void receiveMultiPrimitiveMessages(uint32_t threadID);
void receiveMultiPrimitiveMessages();
// Processes the vector of `bytestream`s from index `begin` to index `end`, non-inclusive.
void processByteStreamVector(vector<boost::shared_ptr<messageqcpp::ByteStream>>& bsv,
const uint32_t begin, const uint32_t end, vector<_CPInfo>& cpv,
RowGroupDL* dlp, const uint32_t threadID);
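For clarity, the `[begin, end)` convention can be expressed as a small partitioning helper (an illustration, not code from the patch):

#include <algorithm>
#include <cstdint>
#include <utility>

// Slice handled by thread `tid` out of `nThreads`; slice sizes differ by at most 1.
std::pair<uint32_t, uint32_t> sliceFor(uint32_t tid, uint32_t nThreads, uint32_t vecSize)
{
    const uint32_t chunk = vecSize / nThreads;
    const uint32_t rem = vecSize % nThreads;
    const uint32_t begin = tid * chunk + std::min(tid, rem);
    const uint32_t end = begin + chunk + (tid < rem ? 1 : 0);
    return {begin, end};                      // `end` is non-inclusive
}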
/** @brief Add a filter when the column is anything but a 4-byte float type.
*
@@ -1352,6 +1375,10 @@ private:
void startPrimitiveThread();
void startAggregationThread();
// Starts a thread that processes the `bytestream` vector from index `begin` to index `end`, non-inclusive.
void startProcessingThread(TupleBPS* tbps, vector<boost::shared_ptr<messageqcpp::ByteStream>>& bsv,
const uint32_t begin, const uint32_t end, vector<_CPInfo>& cpv, RowGroupDL* dlp,
const uint32_t threadID);
void initializeConfigParms();
uint64_t getFBO(uint64_t lbid);
void checkDupOutputColumns(const rowgroup::RowGroup& rg);
@@ -1359,7 +1386,6 @@ private:
void dupOutputColumns(rowgroup::RGData&, rowgroup::RowGroup&);
void rgDataToDl(rowgroup::RGData&, rowgroup::RowGroup&, RowGroupDL*);
void rgDataVecToDl(std::vector<rowgroup::RGData>&, rowgroup::RowGroup&, RowGroupDL*);
DistributedEngineComm* fDec;
boost::shared_ptr<BatchPrimitiveProcessorJL> fBPP;
uint16_t fNumSteps;
@@ -1373,6 +1399,7 @@ private:
PrimitiveStepType ffirstStepType;
bool isFilterFeeder;
std::vector<uint64_t> fProducerThreads; // thread pool handles
std::vector<uint64_t> fProcessorThreads;
messageqcpp::ByteStream fFilterString;
uint32_t fFilterCount;
execplan::CalpontSystemCatalog::ColType fColType;
@@ -1470,13 +1497,10 @@ private:
rowgroup::RGData fe2Data;
rowgroup::Row fe2InRow, fe2OutRow;
void processFE2(rowgroup::RowGroup& input, rowgroup::RowGroup& output,
rowgroup::Row& inRow, rowgroup::Row& outRow,
std::vector<rowgroup::RGData>* rgData,
funcexp::FuncExpWrapper* localFE2);
void processFE2_oneRG(rowgroup::RowGroup& input, rowgroup::RowGroup& output,
rowgroup::Row& inRow, rowgroup::Row& outRow,
funcexp::FuncExpWrapper* localFE2);
void processFE2(rowgroup::RowGroup& input, rowgroup::RowGroup& output, rowgroup::Row& inRow, rowgroup::Row& outRow,
std::vector<rowgroup::RGData>* rgData, funcexp::FuncExpWrapper* localFE2);
void processFE2_oneRG(rowgroup::RowGroup& input, rowgroup::RowGroup& output, rowgroup::Row& inRow,
rowgroup::Row& outRow, funcexp::FuncExpWrapper* localFE2);
/* Runtime Casual Partitioning adjustments. The CP code is needlessly complicated;
* to avoid making it worse, decided to designate 'scanFlags' as the static
@@ -1491,11 +1515,98 @@ private:
boost::shared_ptr<RowGroupDL> deliveryDL;
uint32_t deliveryIt;
struct JoinLocalData
{
JoinLocalData() = delete;
JoinLocalData(const JoinLocalData&) = delete;
JoinLocalData(JoinLocalData&&) = delete;
JoinLocalData& operator=(const JoinLocalData&) = delete;
JoinLocalData& operator=(JoinLocalData&&) = delete;
~JoinLocalData() = default;
JoinLocalData(rowgroup::RowGroup& primRowGroup, rowgroup::RowGroup& outputRowGroup,
boost::shared_ptr<funcexp::FuncExpWrapper>& fe2, rowgroup::RowGroup& fe2Output,
std::vector<rowgroup::RowGroup>& joinerMatchesRGs, rowgroup::RowGroup& joinFERG,
std::vector<boost::shared_ptr<joiner::TupleJoiner>>& tjoiners, uint32_t smallSideCount,
bool doJoin);
rowgroup::RowGroup local_primRG;
rowgroup::RowGroup local_outputRG;
uint32_t cachedIO_Thread = 0;
uint32_t physIO_Thread = 0;
uint32_t touchedBlocks_Thread = 0;
int64_t ridsReturned_Thread = 0;
// On init.
bool doJoin;
boost::shared_ptr<funcexp::FuncExpWrapper> fe2;
rowgroup::RowGroup fe2Output;
uint32_t smallSideCount;
std::vector<rowgroup::RowGroup> joinerMatchesRGs;
rowgroup::RowGroup joinFERG;
std::vector<boost::shared_ptr<joiner::TupleJoiner>> tjoiners;
// Join vars.
vector<vector<rowgroup::Row::Pointer>> joinerOutput;
rowgroup::Row largeSideRow;
rowgroup::Row joinedBaseRow;
rowgroup::Row largeNull;
rowgroup::Row joinFERow; // LSR clean
boost::scoped_array<rowgroup::Row> smallSideRows;
boost::scoped_array<rowgroup::Row> smallNulls;
boost::scoped_array<uint8_t> joinedBaseRowData;
boost::scoped_array<uint8_t> joinFERowData;
boost::shared_array<int> largeMapping;
vector<boost::shared_array<int>> smallMappings;
vector<boost::shared_array<int>> fergMappings;
rowgroup::RGData joinedData;
boost::scoped_array<uint8_t> largeNullMemory;
boost::scoped_array<boost::shared_array<uint8_t>> smallNullMemory;
uint32_t matchCount;
rowgroup::Row postJoinRow;
rowgroup::RowGroup local_fe2Output;
rowgroup::RGData local_fe2Data;
rowgroup::Row local_fe2OutRow;
funcexp::FuncExpWrapper local_fe2;
};
// The actual value is initialized in TupleBPS::initializeConfigParms().
uint32_t fMaxNumProcessorThreads = 16;
// Set at runtime for each `TupleBPS` operation, based on the `ThreadPool` workload.
uint32_t fNumProcessorThreads = 1;
std::shared_ptr<JoinLocalData> getJoinLocalDataByIndex(uint32_t index)
{
idbassert(index < fNumProcessorThreads && joinLocalDataPool.size() == fNumProcessorThreads);
return joinLocalDataPool[index];
}
void initializeJoinLocalDataPool(uint32_t numThreads)
{
idbassert(numThreads <= fMaxNumProcessorThreads);
for (uint32_t i = 0; i < numThreads; ++i)
{
joinLocalDataPool.push_back(std::shared_ptr<JoinLocalData>(
new JoinLocalData(primRowGroup, outputRowGroup, fe2, fe2Output, joinerMatchesRGs, joinFERG,
tjoiners, smallSideCount, doJoin)));
}
fNumProcessorThreads = numThreads;
}
// Join local data vector.
std::vector<std::shared_ptr<JoinLocalData>> joinLocalDataPool;
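The point of the pool is that each processing thread owns one `JoinLocalData` slot, so the join scratch state needs no locking. A hypothetical worker body (the method name `processSlice` is made up for illustration):

void TupleBPS::processSlice(uint32_t threadID)    // illustrative, not a real method
{
    std::shared_ptr<JoinLocalData> local = getJoinLocalDataByIndex(threadID);
    local->cachedIO_Thread = 0;                   // per-thread I/O counters
    local->physIO_Thread = 0;
    local->touchedBlocks_Thread = 0;
    // ... perform joins through local->tjoiners into local->joinedData,
    // then fold the per-thread counters back under a single lock at the end.
}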
/* shared nothing support */
struct Job
{
Job(uint32_t d, uint32_t n, uint32_t b, boost::shared_ptr<messageqcpp::ByteStream>& bs) :
dbroot(d), connectionNum(n), expectedResponses(b), msg(bs) { }
Job(uint32_t d, uint32_t n, uint32_t b, boost::shared_ptr<messageqcpp::ByteStream>& bs)
: dbroot(d), connectionNum(n), expectedResponses(b), msg(bs)
{
}
uint32_t dbroot;
uint32_t connectionNum;
uint32_t expectedResponses;
@@ -1523,7 +1634,6 @@ private:
bool compareRange(uint8_t COP, int64_t min, int64_t max, int64_t val) const;
bool hasPCFilter, hasPMFilter, hasRIDFilter, hasSegmentFilter, hasDBRootFilter, hasSegmentDirFilter,
hasPartitionFilter, hasMaxFilter, hasMinFilter, hasLBIDFilter, hasExtentIDFilter;
};
/** @brief class FilterStep

File diff suppressed because it is too large.

View File

@@ -908,8 +908,7 @@ void TupleJoiner::setInUM(vector<RGData> &rgs)
}
}
void TupleJoiner::setPMJoinResults(boost::shared_array<vector<uint32_t> > jr,
uint32_t threadID)
void TupleJoiner::setPMJoinResults(boost::shared_array<vector<uint32_t>> jr, uint32_t threadID)
{
pmJoinResults[threadID] = jr;
}

View File

@@ -225,6 +225,10 @@ public:
return fMaxThreads;
}
/** @brief get the number of issued threads
*/
inline size_t getIssuedThreads() { return fIssued; }
/** @brief queue size accessor
*
*/
@@ -292,6 +296,7 @@ public:
fDebug = d;
}
friend class ThreadPoolMonitor;
protected:
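The new accessor is what lets TupleBPS size `fNumProcessorThreads` from the pool's current load. One plausible sizing rule (an assumption for illustration: the accessor returning `fMaxThreads` is assumed to be named `getMaxThreads()`, and the formula is not taken from the patch):

#include <algorithm>
#include <cstddef>
#include <cstdint>

uint32_t pickProcessorThreads(threadpool::ThreadPool& pool, uint32_t maxNumProcessorThreads)
{
    const size_t issued = pool.getIssuedThreads();   // accessor added by this patch
    const size_t capacity = pool.getMaxThreads();    // assumed name of the fMaxThreads accessor
    const size_t spare = capacity > issued ? capacity - issued : 0;
    // Always keep at least one thread so the TupleBPS can make progress.
    return std::max<uint32_t>(1, std::min<uint32_t>(maxNumProcessorThreads,
                                                    static_cast<uint32_t>(spare)));
}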