
feat(joblist,runtime): the first part of the execution model that produces a predictable workload for a given query.

* feat(joblist,runtime): the first part of the execution model that produces a predictable workload for a given query.
  - forces the UM join converter to use a value from the configuration
  - replaces the constant that caps the number of outstanding requests with a value that depends on the column width (see the sketch below)
  - modifies the related Columnstore.xml values
This commit is contained in:
drrtuy, 2024-12-03 22:17:49 +00:00 (committed by GitHub)
parent bba2133cd0
commit 6445f4dff3
9 changed files with 76 additions and 74 deletions
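
The core runtime change is in TupleBPS::sendJobs() (see the tuple-bps.cpp hunks below): the fixed shift by LOGICAL_EXTENT_CONVERTER that bounded outstanding requests is replaced by a bound scaled by the new blocksPerJob member. The sketch below is a minimal illustration, not the engine's code: blocksPerJobForWidth() and its scaling rule are assumptions, while shouldPauseProducer() mirrors the condition visible in the diff.

#include <cstdint>

// Hypothetical helper, for illustration only: the commit makes the number of
// blocks per job depend on the scanned column width; the real derivation lives
// inside TupleBPS and is not reproduced here.
inline uint32_t blocksPerJobForWidth(uint32_t columnWidthBytes)
{
  const uint32_t kMinBlocksPerJob = 16;        // matches "min(blocksPerJob) = 16" in the diff
  return kMinBlocksPerJob * columnWidthBytes;  // assumed scaling, not the shipped formula
}

// Mirrors the new producer-side pause condition in TupleBPS::sendJobs():
//   msgsSent - msgsRecvd > fMaxOutstandingRequests * (blocksPerJob >> 1)
inline bool shouldPauseProducer(uint64_t msgsSent, uint64_t msgsRecvd,
                                uint64_t maxOutstandingRequests, uint32_t blocksPerJob)
{
  return (msgsSent - msgsRecvd) > maxOutstandingRequests * (blocksPerJob >> 1);
}

Presumably a wider column packs fewer values per block, so a job spans more blocks and the producer is allowed a proportionally larger window of in-flight messages before it waits for the consumer to catch up.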


@@ -68,6 +68,7 @@ using namespace oam;
using namespace joblist;
#include "atomicops.h"
#include "threadnaming.h"
namespace
{
@@ -131,6 +132,7 @@ struct EngineCommRunner
uint32_t connIndex;
void operator()()
{
utils::setThreadName("DECRunner");
// cout << "Listening on client at 0x" << hex << (ptrdiff_t)client << dec << endl;
try
{


@@ -112,12 +112,12 @@ class pColStep : public JobStep
*
* Starts processing. Set at least the RID list before calling this.
*/
virtual void run(){};
virtual void run() {};
/** @brief Sync's the caller with the end of execution.
*
* Does nothing. Returns when this instance is finished.
*/
virtual void join(){};
virtual void join() {};
virtual const std::string toString() const;
@@ -1459,6 +1459,8 @@ class TupleBPS : public BatchPrimitive, public TupleDeliveryStep
void interleaveJobs(std::vector<Job>* jobs) const;
void sendJobs(const std::vector<Job>& jobs);
uint32_t numDBRoots;
// presumably there must be not more than 2^32 blocks per job as of 23.02.
uint32_t blocksPerJob;
/* Pseudo column filter processing. Think about refactoring into a separate class. */
bool processPseudoColFilters(uint32_t extentIndex, boost::shared_ptr<std::map<int, int>> dbRootPMMap) const;


@@ -19,7 +19,7 @@
// $Id: tuple-bps.cpp 9705 2013-07-17 20:06:07Z pleblanc $
#include <unistd.h>
//#define NDEBUG
// #define NDEBUG
#include <cassert>
#include <sstream>
#include <iomanip>
@@ -77,7 +77,9 @@ using namespace querytele;
#include "columnwidth.h"
#include "pseudocolumn.h"
//#define DEBUG 1
// #define DEBUG 1
// #include "poormanprofiler.inc"
extern boost::mutex fileLock_g;
@@ -396,15 +398,6 @@ void TupleBPS::initializeConfigParms()
{
string strVal;
//...Get the tuning parameters that throttle msgs sent to primproc
//...fFilterRowReqLimit puts a cap on how many rids we will request from
//... primproc, before pausing to let the consumer thread catch up.
//... Without this limit, there is a chance that PrimProc could flood
//... ExeMgr with thousands of messages that will consume massive
//... amounts of memory for a 100 gigabyte database.
//...fFilterRowReqThreshold is the level at which the number of outstanding
//... rids must fall below, before the producer can send more rids.
// These could go in constructor
fRequestSize = fRm->getJlRequestSize();
fMaxOutstandingRequests = fRm->getJlMaxOutstandingRequests();
@@ -556,14 +549,14 @@ TupleBPS::TupleBPS(const pColScanStep& rhs, const JobInfo& jobInfo) : BatchPrimi
throw runtime_error(oss.str());
}
catch(std::exception& ex)
catch (std::exception& ex)
{
std::ostringstream oss;
oss << "Error getting AUX column OID for table " << tableName.toString();
oss << " due to: " << ex.what();
throw runtime_error(oss.str());
}
catch(...)
catch (...)
{
std::ostringstream oss;
oss << "Error getting AUX column OID for table " << tableName.toString();
@@ -1684,7 +1677,8 @@ void TupleBPS::sendJobs(const vector<Job>& jobs)
if (recvWaiting)
condvar.notify_all();
while ((msgsSent - msgsRecvd > fMaxOutstandingRequests << LOGICAL_EXTENT_CONVERTER) && !fDie)
// Send not more than fMaxOutstandingRequests jobs out. min(blocksPerJob) = 16
while ((msgsSent - msgsRecvd > fMaxOutstandingRequests * (blocksPerJob >> 1)) && !fDie)
{
sendWaiting = true;
condvarWakeupProducer.wait(tplLock);
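
For intuition about the bound above: at the documented minimum blocksPerJob of 16, the producer may run fMaxOutstandingRequests * 8 messages ahead of the consumer before it waits. The concrete numbers below are assumptions chosen only to make the arithmetic visible; they are not the defaults shipped in Columnstore.xml.

#include <cstdint>

// Illustration only; both inputs are assumed values, not shipped defaults.
constexpr uint64_t maxOutstandingRequests = 20;  // assumed config value
constexpr uint32_t blocksPerJobNarrow = 16;      // minimum per the comment above
constexpr uint32_t blocksPerJobWide = 128;       // assumed wide-column job size
static_assert(maxOutstandingRequests * (blocksPerJobNarrow >> 1) == 160, "narrow-column window");
static_assert(maxOutstandingRequests * (blocksPerJobWide >> 1) == 1280, "wide-column window");
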
@@ -2007,7 +2001,6 @@ void TupleBPS::makeJobs(vector<Job>* jobs)
uint32_t i;
uint32_t lbidsToScan;
uint32_t blocksToScan;
uint32_t blocksPerJob;
LBID_t startingLBID;
oam::OamCache* oamCache = oam::OamCache::makeOamCache();
boost::shared_ptr<map<int, int>> dbRootConnectionMap = oamCache->getDBRootToConnectionMap();
@@ -2227,6 +2220,8 @@ void TupleBPS::processByteStreamVector(vector<boost::shared_ptr<messageqcpp::Byt
// changes made here should also be made there and vice versa.
if (hasUMJoin || !fBPP->pmSendsFinalResult())
{
utils::setThreadName("BSPJoin");
data->joinedData = RGData(data->local_outputRG);
data->local_outputRG.setData(&data->joinedData);
data->local_outputRG.resetRowGroup(data->local_primRG.getBaseRid());
@@ -2340,6 +2335,8 @@ void TupleBPS::processByteStreamVector(vector<boost::shared_ptr<messageqcpp::Byt
{
rgDatav.push_back(data->joinedData);
}
utils::setThreadName("ByteStreamProcessor");
}
else
{
@@ -2351,6 +2348,7 @@ void TupleBPS::processByteStreamVector(vector<boost::shared_ptr<messageqcpp::Byt
memAmount = 0;
}
utils::setThreadName("BSPFE2");
// Execute UM F & E group 2 on rgDatav
if (fe2 && !bRunFEonPM && rgDatav.size() > 0 && !cancelled())
{
@@ -2358,6 +2356,8 @@ void TupleBPS::processByteStreamVector(vector<boost::shared_ptr<messageqcpp::Byt
rgDataVecToDl(rgDatav, data->local_fe2Output, dlp);
}
utils::setThreadName("ByteStreamProcessor");
data->cachedIO_Thread += cachedIO;
data->physIO_Thread += physIO;
data->touchedBlocks_Thread += touchedBlocks;
@@ -2777,8 +2777,7 @@ void TupleBPS::receiveMultiPrimitiveMessages()
<< totalBlockedReadCount << "/" << totalBlockedWriteCount << "; output size-" << ridsReturned
<< endl
<< "\tPartitionBlocksEliminated-" << fNumBlksSkipped << "; MsgBytesIn-" << msgBytesInKB << "KB"
<< "; MsgBytesOut-" << msgBytesOutKB << "KB"
<< "; TotalMsgs-" << totalMsgs << endl
<< "; MsgBytesOut-" << msgBytesOutKB << "KB" << "; TotalMsgs-" << totalMsgs << endl
<< "\t1st read " << dlTimes.FirstReadTimeString() << "; EOI " << dlTimes.EndOfInputTimeString()
<< "; runtime-" << JSTimeStamp::tsdiffstr(dlTimes.EndOfInputTime(), dlTimes.FirstReadTime())
<< "s\n\tUUID " << uuids::to_string(fStepUuid) << "\n\tQuery UUID "
@@ -3179,9 +3178,8 @@ bool TupleBPS::deliverStringTableRowGroup() const
void TupleBPS::formatMiniStats()
{
ostringstream oss;
oss << "BPS "
<< "PM " << alias() << " " << fTableOid << " " << fBPP->toMiniString() << " " << fPhysicalIO << " "
<< fCacheIO << " " << fNumBlksSkipped << " "
oss << "BPS " << "PM " << alias() << " " << fTableOid << " " << fBPP->toMiniString() << " " << fPhysicalIO
<< " " << fCacheIO << " " << fNumBlksSkipped << " "
<< JSTimeStamp::tsdiffstr(dlTimes.EndOfInputTime(), dlTimes.FirstReadTime()) << " " << ridsReturned
<< " ";


@@ -18,7 +18,7 @@
// $Id: tupleannexstep.cpp 9661 2013-07-01 20:33:05Z pleblanc $
//#define NDEBUG
// #define NDEBUG
#include <cassert>
#include <sstream>
#include <iomanip>
@@ -251,10 +251,6 @@ void TupleAnnexStep::run()
fRunnersList.resize(fMaxThreads);
fInputIteratorsList.resize(fMaxThreads + 1);
// Activate stats collecting before CS spawns threads.
if (traceOn())
dlTimes.setFirstReadTime();
// *DRRTUY Make this block conditional
StepTeleStats sts;
sts.query_uuid = fQueryUuid;
@@ -858,7 +854,7 @@ void TupleAnnexStep::finalizeParallelOrderByDistinct()
break;
}
} // end of limit bound while loop
} // end of if-else
} // end of if-else
if (fRowGroupOut.getRowCount() > 0)
{
@@ -1045,7 +1041,7 @@ void TupleAnnexStep::finalizeParallelOrderBy()
break;
}
} // end of limit bound while loop
} // end of if-else
} // end of if-else
if (fRowGroupOut.getRowCount() > 0)
{
@@ -1065,9 +1061,6 @@ void TupleAnnexStep::finalizeParallelOrderBy()
if (traceOn())
{
if (dlTimes.FirstReadTime().tv_sec == 0)
dlTimes.setFirstReadTime();
dlTimes.setLastReadTime();
dlTimes.setEndOfInputTime();
printCalTrace();
@@ -1102,6 +1095,13 @@ void TupleAnnexStep::executeParallelOrderBy(uint64_t id)
try
{
more = fInputDL->next(fInputIteratorsList[id], &rgDataIn);
// Stats collecting.
if (more && (id == 1) && traceOn())
{
dlTimes.setFirstReadTime();
}
if (more)
dlOffset++;
@@ -1241,14 +1241,9 @@ void TupleAnnexStep::formatMiniStats()
{
ostringstream oss;
oss << "TNS ";
oss << "UM "
<< "- "
<< "- "
<< "- "
<< "- "
<< "- "
<< "- " << JSTimeStamp::tsdiffstr(dlTimes.EndOfInputTime(), dlTimes.FirstReadTime()) << " "
<< fRowsReturned << " ";
oss << "UM " << "- " << "- " << "- " << "- " << "- " << "- "
<< JSTimeStamp::tsdiffstr(dlTimes.EndOfInputTime(), dlTimes.FirstReadTime()) << " " << fRowsReturned
<< " ";
fMiniInfo += oss.str();
}


@@ -278,12 +278,12 @@ void TupleHashJoinStep::startSmallRunners(uint index)
if (typelessJoin[index])
{
joiner.reset(new TupleJoiner(smallRGs[index], largeRG, smallSideKeys[index], largeSideKeys[index], jt,
&jobstepThreadPool));
&jobstepThreadPool, numCores));
}
else
{
joiner.reset(new TupleJoiner(smallRGs[index], largeRG, smallSideKeys[index][0], largeSideKeys[index][0],
jt, &jobstepThreadPool));
jt, &jobstepThreadPool, numCores));
}
joiner->setUniqueLimit(uniqueLimit);
@@ -1297,15 +1297,11 @@ void TupleHashJoinStep::formatMiniStats(uint32_t index)
else
oss << "- ";
oss << " "
<< "- "
<< "- "
<< "- "
oss << " " << "- " << "- " << "- "
<< "- "
// << JSTimeStamp::tsdiffstr(dlTimes.EndOfInputTime(), dlTimes.FirstReadTime()) << " "
// dlTimes are not timed in this step, using '--------' instead.
<< "-------- "
<< "-\n";
<< "-------- " << "-\n";
fMiniInfo += oss.str();
}