fix(rowgroup): RGData now uses uint64_t counter for the fixed sizes columns data buf.

The buffer can utilize > 4GB RAM that is necessary for PM side join. RGData ctor uses uint32_t allocating data buffer. This fact causes implicit heap overflow.
2025-07-30 19:23:07 +03:00 · 2024-08-24 19:13:58 +00:00
parent ca6c35abdd
commit dc03621e9d
8 changed files with 78 additions and 121 deletions
--- a/dbcon/joblist/batchprimitiveprocessor-jl.cpp
+++ b/dbcon/joblist/batchprimitiveprocessor-jl.cpp
@ -72,7 +72,6 @@ BatchPrimitiveProcessorJL::BatchPrimitiveProcessorJL(const ResourceManager* rm)
 , LBIDTrace(false)
 , tupleLength(0)
 , status(0)
- , sendRowGroups(false)
 , valueColumn(0)
 , sendTupleJoinRowGroupData(false)
 , bop(BOP_AND)
@ -147,7 +146,7 @@ void BatchPrimitiveProcessorJL::addFilterStep(const pDictionaryStep& step)

  tableOID = step.tableOid();

-  if (filterCount == 0 && !sendRowGroups)
+  if (filterCount == 0)
  {
    sendAbsRids = true;
    sendValues = true;
@ -244,7 +243,7 @@ void BatchPrimitiveProcessorJL::addProjectStep(const PassThruStep& step)
  if (utils::isWide(cc->getWidth()))
    wideColumnsWidths |= cc->getWidth();

-  if (filterCount == 0 && !sendRowGroups)
+  if (filterCount == 0)
    sendValues = true;

  idbassert(sessionID == step.sessionId());
@ -283,7 +282,7 @@ void BatchPrimitiveProcessorJL::addProjectStep(const PassThruStep& p, const pDic
  projectCount++;
  needStrValues = true;

-  if (filterCount == 0 && !sendRowGroups)
+  if (filterCount == 0)
  {
    sendValues = true;
    sendAbsRids = true;
@ -1054,9 +1053,6 @@ void BatchPrimitiveProcessorJL::createBPP(ByteStream& bs) const
  if (tJoiners.size() > 0)
    flags |= HAS_JOINER;

-  if (sendRowGroups)
-    flags |= HAS_ROWGROUP;
-
  if (sendTupleJoinRowGroupData)
    flags |= JOIN_ROWGROUP_DATA;

@ -1071,12 +1067,6 @@ void BatchPrimitiveProcessorJL::createBPP(ByteStream& bs) const
  bs << bop;
  bs << (uint8_t)(forHJ ? 1 : 0);

-  if (sendRowGroups)
-  {
-    bs << valueColumn;
-    bs << inputRG;
-  }
-
  if (ot == ROW_GROUP)
  {
    bs << projectionRG;
@ -1248,6 +1238,7 @@ void BatchPrimitiveProcessorJL::createBPP(ByteStream& bs) const
 * (projection count)x run msgs for projection Commands
 */

+// The deser counterpart function is BPP::resetBPP
 void BatchPrimitiveProcessorJL::runBPP(ByteStream& bs, uint32_t pmNum, bool isExeMgrDEC)
 {
  ISMPacketHeader ism;
@ -1289,35 +1280,28 @@ void BatchPrimitiveProcessorJL::runBPP(ByteStream& bs, uint32_t pmNum, bool isEx
  bs << sentByEM;

  if (_hasScan)
+  {
    idbassert(ridCount == 0);
-  else if (!sendRowGroups)
+  }
+  else
+  {
    idbassert(ridCount > 0 && (ridMap != 0 || sendAbsRids));
-  else
-    idbassert(inputRG.getRowCount() > 0);
-
-  if (sendRowGroups)
-  {
-    uint32_t rgSize = inputRG.getDataSize();
-    bs << rgSize;
-    bs.append(inputRG.getData(), rgSize);
  }
+
+  bs << ridCount;
+
+  if (sendAbsRids)
+    bs.append((uint8_t*)absRids.get(), ridCount << 3);
  else
  {
-    bs << ridCount;
-
-    if (sendAbsRids)
-      bs.append((uint8_t*)absRids.get(), ridCount << 3);
-    else
-    {
-      bs << ridMap;
-      bs << baseRid;
-      bs.append((uint8_t*)relRids, ridCount << 1);
-    }
-
-    if (sendValues)
-      bs.append((uint8_t*)values, ridCount << 3);
+    bs << ridMap;
+    bs << baseRid;
+    bs.append((uint8_t*)relRids, ridCount << 1);
  }

+  if (sendValues)
+    bs.append((uint8_t*)values, ridCount << 3);
+
  for (i = 0; i < filterCount; i++)
    filterSteps[i]->runCommand(bs);

@ -1667,7 +1651,6 @@ void BatchPrimitiveProcessorJL::setJoinedRowGroup(const rowgroup::RowGroup& rg)

 void BatchPrimitiveProcessorJL::setInputRowGroup(const rowgroup::RowGroup& rg)
 {
-  sendRowGroups = true;
  sendAbsRids = false;
  sendValues = false;
  inputRG = rg;
--- a/dbcon/joblist/batchprimitiveprocessor-jl.h
+++ b/dbcon/joblist/batchprimitiveprocessor-jl.h
@ -343,7 +343,6 @@ class BatchPrimitiveProcessorJL

  /* for RowGroup return type */
  rowgroup::RowGroup inputRG, projectionRG;
-  bool sendRowGroups;
  uint32_t valueColumn;

  /* for PM Aggregation */
--- a/dbcon/joblist/resourcemanager.cpp
+++ b/dbcon/joblist/resourcemanager.cpp
@ -368,6 +368,7 @@ bool ResourceManager::getMemory(int64_t amount, boost::shared_ptr<int64_t>& sess
  return (ret1 && ret2);
 }
 // Don't care about session memory
+// The amount type is unsafe if amount close to max<int64_t> that is unrealistic in 2024.
 bool ResourceManager::getMemory(int64_t amount, bool patience)
 {
  bool ret1 = (atomicops::atomicSub(&totalUmMemLimit, amount) >= 0);
--- a/dbcon/joblist/tuplehashjoin.cpp
+++ b/dbcon/joblist/tuplehashjoin.cpp
@ -417,6 +417,7 @@ void TupleHashJoinStep::smallRunnerFcn(uint32_t index, uint threadID, uint64_t*
  smallRG.initRow(&r);
  try
  {
+    // Very unfortunate choice for the type b/c of RM::getMemory type.
    ssize_t rgSize;
    bool gotMem;
    goto next;
--- a/utils/rowgroup/rowgroup.cpp
+++ b/utils/rowgroup/rowgroup.cpp
@ -31,7 +31,6 @@
 #include <iterator>
 using namespace std;

-
 #include <numeric>

 #include "bytestream.h"
@ -49,8 +48,6 @@ namespace rowgroup
 {
 using cscType = execplan::CalpontSystemCatalog::ColDataType;

-
-
 StringStore::~StringStore()
 {
 #if 0
@ -302,47 +299,27 @@ void UserDataStore::deserialize(ByteStream& bs)
  return;
 }

-
 RGData::RGData(const RowGroup& rg, uint32_t rowCount)
 {
-  // cout << "rgdata++ = " << __sync_add_and_fetch(&rgDataCount, 1) << endl;
-  rowData.reset(new uint8_t[rg.getDataSize(rowCount)]);
+  RGDataSizeType s = rg.getDataSize(rowCount);
+  rowData.reset(new uint8_t[s]);

  if (rg.usesStringTable() && rowCount > 0)
    strings.reset(new StringStore());

  userDataStore.reset();
-
-
-#ifdef VALGRIND
-  /* In a PM-join, we can serialize entire tables; not every value has been
-   * filled in yet.  Need to look into that.  Valgrind complains that
-   * those bytes are uninitialized, this suppresses that error.
-   */
-  memset(rowData.get(), 0, rg.getDataSize(rowCount));  // XXXPAT: make valgrind happy temporarily
-#endif
-  memset(rowData.get(), 0, rg.getDataSize(rowCount));  // XXXPAT: make valgrind happy temporarily
  columnCount = rg.getColumnCount();
  rowSize = rg.getRowSize();
 }

 RGData::RGData(const RowGroup& rg)
 {
-  // cout << "rgdata++ = " << __sync_add_and_fetch(&rgDataCount, 1) << endl;
  rowData.reset(new uint8_t[rg.getMaxDataSize()]);

  if (rg.usesStringTable())
    strings.reset(new StringStore());

  userDataStore.reset();
-
-#ifdef VALGRIND
-  /* In a PM-join, we can serialize entire tables; not every value has been
-   * filled in yet.  Need to look into that.  Valgrind complains that
-   * those bytes are uninitialized, this suppresses that error.
-   */
-  memset(rowData.get(), 0, rg.getMaxDataSize());
-#endif
  columnCount = rg.getColumnCount();
  rowSize = rg.getRowSize();
 }
@ -356,16 +333,6 @@ void RGData::reinit(const RowGroup& rg, uint32_t rowCount)
    strings.reset(new StringStore());
  else
    strings.reset();
-
-#ifdef VALGRIND
-  /* In a PM-join, we can serialize entire tables; not every value has been
-   * filled in yet.  Need to look into that.  Valgrind complains that
-   * those bytes are uninitialized, this suppresses that error.
-   */
-  memset(rowData.get(), 0, rg.getDataSize(rowCount));
-#endif
-  columnCount = rg.getColumnCount();
-  rowSize = rg.getRowSize();
 }

 void RGData::reinit(const RowGroup& rg)
@ -373,7 +340,7 @@ void RGData::reinit(const RowGroup& rg)
  reinit(rg, 8192);
 }

-void RGData::serialize(ByteStream& bs, uint32_t amount) const
+void RGData::serialize(ByteStream& bs, RGDataSizeType amount) const
 {
  // cout << "serializing!\n";
  bs << (uint32_t)RGDATA_SIG;
@ -399,9 +366,10 @@ void RGData::serialize(ByteStream& bs, uint32_t amount) const
    bs << (uint8_t)0;
 }

-void RGData::deserialize(ByteStream& bs, uint32_t defAmount)
+void RGData::deserialize(ByteStream& bs, RGDataSizeType defAmount)
 {
-  uint32_t amount, sig;
+  uint32_t sig;
+  RGDataSizeType amount;
  uint8_t* buf;
  uint8_t tmp8;

@ -642,7 +610,7 @@ string Row::toCSV() const

 void Row::setToNull(uint32_t colIndex)
 {
-  setNullMark(colIndex, true); // mark as null.
+  setNullMark(colIndex, true);  // mark as null.
  switch (types[colIndex])
  {
    case CalpontSystemCatalog::TINYINT: data[offsets[colIndex]] = joblist::TINYINTNULL; break;
@ -665,11 +633,11 @@ void Row::setToNull(uint32_t colIndex)
      *((int32_t*)&data[offsets[colIndex]]) = static_cast<int32_t>(joblist::DATENULL);
      break;

-      case CalpontSystemCatalog::BIGINT:
-        if (precision[colIndex] != MagicPrecisionForCountAgg)
-          *((uint64_t*)&data[offsets[colIndex]]) = joblist::BIGINTNULL;
-        else  // work around for count() in outer join result.
-          *((uint64_t*)&data[offsets[colIndex]]) = 0;
+    case CalpontSystemCatalog::BIGINT:
+      if (precision[colIndex] != MagicPrecisionForCountAgg)
+        *((uint64_t*)&data[offsets[colIndex]]) = joblist::BIGINTNULL;
+      else  // work around for count() in outer join result.
+        *((uint64_t*)&data[offsets[colIndex]]) = 0;

      break;

@ -680,9 +648,13 @@ void Row::setToNull(uint32_t colIndex)
      *((long double*)&data[offsets[colIndex]]) = joblist::LONGDOUBLENULL;
      break;

-    case CalpontSystemCatalog::DATETIME: *((uint64_t*)&data[offsets[colIndex]]) = joblist::DATETIMENULL; break;
+    case CalpontSystemCatalog::DATETIME:
+      *((uint64_t*)&data[offsets[colIndex]]) = joblist::DATETIMENULL;
+      break;

-    case CalpontSystemCatalog::TIMESTAMP: *((uint64_t*)&data[offsets[colIndex]]) = joblist::TIMESTAMPNULL; break;
+    case CalpontSystemCatalog::TIMESTAMP:
+      *((uint64_t*)&data[offsets[colIndex]]) = joblist::TIMESTAMPNULL;
+      break;

    case CalpontSystemCatalog::TIME: *((uint64_t*)&data[offsets[colIndex]]) = joblist::TIMENULL; break;

@ -716,9 +688,7 @@ void Row::setToNull(uint32_t colIndex)
        case 7:
        case 8: *((uint64_t*)&data[offsets[colIndex]]) = joblist::CHAR8NULL; break;

-        default:
-          setNullMark(colIndex, true);
-          break;
+        default: setNullMark(colIndex, true); break;
      }

      break;
@ -751,7 +721,9 @@ void Row::setToNull(uint32_t colIndex)

    case CalpontSystemCatalog::UTINYINT: data[offsets[colIndex]] = joblist::UTINYINTNULL; break;

-    case CalpontSystemCatalog::USMALLINT: *((uint16_t*)&data[offsets[colIndex]]) = joblist::USMALLINTNULL; break;
+    case CalpontSystemCatalog::USMALLINT:
+      *((uint16_t*)&data[offsets[colIndex]]) = joblist::USMALLINTNULL;
+      break;

    case CalpontSystemCatalog::UMEDINT:
    case CalpontSystemCatalog::UINT: *((uint32_t*)&data[offsets[colIndex]]) = joblist::UINTNULL; break;
@ -760,8 +732,8 @@ void Row::setToNull(uint32_t colIndex)

    default:
      ostringstream os;
-      os << "Row::initToNull(): got bad column type (" << types[colIndex] << ").  Width=" << getColumnWidth(colIndex)
-         << endl;
+      os << "Row::initToNull(): got bad column type (" << types[colIndex]
+         << ").  Width=" << getColumnWidth(colIndex) << endl;
      os << toString();
      throw logic_error(os.str());
  }
@ -870,8 +842,8 @@ bool Row::isNullValue(uint32_t colIndex) const
        return strings->isNullValue(offset);
      }

-//      if (data[offsets[colIndex]] == 0)  // empty string
-//        return true;
+      //      if (data[offsets[colIndex]] == 0)  // empty string
+      //        return true;

      switch (len)
      {
@ -1120,7 +1092,6 @@ RowGroup::RowGroup(const RowGroup& r)
    offsets = &stOffsets[0];
  else if (!useStringTable && !oldOffsets.empty())
    offsets = &oldOffsets[0];
-
 }

 RowGroup& RowGroup::operator=(const RowGroup& r)
@ -1235,27 +1206,28 @@ void RowGroup::serializeRGData(ByteStream& bs) const
  rgData->serialize(bs, getDataSize());
 }

-uint32_t RowGroup::getDataSize() const
+RGDataSizeType RowGroup::getDataSize() const
 {
  return getDataSize(getRowCount());
 }

-uint32_t RowGroup::getDataSize(uint64_t n) const
+RGDataSizeType RowGroup::getDataSize(uint64_t n) const
 {
-  return headerSize + (n * getRowSize());
+  return headerSize + (n * static_cast<RGDataSizeType>(getRowSize()));
 }

-uint32_t RowGroup::getMaxDataSize() const
+RGDataSizeType RowGroup::getMaxDataSize() const
 {
-  return headerSize + (8192 * getRowSize());
+  return headerSize + (static_cast<RGDataSizeType>(rgCommonSize) * static_cast<RGDataSizeType>(getRowSize()));
 }

-uint32_t RowGroup::getMaxDataSizeWithStrings() const
+RGDataSizeType RowGroup::getMaxDataSizeWithStrings() const
 {
-  return headerSize + (8192 * (oldOffsets[columnCount] + columnCount));
+  return headerSize +
+         (static_cast<RGDataSizeType>(rgCommonSize) * static_cast<RGDataSizeType>(getRowSizeWithStrings()));
 }

-uint32_t RowGroup::getEmptySize() const
+RGDataSizeType RowGroup::getEmptySize() const
 {
  return headerSize;
 }
@ -1325,9 +1297,8 @@ string RowGroup::toString(const std::vector<uint64_t>& used) const
    os << "rowcount = " << getRowCount() << endl;
    if (!used.empty())
    {
-      uint64_t cnt =
-          std::accumulate(used.begin(), used.end(), 0ULL,
-                          [](uint64_t a, uint64_t bits) { return a + __builtin_popcountll(bits); });
+      uint64_t cnt = std::accumulate(used.begin(), used.end(), 0ULL, [](uint64_t a, uint64_t bits)
+                                     { return a + __builtin_popcountll(bits); });
      os << "sparse row count = " << cnt << endl;
    }
    os << "base rid = " << getBaseRid() << endl;
--- a/utils/rowgroup/rowgroup.h
+++ b/utils/rowgroup/rowgroup.h
@ -62,6 +62,7 @@
 namespace rowgroup
 {
 const int16_t rgCommonSize = 8192;
+using RGDataSizeType = uint64_t;

 /*
    The RowGroup family of classes encapsulate the data moved through the
@ -270,14 +271,14 @@ class RGData


  // amount should be the # returned by RowGroup::getDataSize()
-  void serialize(messageqcpp::ByteStream&, uint32_t amount) const;
+  void serialize(messageqcpp::ByteStream&, RGDataSizeType amount) const;

  // the 'hasLengthField' is there b/c PM aggregation (and possibly others) currently sends
  // inline data with a length field.  Once that's converted to string table format, that
  // option can go away.
-  void deserialize(messageqcpp::ByteStream&, uint32_t amount = 0);  // returns the # of bytes read
+  void deserialize(messageqcpp::ByteStream&, RGDataSizeType amount = 0);  // returns the # of bytes read

-  inline uint64_t getStringTableMemUsage();
+  inline RGDataSizeType getStringTableMemUsage();
  void clear();
  void reinit(const RowGroup& rg);
  void reinit(const RowGroup& rg, uint32_t rowCount);
@ -1496,15 +1497,15 @@ class RowGroup : public messageqcpp::Serializeable
  uint32_t getDBRoot() const;
  void setDBRoot(uint32_t);

-  uint32_t getDataSize() const;
-  uint32_t getDataSize(uint64_t n) const;
-  uint32_t getMaxDataSize() const;
-  uint32_t getMaxDataSizeWithStrings() const;
-  uint32_t getEmptySize() const;
+  RGDataSizeType getDataSize() const;
+  RGDataSizeType getDataSize(uint64_t n) const;
+  RGDataSizeType getMaxDataSize() const;
+  RGDataSizeType getMaxDataSizeWithStrings() const;
+  RGDataSizeType getEmptySize() const;

  // this returns the size of the row data with the string table
-  inline uint64_t getSizeWithStrings() const;
-  inline uint64_t getSizeWithStrings(uint64_t n) const;
+  inline RGDataSizeType getSizeWithStrings() const;
+  inline RGDataSizeType getSizeWithStrings(uint64_t n) const;

  // sets the row count to 0 and the baseRid to something
  // effectively initializing whatever chunk of memory
@ -1625,11 +1626,11 @@ class RowGroup : public messageqcpp::Serializeable
  uint32_t sTableThreshold = 20;
  std::shared_ptr<bool[]> forceInline;

-  static const uint32_t headerSize = 18;
-  static const uint32_t rowCountOffset = 0;
-  static const uint32_t baseRidOffset = 4;
-  static const uint32_t statusOffset = 12;
-  static const uint32_t dbRootOffset = 14;
+  static const uint64_t headerSize = 18;
+  static const uint64_t rowCountOffset = 0;
+  static const uint64_t baseRidOffset = 4;
+  static const uint64_t statusOffset = 12;
+  static const uint64_t dbRootOffset = 14;
 };

 inline uint64_t convertToRid(const uint32_t& partNum, const uint16_t& segNum, const uint8_t& extentNum,
@ -1775,7 +1776,7 @@ inline uint32_t RowGroup::getRowSizeWithStrings() const
  return oldOffsets[columnCount] + columnCount;
 }

-inline uint64_t RowGroup::getSizeWithStrings(uint64_t n) const
+inline RGDataSizeType RowGroup::getSizeWithStrings(uint64_t n) const
 {
  if (strings == nullptr)
    return getDataSize(n);
--- a/utils/rowgroup/rowstorage.cpp
+++ b/utils/rowgroup/rowstorage.cpp
@ -635,6 +635,7 @@ class RowGroupStorage
      if (fRGDatas[rgid])
      {
        fRowGroupOut->setData(fRGDatas[rgid].get());
+        // An implicit s2u type cast.
        int64_t memSz = fRowGroupOut->getSizeWithStrings(fMaxRows);
        if (!fMM->acquire(memSz))
        {
@ -792,7 +793,7 @@ class RowGroupStorage

    while (rgid >= fRGDatas.size())
    {
-      int64_t memSz = fRowGroupOut->getSizeWithStrings(fMaxRows);
+      auto memSz = fRowGroupOut->getSizeWithStrings(fMaxRows);
      if (!fMM->acquire(memSz))
      {
        throw logging::IDBExcept(
--- a/utils/windowfunction/idborderby.cpp
+++ b/utils/windowfunction/idborderby.cpp
@ -753,7 +753,7 @@ void IdbOrderBy::initialize(const RowGroup& rg)
  // initialize rows
  IdbCompare::initialize(rg);

-  uint64_t newSize = rg.getSizeWithStrings(fRowsPerRG);
+  auto newSize = rg.getSizeWithStrings(fRowsPerRG);
  if (fRm && !fRm->getMemory(newSize, fSessionMemLimit))
  {
    cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;