MCOL-5191 Add MCV statistics.

This patch adds: 1. Initial version of random sampling. 2. Initial version of MCV statistics.
2025-08-08 14:22:09 +03:00 · 2022-09-08 18:42:19 +03:00
parent 9d774c1d95
commit e299a8409d
3 changed files with 552 additions and 351 deletions
--- a/primitives/primproc/sqlfrontsessionthread.cpp
+++ b/primitives/primproc/sqlfrontsessionthread.cpp
@@ -187,7 +187,8 @@ namespace exemgr
    fIos.write(emsgBs);
  }

-  void SQLFrontSessionThread::analyzeTableExecute(messageqcpp::ByteStream& bs, joblist::SJLP& jl, bool& stmtCounted)
+  void SQLFrontSessionThread::analyzeTableExecute(messageqcpp::ByteStream& bs, joblist::SJLP& jl,
+                                                  bool& stmtCounted)
  {
    auto* statementsRunningCount = globServiceExeMgr->getStatementsRunningCount();
    messageqcpp::ByteStream::quadbyte qb;
@@ -238,19 +239,23 @@ namespace exemgr
    jl->doQuery();

    FEMsgHandler msgHandler(jl, &fIos);
-
    msgHandler.start();
-    auto rowCount = jl->projectTable(100, bs);
+
+    // Seems like this is a legacy parameter, not really needed.
+    const uint32_t dummyTableOid = 100;
+    auto* statisticsManager = statistics::StatisticsManager::instance();
+    // Process rowGroup by rowGroup.
+    auto rowCount = jl->projectTable(dummyTableOid, bs);
+    while (rowCount)
+    {
+      auto outRG = (static_cast<joblist::TupleJobList*>(jl.get()))->getOutputRowGroup();
+      statisticsManager->collectSample(outRG);
+      rowCount = jl->projectTable(dummyTableOid, bs);
+    }
    msgHandler.stop();

-    auto outRG = (static_cast<joblist::TupleJobList*>(jl.get()))->getOutputRowGroup();
-
-    if (caep.traceOn())
-      std::cout << "Row count " << rowCount << std::endl;
-
-    // Process `RowGroup`, increase an epoch and save statistics to the file.
-    auto* statisticsManager = statistics::StatisticsManager::instance();
-    statisticsManager->analyzeColumnKeyTypes(outRG, caep.traceOn());
+    // Analyze collected samples.
+    statisticsManager->analyzeSample(caep.traceOn());
    statisticsManager->incEpoch();
    statisticsManager->saveToFile();

@@ -348,9 +353,7 @@ namespace exemgr
          // making the whole session wait.  It can take several seconds.
          std::unique_lock<std::mutex> scoped(jlMutex);
          destructing++;
-          std::thread bgdtor(
-              [jl, &jlMutex, &jlCleanupDone, &destructing]
-              {
+          std::thread bgdtor([jl, &jlMutex, &jlCleanupDone, &destructing] {
            std::unique_lock<std::mutex> scoped(jlMutex);
            const_cast<joblist::SJLP&>(jl).reset();  // this happens second; does real destruction
            if (--destructing == 0)
@@ -883,9 +886,7 @@ namespace exemgr
          int stmtID = csep.statementID();
          std::unique_lock<std::mutex> scoped(jlMutex);
          destructing++;
-          std::thread bgdtor(
-              [jl, &jlMutex, &jlCleanupDone, stmtID, &li, &destructing, &msgLog]
-              {
+          std::thread bgdtor([jl, &jlMutex, &jlCleanupDone, stmtID, &li, &destructing, &msgLog] {
            std::unique_lock<std::mutex> scoped(jlMutex);
            const_cast<joblist::SJLP&>(jl).reset();  // this happens second; does real destruction
            logging::Message::Args args;
@@ -987,5 +988,5 @@ namespace exemgr
    std::unique_lock<std::mutex> scoped(jlMutex);
    while (destructing > 0)
      jlCleanupDone.wait(scoped);
-}
+  }
 }; // namespace exemgr
--- a/utils/common/statistics.cpp
+++ b/utils/common/statistics.cpp
@@ -17,6 +17,7 @@

 #include <iostream>
 #include <atomic>
+#include <random>
 #include <boost/filesystem.hpp>

 #include "statistics.h"
@@ -31,61 +32,135 @@ using namespace logging;

 namespace statistics
 {
-using ColumnsCache = std::vector<std::unordered_set<uint32_t>>;
-
 StatisticsManager* StatisticsManager::instance()
 {
  static StatisticsManager* sm = new StatisticsManager();
  return sm;
 }

-void StatisticsManager::analyzeColumnKeyTypes(const rowgroup::RowGroup& rowGroup, bool trace)
+// Collects a reservoir sample (at most `maxSampleSize` rows) from the rows of `rowGroup`.
+void StatisticsManager::collectSample(const rowgroup::RowGroup& rowGroup)
 {
  std::lock_guard<std::mutex> lock(mut);
-  auto rowCount = rowGroup.getRowCount();
+  const auto rowCount = rowGroup.getRowCount();
  const auto columnCount = rowGroup.getColumnCount();
  if (!rowCount || !columnCount)
    return;

-  auto& oids = rowGroup.getOIDs();
+  const auto& oids = rowGroup.getOIDs();
+  // Give every column seen for the first time a zero-filled sample buffer;
+  // `try_emplace` leaves an already existing buffer untouched.
+  for (const auto oid : oids)
+    columnGroups.try_emplace(oid, maxSampleSize, uint64_t{0});

+  // Position the row cursor on the first row of the given `rowGroup`.
  rowgroup::Row r;
  rowGroup.initRow(&r);
  rowGroup.getRow(0, &r);

-  ColumnsCache columns(columnCount, std::unordered_set<uint32_t>());
-  // Init key types.
-  for (uint32_t index = 0; index < columnCount; ++index)
-    keyTypes[oids[index]] = KeyType::PK;
+  // Reservoir sampling (Algorithm R): once the reservoir is full, the row with global
+  // index `currentRowIndex` replaces a uniformly chosen slot from [0, currentRowIndex].
+  std::random_device randomDevice;
+  std::mt19937 gen32(randomDevice());
+  using SlotDistribution = std::uniform_int_distribution<uint32_t>;

-  const uint32_t maxRowCount = 4096;
-  // TODO: We should read just couple of blocks from columns, not all data, but this requires
-  // more deep refactoring of column commands.
-  rowCount = std::min(rowCount, maxRowCount);
-  // This is strange, it's a CS but I'm processing data as row by row, how to fix it?
  for (uint32_t i = 0; i < rowCount; ++i)
+  {
+    if (currentSampleSize < maxSampleSize)
    {
      for (uint32_t j = 0; j < columnCount; ++j)
      {
-      if (r.isNullValue(j) || columns[j].count(r.getIntField(j)))
-        keyTypes[oids[j]] = KeyType::FK;
+        // FIXME: Handle null values as well; for now a null keeps the slot's initial 0.
+        if (!r.isNullValue(j))
+          columnGroups[oids[j]][currentSampleSize] = r.getIntField(j);
+      }
+      ++currentSampleSize;
+    }
    else
-        columns[j].insert(r.getIntField(j));
+    {
+      const uint32_t index = SlotDistribution(0, currentRowIndex)(gen32);
+      if (index < maxSampleSize)
+      {
+        for (uint32_t j = 0; j < columnCount; ++j)
+          columnGroups[oids[j]][index] = r.isNullValue(j) ? 0 : r.getIntField(j);  // null -> 0, as above
+      }
    }
    r.nextRow();
+    ++currentRowIndex;
  }
-
-  if (trace)
-    output(StatisticsType::PK_FK);
 }

-void StatisticsManager::output(StatisticsType statisticsType)
+// Derives the PK/FK and MCV statistics from the collected sample, then resets the sample.
+void StatisticsManager::analyzeSample(bool traceOn)
 {
-  if (statisticsType == StatisticsType::PK_FK)
+  if (traceOn)
+    std::cout << "Sample size: " << currentSampleSize << std::endl;
+
+  // PK_FK statistics: every sampled column starts as PK and remembers the scanned row count.
+  for (const auto& [oid, sample] : columnGroups)
+    keyTypes[oid] = std::make_pair(KeyType::PK, currentRowIndex);
+
+  for (const auto& [oid, sample] : columnGroups)
  {
+    // Sample values are 64-bit wide; a narrower set would truncate them and
+    // report distinct values as duplicates.
+    std::unordered_set<uint64_t> columnsCache;
+    std::unordered_map<uint64_t, uint32_t> columnMCV;
+    for (uint32_t i = 0; i < currentSampleSize; ++i)
+    {
+      const auto value = sample[i];
+      // PK_FK statistics: a repeated value downgrades the column to FK.
+      if (columnsCache.count(value) && keyTypes[oid].first == KeyType::PK)
+        keyTypes[oid].first = KeyType::FK;
+      else
+        columnsCache.insert(value);
+
+      // MCV statistics: `operator[]` value-initializes a missing counter to 0.
+      ++columnMCV[value];
+    }
+
+    // MCV statistics: order the distinct values by descending frequency.
+    std::vector<std::pair<uint64_t, uint32_t>> mcvList(columnMCV.begin(), columnMCV.end());
+    std::sort(mcvList.begin(), mcvList.end(),
+              [](const std::pair<uint64_t, uint32_t>& a, const std::pair<uint64_t, uint32_t>& b) {
+                return a.second > b.second;
+              });
+
+    // Keep at most the 200 most frequent values (200 buckets as Microsoft does).
+    const auto mcvSize = std::min<std::size_t>(columnMCV.size(), 200);
+    mcv[oid] = std::unordered_map<uint64_t, uint32_t>(mcvList.begin(), mcvList.begin() + mcvSize);
+  }
+
+  if (traceOn)
+    output();
+
+  // Clear sample.
+  columnGroups.clear();
+  currentSampleSize = 0;
+  currentRowIndex = 0;
+}
+
+void StatisticsManager::output()  // Prints the PK/FK and MCV statistics to std::cout.
+{
  std::cout << "Columns count: " << keyTypes.size() << std::endl;
+
+  std::cout << "Statistics type [PK_FK]:  " << std::endl;
  for (const auto& p : keyTypes)
-      std::cout << p.first << " " << (int)p.second << std::endl;
+  {
+    std::cout << "OID: " << p.first << " ";
+    if (static_cast<uint32_t>(p.second.first) == 0)  // NOTE(review): assumes KeyType::PK == 0 — confirm.
+      std::cout << "PK ";
+    else
+      std::cout << "FK ";
+    std::cout << "row count: " << p.second.second << std::endl;
+  }
+
+  std::cout << "Statistics type [MCV]: " << std::endl;
+  for (const auto& [oid, columnMCV] : mcv)  // One section per column OID.
+  {
+    std::cout << "OID: " << oid << std::endl;
+    for (const auto& [value, count] : columnMCV)  // One "value: count" line per stored value.
+      std::cout << value << ": " << count << std::endl;
  }
 }

@@ -94,8 +169,16 @@ std::unique_ptr<char[]> StatisticsManager::convertStatsToDataStream(uint64_t& da
 {
  // Number of pairs.
  uint64_t count = keyTypes.size();
-  // count, [[uid, keyType], ... ]
-  dataStreamSize = sizeof(uint64_t) + count * (sizeof(uint32_t) + sizeof(KeyType));
+  // count, [[uid, keyType, rows count], ... ]
+  dataStreamSize = sizeof(uint64_t) + count * (sizeof(uint32_t) + sizeof(KeyType) + sizeof(uint32_t));
+
+  // Count the size of the MCV.
+  for (const auto& [oid, mcvColumn] : mcv)
+  {
+    // [oid, list size, list [value, count]]
+    dataStreamSize +=
+        (sizeof(uint32_t) + sizeof(uint32_t) + ((sizeof(uint64_t) + sizeof(uint32_t)) * mcvColumn.size()));
+  }

  // Allocate memory for data stream.
  std::unique_ptr<char[]> dataStreamSmartPtr(new char[dataStreamSize]);
@@ -105,21 +188,95 @@ std::unique_ptr<char[]> StatisticsManager::convertStatsToDataStream(uint64_t& da
  std::memcpy(dataStream, reinterpret_cast<char*>(&count), sizeof(uint64_t));
  offset += sizeof(uint64_t);

-  // For each pair [oid, key type].
+  // For each pair [oid, key type, rows count].
  for (const auto& p : keyTypes)
  {
    uint32_t oid = p.first;
    std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&oid), sizeof(uint32_t));
    offset += sizeof(uint32_t);
-
-    KeyType keyType = p.second;
+    KeyType keyType = p.second.first;
    std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&keyType), sizeof(KeyType));
    offset += sizeof(KeyType);
+    uint32_t rowCount = p.second.second;
+    std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&rowCount), sizeof(uint32_t));
+    offset += sizeof(uint32_t);
  }

+  // For each [oid, list size, list [value, count]].
+  for (const auto& p : mcv)
+  {
+    // [oid]
+    uint32_t oid = p.first;
+    std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&oid), sizeof(uint32_t));
+    offset += sizeof(uint32_t);
+
+    // [list size]
+    const auto& mcvColumn = p.second;
+    uint32_t size = mcvColumn.size();
+    std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&size), sizeof(uint32_t));
+    offset += sizeof(uint32_t);
+
+    // [list [value, count]]
+    for (const auto& mcvPair : mcvColumn)
+    {
+      uint64_t value = mcvPair.first;
+      std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&value), sizeof(uint64_t));
+      offset += sizeof(uint64_t);
+      uint32_t count = mcvPair.second;
+      std::memcpy(&dataStream[offset], reinterpret_cast<char*>(&count), sizeof(uint32_t));
+      offset += sizeof(uint32_t);
+    }
+  }
  return dataStreamSmartPtr;
 }

+void StatisticsManager::convertStatsFromDataStream(std::unique_ptr<char[]> dataStreamSmartPtr)
+{
+  // Stream layout: count, count x [oid, key type, rows count],
+  // then count x [oid, list size, list [value, count]].
+  const char* cursor = dataStreamSmartPtr.get();
+  // Copies the next sizeof(field) bytes into `field` and advances the cursor.
+  auto readField = [&cursor](auto& field) {
+    std::memcpy(reinterpret_cast<char*>(&field), cursor, sizeof(field));
+    cursor += sizeof(field);
+  };
+
+  uint64_t entryCount = 0;
+  readField(entryCount);
+
+  // PK_FK section.
+  for (uint64_t entry = 0; entry < entryCount; ++entry)
+  {
+    uint32_t oid = 0;
+    KeyType keyType{};
+    uint32_t rowCount = 0;
+    readField(oid);
+    readField(keyType);
+    readField(rowCount);
+    keyTypes[oid] = std::make_pair(keyType, rowCount);
+  }
+
+  // MCV section.
+  for (uint64_t entry = 0; entry < entryCount; ++entry)
+  {
+    uint32_t oid = 0;
+    uint32_t listSize = 0;
+    readField(oid);
+    readField(listSize);
+
+    std::unordered_map<uint64_t, uint32_t> valueCounts;
+    for (uint32_t item = 0; item < listSize; ++item)
+    {
+      uint64_t value = 0;
+      uint32_t occurrences = 0;
+      readField(value);
+      readField(occurrences);
+      valueCounts[value] = occurrences;
+    }
+    mcv[oid] = std::move(valueCounts);
+  }
+}
+
 void StatisticsManager::saveToFile()
 {
  std::lock_guard<std::mutex> lock(mut);
@@ -228,22 +385,7 @@ void StatisticsManager::loadFromFile()
  if (dataHash != computedDataHash)
    throw ios_base::failure("StatisticsManager::loadFromFile(): invalid file hash. ");

-  uint64_t count = 0;
-  std::memcpy(reinterpret_cast<char*>(&count), dataStream, sizeof(uint64_t));
-  uint64_t offset = sizeof(uint64_t);
-
-  // For each pair.
-  for (uint64_t i = 0; i < count; ++i)
-  {
-    uint32_t oid;
-    KeyType keyType;
-    std::memcpy(reinterpret_cast<char*>(&oid), &dataStream[offset], sizeof(uint32_t));
-    offset += sizeof(uint32_t);
-    std::memcpy(reinterpret_cast<char*>(&keyType), &dataStream[offset], sizeof(KeyType));
-    offset += sizeof(KeyType);
-    // Insert pair.
-    keyTypes[oid] = keyType;
-  }
+  convertStatsFromDataStream(std::move(dataStreamSmartPtr));
 }

 uint64_t StatisticsManager::computeHashFromStats()
@@ -261,10 +403,25 @@ void StatisticsManager::serialize(messageqcpp::ByteStream& bs)
  bs << epoch;
  bs << count;

+  // PK_FK
  for (const auto& keyType : keyTypes)
  {
    bs << keyType.first;
-    bs << (uint32_t)keyType.second;
+    bs << (uint32_t)keyType.second.first;
+    bs << keyType.second.second;
+  }
+
+  // MCV
+  for (const auto& p : mcv)
+  {
+    bs << p.first;
+    const auto& mcvColumn = p.second;
+    bs << static_cast<uint32_t>(mcvColumn.size());
+    for (const auto& mcvPair : mcvColumn)
+    {
+      bs << mcvPair.first;
+      bs << mcvPair.second;
+    }
  }
 }

@@ -275,12 +432,34 @@ void StatisticsManager::unserialize(messageqcpp::ByteStream& bs)
  bs >> epoch;
  bs >> count;

+  // PK_FK
  for (uint32_t i = 0; i < count; ++i)
  {
-    uint32_t oid, keyType;
+    uint32_t oid, keyType, rowCount;
    bs >> oid;
    bs >> keyType;
-    keyTypes[oid] = static_cast<KeyType>(keyType);
+    bs >> rowCount;
+    keyTypes[oid] = std::make_pair(static_cast<KeyType>(keyType), rowCount);
+  }
+
+  // MCV
+  for (uint32_t i = 0; i < count; ++i)
+  {
+    uint32_t oid, mcvSize;
+    bs >> oid;
+    bs >> mcvSize;
+    std::unordered_map<uint64_t, uint32_t> mcvColumn;
+
+    for (uint32_t j = 0; j < mcvSize; ++j)
+    {
+      uint64_t value;
+      uint32_t count;
+      bs >> value;
+      bs >> count;
+      mcvColumn[value] = count;
+    }
+
+    mcv[oid] = std::move(mcvColumn);
  }
 }

@@ -291,7 +470,7 @@ bool StatisticsManager::hasKey(uint32_t oid)

 KeyType StatisticsManager::getKeyType(uint32_t oid)
 {
-  return keyTypes[oid];
+  return keyTypes.count(oid) ? keyTypes.at(oid).first : KeyType{};  // no insertion for an unknown OID
 }

 StatisticsDistributor* StatisticsDistributor::instance()
--- a/utils/common/statistics.h
+++ b/utils/common/statistics.h
@@ -49,8 +49,10 @@ enum class KeyType : uint32_t
 // Rerpresents types of statistics CS supports.
 enum class StatisticsType : uint32_t
 {
-  // A special statistics type, made to solve circular inner join problem.
-  PK_FK
+  // A special statistics type, specifies whether a column is a primary key or a foreign key.
+  PK_FK,
+  // Most common values of a column.
+  MCV
 };

 // Represetns a header for the statistics file.
@@ -63,6 +65,11 @@ struct StatisticsFileHeader
  uint8_t offset[1024];
 };

+using ColumnsCache = std::unordered_map<uint32_t, std::unordered_set<uint64_t>>;  // NOTE(review): unused in the visible hunks
+using ColumnGroup = std::unordered_map<uint32_t, std::vector<uint64_t>>;  // OID -> sampled column values
+using KeyTypes = std::unordered_map<uint32_t, std::pair<KeyType, uint32_t>>;  // OID -> (key type, rows count)
+using MCVList = std::unordered_map<uint32_t, std::unordered_map<uint64_t, uint32_t>>;  // OID -> (value -> count)
+
 // This class is responsible for processing and storing statistics.
 // On each `analyze table` iteration it increases an epoch and stores
 // the updated statistics into the special file.
@@ -71,10 +78,12 @@ class StatisticsManager
 public:
  // Returns the instance of this class, static initialization happens only once.
  static StatisticsManager* instance();
-  // Analyzes the given `rowGroup` by processing it row by row and searching for foreign key.
-  void analyzeColumnKeyTypes(const rowgroup::RowGroup& rowGroup, bool trace);
+  // Collect samples from the given `rowGroup`.
+  void collectSample(const rowgroup::RowGroup& rowGroup);
+  // Analyzes collected samples.
+  void analyzeSample(bool traceOn);
  // Ouputs stats to out stream.
-  void output(StatisticsType statisticsType = StatisticsType::PK_FK);
+  void output();
  // Saves stats to the file.
  void saveToFile();
  // Loads stats from the file.
@@ -95,17 +104,29 @@ class StatisticsManager
  KeyType getKeyType(uint32_t oid);

 private:
-  std::map<uint32_t, KeyType> keyTypes;
-  StatisticsManager() : epoch(0), version(1)
+  StatisticsManager() : currentSampleSize(0), currentRowIndex(0), epoch(0), version(1)
  {
    // Initialize plugins.
    IDBPolicy::configIDBPolicy();
  }
  std::unique_ptr<char[]> convertStatsToDataStream(uint64_t& dataStreamSize);
+  void convertStatsFromDataStream(std::unique_ptr<char[]> dataStreamSmartPtr);

-  std::mutex mut;
+  // Internal data represents a sample [OID, vector of values].
+  ColumnGroup columnGroups;
+  // Internal data for the PK/FK statistics [OID, (key type, rows count)].
+  KeyTypes keyTypes;
+  // Internal data for MCV list [OID, list[value, count]]
+  MCVList mcv;
+
+  // TODO: Think about sample size.
+  const uint32_t maxSampleSize = 64000;
+  uint32_t currentSampleSize;
+  uint32_t currentRowIndex;
  uint32_t epoch;
  uint32_t version;
+
+  std::mutex mut;
  std::string statsFile = "/var/lib/columnstore/local/statistics";
 };