chore(QA,plugin): integer PK columns are supported by QA.

2025-11-21 09:20:51 +03:00 · 2025-10-21 13:24:51 +00:00
parent fb98e46bfc
commit 45ebe62bc8
4 changed files with 119 additions and 54 deletions
--- a/dbcon/mysql/ha_mcs_impl.cpp
+++ b/dbcon/mysql/ha_mcs_impl.cpp
@@ -333,6 +333,7 @@ int fetchNextRow(uchar* buf, cal_table_info& ti, cal_connection_info* ci, long t
    if (ti.tpl_scan_ctx->rowsreturned == 0 &&
        (ti.tpl_scan_ctx->traceFlags & execplan::CalpontSelectExecutionPlan::TRACE_TUPLE_OFF))
    {
+      std::cout << "rowGroup->toString() " << rowGroup->toString() << std::endl;
      for (uint32_t i = 0; i < rowGroup->getColumnCount(); i++)
      {
        int oid = rowGroup->getOIDs()[i];
--- a/dbcon/mysql/ha_mcs_impl_if.h
+++ b/dbcon/mysql/ha_mcs_impl_if.h
@@ -122,8 +122,9 @@ typedef std::tr1::unordered_map<TABLE_LIST*, uint> TableOuterJoinMap;

 struct ColumnStatistics
 {
-  ColumnStatistics(execplan::SimpleColumn& column, std::vector<Histogram_json_hb*> histograms)
-   : column(column), histograms(histograms)
+  ColumnStatistics(execplan::SimpleColumn& column, std::vector<Histogram_json_hb*> histograms,
+                   Field* minValue, Field* maxValue)
+   : column(column), histograms(histograms), minValue(minValue), maxValue(maxValue)
  {
  }
  ColumnStatistics() = default;
@@ -133,15 +134,33 @@ struct ColumnStatistics
    return histograms;
  }

+  const Histogram_json_hb* getHistogram() const  // First stored histogram, or nullptr when none is stored.
+  {
+    if (histograms.empty())
+      return nullptr;
+    return histograms.front();  // NOTE(review): the stored pointer itself may be null - callers should check
+  }
+
  execplan::SimpleColumn& getColumn()
  {
    return column;
  }

+  std::optional<int64_t> getIntMinValue() const  // minValue->val_int() when minValue is set, else nullopt.
+  {
+    return (minValue) ? std::optional<int64_t>(minValue->val_int()) : std::nullopt;  // NOTE(review): no SQL NULL check on the Field - confirm
+  }
+
+  std::optional<int64_t> getIntMaxValue() const  // maxValue->val_int() when maxValue is set, else nullopt.
+  {
+    return (maxValue) ? std::optional<int64_t>(maxValue->val_int()) : std::nullopt;  // NOTE(review): no SQL NULL check on the Field - confirm
+  }
+
+ private:
  execplan::SimpleColumn column;
  std::vector<Histogram_json_hb*> histograms;
-  Field* min{nullptr};
-  Field* max{nullptr};
+  Field* minValue{nullptr};
+  Field* maxValue{nullptr};
 };

 using ColumnName = std::string;
@@ -162,14 +181,23 @@ struct TableStatistics
    auto tableStatisticsIt = tableStatistics_.find(tableName);
    if (tableStatisticsIt == tableStatistics_.end())
    {
-      tableStatistics_[tableName][fieldName] = {sc, {histogram}};
+      if (histogram)
+      {
+        tableStatistics_[tableName][fieldName] = {
+            sc, {histogram}, statistics->min_value, statistics->max_value};
+      }
+      else
+      {
+        tableStatistics_[tableName][fieldName] = {sc, {}, statistics->min_value, statistics->max_value};
+      }
    }
    else
    {
      auto columnStatisticsMapIt = tableStatisticsIt->second.find(fieldName);
      if (columnStatisticsMapIt == tableStatisticsIt->second.end())
      {
-        tableStatisticsIt->second[fieldName] = {sc, {histogram}};
+        tableStatisticsIt->second[fieldName] = {
+            sc, {histogram}, statistics->min_value, statistics->max_value};
      }
      else
      {
@@ -202,6 +230,8 @@ struct TableStatistics
      }
      else
      {
+        // Note: this overwrites any existing per-column entries, which should not be a
+        // problem because statistics can't change.
        for (auto& [columnName, histogram] : aColumnStatisticsMap)
        {
          tableStatisticsIt->second[columnName] = histogram;
@@ -210,6 +240,7 @@ struct TableStatistics
    }
  }

+ private:
  TableStatisticsMap tableStatistics_;
 };

--- a/dbcon/rbo/rbo_apply_parallel_ces.cpp
+++ b/dbcon/rbo/rbo_apply_parallel_ces.cpp
@@ -216,15 +216,9 @@ execplan::SimpleColumn* findSuitableKeyColumn(execplan::CalpontSelectExecutionPl
  return nullptr;
 }

-// TBD
-Histogram_json_hb* chooseStatisticsToUse(std::vector<Histogram_json_hb*>& columnStatisticsVec)
-{
-  return columnStatisticsVec.front();
-}
-
 // Looking for a projected column that comes first in an available index and has EI statistics
 // INV nullptr signifies that no suitable column was found
-std::optional<std::pair<execplan::SimpleColumn&, Histogram_json_hb*>> chooseKeyColumnAndStatistics(
+cal_impl_if::ColumnStatistics* chooseKeyColumnAndStatistics(
    execplan::CalpontSystemCatalog::TableAliasName& targetTable, optimizer::RBOptimizerContext& ctx)
 {
  cal_impl_if::SchemaAndTableName schemaAndTableName = {targetTable.schema, targetTable.table};
@@ -232,19 +226,17 @@ std::optional<std::pair<execplan::SimpleColumn&, Histogram_json_hb*>> chooseKeyC
  auto tableColumnsStatistics = ctx.getGwi().tableStatistics.findStatisticsForATable(schemaAndTableName);
  if (!tableColumnsStatistics)
  {
-    return std::nullopt;
+    return nullptr;
  }

-  // TODO take some column and some stats for it!!!
+  // TODO: this currently returns the first column and its statistics,
+  // but it should consider all available columns.
  for (auto& [columnName, columnStatistics] : tableColumnsStatistics.value())
  {
-    auto& sc = columnStatistics.getColumn();
-    auto& columnStatisticsVec = columnStatistics.getHistograms();
-    auto* bestColumnStatistics = chooseStatisticsToUse(columnStatisticsVec);
-    return {{sc, bestColumnStatistics}};
+    return &columnStatistics;
  }

-  return std::nullopt;
+  return nullptr;
 }
 }  // namespace details

@@ -257,37 +249,32 @@ bool parallelCESFilter(execplan::CalpontSelectExecutionPlan& csep, optimizer::RB
  return someAreForeignTables(csep) && someForeignTablesHasStatisticsAndMbIndex(csep, ctx);
 }

-// Populates range bounds based on column statistics
-// Returns optional with bounds if successful, nullopt otherwise
+uint64_t decodeU64(const std::string& bytes)  // Packs the leading bytes of `bytes` into a uint64_t.
+{
+  uint64_t v = 0;  // zero-initialised, so any bytes not copied below stay 0
+  const size_t n = std::min<size_t>(bytes.size(), sizeof(uint64_t));  // copy at most sizeof(uint64_t) bytes
+  if (n)
+    std::memcpy(&v, bytes.data(), n);  // raw byte copy; no byte-order conversion is applied here
+  return v;
+}
+
 template <typename T>
-std::optional<details::FilterRangeBounds<T>> populateRangeBounds(Histogram_json_hb* columnStatistics,
-                                                                 size_t& maxParallelFactor)
+std::optional<details::FilterRangeBounds<T>> populateRangeBoundsFromHistogram(
+    cal_impl_if::ColumnStatistics& columnStatistics, size_t maxParallelFactor)
 {
  details::FilterRangeBounds<T> bounds;
-
-  // Guard: empty histogram
-  if (!columnStatistics || columnStatistics->get_json_histogram().empty())
-    return std::nullopt;
-
-  auto decodeU64 = [](const std::string& bytes) -> uint64_t
-  {
-    uint64_t v = 0;
-    const size_t n = std::min<size_t>(bytes.size(), sizeof(uint64_t));
-    if (n)
-      std::memcpy(&v, bytes.data(), n);
-    return v;
-  };
+  auto* histogram = columnStatistics.getHistogram();

  // Get parallel factor from context
-  size_t numberOfUnionUnits = std::min(columnStatistics->get_json_histogram().size(), maxParallelFactor);
-  size_t numberOfBucketsPerUnionUnit = columnStatistics->get_json_histogram().size() / numberOfUnionUnits;
+  size_t numberOfUnionUnits = std::min(histogram->get_json_histogram().size(), maxParallelFactor);
+  size_t numberOfBucketsPerUnionUnit = histogram->get_json_histogram().size() / numberOfUnionUnits;

  // Loop over buckets to produce filter ranges
  // NB Currently Histogram_json_hb has the last bucket that has end as its start
  for (size_t i = 0; i < numberOfUnionUnits - 1; ++i)
  {
-    auto bucket = columnStatistics->get_json_histogram().begin() + i * numberOfBucketsPerUnionUnit;
-    auto endBucket = columnStatistics->get_json_histogram().begin() + (i + 1) * numberOfBucketsPerUnionUnit;
+    auto bucket = histogram->get_json_histogram().begin() + i * numberOfBucketsPerUnionUnit;
+    auto endBucket = histogram->get_json_histogram().begin() + (i + 1) * numberOfBucketsPerUnionUnit;
    T currentLowerBound = static_cast<T>(decodeU64(bucket->start_value));
    T currentUpperBound = static_cast<T>(decodeU64(endBucket->start_value));
    bounds.push_back({currentLowerBound, currentUpperBound});
@@ -297,15 +284,15 @@ std::optional<details::FilterRangeBounds<T>> populateRangeBounds(Histogram_json_
  if (numberOfUnionUnits >= 1)
  {
    auto lastChunkIndex = (numberOfUnionUnits - 1) * numberOfBucketsPerUnionUnit;
-    if (lastChunkIndex < columnStatistics->get_json_histogram().size())
+    if (lastChunkIndex < histogram->get_json_histogram().size())
    {
-      auto lastStartBucket = columnStatistics->get_json_histogram().begin() + lastChunkIndex;
+      auto lastStartBucket = histogram->get_json_histogram().begin() + lastChunkIndex;
      T finalLowerBound = static_cast<T>(decodeU64(lastStartBucket->start_value));

      T finalUpperBound = std::numeric_limits<T>::max();
-      if (!columnStatistics->get_last_bucket_end_endp().empty())
+      if (!histogram->get_last_bucket_end_endp().empty())
      {
-        finalUpperBound = static_cast<T>(decodeU64(columnStatistics->get_last_bucket_end_endp()));
+        finalUpperBound = static_cast<T>(decodeU64(histogram->get_last_bucket_end_endp()));
      }
      bounds.push_back({finalLowerBound, finalUpperBound});
    }
@@ -320,6 +307,56 @@ std::optional<details::FilterRangeBounds<T>> populateRangeBounds(Histogram_json_
  return bounds;
 }

+template <typename T>
+std::optional<details::FilterRangeBounds<T>> populateRangeBoundsFromEquallyDistributedRange(
+    cal_impl_if::ColumnStatistics& columnStatistics, size_t maxParallelFactor)
+{
+  // Callers must ensure both min and max are present; optional::value() throws otherwise.
+  auto minValue = columnStatistics.getIntMinValue().value();
+  auto maxValue = columnStatistics.getIntMaxValue().value();
+
+  // No ranges can be built for zero units (would divide by zero) or for an inverted range.
+  if (maxParallelFactor == 0 || maxValue < minValue)
+    return std::nullopt;
+  auto distance = maxValue - minValue;
+  auto step = distance / maxParallelFactor;
+
+  details::FilterRangeBounds<T> bounds;
+  for (size_t i = 0; i < maxParallelFactor; ++i)
+    bounds.push_back({minValue + i * step, minValue + (i + 1) * step});
+
+  // bounds is non-empty here. Open the first range downwards and pin the last one to
+  // the recorded max so the integer-division remainder is still covered.
+  bounds.front().first = std::numeric_limits<T>::lowest();
+  bounds.back().second = maxValue;
+
+  return bounds;
+}
+
+// Populates range bounds based on column statistics: the histogram is used when it has
+// buckets, otherwise the min/max values are used. Returns nullopt when neither is usable.
+template <typename T>
+std::optional<details::FilterRangeBounds<T>> populateRangeBounds(
+    cal_impl_if::ColumnStatistics& columnStatistics, size_t& maxParallelFactor)
+{
+  auto* histogram = columnStatistics.getHistogram();
+
+  // Use the histogram only when it has buckets; an empty one leaves zero union units to divide by.
+  if (histogram && !histogram->get_json_histogram().empty())
+  {
+    return populateRangeBoundsFromHistogram<T>(columnStatistics, maxParallelFactor);
+  }
+
+  auto minValue = columnStatistics.getIntMinValue();
+  auto maxValue = columnStatistics.getIntMaxValue();
+  if (minValue && maxValue)
+  {
+    return populateRangeBoundsFromEquallyDistributedRange<T>(columnStatistics, maxParallelFactor);
+  }
+
+  return std::nullopt;
+}
+
 // TODO char and other numerical types support
 execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable(
    execplan::CalpontSelectExecutionPlan& csep, execplan::CalpontSystemCatalog::TableAliasName& table,
@@ -329,15 +366,16 @@ execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable(

  // SC type controls an integral type used to produce suitable filters. The continuation of this function
  // should become a template function based on SC type.
-  auto keyColumnAndStatistics = chooseKeyColumnAndStatistics(table, ctx);
-  if (!keyColumnAndStatistics)
+  auto columnStatisticsPtr = chooseKeyColumnAndStatistics(table, ctx);
+  if (!columnStatisticsPtr)
  {
    return unionVec;
  }

-  auto& [keyColumn, columnStatistics] = keyColumnAndStatistics.value();
+  auto& columnStatistics = *columnStatisticsPtr;
+  auto& keyColumn = columnStatistics.getColumn();

-  std::cout << "makeUnionFromTable keyColumn " << keyColumn.toString() << std::endl;
+  // std::cout << "makeUnionFromTable keyColumn " << keyColumn.toString() << std::endl;
  std::cout << "makeUnionFromTable RC front " << csep.returnedCols().front()->toString() << std::endl;

  // TODO char and other numerical types support
--- a/dbcon/rbo/rbo_apply_parallel_ces.h
+++ b/dbcon/rbo/rbo_apply_parallel_ces.h
@@ -91,11 +91,6 @@ execplan::SimpleColumn* findSuitableKeyColumn(execplan::CalpontSelectExecutionPl
                                              execplan::CalpontSystemCatalog::TableAliasName& targetTable,
                                              optimizer::RBOptimizerContext& ctx);

-std::optional<std::pair<execplan::SimpleColumn&, Histogram_json_hb*>> chooseKeyColumnAndStatistics(
-    execplan::CalpontSystemCatalog::TableAliasName& targetTable, optimizer::RBOptimizerContext& ctx);
-
-Histogram_json_hb* chooseStatisticsToUse(const std::vector<Histogram_json_hb*>& statisticsVec);
-
 }  // namespace details

 // Main functions