feat(rbo,rules,QA): index column type is now derived from the corresponding Field

2025-10-31 18:30:33 +03:00 · 2025-08-05 14:33:39 +00:00
parent e167082497
commit c030ff4224
4 changed files with 168 additions and 50 deletions
--- a/dbcon/execplan/simplecolumn.cpp
+++ b/dbcon/execplan/simplecolumn.cpp
@@ -207,6 +207,7 @@ SimpleColumn::SimpleColumn(const SimpleColumn& rhs, const uint32_t sessionID)
 , fTimeZone(rhs.timeZone())
 , fisColumnStore(rhs.isColumnStore())
 {
+  fResultType = rhs.resultType();
 }

 SimpleColumn::SimpleColumn(const ReturnedColumn& rhs, const uint32_t sessionID)
@@ -250,6 +251,7 @@ SimpleColumn& SimpleColumn::operator=(const SimpleColumn& rhs)
    fDistinct = rhs.distinct();
    fisColumnStore = rhs.isColumnStore();
    fPartitions = rhs.fPartitions;
+    fResultType = rhs.resultType();
  }

  return *this;
--- a/dbcon/mysql/ha_mcs_execplan.cpp
+++ b/dbcon/mysql/ha_mcs_execplan.cpp
@@ -2651,6 +2651,39 @@ CalpontSystemCatalog::ColType colType_MysqlToIDB(const Item* item)
  return ct;
 }

+// Simplified version to support QA-specific RBO re-write rule.
+// TBD turn into template to merge with colType_MysqlToIDB for Item
+CalpontSystemCatalog::ColType colType_MysqlToIDB(const Field* field)
+{
+  CalpontSystemCatalog::ColType ct;
+  ct.precision = 4;
+
+  switch (field->result_type())
+  {
+    case INT_RESULT:
+      if (field->is_unsigned())
+      {
+        ct.colDataType = CalpontSystemCatalog::UBIGINT;
+      }
+      else
+      {
+        ct.colDataType = CalpontSystemCatalog::BIGINT;
+      }
+
+      ct.colWidth = 8;
+      break;
+
+    case STRING_RESULT:
+      ct.colDataType = CalpontSystemCatalog::VARCHAR;
+
+    default:
+      IDEBUG(cerr << "colType_MysqlToIDB:: Unknown result type of MySQL " << item->result_type() << endl);
+      break;
+  }
+  ct.charsetNumber = field->charset()->number;
+  return ct;
+}
+
 bool itemDisablesWrapping(Item* item, gp_walk_info& gwi)
 {
  if (gwi.select_lex == nullptr)
@@ -4184,6 +4217,62 @@ SimpleColumn* buildSimpleColumn(Item_field* ifp, gp_walk_info& gwi)
  return sc;
 }

+SimpleColumn* buildSimpleColumnFromFieldForStatistics(Field* field, gp_walk_info& gwi)
+{
+  if (!gwi.csc)
+  {
+    gwi.csc = CalpontSystemCatalog::makeCalpontSystemCatalog(gwi.sessionid);
+    gwi.csc->identity(CalpontSystemCatalog::FE);
+  }
+
+  CalpontSystemCatalog::ColType ct;
+  datatypes::SimpleColumnParam prm(gwi.sessionid, true);
+
+  try
+  {
+    // check foreign engine
+    if (field->table)
+      prm.columnStore(ha_mcs_common::isMCSTable(field->table));
+
+    if (prm.columnStore())
+    {
+      ct = gwi.csc->colType(gwi.csc->lookupOID(
+          make_tcn(field->table->s->db.str, field->table->s->table_name.str, field->field_name.str)));
+    }
+    else
+    {
+      ct = colType_MysqlToIDB(field);
+    }
+  }
+  catch (std::exception& ex)
+  {
+    gwi.fatalParseError = true;
+    gwi.parseErrorText = ex.what();
+    return NULL;
+  }
+
+  const datatypes::DatabaseQualifiedColumnName name(field->table->s->db.str, field->table->s->table_name.str,
+                                                    field->field_name.str);
+  const datatypes::TypeHandler* h = ct.typeHandler();
+  SimpleColumn* sc = h->newSimpleColumn(name, ct, prm);
+
+  sc->resultType(ct);
+  sc->charsetNumber(field->charset()->number);
+  string tbname(field->table->s->table_name.str);
+
+  // Note: differs with the original buildSimpleColumn
+  sc->tableAlias(field->table->alias.c_ptr(), lower_case_table_names);
+
+  sc->alias(field->field_name.str);
+  sc->isColumnStore(prm.columnStore());
+  sc->timeZone(gwi.timeZone);
+
+  sc->oid(field->field_index + 1);  // ExeMgr requires offset started from 1
+  // TODO add partitions support here
+
+  return sc;
+}
+
 ParseTree* buildParseTree(Item* item, gp_walk_info& gwi, bool& /*nonSupport*/)
 {
  ParseTree* pt = 0;
@@ -5215,7 +5304,6 @@ int processFrom(bool& isUnion, SELECT_LEX& select_lex, gp_walk_info& gwi, SCSEP&
        {
          for (uint j = 0; j < table_ptr->table->s->keys; j++)
          {
-            // for (uint i = 0; i < table_ptr->table->s->key_info[j].usable_key_parts; i++)
            {
              Field* field = table_ptr->table->key_info[j].key_part[0].field;
              std::cout << "j index " << j << " i column " << 0 << " fieldnr "
@@ -5225,28 +5313,29 @@ int processFrom(bool& isUnion, SELECT_LEX& select_lex, gp_walk_info& gwi, SCSEP&
                auto* histogram = dynamic_cast<Histogram_json_hb*>(field->read_stats->histogram);
                if (histogram)
                {
-                  std::cout << " has stats ";
-                  SchemaAndTableName tableName = {field->table->s->db.str,
-                    field->table->s->table_name.str};
-                  execplan::SimpleColumn simpleColumn = {field->table->s->db.str,
-                    field->table->s->table_name.str,
-                    field->field_name.str};
+                  std::cout << " has stats with " << histogram->buckets.size() << " buckets";
+                  SchemaAndTableName tableName = {field->table->s->db.str, field->table->s->table_name.str};
+                  auto* sc = buildSimpleColumnFromFieldForStatistics(field, gwi);
+                  std::cout << "sc with stats !!!!! " << sc->toString() << std::endl;
+                  // execplan::SimpleColumn simpleColumn = {
+                  //     field->table->s->db.str, field->table->s->table_name.str, field->field_name.str, false};
+
                  auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
                  if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
                  {
-                    gwi.tableStatisticsMap[tableName][field->field_name.str] = {simpleColumn, {*histogram}};
+                    gwi.tableStatisticsMap[tableName][field->field_name.str] = {*sc, {histogram}};
                  }
                  else
                  {
                    auto columnStatisticsMapIt = tableStatisticsMapIt->second.find(field->field_name.str);
                    if (columnStatisticsMapIt == tableStatisticsMapIt->second.end())
                    {
-                      tableStatisticsMapIt->second[field->field_name.str] = {simpleColumn, {*histogram}};
+                      tableStatisticsMapIt->second[field->field_name.str] = {*sc, {histogram}};
                    }
                    else
                    {
                      auto columnStatisticsVec = columnStatisticsMapIt->second.second;
-                      columnStatisticsVec.push_back(*histogram);
+                      columnStatisticsVec.push_back(histogram);
                    }
                  }
                }
--- a/dbcon/mysql/ha_mcs_impl_if.h
+++ b/dbcon/mysql/ha_mcs_impl_if.h
@@ -116,7 +116,7 @@ typedef std::map<execplan::CalpontSystemCatalog::TableAliasName, std::pair<int,
 typedef std::tr1::unordered_map<TABLE_LIST*, std::vector<COND*>> TableOnExprList;
 typedef std::tr1::unordered_map<TABLE_LIST*, uint> TableOuterJoinMap;
 using ColumnName = std::string;
-using ColumnStatisticsMap = std::unordered_map<ColumnName, std::pair<execplan::SimpleColumn, std::vector<Histogram_json_hb>>>;
+using ColumnStatisticsMap = std::unordered_map<ColumnName, std::pair<execplan::SimpleColumn, std::vector<Histogram_json_hb*>>>;
 using TableStatisticsMap = std::unordered_map<SchemaAndTableName, ColumnStatisticsMap, SchemaAndTableNameHash>;

 // This structure is used to store MDB AST -> CSEP translation context.
--- a/dbcon/mysql/rbo_apply_parallel_ces.cpp
+++ b/dbcon/mysql/rbo_apply_parallel_ces.cpp
@@ -64,7 +64,8 @@ bool someAreForeignTables(execplan::CalpontSelectExecutionPlan& csep)
                     [](const auto& table) { return !table.isColumnstore(); });
 }

-bool someForeignTablesHasStatisticsAndMbIndex(execplan::CalpontSelectExecutionPlan& csep, optimizer::RBOptimizerContext& ctx)
+bool someForeignTablesHasStatisticsAndMbIndex(execplan::CalpontSelectExecutionPlan& csep,
+                                              optimizer::RBOptimizerContext& ctx)
 {
  return std::any_of(
      csep.tableList().begin(), csep.tableList().end(),
@@ -100,12 +101,13 @@ execplan::ParseTree* filtersWithNewRange(execplan::SCSEP& csep, execplan::Simple
  // There is a question with ownership of the const column
  // TODO here we lost upper bound value if predicate is not changed to weak lt
  execplan::SOP ltOp = (isLast) ? boost::make_shared<execplan::Operator>(execplan::PredicateOperator("<="))
-                               : boost::make_shared<execplan::Operator>(execplan::PredicateOperator("<"));
+                                : boost::make_shared<execplan::Operator>(execplan::PredicateOperator("<"));
  ltOp->setOpType(filterColLeftOp->resultType(), tableKeyColumnLeftOp->resultType());
  ltOp->resultType(ltOp->operationType());

  auto* sfr = new execplan::SimpleFilter(ltOp, tableKeyColumnLeftOp, filterColLeftOp);
-  // TODO new
+  // TODO new 
+  // TODO remove new and re-use tableKeyColumnLeftOp
  auto tableKeyColumnRightOp = new execplan::SimpleColumn(column);
  tableKeyColumnRightOp->resultType(column.resultType());
  // TODO hardcoded column type and value
@@ -173,63 +175,74 @@ execplan::SimpleColumn* findSuitableKeyColumn(execplan::CalpontSelectExecutionPl
 }

 // TBD
-Histogram_json_hb& chooseStatisticsToUse(std::vector<Histogram_json_hb>& columnStatisticsVec)
+Histogram_json_hb* chooseStatisticsToUse(std::vector<Histogram_json_hb*>& columnStatisticsVec)
 {
  return columnStatisticsVec.front();
 }

+// Looking for a projected column that comes first in an available index and has EI statistics
+// INV nullptr signifies that no suitable column was found
+std::optional<std::pair<execplan::SimpleColumn&, Histogram_json_hb*>> chooseKeyColumnAndStatistics(
+    execplan::CalpontSystemCatalog::TableAliasName& targetTable, optimizer::RBOptimizerContext& ctx)
+{
+  cal_impl_if::SchemaAndTableName schemaAndTableName = {targetTable.schema, targetTable.table};
+
+  auto tableColumnsStatisticsIt = ctx.gwi.tableStatisticsMap.find(schemaAndTableName);
+  if (tableColumnsStatisticsIt == ctx.gwi.tableStatisticsMap.end() ||
+      tableColumnsStatisticsIt->second.empty())
+  {
+    return std::nullopt;
+  }
+
+  // TODO take some column and some stats for it!!!
+  for (auto& [columnName, scAndStatisticsVec] : tableColumnsStatisticsIt->second)
+  {
+    auto& [sc, columnStatisticsVec] = scAndStatisticsVec;
+    auto* columnStatistics = chooseStatisticsToUse(columnStatisticsVec);
+    return {{sc, columnStatistics}};
+  }
+
+  return std::nullopt;
+}
+
 // Populates range bounds based on column statistics
 // Returns optional with bounds if successful, nullopt otherwise
 template <typename T>
-std::optional<FilterRangeBounds<T>> populateRangeBounds(execplan::SimpleColumn* keyColumn,
-                                                        optimizer::RBOptimizerContext& ctx)
+std::optional<FilterRangeBounds<T>> populateRangeBounds(Histogram_json_hb* columnStatistics)
 {
-  cal_impl_if::SchemaAndTableName schemaAndTableName = {keyColumn->schemaName(), keyColumn->tableName()};
-  auto tableColumnsStatisticsIt = ctx.gwi.tableStatisticsMap.find(schemaAndTableName);
-  if (tableColumnsStatisticsIt == ctx.gwi.tableStatisticsMap.end())
-  {
-    return std::nullopt;
-  }
-
-  auto columnStatisticsIt = tableColumnsStatisticsIt->second.find(keyColumn->columnName());
-  if (columnStatisticsIt == tableColumnsStatisticsIt->second.end())
-  {
-    return std::nullopt;
-  }
-
-  auto& [simpleColumn, columnStatisticsVec] = columnStatisticsIt->second;
-  auto& columnStatistics = chooseStatisticsToUse(columnStatisticsVec);
+  FilterRangeBounds<T> bounds;

  // TODO configurable parallel factor via session variable
  // NB now histogram size is the way to control parallel factor with 16 being the maximum
-  size_t numberOfUnionUnits = std::min(columnStatistics.get_json_histogram().size(), MaxParallelFactor);
-  size_t numberOfBucketsPerUnionUnit = columnStatistics.get_json_histogram().size() / numberOfUnionUnits;
+  std::cout << "populateRangeBounds() columnStatistics->buckets.size() " << columnStatistics->get_json_histogram().size()
+            << std::endl;
+  size_t numberOfUnionUnits = std::min(columnStatistics->get_json_histogram().size(), MaxParallelFactor);
+  size_t numberOfBucketsPerUnionUnit = columnStatistics->get_json_histogram().size() / numberOfUnionUnits;

  std::cout << "Number of union units: " << numberOfUnionUnits << std::endl;
  std::cout << "Number of buckets per union unit: " << numberOfBucketsPerUnionUnit << std::endl;

-  FilterRangeBounds<T> bounds;
-
  // Loop over buckets to produce filter ranges
  // NB Currently Histogram_json_hb has the last bucket that has end as its start
  for (size_t i = 0; i < numberOfUnionUnits - 1; ++i)
  {
-    auto bucket = columnStatistics.get_json_histogram().begin() + i * numberOfBucketsPerUnionUnit;
-    auto endBucket = columnStatistics.get_json_histogram().begin() + (i + 1) * numberOfBucketsPerUnionUnit;
+    auto bucket = columnStatistics->get_json_histogram().begin() + i * numberOfBucketsPerUnionUnit;
+    auto endBucket = columnStatistics->get_json_histogram().begin() + (i + 1) * numberOfBucketsPerUnionUnit;
    T currentLowerBound = *(uint32_t*)bucket->start_value.data();
    T currentUpperBound = *(uint32_t*)endBucket->start_value.data();
    bounds.push_back({currentLowerBound, currentUpperBound});
  }
-  for (auto& bucket : columnStatistics.get_json_histogram())
+  for (auto& bucket : columnStatistics->get_json_histogram())
  {
    T currentLowerBound = *(uint32_t*)bucket.start_value.data();
    std::cout << "Bucket: " << currentLowerBound << std::endl;
  }
-  // auto penultimateBucket = columnStatistics.get_json_histogram().begin() + numberOfUnionUnits * numberOfBucketsPerUnionUnit;
-  // T currentLowerBound = *(uint32_t*)penultimateBucket->start_value.data();
-  // T currentUpperBound = *(uint32_t*)columnStatistics.get_last_bucket_end_endp().data();
+  // auto penultimateBucket = columnStatistics.get_json_histogram().begin() + numberOfUnionUnits *
+  // numberOfBucketsPerUnionUnit; T currentLowerBound = *(uint32_t*)penultimateBucket->start_value.data(); T
+  // currentUpperBound = *(uint32_t*)columnStatistics.get_last_bucket_end_endp().data();
  // bounds.push_back({currentLowerBound, currentUpperBound});

+
  for (auto& bound : bounds)
  {
    std::cout << "Bound: " << bound.first << " " << bound.second << std::endl;
@@ -247,14 +260,19 @@ execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable(

  // SC type controls an integral type used to produce suitable filters. The continuation of this function
  // should become a template function based on SC type.
-  execplan::SimpleColumn* keyColumn = findSuitableKeyColumn(csep, table, ctx);
-  if (!keyColumn)
+  auto keyColumnAndStatistics = chooseKeyColumnAndStatistics(table, ctx);
+  if (!keyColumnAndStatistics)
  {
    return unionVec;
  }

+  auto& [keyColumn, columnStatistics] = keyColumnAndStatistics.value();
+
+  std::cout << "makeUnionFromTable keyColumn " << keyColumn.toString() << std::endl;
+  std::cout << "makeUnionFromTable RC front " << csep.returnedCols().front()->toString() << std::endl;
+
  // TODO char and other numerical types support
-  auto boundsOpt = populateRangeBounds<uint64_t>(keyColumn, ctx);
+  auto boundsOpt = populateRangeBounds<uint64_t>(columnStatistics);
  if (!boundsOpt.has_value())
  {
    return unionVec;
@@ -269,15 +287,24 @@ execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable(
    {
      auto clonedCSEP = csep.cloneForTableWORecursiveSelects(table);
      // Add BETWEEN based on key column range
-      clonedCSEP->filters(filtersWithNewRange(clonedCSEP, *keyColumn, bounds[i], false));
+      auto filter = filtersWithNewRange(clonedCSEP, keyColumn, bounds[i], false);
+      clonedCSEP->filters(filter);
+      // To create CES filter we need to have a column in the column map
+      clonedCSEP->columnMap().insert({keyColumn.columnName(), execplan::SRCP(keyColumn.clone())});
      unionVec.push_back(clonedCSEP);
    }
  }
  // This last bound produces low <= col <= high
-  auto clonedCSEP = csep.cloneForTableWORecursiveSelects(table);
-  clonedCSEP->filters(filtersWithNewRange(clonedCSEP, *keyColumn, bounds.back(), true));
-  unionVec.push_back(clonedCSEP);
-
+  // TODO add NULLs into filter of the last step
+  if (!bounds.empty())
+  {
+    auto clonedCSEP = csep.cloneForTableWORecursiveSelects(table);
+    auto filter = filtersWithNewRange(clonedCSEP, keyColumn, bounds.back(), true);
+    clonedCSEP->columnMap().insert({keyColumn.columnName(), execplan::SRCP(keyColumn.clone())});
+    clonedCSEP->filters(filter);
+    unionVec.push_back(clonedCSEP);
+  }
+  
  return unionVec;
 }
 bool applyParallelCES(execplan::CalpontSelectExecutionPlan& csep, RBOptimizerContext& ctx)