
chore(rbo,rules,QA): new extractColumnStatistics, some comments, and a bit of refactoring.

drrtuy
2025-08-05 14:56:14 +00:00
parent c030ff4224
commit c19c49ba13
2 changed files with 67 additions and 57 deletions


@@ -2673,8 +2673,7 @@ CalpontSystemCatalog::ColType colType_MysqlToIDB(const Field* field)
ct.colWidth = 8;
break;
- case STRING_RESULT:
-   ct.colDataType = CalpontSystemCatalog::VARCHAR;
+ case STRING_RESULT: ct.colDataType = CalpontSystemCatalog::VARCHAR;
default:
IDEBUG(cerr << "colType_MysqlToIDB:: Unknown result type of MySQL " << item->result_type() << endl);
@@ -5205,6 +5204,64 @@ void setExecutionParams(gp_walk_info& gwi, SCSEP& csep)
csep->umMemLimit(get_um_mem_limit(gwi.thd) * 1024ULL * 1024);
}
// Loop over available indexes to find and extract corresponding EI column statistics
// for the first column of the index if any.
// Statistics is stored in GWI context.
// Mock for ES 10.6
// TODO clean up extra logging when the feature is ready
#if MYSQL_VERSION_ID >= 110401
void extractColumnStatistics(TABLE_LIST* table_ptr, gp_walk_info& gwi)
{
for (uint j = 0; j < table_ptr->table->s->keys; j++)
{
{
Field* field = table_ptr->table->key_info[j].key_part[0].field;
std::cout << "j index " << j << " i column " << 0 << " fieldnr "
<< table_ptr->table->key_info[j].key_part[0].fieldnr << " " << field->field_name.str;
if (field->read_stats)
{
auto* histogram = dynamic_cast<Histogram_json_hb*>(field->read_stats->histogram);
if (histogram)
{
std::cout << " has stats with " << histogram->buckets.size() << " buckets";
SchemaAndTableName tableName = {field->table->s->db.str, field->table->s->table_name.str};
auto* sc = buildSimpleColumnFromFieldForStatistics(field, gwi);
std::cout << "sc with stats !!!!! " << sc->toString() << std::endl;
auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
{
gwi.tableStatisticsMap[tableName][field->field_name.str] = {*sc, {histogram}};
}
else
{
auto columnStatisticsMapIt = tableStatisticsMapIt->second.find(field->field_name.str);
if (columnStatisticsMapIt == tableStatisticsMapIt->second.end())
{
tableStatisticsMapIt->second[field->field_name.str] = {*sc, {histogram}};
}
else
{
auto columnStatisticsVec = columnStatisticsMapIt->second.second;
columnStatisticsVec.push_back(histogram);
}
}
}
else
{
std::cout << " no stats ";
}
}
std::cout << std::endl;
}
}
}
#else
void extractColumnStatistics(Item_field* /*ifp*/, gp_walk_info& /*gwi*/)
{
}
#endif
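
The helper above is mostly bookkeeping on gwi.tableStatisticsMap: create the per-table entry if it is missing, create the per-column entry if it is missing, otherwise append another histogram to the column's list. Below is a minimal, self-contained sketch of that insert-or-append logic; the stand-in types and the addColumnStatistics name are hypothetical (only the nested map shape is read off the diff), and the real code uses Histogram_json_hb and execplan::SimpleColumn. Note that the final branch takes the vector by reference; a plain auto copy, as in the corresponding branch above, makes push_back modify only a temporary.

#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

// Hypothetical stand-ins for the engine/server types used in the diff.
struct Histogram {};                        // plays the role of Histogram_json_hb
struct SimpleColumn { std::string name; };  // plays the role of execplan::SimpleColumn

struct SchemaAndTableName
{
  std::string schema;
  std::string table;
  bool operator<(const SchemaAndTableName& o) const
  {
    return std::tie(schema, table) < std::tie(o.schema, o.table);
  }
};

// column name -> (column descriptor, histograms collected for that column)
using ColumnStatisticsMap = std::map<std::string, std::pair<SimpleColumn, std::vector<Histogram*>>>;
// schema.table -> per-column statistics
using TableStatisticsMap = std::map<SchemaAndTableName, ColumnStatisticsMap>;

void addColumnStatistics(TableStatisticsMap& tableStatisticsMap, const SchemaAndTableName& table,
                         const SimpleColumn& sc, Histogram* histogram)
{
  auto tableIt = tableStatisticsMap.find(table);
  if (tableIt == tableStatisticsMap.end())
  {
    tableStatisticsMap[table][sc.name] = {sc, {histogram}};
    return;
  }
  auto columnIt = tableIt->second.find(sc.name);
  if (columnIt == tableIt->second.end())
  {
    tableIt->second[sc.name] = {sc, {histogram}};
    return;
  }
  // Append to the existing vector; taking it by reference is what makes
  // the push_back visible to later lookups.
  auto& histograms = columnIt->second.second;
  histograms.push_back(histogram);
}
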
/*@brief Process FROM part of the query or sub-query */
/***********************************************************
* DESCRIPTION:
@@ -5302,51 +5359,8 @@ int processFrom(bool& isUnion, SELECT_LEX& select_lex, gp_walk_info& gwi, SCSEP&
}
else
{
for (uint j = 0; j < table_ptr->table->s->keys; j++)
{
{
Field* field = table_ptr->table->key_info[j].key_part[0].field;
std::cout << "j index " << j << " i column " << 0 << " fieldnr "
<< table_ptr->table->key_info[j].key_part[0].fieldnr << " " << field->field_name.str;
if (field->read_stats)
{
auto* histogram = dynamic_cast<Histogram_json_hb*>(field->read_stats->histogram);
if (histogram)
{
std::cout << " has stats with " << histogram->buckets.size() << " buckets";
SchemaAndTableName tableName = {field->table->s->db.str, field->table->s->table_name.str};
auto* sc = buildSimpleColumnFromFieldForStatistics(field, gwi);
std::cout << "sc with stats !!!!! " << sc->toString() << std::endl;
// execplan::SimpleColumn simpleColumn = {
// field->table->s->db.str, field->table->s->table_name.str, field->field_name.str, false};
auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
{
gwi.tableStatisticsMap[tableName][field->field_name.str] = {*sc, {histogram}};
}
else
{
auto columnStatisticsMapIt = tableStatisticsMapIt->second.find(field->field_name.str);
if (columnStatisticsMapIt == tableStatisticsMapIt->second.end())
{
tableStatisticsMapIt->second[field->field_name.str] = {*sc, {histogram}};
}
else
{
auto columnStatisticsVec = columnStatisticsMapIt->second.second;
columnStatisticsVec.push_back(histogram);
}
}
}
else
{
std::cout << " no stats ";
}
}
std::cout << std::endl;
}
}
// TODO move extractColumnStatistics up when statistics is supported in MCS
extractColumnStatistics(table_ptr, gwi);
}
string table_name = table_ptr->table_name.str;
@@ -6557,7 +6571,6 @@ int processSelect(SELECT_LEX& select_lex, gp_walk_info& gwi, SCSEP& csep, vector
case Item::FIELD_ITEM:
{
Item_field* ifp = (Item_field*)item;
// extractColumnStatistics(ifp, gwi);
// Handle * case
if (ifp->field_name.length && string(ifp->field_name.str) == "*")
{


@@ -79,11 +79,8 @@ bool someForeignTablesHasStatisticsAndMbIndex(execplan::CalpontSelectExecutionPl
bool parallelCESFilter(execplan::CalpontSelectExecutionPlan& csep, optimizer::RBOptimizerContext& ctx)
{
auto tables = csep.tableList();
// This is leaf and there are no other tables at this level in neither UNION, nor derived table.
// TODO filter out CSEPs with orderBy, groupBy, having
// TODO filter out CSEPs with orderBy, groupBy, having || or clean up OB,GB,HAVING cloning CSEP
// Filter out tables that were re-written.
// return tables.size() == 1 && !tables[0].isColumnstore() && !tableIsInUnion(tables[0], csep);
return someAreForeignTables(csep) && someForeignTablesHasStatisticsAndMbIndex(csep, ctx);
}
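
As a reading aid, parallelCESFilter only decides whether the rule applies; the rewrite itself happens in applyParallelCES below. The gate's shape is: at least one table in the plan is foreign (non-ColumnStore), and at least one of those foreign tables comes with usable extracted statistics and an index. A self-contained illustration under those assumptions, where TableRef and eligibleForParallelCES are purely illustrative stand-ins for the CSEP table list and for the two predicates composed in the return statement above:

#include <string>
#include <vector>

// Illustrative stand-ins only; the real checks work on CalpontSelectExecutionPlan
// and the statistics collected into gwi.tableStatisticsMap by extractColumnStatistics.
struct TableRef
{
  std::string schema;
  std::string table;
  bool isColumnstore;
  bool hasHistogram;  // an EI histogram was extracted for an index column of this table
};

bool eligibleForParallelCES(const std::vector<TableRef>& tables)
{
  bool anyForeign = false;
  bool anyForeignWithStats = false;
  for (const auto& t : tables)
  {
    anyForeign = anyForeign || !t.isColumnstore;
    anyForeignWithStats = anyForeignWithStats || (!t.isColumnstore && t.hasHistogram);
  }
  // Mirrors someAreForeignTables(csep) && someForeignTablesHasStatisticsAndMbIndex(csep, ctx).
  return anyForeign && anyForeignWithStats;
}
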
@@ -106,7 +103,7 @@ execplan::ParseTree* filtersWithNewRange(execplan::SCSEP& csep, execplan::Simple
ltOp->resultType(ltOp->operationType());
auto* sfr = new execplan::SimpleFilter(ltOp, tableKeyColumnLeftOp, filterColLeftOp);
// TODO new
// TODO new
// TODO remove new and re-use tableKeyColumnLeftOp
auto tableKeyColumnRightOp = new execplan::SimpleColumn(column);
tableKeyColumnRightOp->resultType(column.resultType());
@@ -214,8 +211,8 @@ std::optional<FilterRangeBounds<T>> populateRangeBounds(Histogram_json_hb* colum
// TODO configurable parallel factor via session variable
// NB now histogram size is the way to control parallel factor with 16 being the maximum
std::cout << "populateRangeBounds() columnStatistics->buckets.size() " << columnStatistics->get_json_histogram().size()
<< std::endl;
std::cout << "populateRangeBounds() columnStatistics->buckets.size() "
<< columnStatistics->get_json_histogram().size() << std::endl;
size_t numberOfUnionUnits = std::min(columnStatistics->get_json_histogram().size(), MaxParallelFactor);
size_t numberOfBucketsPerUnionUnit = columnStatistics->get_json_histogram().size() / numberOfUnionUnits;
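
The two lines above derive the degree of parallelism from the histogram: the number of union units is capped (16 being the current maximum, per the note above), and each unit then covers an equal integer share of the buckets. A small worked example with a hypothetical 100-bucket histogram:

#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
  const size_t MaxParallelFactor = 16;  // current maximum, per the comment above
  const size_t buckets = 100;           // hypothetical histogram size

  // Same arithmetic as in populateRangeBounds().
  size_t numberOfUnionUnits = std::min(buckets, MaxParallelFactor);   // 16
  size_t numberOfBucketsPerUnionUnit = buckets / numberOfUnionUnits;  // 6

  // 16 units of 6 buckets cover 96 buckets; the remaining 4 buckets fall beyond the
  // last computed bound, which appears to be what the commented-out "last upper
  // range bound" handling further down is about.
  std::cout << numberOfUnionUnits << " units x " << numberOfBucketsPerUnionUnit << " buckets, "
            << buckets - numberOfUnionUnits * numberOfBucketsPerUnionUnit << " buckets left over\n";
  return 0;
}
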
@@ -237,12 +234,12 @@ std::optional<FilterRangeBounds<T>> populateRangeBounds(Histogram_json_hb* colum
T currentLowerBound = *(uint32_t*)bucket.start_value.data();
std::cout << "Bucket: " << currentLowerBound << std::endl;
}
// TODO leave this here b/c there is a corresponding JIRA about the last upper range bound.
// auto penultimateBucket = columnStatistics.get_json_histogram().begin() + numberOfUnionUnits *
// numberOfBucketsPerUnionUnit; T currentLowerBound = *(uint32_t*)penultimateBucket->start_value.data(); T
// currentUpperBound = *(uint32_t*)columnStatistics.get_last_bucket_end_endp().data();
// bounds.push_back({currentLowerBound, currentUpperBound});
for (auto& bound : bounds)
{
std::cout << "Bound: " << bound.first << " " << bound.second << std::endl;
@@ -304,7 +301,7 @@ execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable(
clonedCSEP->filters(filter);
unionVec.push_back(clonedCSEP);
}
return unionVec;
}
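
Putting the pieces together, each clone produced by the loop above keeps the original plan but swaps in the filter for one histogram-derived slice, and the clones form the union select list returned to the caller. A rough sketch of the resulting shape; the key column name, the concrete bounds, the half-open range semantics, and the SQL rendering are illustrative rather than taken from the code:

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Illustrative only: the real code clones the CSEP and attaches the ParseTree built by
// filtersWithNewRange(); here each slice is rendered as the WHERE clause it corresponds to.
std::vector<std::string> sliceFilters(const std::string& keyColumn,
                                      const std::vector<std::pair<uint64_t, uint64_t>>& bounds)
{
  std::vector<std::string> filters;
  for (const auto& [lower, upper] : bounds)
  {
    // Assuming half-open [lower, upper) ranges, which the lt operator above suggests.
    filters.push_back(keyColumn + " >= " + std::to_string(lower) + " AND " + keyColumn + " < " +
                      std::to_string(upper));
  }
  return filters;
}

// With bounds {0,100}, {100,200}, {200,300} this yields the three WHERE clauses of
//   SELECT ... WHERE k >= 0   AND k < 100
//   UNION ALL SELECT ... WHERE k >= 100 AND k < 200
//   UNION ALL SELECT ... WHERE k >= 200 AND k < 300
// i.e. one cloned CSEP per histogram-derived slice.
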
bool applyParallelCES(execplan::CalpontSelectExecutionPlan& csep, RBOptimizerContext& ctx)