feat(rbo,rules,QA): refactored statistics storage

2025-11-02 06:13:16 +03:00 · 2025-07-31 12:50:24 +00:00
parent 112ba9f162
commit e167082497
3 changed files with 71 additions and 41 deletions
--- a/dbcon/mysql/ha_mcs_execplan.cpp
+++ b/dbcon/mysql/ha_mcs_execplan.cpp
@@ -5227,8 +5227,28 @@ int processFrom(bool& isUnion, SELECT_LEX& select_lex, gp_walk_info& gwi, SCSEP&
                {
                  std::cout << " has stats ";
                  SchemaAndTableName tableName = {field->table->s->db.str,
-                    field->table->s->table_name.str}; 
-                  gwi.tableStatisticsMap[tableName][field->field_name.str] = *histogram;
+                    field->table->s->table_name.str};
+                  execplan::SimpleColumn simpleColumn = {field->table->s->db.str,
+                    field->table->s->table_name.str,
+                    field->field_name.str};
+                  auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
+                  if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
+                  {  // no entry for this table yet: create table and column entries
+                    gwi.tableStatisticsMap[tableName][field->field_name.str] = {simpleColumn, {*histogram}};
+                  }
+                  else
+                  {
+                    auto columnStatisticsMapIt = tableStatisticsMapIt->second.find(field->field_name.str);
+                    if (columnStatisticsMapIt == tableStatisticsMapIt->second.end())
+                    {  // table entry exists, but this column has no histograms yet
+                      tableStatisticsMapIt->second[field->field_name.str] = {simpleColumn, {*histogram}};
+                    }
+                    else
+                    {
+                      // Append in place: copying the vector into a local first would discard the new histogram.
+                      columnStatisticsMapIt->second.second.push_back(*histogram);
+                    }
+                  }
                }
                else
                {
@@ -6321,43 +6341,43 @@ int processLimitAndOffset(SELECT_LEX& select_lex, gp_walk_info& gwi, SCSEP& csep
 // for the first column of the index if any.
 // Statistics is stored in GWI context.
 // Mock for ES 10.6
-#if MYSQL_VERSION_ID >= 120401
-void extractColumnStatistics(Item_field* ifp, gp_walk_info& gwi)
-{
-  for (uint j = 0; j < ifp->field->table->s->keys; j++)
-  {
-    for (uint i = 0; i < ifp->field->table->s->key_info[j].usable_key_parts; i++)
-    {
-      if (ifp->field->table->s->key_info[j].key_part[i].fieldnr == ifp->field->field_index + 1)
-      {
-        if (i == 0 && ifp->field->read_stats)
-        {
-          assert(ifp->field->table->s);
-          auto* histogram = dynamic_cast<Histogram_json_hb*>(ifp->field->read_stats->histogram);
-          if (histogram)
-          {
-            SchemaAndTableName tableName = {ifp->field->table->s->db.str,
-                                            ifp->field->table->s->table_name.str};
-            auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
-            if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
-            {
-              gwi.tableStatisticsMap.insert({tableName, {{ifp->field->field_name.str, *histogram}}});
-            }
-            else
-            {
-              tableStatisticsMapIt->second.insert({ifp->field->field_name.str, *histogram});
-            }
-          }
-        }
-      }
-    }
-  }
-}
-#else
-void extractColumnStatistics(Item_field* /*ifp*/, gp_walk_info& /*gwi*/)
-{
-}
-#endif
+// #if MYSQL_VERSION_ID >= 120401
+// void extractColumnStatistics(Item_field* ifp, gp_walk_info& gwi)
+// {
+//   for (uint j = 0; j < ifp->field->table->s->keys; j++)
+//   {
+//     for (uint i = 0; i < ifp->field->table->s->key_info[j].usable_key_parts; i++)
+//     {
+//       if (ifp->field->table->s->key_info[j].key_part[i].fieldnr == ifp->field->field_index + 1)
+//       {
+//         if (i == 0 && ifp->field->read_stats)
+//         {
+//           assert(ifp->field->table->s);
+//           auto* histogram = dynamic_cast<Histogram_json_hb*>(ifp->field->read_stats->histogram);
+//           if (histogram)
+//           {
+//             SchemaAndTableName tableName = {ifp->field->table->s->db.str,
+//                                             ifp->field->table->s->table_name.str};
+//             auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
+//             if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
+//             {
+//               gwi.tableStatisticsMap.insert({tableName, {{ifp->field->field_name.str, *histogram}}});
+//             }
+//             else
+//             {
+//               tableStatisticsMapIt->second.insert({ifp->field->field_name.str, *histogram});
+//             }
+//           }
+//         }
+//       }
+//     }
+//   }
+// }
+// #else
+// void extractColumnStatistics(Item_field* /*ifp*/, gp_walk_info& /*gwi*/)
+// {
+// }
+// #endif

 /*@brief  Process SELECT part of a query or sub-query      */
 /***********************************************************
--- a/dbcon/mysql/ha_mcs_impl_if.h
+++ b/dbcon/mysql/ha_mcs_impl_if.h
@@ -116,7 +116,7 @@ typedef std::map<execplan::CalpontSystemCatalog::TableAliasName, std::pair<int,
 typedef std::tr1::unordered_map<TABLE_LIST*, std::vector<COND*>> TableOnExprList;
 typedef std::tr1::unordered_map<TABLE_LIST*, uint> TableOuterJoinMap;
 using ColumnName = std::string;
-using ColumnStatisticsMap = std::unordered_map<ColumnName, Histogram_json_hb>;
+using ColumnStatisticsMap = std::unordered_map<ColumnName, std::pair<execplan::SimpleColumn, std::vector<Histogram_json_hb>>>;
 using TableStatisticsMap = std::unordered_map<SchemaAndTableName, ColumnStatisticsMap, SchemaAndTableNameHash>;

 // This structure is used to store MDB AST -> CSEP translation context.
--- a/dbcon/mysql/rbo_apply_parallel_ces.cpp
+++ b/dbcon/mysql/rbo_apply_parallel_ces.cpp
@@ -105,6 +105,7 @@ execplan::ParseTree* filtersWithNewRange(execplan::SCSEP& csep, execplan::Simple
  ltOp->resultType(ltOp->operationType());

  auto* sfr = new execplan::SimpleFilter(ltOp, tableKeyColumnLeftOp, filterColLeftOp);
+  // TODO new
  auto tableKeyColumnRightOp = new execplan::SimpleColumn(column);
  tableKeyColumnRightOp->resultType(column.resultType());
  // TODO hardcoded column type and value
@@ -114,8 +115,10 @@ execplan::ParseTree* filtersWithNewRange(execplan::SCSEP& csep, execplan::Simple
  gtOp->setOpType(filterColRightOp->resultType(), tableKeyColumnRightOp->resultType());
  gtOp->resultType(gtOp->operationType());

+  // TODO new
  auto* sfl = new execplan::SimpleFilter(gtOp, tableKeyColumnRightOp, filterColRightOp);

+  // TODO new
  execplan::ParseTree* ptp = new execplan::ParseTree(new execplan::LogicOperator("and"));
  ptp->right(sfr);
  ptp->left(sfl);
@@ -169,6 +172,12 @@ execplan::SimpleColumn* findSuitableKeyColumn(execplan::CalpontSelectExecutionPl
  return nullptr;
 }

+// TBD: selection policy is not decided yet; currently returns the first histogram (the vector must be non-empty).
+Histogram_json_hb& chooseStatisticsToUse(std::vector<Histogram_json_hb>& columnStatisticsVec)
+{
+  return columnStatisticsVec.front();
+}
+
 // Populates range bounds based on column statistics
 // Returns optional with bounds if successful, nullopt otherwise
 template <typename T>
@@ -188,7 +197,8 @@ std::optional<FilterRangeBounds<T>> populateRangeBounds(execplan::SimpleColumn*
    return std::nullopt;
  }

-  auto columnStatistics = columnStatisticsIt->second;
+  auto& [simpleColumn, columnStatisticsVec] = columnStatisticsIt->second;
+  auto& columnStatistics = chooseStatisticsToUse(columnStatisticsVec);

  // TODO configurable parallel factor via session variable
  // NB now histogram size is the way to control parallel factor with 16 being the maximum