1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-11-02 06:13:16 +03:00

feat(rbo,rules,QA): refactored statistics storage

This commit is contained in:
drrtuy
2025-07-31 12:50:24 +00:00
parent 112ba9f162
commit e167082497
3 changed files with 71 additions and 41 deletions

View File

@@ -5227,8 +5227,28 @@ int processFrom(bool& isUnion, SELECT_LEX& select_lex, gp_walk_info& gwi, SCSEP&
{
std::cout << " has stats ";
SchemaAndTableName tableName = {field->table->s->db.str,
field->table->s->table_name.str};
gwi.tableStatisticsMap[tableName][field->field_name.str] = *histogram;
field->table->s->table_name.str};
execplan::SimpleColumn simpleColumn = {field->table->s->db.str,
field->table->s->table_name.str,
field->field_name.str};
auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
{
gwi.tableStatisticsMap[tableName][field->field_name.str] = {simpleColumn, {*histogram}};
}
else
{
auto columnStatisticsMapIt = tableStatisticsMapIt->second.find(field->field_name.str);
if (columnStatisticsMapIt == tableStatisticsMapIt->second.end())
{
tableStatisticsMapIt->second[field->field_name.str] = {simpleColumn, {*histogram}};
}
else
{
auto columnStatisticsVec = columnStatisticsMapIt->second.second;
columnStatisticsVec.push_back(*histogram);
}
}
}
else
{
@@ -6321,43 +6341,43 @@ int processLimitAndOffset(SELECT_LEX& select_lex, gp_walk_info& gwi, SCSEP& csep
// for the first column of the index if any.
// Statistics are stored in the GWI context.
// Mock for ES 10.6
#if MYSQL_VERSION_ID >= 120401
// Stores the JSON histogram of the column referenced by ifp in
// gwi.tableStatisticsMap, but only when that column is the first key part of
// at least one index of its table and the field has read_stats attached.
// An entry that already exists for the column is left untouched.
void extractColumnStatistics(Item_field* ifp, gp_walk_info& gwi)
{
  auto* field = ifp->field;
  auto* share = field->table->s;
  // Check the table share before it is dereferenced below.
  assert(share);

  if (!field->read_stats)
  {
    return;
  }

  for (uint j = 0; j < share->keys; j++)
  {
    const auto& keyInfo = share->key_info[j];
    // Only the leading key part of an index qualifies.
    if (keyInfo.usable_key_parts == 0 || keyInfo.key_part[0].fieldnr != field->field_index + 1)
    {
      continue;
    }

    auto* histogram = dynamic_cast<Histogram_json_hb*>(field->read_stats->histogram);
    if (!histogram)
    {
      // The cast result does not depend on the index, so other keys cannot succeed either.
      return;
    }

    SchemaAndTableName tableName = {share->db.str, share->table_name.str};
    auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
    if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
    {
      gwi.tableStatisticsMap.insert({tableName, {{field->field_name.str, *histogram}}});
    }
    else
    {
      tableStatisticsMapIt->second.insert({field->field_name.str, *histogram});
    }
    // insert() never overwrites, so further matching indexes would be no-ops.
    return;
  }
}
#else
// Statistics extraction is unavailable for this server version: no-op.
void extractColumnStatistics(Item_field* /*ifp*/, gp_walk_info& /*gwi*/)
{
}
#endif
// #if MYSQL_VERSION_ID >= 120401
// void extractColumnStatistics(Item_field* ifp, gp_walk_info& gwi)
// {
// for (uint j = 0; j < ifp->field->table->s->keys; j++)
// {
// for (uint i = 0; i < ifp->field->table->s->key_info[j].usable_key_parts; i++)
// {
// if (ifp->field->table->s->key_info[j].key_part[i].fieldnr == ifp->field->field_index + 1)
// {
// if (i == 0 && ifp->field->read_stats)
// {
// assert(ifp->field->table->s);
// auto* histogram = dynamic_cast<Histogram_json_hb*>(ifp->field->read_stats->histogram);
// if (histogram)
// {
// SchemaAndTableName tableName = {ifp->field->table->s->db.str,
// ifp->field->table->s->table_name.str};
// auto tableStatisticsMapIt = gwi.tableStatisticsMap.find(tableName);
// if (tableStatisticsMapIt == gwi.tableStatisticsMap.end())
// {
// gwi.tableStatisticsMap.insert({tableName, {{ifp->field->field_name.str, *histogram}}});
// }
// else
// {
// tableStatisticsMapIt->second.insert({ifp->field->field_name.str, *histogram});
// }
// }
// }
// }
// }
// }
// }
// #else
// void extractColumnStatistics(Item_field* /*ifp*/, gp_walk_info& /*gwi*/)
// {
// }
// #endif
/*@brief Process SELECT part of a query or sub-query */
/***********************************************************

View File

@@ -116,7 +116,7 @@ typedef std::map<execplan::CalpontSystemCatalog::TableAliasName, std::pair<int,
typedef std::tr1::unordered_map<TABLE_LIST*, std::vector<COND*>> TableOnExprList;
typedef std::tr1::unordered_map<TABLE_LIST*, uint> TableOuterJoinMap;
using ColumnName = std::string;
using ColumnStatisticsMap = std::unordered_map<ColumnName, Histogram_json_hb>;
// Per-column statistics keyed by column name: an execplan::SimpleColumn paired
// with a vector of the histograms recorded for that column.
using ColumnStatisticsMap = std::unordered_map<ColumnName, std::pair<execplan::SimpleColumn, std::vector<Histogram_json_hb>>>;
// Per-table statistics keyed by schema and table name.
using TableStatisticsMap = std::unordered_map<SchemaAndTableName, ColumnStatisticsMap, SchemaAndTableNameHash>;
// This structure is used to store MDB AST -> CSEP translation context.

View File

@@ -105,6 +105,7 @@ execplan::ParseTree* filtersWithNewRange(execplan::SCSEP& csep, execplan::Simple
ltOp->resultType(ltOp->operationType());
auto* sfr = new execplan::SimpleFilter(ltOp, tableKeyColumnLeftOp, filterColLeftOp);
// TODO new
auto tableKeyColumnRightOp = new execplan::SimpleColumn(column);
tableKeyColumnRightOp->resultType(column.resultType());
// TODO hardcoded column type and value
@@ -114,8 +115,10 @@ execplan::ParseTree* filtersWithNewRange(execplan::SCSEP& csep, execplan::Simple
gtOp->setOpType(filterColRightOp->resultType(), tableKeyColumnRightOp->resultType());
gtOp->resultType(gtOp->operationType());
// TODO new
auto* sfl = new execplan::SimpleFilter(gtOp, tableKeyColumnRightOp, filterColRightOp);
// TODO new
execplan::ParseTree* ptp = new execplan::ParseTree(new execplan::LogicOperator("and"));
ptp->right(sfr);
ptp->left(sfl);
@@ -169,6 +172,12 @@ execplan::SimpleColumn* findSuitableKeyColumn(execplan::CalpontSelectExecutionPl
return nullptr;
}
// Selects which of the histograms collected for a column is used.
// TBD: the selection policy is a placeholder; the first stored histogram is returned.
// The vector must not be empty.
Histogram_json_hb& chooseStatisticsToUse(std::vector<Histogram_json_hb>& columnStatisticsVec)
{
  auto& firstHistogram = columnStatisticsVec[0];
  return firstHistogram;
}
// Populates range bounds based on column statistics
// Returns optional with bounds if successful, nullopt otherwise
template <typename T>
@@ -188,7 +197,8 @@ std::optional<FilterRangeBounds<T>> populateRangeBounds(execplan::SimpleColumn*
return std::nullopt;
}
auto columnStatistics = columnStatisticsIt->second;
auto& [simpleColumn, columnStatisticsVec] = columnStatisticsIt->second;
auto& columnStatistics = chooseStatisticsToUse(columnStatisticsVec);
// TODO configurable parallel factor via session variable
// NB now histogram size is the way to control parallel factor with 16 being the maximum