1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-10-31 18:30:33 +03:00

feat(rbo,rules,QA): index column type is now derived from the corresponding Field

This commit is contained in:
drrtuy
2025-08-05 14:33:39 +00:00
parent e167082497
commit c030ff4224
4 changed files with 168 additions and 50 deletions

View File

@@ -64,7 +64,8 @@ bool someAreForeignTables(execplan::CalpontSelectExecutionPlan& csep)
[](const auto& table) { return !table.isColumnstore(); });
}
bool someForeignTablesHasStatisticsAndMbIndex(execplan::CalpontSelectExecutionPlan& csep, optimizer::RBOptimizerContext& ctx)
bool someForeignTablesHasStatisticsAndMbIndex(execplan::CalpontSelectExecutionPlan& csep,
optimizer::RBOptimizerContext& ctx)
{
return std::any_of(
csep.tableList().begin(), csep.tableList().end(),
@@ -100,12 +101,13 @@ execplan::ParseTree* filtersWithNewRange(execplan::SCSEP& csep, execplan::Simple
// There is a question with ownership of the const column
// TODO here we lost upper bound value if predicate is not changed to weak lt
execplan::SOP ltOp = (isLast) ? boost::make_shared<execplan::Operator>(execplan::PredicateOperator("<="))
: boost::make_shared<execplan::Operator>(execplan::PredicateOperator("<"));
: boost::make_shared<execplan::Operator>(execplan::PredicateOperator("<"));
ltOp->setOpType(filterColLeftOp->resultType(), tableKeyColumnLeftOp->resultType());
ltOp->resultType(ltOp->operationType());
auto* sfr = new execplan::SimpleFilter(ltOp, tableKeyColumnLeftOp, filterColLeftOp);
// TODO new
// TODO new
// TODO remove new and re-use tableKeyColumnLeftOp
auto tableKeyColumnRightOp = new execplan::SimpleColumn(column);
tableKeyColumnRightOp->resultType(column.resultType());
// TODO hardcoded column type and value
@@ -173,63 +175,74 @@ execplan::SimpleColumn* findSuitableKeyColumn(execplan::CalpontSelectExecutionPl
}
// TBD
Histogram_json_hb& chooseStatisticsToUse(std::vector<Histogram_json_hb>& columnStatisticsVec)
Histogram_json_hb* chooseStatisticsToUse(std::vector<Histogram_json_hb*>& columnStatisticsVec)
{
return columnStatisticsVec.front();
}
// Looking for a projected column that comes first in an available index and has EI statistics
// INV nullptr signifies that no suitable column was found
std::optional<std::pair<execplan::SimpleColumn&, Histogram_json_hb*>> chooseKeyColumnAndStatistics(
execplan::CalpontSystemCatalog::TableAliasName& targetTable, optimizer::RBOptimizerContext& ctx)
{
cal_impl_if::SchemaAndTableName schemaAndTableName = {targetTable.schema, targetTable.table};
auto tableColumnsStatisticsIt = ctx.gwi.tableStatisticsMap.find(schemaAndTableName);
if (tableColumnsStatisticsIt == ctx.gwi.tableStatisticsMap.end() ||
tableColumnsStatisticsIt->second.empty())
{
return std::nullopt;
}
// TODO take some column and some stats for it!!!
for (auto& [columnName, scAndStatisticsVec] : tableColumnsStatisticsIt->second)
{
auto& [sc, columnStatisticsVec] = scAndStatisticsVec;
auto* columnStatistics = chooseStatisticsToUse(columnStatisticsVec);
return {{sc, columnStatistics}};
}
return std::nullopt;
}
// Populates range bounds based on column statistics
// Returns optional with bounds if successful, nullopt otherwise
template <typename T>
std::optional<FilterRangeBounds<T>> populateRangeBounds(execplan::SimpleColumn* keyColumn,
optimizer::RBOptimizerContext& ctx)
std::optional<FilterRangeBounds<T>> populateRangeBounds(Histogram_json_hb* columnStatistics)
{
cal_impl_if::SchemaAndTableName schemaAndTableName = {keyColumn->schemaName(), keyColumn->tableName()};
auto tableColumnsStatisticsIt = ctx.gwi.tableStatisticsMap.find(schemaAndTableName);
if (tableColumnsStatisticsIt == ctx.gwi.tableStatisticsMap.end())
{
return std::nullopt;
}
auto columnStatisticsIt = tableColumnsStatisticsIt->second.find(keyColumn->columnName());
if (columnStatisticsIt == tableColumnsStatisticsIt->second.end())
{
return std::nullopt;
}
auto& [simpleColumn, columnStatisticsVec] = columnStatisticsIt->second;
auto& columnStatistics = chooseStatisticsToUse(columnStatisticsVec);
FilterRangeBounds<T> bounds;
// TODO configurable parallel factor via session variable
// NB now histogram size is the way to control parallel factor with 16 being the maximum
size_t numberOfUnionUnits = std::min(columnStatistics.get_json_histogram().size(), MaxParallelFactor);
size_t numberOfBucketsPerUnionUnit = columnStatistics.get_json_histogram().size() / numberOfUnionUnits;
std::cout << "populateRangeBounds() columnStatistics->buckets.size() " << columnStatistics->get_json_histogram().size()
<< std::endl;
size_t numberOfUnionUnits = std::min(columnStatistics->get_json_histogram().size(), MaxParallelFactor);
size_t numberOfBucketsPerUnionUnit = columnStatistics->get_json_histogram().size() / numberOfUnionUnits;
std::cout << "Number of union units: " << numberOfUnionUnits << std::endl;
std::cout << "Number of buckets per union unit: " << numberOfBucketsPerUnionUnit << std::endl;
FilterRangeBounds<T> bounds;
// Loop over buckets to produce filter ranges
// NB Currently Histogram_json_hb has the last bucket that has end as its start
for (size_t i = 0; i < numberOfUnionUnits - 1; ++i)
{
auto bucket = columnStatistics.get_json_histogram().begin() + i * numberOfBucketsPerUnionUnit;
auto endBucket = columnStatistics.get_json_histogram().begin() + (i + 1) * numberOfBucketsPerUnionUnit;
auto bucket = columnStatistics->get_json_histogram().begin() + i * numberOfBucketsPerUnionUnit;
auto endBucket = columnStatistics->get_json_histogram().begin() + (i + 1) * numberOfBucketsPerUnionUnit;
T currentLowerBound = *(uint32_t*)bucket->start_value.data();
T currentUpperBound = *(uint32_t*)endBucket->start_value.data();
bounds.push_back({currentLowerBound, currentUpperBound});
}
for (auto& bucket : columnStatistics.get_json_histogram())
for (auto& bucket : columnStatistics->get_json_histogram())
{
T currentLowerBound = *(uint32_t*)bucket.start_value.data();
std::cout << "Bucket: " << currentLowerBound << std::endl;
}
// auto penultimateBucket = columnStatistics.get_json_histogram().begin() + numberOfUnionUnits * numberOfBucketsPerUnionUnit;
// T currentLowerBound = *(uint32_t*)penultimateBucket->start_value.data();
// T currentUpperBound = *(uint32_t*)columnStatistics.get_last_bucket_end_endp().data();
// auto penultimateBucket = columnStatistics.get_json_histogram().begin() + numberOfUnionUnits *
// numberOfBucketsPerUnionUnit; T currentLowerBound = *(uint32_t*)penultimateBucket->start_value.data(); T
// currentUpperBound = *(uint32_t*)columnStatistics.get_last_bucket_end_endp().data();
// bounds.push_back({currentLowerBound, currentUpperBound});
for (auto& bound : bounds)
{
std::cout << "Bound: " << bound.first << " " << bound.second << std::endl;
@@ -247,14 +260,19 @@ execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable(
// SC type controls an integral type used to produce suitable filters. The continuation of this function
// should become a template function based on SC type.
execplan::SimpleColumn* keyColumn = findSuitableKeyColumn(csep, table, ctx);
if (!keyColumn)
auto keyColumnAndStatistics = chooseKeyColumnAndStatistics(table, ctx);
if (!keyColumnAndStatistics)
{
return unionVec;
}
auto& [keyColumn, columnStatistics] = keyColumnAndStatistics.value();
std::cout << "makeUnionFromTable keyColumn " << keyColumn.toString() << std::endl;
std::cout << "makeUnionFromTable RC front " << csep.returnedCols().front()->toString() << std::endl;
// TODO char and other numerical types support
auto boundsOpt = populateRangeBounds<uint64_t>(keyColumn, ctx);
auto boundsOpt = populateRangeBounds<uint64_t>(columnStatistics);
if (!boundsOpt.has_value())
{
return unionVec;
@@ -269,15 +287,24 @@ execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable(
{
auto clonedCSEP = csep.cloneForTableWORecursiveSelects(table);
// Add BETWEEN based on key column range
clonedCSEP->filters(filtersWithNewRange(clonedCSEP, *keyColumn, bounds[i], false));
auto filter = filtersWithNewRange(clonedCSEP, keyColumn, bounds[i], false);
clonedCSEP->filters(filter);
// To create CES filter we need to have a column in the column map
clonedCSEP->columnMap().insert({keyColumn.columnName(), execplan::SRCP(keyColumn.clone())});
unionVec.push_back(clonedCSEP);
}
}
// This last bound produces low <= col <= high
auto clonedCSEP = csep.cloneForTableWORecursiveSelects(table);
clonedCSEP->filters(filtersWithNewRange(clonedCSEP, *keyColumn, bounds.back(), true));
unionVec.push_back(clonedCSEP);
// TODO add NULLs into filter of the last step
if (!bounds.empty())
{
auto clonedCSEP = csep.cloneForTableWORecursiveSelects(table);
auto filter = filtersWithNewRange(clonedCSEP, keyColumn, bounds.back(), true);
clonedCSEP->columnMap().insert({keyColumn.columnName(), execplan::SRCP(keyColumn.clone())});
clonedCSEP->filters(filter);
unionVec.push_back(clonedCSEP);
}
return unionVec;
}
bool applyParallelCES(execplan::CalpontSelectExecutionPlan& csep, RBOptimizerContext& ctx)