diff --git a/dbcon/mysql/ha_mcs_impl.cpp b/dbcon/mysql/ha_mcs_impl.cpp index e2b47e997..a86793a45 100644 --- a/dbcon/mysql/ha_mcs_impl.cpp +++ b/dbcon/mysql/ha_mcs_impl.cpp @@ -333,6 +333,7 @@ int fetchNextRow(uchar* buf, cal_table_info& ti, cal_connection_info* ci, long t if (ti.tpl_scan_ctx->rowsreturned == 0 && (ti.tpl_scan_ctx->traceFlags & execplan::CalpontSelectExecutionPlan::TRACE_TUPLE_OFF)) { + std::cout << "rowGroup->toString() " << rowGroup->toString() << std::endl; for (uint32_t i = 0; i < rowGroup->getColumnCount(); i++) { int oid = rowGroup->getOIDs()[i]; diff --git a/dbcon/mysql/ha_mcs_impl_if.h b/dbcon/mysql/ha_mcs_impl_if.h index d2ca6ae30..ab99765e5 100644 --- a/dbcon/mysql/ha_mcs_impl_if.h +++ b/dbcon/mysql/ha_mcs_impl_if.h @@ -122,8 +122,9 @@ typedef std::tr1::unordered_map TableOuterJoinMap; struct ColumnStatistics { - ColumnStatistics(execplan::SimpleColumn& column, std::vector histograms) - : column(column), histograms(histograms) + ColumnStatistics(execplan::SimpleColumn& column, std::vector histograms, + Field* minValue, Field* maxValue) + : column(column), histograms(histograms), minValue(minValue), maxValue(maxValue) { } ColumnStatistics() = default; @@ -133,15 +134,33 @@ struct ColumnStatistics return histograms; } + const Histogram_json_hb* getHistogram() const + { + if (histograms.empty()) + return nullptr; + return histograms.front(); + } + execplan::SimpleColumn& getColumn() { return column; } + std::optional getIntMinValue() const + { + return (minValue) ? std::optional(minValue->val_int()) : std::nullopt; + } + + std::optional getIntMaxValue() const + { + return (maxValue) ? std::optional(maxValue->val_int()) : std::nullopt; + } + + private: execplan::SimpleColumn column; std::vector histograms; - Field* min{nullptr}; - Field* max{nullptr}; + Field* minValue{nullptr}; + Field* maxValue{nullptr}; }; using ColumnName = std::string; @@ -162,14 +181,23 @@ struct TableStatistics auto tableStatisticsIt = tableStatistics_.find(tableName); if (tableStatisticsIt == tableStatistics_.end()) { - tableStatistics_[tableName][fieldName] = {sc, {histogram}}; + if (histogram) + { + tableStatistics_[tableName][fieldName] = { + sc, {histogram}, statistics->min_value, statistics->max_value}; + } + else + { + tableStatistics_[tableName][fieldName] = {sc, {}, statistics->min_value, statistics->max_value}; + } } else { auto columnStatisticsMapIt = tableStatisticsIt->second.find(fieldName); if (columnStatisticsMapIt == tableStatisticsIt->second.end()) { - tableStatisticsIt->second[fieldName] = {sc, {histogram}}; + tableStatisticsIt->second[fieldName] = { + sc, {histogram}, statistics->min_value, statistics->max_value}; } else { @@ -202,6 +230,8 @@ struct TableStatistics } else { + // Note: This algo overwrites histograms but shouldn't be a problem b/c + // statistics can't change. for (auto& [columnName, histogram] : aColumnStatisticsMap) { tableStatisticsIt->second[columnName] = histogram; @@ -210,6 +240,7 @@ struct TableStatistics } } + private: TableStatisticsMap tableStatistics_; }; diff --git a/dbcon/rbo/rbo_apply_parallel_ces.cpp b/dbcon/rbo/rbo_apply_parallel_ces.cpp index e7c8c4037..8568b2508 100644 --- a/dbcon/rbo/rbo_apply_parallel_ces.cpp +++ b/dbcon/rbo/rbo_apply_parallel_ces.cpp @@ -216,15 +216,9 @@ execplan::SimpleColumn* findSuitableKeyColumn(execplan::CalpontSelectExecutionPl return nullptr; } -// TBD -Histogram_json_hb* chooseStatisticsToUse(std::vector& columnStatisticsVec) -{ - return columnStatisticsVec.front(); -} - // Looking for a projected column that comes first in an available index and has EI statistics // INV nullptr signifies that no suitable column was found -std::optional> chooseKeyColumnAndStatistics( +cal_impl_if::ColumnStatistics* chooseKeyColumnAndStatistics( execplan::CalpontSystemCatalog::TableAliasName& targetTable, optimizer::RBOptimizerContext& ctx) { cal_impl_if::SchemaAndTableName schemaAndTableName = {targetTable.schema, targetTable.table}; @@ -232,19 +226,17 @@ std::optional> chooseKeyC auto tableColumnsStatistics = ctx.getGwi().tableStatistics.findStatisticsForATable(schemaAndTableName); if (!tableColumnsStatistics) { - return std::nullopt; + return nullptr; } - // TODO take some column and some stats for it!!! + // TODO this algo now returns the first column and stats + // for it but it should consider all column available for (auto& [columnName, columnStatistics] : tableColumnsStatistics.value()) { - auto& sc = columnStatistics.getColumn(); - auto& columnStatisticsVec = columnStatistics.getHistograms(); - auto* bestColumnStatistics = chooseStatisticsToUse(columnStatisticsVec); - return {{sc, bestColumnStatistics}}; + return &columnStatistics; } - return std::nullopt; + return nullptr; } } // namespace details @@ -257,37 +249,32 @@ bool parallelCESFilter(execplan::CalpontSelectExecutionPlan& csep, optimizer::RB return someAreForeignTables(csep) && someForeignTablesHasStatisticsAndMbIndex(csep, ctx); } -// Populates range bounds based on column statistics -// Returns optional with bounds if successful, nullopt otherwise +uint64_t decodeU64(const std::string& bytes) +{ + uint64_t v = 0; + const size_t n = std::min(bytes.size(), sizeof(uint64_t)); + if (n) + std::memcpy(&v, bytes.data(), n); + return v; +} + template -std::optional> populateRangeBounds(Histogram_json_hb* columnStatistics, - size_t& maxParallelFactor) +std::optional> populateRangeBoundsFromHistogram( + cal_impl_if::ColumnStatistics& columnStatistics, size_t maxParallelFactor) { details::FilterRangeBounds bounds; - - // Guard: empty histogram - if (!columnStatistics || columnStatistics->get_json_histogram().empty()) - return std::nullopt; - - auto decodeU64 = [](const std::string& bytes) -> uint64_t - { - uint64_t v = 0; - const size_t n = std::min(bytes.size(), sizeof(uint64_t)); - if (n) - std::memcpy(&v, bytes.data(), n); - return v; - }; + auto* histogram = columnStatistics.getHistogram(); // Get parallel factor from context - size_t numberOfUnionUnits = std::min(columnStatistics->get_json_histogram().size(), maxParallelFactor); - size_t numberOfBucketsPerUnionUnit = columnStatistics->get_json_histogram().size() / numberOfUnionUnits; + size_t numberOfUnionUnits = std::min(histogram->get_json_histogram().size(), maxParallelFactor); + size_t numberOfBucketsPerUnionUnit = histogram->get_json_histogram().size() / numberOfUnionUnits; // Loop over buckets to produce filter ranges // NB Currently Histogram_json_hb has the last bucket that has end as its start for (size_t i = 0; i < numberOfUnionUnits - 1; ++i) { - auto bucket = columnStatistics->get_json_histogram().begin() + i * numberOfBucketsPerUnionUnit; - auto endBucket = columnStatistics->get_json_histogram().begin() + (i + 1) * numberOfBucketsPerUnionUnit; + auto bucket = histogram->get_json_histogram().begin() + i * numberOfBucketsPerUnionUnit; + auto endBucket = histogram->get_json_histogram().begin() + (i + 1) * numberOfBucketsPerUnionUnit; T currentLowerBound = static_cast(decodeU64(bucket->start_value)); T currentUpperBound = static_cast(decodeU64(endBucket->start_value)); bounds.push_back({currentLowerBound, currentUpperBound}); @@ -297,15 +284,15 @@ std::optional> populateRangeBounds(Histogram_json_ if (numberOfUnionUnits >= 1) { auto lastChunkIndex = (numberOfUnionUnits - 1) * numberOfBucketsPerUnionUnit; - if (lastChunkIndex < columnStatistics->get_json_histogram().size()) + if (lastChunkIndex < histogram->get_json_histogram().size()) { - auto lastStartBucket = columnStatistics->get_json_histogram().begin() + lastChunkIndex; + auto lastStartBucket = histogram->get_json_histogram().begin() + lastChunkIndex; T finalLowerBound = static_cast(decodeU64(lastStartBucket->start_value)); T finalUpperBound = std::numeric_limits::max(); - if (!columnStatistics->get_last_bucket_end_endp().empty()) + if (!histogram->get_last_bucket_end_endp().empty()) { - finalUpperBound = static_cast(decodeU64(columnStatistics->get_last_bucket_end_endp())); + finalUpperBound = static_cast(decodeU64(histogram->get_last_bucket_end_endp())); } bounds.push_back({finalLowerBound, finalUpperBound}); } @@ -320,6 +307,56 @@ std::optional> populateRangeBounds(Histogram_json_ return bounds; } +template +std::optional> populateRangeBoundsFromEquallyDistributedRange( + cal_impl_if::ColumnStatistics& columnStatistics, size_t maxParallelFactor) +{ + auto minValue = columnStatistics.getIntMinValue().value(); + auto maxValue = columnStatistics.getIntMaxValue().value(); + + assert(maxValue >= minValue); + auto distance = maxValue - minValue; + auto step = distance / maxParallelFactor; + + details::FilterRangeBounds bounds; + for (size_t i = 0; i < maxParallelFactor; ++i) + { + bounds.push_back({minValue + i * step, minValue + (i + 1) * step}); + } + + if (!bounds.empty()) + { + bounds.front().first = std::numeric_limits::lowest(); + bounds.back().second = maxValue; + } + + return bounds; +} + +// Populates range bounds based on column statistics +// Returns optional with bounds if successful, nullopt otherwise +template +std::optional> populateRangeBounds( + cal_impl_if::ColumnStatistics& columnStatistics, size_t& maxParallelFactor) +{ + auto* histogram = columnStatistics.getHistogram(); + + // Guard: empty histogram or no min/max values + if (histogram && histogram->get_json_histogram().empty()) + { + return populateRangeBoundsFromHistogram(columnStatistics, maxParallelFactor); + } + + auto minValue = columnStatistics.getIntMinValue(); + auto maxValue = columnStatistics.getIntMaxValue(); + if (minValue && maxValue) + { + return populateRangeBoundsFromEquallyDistributedRange(columnStatistics, maxParallelFactor); + } + + return std::nullopt; +} + // TODO char and other numerical types support execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable( execplan::CalpontSelectExecutionPlan& csep, execplan::CalpontSystemCatalog::TableAliasName& table, @@ -329,15 +366,16 @@ execplan::CalpontSelectExecutionPlan::SelectList makeUnionFromTable( // SC type controls an integral type used to produce suitable filters. The continuation of this function // should become a template function based on SC type. - auto keyColumnAndStatistics = chooseKeyColumnAndStatistics(table, ctx); - if (!keyColumnAndStatistics) + auto columnStatisticsPtr = chooseKeyColumnAndStatistics(table, ctx); + if (!columnStatisticsPtr) { return unionVec; } - auto& [keyColumn, columnStatistics] = keyColumnAndStatistics.value(); + auto& columnStatistics = *columnStatisticsPtr; + auto& keyColumn = columnStatistics.getColumn(); - std::cout << "makeUnionFromTable keyColumn " << keyColumn.toString() << std::endl; + // std::cout << "makeUnionFromTable keyColumn " << keyColumn.toString() << std::endl; std::cout << "makeUnionFromTable RC front " << csep.returnedCols().front()->toString() << std::endl; // TODO char and other numerical types support diff --git a/dbcon/rbo/rbo_apply_parallel_ces.h b/dbcon/rbo/rbo_apply_parallel_ces.h index dee6ea173..6d2223b59 100644 --- a/dbcon/rbo/rbo_apply_parallel_ces.h +++ b/dbcon/rbo/rbo_apply_parallel_ces.h @@ -91,11 +91,6 @@ execplan::SimpleColumn* findSuitableKeyColumn(execplan::CalpontSelectExecutionPl execplan::CalpontSystemCatalog::TableAliasName& targetTable, optimizer::RBOptimizerContext& ctx); -std::optional> chooseKeyColumnAndStatistics( - execplan::CalpontSystemCatalog::TableAliasName& targetTable, optimizer::RBOptimizerContext& ctx); - -Histogram_json_hb* chooseStatisticsToUse(const std::vector& statisticsVec); - } // namespace details // Main functions