1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-10-31 18:30:33 +03:00

feat(optimizer): MCOL-5250 rewrite queries with DISTINCT (#3666)

* feat(optimizer): MCOL-5250 rewrite queries with DISTINCT

... as aggregated queries.
So query
```
SELECT DISTINCT <cols list>
FROM <from list>
WHERE <where clause>
HAVING <having clause>
ORDER BY <orderby list>
LIMIT <limit>
```
will become
```
SELECT *
FROM
  (
    SELECT <cols list>
    FROM <from list>
    WHERE <where clause>
    HAVING <having clause>
  ) a
GROUP BY 1,2,3,...,N
ORDER BY <orderby list>
LIMIT <limit>
```

* move ORDER BY to the outer query

* fix test

* reuse cloneWORecursiveSelects() in clone()

* fix subselect columns processing
This commit is contained in:
Alexey Antipovsky
2025-09-22 14:16:37 +02:00
committed by GitHub
parent 736ec81e4d
commit cfa9a7ff2c
13 changed files with 333 additions and 22 deletions

View File

@@ -1197,4 +1197,49 @@ execplan::SCSEP CalpontSelectExecutionPlan::cloneForTableWORecursiveSelectsGbObH
return newPlan; return newPlan;
} }
/// Produce a copy of this plan in which every nested plan is cloned recursively.
///
/// The copy starts from cloneWORecursiveSelects(). Each container of nested plans
/// (fSubSelects, fDerivedTableList, fUnionVec, fSelectSubList, fSubSelectList) is then
/// emptied in the copy and refilled with clone() results of this plan's entries, so the
/// copy ends up holding exactly one freshly cloned entry per original nested plan.
///
/// @return the newly built plan.
/// @throws whatever idbassert_s raises when a nested entry is not a
///         CalpontSelectExecutionPlan.
SCSEP CalpontSelectExecutionPlan::clone()
{
  auto newPlan = cloneWORecursiveSelects();

  // Reset fSubSelects itself before refilling it: clearing any other list here would let
  // entries already present in the copy survive next to the recursively cloned ones.
  newPlan->fSubSelects.clear();
  for (const auto& subPlan : fSubSelects)
  {
    auto* subCSEP = dynamic_cast<CalpontSelectExecutionPlan*>(subPlan.get());
    idbassert_s(subCSEP != nullptr, "subPlan is not a CalpontSelectExecutionPlan");
    newPlan->fSubSelects.push_back(subCSEP->clone());
  }

  newPlan->fDerivedTableList.clear();
  for (const auto& drvTable : fDerivedTableList)
  {
    auto* drvCSEP = dynamic_cast<CalpontSelectExecutionPlan*>(drvTable.get());
    idbassert_s(drvCSEP != nullptr, "derivedTable is not a CalpontSelectExecutionPlan");
    newPlan->fDerivedTableList.push_back(drvCSEP->clone());
  }

  newPlan->fUnionVec.clear();
  for (const auto& subPlan : fUnionVec)
  {
    auto* subCSEP = dynamic_cast<CalpontSelectExecutionPlan*>(subPlan.get());
    idbassert_s(subCSEP != nullptr, "unionVec is not a CalpontSelectExecutionPlan");
    newPlan->fUnionVec.push_back(subCSEP->clone());
  }

  newPlan->fSelectSubList.clear();
  for (const auto& subPlan : fSelectSubList)
  {
    auto* subCSEP = dynamic_cast<CalpontSelectExecutionPlan*>(subPlan.get());
    idbassert_s(subCSEP != nullptr, "subPlan is not a CalpontSelectExecutionPlan");
    newPlan->fSelectSubList.push_back(subCSEP->clone());
  }

  // Entries of fSubSelectList expose clone() directly, so no downcast is needed here.
  newPlan->fSubSelectList.clear();
  for (const auto& subPlan : fSubSelectList)
  {
    newPlan->fSubSelectList.push_back(subPlan->clone());
  }

  return newPlan;
}
} // namespace execplan } // namespace execplan

View File

@@ -168,6 +168,7 @@ class CalpontSelectExecutionPlan : public CalpontExecutionPlan
execplan::SCSEP cloneForTableWORecursiveSelectsGbObHaving( execplan::SCSEP cloneForTableWORecursiveSelectsGbObHaving(
const execplan::CalpontSystemCatalog::TableAliasName& targetTableAlias, const bool withFilters = true); const execplan::CalpontSystemCatalog::TableAliasName& targetTableAlias, const bool withFilters = true);
SCSEP clone();
/** /**
* Access and mutator methods * Access and mutator methods
*/ */
@@ -495,7 +496,7 @@ class CalpontSelectExecutionPlan : public CalpontExecutionPlan
{ {
return fDerivedTableList; return fDerivedTableList;
} }
void derivedTableList(SelectList& derivedTableList) void derivedTableList(const SelectList& derivedTableList)
{ {
fDerivedTableList = derivedTableList; fDerivedTableList = derivedTableList;
} }

View File

@@ -100,6 +100,7 @@ ReturnedColumn::ReturnedColumn(const ReturnedColumn& rhs, const uint32_t session
, fSessionID(sessionID) , fSessionID(sessionID)
, fSequence(rhs.fSequence) , fSequence(rhs.fSequence)
, fCardinality(rhs.fCardinality) , fCardinality(rhs.fCardinality)
, fAlias(rhs.alias())
, fDistinct(rhs.fDistinct) , fDistinct(rhs.fDistinct)
, fJoinInfo(rhs.fJoinInfo) , fJoinInfo(rhs.fJoinInfo)
, fAsc(rhs.fAsc) , fAsc(rhs.fAsc)

View File

@@ -131,7 +131,8 @@ void getSimpleColsExtended(execplan::ParseTree* n, void* obj)
else if (selectFilter) else if (selectFilter)
{ {
selectFilter->setSimpleColumnListExtended(); selectFilter->setSimpleColumnListExtended();
list->insert(list->end(), selectFilter->simpleColumnListExtended().begin(), selectFilter->simpleColumnListExtended().end()); list->insert(list->end(), selectFilter->simpleColumnListExtended().begin(),
selectFilter->simpleColumnListExtended().end());
} }
else if (cf) else if (cf)
{ {
@@ -861,4 +862,37 @@ std::optional<CalpontSystemCatalog::TableAliasName> sameTableCheck(
return tan; return tan;
} }
std::string getSimpleColumnAlias(const ReturnedColumn& origCol, int64_t colPos)
{
std::string alias = origCol.alias();
if (alias.empty())
{
if (auto* sc = dynamic_cast<const SimpleColumn*>(&origCol); sc)
{
alias = sc->columnName();
}
else if (auto* fc = dynamic_cast<const FunctionColumn*>(&origCol); fc)
{
alias = fc->functionName();
}
else if (auto* ac = dynamic_cast<const AggregateColumn*>(&origCol); ac)
{
alias = ac->functionName();
}
else if (auto* wc = dynamic_cast<const WindowFunctionColumn*>(&origCol); wc)
{
alias = wc->functionName();
}
}
if (alias.empty())
{
alias = "`$col_" + std::to_string(colPos) + "`";
}
if (alias[0] != '`')
{
alias = "`" + alias + "`";
}
return alias;
}
} // namespace execplan } // namespace execplan

View File

@@ -413,4 +413,8 @@ ParseTree* replaceRefCol(ParseTree*& n, CalpontSelectExecutionPlan::ReturnedColu
std::optional<CalpontSystemCatalog::TableAliasName> sameTableCheck( std::optional<CalpontSystemCatalog::TableAliasName> sameTableCheck(
std::vector<SimpleColumn*> simpleColumnList); std::vector<SimpleColumn*> simpleColumnList);
/// utility function for constructing a reasonable alias for a SimpleColumn copy, based on the alias/column
/// name/function name of the original column
std::string getSimpleColumnAlias(const ReturnedColumn& origCol, int64_t colPos);
} // namespace execplan } // namespace execplan

View File

@@ -1,4 +1,8 @@
set(rbo_SRCS rulebased_optimizer.cpp rbo_apply_parallel_ces.cpp rbo_predicate_pushdown.cpp) set(rbo_SRCS
rulebased_optimizer.cpp
rbo_apply_parallel_ces.cpp
rbo_apply_rewrite_distinct.cpp
rbo_predicate_pushdown.cpp)
columnstore_library(rbo ${rbo_SRCS}) columnstore_library(rbo ${rbo_SRCS})

View File

@@ -46,8 +46,6 @@ using ExtraSRRC = std::vector<std::unique_ptr<execplan::SimpleColumn>>;
using SCAndItsProjectionPosition = std::pair<execplan::SimpleColumn*, uint32_t>; using SCAndItsProjectionPosition = std::pair<execplan::SimpleColumn*, uint32_t>;
using SCsAndTheirProjectionPositions = std::vector<SCAndItsProjectionPosition>; using SCsAndTheirProjectionPositions = std::vector<SCAndItsProjectionPosition>;
static const std::string RewrittenSubTableAliasPrefix = "$added_sub_";
namespace details namespace details
{ {
@@ -562,8 +560,7 @@ bool applyParallelCES(execplan::CalpontSelectExecutionPlan& csep, optimizer::RBO
auto anyColumnStatistics = ctx.getGwi().findStatisticsForATable(schemaAndTableName); auto anyColumnStatistics = ctx.getGwi().findStatisticsForATable(schemaAndTableName);
if (!table.isColumnstore() && anyColumnStatistics) if (!table.isColumnstore() && anyColumnStatistics)
{ {
std::string tableAlias = optimizer::RewrittenSubTableAliasPrefix + table.schema + "_" + table.table + std::string tableAlias = getRewrittenSubTableAlias(table, ctx);
"_" + std::to_string(ctx.getUniqueId());
tableAliasToSCPositionsMap.insert({table, {tableAlias, {}, 0}}); tableAliasToSCPositionsMap.insert({table, {tableAlias, {}, 0}});
execplan::CalpontSystemCatalog::TableAliasName tn = execplan::make_aliasview("", "", tableAlias, ""); execplan::CalpontSystemCatalog::TableAliasName tn = execplan::make_aliasview("", "", tableAlias, "");
newTableList.push_back(tn); newTableList.push_back(tn);

View File

@@ -0,0 +1,180 @@
/* Copyright (C) 2025 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
#include "rulebased_optimizer.h"
#include "calpontselectexecutionplan.h"
#include "aggregatecolumn.h"
#include "simplecolumn.h"
#include "existsfilter.h"
#include "functioncolumn.h"
#include "logicoperator.h"
namespace optimizer
{
/// Tell whether the DISTINCT rewrite applies to `csep`: the plan must be flagged as
/// distinct and must reference at least one table. The optimizer context is not used.
bool rewriteDistinctFilter(execplan::CalpontSelectExecutionPlan& csep, RBOptimizerContext& /*ctx*/)
{
  if (!csep.distinct())
  {
    return false;
  }

  return !csep.tableList().empty();
}
/// Build a SimpleColumn that stands for `rc` as seen through the derived table
/// `tableAlias`.
///
/// The new column is constructed from `rc`, then re-pointed at `tableAlias` (table name,
/// table alias and derived table), given `colPos` as its column position and `rc` as its
/// derived reference column. Its column name comes from getSimpleColumnAlias() and its
/// alias is that name prefixed with the quoted table alias. The reference count of `rc`
/// is incremented.
///
/// @param rc         column to mirror.
/// @param tableAlias alias of the derived table the new column refers to.
/// @param colPos     column position assigned to the new column.
/// @return the newly created column.
execplan::SRCP cloneAsSimpleColumn(const execplan::SRCP& rc, const std::string& tableAlias, int64_t colPos)
{
  auto* const source = rc.get();
  auto outerCol = boost::make_shared<execplan::SimpleColumn>(*rc);

  // SimpleColumn attributes
  outerCol->schemaName("");
  outerCol->tableName(tableAlias);
  outerCol->oid(0);
  outerCol->tableAlias(tableAlias);
  outerCol->data("");

  // ReturnedColumn attributes
  outerCol->charsetNumber(source->charsetNumber());

  // TreeNode attributes
  outerCol->derivedTable(tableAlias);
  outerCol->derivedRefCol(source);
  outerCol->resultType(source->resultType());
  outerCol->operationType(source->operationType());
  outerCol->colPosition(colPos);

  // Carry the time zone over from whichever concrete column type exposes one.
  if (const auto* asSimple = dynamic_cast<execplan::SimpleColumn*>(source))
  {
    outerCol->timeZone(asSimple->timeZone());
  }
  else if (const auto* asFunction = dynamic_cast<execplan::FunctionColumn*>(source))
  {
    outerCol->timeZone(asFunction->timeZone());
  }
  else if (const auto* asAggregate = dynamic_cast<execplan::AggregateColumn*>(source))
  {
    outerCol->timeZone(asAggregate->timeZone());
  }
  else if (const auto* asWindow = dynamic_cast<execplan::WindowFunctionColumn*>(source))
  {
    outerCol->timeZone(asWindow->timeZone());
  }

  source->incRefCount();

  auto quotedName = getSimpleColumnAlias(*source, colPos);
  outerCol->columnName(quotedName);
  outerCol->alias("`" + tableAlias + "`." + quotedName);
  outerCol->colSource(0);

  return outerCol;
}
/// Rewrite a DISTINCT plan in place as a GROUP BY over a derived table.
///
/// A clone of `csep` becomes the only derived table of `csep`. `csep` itself is stripped
/// of its filters, HAVING, nested selects and unions, and gets one returned column and one
/// GROUP BY column per returned column of the clone. ORDER BY items are re-created on
/// `csep` and removed from the clone; the DISTINCT flag is cleared on both.
///
/// @param csep plan to rewrite; it must reference at least one table because the first
///             table is used to build the derived table alias.
/// @param ctx  optimizer context, passed to getRewrittenSubTableAlias().
/// @return always true.
bool applyRewriteDistinct(execplan::CalpontSelectExecutionPlan& csep, RBOptimizerContext& ctx)
{
// The clone keeps the original query; it is turned into a FROM-clause subquery below.
auto origCSEP = csep.clone();
auto tableAlias = getRewrittenSubTableAlias(csep.tableList()[0], ctx);
origCSEP->location(execplan::CalpontSelectExecutionPlan::FROM);
origCSEP->subType(execplan::CalpontSelectExecutionPlan::FROM_SUBS);
origCSEP->derivedTbAlias(tableAlias);
// Nested selects and unions now live in the clone only.
csep.subSelectList({});
csep.subSelects({});
csep.selectSubList({});
csep.unionVec({});
// The outer plan reads from a single table: the derived table built from the clone.
execplan::CalpontSelectExecutionPlan::TableList tblList;
tblList.push_back(execplan::make_aliasview("", "", tableAlias, ""));
csep.tableList(tblList);
execplan::CalpontSelectExecutionPlan::SelectList derivedTblList;
derivedTblList.emplace_back(origCSEP);
csep.derivedTableList(derivedTblList);
// Filtering stays in the clone; the outer plan only groups.
csep.distinct(false);
csep.filters(nullptr);
csep.having(nullptr);
csep.returnedCols({});
csep.groupByCols({});
// Project and group by every column returned by the clone, using separate column objects
// for the two lists. After this loop colPos equals the number of originally returned columns.
int64_t colPos = 0;
for (const auto& rc : origCSEP->returnedCols())
{
auto rcCloned = cloneAsSimpleColumn(rc, tableAlias, colPos);
csep.returnedCols().emplace_back(rcCloned);
auto grpByCloned = cloneAsSimpleColumn(rc, tableAlias, colPos);
grpByCloned->orderPos(colPos);
csep.groupByCols().emplace_back(grpByCloned);
++colPos;
}
// Rebuild ORDER BY on the outer plan from the clone's ORDER BY items.
csep.orderByCols({});
// Number of ORDER BY columns appended to the clone's returned columns so far.
int64_t orderByColPos = 0;
for (const auto& obc : origCSEP->orderByCols())
{
bool found = false;
int64_t retColPos = 0;
for (const auto& rc : origCSEP->returnedCols())
{
if (*obc == *rc)
{
// The ORDER BY column is already returned by the clone.
found = true;
execplan::SRCP outerRC;
if (retColPos < colPos)
{
// Matched one of the originally returned columns.
outerRC = csep.returnedCols()[retColPos];
}
else
{
// Matched a column appended to the clone by an earlier iteration of the outer loop.
// NOTE(review): csep.orderByCols() gets one entry per ORDER BY item (matched or not),
// while retColPos - colPos counts only the columns appended to the clone; these two
// indexes look like they can differ when matched and unmatched items are mixed - confirm.
outerRC = csep.orderByCols()[retColPos - colPos];
}
auto obcCloned = cloneAsSimpleColumn(outerRC, tableAlias, retColPos);
obcCloned->asc(obc->asc());
obcCloned->nullsFirst(obc->nullsFirst());
csep.orderByCols().emplace_back(obcCloned);
break;
}
++retColPos;
}
if (found)
{
continue;
}
// The ORDER BY column is not returned by the clone yet, so append it to the clone's returned columns.
auto rc = boost::shared_ptr<execplan::ReturnedColumn>(obc->clone());
origCSEP->returnedCols().emplace_back(rc);
auto rcCloned = cloneAsSimpleColumn(rc, tableAlias, colPos + orderByColPos);
// This ORDER BY column is not one of the GROUP BY columns, so it is wrapped in an
// aggregate (SELECT_SOME) on the outer plan.
auto* aggCol = new execplan::AggregateColumn();
auto obcCloned = boost::shared_ptr<execplan::ReturnedColumn>(aggCol);
aggCol->asc(obc->asc());
aggCol->nullsFirst(obc->nullsFirst());
aggCol->aggOp(execplan::AggregateColumn::SELECT_SOME);
aggCol->aggParms().emplace_back(rcCloned);
csep.orderByCols().emplace_back(obcCloned);
++orderByColPos;
}
// Ordering and de-duplication are now done by the outer plan only.
origCSEP->orderByCols().clear();
origCSEP->distinct(false);
return true;
}
} // namespace optimizer

View File

@@ -0,0 +1,31 @@
/* Copyright (C) 2025 MariaDB Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */
#pragma once
#define PREFER_MY_CONFIG_H
#include <my_config.h>
#include "../mysql/idb_mysql.h"
#include "execplan/calpontselectexecutionplan.h"
#include "rulebased_optimizer.h"
namespace optimizer
{
/// Rule filter for the DISTINCT rewrite: true when `csep` is a DISTINCT plan that
/// references at least one table.
bool rewriteDistinctFilter(execplan::CalpontSelectExecutionPlan& csep, RBOptimizerContext& ctx);
/// Rule action for the DISTINCT rewrite: rewrites `csep` in place as a GROUP BY over a
/// derived table holding a clone of the original plan.
bool applyRewriteDistinct(execplan::CalpontSelectExecutionPlan& csep, RBOptimizerContext& ctx);
}  // namespace optimizer

View File

@@ -15,29 +15,31 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */ MA 02110-1301, USA. */
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include "rulebased_optimizer.h" #include "rulebased_optimizer.h"
#include "configcpp.h" #include "configcpp.h"
#include "constantcolumn.h" #include "constantcolumn.h"
#include "execplan/calpontselectexecutionplan.h" #include "execplan/calpontselectexecutionplan.h"
#include "execplan/simplecolumn.h"
#include "existsfilter.h"
#include "logicoperator.h"
#include "operator.h"
#include "predicateoperator.h" #include "predicateoperator.h"
#include "simplefilter.h"
#include "rbo_apply_parallel_ces.h" #include "rbo_apply_parallel_ces.h"
#include "rbo_predicate_pushdown.h" #include "rbo_predicate_pushdown.h"
#include "rbo_apply_rewrite_distinct.h"
#include "utils/pron/pron.h" #include "utils/pron/pron.h"
#include "calpontsystemcatalog.h"
#include "functioncolumn.h"
namespace optimizer namespace optimizer
{ {
/// Compose the alias given to a subquery that the optimizer adds for `table`:
/// "$added_sub_<schema>_<table>_<id>", where <id> is ctx.getUniqueId().
std::string getRewrittenSubTableAlias(const execplan::CalpontSystemCatalog::TableAliasName& table,
                                      const RBOptimizerContext& ctx)
{
  std::string alias{"$added_sub_"};
  alias += table.schema;
  alias += "_";
  alias += table.table;
  alias += "_";
  alias += std::to_string(ctx.getUniqueId());
  return alias;
}
// Apply a list of rules to a CSEP // Apply a list of rules to a CSEP
bool optimizeCSEPWithRules(execplan::CalpontSelectExecutionPlan& root, const std::vector<Rule>& rules, bool optimizeCSEPWithRules(execplan::CalpontSelectExecutionPlan& root, const std::vector<Rule>& rules,
optimizer::RBOptimizerContext& ctx) optimizer::RBOptimizerContext& ctx)
@@ -83,6 +85,10 @@ bool optimizeCSEP(execplan::CalpontSelectExecutionPlan& root, optimizer::RBOptim
{ {
optimizer::Rule parallelCES{"parallel_ces", optimizer::parallelCESFilter, optimizer::applyParallelCES}; optimizer::Rule parallelCES{"parallel_ces", optimizer::parallelCESFilter, optimizer::applyParallelCES};
rules.push_back(parallelCES); rules.push_back(parallelCES);
optimizer::Rule rewriteDistinct{"rewrite_distinct", optimizer::rewriteDistinctFilter,
optimizer::applyRewriteDistinct};
rules.push_back(rewriteDistinct);
} }
optimizer::Rule predicatePushdown{"predicate_pushdown", optimizer::predicatePushdownFilter, optimizer::Rule predicatePushdown{"predicate_pushdown", optimizer::predicatePushdownFilter,

View File

@@ -27,6 +27,7 @@
#include <dbcon/mysql/ha_mcs_impl_if.h> #include <dbcon/mysql/ha_mcs_impl_if.h>
#include "execplan/calpontselectexecutionplan.h" #include "execplan/calpontselectexecutionplan.h"
#include "execplan/calpontsystemcatalog.h"
namespace optimizer namespace optimizer
{ {
@@ -141,4 +142,7 @@ struct Rule
bool optimizeCSEP(execplan::CalpontSelectExecutionPlan& root, RBOptimizerContext& ctx, bool optimizeCSEP(execplan::CalpontSelectExecutionPlan& root, RBOptimizerContext& ctx,
bool useUnstableOptimizer); bool useUnstableOptimizer);
} // namespace optimizer std::string getRewrittenSubTableAlias(const execplan::CalpontSystemCatalog::TableAliasName& table,
const RBOptimizerContext& ctx);
}

View File

@@ -11,10 +11,10 @@ COUNT(DISTINCT col2)
5 5
SELECT DISTINCT col1 FROM t1; SELECT DISTINCT col1 FROM t1;
col1 col1
NULL
1 1
2 2
3 3
NULL
SELECT DISTINCT col1 FROM t1 ORDER BY col1 DESC; SELECT DISTINCT col1 FROM t1 ORDER BY col1 DESC;
col1 col1
3 3
@@ -33,10 +33,10 @@ CREATE TABLE t2(col1 INT)ENGINE=Columnstore;
INSERT INTO t2 SELECT DISTINCT col1 FROM t1; INSERT INTO t2 SELECT DISTINCT col1 FROM t1;
SELECT * FROM t2; SELECT * FROM t2;
col1 col1
NULL
1 1
2 2
3 3
NULL
CREATE TABLE t3 (name varchar(255)); CREATE TABLE t3 (name varchar(255));
INSERT INTO t3 VALUES ('aa'),('ab'),('ac'),('ad'),('ae'); INSERT INTO t3 VALUES ('aa'),('ab'),('ac'),('ad'),('ae');
SELECT DISTINCT * FROM t3; SELECT DISTINCT * FROM t3;
@@ -46,7 +46,7 @@ ab
ac ac
ad ad
ae ae
SELECT DISTINCT name FROM t3 LIMIT 2; SELECT DISTINCT name FROM t3 ORDER BY name LIMIT 2;
name name
aa aa
ab ab

View File

@@ -15,18 +15,22 @@ CREATE TABLE t1(col1 INT, col2 CHAR(5))ENGINE=Columnstore;
INSERT INTO t1 VALUES(NULL, NULL),(1,'a'),(1,'b'),(1,'c'),(2,'dd'),(3,'eee'); INSERT INTO t1 VALUES(NULL, NULL),(1,'a'),(1,'b'),(1,'c'),(2,'dd'),(3,'eee');
SELECT COUNT(DISTINCT col1) FROM t1; SELECT COUNT(DISTINCT col1) FROM t1;
SELECT COUNT(DISTINCT col2) FROM t1; SELECT COUNT(DISTINCT col2) FROM t1;
--sorted_result
SELECT DISTINCT col1 FROM t1; SELECT DISTINCT col1 FROM t1;
SELECT DISTINCT col1 FROM t1 ORDER BY col1 DESC; SELECT DISTINCT col1 FROM t1 ORDER BY col1 DESC;
--sorted_result
SELECT DISTINCT col2 FROM t1; SELECT DISTINCT col2 FROM t1;
CREATE TABLE t2(col1 INT)ENGINE=Columnstore; CREATE TABLE t2(col1 INT)ENGINE=Columnstore;
INSERT INTO t2 SELECT DISTINCT col1 FROM t1; INSERT INTO t2 SELECT DISTINCT col1 FROM t1;
--sorted_result
SELECT * FROM t2; SELECT * FROM t2;
CREATE TABLE t3 (name varchar(255)); CREATE TABLE t3 (name varchar(255));
INSERT INTO t3 VALUES ('aa'),('ab'),('ac'),('ad'),('ae'); INSERT INTO t3 VALUES ('aa'),('ab'),('ac'),('ad'),('ae');
--sorted_result
SELECT DISTINCT * FROM t3; SELECT DISTINCT * FROM t3;
SELECT DISTINCT name FROM t3 LIMIT 2; SELECT DISTINCT name FROM t3 ORDER BY name LIMIT 2;
SELECT DISTINCT 1 FROM t3 LIMIT 3; SELECT DISTINCT 1 FROM t3 LIMIT 3;
# Clean UP # Clean UP