1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-29 08:21:15 +03:00

Merge pull request #1828 from tntnatbry/MCOL-4543-4589

MCOL-4543/MCOL-4589 Subquery optimization
This commit is contained in:
Roman Nozdrin
2021-04-14 13:50:46 +03:00
committed by GitHub
5 changed files with 446 additions and 16 deletions

View File

@ -360,6 +360,14 @@ public:
return fColumnMap;
}
/** column map
* Mutable accessor for the map of all the columns that appear in the query.
* Returns a non-const reference to fColumnMap, so callers can modify the
* map in place (for example, erase entries) through the returned reference.
*/
ColumnMap& columnMap()
{
return fColumnMap;
}
/** assign the static fColMap to non-static fColumnMap. map-wise copy */
void columnMap (const ColumnMap& columnMap);

View File

@ -57,7 +57,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep)
for (uint i = 0; i < derivedTbList.size(); i++)
{
CalpontSelectExecutionPlan* plan = dynamic_cast<CalpontSelectExecutionPlan*>(derivedTbList[i].get());
CalpontSelectExecutionPlan* plan = reinterpret_cast<CalpontSelectExecutionPlan*>(derivedTbList[i].get());
CalpontSelectExecutionPlan::ReturnedColumnList cols = plan->returnedCols();
vector<CalpontSelectExecutionPlan::ReturnedColumnList> unionColVec;
@ -73,7 +73,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep)
for (uint j = 0; j < plan->unionVec().size(); j++)
{
unionColVec.push_back(
dynamic_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get())->returnedCols());
reinterpret_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get())->returnedCols());
}
}
@ -82,7 +82,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep)
for (uint j = 0; j < plan->unionVec().size(); j++)
{
if (dynamic_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get())->tableList().empty())
if (reinterpret_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get())->tableList().empty())
{
horizontalOptimization = false;
break;
@ -93,7 +93,29 @@ void derivedTableOptimization(THD* thd, SCSEP& csep)
{
int64_t val = 1;
for (uint i = 0; i < cols.size(); i++)
// TODO MCOL-4543 Only project those columns from the subquery
// which are referenced in the outer select. So for example,
// if a table t contains 10 columns c1 ... c10 :
// "select count(c2) from (select * from t) q;"
// with p being the subquery execution plan, p->columnMap()
// and p->returnedCols() should both be of size 1, instead
// of 10, with entries for c2 in each.
//
// We are currently performing a dumb optimization:
// Instead of just referencing c2, we are referencing (c1,c2)
// for the above query. This is due to the complexity associated
// with modifying ReturnedColumn::colPosition()
// (from a value of 1 to a value of 0) of the outer query
// which references c2. So essentially, if c2 is replaced by c10
// in the above query, we fall back to projecting all 10 columns
// of the subquery in ExeMgr.
// This will be addressed in the future.
CalpontSelectExecutionPlan::ReturnedColumnList nonConstCols;
vector<CalpontSelectExecutionPlan::ReturnedColumnList> nonConstUnionColVec(unionColVec.size());
int64_t lastNonConstIndex = -1;
for (int64_t i = cols.size() - 1; i >= 0; i--)
{
//if (cols[i]->derivedTable().empty())
if (cols[i]->refCount() == 0)
@ -101,22 +123,84 @@ void derivedTableOptimization(THD* thd, SCSEP& csep)
if (cols[i]->derivedRefCol())
cols[i]->derivedRefCol()->decRefCount();
if (lastNonConstIndex == -1)
{
SimpleColumn* sc = dynamic_cast<SimpleColumn*>(cols[i].get());
if (sc && (plan->columnMap().count(sc->columnName()) == 1))
{
plan->columnMap().erase(sc->columnName());
}
}
else
{
cols[i].reset(new ConstantColumn(val));
(dynamic_cast<ConstantColumn*>(cols[i].get()))->timeZone(thd->variables.time_zone->get_name()->ptr());
(reinterpret_cast<ConstantColumn*>(cols[i].get()))->timeZone(thd->variables.time_zone->get_name()->ptr());
}
for (uint j = 0; j < unionColVec.size(); j++)
{
unionColVec[j][i].reset(new ConstantColumn(val));
(dynamic_cast<ConstantColumn*>(unionColVec[j][i].get()))->timeZone(thd->variables.time_zone->get_name()->ptr());
if (lastNonConstIndex == -1)
{
CalpontSelectExecutionPlan* unionSubPlan =
reinterpret_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get());
SimpleColumn* sc = dynamic_cast<SimpleColumn*>(unionSubPlan->returnedCols()[i].get());
if (sc && (unionSubPlan->columnMap().count(sc->columnName()) == 1))
{
unionSubPlan->columnMap().erase(sc->columnName());
}
}
else
{
unionColVec[j][i].reset(new ConstantColumn(val));
(reinterpret_cast<ConstantColumn*>(unionColVec[j][i].get()))->timeZone(thd->variables.time_zone->get_name()->ptr());
}
}
}
else if (lastNonConstIndex == -1)
{
lastNonConstIndex = i;
}
}
if (lastNonConstIndex == -1)
{
// None of the subquery columns are referenced, just use the first one
if (!cols.empty())
{
cols[0].reset(new ConstantColumn(val));
(reinterpret_cast<ConstantColumn*>(cols[0].get()))->timeZone(thd->variables.time_zone->get_name()->ptr());
nonConstCols.push_back(cols[0]);
for (uint j = 0; j < unionColVec.size(); j++)
{
unionColVec[j][0].reset(new ConstantColumn(val));
(reinterpret_cast<ConstantColumn*>(unionColVec[j][0].get()))->timeZone(thd->variables.time_zone->get_name()->ptr());
nonConstUnionColVec[j].push_back(unionColVec[j][0]);
}
}
}
else
{
nonConstCols.assign(cols.begin(), cols.begin() + lastNonConstIndex + 1);
for (uint j = 0; j < unionColVec.size(); j++)
{
nonConstUnionColVec[j].assign(unionColVec[j].begin(), unionColVec[j].begin() + lastNonConstIndex + 1);
}
}
// set back
plan->returnedCols(cols);
plan->returnedCols(nonConstCols);
for (uint j = 0; j < unionColVec.size(); j++)
dynamic_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get())->returnedCols(unionColVec[j]);
{
CalpontSelectExecutionPlan* unionSubPlan =
reinterpret_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get());
unionSubPlan->returnedCols(nonConstUnionColVec[j]);
}
}
}
@ -151,7 +235,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep)
for (uint i = 0; i < derivedTbList.size(); i++)
{
CalpontSelectExecutionPlan* plan = dynamic_cast<CalpontSelectExecutionPlan*>(derivedTbList[i].get());
CalpontSelectExecutionPlan* plan = reinterpret_cast<CalpontSelectExecutionPlan*>(derivedTbList[i].get());
CalpontSelectExecutionPlan::ReturnedColumnList derivedColList = plan->returnedCols();
mapIt = derivedTbFilterMap.find(plan->derivedTbAlias());
@ -181,7 +265,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep)
for (uint j = 0; j < plan->unionVec().size(); j++)
{
CalpontSelectExecutionPlan* unionPlan =
dynamic_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get());
reinterpret_cast<CalpontSelectExecutionPlan*>(plan->unionVec()[j].get());
CalpontSelectExecutionPlan::ReturnedColumnList unionColList = unionPlan->returnedCols();
ParseTree* mainFilterForUnion = new ParseTree();
mainFilterForUnion->copyTree(*(mapIt->second));

View File

@ -2920,15 +2920,46 @@ SimpleColumn* getSmallestColumn(boost::shared_ptr<CalpontSystemCatalog> csc,
if (tan.alias == csep->derivedTbAlias())
{
assert (!csep->returnedCols().empty());
ReturnedColumn* rc = dynamic_cast<ReturnedColumn*>(csep->returnedCols()[0].get());
const CalpontSelectExecutionPlan::ReturnedColumnList& cols = csep->returnedCols();
CalpontSelectExecutionPlan::ReturnedColumnList::const_iterator iter;
ReturnedColumn* rc;
for (iter = cols.begin(); iter != cols.end(); iter++)
{
if ((*iter)->refCount() != 0)
{
rc = dynamic_cast<ReturnedColumn*>(iter->get());
break;
}
}
if (iter == cols.end())
{
assert (!cols.empty());
// We take cols[0] here due to the optimization happening in
// derivedTableOptimization. All cols with refCount 0 from
// the end of the cols list are optimized out, until the
// first column with non-zero refCount is encountered. So
// here, if instead of cols[0], we take cols[1] (based on
// some logic) and increment its refCount, then cols[0] is
// not optimized out in derivedTableOptimization and is
// added as a ConstantColumn to the derived table's returned
// column list. This later causes an ineffective row group
// with rows of the form (1, cols[1]_value1) to be created in ExeMgr.
rc = dynamic_cast<ReturnedColumn*>(cols[0].get());
// @bug5634 derived table optimization.
rc->incRefCount();
}
SimpleColumn* sc = new SimpleColumn();
sc->columnName(rc->alias());
sc->sequence(0);
sc->tableAlias(tan.alias);
sc->timeZone(gwi.thd->variables.time_zone->get_name()->ptr());
// @bug5634 derived table optimization.
rc->incRefCount();
sc->derivedTable(csep->derivedTbAlias());
sc->derivedRefCol(rc);
return sc;

View File

@ -0,0 +1,240 @@
DROP DATABASE IF EXISTS mcol4543;
CREATE DATABASE mcol4543;
USE mcol4543;
CREATE TABLE t1 (a int, b int) engine=columnstore;
INSERT INTO t1 values (1, 1), (2, 1), (3, 2), (4, 2), (5, 2);
SELECT "123" FROM (SELECT * FROM t1) q;
123
123
123
123
123
123
SELECT "123" FROM (SELECT "234" FROM t1) q;
123
123
123
123
123
123
SELECT a FROM (SELECT * FROM t1) q;
a
1
2
3
4
5
SELECT b FROM (SELECT * FROM t1) q;
b
1
1
2
2
2
SELECT a,b FROM (SELECT * FROM t1) q;
a b
1 1
2 1
3 2
4 2
5 2
SELECT b,a FROM (SELECT * FROM t1) q;
b a
1 1
1 2
2 3
2 4
2 5
SELECT a FROM (SELECT * FROM (SELECT * FROM (SELECT * FROM t1) q1) q2) q3;
a
1
2
3
4
5
SELECT b FROM (SELECT * FROM (SELECT * FROM (SELECT * FROM t1) q1) q2) q3;
b
1
1
2
2
2
SELECT a FROM (SELECT b,a FROM (SELECT * FROM t1) q1) q2;
a
1
2
3
4
5
SELECT b FROM (SELECT b,a FROM (SELECT * FROM t1) q1) q2;
b
1
1
2
2
2
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a;
123
123
123
123
123
123
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b;
123
123
123
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY a,b ORDER BY a,b;
123
123
123
123
123
123
SELECT COUNT(a) FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a;
COUNT(a)
1
1
1
1
1
SELECT COUNT(b) FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b;
COUNT(b)
2
3
SELECT COUNT(a) FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b;
COUNT(a)
2
3
SELECT COUNT(b) FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a;
COUNT(b)
1
1
1
1
1
SELECT c1 FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c1;
c1
1
2
3
4
5
SELECT c2 FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c2;
c2
1
1
1
1
1
SELECT * FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c1,c2;
c1 c2
1 1
2 1
3 1
4 1
5 1
SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.a=tab2.a ORDER BY tab1.a;
a
1
2
3
4
5
SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.b=tab2.b ORDER BY tab1.a;
a
1
1
2
2
3
3
3
4
4
4
5
5
5
SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.b=tab2.b and tab1.a=tab2.a ORDER BY tab1.a;
a
1
2
3
4
5
SELECT tab1.a, tab2.b FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.a=tab2.a ORDER BY tab1.a,tab2.b;
a b
1 1
2 1
3 2
4 2
5 2
SELECT COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q;
COUNT(a)
10
SELECT COUNT(b) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q;
COUNT(b)
10
SELECT COUNT(b), COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q;
COUNT(b) COUNT(a)
10 10
SELECT COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q GROUP BY b ORDER BY b;
COUNT(a)
4
6
SELECT q1.a FROM (SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q1_1) q1 JOIN
(SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q2_1) q2 ON q1.a=q2.a ORDER BY 1;
a
1
1
1
1
2
2
2
2
3
3
3
3
4
4
4
4
5
5
5
5
SELECT q1.a, q2.b FROM (SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q1_1) q1 JOIN
(SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q2_1) q2 ON q1.a=q2.a ORDER BY 2 desc, 1 asc;
a b
3 2
3 2
3 2
3 2
4 2
4 2
4 2
4 2
5 2
5 2
5 2
5 2
1 1
1 1
1 1
1 1
2 1
2 1
2 1
2 1
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b;
123
123
123
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b;
123
123
123
DROP DATABASE mcol4543;

View File

@ -0,0 +1,67 @@
# Test cases for MCOL-4543
# The test cases demonstrate that non-referenced subquery columns
# (by non-referenced we mean that the subquery column is not
# referenced/used by the outer query), which are optimized out
# by the patch for MCOL-4543, do not impact the query results.
-- source ../include/have_columnstore.inc
--disable_warnings
DROP DATABASE IF EXISTS mcol4543;
--enable_warnings
CREATE DATABASE mcol4543;
USE mcol4543;
CREATE TABLE t1 (a int, b int) engine=columnstore;
INSERT INTO t1 values (1, 1), (2, 1), (3, 2), (4, 2), (5, 2);
# Test subquery columns referenced/not-referenced in simple projections
SELECT "123" FROM (SELECT * FROM t1) q;
SELECT "123" FROM (SELECT "234" FROM t1) q;
SELECT a FROM (SELECT * FROM t1) q;
SELECT b FROM (SELECT * FROM t1) q;
SELECT a,b FROM (SELECT * FROM t1) q;
SELECT b,a FROM (SELECT * FROM t1) q;
SELECT a FROM (SELECT * FROM (SELECT * FROM (SELECT * FROM t1) q1) q2) q3;
SELECT b FROM (SELECT * FROM (SELECT * FROM (SELECT * FROM t1) q1) q2) q3;
SELECT a FROM (SELECT b,a FROM (SELECT * FROM t1) q1) q2;
SELECT b FROM (SELECT b,a FROM (SELECT * FROM t1) q1) q2;
# Test subquery columns referenced/not-referenced in group by's and aggregates
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a;
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b;
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY a,b ORDER BY a,b;
SELECT COUNT(a) FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a;
SELECT COUNT(b) FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b;
SELECT COUNT(a) FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b;
SELECT COUNT(b) FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a;
SELECT c1 FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c1;
SELECT c2 FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c2;
SELECT * FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c1,c2;
# Test subquery columns referenced/not-referenced in joins
SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.a=tab2.a ORDER BY tab1.a;
SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.b=tab2.b ORDER BY tab1.a;
SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.b=tab2.b and tab1.a=tab2.a ORDER BY tab1.a;
SELECT tab1.a, tab2.b FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.a=tab2.a ORDER BY tab1.a,tab2.b;
# Test subquery columns referenced/not-referenced when subqueries contain unions
SELECT COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q;
SELECT COUNT(b) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q;
SELECT COUNT(b), COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q;
SELECT COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q GROUP BY b ORDER BY b;
SELECT q1.a FROM (SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q1_1) q1 JOIN
(SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q2_1) q2 ON q1.a=q2.a ORDER BY 1;
SELECT q1.a, q2.b FROM (SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q1_1) q1 JOIN
(SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q2_1) q2 ON q1.a=q2.a ORDER BY 2 desc, 1 asc;
# Patch for MCOL-4543 also optimizes out an unnecessary BPS projection in PrimProc
# that was happening earlier. The following 2 queries trigger this optimization.
# To see the optimization, run "select calgettrace();" after the query execution.
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b;
SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b;
DROP DATABASE mcol4543;