diff --git a/dbcon/execplan/calpontselectexecutionplan.h b/dbcon/execplan/calpontselectexecutionplan.h index c11a2c1df..42664b71c 100644 --- a/dbcon/execplan/calpontselectexecutionplan.h +++ b/dbcon/execplan/calpontselectexecutionplan.h @@ -360,6 +360,14 @@ public: return fColumnMap; } + /** column map + * all the columns appeared on query + */ + ColumnMap& columnMap() + { + return fColumnMap; + } + /** assign the static fColMap to non-static fColumnMap. map-wise copy */ void columnMap (const ColumnMap& columnMap); diff --git a/dbcon/mysql/ha_from_sub.cpp b/dbcon/mysql/ha_from_sub.cpp index 3322f51a2..c95bbe96c 100644 --- a/dbcon/mysql/ha_from_sub.cpp +++ b/dbcon/mysql/ha_from_sub.cpp @@ -57,7 +57,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep) for (uint i = 0; i < derivedTbList.size(); i++) { - CalpontSelectExecutionPlan* plan = dynamic_cast(derivedTbList[i].get()); + CalpontSelectExecutionPlan* plan = reinterpret_cast(derivedTbList[i].get()); CalpontSelectExecutionPlan::ReturnedColumnList cols = plan->returnedCols(); vector unionColVec; @@ -73,7 +73,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep) for (uint j = 0; j < plan->unionVec().size(); j++) { unionColVec.push_back( - dynamic_cast(plan->unionVec()[j].get())->returnedCols()); + reinterpret_cast(plan->unionVec()[j].get())->returnedCols()); } } @@ -82,7 +82,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep) for (uint j = 0; j < plan->unionVec().size(); j++) { - if (dynamic_cast(plan->unionVec()[j].get())->tableList().empty()) + if (reinterpret_cast(plan->unionVec()[j].get())->tableList().empty()) { horizontalOptimization = false; break; @@ -93,7 +93,29 @@ void derivedTableOptimization(THD* thd, SCSEP& csep) { int64_t val = 1; - for (uint i = 0; i < cols.size(); i++) + // TODO MCOL-4543 Only project those columns from the subquery + // which are referenced in the outer select. So for example, + // if a table t contains 10 columns c1 ... 
c10 : + // "select count(c2) from (select * from t) q;" + // with p being the subquery execution plan, p->columnMap() + // and p->returnedCols() should both be of size 1, instead + // of 10, with entries for c2 in each. + // + // We are currently performing a dumb optimization: + // Instead of just referencing c2, we are referencing (c1,c2) + // for the above query. This is due to complexity associated + // with modifying ReturnedColumn::colPosition() + // (from a value of 1 to a value of 0) of the outer query + // which references c2. So essentially, if c2 is replaced by c10 + // in the above query, we fall back to projecting all 10 columns + // of the subquery in ExeMgr. + // This will be addressed in the future. + CalpontSelectExecutionPlan::ReturnedColumnList nonConstCols; + vector nonConstUnionColVec(unionColVec.size()); + + int64_t lastNonConstIndex = -1; + + for (int64_t i = cols.size() - 1; i >= 0; i--) { //if (cols[i]->derivedTable().empty()) if (cols[i]->refCount() == 0) @@ -101,22 +123,84 @@ void derivedTableOptimization(THD* thd, SCSEP& csep) if (cols[i]->derivedRefCol()) cols[i]->derivedRefCol()->decRefCount(); - cols[i].reset(new ConstantColumn(val)); - (dynamic_cast(cols[i].get()))->timeZone(thd->variables.time_zone->get_name()->ptr()); + if (lastNonConstIndex == -1) + { + SimpleColumn* sc = dynamic_cast(cols[i].get()); + + if (sc && (plan->columnMap().count(sc->columnName()) == 1)) + { + plan->columnMap().erase(sc->columnName()); + } + } + else + { + cols[i].reset(new ConstantColumn(val)); + (reinterpret_cast(cols[i].get()))->timeZone(thd->variables.time_zone->get_name()->ptr()); + } for (uint j = 0; j < unionColVec.size(); j++) { - unionColVec[j][i].reset(new ConstantColumn(val)); - (dynamic_cast(unionColVec[j][i].get()))->timeZone(thd->variables.time_zone->get_name()->ptr()); + if (lastNonConstIndex == -1) + { + CalpontSelectExecutionPlan* unionSubPlan = + reinterpret_cast(plan->unionVec()[j].get()); + + SimpleColumn* sc = 
dynamic_cast(unionSubPlan->returnedCols()[i].get()); + + if (sc && (unionSubPlan->columnMap().count(sc->columnName()) == 1)) + { + unionSubPlan->columnMap().erase(sc->columnName()); + } + } + else + { + unionColVec[j][i].reset(new ConstantColumn(val)); + (reinterpret_cast(unionColVec[j][i].get()))->timeZone(thd->variables.time_zone->get_name()->ptr()); + } } } + else if (lastNonConstIndex == -1) + { + lastNonConstIndex = i; + } + } + + if (lastNonConstIndex == -1) + { + // None of the subquery columns are referenced, just use the first one + if (!cols.empty()) + { + cols[0].reset(new ConstantColumn(val)); + (reinterpret_cast(cols[0].get()))->timeZone(thd->variables.time_zone->get_name()->ptr()); + nonConstCols.push_back(cols[0]); + + for (uint j = 0; j < unionColVec.size(); j++) + { + unionColVec[j][0].reset(new ConstantColumn(val)); + (reinterpret_cast(unionColVec[j][0].get()))->timeZone(thd->variables.time_zone->get_name()->ptr()); + nonConstUnionColVec[j].push_back(unionColVec[j][0]); + } + } + } + else + { + nonConstCols.assign(cols.begin(), cols.begin() + lastNonConstIndex + 1); + + for (uint j = 0; j < unionColVec.size(); j++) + { + nonConstUnionColVec[j].assign(unionColVec[j].begin(), unionColVec[j].begin() + lastNonConstIndex + 1); + } } // set back - plan->returnedCols(cols); + plan->returnedCols(nonConstCols); for (uint j = 0; j < unionColVec.size(); j++) - dynamic_cast(plan->unionVec()[j].get())->returnedCols(unionColVec[j]); + { + CalpontSelectExecutionPlan* unionSubPlan = + reinterpret_cast(plan->unionVec()[j].get()); + unionSubPlan->returnedCols(nonConstUnionColVec[j]); + } } } @@ -151,7 +235,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep) for (uint i = 0; i < derivedTbList.size(); i++) { - CalpontSelectExecutionPlan* plan = dynamic_cast(derivedTbList[i].get()); + CalpontSelectExecutionPlan* plan = reinterpret_cast(derivedTbList[i].get()); CalpontSelectExecutionPlan::ReturnedColumnList derivedColList = plan->returnedCols(); mapIt = 
derivedTbFilterMap.find(plan->derivedTbAlias()); @@ -181,7 +265,7 @@ void derivedTableOptimization(THD* thd, SCSEP& csep) for (uint j = 0; j < plan->unionVec().size(); j++) { CalpontSelectExecutionPlan* unionPlan = - dynamic_cast(plan->unionVec()[j].get()); + reinterpret_cast(plan->unionVec()[j].get()); CalpontSelectExecutionPlan::ReturnedColumnList unionColList = unionPlan->returnedCols(); ParseTree* mainFilterForUnion = new ParseTree(); mainFilterForUnion->copyTree(*(mapIt->second)); diff --git a/dbcon/mysql/ha_mcs_execplan.cpp b/dbcon/mysql/ha_mcs_execplan.cpp index 437a7e253..c6db35ebc 100755 --- a/dbcon/mysql/ha_mcs_execplan.cpp +++ b/dbcon/mysql/ha_mcs_execplan.cpp @@ -2920,15 +2920,46 @@ SimpleColumn* getSmallestColumn(boost::shared_ptr csc, if (tan.alias == csep->derivedTbAlias()) { - assert (!csep->returnedCols().empty()); - ReturnedColumn* rc = dynamic_cast(csep->returnedCols()[0].get()); + const CalpontSelectExecutionPlan::ReturnedColumnList& cols = csep->returnedCols(); + + CalpontSelectExecutionPlan::ReturnedColumnList::const_iterator iter; + + ReturnedColumn* rc; + + for (iter = cols.begin(); iter != cols.end(); iter++) + { + if ((*iter)->refCount() != 0) + { + rc = dynamic_cast(iter->get()); + break; + } + } + + if (iter == cols.end()) + { + assert (!cols.empty()); + + // We take cols[0] here due to the optimization happening in + // derivedTableOptimization. All cols with refCount 0 from + // the end of the cols list are optimized out, until the + // first column with non-zero refCount is encountered. So + // here, if instead of cols[0], we take cols[1] (based on + // some logic) and increment its refCount, then cols[0] is + // not optimized out in derivedTableOptimization and is + // added as a ConstantColumn to the derived table's returned + // column list. This later causes an ineffective row group + // with row of the form (1, cols[1]_value1) to be created in ExeMgr. 
+ rc = dynamic_cast(cols[0].get()); + + // @bug5634 derived table optimization. + rc->incRefCount(); + } + SimpleColumn* sc = new SimpleColumn(); sc->columnName(rc->alias()); sc->sequence(0); sc->tableAlias(tan.alias); sc->timeZone(gwi.thd->variables.time_zone->get_name()->ptr()); - // @bug5634 derived table optimization. - rc->incRefCount(); sc->derivedTable(csep->derivedTbAlias()); sc->derivedRefCol(rc); return sc; diff --git a/mtr/basic/r/mcol-4543.result b/mtr/basic/r/mcol-4543.result new file mode 100644 index 000000000..2ba78a8bd --- /dev/null +++ b/mtr/basic/r/mcol-4543.result @@ -0,0 +1,240 @@ +DROP DATABASE IF EXISTS mcol4543; +CREATE DATABASE mcol4543; +USE mcol4543; +CREATE TABLE t1 (a int, b int) engine=columnstore; +INSERT INTO t1 values (1, 1), (2, 1), (3, 2), (4, 2), (5, 2); +SELECT "123" FROM (SELECT * FROM t1) q; +123 +123 +123 +123 +123 +123 +SELECT "123" FROM (SELECT "234" FROM t1) q; +123 +123 +123 +123 +123 +123 +SELECT a FROM (SELECT * FROM t1) q; +a +1 +2 +3 +4 +5 +SELECT b FROM (SELECT * FROM t1) q; +b +1 +1 +2 +2 +2 +SELECT a,b FROM (SELECT * FROM t1) q; +a b +1 1 +2 1 +3 2 +4 2 +5 2 +SELECT b,a FROM (SELECT * FROM t1) q; +b a +1 1 +1 2 +2 3 +2 4 +2 5 +SELECT a FROM (SELECT * FROM (SELECT * FROM (SELECT * FROM t1) q1) q2) q3; +a +1 +2 +3 +4 +5 +SELECT b FROM (SELECT * FROM (SELECT * FROM (SELECT * FROM t1) q1) q2) q3; +b +1 +1 +2 +2 +2 +SELECT a FROM (SELECT b,a FROM (SELECT * FROM t1) q1) q2; +a +1 +2 +3 +4 +5 +SELECT b FROM (SELECT b,a FROM (SELECT * FROM t1) q1) q2; +b +1 +1 +2 +2 +2 +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a; +123 +123 +123 +123 +123 +123 +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b; +123 +123 +123 +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY a,b ORDER BY a,b; +123 +123 +123 +123 +123 +123 +SELECT COUNT(a) FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a; +COUNT(a) +1 +1 +1 +1 +1 +SELECT COUNT(b) FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b; +COUNT(b) +2 +3 +SELECT COUNT(a) FROM 
(SELECT * FROM t1) q GROUP BY b ORDER BY b; +COUNT(a) +2 +3 +SELECT COUNT(b) FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a; +COUNT(b) +1 +1 +1 +1 +1 +SELECT c1 FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c1; +c1 +1 +2 +3 +4 +5 +SELECT c2 FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c2; +c2 +1 +1 +1 +1 +1 +SELECT * FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c1,c2; +c1 c2 +1 1 +2 1 +3 1 +4 1 +5 1 +SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.a=tab2.a ORDER BY tab1.a; +a +1 +2 +3 +4 +5 +SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.b=tab2.b ORDER BY tab1.a; +a +1 +1 +2 +2 +3 +3 +3 +4 +4 +4 +5 +5 +5 +SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.b=tab2.b and tab1.a=tab2.a ORDER BY tab1.a; +a +1 +2 +3 +4 +5 +SELECT tab1.a, tab2.b FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.a=tab2.a ORDER BY tab1.a,tab2.b; +a b +1 1 +2 1 +3 2 +4 2 +5 2 +SELECT COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q; +COUNT(a) +10 +SELECT COUNT(b) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q; +COUNT(b) +10 +SELECT COUNT(b), COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q; +COUNT(b) COUNT(a) +10 10 +SELECT COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q GROUP BY b ORDER BY b; +COUNT(a) +4 +6 +SELECT q1.a FROM (SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q1_1) q1 JOIN +(SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q2_1) q2 ON q1.a=q2.a ORDER BY 1; +a +1 +1 +1 +1 +2 +2 +2 +2 +3 +3 +3 +3 +4 +4 +4 +4 +5 +5 +5 +5 +SELECT q1.a, q2.b FROM (SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q1_1) q1 JOIN +(SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q2_1) q2 ON q1.a=q2.a ORDER BY 2 desc, 1 asc; +a b +3 2 +3 2 +3 2 +3 2 +4 2 +4 2 +4 2 +4 2 +5 2 +5 2 +5 2 +5 2 +1 1 +1 1 +1 1 +1 1 +2 1 +2 1 +2 1 +2 1 +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b; +123 
+123 +123 +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b; +123 +123 +123 +DROP DATABASE mcol4543; diff --git a/mtr/basic/t/mcol-4543.test b/mtr/basic/t/mcol-4543.test new file mode 100644 index 000000000..d92b399b9 --- /dev/null +++ b/mtr/basic/t/mcol-4543.test @@ -0,0 +1,67 @@ +# Test cases for MCOL-4543 +# The test cases demonstrate that non-referenced subquery columns +# (by non-reference we mean the subquery column is not +# referenced/used by the outer query) which are optimized out +# by the patch for MCOL-4543 do not impact the query results. + +-- source ../include/have_columnstore.inc + +--disable_warnings +DROP DATABASE IF EXISTS mcol4543; +--enable_warnings + +CREATE DATABASE mcol4543; + +USE mcol4543; + +CREATE TABLE t1 (a int, b int) engine=columnstore; + +INSERT INTO t1 values (1, 1), (2, 1), (3, 2), (4, 2), (5, 2); + +# Test subquery columns referenced/not-referenced in simple projections +SELECT "123" FROM (SELECT * FROM t1) q; +SELECT "123" FROM (SELECT "234" FROM t1) q; +SELECT a FROM (SELECT * FROM t1) q; +SELECT b FROM (SELECT * FROM t1) q; +SELECT a,b FROM (SELECT * FROM t1) q; +SELECT b,a FROM (SELECT * FROM t1) q; +SELECT a FROM (SELECT * FROM (SELECT * FROM (SELECT * FROM t1) q1) q2) q3; +SELECT b FROM (SELECT * FROM (SELECT * FROM (SELECT * FROM t1) q1) q2) q3; +SELECT a FROM (SELECT b,a FROM (SELECT * FROM t1) q1) q2; +SELECT b FROM (SELECT b,a FROM (SELECT * FROM t1) q1) q2; + +# Test subquery columns referenced/not-referenced in group by's and aggregates +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a; +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b; +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY a,b ORDER BY a,b; +SELECT COUNT(a) FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a; +SELECT COUNT(b) FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b; +SELECT COUNT(a) FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b; +SELECT COUNT(b) FROM (SELECT * FROM t1) q GROUP BY a ORDER BY a; +SELECT c1 FROM (SELECT a AS 
c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c1; +SELECT c2 FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c2; +SELECT * FROM (SELECT a AS c1, COUNT(a) AS c2 FROM t1 GROUP BY c1) q ORDER BY c1,c2; + +# Test subquery columns referenced/not-referenced in joins +SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.a=tab2.a ORDER BY tab1.a; +SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.b=tab2.b ORDER BY tab1.a; +SELECT tab1.a FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.b=tab2.b and tab1.a=tab2.a ORDER BY tab1.a; +SELECT tab1.a, tab2.b FROM t1 tab1 JOIN (SELECT * FROM t1) tab2 ON tab1.a=tab2.a ORDER BY tab1.a,tab2.b; + +# Test subquery columns referenced/not-referenced when subqueries contain unions +SELECT COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q; +SELECT COUNT(b) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q; +SELECT COUNT(b), COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q; +SELECT COUNT(a) FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q GROUP BY b ORDER BY b; +SELECT q1.a FROM (SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q1_1) q1 JOIN +(SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q2_1) q2 ON q1.a=q2.a ORDER BY 1; +SELECT q1.a, q2.b FROM (SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q1_1) q1 JOIN +(SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) q2_1) q2 ON q1.a=q2.a ORDER BY 2 desc, 1 asc; + +# Patch for MCOL-4543 also optimizes out an unnecessary BPS projection in PrimProc +# that was happening earlier. The following 2 queries trigger this optimization. +# To see the optimization, run "select calgettrace();" after the query execution. +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b ORDER BY b; +SELECT "123" FROM (SELECT * FROM t1) q GROUP BY b; + +DROP DATABASE mcol4543;