1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-29 08:21:15 +03:00

Merge pull request #2044 from dhall-MariaDB/MCOL-3738

MCOL-3738 COUNT(DISTINCT) with multiple parms
This commit is contained in:
Gagan Goel
2021-07-12 07:34:56 -04:00
committed by GitHub
4 changed files with 87 additions and 19 deletions

View File

@ -2184,6 +2184,9 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
// locate the return column position in aggregated rowgroup
uint64_t outIdx = 0;
RowAggFunctionType prevAggOp = ROWAGG_FUNCT_UNDEFINE;
uint32_t prevRetKey = 0;
for (uint64_t i = 0; i < returnedColVec.size(); i++)
{
udafc = NULL;
@ -2195,10 +2198,31 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
if (aggOp == ROWAGG_MULTI_PARM)
{
// Duplicate detection doesn't work for multi-parm.
// If this function was earlier detected as a duplicate, unduplicate it.
SP_ROWAGG_FUNC_t funct = functionVec2.back();
if (funct->fAggFunction == ROWAGG_DUP_FUNCT)
funct->fAggFunction = prevAggOp;
// Remove it from aggDupFuncMap if it's in there.
funct->hasMultiParm = true;
AGG_MAP::iterator it = aggDupFuncMap.find(boost::make_tuple(prevRetKey, prevAggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
if (it != aggDupFuncMap.end())
{
aggDupFuncMap.erase(it);
}
// Skip on final agg.: Extra parms for an aggregate have no work there.
++multiParms;
continue;
}
else
{
// Save the op for MULTI_PARM exclusion when COUNT(DISTINCT)
prevAggOp = aggOp;
prevRetKey = returnedColVec[i].first;
}
if (find(jobInfo.distinctColVec.begin(), jobInfo.distinctColVec.end(), retKey) !=
jobInfo.distinctColVec.end() )
@ -2543,8 +2567,6 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
funct->fAggFunction = ROWAGG_DUP_STATS;
else if (funct->fAggFunction == ROWAGG_UDAF)
funct->fAggFunction = ROWAGG_DUP_UDAF;
else if (funct->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME) // Don't track dup for this one. Gets confused when multi-parm.
{}
else
funct->fAggFunction = ROWAGG_DUP_FUNCT;
@ -2599,7 +2621,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
// now fix the AVG distinct function, locate the count(distinct column) position
for (uint64_t i = 0; i < functionVec2.size(); i++)
{
if (functionVec2[i]->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME)
if (functionVec2[i]->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME && !functionVec2[i]->hasMultiParm)
{
// if the count(distinct k) can be associated with an avg(distinct k)
map<uint32_t, SP_ROWAGG_FUNC_t>::iterator k =
@ -2919,11 +2941,6 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
for (uint64_t k = 0; k < returnedColVec.size(); k++)
{
if (functionIdMap(returnedColVec[k].second) == ROWAGG_MULTI_PARM)
{
++multiParms;
continue;
}
// search non-distinct functions in functionVec
vector<SP_ROWAGG_FUNC_t>::iterator it = functionVec2.begin();
@ -4471,6 +4488,9 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
// locate the return column position in aggregated rowgroup from PM
// outIdx is i without the multi-columns,
uint64_t outIdx = 0;
RowAggFunctionType prevAggOp = ROWAGG_FUNCT_UNDEFINE;
uint32_t prevRetKey = 0;
for (uint64_t i = 0; i < returnedColVec.size(); i++)
{
pUDAFFunc = NULL;
@ -4483,10 +4503,30 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
if (aggOp == ROWAGG_MULTI_PARM)
{
// Skip on UM: Extra parms for an aggregate have no work on the UM
// Duplicate detection doesn't work for multi-parm.
// If this function was earlier detected as a duplicate, unduplicate it.
SP_ROWAGG_FUNC_t funct = functionVecUm.back();
if (funct->fAggFunction == ROWAGG_DUP_FUNCT)
funct->fAggFunction = prevAggOp;
// Remove it from aggDupFuncMap if it's in there.
funct->hasMultiParm = true;
AGG_MAP::iterator it = aggDupFuncMap.find(boost::make_tuple(prevRetKey, prevAggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
if (it != aggDupFuncMap.end())
{
aggDupFuncMap.erase(it);
}
// Skip further UM processing of the multi-parm: Extra parms for an aggregate have no work on the UM
++multiParms;
continue;
}
else
{
// Save the op for MULTI_PARM exclusion when COUNT(DISTINCT)
prevAggOp = aggOp;
prevRetKey = returnedColVec[i].first;
}
if (aggOp == ROWAGG_UDAF)
{
@ -4763,8 +4803,6 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
funct->fAggFunction = ROWAGG_DUP_STATS;
else if (funct->fAggFunction == ROWAGG_UDAF)
funct->fAggFunction = ROWAGG_DUP_UDAF;
else if (funct->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME) // Don't track dup for this one. Gets confused when multi-parm.
{}
else
funct->fAggFunction = ROWAGG_DUP_FUNCT;
@ -4819,7 +4857,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
//distinct avg
for (uint64_t i = 0; i < functionVecUm.size(); i++)
{
if (functionVecUm[i]->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME)
if (functionVecUm[i]->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME && !functionVecUm[i]->hasMultiParm)
{
map<uint32_t, SP_ROWAGG_FUNC_t>::iterator k =
avgDistFuncMap.find(keysAggDist[functionVecUm[i]->fOutputColumnIndex]);
@ -5130,11 +5168,6 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
for (uint64_t k = 0; k < returnedColVec.size(); k++)
{
if (functionIdMap(returnedColVec[k].second) == ROWAGG_MULTI_PARM)
{
++multiParms;
continue;
}
// search non-distinct functions in functionVec
vector<SP_ROWAGG_FUNC_t>::iterator it = functionVecUm.begin();

View File

@ -29,10 +29,35 @@ idx count(distinct c1, c2) count(distinct c1, c3, char1)
2 3 2
3 2 2
4 1 1
select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1;
avg(distinct c2) count(c2) count( distinct c2, c3) count(c2) sum(distinct c2) avg(distinct c3) count(distinct c2, c3)
3.0000 12 12 12 9 3.0000 12
select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
avg(distinct c2) count(c2) count( distinct c2, c3) count(c2) sum(distinct c2) avg(distinct c3) count(distinct c2, c3)
3.0000 6 6 6 9 2.0000 6
3.0000 6 6 6 9 3.6667 6
select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2) from t1 group by c1;
avg(distinct c2) count(c2) count( distinct c2, c3) count(c2) sum(distinct c2) avg(distinct c3) count(distinct c2)
3.0000 6 6 6 9 2.0000 3
3.0000 6 6 6 9 3.6667 3
select avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
avg(distinct c2) count(c2) count( distinct c2) count(c2) sum(distinct c2) avg(distinct c3) count(distinct c2, c3)
3.0000 6 3 6 9 2.0000 6
3.0000 6 3 6 9 3.6667 6
select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3, c1) from t1;
group_concat(distinct char1) avg(distinct c2) count(c2) count( distinct c2) count(c2) sum(distinct c2) avg(distinct c3) count(distinct c2, c3, c1)
elsewhere this way comes,something this way comes 3.0000 12 3 12 9 3.0000 12
select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
group_concat(distinct char1) avg(distinct c2) count(c2) count( distinct c2) count(c2) sum(distinct c2) avg(distinct c3) count(distinct c2, c3)
elsewhere this way comes,something this way comes 3.0000 6 3 6 9 2.0000 6
elsewhere this way comes,something this way comes 3.0000 6 3 6 9 3.6667 6
select idx, sum(c3), count(distinct c1, c2), count(distinct c1, c3, char1), group_concat("ls_", char1) from t1 group by idx order by idx;
idx sum(c3) count(distinct c1, c2) count(distinct c1, c3, char1) group_concat("ls_", char1)
1 9 6 4 ls_something this way comes,ls_elsewhere this way comes,ls_something this way comes,ls_something this way comes,ls_elsewhere this way comes,ls_elsewhere this way comes
2 9 3 2 ls_something this way comes,ls_elsewhere this way comes,ls_something this way comes
3 8 2 2 ls_something this way comes,ls_elsewhere this way comes
4 5 1 1 ls_elsewhere this way comes
select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3, c1) from t1;
group_concat(distinct char1) avg(distinct c2) count(c2) count( distinct c2) count(c2) sum(distinct c2) avg(distinct c3) count(distinct c2, c3, c1)
elsewhere this way comes,something this way comes 3.0000 12 3 12 9 3.0000 12
DROP DATABASE mcol_3738_db;

View File

@ -33,8 +33,16 @@ insert into t1 values (1, 2, 2, 1, "something this way comes"),
select count(distinct c1, c2), count(distinct char1) from t1;
select idx, count(distinct c1, c2), count(distinct c1, c3, char1) from t1 group by idx order by idx;
select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1;
select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2) from t1 group by c1;
select avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3, c1) from t1;
# group_concat causes the aggregation to be performed on UM only.
select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
select idx, sum(c3), count(distinct c1, c2), count(distinct c1, c3, char1), group_concat("ls_", char1) from t1 group by idx order by idx;
select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3, c1) from t1;
# Clean UP
DROP DATABASE mcol_3738_db;

View File

@ -176,7 +176,7 @@ struct RowAggFunctionCol
RowAggFunctionCol(RowAggFunctionType aggFunction, RowAggFunctionType stats,
int32_t inputColIndex, int32_t outputColIndex, int32_t auxColIndex = -1) :
fAggFunction(aggFunction), fStatsFunction(stats), fInputColumnIndex(inputColIndex),
fOutputColumnIndex(outputColIndex), fAuxColumnIndex(auxColIndex) {}
fOutputColumnIndex(outputColIndex), fAuxColumnIndex(auxColIndex), hasMultiParm(false) {}
virtual ~RowAggFunctionCol() = default;
virtual void serialize(messageqcpp::ByteStream& bs) const;
@ -203,6 +203,8 @@ struct RowAggFunctionCol
// with fAggFunction == ROWAGG_MULTI_PARM. Order is important.
// If this parameter is constant, that value is here.
execplan::SRCP fpConstCol;
bool hasMultiParm;
};