1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

MCOL-3738 Allow COUNT(DISTINCT) to have multiple parms (#2002)

* MCOL-3738 allow COUNT(DISTINCT) multiple parameters
Changes in the way tupleaggregatestep sets up the aggregate arrays.

* MCOL-3738 mtr test
This commit is contained in:
David.Hall
2021-06-28 12:14:44 -05:00
committed by GitHub
parent 994e8027bd
commit 132146b9c8
4 changed files with 310 additions and 105 deletions

View File

@ -1782,13 +1782,19 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
}
// vectors for aggregate functions
RowAggFunctionType aggOp = ROWAGG_FUNCT_UNDEFINE;
RowAggFunctionType prevAggOp = ROWAGG_FUNCT_UNDEFINE;
for (uint64_t i = 0; i < aggColVec.size(); i++)
{
pUDAFFunc = NULL;
uint32_t aggKey = aggColVec[i].first;
RowAggFunctionType aggOp = functionIdMap(aggColVec[i].second);
aggOp = functionIdMap(aggColVec[i].second);
RowAggFunctionType stats = statsFuncIdMap(aggColVec[i].second);
// Save the op for MULTI_PARM exclusion when COUNT(DISTINCT)
if (aggOp != ROWAGG_MULTI_PARM)
prevAggOp = aggOp;
// skip if this is a constant
if (aggOp == ROWAGG_CONSTANT)
continue;
@ -1829,10 +1835,13 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
throw logic_error(emsg.str());
}
// We skip distinct aggs, including extra parms. These are handled by adding them to group by list above.
if (aggOp == ROWAGG_DISTINCT_SUM ||
aggOp == ROWAGG_DISTINCT_AVG ||
aggOp == ROWAGG_COUNT_DISTINCT_COL_NAME)
continue;
if (aggOp == ROWAGG_MULTI_PARM && prevAggOp == ROWAGG_COUNT_DISTINCT_COL_NAME)
continue;
uint64_t colProj = projColPosMap[aggKey];
@ -2103,7 +2112,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
funct->fpConstCol = udafc->aggParms()[udafcParamIdx];
}
}
else
else if (prevAggOp != ROWAGG_COUNT_DISTINCT_COL_NAME)
{
throw QueryDataExcept("prep1PhaseDistinctAggregate: UDAF multi function with no parms", aggregateFuncErr);
}
@ -2522,6 +2531,8 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
funct->fAggFunction = ROWAGG_DUP_STATS;
else if (funct->fAggFunction == ROWAGG_UDAF)
funct->fAggFunction = ROWAGG_DUP_UDAF;
else if (funct->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME) // Don't track dup for this one. Gets confused when multi-parm.
{}
else
funct->fAggFunction = ROWAGG_DUP_FUNCT;
@ -2724,13 +2735,36 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
// for distinct, each column requires separate rowgroup
vector<SP_ROWAGG_DIST> rowAggSubDistVec;
for (uint64_t i = 0; i < jobInfo.distinctColVec.size(); i++)
uint32_t distinctColKey;
int64_t j;
uint64_t k;
uint64_t outIdx = 0;
for (uint64_t i = 0; i < returnedColVec.size(); i++)
{
uint32_t distinctColKey = jobInfo.distinctColVec[i];
uint64_t j = -1;
if (returnedColVec[i].second == 0)
{
++outIdx;
continue;
}
j = -1;
distinctColKey = -1;
// Find the entry in distinctColVec, if any
for (k = 0; k < jobInfo.distinctColVec.size(); k++)
{
distinctColKey = jobInfo.distinctColVec[k];
if (returnedColVec[i].first == distinctColKey)
break;
}
if (distinctColKey == (uint32_t)-1)
{
++outIdx;
continue;
}
// locate the distinct key in the row group
for (uint64_t k = 0; k < keysAgg.size(); k++)
for (k = 0; k < keysAgg.size(); k++)
{
if (keysProj[k] == distinctColKey)
{
@ -2739,7 +2773,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
}
}
idbassert(j != (uint64_t) - 1);
idbassert(j != -1);
oidsAggSub = oidsAggGb;
keysAggSub = keysAggGb;
@ -2757,20 +2791,9 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
csNumAggSub.push_back(csNumProj[j]);
widthAggSub.push_back(widthProj[j]);
// construct sub-rowgroup
posAggSub.clear();
posAggSub.push_back(2); // rid
for (uint64_t k = 0; k < oidsAggSub.size(); k++)
posAggSub.push_back(posAggSub[k] + widthAggSub[k]);
RowGroup subRg(oidsAggSub.size(), posAggSub, oidsAggSub, keysAggSub, typeAggSub,
csNumAggSub, scaleAggSub, precisionAggSub, jobInfo.stringTableThreshold);
subRgVec.push_back(subRg);
// construct groupby vector
vector<SP_ROWAGG_GRPBY_t> groupBySub;
uint64_t k = 0;
k = 0;
while (k < jobInfo.groupByColVec.size())
{
@ -2778,11 +2801,60 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
groupBySub.push_back(groupby);
k++;
}
// add the distinct column as groupby
SP_ROWAGG_GRPBY_t groupby(new RowAggGroupByCol(j, k));
groupBySub.push_back(groupby);
// Add multi parm distinct
while ((i+1) < returnedColVec.size() && functionIdMap(returnedColVec[i+1].second) == ROWAGG_MULTI_PARM)
{
++i;
uint32_t dColKey = -1;
j = -1;
// Find the entry in distinctColVec, if any
for (k = 0; k < jobInfo.distinctColVec.size(); k++)
{
dColKey = jobInfo.distinctColVec[k];
if (returnedColVec[i].first == dColKey)
break;
}
idbassert(dColKey != (uint32_t)-1);
// locate the distinct key in the row group
for (k = 0; k < keysAgg.size(); k++)
{
if (keysProj[k] == dColKey)
{
j = k;
break;
}
}
idbassert(j != -1);
oidsAggSub.push_back(oidsProj[j]);
keysAggSub.push_back(keysProj[j]);
scaleAggSub.push_back(scaleProj[j]);
precisionAggSub.push_back(precisionProj[j]);
typeAggSub.push_back(typeProj[j]);
csNumAggSub.push_back(csNumProj[j]);
widthAggSub.push_back(widthProj[j]);
SP_ROWAGG_GRPBY_t groupby(new RowAggGroupByCol(j, k));
groupBySub.push_back(groupby);
}
// construct sub-rowgroup
posAggSub.clear();
posAggSub.push_back(2); // rid
for ( k = 0; k < oidsAggSub.size(); k++)
posAggSub.push_back(posAggSub[k] + widthAggSub[k]);
RowGroup subRg(oidsAggSub.size(), posAggSub, oidsAggSub, keysAggSub, typeAggSub,
csNumAggSub, scaleAggSub, precisionAggSub, jobInfo.stringTableThreshold);
subRgVec.push_back(subRg);
// Keep a count of the parms after the first for any aggregate.
// These will be skipped and the count needs to be subtracted
// from where the aux column will be.
@ -2792,37 +2864,26 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
// -- dummy function vector for sub-aggregator, which does distinct only
// -- aggregate function on this distinct column for rowAggDist
vector<SP_ROWAGG_FUNC_t> functionSub1, functionSub2;
// search the function in functionVec
vector<SP_ROWAGG_FUNC_t>::iterator it = functionVec2.begin();
for (uint64_t k = 0; k < returnedColVec.size(); k++)
while (it != functionVec2.end())
{
if (functionIdMap(returnedColVec[i].second) == ROWAGG_MULTI_PARM)
SP_ROWAGG_FUNC_t f = *it++;
if ((f->fOutputColumnIndex == outIdx) &&
(f->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME ||
f->fAggFunction == ROWAGG_DISTINCT_SUM ||
f->fAggFunction == ROWAGG_DISTINCT_AVG))
{
++multiParms;
continue;
}
if (returnedColVec[k].first != distinctColKey)
continue;
// search the function in functionVec
vector<SP_ROWAGG_FUNC_t>::iterator it = functionVec2.begin();
while (it != functionVec2.end())
{
SP_ROWAGG_FUNC_t f = *it++;
if ((f->fOutputColumnIndex == k) &&
(f->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME ||
f->fAggFunction == ROWAGG_DISTINCT_SUM ||
f->fAggFunction == ROWAGG_DISTINCT_AVG))
{
SP_ROWAGG_FUNC_t funct(new RowAggFunctionCol(
f->fAggFunction,
f->fStatsFunction,
groupBySub.size() - 1,
f->fOutputColumnIndex,
f->fAuxColumnIndex-multiParms));
functionSub2.push_back(funct);
}
SP_ROWAGG_FUNC_t funct(
new RowAggFunctionCol(
f->fAggFunction,
f->fStatsFunction,
groupBySub.size() - 1,
f->fOutputColumnIndex,
f->fAuxColumnIndex-multiParms));
functionSub2.push_back(funct);
}
}
@ -2834,6 +2895,8 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
// add to rowAggDist
multiDistinctAggregator->addSubAggregator(subAgg, subRg, functionSub2);
++outIdx;
}
// cover any non-distinct column functions
@ -3968,11 +4031,17 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
}
// vectors for aggregate functions
RowAggFunctionType aggOp = ROWAGG_FUNCT_UNDEFINE;
RowAggFunctionType prevAggOp = ROWAGG_FUNCT_UNDEFINE;
for (uint64_t i = 0; i < aggColVec.size(); i++)
{
// skip on PM if this is a constant
RowAggFunctionType aggOp = functionIdMap(aggColVec[i].second);
aggOp = functionIdMap(aggColVec[i].second);
// Save the op for MULTI_PARM exclusion when COUNT(DISTINCT)
if (aggOp != ROWAGG_MULTI_PARM)
prevAggOp = aggOp;
// skip on PM if this is a constant
if (aggOp == ROWAGG_CONSTANT)
continue;
@ -3994,17 +4063,21 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
throw logic_error(emsg.str());
}
RowAggFunctionType stats = statsFuncIdMap(aggColVec[i].second);
RowAggFunctionType stats = statsFuncIdMap(aggOp);
// skip sum / count(column) if avg is also selected
if ((aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME) &&
(avgSet.find(aggKey) != avgSet.end()))
continue;
// We skip distinct aggs, including extra parms. These are handled by adding them to group by list above.
if (aggOp == ROWAGG_DISTINCT_SUM ||
aggOp == ROWAGG_DISTINCT_AVG ||
aggOp == ROWAGG_COUNT_DISTINCT_COL_NAME)
continue;
if (aggOp == ROWAGG_MULTI_PARM && prevAggOp == ROWAGG_COUNT_DISTINCT_COL_NAME)
continue;
uint64_t colProj = projColPosMap[aggKey];
SP_ROWAGG_FUNC_t funct;
@ -4273,7 +4346,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
funct->fpConstCol = udafc->aggParms()[udafcParamIdx];
}
}
else
else if (prevAggOp != ROWAGG_COUNT_DISTINCT_COL_NAME)
{
throw QueryDataExcept("prep2PhasesDistinctAggregate: UDAF multi function with no parms", aggregateFuncErr);
}
@ -4682,6 +4755,8 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
funct->fAggFunction = ROWAGG_DUP_STATS;
else if (funct->fAggFunction == ROWAGG_UDAF)
funct->fAggFunction = ROWAGG_DUP_UDAF;
else if (funct->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME) // Don't track dup for this one. Gets confused when multi-parm.
{}
else
funct->fAggFunction = ROWAGG_DUP_FUNCT;
@ -4874,16 +4949,39 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
widthAggGb.push_back(widthAggUm[i]);
}
// for distinct, each column requires seperate rowgroup
// for distinct, each column requires a separate rowgroup
vector<SP_ROWAGG_DIST> rowAggSubDistVec;
for (uint64_t i = 0; i < jobInfo.distinctColVec.size(); i++)
uint32_t distinctColKey;
int64_t j;
uint64_t k;
uint64_t outIdx = 0;
for (uint64_t i = 0; i < returnedColVec.size(); i++)
{
uint32_t distinctColKey = jobInfo.distinctColVec[i];
uint64_t j = -1;
if (returnedColVec[i].second == 0)
{
++outIdx;
continue;
}
j = -1;
distinctColKey = -1;
// Find the entry in distinctColVec, if any
for (k = 0; k < jobInfo.distinctColVec.size(); k++)
{
distinctColKey = jobInfo.distinctColVec[k];
if (returnedColVec[i].first == distinctColKey)
break;
}
if (distinctColKey == (uint32_t)-1)
{
++outIdx;
continue;
}
// locate the distinct key in the row group
for (uint64_t k = 0; k < keysAggUm.size(); k++)
for (k = 0; k < keysAggUm.size(); k++)
{
if (keysAggUm[k] == distinctColKey)
{
@ -4892,7 +4990,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
}
}
idbassert(j != (uint64_t) - 1);
idbassert(j != -1);
oidsAggSub = oidsAggGb;
keysAggSub = keysAggGb;
@ -4907,23 +5005,12 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
scaleAggSub.push_back(scaleAggUm[j]);
precisionAggSub.push_back(precisionAggUm[j]);
typeAggSub.push_back(typeAggUm[j]);
csNumAggSub.push_back(csNumAggUm[i]);
csNumAggSub.push_back(csNumAggUm[j]);
widthAggSub.push_back(widthAggUm[j]);
// construct sub-rowgroup
posAggSub.clear();
posAggSub.push_back(2); // rid
for (uint64_t k = 0; k < oidsAggSub.size(); k++)
posAggSub.push_back(posAggSub[k] + widthAggSub[k]);
RowGroup subRg(oidsAggSub.size(), posAggSub, oidsAggSub, keysAggSub, typeAggSub,
csNumAggSub, scaleAggSub, precisionAggSub, jobInfo.stringTableThreshold);
subRgVec.push_back(subRg);
// construct groupby vector
vector<SP_ROWAGG_GRPBY_t> groupBySub;
uint64_t k = 0;
k = 0;
while (k < jobInfo.groupByColVec.size())
{
@ -4931,11 +5018,60 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
groupBySub.push_back(groupby);
k++;
}
// add the distinct column as groupby
SP_ROWAGG_GRPBY_t groupby(new RowAggGroupByCol(j, k));
groupBySub.push_back(groupby);
// Add multi parm distinct
while ((i+1) < returnedColVec.size() && functionIdMap(returnedColVec[i+1].second) == ROWAGG_MULTI_PARM)
{
++i;
uint32_t dColKey = -1;
j = -1;
// Find the entry in distinctColVec, if any
for (k = 0; k < jobInfo.distinctColVec.size(); k++)
{
dColKey = jobInfo.distinctColVec[k];
if (returnedColVec[i].first == dColKey)
break;
}
idbassert(dColKey != (uint32_t)-1);
// locate the distinct key in the row group
for (k = 0; k < keysAggUm.size(); k++)
{
if (keysAggUm[k] == dColKey)
{
j = k;
break;
}
}
idbassert(j != -1);
oidsAggSub.push_back(oidsAggUm[j]);
keysAggSub.push_back(keysAggUm[j]);
scaleAggSub.push_back(scaleAggUm[j]);
precisionAggSub.push_back(precisionAggUm[j]);
typeAggSub.push_back(typeAggUm[j]);
csNumAggSub.push_back(csNumAggUm[j]);
widthAggSub.push_back(widthAggUm[j]);
SP_ROWAGG_GRPBY_t groupby(new RowAggGroupByCol(j, k));
groupBySub.push_back(groupby);
}
// construct sub-rowgroup
posAggSub.clear();
posAggSub.push_back(2); // rid
for ( k = 0; k < oidsAggSub.size(); k++)
posAggSub.push_back(posAggSub[k] + widthAggSub[k]);
RowGroup subRg(oidsAggSub.size(), posAggSub, oidsAggSub, keysAggSub, typeAggSub,
csNumAggSub, scaleAggSub, precisionAggSub, jobInfo.stringTableThreshold);
subRgVec.push_back(subRg);
// Keep a count of the parms after the first for any aggregate.
// These will be skipped and the count needs to be subtracted
// from where the aux column will be.
@ -4945,38 +5081,26 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
// -- dummy function vector for sub-aggregator, which does distinct only
// -- aggregate function on this distinct column for rowAggDist
vector<SP_ROWAGG_FUNC_t> functionSub1, functionSub2;
// search the function in functionVec
vector<SP_ROWAGG_FUNC_t>::iterator it = functionVecUm.begin();
for (uint64_t k = 0; k < returnedColVec.size(); k++)
while (it != functionVecUm.end())
{
if (functionIdMap(returnedColVec[i].second) == ROWAGG_MULTI_PARM)
SP_ROWAGG_FUNC_t f = *it++;
if ((f->fOutputColumnIndex == outIdx) &&
(f->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME ||
f->fAggFunction == ROWAGG_DISTINCT_SUM ||
f->fAggFunction == ROWAGG_DISTINCT_AVG))
{
++multiParms;
continue;
}
if (returnedColVec[k].first != distinctColKey)
continue;
// search the function in functionVec
vector<SP_ROWAGG_FUNC_t>::iterator it = functionVecUm.begin();
while (it != functionVecUm.end())
{
SP_ROWAGG_FUNC_t f = *it++;
if ((f->fOutputColumnIndex == k) &&
(f->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME ||
f->fAggFunction == ROWAGG_DISTINCT_SUM ||
f->fAggFunction == ROWAGG_DISTINCT_AVG))
{
SP_ROWAGG_FUNC_t funct(
new RowAggFunctionCol(
f->fAggFunction,
f->fStatsFunction,
groupBySub.size() - 1,
f->fOutputColumnIndex,
f->fAuxColumnIndex-multiParms));
functionSub2.push_back(funct);
}
SP_ROWAGG_FUNC_t funct(
new RowAggFunctionCol(
f->fAggFunction,
f->fStatsFunction,
groupBySub.size() - 1,
f->fOutputColumnIndex,
f->fAuxColumnIndex-multiParms));
functionSub2.push_back(funct);
}
}
@ -4986,6 +5110,8 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
// add to rowAggDist
multiDistinctAggregator->addSubAggregator(subAgg, subRg, functionSub2);
++outIdx;
}
// cover any non-distinct column functions

View File

@ -4737,10 +4737,11 @@ ReturnedColumn* buildAggregateColumn(Item* item, gp_walk_info& gwi)
gwi.aggOnSelect = true;
// Argument_count() is the # of formal parms to the agg fcn. Columnstore
// only supports 1 argument except UDAnF and GROUP_CONCAT
// TODO: Support more than one parm for COUNT(DISTINCT)
if (isp->argument_count() != 1 && isp->sum_func() != Item_sum::GROUP_CONCAT_FUNC
&& isp->sum_func() != Item_sum::UDF_SUM_FUNC)
// only supports 1 argument except UDAnF, COUNT(DISTINCT) and GROUP_CONCAT
if (isp->argument_count() != 1
&& isp->sum_func() != Item_sum::COUNT_DISTINCT_FUNC
&& isp->sum_func() != Item_sum::GROUP_CONCAT_FUNC
&& isp->sum_func() != Item_sum::UDF_SUM_FUNC)
{
gwi.fatalParseError = true;
gwi.parseErrorText = IDBErrorInfo::instance()->errorMsg(ERR_MUL_ARG_AGG);