1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-01 06:46:55 +03:00

fix(aggregate): MCOL-5467 Add support for duplicate expressions in group by. (#3045)

This patch adds support for duplicate expressions (builtin_functions) with
one argument in select statement and group by statement.
This commit is contained in:
Denis Khalikov
2023-12-05 15:04:53 +03:00
committed by GitHub
parent b25dce3181
commit 9119f6f7b8
6 changed files with 659 additions and 364 deletions

View File

@ -75,67 +75,9 @@ using namespace querytele;
namespace
{
struct cmpTuple
{
bool operator()(boost::tuple<uint32_t, int, mcsv1sdk::mcsv1_UDAF*, std::vector<uint32_t>*> a,
boost::tuple<uint32_t, int, mcsv1sdk::mcsv1_UDAF*, std::vector<uint32_t>*> b) const
{
uint32_t keya = boost::get<0>(a);
uint32_t keyb = boost::get<0>(b);
int opa;
int opb;
mcsv1sdk::mcsv1_UDAF* pUDAFa;
mcsv1sdk::mcsv1_UDAF* pUDAFb;
// If key is less than
if (keya < keyb)
return true;
if (keya == keyb)
{
// test Op
opa = boost::get<1>(a);
opb = boost::get<1>(b);
if (opa < opb)
return true;
if (opa == opb)
{
// look at the UDAF object
pUDAFa = boost::get<2>(a);
pUDAFb = boost::get<2>(b);
if (pUDAFa < pUDAFb)
return true;
if (pUDAFa == pUDAFb)
{
std::vector<uint32_t>* paramKeysa = boost::get<3>(a);
std::vector<uint32_t>* paramKeysb = boost::get<3>(b);
if (paramKeysa == NULL || paramKeysb == NULL)
return false;
if (paramKeysa->size() < paramKeysb->size())
return true;
if (paramKeysa->size() == paramKeysb->size())
{
for (uint64_t i = 0; i < paramKeysa->size(); ++i)
{
if ((*paramKeysa)[i] < (*paramKeysb)[i])
return true;
}
}
}
}
}
return false;
}
};
typedef vector<std::pair<Row::Pointer, uint64_t>> RowBucket;
typedef vector<RowBucket> RowBucketVec;
// The AGG_MAP type is used to maintain a list of aggregate functions in order to
// detect duplicates. Since all UDAF have the same op type (ROWAGG_UDAF), we add in
// the function pointer in order to ensure uniqueness.
typedef map<boost::tuple<uint32_t, int, mcsv1sdk::mcsv1_UDAF*, std::vector<uint32_t>*>, uint64_t, cmpTuple>
AGG_MAP;
inline RowAggFunctionType functionIdMap(int planFuncId)
{
switch (planFuncId)
@ -1190,13 +1132,31 @@ void TupleAggregateStep::prep1PhaseAggregate(JobInfo& jobInfo, vector<RowGroup>&
}
else
{
Message::Args args;
args.add(keyName(i, key, jobInfo));
string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
cerr << "prep1PhaseAggregate: " << emsg << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[key].fId
<< ", alias=" << jobInfo.keyInfo->tupleKeyVec[key].fTable
<< ", view=" << jobInfo.keyInfo->tupleKeyVec[key].fView << ", function=" << (int)aggOp << endl;
throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
uint32_t foundTupleKey{0};
if (tryToFindEqualFunctionColumnByTupleKey(jobInfo, groupbyMap, key, foundTupleKey))
{
oidsAgg.push_back(oidsProj[colProj]);
keysAgg.push_back(key);
scaleAgg.push_back(scaleProj[colProj]);
precisionAgg.push_back(precisionProj[colProj]);
typeAgg.push_back(typeProj[colProj]);
csNumAgg.push_back(csNumProj[colProj]);
widthAgg.push_back(width[colProj]);
// Update key.
key = foundTupleKey;
++outIdx;
continue;
}
else
{
Message::Args args;
args.add(keyName(i, key, jobInfo));
string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
cerr << "prep1PhaseAggregate: " << emsg << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[key].fId
<< ", alias=" << jobInfo.keyInfo->tupleKeyVec[key].fTable
<< ", view=" << jobInfo.keyInfo->tupleKeyVec[key].fView << ", function=" << (int)aggOp << endl;
throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
}
}
}
@ -1503,7 +1463,8 @@ void TupleAggregateStep::prep1PhaseAggregate(JobInfo& jobInfo, vector<RowGroup>&
RowGroup aggRG(oidsAgg.size(), posAgg, oidsAgg, keysAgg, typeAgg, csNumAgg, scaleAgg, precisionAgg,
jobInfo.stringTableThreshold);
SP_ROWAGG_UM_t rowAgg(new RowAggregationUM(groupBy, functionVec, jobInfo.rm, jobInfo.umMemLimit, jobInfo.hasRollup));
SP_ROWAGG_UM_t rowAgg(
new RowAggregationUM(groupBy, functionVec, jobInfo.rm, jobInfo.umMemLimit, jobInfo.hasRollup));
rowAgg->timeZone(jobInfo.timeZone);
rowgroups.push_back(aggRG);
aggregators.push_back(rowAgg);
@ -2245,81 +2206,108 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(JobInfo& jobInfo, vector<Ro
// not a direct hit -- a returned column is not already in the RG from PMs
else
{
bool returnColMissing = true;
// check if a SUM or COUNT covered by AVG
if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
uint32_t foundTupleKey{0};
if (tryToFindEqualFunctionColumnByTupleKey(jobInfo, aggFuncMap, retKey, foundTupleKey))
{
it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
udafc ? udafc->getContext().getParamKeys() : NULL));
if (it != aggFuncMap.end())
AGG_MAP::iterator it = aggFuncMap.find(boost::make_tuple(
foundTupleKey, aggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
colAgg = it->second;
oidsAggDist.push_back(oidsAgg[colAgg]);
keysAggDist.push_back(keysAgg[colAgg]);
scaleAggDist.push_back(scaleAgg[colAgg]);
precisionAggDist.push_back(precisionAgg[colAgg]);
typeAggDist.push_back(typeAgg[colAgg]);
csNumAggDist.push_back(csNumAgg[colAgg]);
uint32_t width = widthAgg[colAgg];
if (aggOp == ROWAGG_GROUP_CONCAT || aggOp == ROWAGG_JSON_ARRAY)
{
// false alarm
returnColMissing = false;
TupleInfo ti = getTupleInfo(retKey, jobInfo);
colAgg = it->second;
if (ti.width > width)
width = ti.width;
}
widthAggDist.push_back(width);
if (aggOp == ROWAGG_SUM)
// Update the `retKey` to specify that this column is a duplicate.
retKey = foundTupleKey;
}
else
{
bool returnColMissing = true;
// check if a SUM or COUNT covered by AVG
if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
{
it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
udafc ? udafc->getContext().getParamKeys() : NULL));
if (it != aggFuncMap.end())
{
oidsAggDist.push_back(oidsAgg[colAgg]);
keysAggDist.push_back(retKey);
csNumAggDist.push_back(8);
wideDecimalOrLongDouble(colAgg, typeAgg[colAgg], precisionAgg, scaleAgg, widthAgg,
typeAggDist, scaleAggDist, precisionAggDist, widthAggDist);
}
else
{
// leave the count() to avg
aggOp = ROWAGG_COUNT_NO_OP;
// false alarm
returnColMissing = false;
oidsAggDist.push_back(oidsAgg[colAgg]);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(0);
colAgg = it->second;
if (isUnsigned(typeAgg[colAgg]))
if (aggOp == ROWAGG_SUM)
{
typeAggDist.push_back(CalpontSystemCatalog::UBIGINT);
precisionAggDist.push_back(20);
oidsAggDist.push_back(oidsAgg[colAgg]);
keysAggDist.push_back(retKey);
csNumAggDist.push_back(8);
wideDecimalOrLongDouble(colAgg, typeAgg[colAgg], precisionAgg, scaleAgg, widthAgg,
typeAggDist, scaleAggDist, precisionAggDist, widthAggDist);
}
else
{
typeAggDist.push_back(CalpontSystemCatalog::BIGINT);
precisionAggDist.push_back(19);
// leave the count() to avg
aggOp = ROWAGG_COUNT_NO_OP;
oidsAggDist.push_back(oidsAgg[colAgg]);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(0);
if (isUnsigned(typeAgg[colAgg]))
{
typeAggDist.push_back(CalpontSystemCatalog::UBIGINT);
precisionAggDist.push_back(20);
}
else
{
typeAggDist.push_back(CalpontSystemCatalog::BIGINT);
precisionAggDist.push_back(19);
}
csNumAggDist.push_back(8);
widthAggDist.push_back(bigIntWidth);
}
csNumAggDist.push_back(8);
widthAggDist.push_back(bigIntWidth);
}
}
}
else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
jobInfo.expressionVec.end())
{
// a function on aggregation
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
jobInfo.expressionVec.end())
{
// a function on aggregation
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
returnColMissing = false;
}
else if (aggOp == ROWAGG_CONSTANT)
{
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
returnColMissing = false;
}
else if (aggOp == ROWAGG_CONSTANT)
{
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
returnColMissing = false;
}
returnColMissing = false;
}
#if 0
else if (aggOp == ROWAGG_GROUP_CONCAT || aggOp == ROWAGG_JSON_ARRAY)
@ -2337,63 +2325,64 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(JobInfo& jobInfo, vector<Ro
}
#endif
else if (jobInfo.groupConcatInfo.columns().find(retKey) !=
jobInfo.groupConcatInfo.columns().end())
{
// TODO: columns only for group_concat do not needed in result set.
for (uint64_t k = 0; k < keysProj.size(); k++)
else if (jobInfo.groupConcatInfo.columns().find(retKey) !=
jobInfo.groupConcatInfo.columns().end())
{
if (retKey == keysProj[k])
// TODO: columns only for group_concat do not needed in result set.
for (uint64_t k = 0; k < keysProj.size(); k++)
{
oidsAggDist.push_back(oidsProj[k]);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(scaleProj[k] >> 8);
precisionAggDist.push_back(precisionProj[k]);
typeAggDist.push_back(typeProj[k]);
csNumAggDist.push_back(csNumProj[k]);
widthAggDist.push_back(widthProj[k]);
if (retKey == keysProj[k])
{
oidsAggDist.push_back(oidsProj[k]);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(scaleProj[k] >> 8);
precisionAggDist.push_back(precisionProj[k]);
typeAggDist.push_back(typeProj[k]);
csNumAggDist.push_back(csNumProj[k]);
widthAggDist.push_back(widthProj[k]);
returnColMissing = false;
break;
returnColMissing = false;
break;
}
}
}
}
else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
{
// skip window columns/expression, which are computed later
for (uint64_t k = 0; k < keysProj.size(); k++)
else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
{
if (retKey == keysProj[k])
// skip window columns/expression, which are computed later
for (uint64_t k = 0; k < keysProj.size(); k++)
{
oidsAggDist.push_back(oidsProj[k]);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(scaleProj[k] >> 8);
precisionAggDist.push_back(precisionProj[k]);
typeAggDist.push_back(typeProj[k]);
csNumAggDist.push_back(csNumProj[k]);
widthAggDist.push_back(widthProj[k]);
if (retKey == keysProj[k])
{
oidsAggDist.push_back(oidsProj[k]);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(scaleProj[k] >> 8);
precisionAggDist.push_back(precisionProj[k]);
typeAggDist.push_back(typeProj[k]);
csNumAggDist.push_back(csNumProj[k]);
widthAggDist.push_back(widthProj[k]);
returnColMissing = false;
break;
returnColMissing = false;
break;
}
}
}
}
if (returnColMissing)
{
Message::Args args;
args.add(keyName(outIdx, retKey, jobInfo));
string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
cerr << "prep1PhaseDistinctAggregate: " << emsg
<< " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
<< ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
<< ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
<< endl;
throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
}
} // else
} // switch
if (returnColMissing)
{
Message::Args args;
args.add(keyName(outIdx, retKey, jobInfo));
string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
cerr << "prep1PhaseDistinctAggregate: " << emsg
<< " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
<< ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
<< ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
<< endl;
throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
}
} // else
} // switch
}
}
// update groupby vector if the groupby column is a returned column
@ -3179,14 +3168,14 @@ void TupleAggregateStep::prep2PhasesAggregate(JobInfo& jobInfo, vector<RowGroup>
colAggPm++;
}
// PM: put the count column for avg next to the sum
// let fall through to add a count column for average function
if (aggOp != ROWAGG_AVG)
break;
// The AVG aggregation has a special treatment everywhere.
// This is so because AVG(column) is a SUM(column)/COUNT(column)
// and these aggregations can be utilized by AVG, if present.
/* fall through */
// PM: put the count column for avg next to the sum
// let fall through to add a count column for average function
if (aggOp != ROWAGG_AVG)
break;
// The AVG aggregation has a special treatment everywhere.
// This is so because AVG(column) is a SUM(column)/COUNT(column)
// and these aggregations can be utilized by AVG, if present.
/* fall through */
case ROWAGG_COUNT_ASTERISK:
case ROWAGG_COUNT_COL_NAME:
@ -3439,99 +3428,120 @@ void TupleAggregateStep::prep2PhasesAggregate(JobInfo& jobInfo, vector<RowGroup>
// not a direct hit -- a returned column is not already in the RG from PMs
else
{
bool returnColMissing = true;
// check if a SUM or COUNT covered by AVG
if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
// MCOL-5476.
uint32_t foundTupleKey{0};
if (tryToFindEqualFunctionColumnByTupleKey(jobInfo, aggFuncMap, retKey, foundTupleKey))
{
it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
udafc ? udafc->getContext().getParamKeys() : NULL));
AGG_MAP::iterator it = aggFuncMap.find(boost::make_tuple(
foundTupleKey, aggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
colPm = it->second;
oidsAggUm.push_back(oidsAggPm[colPm]);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(scaleAggPm[colPm]);
precisionAggUm.push_back(precisionAggPm[colPm]);
typeAggUm.push_back(typeAggPm[colPm]);
csNumAggUm.push_back(csNumAggPm[colPm]);
widthAggUm.push_back(widthAggPm[colPm]);
// Update the `retKey` to specify that this column is a duplicate.
retKey = foundTupleKey;
}
else
{
bool returnColMissing = true;
if (it != aggFuncMap.end())
// check if a SUM or COUNT covered by AVG
if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
{
// false alarm
returnColMissing = false;
it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
udafc ? udafc->getContext().getParamKeys() : NULL));
colPm = it->second;
if (aggOp == ROWAGG_SUM)
if (it != aggFuncMap.end())
{
wideDecimalOrLongDouble(colPm, typeAggPm[colPm], precisionAggPm, scaleAggPm, widthAggPm,
typeAggUm, scaleAggUm, precisionAggUm, widthAggUm);
// false alarm
returnColMissing = false;
oidsAggUm.push_back(oidsAggPm[colPm]);
keysAggUm.push_back(retKey);
csNumAggUm.push_back(8);
}
else
{
// leave the count() to avg
aggOp = ROWAGG_COUNT_NO_OP;
colPm = it->second;
colPm++;
oidsAggUm.push_back(oidsAggPm[colPm]);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(0);
precisionAggUm.push_back(19);
typeAggUm.push_back(CalpontSystemCatalog::UBIGINT);
csNumAggUm.push_back(8);
widthAggUm.push_back(bigIntWidth);
if (aggOp == ROWAGG_SUM)
{
wideDecimalOrLongDouble(colPm, typeAggPm[colPm], precisionAggPm, scaleAggPm, widthAggPm,
typeAggUm, scaleAggUm, precisionAggUm, widthAggUm);
oidsAggUm.push_back(oidsAggPm[colPm]);
keysAggUm.push_back(retKey);
csNumAggUm.push_back(8);
}
else
{
// leave the count() to avg
aggOp = ROWAGG_COUNT_NO_OP;
colPm++;
oidsAggUm.push_back(oidsAggPm[colPm]);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(0);
precisionAggUm.push_back(19);
typeAggUm.push_back(CalpontSystemCatalog::UBIGINT);
csNumAggUm.push_back(8);
widthAggUm.push_back(bigIntWidth);
}
}
}
}
else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
jobInfo.expressionVec.end())
{
// a function on aggregation
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggUm.push_back(ti.oid);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(ti.scale);
precisionAggUm.push_back(ti.precision);
typeAggUm.push_back(ti.dtype);
csNumAggUm.push_back(ti.csNum);
widthAggUm.push_back(ti.width);
else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
jobInfo.expressionVec.end())
{
// a function on aggregation
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggUm.push_back(ti.oid);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(ti.scale);
precisionAggUm.push_back(ti.precision);
typeAggUm.push_back(ti.dtype);
csNumAggUm.push_back(ti.csNum);
widthAggUm.push_back(ti.width);
returnColMissing = false;
}
else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
{
// an window function
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggUm.push_back(ti.oid);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(ti.scale);
precisionAggUm.push_back(ti.precision);
typeAggUm.push_back(ti.dtype);
csNumAggUm.push_back(ti.csNum);
widthAggUm.push_back(ti.width);
returnColMissing = false;
}
else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
{
// an window function
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggUm.push_back(ti.oid);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(ti.scale);
precisionAggUm.push_back(ti.precision);
typeAggUm.push_back(ti.dtype);
csNumAggUm.push_back(ti.csNum);
widthAggUm.push_back(ti.width);
returnColMissing = false;
}
else if (aggOp == ROWAGG_CONSTANT)
{
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggUm.push_back(ti.oid);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(ti.scale);
precisionAggUm.push_back(ti.precision);
typeAggUm.push_back(ti.dtype);
csNumAggUm.push_back(ti.csNum);
widthAggUm.push_back(ti.width);
returnColMissing = false;
}
else if (aggOp == ROWAGG_CONSTANT)
{
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggUm.push_back(ti.oid);
keysAggUm.push_back(retKey);
scaleAggUm.push_back(ti.scale);
precisionAggUm.push_back(ti.precision);
typeAggUm.push_back(ti.dtype);
csNumAggUm.push_back(ti.csNum);
widthAggUm.push_back(ti.width);
returnColMissing = false;
}
returnColMissing = false;
}
if (returnColMissing)
{
Message::Args args;
args.add(keyName(outIdx, retKey, jobInfo));
string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
cerr << "prep2PhasesAggregate: " << emsg << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
<< ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
<< ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
<< endl;
throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
if (returnColMissing)
{
Message::Args args;
args.add(keyName(outIdx, retKey, jobInfo));
string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
cerr << "prep2PhasesAggregate: " << emsg
<< " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
<< ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
<< ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
<< endl;
throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
}
}
}
@ -3713,7 +3723,8 @@ void TupleAggregateStep::prep2PhasesAggregate(JobInfo& jobInfo, vector<RowGroup>
RowGroup aggRgUm(oidsAggUm.size(), posAggUm, oidsAggUm, keysAggUm, typeAggUm, csNumAggUm, scaleAggUm,
precisionAggUm, jobInfo.stringTableThreshold);
SP_ROWAGG_UM_t rowAggUm(new RowAggregationUMP2(groupByUm, functionVecUm, jobInfo.rm, jobInfo.umMemLimit, false));
SP_ROWAGG_UM_t rowAggUm(
new RowAggregationUMP2(groupByUm, functionVecUm, jobInfo.rm, jobInfo.umMemLimit, false));
rowAggUm->timeZone(jobInfo.timeZone);
rowgroups.push_back(aggRgUm);
aggregators.push_back(rowAggUm);
@ -4505,109 +4516,130 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(JobInfo& jobInfo, vector<R
// not a direct hit -- a returned column is not already in the RG from PMs
else
{
bool returnColMissing = true;
// check if a SUM or COUNT covered by AVG
if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
// MCOL-5476.
uint32_t foundTupleKey{0};
if (tryToFindEqualFunctionColumnByTupleKey(jobInfo, aggFuncMap, retKey, foundTupleKey))
{
it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
udafc ? udafc->getContext().getParamKeys() : NULL));
AGG_MAP::iterator it = aggFuncMap.find(boost::make_tuple(
foundTupleKey, aggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
colUm = it->second;
oidsAggDist.push_back(oidsAggUm[colUm]);
keysAggDist.push_back(keysAggUm[colUm]);
scaleAggDist.push_back(scaleAggUm[colUm]);
precisionAggDist.push_back(precisionAggUm[colUm]);
typeAggDist.push_back(typeAggUm[colUm]);
csNumAggDist.push_back(csNumAggUm[colUm]);
widthAggDist.push_back(widthAggUm[colUm]);
// Update the `retKey` to specify that this column is a duplicate.
retKey = foundTupleKey;
}
else
{
// here
bool returnColMissing = true;
if (it != aggFuncMap.end())
// check if a SUM or COUNT covered by AVG
if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
{
// false alarm
returnColMissing = false;
it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
udafc ? udafc->getContext().getParamKeys() : NULL));
colUm = it->second;
if (aggOp == ROWAGG_SUM)
if (it != aggFuncMap.end())
{
oidsAggDist.push_back(oidsAggUm[colUm]);
keysAggDist.push_back(retKey);
csNumAggDist.push_back(8);
wideDecimalOrLongDouble(colUm, typeAggUm[colUm], precisionAggUm, scaleAggUm, widthAggUm,
typeAggDist, scaleAggDist, precisionAggDist, widthAggDist);
}
else
{
// leave the count() to avg
aggOp = ROWAGG_COUNT_NO_OP;
// false alarm
returnColMissing = false;
oidsAggDist.push_back(oidsAggUm[colUm]);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(0);
if (isUnsigned(typeAggUm[colUm]))
colUm = it->second;
if (aggOp == ROWAGG_SUM)
{
precisionAggDist.push_back(20);
typeAggDist.push_back(CalpontSystemCatalog::UBIGINT);
oidsAggDist.push_back(oidsAggUm[colUm]);
keysAggDist.push_back(retKey);
csNumAggDist.push_back(8);
wideDecimalOrLongDouble(colUm, typeAggUm[colUm], precisionAggUm, scaleAggUm, widthAggUm,
typeAggDist, scaleAggDist, precisionAggDist, widthAggDist);
}
else
{
precisionAggDist.push_back(19);
typeAggDist.push_back(CalpontSystemCatalog::BIGINT);
// leave the count() to avg
aggOp = ROWAGG_COUNT_NO_OP;
oidsAggDist.push_back(oidsAggUm[colUm]);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(0);
if (isUnsigned(typeAggUm[colUm]))
{
precisionAggDist.push_back(20);
typeAggDist.push_back(CalpontSystemCatalog::UBIGINT);
}
else
{
precisionAggDist.push_back(19);
typeAggDist.push_back(CalpontSystemCatalog::BIGINT);
}
csNumAggDist.push_back(8);
widthAggDist.push_back(bigIntWidth);
}
csNumAggDist.push_back(8);
widthAggDist.push_back(bigIntWidth);
}
}
}
else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
jobInfo.expressionVec.end())
{
// a function on aggregation
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
jobInfo.expressionVec.end())
{
// a function on aggregation
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
returnColMissing = false;
}
else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
{
// a window function
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
returnColMissing = false;
}
else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
{
// a window function
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
returnColMissing = false;
}
else if (aggOp == ROWAGG_CONSTANT)
{
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
returnColMissing = false;
}
else if (aggOp == ROWAGG_CONSTANT)
{
TupleInfo ti = getTupleInfo(retKey, jobInfo);
oidsAggDist.push_back(ti.oid);
keysAggDist.push_back(retKey);
scaleAggDist.push_back(ti.scale);
precisionAggDist.push_back(ti.precision);
typeAggDist.push_back(ti.dtype);
csNumAggDist.push_back(ti.csNum);
widthAggDist.push_back(ti.width);
returnColMissing = false;
}
returnColMissing = false;
}
if (returnColMissing)
{
Message::Args args;
args.add(keyName(outIdx, retKey, jobInfo));
string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
cerr << "prep2PhasesDistinctAggregate: " << emsg
<< " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
<< ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
<< ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
<< endl;
throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
}
} // else not a direct hit
} // else not a DISTINCT
if (returnColMissing)
{
Message::Args args;
args.add(keyName(outIdx, retKey, jobInfo));
string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
cerr << "prep2PhasesDistinctAggregate: " << emsg
<< " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
<< ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
<< ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
<< endl;
throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
}
} // else not a direct hit
}
} // else not a DISTINCT
// update groupby vector if the groupby column is a returned column
if (returnedColVec[i].second == 0)
@ -5496,18 +5528,18 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)
// The key is the groupby columns, which are the leading columns.
// TBD This approach could potential
// put all values in on bucket.
// The fAggregator->hasRollup() is true when we perform one-phase
// aggregation and also are doing subtotals' computations.
// Subtotals produce new keys whose hash values may not be in
// the processing bucket. Consider case for key tuples (1,2) and (1,3).
// Their subtotals's keys will be (1, NULL) and (1, NULL)
// but they will be left in their processing buckets and never
// gets aggregated properly.
// Due to this, we put all rows into the same bucket 0 when perfoming
// single-phase aggregation with subtotals.
// For all other cases (single-phase without subtotals and two-phase
// aggregation with and without subtotals) fAggregator->hasRollup() is false.
// In these cases we have full parallel processing as expected.
// The fAggregator->hasRollup() is true when we perform one-phase
// aggregation and also are doing subtotals' computations.
// Subtotals produce new keys whose hash values may not be in
// the processing bucket. Consider case for key tuples (1,2) and (1,3).
// Their subtotals's keys will be (1, NULL) and (1, NULL)
// but they will be left in their processing buckets and never
// gets aggregated properly.
// Due to this, we put all rows into the same bucket 0 when perfoming
// single-phase aggregation with subtotals.
// For all other cases (single-phase without subtotals and two-phase
// aggregation with and without subtotals) fAggregator->hasRollup() is false.
// In these cases we have full parallel processing as expected.
uint64_t hash = fAggregator->hasRollup() ? 0 : rowgroup::hashRow(rowIn, hashLens[0] - 1);
int bucketID = hash % fNumOfBuckets;
rowBucketVecs[bucketID][0].emplace_back(rowIn.getPointer(), hash);
@ -5953,4 +5985,44 @@ void TupleAggregateStep::formatMiniStats()
fMiniInfo += oss.str();
}
} // namespace joblist
uint32_t TupleAggregateStep::getTupleKeyFromTuple(
const boost::tuple<uint32_t, int, mcsv1sdk::mcsv1_UDAF*, std::vector<uint32_t>*>& tuple)
{
return tuple.get<0>();
}
uint32_t TupleAggregateStep::getTupleKeyFromTuple(uint32_t key)
{
return key;
}
template <class GroupByMap>
bool TupleAggregateStep::tryToFindEqualFunctionColumnByTupleKey(JobInfo& jobInfo, GroupByMap& groupByMap,
const uint32_t tupleKey, uint32_t& foundKey)
{
auto funcMapIt = jobInfo.functionColumnMap.find(tupleKey);
if (funcMapIt != jobInfo.functionColumnMap.end())
{
const auto& rFunctionInfo = funcMapIt->second;
// Try to match given `tupleKey` in `groupByMap`.
for (const auto& groupByMapPair : groupByMap)
{
const auto currentTupleKey = getTupleKeyFromTuple(groupByMapPair.first);
auto currentFuncMapIt = jobInfo.functionColumnMap.find(currentTupleKey);
// Skip if the keys are the same.
if (currentFuncMapIt != jobInfo.functionColumnMap.end() && currentTupleKey != tupleKey)
{
const auto& lFunctionInfo = currentFuncMapIt->second;
// Oid and function name should be the same.
if (lFunctionInfo.associatedColumnOid == rFunctionInfo.associatedColumnOid &&
lFunctionInfo.functionName == rFunctionInfo.functionName)
{
foundKey = currentTupleKey;
return true;
}
}
}
}
return false;
}
} // namespace joblist