fix(aggregate): MCOL-5467 Add support for duplicate expressions in group by. (#3045)

This patch adds support for duplicate expressions (builtin_functions) with one argument in select statement and group by statement.
2025-08-01 06:46:55 +03:00 · 2023-12-05 15:04:53 +03:00
parent b25dce3181
commit 9119f6f7b8
6 changed files with 659 additions and 364 deletions
--- a/dbcon/joblist/tupleaggregatestep.cpp
+++ b/dbcon/joblist/tupleaggregatestep.cpp
@ -75,67 +75,9 @@ using namespace querytele;

 namespace
 {
-struct cmpTuple
-{
-  bool operator()(boost::tuple<uint32_t, int, mcsv1sdk::mcsv1_UDAF*, std::vector<uint32_t>*> a,
-                  boost::tuple<uint32_t, int, mcsv1sdk::mcsv1_UDAF*, std::vector<uint32_t>*> b) const
-  {
-    uint32_t keya = boost::get<0>(a);
-    uint32_t keyb = boost::get<0>(b);
-    int opa;
-    int opb;
-    mcsv1sdk::mcsv1_UDAF* pUDAFa;
-    mcsv1sdk::mcsv1_UDAF* pUDAFb;
-
-    // If key is less than
-    if (keya < keyb)
-      return true;
-    if (keya == keyb)
-    {
-      // test Op
-      opa = boost::get<1>(a);
-      opb = boost::get<1>(b);
-      if (opa < opb)
-        return true;
-      if (opa == opb)
-      {
-        // look at the UDAF object
-        pUDAFa = boost::get<2>(a);
-        pUDAFb = boost::get<2>(b);
-        if (pUDAFa < pUDAFb)
-          return true;
-        if (pUDAFa == pUDAFb)
-        {
-          std::vector<uint32_t>* paramKeysa = boost::get<3>(a);
-          std::vector<uint32_t>* paramKeysb = boost::get<3>(b);
-          if (paramKeysa == NULL || paramKeysb == NULL)
-            return false;
-          if (paramKeysa->size() < paramKeysb->size())
-            return true;
-          if (paramKeysa->size() == paramKeysb->size())
-          {
-            for (uint64_t i = 0; i < paramKeysa->size(); ++i)
-            {
-              if ((*paramKeysa)[i] < (*paramKeysb)[i])
-                return true;
-            }
-          }
-        }
-      }
-    }
-    return false;
-  }
-};
-
 typedef vector<std::pair<Row::Pointer, uint64_t>> RowBucket;
 typedef vector<RowBucket> RowBucketVec;

-// The AGG_MAP type is used to maintain a list of aggregate functions in order to
-// detect duplicates. Since all UDAF have the same op type (ROWAGG_UDAF), we add in
-// the function pointer in order to ensure uniqueness.
-typedef map<boost::tuple<uint32_t, int, mcsv1sdk::mcsv1_UDAF*, std::vector<uint32_t>*>, uint64_t, cmpTuple>
-    AGG_MAP;
-
 inline RowAggFunctionType functionIdMap(int planFuncId)
 {
  switch (planFuncId)
@ -1190,13 +1132,31 @@ void TupleAggregateStep::prep1PhaseAggregate(JobInfo& jobInfo, vector<RowGroup>&
      }
      else
      {
-        Message::Args args;
-        args.add(keyName(i, key, jobInfo));
-        string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
-        cerr << "prep1PhaseAggregate: " << emsg << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[key].fId
-             << ", alias=" << jobInfo.keyInfo->tupleKeyVec[key].fTable
-             << ", view=" << jobInfo.keyInfo->tupleKeyVec[key].fView << ", function=" << (int)aggOp << endl;
-        throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
+        uint32_t foundTupleKey{0};
+        if (tryToFindEqualFunctionColumnByTupleKey(jobInfo, groupbyMap, key, foundTupleKey))
+        {
+          oidsAgg.push_back(oidsProj[colProj]);
+          keysAgg.push_back(key);
+          scaleAgg.push_back(scaleProj[colProj]);
+          precisionAgg.push_back(precisionProj[colProj]);
+          typeAgg.push_back(typeProj[colProj]);
+          csNumAgg.push_back(csNumProj[colProj]);
+          widthAgg.push_back(width[colProj]);
+          // Update key.
+          key = foundTupleKey;
+          ++outIdx;
+          continue;
+        }
+        else
+        {
+          Message::Args args;
+          args.add(keyName(i, key, jobInfo));
+          string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
+          cerr << "prep1PhaseAggregate: " << emsg << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[key].fId
+               << ", alias=" << jobInfo.keyInfo->tupleKeyVec[key].fTable
+               << ", view=" << jobInfo.keyInfo->tupleKeyVec[key].fView << ", function=" << (int)aggOp << endl;
+          throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
+        }
      }
    }

@ -1503,7 +1463,8 @@ void TupleAggregateStep::prep1PhaseAggregate(JobInfo& jobInfo, vector<RowGroup>&

  RowGroup aggRG(oidsAgg.size(), posAgg, oidsAgg, keysAgg, typeAgg, csNumAgg, scaleAgg, precisionAgg,
                 jobInfo.stringTableThreshold);
-  SP_ROWAGG_UM_t rowAgg(new RowAggregationUM(groupBy, functionVec, jobInfo.rm, jobInfo.umMemLimit, jobInfo.hasRollup));
+  SP_ROWAGG_UM_t rowAgg(
+      new RowAggregationUM(groupBy, functionVec, jobInfo.rm, jobInfo.umMemLimit, jobInfo.hasRollup));
  rowAgg->timeZone(jobInfo.timeZone);
  rowgroups.push_back(aggRG);
  aggregators.push_back(rowAgg);
@ -2245,81 +2206,108 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(JobInfo& jobInfo, vector<Ro
          // not a direct hit -- a returned column is not already in the RG from PMs
          else
          {
-            bool returnColMissing = true;
-
-            // check if a SUM or COUNT covered by AVG
-            if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
+            uint32_t foundTupleKey{0};
+            if (tryToFindEqualFunctionColumnByTupleKey(jobInfo, aggFuncMap, retKey, foundTupleKey))
            {
-              it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
-                                                     udafc ? udafc->getContext().getParamKeys() : NULL));
-
-              if (it != aggFuncMap.end())
+              AGG_MAP::iterator it = aggFuncMap.find(boost::make_tuple(
+                  foundTupleKey, aggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
+              colAgg = it->second;
+              oidsAggDist.push_back(oidsAgg[colAgg]);
+              keysAggDist.push_back(keysAgg[colAgg]);
+              scaleAggDist.push_back(scaleAgg[colAgg]);
+              precisionAggDist.push_back(precisionAgg[colAgg]);
+              typeAggDist.push_back(typeAgg[colAgg]);
+              csNumAggDist.push_back(csNumAgg[colAgg]);
+              uint32_t width = widthAgg[colAgg];
+              if (aggOp == ROWAGG_GROUP_CONCAT || aggOp == ROWAGG_JSON_ARRAY)
              {
-                // false alarm
-                returnColMissing = false;
+                TupleInfo ti = getTupleInfo(retKey, jobInfo);

-                colAgg = it->second;
+                if (ti.width > width)
+                  width = ti.width;
+              }
+              widthAggDist.push_back(width);

-                if (aggOp == ROWAGG_SUM)
+              // Update the `retKey` to specify that this column is a duplicate.
+              retKey = foundTupleKey;
+            }
+            else
+            {
+              bool returnColMissing = true;
+
+              // check if a SUM or COUNT covered by AVG
+              if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
+              {
+                it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
+                                                       udafc ? udafc->getContext().getParamKeys() : NULL));
+
+                if (it != aggFuncMap.end())
                {
-                  oidsAggDist.push_back(oidsAgg[colAgg]);
-                  keysAggDist.push_back(retKey);
-                  csNumAggDist.push_back(8);
-                  wideDecimalOrLongDouble(colAgg, typeAgg[colAgg], precisionAgg, scaleAgg, widthAgg,
-                                          typeAggDist, scaleAggDist, precisionAggDist, widthAggDist);
-                }
-                else
-                {
-                  // leave the count() to avg
-                  aggOp = ROWAGG_COUNT_NO_OP;
+                  // false alarm
+                  returnColMissing = false;

-                  oidsAggDist.push_back(oidsAgg[colAgg]);
-                  keysAggDist.push_back(retKey);
-                  scaleAggDist.push_back(0);
+                  colAgg = it->second;

-                  if (isUnsigned(typeAgg[colAgg]))
+                  if (aggOp == ROWAGG_SUM)
                  {
-                    typeAggDist.push_back(CalpontSystemCatalog::UBIGINT);
-                    precisionAggDist.push_back(20);
+                    oidsAggDist.push_back(oidsAgg[colAgg]);
+                    keysAggDist.push_back(retKey);
+                    csNumAggDist.push_back(8);
+                    wideDecimalOrLongDouble(colAgg, typeAgg[colAgg], precisionAgg, scaleAgg, widthAgg,
+                                            typeAggDist, scaleAggDist, precisionAggDist, widthAggDist);
                  }
                  else
                  {
-                    typeAggDist.push_back(CalpontSystemCatalog::BIGINT);
-                    precisionAggDist.push_back(19);
+                    // leave the count() to avg
+                    aggOp = ROWAGG_COUNT_NO_OP;
+
+                    oidsAggDist.push_back(oidsAgg[colAgg]);
+                    keysAggDist.push_back(retKey);
+                    scaleAggDist.push_back(0);
+
+                    if (isUnsigned(typeAgg[colAgg]))
+                    {
+                      typeAggDist.push_back(CalpontSystemCatalog::UBIGINT);
+                      precisionAggDist.push_back(20);
+                    }
+                    else
+                    {
+                      typeAggDist.push_back(CalpontSystemCatalog::BIGINT);
+                      precisionAggDist.push_back(19);
+                    }
+                    csNumAggDist.push_back(8);
+                    widthAggDist.push_back(bigIntWidth);
                  }
-                  csNumAggDist.push_back(8);
-                  widthAggDist.push_back(bigIntWidth);
                }
              }
-            }
-            else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
-                     jobInfo.expressionVec.end())
-            {
-              // a function on aggregation
-              TupleInfo ti = getTupleInfo(retKey, jobInfo);
-              oidsAggDist.push_back(ti.oid);
-              keysAggDist.push_back(retKey);
-              scaleAggDist.push_back(ti.scale);
-              precisionAggDist.push_back(ti.precision);
-              typeAggDist.push_back(ti.dtype);
-              csNumAggDist.push_back(ti.csNum);
-              widthAggDist.push_back(ti.width);
+              else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
+                       jobInfo.expressionVec.end())
+              {
+                // a function on aggregation
+                TupleInfo ti = getTupleInfo(retKey, jobInfo);
+                oidsAggDist.push_back(ti.oid);
+                keysAggDist.push_back(retKey);
+                scaleAggDist.push_back(ti.scale);
+                precisionAggDist.push_back(ti.precision);
+                typeAggDist.push_back(ti.dtype);
+                csNumAggDist.push_back(ti.csNum);
+                widthAggDist.push_back(ti.width);

-              returnColMissing = false;
-            }
-            else if (aggOp == ROWAGG_CONSTANT)
-            {
-              TupleInfo ti = getTupleInfo(retKey, jobInfo);
-              oidsAggDist.push_back(ti.oid);
-              keysAggDist.push_back(retKey);
-              scaleAggDist.push_back(ti.scale);
-              precisionAggDist.push_back(ti.precision);
-              typeAggDist.push_back(ti.dtype);
-              csNumAggDist.push_back(ti.csNum);
-              widthAggDist.push_back(ti.width);
+                returnColMissing = false;
+              }
+              else if (aggOp == ROWAGG_CONSTANT)
+              {
+                TupleInfo ti = getTupleInfo(retKey, jobInfo);
+                oidsAggDist.push_back(ti.oid);
+                keysAggDist.push_back(retKey);
+                scaleAggDist.push_back(ti.scale);
+                precisionAggDist.push_back(ti.precision);
+                typeAggDist.push_back(ti.dtype);
+                csNumAggDist.push_back(ti.csNum);
+                widthAggDist.push_back(ti.width);

-              returnColMissing = false;
-            }
+                returnColMissing = false;
+              }

 #if 0
                        else if (aggOp == ROWAGG_GROUP_CONCAT || aggOp == ROWAGG_JSON_ARRAY)
@ -2337,63 +2325,64 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(JobInfo& jobInfo, vector<Ro
                        }

 #endif
-            else if (jobInfo.groupConcatInfo.columns().find(retKey) !=
-                     jobInfo.groupConcatInfo.columns().end())
-            {
-              // TODO: columns only for group_concat do not needed in result set.
-              for (uint64_t k = 0; k < keysProj.size(); k++)
+              else if (jobInfo.groupConcatInfo.columns().find(retKey) !=
+                       jobInfo.groupConcatInfo.columns().end())
              {
-                if (retKey == keysProj[k])
+                // TODO: columns only for group_concat do not needed in result set.
+                for (uint64_t k = 0; k < keysProj.size(); k++)
                {
-                  oidsAggDist.push_back(oidsProj[k]);
-                  keysAggDist.push_back(retKey);
-                  scaleAggDist.push_back(scaleProj[k] >> 8);
-                  precisionAggDist.push_back(precisionProj[k]);
-                  typeAggDist.push_back(typeProj[k]);
-                  csNumAggDist.push_back(csNumProj[k]);
-                  widthAggDist.push_back(widthProj[k]);
+                  if (retKey == keysProj[k])
+                  {
+                    oidsAggDist.push_back(oidsProj[k]);
+                    keysAggDist.push_back(retKey);
+                    scaleAggDist.push_back(scaleProj[k] >> 8);
+                    precisionAggDist.push_back(precisionProj[k]);
+                    typeAggDist.push_back(typeProj[k]);
+                    csNumAggDist.push_back(csNumProj[k]);
+                    widthAggDist.push_back(widthProj[k]);

-                  returnColMissing = false;
-                  break;
+                    returnColMissing = false;
+                    break;
+                  }
                }
              }
-            }

-            else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
-            {
-              // skip window columns/expression, which are computed later
-              for (uint64_t k = 0; k < keysProj.size(); k++)
+              else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
              {
-                if (retKey == keysProj[k])
+                // skip window columns/expression, which are computed later
+                for (uint64_t k = 0; k < keysProj.size(); k++)
                {
-                  oidsAggDist.push_back(oidsProj[k]);
-                  keysAggDist.push_back(retKey);
-                  scaleAggDist.push_back(scaleProj[k] >> 8);
-                  precisionAggDist.push_back(precisionProj[k]);
-                  typeAggDist.push_back(typeProj[k]);
-                  csNumAggDist.push_back(csNumProj[k]);
-                  widthAggDist.push_back(widthProj[k]);
+                  if (retKey == keysProj[k])
+                  {
+                    oidsAggDist.push_back(oidsProj[k]);
+                    keysAggDist.push_back(retKey);
+                    scaleAggDist.push_back(scaleProj[k] >> 8);
+                    precisionAggDist.push_back(precisionProj[k]);
+                    typeAggDist.push_back(typeProj[k]);
+                    csNumAggDist.push_back(csNumProj[k]);
+                    widthAggDist.push_back(widthProj[k]);

-                  returnColMissing = false;
-                  break;
+                    returnColMissing = false;
+                    break;
+                  }
                }
              }
-            }

-            if (returnColMissing)
-            {
-              Message::Args args;
-              args.add(keyName(outIdx, retKey, jobInfo));
-              string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
-              cerr << "prep1PhaseDistinctAggregate: " << emsg
-                   << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
-                   << ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
-                   << ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
-                   << endl;
-              throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
-            }
-          }  // else
-        }    // switch
+              if (returnColMissing)
+              {
+                Message::Args args;
+                args.add(keyName(outIdx, retKey, jobInfo));
+                string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
+                cerr << "prep1PhaseDistinctAggregate: " << emsg
+                     << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
+                     << ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
+                     << ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
+                     << endl;
+                throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
+              }
+            }  // else
+          }    // switch
+        }
      }

      // update groupby vector if the groupby column is a returned column
@ -3179,14 +3168,14 @@ void TupleAggregateStep::prep2PhasesAggregate(JobInfo& jobInfo, vector<RowGroup>
          colAggPm++;
        }

-        // PM: put the count column for avg next to the sum
-        // let fall through to add a count column for average function
-        if (aggOp != ROWAGG_AVG)
-          break;
-        // The AVG aggregation has a special treatment everywhere.
-        // This is so because AVG(column) is a SUM(column)/COUNT(column)
-        // and these aggregations can be utilized by AVG, if present.
-        /* fall through */
+          // PM: put the count column for avg next to the sum
+          // let fall through to add a count column for average function
+          if (aggOp != ROWAGG_AVG)
+            break;
+          // The AVG aggregation has a special treatment everywhere.
+          // This is so because AVG(column) is a SUM(column)/COUNT(column)
+          // and these aggregations can be utilized by AVG, if present.
+          /* fall through */

        case ROWAGG_COUNT_ASTERISK:
        case ROWAGG_COUNT_COL_NAME:
@ -3439,99 +3428,120 @@ void TupleAggregateStep::prep2PhasesAggregate(JobInfo& jobInfo, vector<RowGroup>
      // not a direct hit -- a returned column is not already in the RG from PMs
      else
      {
-        bool returnColMissing = true;
-
-        // check if a SUM or COUNT covered by AVG
-        if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
+        // MCOL-5476.
+        uint32_t foundTupleKey{0};
+        if (tryToFindEqualFunctionColumnByTupleKey(jobInfo, aggFuncMap, retKey, foundTupleKey))
        {
-          it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
-                                                 udafc ? udafc->getContext().getParamKeys() : NULL));
+          AGG_MAP::iterator it = aggFuncMap.find(boost::make_tuple(
+              foundTupleKey, aggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
+          colPm = it->second;
+          oidsAggUm.push_back(oidsAggPm[colPm]);
+          keysAggUm.push_back(retKey);
+          scaleAggUm.push_back(scaleAggPm[colPm]);
+          precisionAggUm.push_back(precisionAggPm[colPm]);
+          typeAggUm.push_back(typeAggPm[colPm]);
+          csNumAggUm.push_back(csNumAggPm[colPm]);
+          widthAggUm.push_back(widthAggPm[colPm]);
+          // Update the `retKey` to specify that this column is a duplicate.
+          retKey = foundTupleKey;
+        }
+        else
+        {
+          bool returnColMissing = true;

-          if (it != aggFuncMap.end())
+          // check if a SUM or COUNT covered by AVG
+          if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
          {
-            // false alarm
-            returnColMissing = false;
+            it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
+                                                   udafc ? udafc->getContext().getParamKeys() : NULL));

-            colPm = it->second;
-
-            if (aggOp == ROWAGG_SUM)
+            if (it != aggFuncMap.end())
            {
-              wideDecimalOrLongDouble(colPm, typeAggPm[colPm], precisionAggPm, scaleAggPm, widthAggPm,
-                                      typeAggUm, scaleAggUm, precisionAggUm, widthAggUm);
+              // false alarm
+              returnColMissing = false;

-              oidsAggUm.push_back(oidsAggPm[colPm]);
-              keysAggUm.push_back(retKey);
-              csNumAggUm.push_back(8);
-            }
-            else
-            {
-              // leave the count() to avg
-              aggOp = ROWAGG_COUNT_NO_OP;
+              colPm = it->second;

-              colPm++;
-              oidsAggUm.push_back(oidsAggPm[colPm]);
-              keysAggUm.push_back(retKey);
-              scaleAggUm.push_back(0);
-              precisionAggUm.push_back(19);
-              typeAggUm.push_back(CalpontSystemCatalog::UBIGINT);
-              csNumAggUm.push_back(8);
-              widthAggUm.push_back(bigIntWidth);
+              if (aggOp == ROWAGG_SUM)
+              {
+                wideDecimalOrLongDouble(colPm, typeAggPm[colPm], precisionAggPm, scaleAggPm, widthAggPm,
+                                        typeAggUm, scaleAggUm, precisionAggUm, widthAggUm);
+
+                oidsAggUm.push_back(oidsAggPm[colPm]);
+                keysAggUm.push_back(retKey);
+                csNumAggUm.push_back(8);
+              }
+              else
+              {
+                // leave the count() to avg
+                aggOp = ROWAGG_COUNT_NO_OP;
+
+                colPm++;
+                oidsAggUm.push_back(oidsAggPm[colPm]);
+                keysAggUm.push_back(retKey);
+                scaleAggUm.push_back(0);
+                precisionAggUm.push_back(19);
+                typeAggUm.push_back(CalpontSystemCatalog::UBIGINT);
+                csNumAggUm.push_back(8);
+                widthAggUm.push_back(bigIntWidth);
+              }
            }
          }
-        }
-        else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
-                 jobInfo.expressionVec.end())
-        {
-          // a function on aggregation
-          TupleInfo ti = getTupleInfo(retKey, jobInfo);
-          oidsAggUm.push_back(ti.oid);
-          keysAggUm.push_back(retKey);
-          scaleAggUm.push_back(ti.scale);
-          precisionAggUm.push_back(ti.precision);
-          typeAggUm.push_back(ti.dtype);
-          csNumAggUm.push_back(ti.csNum);
-          widthAggUm.push_back(ti.width);
+          else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
+                   jobInfo.expressionVec.end())
+          {
+            // a function on aggregation
+            TupleInfo ti = getTupleInfo(retKey, jobInfo);
+            oidsAggUm.push_back(ti.oid);
+            keysAggUm.push_back(retKey);
+            scaleAggUm.push_back(ti.scale);
+            precisionAggUm.push_back(ti.precision);
+            typeAggUm.push_back(ti.dtype);
+            csNumAggUm.push_back(ti.csNum);
+            widthAggUm.push_back(ti.width);

-          returnColMissing = false;
-        }
-        else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
-        {
-          // an window function
-          TupleInfo ti = getTupleInfo(retKey, jobInfo);
-          oidsAggUm.push_back(ti.oid);
-          keysAggUm.push_back(retKey);
-          scaleAggUm.push_back(ti.scale);
-          precisionAggUm.push_back(ti.precision);
-          typeAggUm.push_back(ti.dtype);
-          csNumAggUm.push_back(ti.csNum);
-          widthAggUm.push_back(ti.width);
+            returnColMissing = false;
+          }
+          else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
+          {
+            // an window function
+            TupleInfo ti = getTupleInfo(retKey, jobInfo);
+            oidsAggUm.push_back(ti.oid);
+            keysAggUm.push_back(retKey);
+            scaleAggUm.push_back(ti.scale);
+            precisionAggUm.push_back(ti.precision);
+            typeAggUm.push_back(ti.dtype);
+            csNumAggUm.push_back(ti.csNum);
+            widthAggUm.push_back(ti.width);

-          returnColMissing = false;
-        }
-        else if (aggOp == ROWAGG_CONSTANT)
-        {
-          TupleInfo ti = getTupleInfo(retKey, jobInfo);
-          oidsAggUm.push_back(ti.oid);
-          keysAggUm.push_back(retKey);
-          scaleAggUm.push_back(ti.scale);
-          precisionAggUm.push_back(ti.precision);
-          typeAggUm.push_back(ti.dtype);
-          csNumAggUm.push_back(ti.csNum);
-          widthAggUm.push_back(ti.width);
+            returnColMissing = false;
+          }
+          else if (aggOp == ROWAGG_CONSTANT)
+          {
+            TupleInfo ti = getTupleInfo(retKey, jobInfo);
+            oidsAggUm.push_back(ti.oid);
+            keysAggUm.push_back(retKey);
+            scaleAggUm.push_back(ti.scale);
+            precisionAggUm.push_back(ti.precision);
+            typeAggUm.push_back(ti.dtype);
+            csNumAggUm.push_back(ti.csNum);
+            widthAggUm.push_back(ti.width);

-          returnColMissing = false;
-        }
+            returnColMissing = false;
+          }

-        if (returnColMissing)
-        {
-          Message::Args args;
-          args.add(keyName(outIdx, retKey, jobInfo));
-          string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
-          cerr << "prep2PhasesAggregate: " << emsg << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
-               << ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
-               << ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
-               << endl;
-          throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
+          if (returnColMissing)
+          {
+            Message::Args args;
+            args.add(keyName(outIdx, retKey, jobInfo));
+            string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
+            cerr << "prep2PhasesAggregate: " << emsg
+                 << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
+                 << ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
+                 << ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
+                 << endl;
+            throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
+          }
        }
      }

@ -3713,7 +3723,8 @@ void TupleAggregateStep::prep2PhasesAggregate(JobInfo& jobInfo, vector<RowGroup>

  RowGroup aggRgUm(oidsAggUm.size(), posAggUm, oidsAggUm, keysAggUm, typeAggUm, csNumAggUm, scaleAggUm,
                   precisionAggUm, jobInfo.stringTableThreshold);
-  SP_ROWAGG_UM_t rowAggUm(new RowAggregationUMP2(groupByUm, functionVecUm, jobInfo.rm, jobInfo.umMemLimit, false));
+  SP_ROWAGG_UM_t rowAggUm(
+      new RowAggregationUMP2(groupByUm, functionVecUm, jobInfo.rm, jobInfo.umMemLimit, false));
  rowAggUm->timeZone(jobInfo.timeZone);
  rowgroups.push_back(aggRgUm);
  aggregators.push_back(rowAggUm);
@ -4505,109 +4516,130 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(JobInfo& jobInfo, vector<R
        // not a direct hit -- a returned column is not already in the RG from PMs
        else
        {
-          bool returnColMissing = true;
-
-          // check if a SUM or COUNT covered by AVG
-          if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
+          // MCOL-5476.
+          uint32_t foundTupleKey{0};
+          if (tryToFindEqualFunctionColumnByTupleKey(jobInfo, aggFuncMap, retKey, foundTupleKey))
          {
-            it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
-                                                   udafc ? udafc->getContext().getParamKeys() : NULL));
+            AGG_MAP::iterator it = aggFuncMap.find(boost::make_tuple(
+                foundTupleKey, aggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
+            colUm = it->second;
+            oidsAggDist.push_back(oidsAggUm[colUm]);
+            keysAggDist.push_back(keysAggUm[colUm]);
+            scaleAggDist.push_back(scaleAggUm[colUm]);
+            precisionAggDist.push_back(precisionAggUm[colUm]);
+            typeAggDist.push_back(typeAggUm[colUm]);
+            csNumAggDist.push_back(csNumAggUm[colUm]);
+            widthAggDist.push_back(widthAggUm[colUm]);
+            // Update the `retKey` to specify that this column is a duplicate.
+            retKey = foundTupleKey;
+          }
+          else
+          {
+            // here
+            bool returnColMissing = true;

-            if (it != aggFuncMap.end())
+            // check if a SUM or COUNT covered by AVG
+            if (aggOp == ROWAGG_SUM || aggOp == ROWAGG_COUNT_COL_NAME)
            {
-              // false alarm
-              returnColMissing = false;
+              it = aggFuncMap.find(boost::make_tuple(returnedColVec[i].first, ROWAGG_AVG, pUDAFFunc,
+                                                     udafc ? udafc->getContext().getParamKeys() : NULL));

-              colUm = it->second;
-
-              if (aggOp == ROWAGG_SUM)
+              if (it != aggFuncMap.end())
              {
-                oidsAggDist.push_back(oidsAggUm[colUm]);
-                keysAggDist.push_back(retKey);
-                csNumAggDist.push_back(8);
-                wideDecimalOrLongDouble(colUm, typeAggUm[colUm], precisionAggUm, scaleAggUm, widthAggUm,
-                                        typeAggDist, scaleAggDist, precisionAggDist, widthAggDist);
-              }
-              else
-              {
-                // leave the count() to avg
-                aggOp = ROWAGG_COUNT_NO_OP;
+                // false alarm
+                returnColMissing = false;

-                oidsAggDist.push_back(oidsAggUm[colUm]);
-                keysAggDist.push_back(retKey);
-                scaleAggDist.push_back(0);
-                if (isUnsigned(typeAggUm[colUm]))
+                colUm = it->second;
+
+                if (aggOp == ROWAGG_SUM)
                {
-                  precisionAggDist.push_back(20);
-                  typeAggDist.push_back(CalpontSystemCatalog::UBIGINT);
+                  oidsAggDist.push_back(oidsAggUm[colUm]);
+                  keysAggDist.push_back(retKey);
+                  csNumAggDist.push_back(8);
+                  wideDecimalOrLongDouble(colUm, typeAggUm[colUm], precisionAggUm, scaleAggUm, widthAggUm,
+                                          typeAggDist, scaleAggDist, precisionAggDist, widthAggDist);
                }
                else
                {
-                  precisionAggDist.push_back(19);
-                  typeAggDist.push_back(CalpontSystemCatalog::BIGINT);
+                  // leave the count() to avg
+                  aggOp = ROWAGG_COUNT_NO_OP;
+
+                  oidsAggDist.push_back(oidsAggUm[colUm]);
+                  keysAggDist.push_back(retKey);
+                  scaleAggDist.push_back(0);
+                  if (isUnsigned(typeAggUm[colUm]))
+                  {
+                    precisionAggDist.push_back(20);
+                    typeAggDist.push_back(CalpontSystemCatalog::UBIGINT);
+                  }
+                  else
+                  {
+                    precisionAggDist.push_back(19);
+                    typeAggDist.push_back(CalpontSystemCatalog::BIGINT);
+                  }
+                  csNumAggDist.push_back(8);
+                  widthAggDist.push_back(bigIntWidth);
                }
-                csNumAggDist.push_back(8);
-                widthAggDist.push_back(bigIntWidth);
              }
            }
-          }
-          else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
-                   jobInfo.expressionVec.end())
-          {
-            // a function on aggregation
-            TupleInfo ti = getTupleInfo(retKey, jobInfo);
-            oidsAggDist.push_back(ti.oid);
-            keysAggDist.push_back(retKey);
-            scaleAggDist.push_back(ti.scale);
-            precisionAggDist.push_back(ti.precision);
-            typeAggDist.push_back(ti.dtype);
-            csNumAggDist.push_back(ti.csNum);
-            widthAggDist.push_back(ti.width);
+            else if (find(jobInfo.expressionVec.begin(), jobInfo.expressionVec.end(), retKey) !=
+                     jobInfo.expressionVec.end())
+            {
+              // a function on aggregation
+              TupleInfo ti = getTupleInfo(retKey, jobInfo);
+              oidsAggDist.push_back(ti.oid);
+              keysAggDist.push_back(retKey);
+              scaleAggDist.push_back(ti.scale);
+              precisionAggDist.push_back(ti.precision);
+              typeAggDist.push_back(ti.dtype);
+              csNumAggDist.push_back(ti.csNum);
+              widthAggDist.push_back(ti.width);

-            returnColMissing = false;
-          }
-          else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
-          {
-            // a window function
-            TupleInfo ti = getTupleInfo(retKey, jobInfo);
-            oidsAggDist.push_back(ti.oid);
-            keysAggDist.push_back(retKey);
-            scaleAggDist.push_back(ti.scale);
-            precisionAggDist.push_back(ti.precision);
-            typeAggDist.push_back(ti.dtype);
-            csNumAggDist.push_back(ti.csNum);
-            widthAggDist.push_back(ti.width);
+              returnColMissing = false;
+            }
+            else if (jobInfo.windowSet.find(retKey) != jobInfo.windowSet.end())
+            {
+              // a window function
+              TupleInfo ti = getTupleInfo(retKey, jobInfo);
+              oidsAggDist.push_back(ti.oid);
+              keysAggDist.push_back(retKey);
+              scaleAggDist.push_back(ti.scale);
+              precisionAggDist.push_back(ti.precision);
+              typeAggDist.push_back(ti.dtype);
+              csNumAggDist.push_back(ti.csNum);
+              widthAggDist.push_back(ti.width);

-            returnColMissing = false;
-          }
-          else if (aggOp == ROWAGG_CONSTANT)
-          {
-            TupleInfo ti = getTupleInfo(retKey, jobInfo);
-            oidsAggDist.push_back(ti.oid);
-            keysAggDist.push_back(retKey);
-            scaleAggDist.push_back(ti.scale);
-            precisionAggDist.push_back(ti.precision);
-            typeAggDist.push_back(ti.dtype);
-            csNumAggDist.push_back(ti.csNum);
-            widthAggDist.push_back(ti.width);
+              returnColMissing = false;
+            }
+            else if (aggOp == ROWAGG_CONSTANT)
+            {
+              TupleInfo ti = getTupleInfo(retKey, jobInfo);
+              oidsAggDist.push_back(ti.oid);
+              keysAggDist.push_back(retKey);
+              scaleAggDist.push_back(ti.scale);
+              precisionAggDist.push_back(ti.precision);
+              typeAggDist.push_back(ti.dtype);
+              csNumAggDist.push_back(ti.csNum);
+              widthAggDist.push_back(ti.width);

-            returnColMissing = false;
-          }
+              returnColMissing = false;
+            }

-          if (returnColMissing)
-          {
-            Message::Args args;
-            args.add(keyName(outIdx, retKey, jobInfo));
-            string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
-            cerr << "prep2PhasesDistinctAggregate: " << emsg
-                 << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
-                 << ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
-                 << ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
-                 << endl;
-            throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
-          }
-        }  // else not a direct hit
-      }    // else not a DISTINCT
+            if (returnColMissing)
+            {
+              Message::Args args;
+              args.add(keyName(outIdx, retKey, jobInfo));
+              string emsg = IDBErrorInfo::instance()->errorMsg(ERR_NOT_GROUPBY_EXPRESSION, args);
+              cerr << "prep2PhasesDistinctAggregate: " << emsg
+                   << " oid=" << (int)jobInfo.keyInfo->tupleKeyVec[retKey].fId
+                   << ", alias=" << jobInfo.keyInfo->tupleKeyVec[retKey].fTable
+                   << ", view=" << jobInfo.keyInfo->tupleKeyVec[retKey].fView << ", function=" << (int)aggOp
+                   << endl;
+              throw IDBExcept(emsg, ERR_NOT_GROUPBY_EXPRESSION);
+            }
+          }  // else not a direct hit
+        }
+      }  // else not a DISTINCT

      // update groupby vector if the groupby column is a returned column
      if (returnedColVec[i].second == 0)
@ -5496,18 +5528,18 @@ void TupleAggregateStep::threadedAggregateRowGroups(uint32_t threadID)
              // The key is the groupby columns, which are the leading columns.
              // TBD This approach could potential
              // put all values in on bucket.
-	      // The fAggregator->hasRollup() is true when we perform one-phase
-	      // aggregation and also are doing subtotals' computations.
-	      // Subtotals produce new keys whose hash values may not be in
-	      // the processing bucket. Consider case for key tuples (1,2) and (1,3).
-	      // Their subtotals's keys will be (1, NULL) and (1, NULL)
-	      // but they will be left in their processing buckets and never
-	      // gets aggregated properly.
-	      // Due to this, we put all rows into the same bucket 0 when perfoming
-	      // single-phase aggregation with subtotals.
-	      // For all other cases (single-phase without subtotals and two-phase
-	      // aggregation with and without subtotals) fAggregator->hasRollup() is false.
-	      // In these cases we have full parallel processing as expected.
+              // The fAggregator->hasRollup() is true when we perform one-phase
+              // aggregation and also are doing subtotals' computations.
+              // Subtotals produce new keys whose hash values may not be in
+              // the processing bucket. Consider case for key tuples (1,2) and (1,3).
+              // Their subtotals's keys will be (1, NULL) and (1, NULL)
+              // but they will be left in their processing buckets and never
+              // gets aggregated properly.
+              // Due to this, we put all rows into the same bucket 0 when perfoming
+              // single-phase aggregation with subtotals.
+              // For all other cases (single-phase without subtotals and two-phase
+              // aggregation with and without subtotals) fAggregator->hasRollup() is false.
+              // In these cases we have full parallel processing as expected.
              uint64_t hash = fAggregator->hasRollup() ? 0 : rowgroup::hashRow(rowIn, hashLens[0] - 1);
              int bucketID = hash % fNumOfBuckets;
              rowBucketVecs[bucketID][0].emplace_back(rowIn.getPointer(), hash);
@ -5953,4 +5985,44 @@ void TupleAggregateStep::formatMiniStats()
  fMiniInfo += oss.str();
 }

-}  // namespace joblist
+uint32_t TupleAggregateStep::getTupleKeyFromTuple(
+    const boost::tuple<uint32_t, int, mcsv1sdk::mcsv1_UDAF*, std::vector<uint32_t>*>& tuple)
+{
+  return tuple.get<0>();
+}
+
+uint32_t TupleAggregateStep::getTupleKeyFromTuple(uint32_t key)
+{
+  return key;
+}
+
+template <class GroupByMap>
+bool TupleAggregateStep::tryToFindEqualFunctionColumnByTupleKey(JobInfo& jobInfo, GroupByMap& groupByMap,
+                                                                const uint32_t tupleKey, uint32_t& foundKey)
+{
+  auto funcMapIt = jobInfo.functionColumnMap.find(tupleKey);
+  if (funcMapIt != jobInfo.functionColumnMap.end())
+  {
+    const auto& rFunctionInfo = funcMapIt->second;
+    // Try to match given `tupleKey` in `groupByMap`.
+    for (const auto& groupByMapPair : groupByMap)
+    {
+      const auto currentTupleKey = getTupleKeyFromTuple(groupByMapPair.first);
+      auto currentFuncMapIt = jobInfo.functionColumnMap.find(currentTupleKey);
+      // Skip if the keys are the same.
+      if (currentFuncMapIt != jobInfo.functionColumnMap.end() && currentTupleKey != tupleKey)
+      {
+        const auto& lFunctionInfo = currentFuncMapIt->second;
+        // Oid and function name should be the same.
+        if (lFunctionInfo.associatedColumnOid == rFunctionInfo.associatedColumnOid &&
+            lFunctionInfo.functionName == rFunctionInfo.functionName)
+        {
+          foundKey = currentTupleKey;
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+}  // namespace joblist