Merge pull request #2044 from dhall-MariaDB/MCOL-3738

MCOL-3738 COUNT(DISTINCT) with multiple parms
2025-07-29 08:21:15 +03:00 · 2021-07-12 07:34:56 -04:00
parent 4069a48f42 76607be63a
commit 3d557a2f1e
4 changed files with 87 additions and 19 deletions
--- a/dbcon/joblist/tupleaggregatestep.cpp
+++ b/dbcon/joblist/tupleaggregatestep.cpp
@ -2184,6 +2184,9 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(

        // locate the return column position in aggregated rowgroup
        uint64_t outIdx = 0;
+        RowAggFunctionType prevAggOp = ROWAGG_FUNCT_UNDEFINE;
+        uint32_t prevRetKey = 0;
+
        for (uint64_t i = 0; i < returnedColVec.size(); i++)
        {
            udafc = NULL;
@ -2195,10 +2198,31 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(

            if (aggOp == ROWAGG_MULTI_PARM)
            {
+                // Duplicate detection doesn't work for multi-parm`
+                
+                // If this function was earlier detected as a duplicate, unduplicate it.
+                SP_ROWAGG_FUNC_t funct = functionVec2.back();
+                if (funct->fAggFunction == ROWAGG_DUP_FUNCT)
+                    funct->fAggFunction = prevAggOp;
+
+                // Remove it from aggDupFuncMap if it's in there.
+                funct->hasMultiParm = true;
+                AGG_MAP::iterator it = aggDupFuncMap.find(boost::make_tuple(prevRetKey, prevAggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
+                if (it != aggDupFuncMap.end())
+                {
+                    aggDupFuncMap.erase(it);
+                }
+
                                // Skip on final agg.: Extra parms for an aggregate have no work there.
                ++multiParms;
                continue;
            }
+            else
+            {
+                // Save the op for MULTI_PARM exclusion when COUNT(DISTINCT)
+                prevAggOp = aggOp;
+                prevRetKey = returnedColVec[i].first;
+            }

            if  (find(jobInfo.distinctColVec.begin(), jobInfo.distinctColVec.end(), retKey) !=
                    jobInfo.distinctColVec.end() )
@ -2543,8 +2567,6 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
                        funct->fAggFunction = ROWAGG_DUP_STATS;
                    else if (funct->fAggFunction == ROWAGG_UDAF)
                        funct->fAggFunction = ROWAGG_DUP_UDAF;
-                    else if (funct->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME)  // Don't track dup for this one. Gets confused when multi-parm.
-                    {}
                    else
                        funct->fAggFunction = ROWAGG_DUP_FUNCT;

@ -2599,7 +2621,7 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(
        // now fix the AVG distinct function, locate the count(distinct column) position
        for (uint64_t i = 0; i < functionVec2.size(); i++)
        {
-            if (functionVec2[i]->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME)
+            if (functionVec2[i]->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME && !functionVec2[i]->hasMultiParm)
            {
                // if the count(distinct k) can be associated with an avg(distinct k)
                map<uint32_t, SP_ROWAGG_FUNC_t>::iterator k =
@ -2919,11 +2941,6 @@ void TupleAggregateStep::prep1PhaseDistinctAggregate(

            for (uint64_t k = 0; k < returnedColVec.size(); k++)
            {
-                if (functionIdMap(returnedColVec[k].second) == ROWAGG_MULTI_PARM)
-                {
-                    ++multiParms;
-                    continue;
-                }
                // search non-distinct functions in functionVec
                vector<SP_ROWAGG_FUNC_t>::iterator it = functionVec2.begin();

@ -4471,6 +4488,9 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
        // locate the return column position in aggregated rowgroup from PM
        // outIdx is i without the multi-columns,
        uint64_t outIdx = 0;
+        RowAggFunctionType prevAggOp = ROWAGG_FUNCT_UNDEFINE;
+        uint32_t prevRetKey = 0;
+        
        for (uint64_t i = 0; i < returnedColVec.size(); i++)
        {
            pUDAFFunc = NULL;
@ -4483,10 +4503,30 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(

            if (aggOp == ROWAGG_MULTI_PARM)
            {
-                // Skip on UM: Extra parms for an aggregate have no work on the UM
+                // Duplicate detection doesn't work for multi-parm`
+                
+                // If this function was earlier detected as a duplicate, unduplicate it.
+                SP_ROWAGG_FUNC_t funct = functionVecUm.back();
+                if (funct->fAggFunction == ROWAGG_DUP_FUNCT)
+                    funct->fAggFunction = prevAggOp;
+
+                // Remove it from aggDupFuncMap if it's in there.
+                funct->hasMultiParm = true;
+                AGG_MAP::iterator it = aggDupFuncMap.find(boost::make_tuple(prevRetKey, prevAggOp, pUDAFFunc, udafc ? udafc->getContext().getParamKeys() : NULL));
+                if (it != aggDupFuncMap.end())
+                {
+                    aggDupFuncMap.erase(it);
+                }
+                // Skip further UM porocessing of the multi-parm: Extra parms for an aggregate have no work on the UM
                ++multiParms;
                continue;
            }
+            else
+            {
+                // Save the op for MULTI_PARM exclusion when COUNT(DISTINCT)
+                prevAggOp = aggOp;
+                prevRetKey = returnedColVec[i].first;
+            }

            if (aggOp == ROWAGG_UDAF)
            {
@ -4763,8 +4803,6 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
                        funct->fAggFunction = ROWAGG_DUP_STATS;
                    else if (funct->fAggFunction == ROWAGG_UDAF)
                        funct->fAggFunction = ROWAGG_DUP_UDAF;
-                    else if (funct->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME)  // Don't track dup for this one. Gets confused when multi-parm.
-                    {}
                    else
                        funct->fAggFunction = ROWAGG_DUP_FUNCT;

@ -4819,7 +4857,7 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(
        //distinct avg
        for (uint64_t i = 0; i < functionVecUm.size(); i++)
        {
-            if (functionVecUm[i]->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME)
+            if (functionVecUm[i]->fAggFunction == ROWAGG_COUNT_DISTINCT_COL_NAME && !functionVecUm[i]->hasMultiParm)
            {
                map<uint32_t, SP_ROWAGG_FUNC_t>::iterator k =
                    avgDistFuncMap.find(keysAggDist[functionVecUm[i]->fOutputColumnIndex]);
@ -5130,11 +5168,6 @@ void TupleAggregateStep::prep2PhasesDistinctAggregate(

            for (uint64_t k = 0; k < returnedColVec.size(); k++)
            {
-                if (functionIdMap(returnedColVec[k].second) == ROWAGG_MULTI_PARM)
-                {
-                    ++multiParms;
-                    continue;
-                }
                // search non-distinct functions in functionVec
                vector<SP_ROWAGG_FUNC_t>::iterator it = functionVecUm.begin();

--- a/mysql-test/columnstore/basic/r/mcol3738_count_distinct_multiparm.result
+++ b/mysql-test/columnstore/basic/r/mcol3738_count_distinct_multiparm.result
@ -29,10 +29,35 @@ idx	count(distinct c1,  c2)	count(distinct c1, c3, char1)
 2	3	2
 3	2	2
 4	1	1
+select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1;
+avg(distinct c2)	count(c2)	count( distinct c2, c3)	count(c2)	sum(distinct c2)	avg(distinct c3)	count(distinct c2, c3)
+3.0000	12	12	12	9	3.0000	12
+select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
+avg(distinct c2)	count(c2)	count( distinct c2, c3)	count(c2)	sum(distinct c2)	avg(distinct c3)	count(distinct c2, c3)
+3.0000	6	6	6	9	2.0000	6
+3.0000	6	6	6	9	3.6667	6
+select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2) from t1 group by c1;
+avg(distinct c2)	count(c2)	count( distinct c2, c3)	count(c2)	sum(distinct c2)	avg(distinct c3)	count(distinct c2)
+3.0000	6	6	6	9	2.0000	3
+3.0000	6	6	6	9	3.6667	3
+select avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
+avg(distinct c2)	count(c2)	count( distinct c2)	count(c2)	sum(distinct c2)	avg(distinct c3)	count(distinct c2, c3)
+3.0000	6	3	6	9	2.0000	6
+3.0000	6	3	6	9	3.6667	6
+select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3, c1) from t1;
+group_concat(distinct char1)	avg(distinct c2)	count(c2)	count( distinct c2)	count(c2)	sum(distinct c2)	avg(distinct c3)	count(distinct c2, c3, c1)
+elsewhere this way comes,something this way comes	3.0000	12	3	12	9	3.0000	12
+select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
+group_concat(distinct char1)	avg(distinct c2)	count(c2)	count( distinct c2)	count(c2)	sum(distinct c2)	avg(distinct c3)	count(distinct c2, c3)
+elsewhere this way comes,something this way comes	3.0000	6	3	6	9	2.0000	6
+elsewhere this way comes,something this way comes	3.0000	6	3	6	9	3.6667	6
 select idx, sum(c3), count(distinct c1,  c2), count(distinct c1, c3, char1), group_concat("ls_", char1) from t1 group by idx order by idx;
 idx	sum(c3)	count(distinct c1,  c2)	count(distinct c1, c3, char1)	group_concat("ls_", char1)
 1	9	6	4	ls_something this way comes,ls_elsewhere this way comes,ls_something this way comes,ls_something this way comes,ls_elsewhere this way comes,ls_elsewhere this way comes
 2	9	3	2	ls_something this way comes,ls_elsewhere this way comes,ls_something this way comes
 3	8	2	2	ls_something this way comes,ls_elsewhere this way comes
 4	5	1	1	ls_elsewhere this way comes
+select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3, c1) from t1;
+group_concat(distinct char1)	avg(distinct c2)	count(c2)	count( distinct c2)	count(c2)	sum(distinct c2)	avg(distinct c3)	count(distinct c2, c3, c1)
+elsewhere this way comes,something this way comes	3.0000	12	3	12	9	3.0000	12
 DROP DATABASE mcol_3738_db;
--- a/mysql-test/columnstore/basic/t/mcol3738_count_distinct_multiparm.test
+++ b/mysql-test/columnstore/basic/t/mcol3738_count_distinct_multiparm.test
@ -33,8 +33,16 @@ insert into t1 values   (1, 2, 2, 1, "something this way comes"),

 select count(distinct c1,  c2), count(distinct char1) from t1;
 select idx, count(distinct c1,  c2), count(distinct c1, c3, char1) from t1 group by idx order by idx;
+select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1;
+select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
+select avg(distinct c2), count(c2), count( distinct c2, c3), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2) from t1 group by c1;
+select avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
+select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3, c1) from t1;
+
 # group_concat causes the aggregation to be performed on UM only.
+select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3) from t1 group by c1;
 select idx, sum(c3), count(distinct c1,  c2), count(distinct c1, c3, char1), group_concat("ls_", char1) from t1 group by idx order by idx;
+select group_concat(distinct char1), avg(distinct c2), count(c2), count( distinct c2), count(c2),sum(distinct c2), avg(distinct c3), count(distinct c2, c3, c1) from t1;

 # Clean UP
 DROP DATABASE mcol_3738_db;
--- a/utils/rowgroup/rowaggregation.h
+++ b/utils/rowgroup/rowaggregation.h
@ -176,7 +176,7 @@ struct RowAggFunctionCol
    RowAggFunctionCol(RowAggFunctionType aggFunction, RowAggFunctionType stats,
                      int32_t inputColIndex, int32_t outputColIndex, int32_t auxColIndex = -1) :
        fAggFunction(aggFunction), fStatsFunction(stats), fInputColumnIndex(inputColIndex),
-        fOutputColumnIndex(outputColIndex), fAuxColumnIndex(auxColIndex) {}
+        fOutputColumnIndex(outputColIndex), fAuxColumnIndex(auxColIndex), hasMultiParm(false) {}
    virtual ~RowAggFunctionCol() = default;

    virtual void serialize(messageqcpp::ByteStream& bs) const;
@ -203,6 +203,8 @@ struct RowAggFunctionCol
    // with fAggFunction == ROWAGG_MULTI_PARM. Order is important.
    // If this parameter is constant, that value is here.
    execplan::SRCP fpConstCol;
+    
+    bool hasMultiParm;
 };