Provide moving-aggregate support for a bunch of numerical aggregates.

First installment of the promised moving-aggregate support in built-in aggregates: count(), sum(), avg(), stddev() and variance() for assorted datatypes, though not for float4/float8. In passing, remove a 2001-vintage kluge in interval_accum(): interval array elements have been properly aligned since around 2003, but nobody remembered to take out this workaround. Also, fix a thinko in the opr_sanity tests for moving-aggregate catalog entries. David Rowley and Florian Pflug, reviewed by Dean Rasheed
2025-07-05 07:21:24 +03:00 · 2014-04-12 20:33:09 -04:00
parent a9d9acbf21
commit 9d229f399e
13 changed files with 1232 additions and 224 deletions
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@ -2506,22 +2506,19 @@ numeric_float4(PG_FUNCTION_ARGS)
 * Actually, it's a pointer to a NumericAggState allocated in the aggregate
 * context.  The digit buffers for the NumericVars will be there too.
 *
- * Note that the transition functions don't bother to create a NumericAggState
- * until they see the first non-null input value; therefore, the final
- * functions will never see N == 0.  (The case is represented as a NULL
- * state pointer, instead.)
- *
 * ----------------------------------------------------------------------
 */

 typedef struct NumericAggState
 {
 	bool		calcSumX2;		/* if true, calculate sumX2 */
-	bool		isNaN;			/* true if any processed number was NaN */
 	MemoryContext agg_context;	/* context we're calculating in */
 	int64		N;				/* count of processed numbers */
 	NumericVar	sumX;			/* sum of processed numbers */
 	NumericVar	sumX2;			/* sum of squares of processed numbers */
+	int			maxScale;		/* maximum scale seen so far */
+	int64		maxScaleCount;  /* number of values seen with maximum scale */
+	int64		NaNcount;		/* count of NaN values (not included in N!) */
 } NumericAggState;

 /*
@ -2559,16 +2556,28 @@ do_numeric_accum(NumericAggState *state, Numeric newval)
 	NumericVar	X2;
 	MemoryContext old_context;

-	/* result is NaN if any processed number is NaN */
-	if (state->isNaN || NUMERIC_IS_NAN(newval))
+	/* Count NaN inputs separately from all else */
+	if (NUMERIC_IS_NAN(newval))
 	{
-		state->isNaN = true;
+		state->NaNcount++;
 		return;
 	}

 	/* load processed number in short-lived context */
 	init_var_from_num(newval, &X);

+	/*
+	 * Track the highest input dscale that we've seen, to support inverse
+	 * transitions (see do_numeric_discard).
+	 */
+	if (X.dscale > state->maxScale)
+	{
+		state->maxScale = X.dscale;
+		state->maxScaleCount = 1;
+	}
+	else if (X.dscale == state->maxScale)
+		state->maxScaleCount++;
+
 	/* if we need X^2, calculate that in short-lived context */
 	if (state->calcSumX2)
 	{
@ -2599,6 +2608,97 @@ do_numeric_accum(NumericAggState *state, Numeric newval)
 	MemoryContextSwitchTo(old_context);
 }

+/*
+ * Attempt to remove an input value from the aggregated state.
+ *
+ * If the value cannot be removed then the function will return false; the
+ * possible reasons for failing are described below.
+ *
+ * If we aggregate the values 1.01 and 2 then the result will be 3.01.
+ * If we are then asked to un-aggregate the 1.01 then we must fail as we
+ * won't be able to tell what the new aggregated value's dscale should be.
+ * We don't want to return 2.00 (dscale = 2), since the sum's dscale would
+ * have been zero if we'd really aggregated only 2.
+ *
+ * Note: alternatively, we could count the number of inputs with each possible
+ * dscale (up to some sane limit).  Not yet clear if it's worth the trouble.
+ */
+static bool
+do_numeric_discard(NumericAggState *state, Numeric newval)
+{
+	NumericVar	X;
+	NumericVar	X2;
+	MemoryContext old_context;
+
+	/* Count NaN inputs separately from all else */
+	if (NUMERIC_IS_NAN(newval))
+	{
+		state->NaNcount--;
+		return true;
+	}
+
+	/* load processed number in short-lived context */
+	init_var_from_num(newval, &X);
+
+	/*
+	 * state->sumX's dscale is the maximum dscale of any of the inputs.
+	 * Removing the last input with that dscale would require us to recompute
+	 * the maximum dscale of the *remaining* inputs, which we cannot do unless
+	 * no more non-NaN inputs remain at all.  So we report a failure instead,
+	 * and force the aggregation to be redone from scratch.
+	 */
+	if (X.dscale == state->maxScale)
+	{
+		if (state->maxScaleCount > 1 || state->maxScale == 0)
+		{
+			/*
+			 * Some remaining inputs have same dscale, or dscale hasn't
+			 * gotten above zero anyway
+			 */
+			state->maxScaleCount--;
+		}
+		else if (state->N == 1)
+		{
+			/* No remaining non-NaN inputs at all, so reset maxScale */
+			state->maxScale = 0;
+			state->maxScaleCount = 0;
+		}
+		else
+		{
+			/* Correct new maxScale is uncertain, must fail */
+			return false;
+		}
+	}
+
+	/* if we need X^2, calculate that in short-lived context */
+	if (state->calcSumX2)
+	{
+		init_var(&X2);
+		mul_var(&X, &X, &X2, X.dscale * 2);
+	}
+
+	/* The rest of this needs to work in the aggregate context */
+	old_context = MemoryContextSwitchTo(state->agg_context);
+
+	if (state->N-- > 1)
+	{
+		/* De-accumulate sums */
+		sub_var(&(state->sumX), &X, &(state->sumX));
+
+		if (state->calcSumX2)
+			sub_var(&(state->sumX2), &X2, &(state->sumX2));
+	}
+	else
+	{
+		/* Sums will be reset by next call to do_numeric_accum */
+		Assert(state->N == 0);
+	}
+
+	MemoryContextSwitchTo(old_context);
+
+	return true;
+}
+
 /*
 * Generic transition function for numeric aggregates that require sumX2.
 */
@ -2609,14 +2709,12 @@ numeric_accum(PG_FUNCTION_ARGS)

 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);

-	if (!PG_ARGISNULL(1))
-	{
-		/* Create the state data when we see the first non-null input. */
-		if (state == NULL)
-			state = makeNumericAggState(fcinfo, true);
+	/* Create the state data on the first call */
+	if (state == NULL)
+		state = makeNumericAggState(fcinfo, true);

+	if (!PG_ARGISNULL(1))
 		do_numeric_accum(state, PG_GETARG_NUMERIC(1));
-	}

 	PG_RETURN_POINTER(state);
 }
@ -2631,18 +2729,42 @@ numeric_avg_accum(PG_FUNCTION_ARGS)

 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);

+	/* Create the state data on the first call */
+	if (state == NULL)
+		state = makeNumericAggState(fcinfo, false);
+
+	if (!PG_ARGISNULL(1))
+		do_numeric_accum(state, PG_GETARG_NUMERIC(1));
+
+	PG_RETURN_POINTER(state);
+}
+
+/*
+ * Generic inverse transition function for numeric aggregates
+ * (with or without requirement for X^2).
+ */
+Datum
+numeric_accum_inv(PG_FUNCTION_ARGS)
+{
+	NumericAggState *state;
+
+	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);
+
+	/* Should not get here with no state */
+	if (state == NULL)
+		elog(ERROR, "numeric_accum_inv called with NULL state");
+
 	if (!PG_ARGISNULL(1))
 	{
-		/* Create the state data when we see the first non-null input. */
-		if (state == NULL)
-			state = makeNumericAggState(fcinfo, false);
-
-		do_numeric_accum(state, PG_GETARG_NUMERIC(1));
+		/* If we fail to perform the inverse transition, return NULL */
+		if (!do_numeric_discard(state, PG_GETARG_NUMERIC(1)))
+			PG_RETURN_NULL();
 	}

 	PG_RETURN_POINTER(state);
 }

+
 /*
 * Integer data types all use Numeric accumulators to share code and
 * avoid risk of overflow.	For int2 and int4 inputs, Numeric accumulation
@ -2659,17 +2781,16 @@ int2_accum(PG_FUNCTION_ARGS)

 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);

+	/* Create the state data on the first call */
+	if (state == NULL)
+		state = makeNumericAggState(fcinfo, true);
+
 	if (!PG_ARGISNULL(1))
 	{
 		Numeric		newval;

 		newval = DatumGetNumeric(DirectFunctionCall1(int2_numeric,
 													 PG_GETARG_DATUM(1)));
-
-		/* Create the state data when we see the first non-null input. */
-		if (state == NULL)
-			state = makeNumericAggState(fcinfo, true);
-
 		do_numeric_accum(state, newval);
 	}

@ -2683,17 +2804,16 @@ int4_accum(PG_FUNCTION_ARGS)

 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);

+	/* Create the state data on the first call */
+	if (state == NULL)
+		state = makeNumericAggState(fcinfo, true);
+
 	if (!PG_ARGISNULL(1))
 	{
 		Numeric		newval;

 		newval = DatumGetNumeric(DirectFunctionCall1(int4_numeric,
 													 PG_GETARG_DATUM(1)));
-
-		/* Create the state data when we see the first non-null input. */
-		if (state == NULL)
-			state = makeNumericAggState(fcinfo, true);
-
 		do_numeric_accum(state, newval);
 	}

@ -2707,17 +2827,16 @@ int8_accum(PG_FUNCTION_ARGS)

 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);

+	/* Create the state data on the first call */
+	if (state == NULL)
+		state = makeNumericAggState(fcinfo, true);
+
 	if (!PG_ARGISNULL(1))
 	{
 		Numeric		newval;

 		newval = DatumGetNumeric(DirectFunctionCall1(int8_numeric,
 													 PG_GETARG_DATUM(1)));
-
-		/* Create the state data when we see the first non-null input. */
-		if (state == NULL)
-			state = makeNumericAggState(fcinfo, true);
-
 		do_numeric_accum(state, newval);
 	}

@ -2734,6 +2853,90 @@ int8_avg_accum(PG_FUNCTION_ARGS)

 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);

+	/* Create the state data on the first call */
+	if (state == NULL)
+		state = makeNumericAggState(fcinfo, false);
+
+	if (!PG_ARGISNULL(1))
+	{
+		Numeric		newval;
+
+		newval = DatumGetNumeric(DirectFunctionCall1(int8_numeric,
+													 PG_GETARG_DATUM(1)));
+		do_numeric_accum(state, newval);
+	}
+
+	PG_RETURN_POINTER(state);
+}
+
+
+/*
+ * Inverse transition functions to go with the above.
+ */
+
+Datum
+int2_accum_inv(PG_FUNCTION_ARGS)
+{
+	NumericAggState *state;
+
+	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);
+
+	/* Should not get here with no state */
+	if (state == NULL)
+		elog(ERROR, "int2_accum_inv called with NULL state");
+
+	if (!PG_ARGISNULL(1))
+	{
+		Numeric		newval;
+
+		newval = DatumGetNumeric(DirectFunctionCall1(int2_numeric,
+													 PG_GETARG_DATUM(1)));
+
+		/* Should never fail, all inputs have dscale 0 */
+		if (!do_numeric_discard(state, newval))
+			elog(ERROR, "do_numeric_discard failed unexpectedly");
+	}
+
+	PG_RETURN_POINTER(state);
+}
+
+Datum
+int4_accum_inv(PG_FUNCTION_ARGS)
+{
+	NumericAggState *state;
+
+	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);
+
+	/* Should not get here with no state */
+	if (state == NULL)
+		elog(ERROR, "int4_accum_inv called with NULL state");
+
+	if (!PG_ARGISNULL(1))
+	{
+		Numeric		newval;
+
+		newval = DatumGetNumeric(DirectFunctionCall1(int4_numeric,
+													 PG_GETARG_DATUM(1)));
+
+		/* Should never fail, all inputs have dscale 0 */
+		if (!do_numeric_discard(state, newval))
+			elog(ERROR, "do_numeric_discard failed unexpectedly");
+	}
+
+	PG_RETURN_POINTER(state);
+}
+
+Datum
+int8_accum_inv(PG_FUNCTION_ARGS)
+{
+	NumericAggState *state;
+
+	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);
+
+	/* Should not get here with no state */
+	if (state == NULL)
+		elog(ERROR, "int8_accum_inv called with NULL state");
+
 	if (!PG_ARGISNULL(1))
 	{
 		Numeric		newval;
@ -2741,17 +2944,14 @@ int8_avg_accum(PG_FUNCTION_ARGS)
 		newval = DatumGetNumeric(DirectFunctionCall1(int8_numeric,
 													 PG_GETARG_DATUM(1)));

-		/* Create the state data when we see the first non-null input. */
-		if (state == NULL)
-			state = makeNumericAggState(fcinfo, false);
-
-		do_numeric_accum(state, newval);
+		/* Should never fail, all inputs have dscale 0 */
+		if (!do_numeric_discard(state, newval))
+			elog(ERROR, "do_numeric_discard failed unexpectedly");
 	}

 	PG_RETURN_POINTER(state);
 }

-
 Datum
 numeric_avg(PG_FUNCTION_ARGS)
 {
@ -2760,10 +2960,12 @@ numeric_avg(PG_FUNCTION_ARGS)
 	Datum		sumX_datum;

 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);
-	if (state == NULL)			/* there were no non-null inputs */
+
+	/* If there were no non-null inputs, return NULL */
+	if (state == NULL || (state->N + state->NaNcount) == 0)
 		PG_RETURN_NULL();

-	if (state->isNaN)			/* there was at least one NaN input */
+	if (state->NaNcount > 0)		/* there was at least one NaN input */
 		PG_RETURN_NUMERIC(make_result(&const_nan));

 	N_datum = DirectFunctionCall1(int8_numeric, Int64GetDatum(state->N));
@ -2778,10 +2980,12 @@ numeric_sum(PG_FUNCTION_ARGS)
 	NumericAggState *state;

 	state = PG_ARGISNULL(0) ? NULL : (NumericAggState *) PG_GETARG_POINTER(0);
-	if (state == NULL)			/* there were no non-null inputs */
+
+	/* If there were no non-null inputs, return NULL */
+	if (state == NULL || (state->N + state->NaNcount) == 0)
 		PG_RETURN_NULL();

-	if (state->isNaN)			/* there was at least one NaN input */
+	if (state->NaNcount > 0)		/* there was at least one NaN input */
 		PG_RETURN_NUMERIC(make_result(&const_nan));

 	PG_RETURN_NUMERIC(make_result(&(state->sumX)));
@ -2812,7 +3016,7 @@ numeric_stddev_internal(NumericAggState *state,
 	int			rscale;

 	/* Deal with empty input and NaN-input cases */
-	if (state == NULL)
+	if (state == NULL || (state->N + state->NaNcount) == 0)
 	{
 		*is_null = true;
 		return NULL;
@ -2820,7 +3024,7 @@ numeric_stddev_internal(NumericAggState *state,

 	*is_null = false;

-	if (state->isNaN)
+	if (state->NaNcount > 0)
 		return make_result(&const_nan);

 	init_var(&vN);
@ -2965,6 +3169,9 @@ numeric_stddev_pop(PG_FUNCTION_ARGS)
 * data value into the transition data: it doesn't know how to do the type
 * conversion.	The upshot is that these routines have to be marked non-strict
 * and handle substitution of the first non-null input themselves.
+ *
+ * Note: these functions are used only in plain aggregation mode.
+ * In moving-aggregate mode, we use intX_avg_accum and intX_avg_accum_inv.
 */

 Datum
@ -3107,6 +3314,10 @@ int8_sum(PG_FUNCTION_ARGS)
 /*
 * Routines for avg(int2) and avg(int4).  The transition datatype
 * is a two-element int8 array, holding count and sum.
+ *
+ * These functions are also used for sum(int2) and sum(int4) when
+ * operating in moving-aggregate mode, since for correct inverse transitions
+ * we need to count the inputs.
 */

 typedef struct Int8TransTypeData
@ -3171,6 +3382,62 @@ int4_avg_accum(PG_FUNCTION_ARGS)
 	PG_RETURN_ARRAYTYPE_P(transarray);
 }

+Datum
+int2_avg_accum_inv(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray;
+	int16		newval = PG_GETARG_INT16(1);
+	Int8TransTypeData *transdata;
+
+	/*
+	 * If we're invoked as an aggregate, we can cheat and modify our first
+	 * parameter in-place to reduce palloc overhead. Otherwise we need to make
+	 * a copy of it before scribbling on it.
+	 */
+	if (AggCheckCallContext(fcinfo, NULL))
+		transarray = PG_GETARG_ARRAYTYPE_P(0);
+	else
+		transarray = PG_GETARG_ARRAYTYPE_P_COPY(0);
+
+	if (ARR_HASNULL(transarray) ||
+		ARR_SIZE(transarray) != ARR_OVERHEAD_NONULLS(1) + sizeof(Int8TransTypeData))
+		elog(ERROR, "expected 2-element int8 array");
+
+	transdata = (Int8TransTypeData *) ARR_DATA_PTR(transarray);
+	transdata->count--;
+	transdata->sum -= newval;
+
+	PG_RETURN_ARRAYTYPE_P(transarray);
+}
+
+Datum
+int4_avg_accum_inv(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray;
+	int32		newval = PG_GETARG_INT32(1);
+	Int8TransTypeData *transdata;
+
+	/*
+	 * If we're invoked as an aggregate, we can cheat and modify our first
+	 * parameter in-place to reduce palloc overhead. Otherwise we need to make
+	 * a copy of it before scribbling on it.
+	 */
+	if (AggCheckCallContext(fcinfo, NULL))
+		transarray = PG_GETARG_ARRAYTYPE_P(0);
+	else
+		transarray = PG_GETARG_ARRAYTYPE_P_COPY(0);
+
+	if (ARR_HASNULL(transarray) ||
+		ARR_SIZE(transarray) != ARR_OVERHEAD_NONULLS(1) + sizeof(Int8TransTypeData))
+		elog(ERROR, "expected 2-element int8 array");
+
+	transdata = (Int8TransTypeData *) ARR_DATA_PTR(transarray);
+	transdata->count--;
+	transdata->sum -= newval;
+
+	PG_RETURN_ARRAYTYPE_P(transarray);
+}
+
 Datum
 int8_avg(PG_FUNCTION_ARGS)
 {
@ -3196,6 +3463,28 @@ int8_avg(PG_FUNCTION_ARGS)
 	PG_RETURN_DATUM(DirectFunctionCall2(numeric_div, sumd, countd));
 }

+/*
+ * SUM(int2) and SUM(int4) both return int8, so we can use this
+ * final function for both.
+ */
+Datum
+int2int4_sum(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	Int8TransTypeData *transdata;
+
+	if (ARR_HASNULL(transarray) ||
+		ARR_SIZE(transarray) != ARR_OVERHEAD_NONULLS(1) + sizeof(Int8TransTypeData))
+		elog(ERROR, "expected 2-element int8 array");
+	transdata = (Int8TransTypeData *) ARR_DATA_PTR(transarray);
+
+	/* SQL defines SUM of no values to be NULL */
+	if (transdata->count == 0)
+		PG_RETURN_NULL();
+
+	PG_RETURN_DATUM(Int64GetDatumFast(transdata->sum));
+}
+

 /* ----------------------------------------------------------------------
 *