diff --git a/utils/regr/corr.cpp b/utils/regr/corr.cpp index 4d6768353..85576d1a6 100644 --- a/utils/regr/corr.cpp +++ b/utils/regr/corr.cpp @@ -39,11 +39,11 @@ static Add_corr_ToUDAFMap addToMap; struct corr_data { uint64_t cnt; - long double sumx; - long double sumx2; // sum of (x squared) - long double sumy; - long double sumy2; // sum of (y squared) - long double sumxy; // sum of x * y + long double avgx; + long double varx; + long double avgy; + long double vary; + long double cxy; }; mcsv1_UDAF::ReturnCode corr::init(mcsv1Context* context, ColumnDatum* colTypes) @@ -76,11 +76,11 @@ mcsv1_UDAF::ReturnCode corr::reset(mcsv1Context* context) { struct corr_data* data = (struct corr_data*)context->getUserData()->data; data->cnt = 0; - data->sumx = 0.0; - data->sumx2 = 0.0; - data->sumy = 0.0; - data->sumy2 = 0.0; - data->sumxy = 0.0; + data->avgx = 0.0; + data->varx = 0.0; + data->avgy = 0.0; + data->vary = 0.0; + data->cxy = 0.0; return mcsv1_UDAF::SUCCESS; } @@ -90,15 +90,30 @@ mcsv1_UDAF::ReturnCode corr::nextValue(mcsv1Context* context, ColumnDatum* valsI double valx = toDouble(valsIn[1]); struct corr_data* data = (struct corr_data*)context->getUserData()->data; - data->sumy += valy; - data->sumy2 += valy * valy; - - data->sumx += valx; - data->sumx2 += valx * valx; - - data->sumxy += valx * valy; - + long double avgyPrev = data->avgy; + long double varyPrev = data->vary; + long double avgxPrev = data->avgx; + long double varxPrev = data->varx; + long double cxyPrev = data->cxy; ++data->cnt; + uint64_t cnt = data->cnt; + + long double dx = valx - avgxPrev; + long double dy = valy - avgyPrev; + + avgyPrev += dy / cnt; + avgxPrev += dx / cnt; + + varxPrev += dx * (valx - avgxPrev); + varyPrev += dy * (valy - avgyPrev); + + cxyPrev += dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->varx = varxPrev; + data->vary = varyPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } @@ -113,12 +128,38 @@ mcsv1_UDAF::ReturnCode corr::subEvaluate(mcsv1Context* context, const UserData* struct corr_data* outData = (struct corr_data*)context->getUserData()->data; struct corr_data* inData = (struct corr_data*)userDataIn->data; - outData->sumx += inData->sumx; - outData->sumx2 += inData->sumx2; - outData->sumy += inData->sumy; - outData->sumy2 += inData->sumy2; - outData->sumxy += inData->sumxy; - outData->cnt += inData->cnt; + uint64_t outCnt = outData->cnt; + long double outAvgx = outData->avgx; + long double outAvgy = outData->avgy; + long double outVarx = outData->varx; + long double outVary = outData->vary; + long double outCxy = outData->cxy; + + uint64_t inCnt = inData->cnt; + long double inAvgx = inData->avgx; + long double inAvgy = inData->avgy; + long double inVarx = inData->varx; + long double inVary = inData->vary; + long double inCxy = inData->cxy; + + uint64_t resCnt = inCnt + outCnt; + long double deltax = outAvgx - inAvgx; + long double deltay = outAvgy - inAvgy; + + long double resAvgx = inAvgx + deltax * outCnt / resCnt; + long double resAvgy = inAvgy + deltay * outCnt / resCnt; + + long double resVarx = outVarx + inVarx + deltax * deltax * inCnt * outCnt / resCnt; + long double resVary = outVary + inVary + deltay * deltay * inCnt * outCnt / resCnt; + + long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt; + + outData->avgx = resAvgx; + outData->avgy = resAvgy; + outData->varx = resVarx; + outData->vary = resVary; + outData->cxy = resCxy; + outData->cnt = resCnt; return mcsv1_UDAF::SUCCESS; } @@ -129,19 +170,17 @@ mcsv1_UDAF::ReturnCode corr::evaluate(mcsv1Context* context, static_any::any& va double N = data->cnt; if (N > 1) { - long double sumx = data->sumx; - long double sumy = data->sumy; - long double sumx2 = data->sumx2; - long double sumy2 = data->sumy2; - long double sumxy = data->sumxy; + long double varx = data->varx; + long double vary = data->vary; + long double cxy = data->cxy; - long double var_popx = (sumx2 - (sumx * sumx / N)) / N; + long double var_popx = varx / N; if (var_popx <= 0) // Catch -0 { // When var_popx is 0, NULL is the result. return mcsv1_UDAF::SUCCESS; } - long double var_popy = (sumy2 - (sumy * sumy / N)) / N; + long double var_popy = vary / N; if (var_popy <= 0) // Catch -0 { // When var_popy is 0, NULL is the result @@ -149,8 +188,7 @@ mcsv1_UDAF::ReturnCode corr::evaluate(mcsv1Context* context, static_any::any& va } long double std_popx = sqrt(var_popx); long double std_popy = sqrt(var_popy); - long double covar_pop = (sumxy - ((sumx * sumy) / N)) / N; - long double corr = covar_pop / (std_popy * std_popx); + long double corr = cxy / (std_popy * std_popx * N); valOut = static_cast(corr); } return mcsv1_UDAF::SUCCESS; @@ -162,14 +200,30 @@ mcsv1_UDAF::ReturnCode corr::dropValue(mcsv1Context* context, ColumnDatum* valsD double valx = toDouble(valsDropped[1]); struct corr_data* data = (struct corr_data*)context->getUserData()->data; - data->sumy -= valy; - data->sumy2 -= valy * valy; - - data->sumx -= valx; - data->sumx2 -= valx * valx; - - data->sumxy -= valx * valy; + long double avgyPrev = data->avgy; + long double varyPrev = data->vary; + long double avgxPrev = data->avgx; + long double varxPrev = data->varx; + long double cxyPrev = data->cxy; --data->cnt; + uint64_t cnt = data->cnt; + + long double dx = valx - avgxPrev; + long double dy = valy - avgyPrev; + + avgyPrev -= dy / cnt; + avgxPrev -= dx / cnt; + + varxPrev -= dx * (valx - avgxPrev); + varyPrev -= dy * (valy - avgyPrev); + + cxyPrev -= dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->varx = varxPrev; + data->vary = varyPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } diff --git a/utils/regr/covar_pop.cpp b/utils/regr/covar_pop.cpp index 7220dd36e..d24951466 100644 --- a/utils/regr/covar_pop.cpp +++ b/utils/regr/covar_pop.cpp @@ -39,9 +39,9 @@ static Add_covar_pop_ToUDAFMap addToMap; struct covar_pop_data { uint64_t cnt; - long double sumx; - long double sumy; - long double sumxy; // sum of x * y + long double avgx; + long double avgy; + long double cxy; }; mcsv1_UDAF::ReturnCode covar_pop::init(mcsv1Context* context, ColumnDatum* colTypes) @@ -74,9 +74,9 @@ mcsv1_UDAF::ReturnCode covar_pop::reset(mcsv1Context* context) { struct covar_pop_data* data = (struct covar_pop_data*)context->getUserData()->data; data->cnt = 0; - data->sumx = 0.0; - data->sumy = 0.0; - data->sumxy = 0.0; + data->avgx = 0.0; + data->avgy = 0.0; + data->cxy = 0.0; return mcsv1_UDAF::SUCCESS; } @@ -86,12 +86,18 @@ mcsv1_UDAF::ReturnCode covar_pop::nextValue(mcsv1Context* context, ColumnDatum* double valx = toDouble(valsIn[1]); struct covar_pop_data* data = (struct covar_pop_data*)context->getUserData()->data; - data->sumy += valy; - data->sumx += valx; - - data->sumxy += valx * valy; - + long double avgyPrev = data->avgy; + long double avgxPrev = data->avgx; + long double cxyPrev = data->cxy; ++data->cnt; + uint64_t cnt = data->cnt; + long double dx = valx - avgxPrev; + avgyPrev += (valy - avgyPrev)/cnt; + avgxPrev += dx / cnt; + cxyPrev += dx * (valy - avgyPrev); + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } @@ -106,10 +112,27 @@ mcsv1_UDAF::ReturnCode covar_pop::subEvaluate(mcsv1Context* context, const UserD struct covar_pop_data* outData = (struct covar_pop_data*)context->getUserData()->data; struct covar_pop_data* inData = (struct covar_pop_data*)userDataIn->data; - outData->sumx += inData->sumx; - outData->sumy += inData->sumy; - outData->sumxy += inData->sumxy; - outData->cnt += inData->cnt; + uint64_t outCnt = outData->cnt; + long double outAvgx = outData->avgx; + long double outAvgy = outData->avgy; + long double outCxy = outData->cxy; + + uint64_t inCnt = inData->cnt; + long double inAvgx = inData->avgx; + long double inAvgy = inData->avgy; + long double inCxy = inData->cxy; + + uint64_t resCnt = inCnt + outCnt; + long double deltax = outAvgx - inAvgx; + long double deltay = outAvgy - inAvgy; + long double resAvgx = inAvgx + deltax * outCnt / resCnt; + long double resAvgy = inAvgy + deltay * outCnt / resCnt; + long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt; + + outData->avgx = resAvgx; + outData->avgy = resAvgy; + outData->cxy = resCxy; + outData->cnt = resCnt; return mcsv1_UDAF::SUCCESS; } @@ -120,11 +143,7 @@ mcsv1_UDAF::ReturnCode covar_pop::evaluate(mcsv1Context* context, static_any::an double N = data->cnt; if (N > 0) { - long double sumx = data->sumx; - long double sumy = data->sumy; - long double sumxy = data->sumxy; - - long double covar_pop = (sumxy - ((sumx * sumy) / N)) / N; + long double covar_pop = data->cxy / N; valOut = static_cast(covar_pop); } return mcsv1_UDAF::SUCCESS; @@ -136,11 +155,20 @@ mcsv1_UDAF::ReturnCode covar_pop::dropValue(mcsv1Context* context, ColumnDatum* double valx = toDouble(valsDropped[1]); struct covar_pop_data* data = (struct covar_pop_data*)context->getUserData()->data; - data->sumy -= valy; - data->sumx -= valx; - - data->sumxy -= valx * valy; + long double avgyPrev = data->avgy; + long double avgxPrev = data->avgx; + long double cxyPrev = data->cxy; --data->cnt; + uint64_t cnt = data->cnt; + long double dx = valx - avgxPrev; + + avgyPrev -= (valy - avgyPrev) / cnt; + avgxPrev -= dx / cnt; + cxyPrev -= dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } diff --git a/utils/regr/covar_samp.cpp b/utils/regr/covar_samp.cpp index fddcaab51..4369dbca1 100644 --- a/utils/regr/covar_samp.cpp +++ b/utils/regr/covar_samp.cpp @@ -39,9 +39,9 @@ static Add_covar_samp_ToUDAFMap addToMap; struct covar_samp_data { uint64_t cnt; - long double sumx; - long double sumy; - long double sumxy; // sum of x * y + long double avgx; + long double avgy; + long double cxy; }; mcsv1_UDAF::ReturnCode covar_samp::init(mcsv1Context* context, ColumnDatum* colTypes) @@ -74,9 +74,9 @@ mcsv1_UDAF::ReturnCode covar_samp::reset(mcsv1Context* context) { struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data; data->cnt = 0; - data->sumx = 0.0; - data->sumy = 0.0; - data->sumxy = 0.0; + data->avgx = 0.0; + data->avgy = 0.0; + data->cxy = 0.0; return mcsv1_UDAF::SUCCESS; } @@ -86,13 +86,18 @@ mcsv1_UDAF::ReturnCode covar_samp::nextValue(mcsv1Context* context, ColumnDatum* double valx = toDouble(valsIn[1]); struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data; - data->sumy += valy; - - data->sumx += valx; - - data->sumxy += valx * valy; - + long double avgyPrev = data->avgy; + long double avgxPrev = data->avgx; + long double cxyPrev = data->cxy; ++data->cnt; + uint64_t cnt = data->cnt; + long double dx = valx - avgxPrev; + avgyPrev += (valy - avgyPrev)/cnt; + avgxPrev += dx / cnt; + cxyPrev += dx * (valy - avgyPrev); + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } @@ -107,10 +112,27 @@ mcsv1_UDAF::ReturnCode covar_samp::subEvaluate(mcsv1Context* context, const User struct covar_samp_data* outData = (struct covar_samp_data*)context->getUserData()->data; struct covar_samp_data* inData = (struct covar_samp_data*)userDataIn->data; - outData->sumx += inData->sumx; - outData->sumy += inData->sumy; - outData->sumxy += inData->sumxy; - outData->cnt += inData->cnt; + uint64_t outCnt = outData->cnt; + long double outAvgx = outData->avgx; + long double outAvgy = outData->avgy; + long double outCxy = outData->cxy; + + uint64_t inCnt = inData->cnt; + long double inAvgx = inData->avgx; + long double inAvgy = inData->avgy; + long double inCxy = inData->cxy; + + uint64_t resCnt = inCnt + outCnt; + long double deltax = outAvgx - inAvgx; + long double deltay = outAvgy - inAvgy; + long double resAvgx = inAvgx + deltax * outCnt / resCnt; + long double resAvgy = inAvgy + deltay * outCnt / resCnt; + long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt; + + outData->avgx = resAvgx; + outData->avgy = resAvgy; + outData->cxy = resCxy; + outData->cnt = resCnt; return mcsv1_UDAF::SUCCESS; } @@ -121,11 +143,7 @@ mcsv1_UDAF::ReturnCode covar_samp::evaluate(mcsv1Context* context, static_any::a double N = data->cnt; if (N > 1) { - long double sumx = data->sumx; - long double sumy = data->sumy; - long double sumxy = data->sumxy; - - long double covar_samp = (sumxy - ((sumx * sumy) / N)) / (N - 1); + long double covar_samp = data->cxy / (N - 1); valOut = static_cast(covar_samp); } else if (N == 1) @@ -141,12 +159,19 @@ mcsv1_UDAF::ReturnCode covar_samp::dropValue(mcsv1Context* context, ColumnDatum* double valx = toDouble(valsDropped[1]); struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data; - data->sumy -= valy; - - data->sumx -= valx; - - data->sumxy -= valx * valy; + long double avgyPrev = data->avgy; + long double avgxPrev = data->avgx; + long double cxyPrev = data->cxy; --data->cnt; + uint64_t cnt = data->cnt; + long double dx = valx - avgxPrev; + avgyPrev -= (valy - avgyPrev) / cnt; + avgxPrev -= dx / cnt; + cxyPrev -= dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } diff --git a/utils/regr/regr_intercept.cpp b/utils/regr/regr_intercept.cpp index 8dda7c08a..221d4f63d 100644 --- a/utils/regr/regr_intercept.cpp +++ b/utils/regr/regr_intercept.cpp @@ -39,10 +39,10 @@ static Add_regr_intercept_ToUDAFMap addToMap; struct regr_intercept_data { uint64_t cnt; - long double sumx; - long double sumx2; // sum of (x squared) - long double sumy; - long double sumxy; // sum of x * y + long double avgx; + long double cx; + long double avgy; + long double cxy; }; mcsv1_UDAF::ReturnCode regr_intercept::init(mcsv1Context* context, ColumnDatum* colTypes) @@ -75,10 +75,10 @@ mcsv1_UDAF::ReturnCode regr_intercept::reset(mcsv1Context* context) { struct regr_intercept_data* data = (struct regr_intercept_data*)context->getUserData()->data; data->cnt = 0; - data->sumx = 0.0; - data->sumx2 = 0.0; - data->sumy = 0.0; - data->sumxy = 0.0; + data->avgx = 0.0; + data->cx = 0.0; + data->avgy = 0.0; + data->cxy = 0.0; return mcsv1_UDAF::SUCCESS; } @@ -88,12 +88,27 @@ mcsv1_UDAF::ReturnCode regr_intercept::nextValue(mcsv1Context* context, ColumnDa double valx = toDouble(valsIn[1]); struct regr_intercept_data* data = (struct regr_intercept_data*)context->getUserData()->data; - data->sumy += valy; - data->sumx += valx; - data->sumx2 += valx * valx; - - data->sumxy += valx * valy; + long double avgyPrev = data->avgy; + long double avgxPrev = data->avgx; + long double cxPrev = data->cx; + long double cxyPrev = data->cxy; ++data->cnt; + uint64_t cnt = data->cnt; + + long double dx = valx - avgxPrev; + long double dy = valy - avgyPrev; + + avgyPrev += dy / cnt; + avgxPrev += dx / cnt; + + cxPrev += dx * (valx - avgxPrev); + + cxyPrev += dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->cx = cxPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } @@ -108,11 +123,34 @@ mcsv1_UDAF::ReturnCode regr_intercept::subEvaluate(mcsv1Context* context, const struct regr_intercept_data* outData = (struct regr_intercept_data*)context->getUserData()->data; struct regr_intercept_data* inData = (struct regr_intercept_data*)userDataIn->data; - outData->sumx += inData->sumx; - outData->sumx2 += inData->sumx2; - outData->sumy += inData->sumy; - outData->sumxy += inData->sumxy; - outData->cnt += inData->cnt; + uint64_t outCnt = outData->cnt; + long double outAvgx = outData->avgx; + long double outAvgy = outData->avgy; + long double outCx = outData->cx; + long double outCxy = outData->cxy; + + uint64_t inCnt = inData->cnt; + long double inAvgx = inData->avgx; + long double inAvgy = inData->avgy; + long double inCx = inData->cx; + long double inCxy = inData->cxy; + + uint64_t resCnt = inCnt + outCnt; + long double deltax = outAvgx - inAvgx; + long double deltay = outAvgy - inAvgy; + + long double resAvgx = inAvgx + deltax * outCnt / resCnt; + long double resAvgy = inAvgy + deltay * outCnt / resCnt; + + long double resCx = outCx + inCx + deltax * deltax * inCnt * outCnt / resCnt; + + long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt; + + outData->avgx = resAvgx; + outData->avgy = resAvgy; + outData->cx = resCx; + outData->cxy = resCxy; + outData->cnt = resCnt; return mcsv1_UDAF::SUCCESS; } @@ -123,18 +161,14 @@ mcsv1_UDAF::ReturnCode regr_intercept::evaluate(mcsv1Context* context, static_an long double N = data->cnt; if (N > 1) { - long double sumx = data->sumx; - long double sumy = data->sumy; - long double sumx2 = data->sumx2; - long double sumxy = data->sumxy; + long double avgx = data->avgx; + long double avgy = data->avgy; + long double cx = data->cx; + long double cxy = data->cxy; // regr_intercept is AVG(y) - slope(y,x)*avg(x) - // We do some algebra and we can get a slightly smaller calculation - long double numerator = sumy * sumx2 - sumx * sumxy; - long double var_pop = - (N * sumx2) - (sumx * sumx); // Not realy var_pop, but sort of after some reductions - if (var_pop > 0) + if (cx > 0) { - valOut = static_cast(numerator / var_pop); + valOut = static_cast(avgy - cxy / cx * avgx); } } return mcsv1_UDAF::SUCCESS; @@ -146,12 +180,27 @@ mcsv1_UDAF::ReturnCode regr_intercept::dropValue(mcsv1Context* context, ColumnDa double valx = toDouble(valsDropped[1]); struct regr_intercept_data* data = (struct regr_intercept_data*)context->getUserData()->data; - data->sumy -= valy; - data->sumx -= valx; - data->sumx2 -= valx * valx; - - data->sumxy -= valx * valy; + long double avgyPrev = data->avgy; + long double avgxPrev = data->avgx; + long double cxPrev = data->cx; + long double cxyPrev = data->cxy; --data->cnt; + uint64_t cnt = data->cnt; + + long double dx = valx - avgxPrev; + long double dy = valy - avgyPrev; + + avgyPrev -= dy / cnt; + avgxPrev -= dx / cnt; + + cxPrev -= dx * (valx - avgxPrev); + + cxyPrev -= dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->cx = cxPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } diff --git a/utils/regr/regr_r2.cpp b/utils/regr/regr_r2.cpp index 99a204af8..1a03f895d 100644 --- a/utils/regr/regr_r2.cpp +++ b/utils/regr/regr_r2.cpp @@ -39,11 +39,11 @@ static Add_regr_r2_ToUDAFMap addToMap; struct regr_r2_data { uint64_t cnt; - long double sumx; - long double sumx2; // sum of (x squared) - long double sumy; - long double sumy2; // sum of (y squared) - long double sumxy; // sum of x * y + long double avgx; + long double varx; + long double avgy; + long double vary; + long double cxy; }; mcsv1_UDAF::ReturnCode regr_r2::init(mcsv1Context* context, ColumnDatum* colTypes) @@ -76,11 +76,11 @@ mcsv1_UDAF::ReturnCode regr_r2::reset(mcsv1Context* context) { struct regr_r2_data* data = (struct regr_r2_data*)context->getUserData()->data; data->cnt = 0; - data->sumx = 0.0; - data->sumx2 = 0.0; - data->sumy = 0.0; - data->sumy2 = 0.0; - data->sumxy = 0.0; + data->avgx = 0.0; + data->varx = 0.0; + data->avgy = 0.0; + data->vary = 0.0; + data->cxy = 0.0; return mcsv1_UDAF::SUCCESS; } @@ -89,16 +89,30 @@ mcsv1_UDAF::ReturnCode regr_r2::nextValue(mcsv1Context* context, ColumnDatum* va double valy = toDouble(valsIn[0]); double valx = toDouble(valsIn[1]); struct regr_r2_data* data = (struct regr_r2_data*)context->getUserData()->data; - - data->sumy += valy; - data->sumy2 += valy * valy; - - data->sumx += valx; - data->sumx2 += valx * valx; - - data->sumxy += valx * valy; - + long double avgyPrev = data->avgy; + long double varyPrev = data->vary; + long double avgxPrev = data->avgx; + long double varxPrev = data->varx; + long double cxyPrev = data->cxy; ++data->cnt; + uint64_t cnt = data->cnt; + + long double dx = valx - avgxPrev; + long double dy = valy - avgyPrev; + + avgyPrev += dy / cnt; + avgxPrev += dx / cnt; + + varxPrev += dx * (valx - avgxPrev); + varyPrev += dy * (valy - avgyPrev); + + cxyPrev += dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->varx = varxPrev; + data->vary = varyPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } @@ -113,12 +127,38 @@ mcsv1_UDAF::ReturnCode regr_r2::subEvaluate(mcsv1Context* context, const UserDat struct regr_r2_data* outData = (struct regr_r2_data*)context->getUserData()->data; struct regr_r2_data* inData = (struct regr_r2_data*)userDataIn->data; - outData->sumx += inData->sumx; - outData->sumx2 += inData->sumx2; - outData->sumy += inData->sumy; - outData->sumy2 += inData->sumy2; - outData->sumxy += inData->sumxy; - outData->cnt += inData->cnt; + uint64_t outCnt = outData->cnt; + long double outAvgx = outData->avgx; + long double outAvgy = outData->avgy; + long double outVarx = outData->varx; + long double outVary = outData->vary; + long double outCxy = outData->cxy; + + uint64_t inCnt = inData->cnt; + long double inAvgx = inData->avgx; + long double inAvgy = inData->avgy; + long double inVarx = inData->varx; + long double inVary = inData->vary; + long double inCxy = inData->cxy; + + uint64_t resCnt = inCnt + outCnt; + long double deltax = outAvgx - inAvgx; + long double deltay = outAvgy - inAvgy; + + long double resAvgx = inAvgx + deltax * outCnt / resCnt; + long double resAvgy = inAvgy + deltay * outCnt / resCnt; + + long double resVarx = outVarx + inVarx + deltax * deltax * inCnt * outCnt / resCnt; + long double resVary = outVary + inVary + deltay * deltay * inCnt * outCnt / resCnt; + + long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt; + + outData->avgx = resAvgx; + outData->avgy = resAvgy; + outData->varx = resVarx; + outData->vary = resVary; + outData->cxy = resCxy; + outData->cnt = resCnt; return mcsv1_UDAF::SUCCESS; } @@ -129,19 +169,17 @@ mcsv1_UDAF::ReturnCode regr_r2::evaluate(mcsv1Context* context, static_any::any& double N = data->cnt; if (N > 1) { - long double sumx = data->sumx; - long double sumy = data->sumy; - long double sumx2 = data->sumx2; - long double sumy2 = data->sumy2; - long double sumxy = data->sumxy; + long double varx = data->varx; + long double vary = data->vary; + long double cxy = data->cxy; - long double var_popx = (sumx2 - (sumx * sumx / N)) / N; + long double var_popx = varx / N; if (var_popx <= 0) // Catch -0 { // When var_popx is 0, NULL is the result. return mcsv1_UDAF::SUCCESS; } - double var_popy = (sumy2 - (sumy * sumy / N)) / N; + long double var_popy = vary / N; if (var_popy <= 0) // Catch -0 { // When var_popy is 0, 1 is the result @@ -150,8 +188,7 @@ mcsv1_UDAF::ReturnCode regr_r2::evaluate(mcsv1Context* context, static_any::any& } long double std_popx = sqrt(var_popx); long double std_popy = sqrt(var_popy); - long double covar_pop = (sumxy - ((sumx * sumy) / N)) / N; - long double corr = covar_pop / (std_popy * std_popx); + long double corr = cxy / (std_popy * std_popx * N); valOut = static_cast(corr * corr); } return mcsv1_UDAF::SUCCESS; @@ -163,14 +200,30 @@ mcsv1_UDAF::ReturnCode regr_r2::dropValue(mcsv1Context* context, ColumnDatum* va double valx = toDouble(valsDropped[1]); struct regr_r2_data* data = (struct regr_r2_data*)context->getUserData()->data; - data->sumy -= valy; - data->sumy2 -= valy * valy; - - data->sumx -= valx; - data->sumx2 -= valx * valx; - - data->sumxy -= valx * valy; + long double avgyPrev = data->avgy; + long double varyPrev = data->vary; + long double avgxPrev = data->avgx; + long double varxPrev = data->varx; + long double cxyPrev = data->cxy; --data->cnt; + uint64_t cnt = data->cnt; + + long double dx = valx - avgxPrev; + long double dy = valy - avgyPrev; + + avgyPrev -= dy / cnt; + avgxPrev -= dx / cnt; + + varxPrev -= dx * (valx - avgxPrev); + varyPrev -= dy * (valy - avgyPrev); + + cxyPrev -= dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->varx = varxPrev; + data->vary = varyPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } diff --git a/utils/regr/regr_slope.cpp b/utils/regr/regr_slope.cpp index de182e148..466bb8687 100644 --- a/utils/regr/regr_slope.cpp +++ b/utils/regr/regr_slope.cpp @@ -39,10 +39,10 @@ static Add_regr_slope_ToUDAFMap addToMap; struct regr_slope_data { uint64_t cnt; - long double sumx; - long double sumx2; // sum of (x squared) - long double sumy; - long double sumxy; // sum of x * y + long double avgx; + long double cx; + long double avgy; + long double cxy; }; mcsv1_UDAF::ReturnCode regr_slope::init(mcsv1Context* context, ColumnDatum* colTypes) @@ -74,10 +74,10 @@ mcsv1_UDAF::ReturnCode regr_slope::reset(mcsv1Context* context) { struct regr_slope_data* data = (struct regr_slope_data*)context->getUserData()->data; data->cnt = 0; - data->sumx = 0.0; - data->sumx2 = 0.0; - data->sumy = 0.0; - data->sumxy = 0.0; + data->avgx = 0.0; + data->cx = 0.0; + data->avgy = 0.0; + data->cxy = 0.0; return mcsv1_UDAF::SUCCESS; } @@ -87,12 +87,27 @@ mcsv1_UDAF::ReturnCode regr_slope::nextValue(mcsv1Context* context, ColumnDatum* double valx = toDouble(valsIn[1]); struct regr_slope_data* data = (struct regr_slope_data*)context->getUserData()->data; - data->sumy += valy; - data->sumx += valx; - data->sumx2 += valx * valx; - data->sumxy += valx * valy; + long double avgyPrev = data->avgy; + long double avgxPrev = data->avgx; + long double cxPrev = data->cx; + long double cxyPrev = data->cxy; ++data->cnt; + uint64_t cnt = data->cnt; + long double dx = valx - avgxPrev; + long double dy = valy - avgyPrev; + + avgyPrev += dy / cnt; + avgxPrev += dx / cnt; + + cxPrev += dx * (valx - avgxPrev); + + cxyPrev += dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->cx = cxPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } @@ -106,11 +121,35 @@ mcsv1_UDAF::ReturnCode regr_slope::subEvaluate(mcsv1Context* context, const User struct regr_slope_data* outData = (struct regr_slope_data*)context->getUserData()->data; struct regr_slope_data* inData = (struct regr_slope_data*)userDataIn->data; - outData->sumx += inData->sumx; - outData->sumx2 += inData->sumx2; - outData->sumy += inData->sumy; - outData->sumxy += inData->sumxy; - outData->cnt += inData->cnt; + uint64_t outCnt = outData->cnt; + long double outAvgx = outData->avgx; + long double outAvgy = outData->avgy; + long double outCx = outData->cx; + long double outCxy = outData->cxy; + + uint64_t inCnt = inData->cnt; + long double inAvgx = inData->avgx; + long double inAvgy = inData->avgy; + long double inCx = inData->cx; + long double inCxy = inData->cxy; + + uint64_t resCnt = inCnt + outCnt; + long double deltax = outAvgx - inAvgx; + long double deltay = outAvgy - inAvgy; + + long double resAvgx = inAvgx + deltax * outCnt / resCnt; + long double resAvgy = inAvgy + deltay * outCnt / resCnt; + + long double resCx = outCx + inCx + deltax * deltax * inCnt * outCnt / resCnt; + + long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt; + + outData->avgx = resAvgx; + outData->avgy = resAvgy; + outData->cx = resCx; + outData->cxy = resCxy; + outData->cnt = resCnt; + return mcsv1_UDAF::SUCCESS; } @@ -122,18 +161,11 @@ mcsv1_UDAF::ReturnCode regr_slope::evaluate(mcsv1Context* context, static_any::a if (N > 1) { // COVAR_POP(y, x) / VAR_POP(x) - long double sumx = data->sumx; - long double sumy = data->sumy; - long double sumx2 = data->sumx2; - long double sumxy = data->sumxy; - // These aren't really covar_pop and var_pop. For the purposes of this calculation - // we multiplied everything by N to reduce calc time and variance. - // It all comes out after the final divide - long double covar_pop = N * sumxy - sumx * sumy; - long double var_pop = N * sumx2 - sumx * sumx; - if (var_pop > 0) + long double cx = data->cx; + long double cxy = data->cxy; + if (cx > 0) { - valOut = static_cast(covar_pop / var_pop); + valOut = static_cast(cxy / cx); } } return mcsv1_UDAF::SUCCESS; @@ -145,11 +177,27 @@ mcsv1_UDAF::ReturnCode regr_slope::dropValue(mcsv1Context* context, ColumnDatum* double valx = toDouble(valsDropped[1]); struct regr_slope_data* data = (struct regr_slope_data*)context->getUserData()->data; - data->sumy -= valy; - data->sumx -= valx; - data->sumx2 -= valx * valx; - data->sumxy -= valx * valy; + long double avgyPrev = data->avgy; + long double avgxPrev = data->avgx; + long double cxPrev = data->cx; + long double cxyPrev = data->cxy; --data->cnt; + uint64_t cnt = data->cnt; + + long double dx = valx - avgxPrev; + long double dy = valy - avgyPrev; + + avgyPrev -= dy / cnt; + avgxPrev -= dx / cnt; + + cxPrev -= dx * (valx - avgxPrev); + + cxyPrev -= dx * (valy - avgyPrev); + + data->avgx = avgxPrev; + data->avgy = avgyPrev; + data->cx = cxPrev; + data->cxy = cxyPrev; return mcsv1_UDAF::SUCCESS; } diff --git a/utils/regr/regr_sxx.cpp b/utils/regr/regr_sxx.cpp index f62595a6b..b70755652 100644 --- a/utils/regr/regr_sxx.cpp +++ b/utils/regr/regr_sxx.cpp @@ -39,8 +39,8 @@ static Add_regr_sxx_ToUDAFMap addToMap; struct regr_sxx_data { uint64_t cnt; - long double sumx; - long double sumx2; // sum of (x squared) + long double avgx; + long double cx; }; mcsv1_UDAF::ReturnCode regr_sxx::init(mcsv1Context* context, ColumnDatum* colTypes) @@ -73,8 +73,8 @@ mcsv1_UDAF::ReturnCode regr_sxx::reset(mcsv1Context* context) { struct regr_sxx_data* data = (struct regr_sxx_data*)context->getUserData()->data; data->cnt = 0; - data->sumx = 0.0; - data->sumx2 = 0.0; + data->avgx = 0.0; + data->cx = 0.0; return mcsv1_UDAF::SUCCESS; } @@ -82,11 +82,15 @@ mcsv1_UDAF::ReturnCode regr_sxx::nextValue(mcsv1Context* context, ColumnDatum* v { double valx = toDouble(valsIn[1]); struct regr_sxx_data* data = (struct regr_sxx_data*)context->getUserData()->data; - - data->sumx += valx; - data->sumx2 += valx * valx; - + long double avgxPrev = data->avgx; + long double cxPrev = data->cx; ++data->cnt; + uint64_t cnt = data->cnt; + long double dx = valx - avgxPrev; + avgxPrev += dx / cnt; + cxPrev += dx * (valx - avgxPrev); + data->avgx = avgxPrev; + data->cx = cxPrev; return mcsv1_UDAF::SUCCESS; } @@ -101,9 +105,24 @@ mcsv1_UDAF::ReturnCode regr_sxx::subEvaluate(mcsv1Context* context, const UserDa struct regr_sxx_data* outData = (struct regr_sxx_data*)context->getUserData()->data; struct regr_sxx_data* inData = (struct regr_sxx_data*)userDataIn->data; - outData->sumx += inData->sumx; - outData->sumx2 += inData->sumx2; - outData->cnt += inData->cnt; + uint64_t outCnt = outData->cnt; + long double outAvgx = outData->avgx; + long double outCx = outData->cx; + + uint64_t inCnt = inData->cnt; + long double inAvgx = inData->avgx; + long double inCx = inData->cx; + + uint64_t resCnt = inCnt + outCnt; + long double deltax = outAvgx - inAvgx; + + long double resAvgx = inAvgx + deltax * outCnt / resCnt; + + long double resCx = outCx + inCx + deltax * deltax * inCnt * outCnt / resCnt; + + outData->avgx = resAvgx; + outData->cx = resCx; + outData->cnt = resCnt; return mcsv1_UDAF::SUCCESS; } @@ -114,7 +133,7 @@ mcsv1_UDAF::ReturnCode regr_sxx::evaluate(mcsv1Context* context, static_any::any long double N = data->cnt; if (N > 0) { - long double regr_sxx = (data->sumx2 - (data->sumx * data->sumx / N)); + long double regr_sxx = data->cx; if (regr_sxx < 0) // catch -0 regr_sxx = 0; valOut = static_cast(regr_sxx); @@ -126,11 +145,15 @@ mcsv1_UDAF::ReturnCode regr_sxx::dropValue(mcsv1Context* context, ColumnDatum* v { double valx = toDouble(valsDropped[1]); struct regr_sxx_data* data = (struct regr_sxx_data*)context->getUserData()->data; - - data->sumx -= valx; - data->sumx2 -= valx * valx; - + long double avgxPrev = data->avgx; + long double cxPrev = data->cx; --data->cnt; + uint64_t cnt = data->cnt; + long double dx = valx - avgxPrev; + avgxPrev -= dx / cnt; + cxPrev -= dx * (valx - avgxPrev); + data->avgx = avgxPrev; + data->cx = cxPrev; return mcsv1_UDAF::SUCCESS; } diff --git a/utils/regr/regr_syy.cpp b/utils/regr/regr_syy.cpp index 2b39fea5d..350c8605b 100644 --- a/utils/regr/regr_syy.cpp +++ b/utils/regr/regr_syy.cpp @@ -39,8 +39,8 @@ static Add_regr_syy_ToUDAFMap addToMap; struct regr_syy_data { uint64_t cnt; - long double sumy; - long double sumy2; // sum of (y squared) + long double avgy; + long double cy; // sum of (y squared) }; mcsv1_UDAF::ReturnCode regr_syy::init(mcsv1Context* context, ColumnDatum* colTypes) @@ -73,8 +73,8 @@ mcsv1_UDAF::ReturnCode regr_syy::reset(mcsv1Context* context) { struct regr_syy_data* data = (struct regr_syy_data*)context->getUserData()->data; data->cnt = 0; - data->sumy = 0.0; - data->sumy2 = 0.0; + data->avgy = 0.0; + data->cy = 0.0; return mcsv1_UDAF::SUCCESS; } @@ -82,11 +82,15 @@ mcsv1_UDAF::ReturnCode regr_syy::nextValue(mcsv1Context* context, ColumnDatum* v { double valy = toDouble(valsIn[0]); struct regr_syy_data* data = (struct regr_syy_data*)context->getUserData()->data; - - data->sumy += valy; - data->sumy2 += valy * valy; - + long double avgyPrev = data->avgy; + long double cyPrev = data->cy; ++data->cnt; + uint64_t cnt = data->cnt; + long double dy = valy - avgyPrev; + avgyPrev += dy / cnt; + cyPrev += dy * (valy - avgyPrev); + data->avgy = avgyPrev; + data->cy = cyPrev; return mcsv1_UDAF::SUCCESS; } @@ -101,9 +105,24 @@ mcsv1_UDAF::ReturnCode regr_syy::subEvaluate(mcsv1Context* context, const UserDa struct regr_syy_data* outData = (struct regr_syy_data*)context->getUserData()->data; struct regr_syy_data* inData = (struct regr_syy_data*)userDataIn->data; - outData->sumy += inData->sumy; - outData->sumy2 += inData->sumy2; - outData->cnt += inData->cnt; + uint64_t outCnt = outData->cnt; + long double outAvgy = outData->avgy; + long double outCy = outData->cy; + + uint64_t inCnt = inData->cnt; + long double inAvgy = inData->avgy; + long double inCy = inData->cy; + + uint64_t resCnt = inCnt + outCnt; + long double deltay = outAvgy - inAvgy; + + long double resAvgy = inAvgy + deltay * outCnt / resCnt; + + long double resCy = outCy + inCy + deltay * deltay * inCnt * outCnt / resCnt; + + outData->avgy = resAvgy; + outData->cy = resCy; + outData->cnt = resCnt; return mcsv1_UDAF::SUCCESS; } @@ -114,7 +133,7 @@ mcsv1_UDAF::ReturnCode regr_syy::evaluate(mcsv1Context* context, static_any::any long double N = data->cnt; if (N > 0) { - long double var_popy = (data->sumy2 - (data->sumy * data->sumy / N)); + long double var_popy = data->cy; if (var_popy < 0) // might be -0 var_popy = 0; valOut = static_cast(var_popy); @@ -127,10 +146,15 @@ mcsv1_UDAF::ReturnCode regr_syy::dropValue(mcsv1Context* context, ColumnDatum* v double valy = toDouble(valsDropped[0]); struct regr_syy_data* data = (struct regr_syy_data*)context->getUserData()->data; - data->sumy -= valy; - data->sumy2 -= valy * valy; - + long double avgyPrev = data->avgy; + long double cyPrev = data->cy; --data->cnt; + uint64_t cnt = data->cnt; + long double dy = valy - avgyPrev; + avgyPrev -= dy / cnt; + cyPrev -= dy * (valy - avgyPrev); + data->avgy = avgyPrev; + data->cy = cyPrev; return mcsv1_UDAF::SUCCESS; }