1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

Put Welford's online algorithm everywhere

This commit is contained in:
mariadb-AndreyPiskunov
2022-09-06 18:45:56 +03:00
committed by Leonid Fedorov
parent 14b976991b
commit 14810d87ba
8 changed files with 531 additions and 227 deletions

View File

@ -39,11 +39,11 @@ static Add_corr_ToUDAFMap addToMap;
struct corr_data
{
uint64_t cnt;
long double sumx;
long double sumx2; // sum of (x squared)
long double sumy;
long double sumy2; // sum of (y squared)
long double sumxy; // sum of x * y
long double avgx;
long double varx;
long double avgy;
long double vary;
long double cxy;
};
mcsv1_UDAF::ReturnCode corr::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -76,11 +76,11 @@ mcsv1_UDAF::ReturnCode corr::reset(mcsv1Context* context)
{
struct corr_data* data = (struct corr_data*)context->getUserData()->data;
data->cnt = 0;
data->sumx = 0.0;
data->sumx2 = 0.0;
data->sumy = 0.0;
data->sumy2 = 0.0;
data->sumxy = 0.0;
data->avgx = 0.0;
data->varx = 0.0;
data->avgy = 0.0;
data->vary = 0.0;
data->cxy = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -90,15 +90,30 @@ mcsv1_UDAF::ReturnCode corr::nextValue(mcsv1Context* context, ColumnDatum* valsI
double valx = toDouble(valsIn[1]);
struct corr_data* data = (struct corr_data*)context->getUserData()->data;
data->sumy += valy;
data->sumy2 += valy * valy;
data->sumx += valx;
data->sumx2 += valx * valx;
data->sumxy += valx * valy;
long double avgyPrev = data->avgy;
long double varyPrev = data->vary;
long double avgxPrev = data->avgx;
long double varxPrev = data->varx;
long double cxyPrev = data->cxy;
++data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
long double dy = valy - avgyPrev;
avgyPrev += dy / cnt;
avgxPrev += dx / cnt;
varxPrev += dx * (valx - avgxPrev);
varyPrev += dy * (valy - avgyPrev);
cxyPrev += dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->varx = varxPrev;
data->vary = varyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -113,12 +128,38 @@ mcsv1_UDAF::ReturnCode corr::subEvaluate(mcsv1Context* context, const UserData*
struct corr_data* outData = (struct corr_data*)context->getUserData()->data;
struct corr_data* inData = (struct corr_data*)userDataIn->data;
outData->sumx += inData->sumx;
outData->sumx2 += inData->sumx2;
outData->sumy += inData->sumy;
outData->sumy2 += inData->sumy2;
outData->sumxy += inData->sumxy;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgx = outData->avgx;
long double outAvgy = outData->avgy;
long double outVarx = outData->varx;
long double outVary = outData->vary;
long double outCxy = outData->cxy;
uint64_t inCnt = inData->cnt;
long double inAvgx = inData->avgx;
long double inAvgy = inData->avgy;
long double inVarx = inData->varx;
long double inVary = inData->vary;
long double inCxy = inData->cxy;
uint64_t resCnt = inCnt + outCnt;
long double deltax = outAvgx - inAvgx;
long double deltay = outAvgy - inAvgy;
long double resAvgx = inAvgx + deltax * outCnt / resCnt;
long double resAvgy = inAvgy + deltay * outCnt / resCnt;
long double resVarx = outVarx + inVarx + deltax * deltax * inCnt * outCnt / resCnt;
long double resVary = outVary + inVary + deltay * deltay * inCnt * outCnt / resCnt;
long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt;
outData->avgx = resAvgx;
outData->avgy = resAvgy;
outData->varx = resVarx;
outData->vary = resVary;
outData->cxy = resCxy;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -129,19 +170,17 @@ mcsv1_UDAF::ReturnCode corr::evaluate(mcsv1Context* context, static_any::any& va
double N = data->cnt;
if (N > 1)
{
long double sumx = data->sumx;
long double sumy = data->sumy;
long double sumx2 = data->sumx2;
long double sumy2 = data->sumy2;
long double sumxy = data->sumxy;
long double varx = data->varx;
long double vary = data->vary;
long double cxy = data->cxy;
long double var_popx = (sumx2 - (sumx * sumx / N)) / N;
long double var_popx = varx / N;
if (var_popx <= 0) // Catch -0
{
// When var_popx is 0, NULL is the result.
return mcsv1_UDAF::SUCCESS;
}
long double var_popy = (sumy2 - (sumy * sumy / N)) / N;
long double var_popy = vary / N;
if (var_popy <= 0) // Catch -0
{
// When var_popy is 0, NULL is the result
@ -149,8 +188,7 @@ mcsv1_UDAF::ReturnCode corr::evaluate(mcsv1Context* context, static_any::any& va
}
long double std_popx = sqrt(var_popx);
long double std_popy = sqrt(var_popy);
long double covar_pop = (sumxy - ((sumx * sumy) / N)) / N;
long double corr = covar_pop / (std_popy * std_popx);
long double corr = cxy / (std_popy * std_popx * N);
valOut = static_cast<double>(corr);
}
return mcsv1_UDAF::SUCCESS;
@ -162,14 +200,30 @@ mcsv1_UDAF::ReturnCode corr::dropValue(mcsv1Context* context, ColumnDatum* valsD
double valx = toDouble(valsDropped[1]);
struct corr_data* data = (struct corr_data*)context->getUserData()->data;
data->sumy -= valy;
data->sumy2 -= valy * valy;
data->sumx -= valx;
data->sumx2 -= valx * valx;
data->sumxy -= valx * valy;
long double avgyPrev = data->avgy;
long double varyPrev = data->vary;
long double avgxPrev = data->avgx;
long double varxPrev = data->varx;
long double cxyPrev = data->cxy;
--data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
long double dy = valy - avgyPrev;
avgyPrev -= dy / cnt;
avgxPrev -= dx / cnt;
varxPrev -= dx * (valx - avgxPrev);
varyPrev -= dy * (valy - avgyPrev);
cxyPrev -= dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->varx = varxPrev;
data->vary = varyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}

View File

@ -39,9 +39,9 @@ static Add_covar_pop_ToUDAFMap addToMap;
struct covar_pop_data
{
uint64_t cnt;
long double sumx;
long double sumy;
long double sumxy; // sum of x * y
long double avgx;
long double avgy;
long double cxy;
};
mcsv1_UDAF::ReturnCode covar_pop::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -74,9 +74,9 @@ mcsv1_UDAF::ReturnCode covar_pop::reset(mcsv1Context* context)
{
struct covar_pop_data* data = (struct covar_pop_data*)context->getUserData()->data;
data->cnt = 0;
data->sumx = 0.0;
data->sumy = 0.0;
data->sumxy = 0.0;
data->avgx = 0.0;
data->avgy = 0.0;
data->cxy = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -86,12 +86,18 @@ mcsv1_UDAF::ReturnCode covar_pop::nextValue(mcsv1Context* context, ColumnDatum*
double valx = toDouble(valsIn[1]);
struct covar_pop_data* data = (struct covar_pop_data*)context->getUserData()->data;
data->sumy += valy;
data->sumx += valx;
data->sumxy += valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxyPrev = data->cxy;
++data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
avgyPrev += (valy - avgyPrev)/cnt;
avgxPrev += dx / cnt;
cxyPrev += dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -106,10 +112,27 @@ mcsv1_UDAF::ReturnCode covar_pop::subEvaluate(mcsv1Context* context, const UserD
struct covar_pop_data* outData = (struct covar_pop_data*)context->getUserData()->data;
struct covar_pop_data* inData = (struct covar_pop_data*)userDataIn->data;
outData->sumx += inData->sumx;
outData->sumy += inData->sumy;
outData->sumxy += inData->sumxy;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgx = outData->avgx;
long double outAvgy = outData->avgy;
long double outCxy = outData->cxy;
uint64_t inCnt = inData->cnt;
long double inAvgx = inData->avgx;
long double inAvgy = inData->avgy;
long double inCxy = inData->cxy;
uint64_t resCnt = inCnt + outCnt;
long double deltax = outAvgx - inAvgx;
long double deltay = outAvgy - inAvgy;
long double resAvgx = inAvgx + deltax * outCnt / resCnt;
long double resAvgy = inAvgy + deltay * outCnt / resCnt;
long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt;
outData->avgx = resAvgx;
outData->avgy = resAvgy;
outData->cxy = resCxy;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -120,11 +143,7 @@ mcsv1_UDAF::ReturnCode covar_pop::evaluate(mcsv1Context* context, static_any::an
double N = data->cnt;
if (N > 0)
{
long double sumx = data->sumx;
long double sumy = data->sumy;
long double sumxy = data->sumxy;
long double covar_pop = (sumxy - ((sumx * sumy) / N)) / N;
long double covar_pop = data->cxy / N;
valOut = static_cast<double>(covar_pop);
}
return mcsv1_UDAF::SUCCESS;
@ -136,11 +155,20 @@ mcsv1_UDAF::ReturnCode covar_pop::dropValue(mcsv1Context* context, ColumnDatum*
double valx = toDouble(valsDropped[1]);
struct covar_pop_data* data = (struct covar_pop_data*)context->getUserData()->data;
data->sumy -= valy;
data->sumx -= valx;
data->sumxy -= valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxyPrev = data->cxy;
--data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
avgyPrev -= (valy - avgyPrev) / cnt;
avgxPrev -= dx / cnt;
cxyPrev -= dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}

View File

@ -39,9 +39,9 @@ static Add_covar_samp_ToUDAFMap addToMap;
struct covar_samp_data
{
uint64_t cnt;
long double sumx;
long double sumy;
long double sumxy; // sum of x * y
long double avgx;
long double avgy;
long double cxy;
};
mcsv1_UDAF::ReturnCode covar_samp::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -74,9 +74,9 @@ mcsv1_UDAF::ReturnCode covar_samp::reset(mcsv1Context* context)
{
struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data;
data->cnt = 0;
data->sumx = 0.0;
data->sumy = 0.0;
data->sumxy = 0.0;
data->avgx = 0.0;
data->avgy = 0.0;
data->cxy = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -86,13 +86,18 @@ mcsv1_UDAF::ReturnCode covar_samp::nextValue(mcsv1Context* context, ColumnDatum*
double valx = toDouble(valsIn[1]);
struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data;
data->sumy += valy;
data->sumx += valx;
data->sumxy += valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxyPrev = data->cxy;
++data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
avgyPrev += (valy - avgyPrev)/cnt;
avgxPrev += dx / cnt;
cxyPrev += dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -107,10 +112,27 @@ mcsv1_UDAF::ReturnCode covar_samp::subEvaluate(mcsv1Context* context, const User
struct covar_samp_data* outData = (struct covar_samp_data*)context->getUserData()->data;
struct covar_samp_data* inData = (struct covar_samp_data*)userDataIn->data;
outData->sumx += inData->sumx;
outData->sumy += inData->sumy;
outData->sumxy += inData->sumxy;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgx = outData->avgx;
long double outAvgy = outData->avgy;
long double outCxy = outData->cxy;
uint64_t inCnt = inData->cnt;
long double inAvgx = inData->avgx;
long double inAvgy = inData->avgy;
long double inCxy = inData->cxy;
uint64_t resCnt = inCnt + outCnt;
long double deltax = outAvgx - inAvgx;
long double deltay = outAvgy - inAvgy;
long double resAvgx = inAvgx + deltax * outCnt / resCnt;
long double resAvgy = inAvgy + deltay * outCnt / resCnt;
long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt;
outData->avgx = resAvgx;
outData->avgy = resAvgy;
outData->cxy = resCxy;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -121,11 +143,7 @@ mcsv1_UDAF::ReturnCode covar_samp::evaluate(mcsv1Context* context, static_any::a
double N = data->cnt;
if (N > 1)
{
long double sumx = data->sumx;
long double sumy = data->sumy;
long double sumxy = data->sumxy;
long double covar_samp = (sumxy - ((sumx * sumy) / N)) / (N - 1);
long double covar_samp = data->cxy / (N - 1);
valOut = static_cast<double>(covar_samp);
}
else if (N == 1)
@ -141,12 +159,19 @@ mcsv1_UDAF::ReturnCode covar_samp::dropValue(mcsv1Context* context, ColumnDatum*
double valx = toDouble(valsDropped[1]);
struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data;
data->sumy -= valy;
data->sumx -= valx;
data->sumxy -= valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxyPrev = data->cxy;
--data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
avgyPrev -= (valy - avgyPrev) / cnt;
avgxPrev -= dx / cnt;
cxyPrev -= dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}

View File

@ -39,10 +39,10 @@ static Add_regr_intercept_ToUDAFMap addToMap;
struct regr_intercept_data
{
uint64_t cnt;
long double sumx;
long double sumx2; // sum of (x squared)
long double sumy;
long double sumxy; // sum of x * y
long double avgx;
long double cx;
long double avgy;
long double cxy;
};
mcsv1_UDAF::ReturnCode regr_intercept::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -75,10 +75,10 @@ mcsv1_UDAF::ReturnCode regr_intercept::reset(mcsv1Context* context)
{
struct regr_intercept_data* data = (struct regr_intercept_data*)context->getUserData()->data;
data->cnt = 0;
data->sumx = 0.0;
data->sumx2 = 0.0;
data->sumy = 0.0;
data->sumxy = 0.0;
data->avgx = 0.0;
data->cx = 0.0;
data->avgy = 0.0;
data->cxy = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -88,12 +88,27 @@ mcsv1_UDAF::ReturnCode regr_intercept::nextValue(mcsv1Context* context, ColumnDa
double valx = toDouble(valsIn[1]);
struct regr_intercept_data* data = (struct regr_intercept_data*)context->getUserData()->data;
data->sumy += valy;
data->sumx += valx;
data->sumx2 += valx * valx;
data->sumxy += valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxPrev = data->cx;
long double cxyPrev = data->cxy;
++data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
long double dy = valy - avgyPrev;
avgyPrev += dy / cnt;
avgxPrev += dx / cnt;
cxPrev += dx * (valx - avgxPrev);
cxyPrev += dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cx = cxPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -108,11 +123,34 @@ mcsv1_UDAF::ReturnCode regr_intercept::subEvaluate(mcsv1Context* context, const
struct regr_intercept_data* outData = (struct regr_intercept_data*)context->getUserData()->data;
struct regr_intercept_data* inData = (struct regr_intercept_data*)userDataIn->data;
outData->sumx += inData->sumx;
outData->sumx2 += inData->sumx2;
outData->sumy += inData->sumy;
outData->sumxy += inData->sumxy;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgx = outData->avgx;
long double outAvgy = outData->avgy;
long double outCx = outData->cx;
long double outCxy = outData->cxy;
uint64_t inCnt = inData->cnt;
long double inAvgx = inData->avgx;
long double inAvgy = inData->avgy;
long double inCx = inData->cx;
long double inCxy = inData->cxy;
uint64_t resCnt = inCnt + outCnt;
long double deltax = outAvgx - inAvgx;
long double deltay = outAvgy - inAvgy;
long double resAvgx = inAvgx + deltax * outCnt / resCnt;
long double resAvgy = inAvgy + deltay * outCnt / resCnt;
long double resCx = outCx + inCx + deltax * deltax * inCnt * outCnt / resCnt;
long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt;
outData->avgx = resAvgx;
outData->avgy = resAvgy;
outData->cx = resCx;
outData->cxy = resCxy;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -123,18 +161,14 @@ mcsv1_UDAF::ReturnCode regr_intercept::evaluate(mcsv1Context* context, static_an
long double N = data->cnt;
if (N > 1)
{
long double sumx = data->sumx;
long double sumy = data->sumy;
long double sumx2 = data->sumx2;
long double sumxy = data->sumxy;
long double avgx = data->avgx;
long double avgy = data->avgy;
long double cx = data->cx;
long double cxy = data->cxy;
// regr_intercept is AVG(y) - slope(y,x)*avg(x)
// We do some algebra and we can get a slightly smaller calculation
long double numerator = sumy * sumx2 - sumx * sumxy;
long double var_pop =
(N * sumx2) - (sumx * sumx); // Not realy var_pop, but sort of after some reductions
if (var_pop > 0)
if (cx > 0)
{
valOut = static_cast<double>(numerator / var_pop);
valOut = static_cast<double>(avgy - cxy / cx * avgx);
}
}
return mcsv1_UDAF::SUCCESS;
@ -146,12 +180,27 @@ mcsv1_UDAF::ReturnCode regr_intercept::dropValue(mcsv1Context* context, ColumnDa
double valx = toDouble(valsDropped[1]);
struct regr_intercept_data* data = (struct regr_intercept_data*)context->getUserData()->data;
data->sumy -= valy;
data->sumx -= valx;
data->sumx2 -= valx * valx;
data->sumxy -= valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxPrev = data->cx;
long double cxyPrev = data->cxy;
--data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
long double dy = valy - avgyPrev;
avgyPrev -= dy / cnt;
avgxPrev -= dx / cnt;
cxPrev -= dx * (valx - avgxPrev);
cxyPrev -= dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cx = cxPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}

View File

@ -39,11 +39,11 @@ static Add_regr_r2_ToUDAFMap addToMap;
struct regr_r2_data
{
uint64_t cnt;
long double sumx;
long double sumx2; // sum of (x squared)
long double sumy;
long double sumy2; // sum of (y squared)
long double sumxy; // sum of x * y
long double avgx;
long double varx;
long double avgy;
long double vary;
long double cxy;
};
mcsv1_UDAF::ReturnCode regr_r2::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -76,11 +76,11 @@ mcsv1_UDAF::ReturnCode regr_r2::reset(mcsv1Context* context)
{
struct regr_r2_data* data = (struct regr_r2_data*)context->getUserData()->data;
data->cnt = 0;
data->sumx = 0.0;
data->sumx2 = 0.0;
data->sumy = 0.0;
data->sumy2 = 0.0;
data->sumxy = 0.0;
data->avgx = 0.0;
data->varx = 0.0;
data->avgy = 0.0;
data->vary = 0.0;
data->cxy = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -89,16 +89,30 @@ mcsv1_UDAF::ReturnCode regr_r2::nextValue(mcsv1Context* context, ColumnDatum* va
double valy = toDouble(valsIn[0]);
double valx = toDouble(valsIn[1]);
struct regr_r2_data* data = (struct regr_r2_data*)context->getUserData()->data;
data->sumy += valy;
data->sumy2 += valy * valy;
data->sumx += valx;
data->sumx2 += valx * valx;
data->sumxy += valx * valy;
long double avgyPrev = data->avgy;
long double varyPrev = data->vary;
long double avgxPrev = data->avgx;
long double varxPrev = data->varx;
long double cxyPrev = data->cxy;
++data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
long double dy = valy - avgyPrev;
avgyPrev += dy / cnt;
avgxPrev += dx / cnt;
varxPrev += dx * (valx - avgxPrev);
varyPrev += dy * (valy - avgyPrev);
cxyPrev += dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->varx = varxPrev;
data->vary = varyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -113,12 +127,38 @@ mcsv1_UDAF::ReturnCode regr_r2::subEvaluate(mcsv1Context* context, const UserDat
struct regr_r2_data* outData = (struct regr_r2_data*)context->getUserData()->data;
struct regr_r2_data* inData = (struct regr_r2_data*)userDataIn->data;
outData->sumx += inData->sumx;
outData->sumx2 += inData->sumx2;
outData->sumy += inData->sumy;
outData->sumy2 += inData->sumy2;
outData->sumxy += inData->sumxy;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgx = outData->avgx;
long double outAvgy = outData->avgy;
long double outVarx = outData->varx;
long double outVary = outData->vary;
long double outCxy = outData->cxy;
uint64_t inCnt = inData->cnt;
long double inAvgx = inData->avgx;
long double inAvgy = inData->avgy;
long double inVarx = inData->varx;
long double inVary = inData->vary;
long double inCxy = inData->cxy;
uint64_t resCnt = inCnt + outCnt;
long double deltax = outAvgx - inAvgx;
long double deltay = outAvgy - inAvgy;
long double resAvgx = inAvgx + deltax * outCnt / resCnt;
long double resAvgy = inAvgy + deltay * outCnt / resCnt;
long double resVarx = outVarx + inVarx + deltax * deltax * inCnt * outCnt / resCnt;
long double resVary = outVary + inVary + deltay * deltay * inCnt * outCnt / resCnt;
long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt;
outData->avgx = resAvgx;
outData->avgy = resAvgy;
outData->varx = resVarx;
outData->vary = resVary;
outData->cxy = resCxy;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -129,19 +169,17 @@ mcsv1_UDAF::ReturnCode regr_r2::evaluate(mcsv1Context* context, static_any::any&
double N = data->cnt;
if (N > 1)
{
long double sumx = data->sumx;
long double sumy = data->sumy;
long double sumx2 = data->sumx2;
long double sumy2 = data->sumy2;
long double sumxy = data->sumxy;
long double varx = data->varx;
long double vary = data->vary;
long double cxy = data->cxy;
long double var_popx = (sumx2 - (sumx * sumx / N)) / N;
long double var_popx = varx / N;
if (var_popx <= 0) // Catch -0
{
// When var_popx is 0, NULL is the result.
return mcsv1_UDAF::SUCCESS;
}
double var_popy = (sumy2 - (sumy * sumy / N)) / N;
long double var_popy = vary / N;
if (var_popy <= 0) // Catch -0
{
// When var_popy is 0, 1 is the result
@ -150,8 +188,7 @@ mcsv1_UDAF::ReturnCode regr_r2::evaluate(mcsv1Context* context, static_any::any&
}
long double std_popx = sqrt(var_popx);
long double std_popy = sqrt(var_popy);
long double covar_pop = (sumxy - ((sumx * sumy) / N)) / N;
long double corr = covar_pop / (std_popy * std_popx);
long double corr = cxy / (std_popy * std_popx * N);
valOut = static_cast<double>(corr * corr);
}
return mcsv1_UDAF::SUCCESS;
@ -163,14 +200,30 @@ mcsv1_UDAF::ReturnCode regr_r2::dropValue(mcsv1Context* context, ColumnDatum* va
double valx = toDouble(valsDropped[1]);
struct regr_r2_data* data = (struct regr_r2_data*)context->getUserData()->data;
data->sumy -= valy;
data->sumy2 -= valy * valy;
data->sumx -= valx;
data->sumx2 -= valx * valx;
data->sumxy -= valx * valy;
long double avgyPrev = data->avgy;
long double varyPrev = data->vary;
long double avgxPrev = data->avgx;
long double varxPrev = data->varx;
long double cxyPrev = data->cxy;
--data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
long double dy = valy - avgyPrev;
avgyPrev -= dy / cnt;
avgxPrev -= dx / cnt;
varxPrev -= dx * (valx - avgxPrev);
varyPrev -= dy * (valy - avgyPrev);
cxyPrev -= dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->varx = varxPrev;
data->vary = varyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}

View File

@ -39,10 +39,10 @@ static Add_regr_slope_ToUDAFMap addToMap;
struct regr_slope_data
{
uint64_t cnt;
long double sumx;
long double sumx2; // sum of (x squared)
long double sumy;
long double sumxy; // sum of x * y
long double avgx;
long double cx;
long double avgy;
long double cxy;
};
mcsv1_UDAF::ReturnCode regr_slope::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -74,10 +74,10 @@ mcsv1_UDAF::ReturnCode regr_slope::reset(mcsv1Context* context)
{
struct regr_slope_data* data = (struct regr_slope_data*)context->getUserData()->data;
data->cnt = 0;
data->sumx = 0.0;
data->sumx2 = 0.0;
data->sumy = 0.0;
data->sumxy = 0.0;
data->avgx = 0.0;
data->cx = 0.0;
data->avgy = 0.0;
data->cxy = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -87,12 +87,27 @@ mcsv1_UDAF::ReturnCode regr_slope::nextValue(mcsv1Context* context, ColumnDatum*
double valx = toDouble(valsIn[1]);
struct regr_slope_data* data = (struct regr_slope_data*)context->getUserData()->data;
data->sumy += valy;
data->sumx += valx;
data->sumx2 += valx * valx;
data->sumxy += valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxPrev = data->cx;
long double cxyPrev = data->cxy;
++data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
long double dy = valy - avgyPrev;
avgyPrev += dy / cnt;
avgxPrev += dx / cnt;
cxPrev += dx * (valx - avgxPrev);
cxyPrev += dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cx = cxPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -106,11 +121,35 @@ mcsv1_UDAF::ReturnCode regr_slope::subEvaluate(mcsv1Context* context, const User
struct regr_slope_data* outData = (struct regr_slope_data*)context->getUserData()->data;
struct regr_slope_data* inData = (struct regr_slope_data*)userDataIn->data;
outData->sumx += inData->sumx;
outData->sumx2 += inData->sumx2;
outData->sumy += inData->sumy;
outData->sumxy += inData->sumxy;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgx = outData->avgx;
long double outAvgy = outData->avgy;
long double outCx = outData->cx;
long double outCxy = outData->cxy;
uint64_t inCnt = inData->cnt;
long double inAvgx = inData->avgx;
long double inAvgy = inData->avgy;
long double inCx = inData->cx;
long double inCxy = inData->cxy;
uint64_t resCnt = inCnt + outCnt;
long double deltax = outAvgx - inAvgx;
long double deltay = outAvgy - inAvgy;
long double resAvgx = inAvgx + deltax * outCnt / resCnt;
long double resAvgy = inAvgy + deltay * outCnt / resCnt;
long double resCx = outCx + inCx + deltax * deltax * inCnt * outCnt / resCnt;
long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt;
outData->avgx = resAvgx;
outData->avgy = resAvgy;
outData->cx = resCx;
outData->cxy = resCxy;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -122,18 +161,11 @@ mcsv1_UDAF::ReturnCode regr_slope::evaluate(mcsv1Context* context, static_any::a
if (N > 1)
{
// COVAR_POP(y, x) / VAR_POP(x)
long double sumx = data->sumx;
long double sumy = data->sumy;
long double sumx2 = data->sumx2;
long double sumxy = data->sumxy;
// These aren't really covar_pop and var_pop. For the purposes of this calculation
// we multiplied everything by N to reduce calc time and variance.
// It all comes out after the final divide
long double covar_pop = N * sumxy - sumx * sumy;
long double var_pop = N * sumx2 - sumx * sumx;
if (var_pop > 0)
long double cx = data->cx;
long double cxy = data->cxy;
if (cx > 0)
{
valOut = static_cast<double>(covar_pop / var_pop);
valOut = static_cast<double>(cxy / cx);
}
}
return mcsv1_UDAF::SUCCESS;
@ -145,11 +177,27 @@ mcsv1_UDAF::ReturnCode regr_slope::dropValue(mcsv1Context* context, ColumnDatum*
double valx = toDouble(valsDropped[1]);
struct regr_slope_data* data = (struct regr_slope_data*)context->getUserData()->data;
data->sumy -= valy;
data->sumx -= valx;
data->sumx2 -= valx * valx;
data->sumxy -= valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxPrev = data->cx;
long double cxyPrev = data->cxy;
--data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
long double dy = valy - avgyPrev;
avgyPrev -= dy / cnt;
avgxPrev -= dx / cnt;
cxPrev -= dx * (valx - avgxPrev);
cxyPrev -= dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cx = cxPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}

View File

@ -39,8 +39,8 @@ static Add_regr_sxx_ToUDAFMap addToMap;
struct regr_sxx_data
{
uint64_t cnt;
long double sumx;
long double sumx2; // sum of (x squared)
long double avgx;
long double cx;
};
mcsv1_UDAF::ReturnCode regr_sxx::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -73,8 +73,8 @@ mcsv1_UDAF::ReturnCode regr_sxx::reset(mcsv1Context* context)
{
struct regr_sxx_data* data = (struct regr_sxx_data*)context->getUserData()->data;
data->cnt = 0;
data->sumx = 0.0;
data->sumx2 = 0.0;
data->avgx = 0.0;
data->cx = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -82,11 +82,15 @@ mcsv1_UDAF::ReturnCode regr_sxx::nextValue(mcsv1Context* context, ColumnDatum* v
{
double valx = toDouble(valsIn[1]);
struct regr_sxx_data* data = (struct regr_sxx_data*)context->getUserData()->data;
data->sumx += valx;
data->sumx2 += valx * valx;
long double avgxPrev = data->avgx;
long double cxPrev = data->cx;
++data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
avgxPrev += dx / cnt;
cxPrev += dx * (valx - avgxPrev);
data->avgx = avgxPrev;
data->cx = cxPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -101,9 +105,24 @@ mcsv1_UDAF::ReturnCode regr_sxx::subEvaluate(mcsv1Context* context, const UserDa
struct regr_sxx_data* outData = (struct regr_sxx_data*)context->getUserData()->data;
struct regr_sxx_data* inData = (struct regr_sxx_data*)userDataIn->data;
outData->sumx += inData->sumx;
outData->sumx2 += inData->sumx2;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgx = outData->avgx;
long double outCx = outData->cx;
uint64_t inCnt = inData->cnt;
long double inAvgx = inData->avgx;
long double inCx = inData->cx;
uint64_t resCnt = inCnt + outCnt;
long double deltax = outAvgx - inAvgx;
long double resAvgx = inAvgx + deltax * outCnt / resCnt;
long double resCx = outCx + inCx + deltax * deltax * inCnt * outCnt / resCnt;
outData->avgx = resAvgx;
outData->cx = resCx;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -114,7 +133,7 @@ mcsv1_UDAF::ReturnCode regr_sxx::evaluate(mcsv1Context* context, static_any::any
long double N = data->cnt;
if (N > 0)
{
long double regr_sxx = (data->sumx2 - (data->sumx * data->sumx / N));
long double regr_sxx = data->cx;
if (regr_sxx < 0) // catch -0
regr_sxx = 0;
valOut = static_cast<double>(regr_sxx);
@ -126,11 +145,15 @@ mcsv1_UDAF::ReturnCode regr_sxx::dropValue(mcsv1Context* context, ColumnDatum* v
{
double valx = toDouble(valsDropped[1]);
struct regr_sxx_data* data = (struct regr_sxx_data*)context->getUserData()->data;
data->sumx -= valx;
data->sumx2 -= valx * valx;
long double avgxPrev = data->avgx;
long double cxPrev = data->cx;
--data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
avgxPrev -= dx / cnt;
cxPrev -= dx * (valx - avgxPrev);
data->avgx = avgxPrev;
data->cx = cxPrev;
return mcsv1_UDAF::SUCCESS;
}

View File

@ -39,8 +39,8 @@ static Add_regr_syy_ToUDAFMap addToMap;
struct regr_syy_data
{
uint64_t cnt;
long double sumy;
long double sumy2; // sum of (y squared)
long double avgy;
long double cy; // sum of (y squared)
};
mcsv1_UDAF::ReturnCode regr_syy::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -73,8 +73,8 @@ mcsv1_UDAF::ReturnCode regr_syy::reset(mcsv1Context* context)
{
struct regr_syy_data* data = (struct regr_syy_data*)context->getUserData()->data;
data->cnt = 0;
data->sumy = 0.0;
data->sumy2 = 0.0;
data->avgy = 0.0;
data->cy = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -82,11 +82,15 @@ mcsv1_UDAF::ReturnCode regr_syy::nextValue(mcsv1Context* context, ColumnDatum* v
{
double valy = toDouble(valsIn[0]);
struct regr_syy_data* data = (struct regr_syy_data*)context->getUserData()->data;
data->sumy += valy;
data->sumy2 += valy * valy;
long double avgyPrev = data->avgy;
long double cyPrev = data->cy;
++data->cnt;
uint64_t cnt = data->cnt;
long double dy = valy - avgyPrev;
avgyPrev += dy / cnt;
cyPrev += dy * (valy - avgyPrev);
data->avgy = avgyPrev;
data->cy = cyPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -101,9 +105,24 @@ mcsv1_UDAF::ReturnCode regr_syy::subEvaluate(mcsv1Context* context, const UserDa
struct regr_syy_data* outData = (struct regr_syy_data*)context->getUserData()->data;
struct regr_syy_data* inData = (struct regr_syy_data*)userDataIn->data;
outData->sumy += inData->sumy;
outData->sumy2 += inData->sumy2;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgy = outData->avgy;
long double outCy = outData->cy;
uint64_t inCnt = inData->cnt;
long double inAvgy = inData->avgy;
long double inCy = inData->cy;
uint64_t resCnt = inCnt + outCnt;
long double deltay = outAvgy - inAvgy;
long double resAvgy = inAvgy + deltay * outCnt / resCnt;
long double resCy = outCy + inCy + deltay * deltay * inCnt * outCnt / resCnt;
outData->avgy = resAvgy;
outData->cy = resCy;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -114,7 +133,7 @@ mcsv1_UDAF::ReturnCode regr_syy::evaluate(mcsv1Context* context, static_any::any
long double N = data->cnt;
if (N > 0)
{
long double var_popy = (data->sumy2 - (data->sumy * data->sumy / N));
long double var_popy = data->cy;
if (var_popy < 0) // might be -0
var_popy = 0;
valOut = static_cast<double>(var_popy);
@ -127,10 +146,15 @@ mcsv1_UDAF::ReturnCode regr_syy::dropValue(mcsv1Context* context, ColumnDatum* v
double valy = toDouble(valsDropped[0]);
struct regr_syy_data* data = (struct regr_syy_data*)context->getUserData()->data;
data->sumy -= valy;
data->sumy2 -= valy * valy;
long double avgyPrev = data->avgy;
long double cyPrev = data->cy;
--data->cnt;
uint64_t cnt = data->cnt;
long double dy = valy - avgyPrev;
avgyPrev -= dy / cnt;
cyPrev -= dy * (valy - avgyPrev);
data->avgy = avgyPrev;
data->cy = cyPrev;
return mcsv1_UDAF::SUCCESS;
}