1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

Put Welford's online algorithm everywhere

This commit is contained in:
mariadb-AndreyPiskunov
2022-09-06 18:45:56 +03:00
committed by Leonid Fedorov
parent 14b976991b
commit 14810d87ba
8 changed files with 531 additions and 227 deletions

View File

@ -39,9 +39,9 @@ static Add_covar_samp_ToUDAFMap addToMap;
struct covar_samp_data
{
uint64_t cnt;
long double sumx;
long double sumy;
long double sumxy; // sum of x * y
long double avgx;
long double avgy;
long double cxy;
};
mcsv1_UDAF::ReturnCode covar_samp::init(mcsv1Context* context, ColumnDatum* colTypes)
@ -74,9 +74,9 @@ mcsv1_UDAF::ReturnCode covar_samp::reset(mcsv1Context* context)
{
struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data;
data->cnt = 0;
data->sumx = 0.0;
data->sumy = 0.0;
data->sumxy = 0.0;
data->avgx = 0.0;
data->avgy = 0.0;
data->cxy = 0.0;
return mcsv1_UDAF::SUCCESS;
}
@ -86,13 +86,18 @@ mcsv1_UDAF::ReturnCode covar_samp::nextValue(mcsv1Context* context, ColumnDatum*
double valx = toDouble(valsIn[1]);
struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data;
data->sumy += valy;
data->sumx += valx;
data->sumxy += valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxyPrev = data->cxy;
++data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
avgyPrev += (valy - avgyPrev)/cnt;
avgxPrev += dx / cnt;
cxyPrev += dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}
@ -107,10 +112,27 @@ mcsv1_UDAF::ReturnCode covar_samp::subEvaluate(mcsv1Context* context, const User
struct covar_samp_data* outData = (struct covar_samp_data*)context->getUserData()->data;
struct covar_samp_data* inData = (struct covar_samp_data*)userDataIn->data;
outData->sumx += inData->sumx;
outData->sumy += inData->sumy;
outData->sumxy += inData->sumxy;
outData->cnt += inData->cnt;
uint64_t outCnt = outData->cnt;
long double outAvgx = outData->avgx;
long double outAvgy = outData->avgy;
long double outCxy = outData->cxy;
uint64_t inCnt = inData->cnt;
long double inAvgx = inData->avgx;
long double inAvgy = inData->avgy;
long double inCxy = inData->cxy;
uint64_t resCnt = inCnt + outCnt;
long double deltax = outAvgx - inAvgx;
long double deltay = outAvgy - inAvgy;
long double resAvgx = inAvgx + deltax * outCnt / resCnt;
long double resAvgy = inAvgy + deltay * outCnt / resCnt;
long double resCxy = outCxy + inCxy + deltax * deltay * inCnt * outCnt / resCnt;
outData->avgx = resAvgx;
outData->avgy = resAvgy;
outData->cxy = resCxy;
outData->cnt = resCnt;
return mcsv1_UDAF::SUCCESS;
}
@ -121,11 +143,7 @@ mcsv1_UDAF::ReturnCode covar_samp::evaluate(mcsv1Context* context, static_any::a
double N = data->cnt;
if (N > 1)
{
long double sumx = data->sumx;
long double sumy = data->sumy;
long double sumxy = data->sumxy;
long double covar_samp = (sumxy - ((sumx * sumy) / N)) / (N - 1);
long double covar_samp = data->cxy / (N - 1);
valOut = static_cast<double>(covar_samp);
}
else if (N == 1)
@ -141,12 +159,19 @@ mcsv1_UDAF::ReturnCode covar_samp::dropValue(mcsv1Context* context, ColumnDatum*
double valx = toDouble(valsDropped[1]);
struct covar_samp_data* data = (struct covar_samp_data*)context->getUserData()->data;
data->sumy -= valy;
data->sumx -= valx;
data->sumxy -= valx * valy;
long double avgyPrev = data->avgy;
long double avgxPrev = data->avgx;
long double cxyPrev = data->cxy;
--data->cnt;
uint64_t cnt = data->cnt;
long double dx = valx - avgxPrev;
avgyPrev -= (valy - avgyPrev) / cnt;
avgxPrev -= dx / cnt;
cxyPrev -= dx * (valy - avgyPrev);
data->avgx = avgxPrev;
data->avgy = avgyPrev;
data->cxy = cxyPrev;
return mcsv1_UDAF::SUCCESS;
}