1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-07-30 19:23:07 +03:00

Welford's algorithm STD and VAR on window functions

This commit is contained in:
Andrey Piskunov
2022-06-01 19:05:33 +03:00
parent c5fa27475d
commit 66c69c7609
6 changed files with 31809 additions and 15 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,117 @@
-- source ../include/have_columnstore.inc
--disable_warnings
DROP DATABASE IF EXISTS std_test_db;
--enable_warnings
CREATE DATABASE std_test_db;
USE std_test_db;
create table t1 (
col_signed tinyint,
col_unsigned tinyint unsigned
)engine=columnstore;
--replace_result $MTR_SUITE_DIR MTR_SUITE_DIR
--eval LOAD DATA LOCAL infile '$MTR_SUITE_DIR/../std_data/tinyint_range.tbl' INTO TABLE t1 FIELDS TERMINATED BY '|';
ALTER TABLE t1 ADD COLUMN col_small_signed SMALLINT;
ALTER TABLE t1 ADD COLUMN col_small_unsigned SMALLINT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_med_signed MEDIUMINT;
ALTER TABLE t1 ADD COLUMN col_med_unsigned MEDIUMINT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_int_signed INT;
ALTER TABLE t1 ADD COLUMN col_int_unsigned INT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_big_signed BIGINT;
ALTER TABLE t1 ADD COLUMN col_big_unsigned BIGINT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_dec_signed DECIMAL(38,0);
ALTER TABLE t1 ADD COLUMN col_dec_unsigned DECIMAL(38,0) UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_float_signed FLOAT;
ALTER TABLE t1 ADD COLUMN col_float_unsigned FLOAT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_double_signed DOUBLE;
ALTER TABLE t1 ADD COLUMN col_double_unsigned DOUBLE UNSIGNED;
UPDATE t1 SET col_small_signed=col_signed + sign(col_signed) * 32000;
UPDATE t1 SET col_small_unsigned=col_unsigned + 65000;
UPDATE t1 SET col_med_signed=col_signed + sign(col_signed) * 8388000;
UPDATE t1 SET col_med_unsigned=col_unsigned + 16776000;
UPDATE t1 SET col_int_signed=col_signed + sign(col_signed) * 2147483000;
UPDATE t1 SET col_int_unsigned=col_unsigned + 4294000000;
UPDATE t1 SET col_big_signed=col_signed + sign(col_signed) * 9223372036854775000;
UPDATE t1 SET col_big_unsigned=col_unsigned + 18446744073709551000;
UPDATE t1 SET col_dec_signed=col_signed + sign(col_signed) * 800000000000000000000000000000000001;
UPDATE t1 SET col_dec_unsigned=col_unsigned + 800000000000000000000000000000000003;
UPDATE t1 SET col_float_signed=col_signed + 0.637 + sign(col_signed) * 8388000;
UPDATE t1 SET col_float_unsigned=col_unsigned + 0.637 + 16776000;
UPDATE t1 SET col_double_signed=col_signed + 0.637 + sign(col_signed) * 2147483000;
UPDATE t1 SET col_double_unsigned=col_unsigned + 0.637 + 4294000000;
SELECT 'q1', floor(STD(col_signed) OVER ()) AS std FROM t1;
SELECT 'q2', floor(STD(col_unsigned) OVER ()) AS std FROM t1;
SELECT 'q3', floor(STD(col_small_signed) OVER ()) AS std FROM t1;
SELECT 'q4', floor(STD(col_small_unsigned) OVER ()) AS std FROM t1;
SELECT 'q5', floor(STD(col_med_signed) OVER ()) AS std FROM t1;
SELECT 'q6', floor(STD(col_med_unsigned) OVER ()) AS std FROM t1;
SELECT 'q7', floor(STD(col_int_signed) OVER ()) AS std FROM t1;
SELECT 'q8', floor(STD(col_int_unsigned) OVER ()) AS std FROM t1;
SELECT 'q9', floor(STD(col_big_signed) OVER ()) AS std FROM t1;
SELECT 'q10', floor(STD(col_big_unsigned) OVER ()) AS std FROM t1;
SELECT 'q11', floor(STD(col_dec_signed) OVER ()) AS std FROM t1;
SELECT 'q13', floor(STD(col_float_signed) OVER ()) AS std FROM t1;
SELECT 'q14', floor(STD(col_float_unsigned) OVER ()) AS std FROM t1;
SELECT 'q15', floor(STD(col_double_signed) OVER ()) AS std FROM t1;
SELECT 'q16', floor(STD(col_double_unsigned) OVER ()) AS std FROM t1;
SELECT 'q17', floor(STDDEV_SAMP(col_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q18', floor(STDDEV_SAMP(col_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q19', floor(STDDEV_SAMP(col_small_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q20', floor(STDDEV_SAMP(col_small_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q21', floor(STDDEV_SAMP(col_med_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q22', floor(STDDEV_SAMP(col_med_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q23', floor(STDDEV_SAMP(col_int_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q24', floor(STDDEV_SAMP(col_int_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q25', floor(STDDEV_SAMP(col_big_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q26', floor(STDDEV_SAMP(col_big_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q27', floor(STDDEV_SAMP(col_dec_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q29', floor(STDDEV_SAMP(col_float_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q30', floor(STDDEV_SAMP(col_float_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q31', floor(STDDEV_SAMP(col_double_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q32', floor(STDDEV_SAMP(col_double_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q33', floor(VAR_POP(col_signed) OVER()) AS var FROM t1;
SELECT 'q34', floor(VAR_POP(col_unsigned) OVER()) AS var FROM t1;
SELECT 'q35', floor(VAR_POP(col_small_signed) OVER()) AS var FROM t1;
SELECT 'q36', floor(VAR_POP(col_small_unsigned) OVER()) AS var FROM t1;
SELECT 'q37', floor(VAR_POP(col_med_signed) OVER()) AS var FROM t1;
SELECT 'q38', floor(VAR_POP(col_med_unsigned) OVER()) AS var FROM t1;
SELECT 'q39', floor(VAR_POP(col_int_signed) OVER()) AS var FROM t1;
SELECT 'q40', floor(VAR_POP(col_int_unsigned) OVER()) AS var FROM t1;
SELECT 'q41', floor(VAR_POP(col_big_signed) OVER()) AS var FROM t1;
SELECT 'q42', floor(VAR_POP(col_big_unsigned) OVER()) AS var FROM t1;
SELECT 'q43', floor(VAR_POP(col_dec_signed) OVER()) AS var FROM t1;
SELECT 'q45', floor(VAR_POP(col_float_signed) OVER()) AS var FROM t1;
SELECT 'q46', floor(VAR_POP(col_float_unsigned) OVER()) AS var FROM t1;
SELECT 'q47', floor(VAR_POP(col_double_signed) OVER()) AS var FROM t1;
SELECT 'q48', floor(VAR_POP(col_double_unsigned) OVER()) AS var FROM t1;
SELECT 'q49', floor(VAR_SAMP(col_signed) OVER()) AS var_samp FROM t1;
SELECT 'q50', floor(VAR_SAMP(col_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q51', floor(VAR_SAMP(col_small_signed) OVER()) AS var_samp FROM t1;
SELECT 'q52', floor(VAR_SAMP(col_small_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q53', floor(VAR_SAMP(col_med_signed) OVER()) AS var_samp FROM t1;
SELECT 'q54', floor(VAR_SAMP(col_med_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q55', floor(VAR_SAMP(col_int_signed) OVER()) AS var_samp FROM t1;
SELECT 'q56', floor(VAR_SAMP(col_int_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q57', floor(VAR_SAMP(col_big_signed) OVER()) AS var_samp FROM t1;
SELECT 'q58', floor(VAR_SAMP(col_big_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q59', floor(VAR_SAMP(col_dec_signed) OVER()) AS var_samp FROM t1;
SELECT 'q61', floor(VAR_SAMP(col_float_signed) OVER()) AS var_samp FROM t1;
SELECT 'q62', floor(VAR_SAMP(col_float_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q63', floor(VAR_SAMP(col_double_signed) OVER()) AS var_samp FROM t1;
SELECT 'q64', floor(VAR_SAMP(col_double_unsigned) OVER()) AS var_samp FROM t1;
# Clean UP
DROP DATABASE std_test_db;

View File

@ -0,0 +1,120 @@
-- source ../include/have_columnstore.inc
--disable_warnings
DROP DATABASE IF EXISTS std_test_db;
--enable_warnings
CREATE DATABASE std_test_db;
USE std_test_db;
create table t1 (
col_signed tinyint,
col_unsigned tinyint unsigned
);
--replace_result $MTR_SUITE_DIR MTR_SUITE_DIR
--eval LOAD DATA LOCAL infile '$MTR_SUITE_DIR/../std_data/tinyint_range.tbl' INTO TABLE t1 FIELDS TERMINATED BY '|';
ALTER TABLE t1 ADD COLUMN col_small_signed SMALLINT;
ALTER TABLE t1 ADD COLUMN col_small_unsigned SMALLINT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_med_signed MEDIUMINT;
ALTER TABLE t1 ADD COLUMN col_med_unsigned MEDIUMINT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_int_signed INT;
ALTER TABLE t1 ADD COLUMN col_int_unsigned INT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_big_signed BIGINT;
ALTER TABLE t1 ADD COLUMN col_big_unsigned BIGINT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_dec_signed DECIMAL(38,0);
ALTER TABLE t1 ADD COLUMN col_dec_unsigned DECIMAL(38,0) UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_float_signed FLOAT;
ALTER TABLE t1 ADD COLUMN col_float_unsigned FLOAT UNSIGNED;
ALTER TABLE t1 ADD COLUMN col_double_signed DOUBLE;
ALTER TABLE t1 ADD COLUMN col_double_unsigned DOUBLE UNSIGNED;
UPDATE t1 SET col_small_signed=col_signed + sign(col_signed) * 32000;
UPDATE t1 SET col_small_unsigned=col_unsigned + 65000;
UPDATE t1 SET col_med_signed=col_signed + sign(col_signed) * 8388000;
UPDATE t1 SET col_med_unsigned=col_unsigned + 16776000;
UPDATE t1 SET col_int_signed=col_signed + sign(col_signed) * 2147483000;
UPDATE t1 SET col_int_unsigned=col_unsigned + 4294000000;
UPDATE t1 SET col_big_signed=col_signed + sign(col_signed) * 9223372036854775000;
UPDATE t1 SET col_big_unsigned=col_unsigned + 18446744073709551000;
UPDATE t1 SET col_dec_signed=col_signed + sign(col_signed) * 800000000000000000000000000000000001;
UPDATE t1 SET col_dec_unsigned=col_unsigned + 800000000000000000000000000000000003;
UPDATE t1 SET col_float_signed=col_signed + 0.637 + sign(col_signed) * 8388000;
UPDATE t1 SET col_float_unsigned=col_unsigned + 0.637 + 16776000;
UPDATE t1 SET col_double_signed=col_signed + 0.637 + sign(col_signed) * 2147483000;
UPDATE t1 SET col_double_unsigned=col_unsigned + 0.637 + 4294000000;
SELECT 'q1', floor(STD(col_signed) OVER ()) AS std FROM t1;
SELECT 'q2', floor(STD(col_unsigned) OVER ()) AS std FROM t1;
SELECT 'q3', floor(STD(col_small_signed) OVER ()) AS std FROM t1;
SELECT 'q4', floor(STD(col_small_unsigned) OVER ()) AS std FROM t1;
SELECT 'q5', floor(STD(col_med_signed) OVER ()) AS std FROM t1;
SELECT 'q6', floor(STD(col_med_unsigned) OVER ()) AS std FROM t1;
SELECT 'q7', floor(STD(col_int_signed) OVER ()) AS std FROM t1;
SELECT 'q8', floor(STD(col_int_unsigned) OVER ()) AS std FROM t1;
SELECT 'q9', floor(STD(col_big_signed) OVER ()) AS std FROM t1;
SELECT 'q10', floor(STD(col_big_unsigned) OVER ()) AS std FROM t1;
SELECT 'q11', floor(STD(col_dec_signed) OVER ()) AS std FROM t1;
SELECT 'q12', floor(STD(col_dec_unsigned) OVER ()) AS std FROM t1;
SELECT 'q13', floor(STD(col_float_signed) OVER ()) AS std FROM t1;
SELECT 'q14', floor(STD(col_float_unsigned) OVER ()) AS std FROM t1;
SELECT 'q15', floor(STD(col_double_signed) OVER ()) AS std FROM t1;
SELECT 'q16', floor(STD(col_double_unsigned) OVER ()) AS std FROM t1;
SELECT 'q17', floor(STDDEV_SAMP(col_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q18', floor(STDDEV_SAMP(col_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q19', floor(STDDEV_SAMP(col_small_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q20', floor(STDDEV_SAMP(col_small_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q21', floor(STDDEV_SAMP(col_med_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q22', floor(STDDEV_SAMP(col_med_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q23', floor(STDDEV_SAMP(col_int_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q24', floor(STDDEV_SAMP(col_int_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q25', floor(STDDEV_SAMP(col_big_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q26', floor(STDDEV_SAMP(col_big_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q27', floor(STDDEV_SAMP(col_dec_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q28', floor(STDDEV_SAMP(col_dec_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q29', floor(STDDEV_SAMP(col_float_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q30', floor(STDDEV_SAMP(col_float_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q31', floor(STDDEV_SAMP(col_double_signed) OVER ()) AS std_samp FROM t1;
SELECT 'q32', floor(STDDEV_SAMP(col_double_unsigned) OVER ()) AS std_samp FROM t1;
SELECT 'q33', floor(VAR_POP(col_signed) OVER()) AS var FROM t1;
SELECT 'q34', floor(VAR_POP(col_unsigned) OVER()) AS var FROM t1;
SELECT 'q35', floor(VAR_POP(col_small_signed) OVER()) AS var FROM t1;
SELECT 'q36', floor(VAR_POP(col_small_unsigned) OVER()) AS var FROM t1;
SELECT 'q37', floor(VAR_POP(col_med_signed) OVER()) AS var FROM t1;
SELECT 'q38', floor(VAR_POP(col_med_unsigned) OVER()) AS var FROM t1;
SELECT 'q39', floor(VAR_POP(col_int_signed) OVER()) AS var FROM t1;
SELECT 'q40', floor(VAR_POP(col_int_unsigned) OVER()) AS var FROM t1;
SELECT 'q41', floor(VAR_POP(col_big_signed) OVER()) AS var FROM t1;
SELECT 'q42', floor(VAR_POP(col_big_unsigned) OVER()) AS var FROM t1;
SELECT 'q43', floor(VAR_POP(col_dec_signed) OVER()) AS var FROM t1;
SELECT 'q44', floor(VAR_POP(col_dec_unsigned) OVER()) AS var FROM t1;
SELECT 'q45', floor(VAR_POP(col_float_signed) OVER()) AS var FROM t1;
SELECT 'q46', floor(VAR_POP(col_float_unsigned) OVER()) AS var FROM t1;
SELECT 'q47', floor(VAR_POP(col_double_signed) OVER()) AS var FROM t1;
SELECT 'q48', floor(VAR_POP(col_double_unsigned) OVER()) AS var FROM t1;
SELECT 'q49', floor(VAR_SAMP(col_signed) OVER()) AS var_samp FROM t1;
SELECT 'q50', floor(VAR_SAMP(col_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q51', floor(VAR_SAMP(col_small_signed) OVER()) AS var_samp FROM t1;
SELECT 'q52', floor(VAR_SAMP(col_small_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q54', floor(VAR_SAMP(col_med_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q55', floor(VAR_SAMP(col_int_signed) OVER()) AS var_samp FROM t1;
SELECT 'q56', floor(VAR_SAMP(col_int_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q57', floor(VAR_SAMP(col_big_signed) OVER()) AS var_samp FROM t1;
SELECT 'q58', floor(VAR_SAMP(col_big_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q59', floor(VAR_SAMP(col_dec_signed) OVER()) AS var_samp FROM t1;
SELECT 'q60', floor(VAR_SAMP(col_dec_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q61', floor(VAR_SAMP(col_float_signed) OVER()) AS var_samp FROM t1;
SELECT 'q62', floor(VAR_SAMP(col_float_unsigned) OVER()) AS var_samp FROM t1;
SELECT 'q63', floor(VAR_SAMP(col_double_signed) OVER()) AS var_samp FROM t1;
SELECT 'q64', floor(VAR_SAMP(col_double_unsigned) OVER()) AS var_samp FROM t1;
# Clean UP
DROP DATABASE std_test_db;

View File

@ -139,8 +139,8 @@ WindowFunctionType* WF_stats<T>::clone() const
template <typename T> template <typename T>
void WF_stats<T>::resetData() void WF_stats<T>::resetData()
{ {
fSum1 = 0; fMean = 0;
fSum2 = 0; fM2sum = 0;
fCount = 0; fCount = 0;
fStats = 0.0; fStats = 0.0;
@ -171,33 +171,28 @@ void WF_stats<T>::operator()(int64_t b, int64_t e, int64_t c)
if (fRow.isNullValue(colIn) == true) if (fRow.isNullValue(colIn) == true)
continue; continue;
// Welford's single-pass algorithm
T valIn; T valIn;
getValue(colIn, valIn, &cdt); getValue(colIn, valIn, &cdt);
long double val = (long double)valIn; long double val = (long double)valIn;
fSum1 += val;
fSum2 += val * val;
fCount++; fCount++;
long double delta = val - fMean;
fMean += delta/fCount;
fM2sum += delta * (val - fMean);
} }
if (fCount > 1) if (fCount > 1)
{ {
uint32_t scale = fRow.getScale(colIn); uint32_t scale = fRow.getScale(colIn);
auto factor = datatypes::scaleDivisor<long double>(scale); auto factor = datatypes::scaleDivisor<long double>(scale);
long double ldSum1 = fSum1; long double stat = fM2sum;
long double ldSum2 = fSum2;
// adjust the scale if necessary // adjust the scale if necessary
if (scale != 0 && cdt != CalpontSystemCatalog::LONGDOUBLE) if (scale != 0 && cdt != CalpontSystemCatalog::LONGDOUBLE)
{ {
ldSum1 /= factor; stat /= factor * factor;
ldSum2 /= factor * factor;
} }
long double stat = ldSum1 * ldSum1 / fCount;
stat = ldSum2 - stat;
if (fFunctionId == WF__STDDEV_POP) if (fFunctionId == WF__STDDEV_POP)
stat = sqrt(stat / fCount); stat = sqrt(stat / fCount);
else if (fFunctionId == WF__STDDEV_SAMP) else if (fFunctionId == WF__STDDEV_SAMP)

View File

@ -40,8 +40,8 @@ class WF_stats : public WindowFunctionType
static boost::shared_ptr<WindowFunctionType> makeFunction(int, const string&, int, WindowFunctionColumn*); static boost::shared_ptr<WindowFunctionType> makeFunction(int, const string&, int, WindowFunctionColumn*);
protected: protected:
long double fSum1; long double fMean;
long double fSum2; long double fM2sum;
uint64_t fCount; uint64_t fCount;
double fStats; double fStats;
}; };