Handle constant inputs to corr() and related aggregates more precisely.

The SQL standard says that corr() and friends should return NULL in the mathematically-undefined case where all the inputs in one of the columns have the same value. We were checking that by seeing if the sums Sxx and Syy were zero, but that approach is very vulnerable to roundoff error: if a sum is close to zero but not exactly zero, we'd come out with a pretty silly non-NULL result.

Instead, directly track whether the inputs are all equal by remembering the common value in each column. Once we detect that a new input is different from before, represent that by storing NaN for the common value. (An objection to this scheme is that if the inputs are all NaN, we will consider that they were not all equal. But under IEEE float arithmetic rules, one NaN is never equal to another, so this behavior is arguably correct. Moreover it matches what we did before in such cases.) Then, leave the sums at their exact value of zero for as long as we haven't detected different input values.

This solution requires the aggregate transition state to contain 8 float values, not 6, which is not problematic, and it seems to add less than 1% to the aggregates' runtime, which seems acceptable.

While we're here, improve corr()'s final function to cope with overflow/underflow in the final calculation, and to clamp its result to [-1, 1] in case of roundoff error.

Although this is arguably a bug fix, it requires a catversion bump due to the change in aggregates' initial states, so it can't be back-patched.

Patch written by me, but many of the ideas are due to Dean Rasheed, who also did a great deal of testing.

Bug: #19340
Reported-by: Oleg Ivanov <o15611@gmail.com>
Author: Tom Lane <tgl@sss.pgh.pa.us>
Co-authored-by: Dean Rasheed <dean.a.rasheed@gmail.com>
Discussion: https://postgr.es/m/19340-6fb9f6637f562092@postgresql.org
2025-12-19 17:02:53 +03:00 · 2025-12-06 18:31:26 -05:00
parent 25303961d0
commit 6498287696
5 changed files with 248 additions and 67 deletions
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -515,6 +515,62 @@ SELECT covar_pop(1::float8,'nan'::float8), covar_samp(3::float8,'nan'::float8);
       NaN |           
 (1 row)

+-- check some cases that formerly had poor roundoff-error behavior
+SELECT corr(0.09, g), regr_r2(0.09, g)
+  FROM generate_series(1, 30) g;
+ corr | regr_r2 
+------+---------
+      |       1
+(1 row)
+
+SELECT corr(g, 0.09), regr_r2(g, 0.09), regr_slope(g, 0.09), regr_intercept(g, 0.09)
+  FROM generate_series(1, 30) g;
+ corr | regr_r2 | regr_slope | regr_intercept 
+------+---------+------------+----------------
+      |         |            |               
+(1 row)
+
+SELECT corr(1.3 + g * 1e-16, 1.3 + g * 1e-16)
+  FROM generate_series(1, 3) g;
+ corr 
+------
+     
+(1 row)
+
+SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
+  FROM generate_series(1, 3) g;
+ corr 
+------
+    1
+(1 row)
+
+SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
+  FROM generate_series(1, 30) g;
+ corr 
+------
+    1
+(1 row)
+
+-- these examples pose definitional questions for NaN inputs,
+-- which we resolve by saying that an all-NaN input column is not all equal
+SELECT corr(g, 'NaN') FROM generate_series(1, 30) g;
+ corr 
+------
+  NaN
+(1 row)
+
+SELECT corr(0.1, 'NaN') FROM generate_series(1, 30) g;
+ corr 
+------
+     
+(1 row)
+
+SELECT corr('NaN', 'NaN') FROM generate_series(1, 30) g;
+ corr 
+------
+  NaN
+(1 row)
+
 -- test accum and combine functions directly
 CREATE TABLE regr_test (x float8, y float8);
 INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200);
@@ -538,10 +594,10 @@ SELECT float8_accum('{4,140,2900}'::float8[], 100);
 {5,240,6280}
 (1 row)

-SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100);
-      float8_regr_accum       
------------------------------
- {5,240,6280,1490,95080,8680}
+SELECT float8_regr_accum('{4,140,2900,1290,83075,15050,100,0}'::float8[], 200, 100);
+           float8_regr_accum           
+---------------------------------------
+ {5,240,2900,1490,95080,15050,100,NaN}
 (1 row)

 SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
@@ -576,25 +632,25 @@ SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]);
 {5,240,6280}
 (1 row)

-SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
-                           '{0,0,0,0,0,0}'::float8[]);
-    float8_regr_combine    
---------------------------
- {3,60,200,750,20000,2000}
+SELECT float8_regr_combine('{3,60,200,750,20000,2000,1,NaN}'::float8[],
+                           '{0,0,0,0,0,0,0,0}'::float8[]);
+       float8_regr_combine       
+---------------------------------
+ {3,60,200,750,20000,2000,1,NaN}
 (1 row)

-SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[],
-                           '{2,180,200,740,57800,-3400}'::float8[]);
-     float8_regr_combine     
-----------------------------
- {2,180,200,740,57800,-3400}
+SELECT float8_regr_combine('{0,0,0,0,0,0,0,0}'::float8[],
+                           '{2,180,200,740,57800,-3400,NaN,1}'::float8[]);
+        float8_regr_combine        
+-----------------------------------
+ {2,180,200,740,57800,-3400,NaN,1}
 (1 row)

-SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
-                           '{2,180,200,740,57800,-3400}'::float8[]);
-     float8_regr_combine      
------------------------------
- {5,240,6280,1490,95080,8680}
+SELECT float8_regr_combine('{3,60,200,750,20000,2000,7,8}'::float8[],
+                           '{2,180,200,740,57800,-3400,7,9}'::float8[]);
+        float8_regr_combine         
+------------------------------------
+ {5,240,6280,1490,95080,8680,7,NaN}
 (1 row)

 DROP TABLE regr_test;
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -140,6 +140,24 @@ SELECT covar_pop(1::float8,2::float8), covar_samp(3::float8,4::float8);
 SELECT covar_pop(1::float8,'inf'::float8), covar_samp(3::float8,'inf'::float8);
 SELECT covar_pop(1::float8,'nan'::float8), covar_samp(3::float8,'nan'::float8);

+-- check some cases that formerly had poor roundoff-error behavior
+SELECT corr(0.09, g), regr_r2(0.09, g)
+  FROM generate_series(1, 30) g;
+SELECT corr(g, 0.09), regr_r2(g, 0.09), regr_slope(g, 0.09), regr_intercept(g, 0.09)
+  FROM generate_series(1, 30) g;
+SELECT corr(1.3 + g * 1e-16, 1.3 + g * 1e-16)
+  FROM generate_series(1, 3) g;
+SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
+  FROM generate_series(1, 3) g;
+SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
+  FROM generate_series(1, 30) g;
+
+-- these examples pose definitional questions for NaN inputs,
+-- which we resolve by saying that an all-NaN input column is not all equal
+SELECT corr(g, 'NaN') FROM generate_series(1, 30) g;
+SELECT corr(0.1, 'NaN') FROM generate_series(1, 30) g;
+SELECT corr('NaN', 'NaN') FROM generate_series(1, 30) g;
+
 -- test accum and combine functions directly
 CREATE TABLE regr_test (x float8, y float8);
 INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200);
@@ -148,7 +166,7 @@ FROM regr_test WHERE x IN (10,20,30,80);
 SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
 FROM regr_test;
 SELECT float8_accum('{4,140,2900}'::float8[], 100);
-SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100);
+SELECT float8_regr_accum('{4,140,2900,1290,83075,15050,100,0}'::float8[], 200, 100);
 SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
 FROM regr_test WHERE x IN (10,20,30);
 SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
@@ -156,12 +174,12 @@ FROM regr_test WHERE x IN (80,100);
 SELECT float8_combine('{3,60,200}'::float8[], '{0,0,0}'::float8[]);
 SELECT float8_combine('{0,0,0}'::float8[], '{2,180,200}'::float8[]);
 SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]);
-SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
-                           '{0,0,0,0,0,0}'::float8[]);
-SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[],
-                           '{2,180,200,740,57800,-3400}'::float8[]);
-SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
-                           '{2,180,200,740,57800,-3400}'::float8[]);
+SELECT float8_regr_combine('{3,60,200,750,20000,2000,1,NaN}'::float8[],
+                           '{0,0,0,0,0,0,0,0}'::float8[]);
+SELECT float8_regr_combine('{0,0,0,0,0,0,0,0}'::float8[],
+                           '{2,180,200,740,57800,-3400,NaN,1}'::float8[]);
+SELECT float8_regr_combine('{3,60,200,750,20000,2000,7,8}'::float8[],
+                           '{2,180,200,740,57800,-3400,7,9}'::float8[]);
 DROP TABLE regr_test;

 -- test count, distinct