Rework the stats_ext test

As suggested by Tom Lane, avoid printing specific estimated cost values, because they vary across architectures; instead, verify plan shapes (in this case, HashAggregate vs. GroupAggregate), as we do in other planner tests. We can now remove expected/stats_ext_1.out. Author: Tomas Vondra
2025-11-21 00:42:43 +03:00 · 2017-03-27 12:40:42 -03:00
parent 70ec3f1f8f
commit bed9ef5a16
3 changed files with 269 additions and 214 deletions
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -1,4 +1,10 @@
 -- Generic extended statistics support
 -- We will be checking execution plans without/with statistics, so
 -- let's make sure we get simple non-parallel plans. Also set the
 -- work_mem low so that we can use small amounts of data.
 SET max_parallel_workers = 0;
 SET max_parallel_workers_per_gather = 0;
 SET work_mem = '128kB';
 -- Ensure stats are dropped sanely
 CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
 CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
@@ -49,6 +55,67 @@ CREATE TABLE ndistinct (
    c INT,
    d INT
 );
 -- over-estimates when using only per-column statistics
 INSERT INTO ndistinct (a, b, c, filler1)
     SELECT i/100, i/100, i/100, cash_words((i/100)::money)
       FROM generate_series(1,30000) s(i);
 ANALYZE ndistinct;
 -- Group Aggregate, due to over-estimate of the number of groups
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
            QUERY PLAN             
 -----------------------------------
 GroupAggregate
   Group Key: a, b
   ->  Sort
         Sort Key: a, b
         ->  Seq Scan on ndistinct
 (5 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
            QUERY PLAN             
 -----------------------------------
 GroupAggregate
   Group Key: b, c
   ->  Sort
         Sort Key: b, c
         ->  Seq Scan on ndistinct
 (5 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
            QUERY PLAN             
 -----------------------------------
 GroupAggregate
   Group Key: a, b, c
   ->  Sort
         Sort Key: a, b, c
         ->  Seq Scan on ndistinct
 (5 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
            QUERY PLAN             
 -----------------------------------
 GroupAggregate
   Group Key: a, b, c, d
   ->  Sort
         Sort Key: a, b, c, d
         ->  Seq Scan on ndistinct
 (5 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
            QUERY PLAN             
 -----------------------------------
 GroupAggregate
   Group Key: b, c, d
   ->  Sort
         Sort Key: b, c, d
         ->  Seq Scan on ndistinct
 (5 rows)
 -- unknown column
 CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
 ERROR:  column "unknown_column" referenced in statistics does not exist
@@ -63,100 +130,184 @@ CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
 ERROR:  duplicate column name in statistics definition
 -- correct command
 CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
-- perfectly correlated groups
+ANALYZE ndistinct;
 SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 staenabled |                                          standistinct                                          
 ------------+------------------------------------------------------------------------------------------------
 {d}        | [{(b 3 4), 301.000000}, {(b 3 6), 301.000000}, {(b 4 6), 301.000000}, {(b 3 4 6), 301.000000}]
 (1 row)
 -- Hash Aggregate, thanks to estimates improved by the statistic
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
         QUERY PLAN          
 -----------------------------
 HashAggregate
   Group Key: a, b
   ->  Seq Scan on ndistinct
 (3 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
         QUERY PLAN          
 -----------------------------
 HashAggregate
   Group Key: b, c
   ->  Seq Scan on ndistinct
 (3 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
         QUERY PLAN          
 -----------------------------
 HashAggregate
   Group Key: a, b, c
   ->  Seq Scan on ndistinct
 (3 rows)
 -- last two plans keep using Group Aggregate, because 'd' is not covered
 -- by the statistic and while it's NULL-only we assume 200 values for it
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
            QUERY PLAN             
 -----------------------------------
 GroupAggregate
   Group Key: a, b, c, d
   ->  Sort
         Sort Key: a, b, c, d
         ->  Seq Scan on ndistinct
 (5 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
            QUERY PLAN             
 -----------------------------------
 GroupAggregate
   Group Key: b, c, d
   ->  Sort
         Sort Key: b, c, d
         ->  Seq Scan on ndistinct
 (5 rows)
 TRUNCATE TABLE ndistinct;
 -- under-estimates when using only per-column statistics
 INSERT INTO ndistinct (a, b, c, filler1)
-     SELECT i/100, i/100, i/100, cash_words(i::money)
+     SELECT mod(i,50), mod(i,51), mod(i,32),
            cash_words(mod(i,33)::int::money)
       FROM generate_series(1,10000) s(i);
 ANALYZE ndistinct;
 SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 staenabled |                                            standistinct                                            
------------+------------------------------------------------------------------------------------------------
+------------+----------------------------------------------------------------------------------------------------
- {d}        | [{(b 3 4), 101.000000}, {(b 3 6), 101.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 101.000000}]
+ {d}        | [{(b 3 4), 2550.000000}, {(b 3 6), 800.000000}, {(b 4 6), 1632.000000}, {(b 3 4 6), 10000.000000}]
 (1 row)
 -- plans using Group Aggregate, thanks to using correct esimates
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
            QUERY PLAN             
-----------------------------
+-----------------------------------
- HashAggregate
+ GroupAggregate
   Group Key: a, b
   ->  Sort
         Sort Key: a, b
         ->  Seq Scan on ndistinct
-(3 rows)
+(5 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
            QUERY PLAN             
-----------------------------
+-----------------------------------
- HashAggregate
+ GroupAggregate
   Group Key: a, b, c
   ->  Sort
         Sort Key: a, b, c
         ->  Seq Scan on ndistinct
-(3 rows)
+(5 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
            QUERY PLAN             
 -----------------------------------
 GroupAggregate
   Group Key: a, b, c, d
   ->  Sort
         Sort Key: a, b, c, d
         ->  Seq Scan on ndistinct
 (5 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
         QUERY PLAN          
 -----------------------------
 HashAggregate
-   Group Key: a, b, c, d
+   Group Key: b, c, d
   ->  Seq Scan on ndistinct
 (3 rows)
-TRUNCATE TABLE ndistinct;
+EXPLAIN (COSTS off)
-- partially correlated groups
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
-INSERT INTO ndistinct (a, b, c)
+         QUERY PLAN          
-     SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
+-----------------------------
-ANALYZE ndistinct;
+ HashAggregate
   Group Key: a, d
   ->  Seq Scan on ndistinct
 (3 rows)
 DROP STATISTICS s10;
 SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 staenabled | standistinct 
------------+------------------------------------------------------------------------------------------------
+------------+--------------
- {d}        | [{(b 3 4), 201.000000}, {(b 3 6), 201.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 201.000000}]
+(0 rows)
 (1 row)
-EXPLAIN
+-- dropping the statistics switches the plans to Hash Aggregate,
 -- due to under-estimates
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
         QUERY PLAN          
---------------------------------------------------------------------
+-----------------------------
- HashAggregate  (cost=230.00..232.01 rows=201 width=16)
+ HashAggregate
   Group Key: a, b
-   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=8)
+   ->  Seq Scan on ndistinct
 (3 rows)
-EXPLAIN
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
         QUERY PLAN          
----------------------------------------------------------------------
+-----------------------------
- HashAggregate  (cost=255.00..257.01 rows=201 width=20)
+ HashAggregate
   Group Key: a, b, c
-   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=12)
+   ->  Seq Scan on ndistinct
 (3 rows)
-EXPLAIN
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
         QUERY PLAN          
----------------------------------------------------------------------
+-----------------------------
- HashAggregate  (cost=280.00..290.00 rows=1000 width=24)
+ HashAggregate
   Group Key: a, b, c, d
-   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=16)
+   ->  Seq Scan on ndistinct
 (3 rows)
-EXPLAIN
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
         QUERY PLAN          
----------------------------------------------------------------------
+-----------------------------
- HashAggregate  (cost=255.00..265.00 rows=1000 width=20)
+ HashAggregate
   Group Key: b, c, d
-   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=12)
+   ->  Seq Scan on ndistinct
 (3 rows)
-EXPLAIN
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
         QUERY PLAN          
---------------------------------------------------------------------
+-----------------------------
- HashAggregate  (cost=230.00..240.00 rows=1000 width=16)
+ HashAggregate
   Group Key: a, d
-   ->  Seq Scan on ndistinct  (cost=0.00..155.00 rows=10000 width=8)
+   ->  Seq Scan on ndistinct
 (3 rows)
 DROP TABLE ndistinct;
--- a/src/test/regress/expected/stats_ext_1.out
+++ b/src/test/regress/expected/stats_ext_1.out
@@ -1,155 +0,0 @@
 -- Generic extended statistics support
 -- Ensure stats are dropped sanely
 CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
 CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
 DROP STATISTICS ab1_a_b_stats;
 CREATE SCHEMA regress_schema_2;
 CREATE STATISTICS regress_schema_2.ab1_a_b_stats ON (a, b) FROM ab1;
 DROP STATISTICS regress_schema_2.ab1_a_b_stats;
 -- Ensure statistics are dropped when columns are
 CREATE STATISTICS ab1_b_c_stats ON (b, c) FROM ab1;
 CREATE STATISTICS ab1_a_b_c_stats ON (a, b, c) FROM ab1;
 CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
 ALTER TABLE ab1 DROP COLUMN a;
 \d ab1
                Table "public.ab1"
 Column |  Type   | Collation | Nullable | Default 
 --------+---------+-----------+----------+---------
 b      | integer |           |          | 
 c      | integer |           |          | 
 Statistics:
    "public.ab1_b_c_stats" WITH (ndistinct) ON (b, c)
 DROP TABLE ab1;
 -- Ensure things work sanely with SET STATISTICS 0
 CREATE TABLE ab1 (a INTEGER, b INTEGER);
 ALTER TABLE ab1 ALTER a SET STATISTICS 0;
 INSERT INTO ab1 SELECT a, a%23 FROM generate_series(1, 1000) a;
 CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
 ANALYZE ab1;
 ERROR:  extended statistics could not be collected for column "a" of relation public.ab1
 HINT:  Consider ALTER TABLE "public"."ab1" ALTER "a" SET STATISTICS -1
 ALTER TABLE ab1 ALTER a SET STATISTICS -1;
 ANALYZE ab1;
 DROP TABLE ab1;
 -- n-distinct tests
 CREATE TABLE ndistinct (
    filler1 TEXT,
    filler2 NUMERIC,
    a INT,
    b INT,
    filler3 DATE,
    c INT,
    d INT
 );
 -- unknown column
 CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
 ERROR:  column "unknown_column" referenced in statistics does not exist
 -- single column
 CREATE STATISTICS s10 ON (a) FROM ndistinct;
 ERROR:  statistics require at least 2 columns
 -- single column, duplicated
 CREATE STATISTICS s10 ON (a,a) FROM ndistinct;
 ERROR:  duplicate column name in statistics definition
 -- two columns, one duplicated
 CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
 ERROR:  duplicate column name in statistics definition
 -- correct command
 CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
 -- perfectly correlated groups
 INSERT INTO ndistinct (a, b, c, filler1)
     SELECT i/100, i/100, i/100, cash_words(i::money)
       FROM generate_series(1,10000) s(i);
 ANALYZE ndistinct;
 SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 staenabled |                                          standistinct                                          
 ------------+------------------------------------------------------------------------------------------------
 {d}        | [{(b 3 4), 101.000000}, {(b 3 6), 101.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 101.000000}]
 (1 row)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
         QUERY PLAN          
 -----------------------------
 HashAggregate
   Group Key: a, b
   ->  Seq Scan on ndistinct
 (3 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
         QUERY PLAN          
 -----------------------------
 HashAggregate
   Group Key: a, b, c
   ->  Seq Scan on ndistinct
 (3 rows)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
         QUERY PLAN          
 -----------------------------
 HashAggregate
   Group Key: a, b, c, d
   ->  Seq Scan on ndistinct
 (3 rows)
 TRUNCATE TABLE ndistinct;
 -- partially correlated groups
 INSERT INTO ndistinct (a, b, c)
     SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
 ANALYZE ndistinct;
 SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 staenabled |                                          standistinct                                          
 ------------+------------------------------------------------------------------------------------------------
 {d}        | [{(b 3 4), 201.000000}, {(b 3 6), 201.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 201.000000}]
 (1 row)
 EXPLAIN
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
                             QUERY PLAN                              
 ---------------------------------------------------------------------
 HashAggregate  (cost=225.00..227.01 rows=201 width=16)
   Group Key: a, b
   ->  Seq Scan on ndistinct  (cost=0.00..150.00 rows=10000 width=8)
 (3 rows)
 EXPLAIN
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
                              QUERY PLAN                              
 ----------------------------------------------------------------------
 HashAggregate  (cost=250.00..252.01 rows=201 width=20)
   Group Key: a, b, c
   ->  Seq Scan on ndistinct  (cost=0.00..150.00 rows=10000 width=12)
 (3 rows)
 EXPLAIN
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
                              QUERY PLAN                              
 ----------------------------------------------------------------------
 HashAggregate  (cost=275.00..285.00 rows=1000 width=24)
   Group Key: a, b, c, d
   ->  Seq Scan on ndistinct  (cost=0.00..150.00 rows=10000 width=16)
 (3 rows)
 EXPLAIN
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
                              QUERY PLAN                              
 ----------------------------------------------------------------------
 HashAggregate  (cost=250.00..260.00 rows=1000 width=20)
   Group Key: b, c, d
   ->  Seq Scan on ndistinct  (cost=0.00..150.00 rows=10000 width=12)
 (3 rows)
 EXPLAIN
 SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
                             QUERY PLAN                              
 ---------------------------------------------------------------------
 HashAggregate  (cost=225.00..235.00 rows=1000 width=16)
   Group Key: a, d
   ->  Seq Scan on ndistinct  (cost=0.00..150.00 rows=10000 width=8)
 (3 rows)
 DROP TABLE ndistinct;
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1,5 +1,12 @@
 -- Generic extended statistics support
 -- We will be checking execution plans without/with statistics, so
 -- let's make sure we get simple non-parallel plans. Also set the
 -- work_mem low so that we can use small amounts of data.
 SET max_parallel_workers = 0;
 SET max_parallel_workers_per_gather = 0;
 SET work_mem = '128kB';
 -- Ensure stats are dropped sanely
 CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
 CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
@@ -43,6 +50,29 @@ CREATE TABLE ndistinct (
    d INT
 );
 -- over-estimates when using only per-column statistics
 INSERT INTO ndistinct (a, b, c, filler1)
     SELECT i/100, i/100, i/100, cash_words((i/100)::money)
       FROM generate_series(1,30000) s(i);
 ANALYZE ndistinct;
 -- Group Aggregate, due to over-estimate of the number of groups
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
 -- unknown column
 CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
@@ -58,9 +88,35 @@ CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
 -- correct command
 CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
-- perfectly correlated groups
+ANALYZE ndistinct;
 SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 -- Hash Aggregate, thanks to estimates improved by the statistic
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
 -- last two plans keep using Group Aggregate, because 'd' is not covered
 -- by the statistic and while it's NULL-only we assume 200 values for it
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
 TRUNCATE TABLE ndistinct;
 -- under-estimates when using only per-column statistics
 INSERT INTO ndistinct (a, b, c, filler1)
-     SELECT i/100, i/100, i/100, cash_words(i::money)
+     SELECT mod(i,50), mod(i,51), mod(i,32),
            cash_words(mod(i,33)::int::money)
       FROM generate_series(1,10000) s(i);
 ANALYZE ndistinct;
@@ -68,6 +124,7 @@ ANALYZE ndistinct;
 SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 -- plans using Group Aggregate, thanks to using correct esimates
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
@@ -77,30 +134,32 @@ EXPLAIN (COSTS off)
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
-TRUNCATE TABLE ndistinct;
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
-- partially correlated groups
+EXPLAIN (COSTS off)
-INSERT INTO ndistinct (a, b, c)
+ SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
     SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
-ANALYZE ndistinct;
+DROP STATISTICS s10;
 SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
-EXPLAIN
+-- dropping the statistics switches the plans to Hash Aggregate,
 -- due to under-estimates
 EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
-EXPLAIN
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
-EXPLAIN
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
-EXPLAIN
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
-EXPLAIN
+EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
 DROP TABLE ndistinct;