1
0
mirror of https://github.com/postgres/postgres.git synced 2025-05-08 07:21:33 +03:00

Teach planner how to estimate rows for timestamp generate_series

This provides the planner with row estimates for
generate_series(TIMESTAMP, TIMESTAMP, INTERVAL),
generate_series(TIMESTAMPTZ, TIMESTAMPTZ, INTERVAL) and
generate_series(TIMESTAMPTZ, TIMESTAMPTZ, INTERVAL, TEXT) when the input
parameter values can be estimated during planning.

Author: David Rowley
Reviewed-by: jian he <jian.universality@gmail.com>
Discussion: https://postgr.es/m/CAApHDvrBE%3D%2BASo_sGYmQJ3GvO8GPvX5yxXhRS%3Dt_ybd4odFkhQ%40mail.gmail.com
This commit is contained in:
David Rowley 2024-07-09 09:54:59 +12:00
parent 5193ca8e15
commit 036bdcec9f
4 changed files with 291 additions and 3 deletions

View File

@ -27,6 +27,7 @@
#include "funcapi.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
#include "parser/scansup.h"
@ -6680,6 +6681,93 @@ generate_series_timestamptz_at_zone(PG_FUNCTION_ARGS)
return generate_series_timestamptz_internal(fcinfo);
}
/*
* Planner support function for generate_series(timestamp, timestamp, interval)
*/
Datum
generate_series_timestamp_support(PG_FUNCTION_ARGS)
{
Node *rawreq = (Node *) PG_GETARG_POINTER(0);
Node *ret = NULL;
if (IsA(rawreq, SupportRequestRows))
{
/* Try to estimate the number of rows returned */
SupportRequestRows *req = (SupportRequestRows *) rawreq;
if (is_funcclause(req->node)) /* be paranoid */
{
List *args = ((FuncExpr *) req->node)->args;
Node *arg1,
*arg2,
*arg3;
/* We can use estimated argument values here */
arg1 = estimate_expression_value(req->root, linitial(args));
arg2 = estimate_expression_value(req->root, lsecond(args));
arg3 = estimate_expression_value(req->root, lthird(args));
/*
* If any argument is constant NULL, we can safely assume that
* zero rows are returned. Otherwise, if they're all non-NULL
* constants, we can calculate the number of rows that will be
* returned.
*/
if ((IsA(arg1, Const) && ((Const *) arg1)->constisnull) ||
(IsA(arg2, Const) && ((Const *) arg2)->constisnull) ||
(IsA(arg3, Const) && ((Const *) arg3)->constisnull))
{
req->rows = 0;
ret = (Node *) req;
}
else if (IsA(arg1, Const) && IsA(arg2, Const) && IsA(arg3, Const))
{
Timestamp start,
finish;
Interval *step;
Datum diff;
double dstep;
int64 dummy;
start = DatumGetTimestamp(((Const *) arg1)->constvalue);
finish = DatumGetTimestamp(((Const *) arg2)->constvalue);
step = DatumGetIntervalP(((Const *) arg3)->constvalue);
/*
* Perform some prechecks which could cause timestamp_mi to
* raise an ERROR. It's much better to just return some
* default estimate than error out in a support function.
*/
if (!TIMESTAMP_NOT_FINITE(start) && !TIMESTAMP_NOT_FINITE(finish) &&
!pg_sub_s64_overflow(finish, start, &dummy))
{
diff = DirectFunctionCall2(timestamp_mi,
TimestampGetDatum(finish),
TimestampGetDatum(start));
#define INTERVAL_TO_MICROSECONDS(i) ((((double) (i)->month * DAYS_PER_MONTH + (i)->day)) * USECS_PER_DAY + (i)->time)
dstep = INTERVAL_TO_MICROSECONDS(step);
/* This equation works for either sign of step */
if (dstep != 0.0)
{
Interval *idiff = DatumGetIntervalP(diff);
double ddiff = INTERVAL_TO_MICROSECONDS(idiff);
req->rows = floor(ddiff / dstep + 1.0);
ret = (Node *) req;
}
#undef INTERVAL_TO_MICROSECONDS
}
}
}
}
PG_RETURN_POINTER(ret);
}
/* timestamp_at_local()
* timestamptz_at_local()
*

View File

@ -8331,19 +8331,25 @@
prorettype => 'numeric', proargtypes => 'numeric numeric',
prosrc => 'generate_series_numeric' },
{ oid => '938', descr => 'non-persistent series generator',
proname => 'generate_series', prorows => '1000', proretset => 't',
proname => 'generate_series', prorows => '1000',
prosupport => 'generate_series_timestamp_support', proretset => 't',
prorettype => 'timestamp', proargtypes => 'timestamp timestamp interval',
prosrc => 'generate_series_timestamp' },
{ oid => '939', descr => 'non-persistent series generator',
proname => 'generate_series', prorows => '1000', proretset => 't',
proname => 'generate_series', prorows => '1000',
prosupport => 'generate_series_timestamp_support', proretset => 't',
provolatile => 's', prorettype => 'timestamptz',
proargtypes => 'timestamptz timestamptz interval',
prosrc => 'generate_series_timestamptz' },
{ oid => '6274', descr => 'non-persistent series generator',
proname => 'generate_series', prorows => '1000', proretset => 't',
proname => 'generate_series', prorows => '1000',
prosupport => 'generate_series_timestamp_support', proretset => 't',
prorettype => 'timestamptz',
proargtypes => 'timestamptz timestamptz interval text',
prosrc => 'generate_series_timestamptz_at_zone' },
{ oid => '8402', descr => 'planner support for generate_series',
proname => 'generate_series_timestamp_support', prorettype => 'internal',
proargtypes => 'internal', prosrc => 'generate_series_timestamp_support' },
# boolean aggregates
{ oid => '2515', descr => 'aggregate transition function',

View File

@ -2,6 +2,43 @@
\getenv libdir PG_LIBDIR
\getenv dlsuffix PG_DLSUFFIX
\set regresslib :libdir '/regress' :dlsuffix
-- Function to assist with verifying EXPLAIN which includes costs. A series
-- of bool flags allows control over which portions are masked out
CREATE FUNCTION explain_mask_costs(query text, do_analyze bool,
hide_costs bool, hide_row_est bool, hide_width bool) RETURNS setof text
LANGUAGE plpgsql AS
$$
DECLARE
ln text;
analyze_str text;
BEGIN
IF do_analyze = true THEN
analyze_str := 'on';
ELSE
analyze_str := 'off';
END IF;
FOR ln IN
EXECUTE format('explain (analyze %s, costs on, summary off, timing off) %s',
analyze_str, query)
LOOP
IF hide_costs = true THEN
ln := regexp_replace(ln, 'cost=\d+\.\d\d\.\.\d+\.\d\d', 'cost=N..N');
END IF;
IF hide_row_est = true THEN
-- don't use 'g' so that we leave the actual rows intact
ln := regexp_replace(ln, 'rows=\d+', 'rows=N');
END IF;
IF hide_width = true THEN
ln := regexp_replace(ln, 'width=\d+', 'width=N');
END IF;
RETURN NEXT ln;
END LOOP;
END;
$$;
--
-- num_nulls()
--
@ -594,6 +631,78 @@ SELECT * FROM tenk1 a JOIN my_gen_series(1,10) g ON a.unique1 = g;
Index Cond: (unique1 = g.g)
(4 rows)
--
-- Test the SupportRequestRows support function for generate_series_timestamp()
--
-- Ensure the row estimate matches the actual rows
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day') g(s);$$,
true, true, false, true);
explain_mask_costs
------------------------------------------------------------------------------------------
Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1)
(1 row)
-- As above but with generate_series_timestamp
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMP '2024-02-01', TIMESTAMP '2024-03-01', INTERVAL '1 day') g(s);$$,
true, true, false, true);
explain_mask_costs
------------------------------------------------------------------------------------------
Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1)
(1 row)
-- As above but with generate_series_timestamptz_at_zone()
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day', 'UTC') g(s);$$,
true, true, false, true);
explain_mask_costs
------------------------------------------------------------------------------------------
Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1)
(1 row)
-- Ensure the estimated and actual row counts match when the range isn't
-- evenly divisible by the step
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '7 day') g(s);$$,
true, true, false, true);
explain_mask_costs
----------------------------------------------------------------------------------------
Function Scan on generate_series g (cost=N..N rows=5 width=N) (actual rows=5 loops=1)
(1 row)
-- Ensure the estimates match when step is decreasing
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '-1 day') g(s);$$,
true, true, false, true);
explain_mask_costs
------------------------------------------------------------------------------------------
Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1)
(1 row)
-- Ensure an empty range estimates 1 row
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '1 day') g(s);$$,
true, true, false, true);
explain_mask_costs
----------------------------------------------------------------------------------------
Function Scan on generate_series g (cost=N..N rows=1 width=N) (actual rows=0 loops=1)
(1 row)
-- Ensure we get the default row estimate for infinity values
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '-infinity', TIMESTAMPTZ 'infinity', INTERVAL '1 day') g(s);$$,
false, true, false, true);
explain_mask_costs
-------------------------------------------------------------------
Function Scan on generate_series g (cost=N..N rows=1000 width=N)
(1 row)
-- Ensure the row estimate behaves correctly when step size is zero.
-- We expect generate_series_timestamp() to throw the error rather than in
-- the support function.
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '0 day') g(s);
ERROR: step size cannot equal zero
-- Test functions for control data
SELECT count(*) > 0 AS ok FROM pg_control_checkpoint();
ok
@ -706,3 +815,4 @@ SELECT pg_column_toast_chunk_id(a) IS NULL,
(1 row)
DROP TABLE test_chunk_id;
DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool);

View File

@ -4,6 +4,44 @@
\set regresslib :libdir '/regress' :dlsuffix
-- Function to assist with verifying EXPLAIN which includes costs. A series
-- of bool flags allows control over which portions are masked out
CREATE FUNCTION explain_mask_costs(query text, do_analyze bool,
hide_costs bool, hide_row_est bool, hide_width bool) RETURNS setof text
LANGUAGE plpgsql AS
$$
DECLARE
ln text;
analyze_str text;
BEGIN
IF do_analyze = true THEN
analyze_str := 'on';
ELSE
analyze_str := 'off';
END IF;
FOR ln IN
EXECUTE format('explain (analyze %s, costs on, summary off, timing off) %s',
analyze_str, query)
LOOP
IF hide_costs = true THEN
ln := regexp_replace(ln, 'cost=\d+\.\d\d\.\.\d+\.\d\d', 'cost=N..N');
END IF;
IF hide_row_est = true THEN
-- don't use 'g' so that we leave the actual rows intact
ln := regexp_replace(ln, 'rows=\d+', 'rows=N');
END IF;
IF hide_width = true THEN
ln := regexp_replace(ln, 'width=\d+', 'width=N');
END IF;
RETURN NEXT ln;
END LOOP;
END;
$$;
--
-- num_nulls()
--
@ -224,6 +262,51 @@ SELECT * FROM tenk1 a JOIN my_gen_series(1,1000) g ON a.unique1 = g;
EXPLAIN (COSTS OFF)
SELECT * FROM tenk1 a JOIN my_gen_series(1,10) g ON a.unique1 = g;
--
-- Test the SupportRequestRows support function for generate_series_timestamp()
--
-- Ensure the row estimate matches the actual rows
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day') g(s);$$,
true, true, false, true);
-- As above but with generate_series_timestamp
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMP '2024-02-01', TIMESTAMP '2024-03-01', INTERVAL '1 day') g(s);$$,
true, true, false, true);
-- As above but with generate_series_timestamptz_at_zone()
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day', 'UTC') g(s);$$,
true, true, false, true);
-- Ensure the estimated and actual row counts match when the range isn't
-- evenly divisible by the step
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '7 day') g(s);$$,
true, true, false, true);
-- Ensure the estimates match when step is decreasing
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '-1 day') g(s);$$,
true, true, false, true);
-- Ensure an empty range estimates 1 row
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '1 day') g(s);$$,
true, true, false, true);
-- Ensure we get the default row estimate for infinity values
SELECT explain_mask_costs($$
SELECT * FROM generate_series(TIMESTAMPTZ '-infinity', TIMESTAMPTZ 'infinity', INTERVAL '1 day') g(s);$$,
false, true, false, true);
-- Ensure the row estimate behaves correctly when step size is zero.
-- We expect generate_series_timestamp() to throw the error rather than in
-- the support function.
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '0 day') g(s);
-- Test functions for control data
SELECT count(*) > 0 AS ok FROM pg_control_checkpoint();
SELECT count(*) > 0 AS ok FROM pg_control_init();
@ -273,3 +356,4 @@ SELECT pg_column_toast_chunk_id(a) IS NULL,
pg_column_toast_chunk_id(b) IN (SELECT chunk_id FROM pg_toast.:toastrel)
FROM test_chunk_id;
DROP TABLE test_chunk_id;
DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool);