From 12915a58eec962f407a6c38ce2bf08a48dde57b5 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 25 Dec 2023 00:52:42 +0200 Subject: [PATCH] Enhance checkpointer restartpoint statistics This commit introduces enhancements to the pg_stat_checkpointer view by adding three new columns: restartpoints_timed, restartpoints_req, and restartpoints_done. These additions aim to improve the visibility and monitoring of restartpoint processes on replicas. Previously, it was challenging to differentiate between successful and failed restartpoint requests. This limitation arises because restartpoints on replicas are dependent on checkpoint records from the primary, and cannot occur more frequently than these checkpoints. The new columns allow for clear distinction and tracking of restartpoint requests, their triggers, and successful completions. This enhancement aids database administrators and developers in better understanding and diagnosing issues related to restartpoint behavior, particularly in scenarios where restartpoint requests may fail. System catalog is changed. Catversion is bumped. Discussion: https://postgr.es/m/99b2ccd1-a77a-962a-0837-191cdf56c2b9%40inbox.ru Author: Anton A. 
Melnikov Reviewed-by: Kyotaro Horiguchi, Alexander Korotkov --- doc/src/sgml/monitoring.sgml | 27 +++++++++++++ doc/src/sgml/wal.sgml | 39 ++++++++++++++++--- src/backend/catalog/system_views.sql | 3 ++ src/backend/postmaster/checkpointer.c | 27 ++++++++++++- .../utils/activity/pgstat_checkpointer.c | 6 +++ src/backend/utils/adt/pgstatfuncs.c | 18 +++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 15 +++++++ src/include/pgstat.h | 3 ++ src/test/regress/expected/rules.out | 3 ++ 10 files changed, 134 insertions(+), 9 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 4f8058d8b1b..b804eb8b5ef 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -2982,6 +2982,33 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + restartpoints_timed bigint + + + Number of scheduled restartpoints due to timeout or after a failed attempt to perform it + + + + + + restartpoints_req bigint + + + Number of requested restartpoints + + + + + + restartpoints_done bigint + + + Number of restartpoints that have been performed + + + write_time double precision diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index 2ed4eb659db..05e2a8f8be9 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -655,14 +655,41 @@ directory. Restartpoints can't be performed more frequently than checkpoints on the primary because restartpoints can only be performed at checkpoint records. - A restartpoint is triggered when a checkpoint record is reached if at - least checkpoint_timeout seconds have passed since the last - restartpoint, or if WAL size is about to exceed - max_wal_size. However, because of limitations on when a - restartpoint can be performed, max_wal_size is often exceeded - during recovery, by up to one checkpoint cycle's worth of WAL. + A restartpoint can be demanded by a schedule or by an external request. 
+ The restartpoints_timed counter in the + pg_stat_checkpointer + view counts the first ones while the restartpoints_req + the second. + A restartpoint is triggered by schedule when a checkpoint record is reached + if at least checkpoint_timeout seconds have passed since + the last performed restartpoint or when the previous attempt to perform + the restartpoint has failed. In the last case, the next restartpoint + will be scheduled in 15 seconds. + A restartpoint is triggered by request due to similar reasons like checkpoint + but mostly if WAL size is about to exceed max_wal_size. + However, because of limitations on when a restartpoint can be performed, + max_wal_size is often exceeded during recovery, + by up to one checkpoint cycle's worth of WAL. (max_wal_size is never a hard limit anyway, so you should always leave plenty of headroom to avoid running out of disk space.) + The restartpoints_done counter in the + pg_stat_checkpointer + view counts the restartpoints that have really been performed. + + + + In some cases, when the WAL size on the primary increases quickly, + for instance during massive INSERT, + the restartpoints_req counter on the standby + may demonstrate a peak growth. + This occurs because requests to create a new restartpoint due to increased + XLOG consumption cannot be performed because the safe checkpoint record + since the last restartpoint has not yet been replayed on the standby. + This behavior is normal and does not lead to an increase in system resource + consumption. + Only the restartpoints_done + counter among the restartpoint-related ones indicates that noticeable system + resources have been spent. 
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 11d18ed9dd6..058fc47c919 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1141,6 +1141,9 @@ CREATE VIEW pg_stat_checkpointer AS SELECT pg_stat_get_checkpointer_num_timed() AS num_timed, pg_stat_get_checkpointer_num_requested() AS num_requested, + pg_stat_get_checkpointer_restartpoints_timed() AS restartpoints_timed, + pg_stat_get_checkpointer_restartpoints_requested() AS restartpoints_req, + pg_stat_get_checkpointer_restartpoints_performed() AS restartpoints_done, pg_stat_get_checkpointer_write_time() AS write_time, pg_stat_get_checkpointer_sync_time() AS sync_time, pg_stat_get_checkpointer_buffers_written() AS buffers_written, diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index dc2da5a2cd8..67ecb177e7e 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -340,6 +340,8 @@ CheckpointerMain(void) pg_time_t now; int elapsed_secs; int cur_timeout; + bool chkpt_or_rstpt_requested = false; + bool chkpt_or_rstpt_timed = false; /* Clear any already-pending wakeups */ ResetLatch(MyLatch); @@ -358,7 +360,7 @@ CheckpointerMain(void) if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) { do_checkpoint = true; - PendingCheckpointerStats.num_requested++; + chkpt_or_rstpt_requested = true; } /* @@ -372,7 +374,7 @@ CheckpointerMain(void) if (elapsed_secs >= CheckPointTimeout) { if (!do_checkpoint) - PendingCheckpointerStats.num_timed++; + chkpt_or_rstpt_timed = true; do_checkpoint = true; flags |= CHECKPOINT_CAUSE_TIME; } @@ -408,6 +410,24 @@ CheckpointerMain(void) if (flags & CHECKPOINT_END_OF_RECOVERY) do_restartpoint = false; + if (chkpt_or_rstpt_timed) + { + chkpt_or_rstpt_timed = false; + if (do_restartpoint) + PendingCheckpointerStats.restartpoints_timed++; + else + PendingCheckpointerStats.num_timed++; + } + + if 
(chkpt_or_rstpt_requested) + { + chkpt_or_rstpt_requested = false; + if (do_restartpoint) + PendingCheckpointerStats.restartpoints_requested++; + else + PendingCheckpointerStats.num_requested++; + } + /* * We will warn if (a) too soon since last checkpoint (whatever * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag @@ -471,6 +491,9 @@ CheckpointerMain(void) * checkpoints happen at a predictable spacing. */ last_checkpoint_time = now; + + if (do_restartpoint) + PendingCheckpointerStats.restartpoints_performed++; } else { diff --git a/src/backend/utils/activity/pgstat_checkpointer.c b/src/backend/utils/activity/pgstat_checkpointer.c index 301a0bc7bd3..6ee258f2402 100644 --- a/src/backend/utils/activity/pgstat_checkpointer.c +++ b/src/backend/utils/activity/pgstat_checkpointer.c @@ -49,6 +49,9 @@ pgstat_report_checkpointer(void) #define CHECKPOINTER_ACC(fld) stats_shmem->stats.fld += PendingCheckpointerStats.fld CHECKPOINTER_ACC(num_timed); CHECKPOINTER_ACC(num_requested); + CHECKPOINTER_ACC(restartpoints_timed); + CHECKPOINTER_ACC(restartpoints_requested); + CHECKPOINTER_ACC(restartpoints_performed); CHECKPOINTER_ACC(write_time); CHECKPOINTER_ACC(sync_time); CHECKPOINTER_ACC(buffers_written); @@ -116,6 +119,9 @@ pgstat_checkpointer_snapshot_cb(void) #define CHECKPOINTER_COMP(fld) pgStatLocal.snapshot.checkpointer.fld -= reset.fld; CHECKPOINTER_COMP(num_timed); CHECKPOINTER_COMP(num_requested); + CHECKPOINTER_COMP(restartpoints_timed); + CHECKPOINTER_COMP(restartpoints_requested); + CHECKPOINTER_COMP(restartpoints_performed); CHECKPOINTER_COMP(write_time); CHECKPOINTER_COMP(sync_time); CHECKPOINTER_COMP(buffers_written); diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 0cea320c00e..e65cbf41e9f 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1193,6 +1193,24 @@ pg_stat_get_checkpointer_num_requested(PG_FUNCTION_ARGS) 
PG_RETURN_INT64(pgstat_fetch_stat_checkpointer()->num_requested); } +Datum +pg_stat_get_checkpointer_restartpoints_timed(PG_FUNCTION_ARGS) +{ + PG_RETURN_INT64(pgstat_fetch_stat_checkpointer()->restartpoints_timed); +} + +Datum +pg_stat_get_checkpointer_restartpoints_requested(PG_FUNCTION_ARGS) +{ + PG_RETURN_INT64(pgstat_fetch_stat_checkpointer()->restartpoints_requested); +} + +Datum +pg_stat_get_checkpointer_restartpoints_performed(PG_FUNCTION_ARGS) +{ + PG_RETURN_INT64(pgstat_fetch_stat_checkpointer()->restartpoints_performed); +} + Datum pg_stat_get_checkpointer_buffers_written(PG_FUNCTION_ARGS) { diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index ae1bee42a9a..2fd601add0f 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202312211 +#define CATALOG_VERSION_NO 202312251 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index b8b26c263db..9052f5262a2 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5721,6 +5721,21 @@ proname => 'pg_stat_get_checkpointer_num_requested', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_checkpointer_num_requested' }, +{ oid => '8743', + descr => 'statistics: number of timed restartpoints started by the checkpointer', + proname => 'pg_stat_get_checkpointer_restartpoints_timed', provolatile => 's', + proparallel => 'r', prorettype => 'int8', proargtypes => '', + prosrc => 'pg_stat_get_checkpointer_restartpoints_timed' }, +{ oid => '8744', + descr => 'statistics: number of backend requested restartpoints started by the checkpointer', + proname => 'pg_stat_get_checkpointer_restartpoints_requested', provolatile => 's', + proparallel => 'r', prorettype => 'int8', proargtypes => '', + prosrc => 'pg_stat_get_checkpointer_restartpoints_requested' }, +{ oid => '8745', + descr => 
'statistics: number of backend performed restartpoints', + proname => 'pg_stat_get_checkpointer_restartpoints_performed', provolatile => 's', + proparallel => 'r', prorettype => 'int8', proargtypes => '', + prosrc => 'pg_stat_get_checkpointer_restartpoints_performed' }, { oid => '2771', descr => 'statistics: number of buffers written by the checkpointer', proname => 'pg_stat_get_checkpointer_buffers_written', provolatile => 's', diff --git a/src/include/pgstat.h b/src/include/pgstat.h index fc93d0d731d..ab91b3b367d 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -262,6 +262,9 @@ typedef struct PgStat_CheckpointerStats { PgStat_Counter num_timed; PgStat_Counter num_requested; + PgStat_Counter restartpoints_timed; + PgStat_Counter restartpoints_requested; + PgStat_Counter restartpoints_performed; PgStat_Counter write_time; /* times in milliseconds */ PgStat_Counter sync_time; PgStat_Counter buffers_written; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 05070393b99..f645e8486bf 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1822,6 +1822,9 @@ pg_stat_bgwriter| SELECT pg_stat_get_bgwriter_buf_written_clean() AS buffers_cle pg_stat_get_bgwriter_stat_reset_time() AS stats_reset; pg_stat_checkpointer| SELECT pg_stat_get_checkpointer_num_timed() AS num_timed, pg_stat_get_checkpointer_num_requested() AS num_requested, + pg_stat_get_checkpointer_restartpoints_timed() AS restartpoints_timed, + pg_stat_get_checkpointer_restartpoints_requested() AS restartpoints_req, + pg_stat_get_checkpointer_restartpoints_performed() AS restartpoints_done, pg_stat_get_checkpointer_write_time() AS write_time, pg_stat_get_checkpointer_sync_time() AS sync_time, pg_stat_get_checkpointer_buffers_written() AS buffers_written,