1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-28 23:42:10 +03:00

Re-add GUC track_wal_io_timing

This commit is a rework of 2421e9a51d, about which Andres Freund has
raised some concerns as it is valuable to have both track_io_timing and
track_wal_io_timing in some cases, as the WAL write and fsync paths can
be a major bottleneck for some workloads.  Hence, it can be relevant to
not calculate the WAL timings in environments where pg_test_timing
performs poorly while capturing some IO data under track_io_timing for
the non-WAL IO paths.  The opposite can be also true: it should be
possible to disable the non-WAL timings and enable the WAL timings (the
previous GUC setups allowed this possibility).

track_wal_io_timing is added back in this commit, controlling if WAL
timings should be calculated in pg_stat_io for the read, fsync and write
paths, as done previously with pg_stat_wal.  pg_stat_wal previously
tracked only the sync and write parts (now removed), read stats is new
data tracked in pg_stat_io, all three are aggregated if
track_wal_io_timing is enabled.  The read part matters during recovery
or if a XLogReader is used.

Extra note: more control over if the types of timings calculated in
pg_stat_io could be done with a GUC that lists pairs of (IOObject,IOOp).

Reported-by: Andres Freund <andres@anarazel.de>
Author: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Co-authored-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/3opf2wh2oljco6ldyqf7ukabw3jijnnhno6fjb4mlu6civ5h24@fcwmhsgmlmzu
This commit is contained in:
Michael Paquier
2025-02-26 09:49:59 +09:00
parent a5cbdeb98a
commit 6c349d83b6
15 changed files with 96 additions and 32 deletions

View File

@ -134,6 +134,7 @@ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int wal_retrieve_retry_interval = 5000;
int max_slot_wal_keep_size_mb = -1;
int wal_decode_buffer_size = 512 * 1024;
bool track_wal_io_timing = false;
#ifdef WAL_DEBUG
bool XLOG_DEBUG = false;
@ -2437,7 +2438,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
/*
* Measure I/O timing to write WAL data, for pg_stat_io.
*/
start = pgstat_prepare_io_time();
start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = pg_pwrite(openLogFile, from, nleft, startoffset);
@ -3248,7 +3249,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
errmsg("could not create file \"%s\": %m", tmppath)));
/* Measure I/O timing when initializing segment */
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
save_errno = 0;
@ -3310,7 +3311,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
}
/* Measure I/O timing when flushing segment */
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
if (pg_fsync(fd) != 0)
@ -8744,7 +8745,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
/*
* Measure I/O timing to sync the WAL file for pg_stat_io.
*/
start = pgstat_prepare_io_time();
start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
switch (wal_sync_method)

View File

@ -1557,7 +1557,7 @@ WALRead(XLogReaderState *state,
#ifndef FRONTEND
/* Measure I/O timing when reading segment */
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
#endif

View File

@ -3401,7 +3401,7 @@ retry:
readOff = targetPageOff;
/* Measure I/O timing when reading segment */
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_wal_io_timing);
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);

View File

@ -1509,7 +1509,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
}
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
1, io_buffers_len * BLCKSZ);
@ -2402,7 +2402,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
}
}
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
/*
* Note: if smgrzeroextend fails, we will end up with buffers that are
@ -3857,7 +3857,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
*/
bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
/*
* bufToWrite is either the shared buffer or a copy, as appropriate.
@ -4459,7 +4459,7 @@ FlushRelationBuffers(Relation rel)
PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
smgrwrite(srel,
BufTagGetForkNum(&bufHdr->tag),
@ -5905,7 +5905,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
sort_pending_writebacks(wb_context->pending_writebacks,
wb_context->nr_pending);
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
/*
* Coalesce neighbouring writes, but nothing else. For that we iterate

View File

@ -244,7 +244,7 @@ GetLocalVictimBuffer(void)
PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
/* And write... */
smgrwrite(oreln,
@ -414,7 +414,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
}
}
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
/* actually extend relation */
smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);

View File

@ -1391,7 +1391,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
ereport(DEBUG1,
(errmsg_internal("could not forward fsync request because request queue is full")));
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
ereport(data_sync_elevel(ERROR),
@ -1790,7 +1790,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
need_to_close = true;
}
io_start = pgstat_prepare_io_time();
io_start = pgstat_prepare_io_time(track_io_timing);
/* Sync the file. */
result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);

View File

@ -24,6 +24,7 @@
#include "postgres.h"
#include "access/xlog.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
#include "utils/pgstat_internal.h"
@ -43,7 +44,7 @@ void
pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context,
IOOp io_op, instr_time io_time)
{
Assert(track_io_timing);
Assert(track_io_timing || track_wal_io_timing);
if (!pgstat_tracks_backend_bktype(MyBackendType))
return;

View File

@ -83,20 +83,22 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
}
/*
* Initialize the internal timing for an IO operation.
* Initialize the internal timing for an IO operation, depending on an
* IO timing GUC.
*/
instr_time
pgstat_prepare_io_time(void)
pgstat_prepare_io_time(bool track_io_guc)
{
instr_time io_start;
if (track_io_timing)
if (track_io_guc)
INSTR_TIME_SET_CURRENT(io_start);
else
{
/*
* There is no need to set io_start when an IO timing GUC is disabled,
* still initialize it to zero to avoid compiler warnings.
* There is no need to set io_start when an IO timing GUC is disabled.
* Initialize it to zero to avoid compiler warnings and to let
* pgstat_count_io_op_time() know that timings should be ignored.
*/
INSTR_TIME_SET_ZERO(io_start);
}
@ -119,7 +121,7 @@ void
pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
instr_time start_time, uint32 cnt, uint64 bytes)
{
if (track_io_timing)
if (!INSTR_TIME_IS_ZERO(start_time))
{
instr_time io_time;

View File

@ -1501,6 +1501,15 @@ struct config_bool ConfigureNamesBool[] =
false,
NULL, NULL, NULL
},
{
{"track_wal_io_timing", PGC_SUSET, STATS_CUMULATIVE,
gettext_noop("Collects timing statistics for WAL I/O activity."),
NULL
},
&track_wal_io_timing,
false,
NULL, NULL, NULL
},
{
{"update_process_title", PGC_SUSET, PROCESS_TITLE,

View File

@ -641,6 +641,7 @@
#track_counts = on
#track_cost_delay_timing = off
#track_io_timing = off
#track_wal_io_timing = off
#track_functions = none # none, pl, all
#stats_fetch_consistency = cache # cache, none, snapshot

View File

@ -54,6 +54,7 @@ extern PGDLLIMPORT char *wal_consistency_checking_string;
extern PGDLLIMPORT bool log_checkpoints;
extern PGDLLIMPORT int CommitDelay;
extern PGDLLIMPORT int CommitSiblings;
extern PGDLLIMPORT bool track_wal_io_timing;
extern PGDLLIMPORT int wal_decode_buffer_size;
extern PGDLLIMPORT int CheckPointSegments;

View File

@ -564,7 +564,7 @@ extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
BackendType bktype);
extern void pgstat_count_io_op(IOObject io_object, IOContext io_context,
IOOp io_op, uint32 cnt, uint64 bytes);
extern instr_time pgstat_prepare_io_time(void);
extern instr_time pgstat_prepare_io_time(bool track_io_guc);
extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context,
IOOp io_op, instr_time start_time,
uint32 cnt, uint64 bytes);