diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1bc8d6555b9..2d6dbc65612 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -472,8 +472,9 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
                                ForkNumber forkNum,
                                BlockNumber blockNum,
                                BufferAccessStrategy strategy,
-                               bool *foundPtr);
-static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+                               bool *foundPtr, IOContext *io_context);
+static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
+                        IOObject io_object, IOContext io_context);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
                                        ForkNumber forkNum,
                                        BlockNumber nForkBlock,
@@ -814,6 +815,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     BufferDesc *bufHdr;
     Block       bufBlock;
     bool        found;
+    IOContext   io_context;
+    IOObject    io_object;
     bool        isExtend;
     bool        isLocalBuf = SmgrIsTemp(smgr);
 
@@ -846,7 +849,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
     if (isLocalBuf)
     {
-        bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
+        /*
+         * LocalBufferAlloc() will set the io_context to IOCONTEXT_NORMAL. We
+         * do not use a BufferAccessStrategy for I/O of temporary tables.
+         * However, in some cases, the "strategy" may not be NULL, so we can't
+         * rely on IOContextForStrategy() to set the right IOContext for us.
+         * This may happen in cases like CREATE TEMPORARY TABLE AS...
+         */
+        bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found, &io_context);
         if (found)
             pgBufferUsage.local_blks_hit++;
         else if (isExtend)
@@ -862,7 +872,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
          * not currently in memory.
          */
         bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
-                             strategy, &found);
+                             strategy, &found, &io_context);
         if (found)
             pgBufferUsage.shared_blks_hit++;
         else if (isExtend)
@@ -977,7 +987,16 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
      */
     Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));  /* spinlock not needed */
 
-    bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+    if (isLocalBuf)
+    {
+        bufBlock = LocalBufHdrGetBlock(bufHdr);
+        io_object = IOOBJECT_TEMP_RELATION;
+    }
+    else
+    {
+        bufBlock = BufHdrGetBlock(bufHdr);
+        io_object = IOOBJECT_RELATION;
+    }
 
     if (isExtend)
     {
@@ -986,6 +1005,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         /* don't set checksum for all-zero page */
         smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
 
+        pgstat_count_io_op(io_object, io_context, IOOP_EXTEND);
+
         /*
          * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
         * although we're essentially performing a write. At least on linux
@@ -1013,6 +1034,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
         smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
 
+        pgstat_count_io_op(io_object, io_context, IOOP_READ);
+
         if (track_io_timing)
         {
             INSTR_TIME_SET_CURRENT(io_time);
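The two pgstat_count_io_op() calls above charge exactly one operation per smgrread()/smgrextend() call to a (IOObject, IOContext, IOOp) cell. A minimal standalone sketch of that accounting shape, assuming simplified stand-in enums and a count_io_op() helper; both are this sketch's own, not the patch's pgstat API:

#include <stdio.h>

/* Simplified stand-ins for the IO accounting enums the patch relies on;
 * the *_NUM sentinels here are assumptions for this sketch only. */
typedef enum {IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOBJECT_NUM} IOObject;
typedef enum {IOCONTEXT_BULKREAD, IOCONTEXT_BULKWRITE, IOCONTEXT_NORMAL,
              IOCONTEXT_VACUUM, IOCONTEXT_NUM} IOContext;
typedef enum {IOOP_EVICT, IOOP_EXTEND, IOOP_FSYNC, IOOP_READ, IOOP_REUSE,
              IOOP_WRITE, IOOP_NUM} IOOp;

/* One counter per (object, context, op) cell. */
static long io_counts[IOOBJECT_NUM][IOCONTEXT_NUM][IOOP_NUM];

static void
count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
{
    io_counts[io_object][io_context][io_op]++;
}

int
main(void)
{
    /* a shared-buffer miss: one read, as in the smgrread() path above */
    count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_READ);
    /* a relation extension, as in the smgrextend() path above */
    count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_EXTEND);

    printf("normal reads: %ld\n",
           io_counts[IOOBJECT_RELATION][IOCONTEXT_NORMAL][IOOP_READ]);
    return 0;
}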
@@ -1106,14 +1129,19 @@
  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
  * we keep it for simplicity in ReadBuffer.
  *
+ * io_context is passed as an output parameter to avoid calling
+ * IOContextForStrategy() when there is a shared buffers hit and no IO
+ * statistics need be captured.
+ *
  * No locks are held either at entry or exit.
  */
 static BufferDesc *
 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
             BlockNumber blockNum,
             BufferAccessStrategy strategy,
-            bool *foundPtr)
+            bool *foundPtr, IOContext *io_context)
 {
+    bool        from_ring;
     BufferTag   newTag;         /* identity of requested block */
     uint32      newHash;        /* hash value for newTag */
     LWLock     *newPartitionLock;   /* buffer partition lock for it */
@@ -1165,8 +1193,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         {
             /*
              * If we get here, previous attempts to read the buffer must
-             * have failed ... but we shall bravely try again.
+             * have failed ... but we shall bravely try again. Set
+             * io_context since we will in fact need to count an IO
+             * operation.
              */
+            *io_context = IOContextForStrategy(strategy);
             *foundPtr = false;
         }
     }
@@ -1180,6 +1211,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
      */
     LWLockRelease(newPartitionLock);
 
+    *io_context = IOContextForStrategy(strategy);
+
     /* Loop here in case we have to try another victim buffer */
     for (;;)
     {
@@ -1193,7 +1226,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
          * Select a victim buffer.  The buffer is returned with its header
          * spinlock still held!
          */
-        buf = StrategyGetBuffer(strategy, &buf_state);
+        buf = StrategyGetBuffer(strategy, &buf_state, &from_ring);
 
         Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
 
@@ -1247,7 +1280,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                 UnlockBufHdr(buf, buf_state);
 
                 if (XLogNeedsFlush(lsn) &&
-                    StrategyRejectBuffer(strategy, buf))
+                    StrategyRejectBuffer(strategy, buf, from_ring))
                 {
                     /* Drop lock/pin and loop around for another buffer */
                     LWLockRelease(BufferDescriptorGetContentLock(buf));
@@ -1262,7 +1295,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                               smgr->smgr_rlocator.locator.dbOid,
                                               smgr->smgr_rlocator.locator.relNumber);
 
-            FlushBuffer(buf, NULL);
+            FlushBuffer(buf, NULL, IOOBJECT_RELATION, *io_context);
             LWLockRelease(BufferDescriptorGetContentLock(buf));
 
             ScheduleBufferTagForWriteback(&BackendWritebackContext,
@@ -1443,6 +1476,28 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
     LWLockRelease(newPartitionLock);
 
+    if (oldFlags & BM_VALID)
+    {
+        /*
+         * When a BufferAccessStrategy is in use, blocks evicted from shared
+         * buffers are counted as IOOP_EVICT in the corresponding context
+         * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
+         * strategy in two cases: 1) while initially claiming buffers for the
+         * strategy ring 2) to replace an existing strategy ring buffer
+         * because it is pinned or in use and cannot be reused.
+         *
+         * Blocks evicted from buffers already in the strategy ring are
+         * counted as IOOP_REUSE in the corresponding strategy context.
+         *
+         * At this point, we can accurately count evictions and reuses,
+         * because we have successfully claimed the valid buffer. Previously,
+         * we may have been forced to release the buffer due to concurrent
+         * pinners or erroring out.
+         */
+        pgstat_count_io_op(IOOBJECT_RELATION, *io_context,
+                           from_ring ? IOOP_REUSE : IOOP_EVICT);
+    }
+
     /*
      * Buffer contents are currently invalid.  Try to obtain the right to
      * start I/O.  If StartBufferIO returns false, then someone else managed
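The eviction/reuse hunk above deliberately defers classification until the valid victim buffer has actually been claimed, and then branches purely on from_ring. That rule in isolation, as a compilable sketch (classify_replacement() is hypothetical; the patch does this inline in the pgstat_count_io_op() call):

#include <stdbool.h>
#include <stdio.h>

typedef enum {IOOP_EVICT, IOOP_REUSE} IOOp;

static IOOp
classify_replacement(bool from_ring)
{
    /* a ring buffer reused in place vs. a non-ring buffer evicted,
     * including buffers claimed while first populating the ring */
    return from_ring ? IOOP_REUSE : IOOP_EVICT;
}

int
main(void)
{
    printf("%s\n", classify_replacement(true) == IOOP_REUSE ? "reuse" : "evict");
    return 0;
}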
@@ -2563,7 +2618,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
     PinBuffer_Locked(bufHdr);
     LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 
-    FlushBuffer(bufHdr, NULL);
+    FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 
     LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 
@@ -2813,7 +2868,8 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
  * as the second parameter.  If not, pass NULL.
  */
 static void
-FlushBuffer(BufferDesc *buf, SMgrRelation reln)
+FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+            IOContext io_context)
 {
     XLogRecPtr  recptr;
     ErrorContextCallback errcallback;
@@ -2907,6 +2963,26 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
               bufToWrite,
               false);
 
+    /*
+     * When a strategy is in use, only flushes of dirty buffers already in the
+     * strategy ring are counted as strategy writes (IOCONTEXT
+     * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
+     * statistics tracking.
+     *
+     * If a shared buffer initially added to the ring must be flushed before
+     * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
+     *
+     * If a shared buffer which was added to the ring later because the
+     * current strategy buffer is pinned or in use or because all strategy
+     * buffers were dirty and rejected (for BAS_BULKREAD operations only)
+     * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
+     * (from_ring will be false).
+     *
+     * When a strategy is not in use, the write can only be a "regular" write
+     * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
+     */
+    pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
+
     if (track_io_timing)
     {
         INSTR_TIME_SET_CURRENT(io_time);
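The long comment in FlushBuffer() boils down to a small decision rule. A standalone paraphrase, assuming a hypothetical write_context() helper (the patch instead passes the already-chosen io_context down from the caller):

#include <stdbool.h>
#include <stdio.h>

typedef enum {IOCONTEXT_BULKREAD, IOCONTEXT_BULKWRITE, IOCONTEXT_NORMAL,
              IOCONTEXT_VACUUM} IOContext;

static IOContext
write_context(bool strategy_in_use, bool from_ring, IOContext strategy_context)
{
    /* only the flush of a buffer already in the ring is charged to the
     * strategy's context; everything else is a "regular" write */
    if (strategy_in_use && from_ring)
        return strategy_context;
    return IOCONTEXT_NORMAL;
}

int
main(void)
{
    /* VACUUM flushing a ring buffer it dirtied earlier */
    printf("%d\n", write_context(true, true, IOCONTEXT_VACUUM) == IOCONTEXT_VACUUM);
    /* VACUUM flushing a shared buffer it only just added to its ring */
    printf("%d\n", write_context(true, false, IOCONTEXT_VACUUM) == IOCONTEXT_NORMAL);
    return 0;
}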
@@ -3549,6 +3625,8 @@ FlushRelationBuffers(Relation rel)
                 buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
                 pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
+                pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
+
                 /* Pop the error context stack */
                 error_context_stack = errcallback.previous;
             }
@@ -3581,7 +3659,7 @@ FlushRelationBuffers(Relation rel)
         {
             PinBuffer_Locked(bufHdr);
             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-            FlushBuffer(bufHdr, RelationGetSmgr(rel));
+            FlushBuffer(bufHdr, RelationGetSmgr(rel), IOOBJECT_RELATION, IOCONTEXT_NORMAL);
             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr);
         }
@@ -3679,7 +3757,7 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
         {
             PinBuffer_Locked(bufHdr);
             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-            FlushBuffer(bufHdr, srelent->srel);
+            FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr);
         }
@@ -3889,7 +3967,7 @@ FlushDatabaseBuffers(Oid dbid)
         {
             PinBuffer_Locked(bufHdr);
             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-            FlushBuffer(bufHdr, NULL);
+            FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr);
         }
@@ -3916,7 +3994,7 @@ FlushOneBuffer(Buffer buffer)
 
     Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
-    FlushBuffer(bufHdr, NULL);
+    FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7dec35801cb..c690d5f15f2 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
  */
 #include "postgres.h"
 
+#include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
@@ -81,12 +82,6 @@ typedef struct BufferAccessStrategyData
      */
     int         current;
 
-    /*
-     * True if the buffer just returned by StrategyGetBuffer had been in the
-     * ring already.
-     */
-    bool        current_was_in_ring;
-
     /*
      * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
      * have not yet selected a buffer for this ring slot.  For allocation
@@ -198,13 +193,15 @@ have_free_buffer(void)
  *  return the buffer with the buffer header spinlock still held.
  */
 BufferDesc *
-StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
+StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
 {
     BufferDesc *buf;
     int         bgwprocno;
     int         trycounter;
     uint32      local_buf_state;    /* to avoid repeated (de-)referencing */
 
+    *from_ring = false;
+
     /*
      * If given a strategy object, see whether it can select a buffer.  We
      * assume strategy objects don't need buffer_strategy_lock.
@@ -213,7 +210,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
     {
         buf = GetBufferFromRing(strategy, buf_state);
         if (buf != NULL)
+        {
+            *from_ring = true;
             return buf;
+        }
     }
 
     /*
@@ -602,7 +602,7 @@ FreeAccessStrategy(BufferAccessStrategy strategy)
 
 /*
  * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
- *      ring is empty.
+ *      ring is empty / not usable.
  *
  * The bufhdr spin lock is held on the returned buffer.
  */
@@ -625,10 +625,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
      */
     bufnum = strategy->buffers[strategy->current];
     if (bufnum == InvalidBuffer)
-    {
-        strategy->current_was_in_ring = false;
         return NULL;
-    }
 
     /*
      * If the buffer is pinned we cannot use it under any circumstances.
@@ -644,7 +641,6 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
     if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
         && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
     {
-        strategy->current_was_in_ring = true;
         *buf_state = local_buf_state;
         return buf;
     }
@@ -654,7 +650,6 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
      * Tell caller to allocate a new buffer with the normal allocation
      * strategy.  He'll then replace this ring element via AddBufferToRing.
      */
-    strategy->current_was_in_ring = false;
     return NULL;
 }
 
@@ -670,6 +665,39 @@ AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
     strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
 }
 
+/*
+ * Utility function returning the IOContext of a given BufferAccessStrategy's
+ * strategy ring.
+ */
+IOContext
+IOContextForStrategy(BufferAccessStrategy strategy)
+{
+    if (!strategy)
+        return IOCONTEXT_NORMAL;
+
+    switch (strategy->btype)
+    {
+        case BAS_NORMAL:
+
+            /*
+             * Currently, GetAccessStrategy() returns NULL for
+             * BufferAccessStrategyType BAS_NORMAL, so this case is
+             * unreachable.
+             */
+            pg_unreachable();
+            return IOCONTEXT_NORMAL;
+        case BAS_BULKREAD:
+            return IOCONTEXT_BULKREAD;
+        case BAS_BULKWRITE:
+            return IOCONTEXT_BULKWRITE;
+        case BAS_VACUUM:
+            return IOCONTEXT_VACUUM;
+    }
+
+    elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
+    pg_unreachable();
+}
+
 /*
  * StrategyRejectBuffer -- consider rejecting a dirty buffer
  *
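IOContextForStrategy() is what keys strategy I/O into per-ring-type buckets. A self-contained mirror of just that mapping, with a pointer standing in for the (opaque) strategy object and a context_for() name that is this sketch's own:

#include <stdio.h>

typedef enum {BAS_NORMAL, BAS_BULKREAD, BAS_BULKWRITE, BAS_VACUUM} BufferAccessStrategyType;
typedef enum {IOCONTEXT_BULKREAD, IOCONTEXT_BULKWRITE, IOCONTEXT_NORMAL,
              IOCONTEXT_VACUUM} IOContext;

/* NULL stands in for "no strategy", as GetAccessStrategy() returns NULL
 * for BAS_NORMAL; each real ring type gets its own context. */
static IOContext
context_for(const BufferAccessStrategyType *btype)
{
    if (btype == NULL)
        return IOCONTEXT_NORMAL;
    switch (*btype)
    {
        case BAS_BULKREAD:
            return IOCONTEXT_BULKREAD;
        case BAS_BULKWRITE:
            return IOCONTEXT_BULKWRITE;
        case BAS_VACUUM:
            return IOCONTEXT_VACUUM;
        default:
            return IOCONTEXT_NORMAL;
    }
}

int
main(void)
{
    BufferAccessStrategyType vac = BAS_VACUUM;

    printf("%d %d\n",
           context_for(NULL) == IOCONTEXT_NORMAL,
           context_for(&vac) == IOCONTEXT_VACUUM);  /* prints 1 1 */
    return 0;
}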
@@ -682,14 +710,14 @@ AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
  *  if this buffer should be written and re-used.
  */
 bool
-StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
+StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
 {
     /* We only do this in bulkread mode */
     if (strategy->btype != BAS_BULKREAD)
         return false;
 
     /* Don't muck with behavior of normal buffer-replacement strategy */
-    if (!strategy->current_was_in_ring ||
+    if (!from_ring ||
         strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
         return false;
 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 71e2a5fecd5..5325ddb663d 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -18,6 +18,7 @@
 #include "access/parallel.h"
 #include "catalog/catalog.h"
 #include "executor/instrument.h"
+#include "pgstat.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "utils/guc_hooks.h"
@@ -107,7 +108,7 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
  */
 BufferDesc *
 LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
-                 bool *foundPtr)
+                 bool *foundPtr, IOContext *io_context)
 {
     BufferTag   newTag;         /* identity of requested block */
     LocalBufferLookupEnt *hresult;
@@ -127,6 +128,14 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
     hresult = (LocalBufferLookupEnt *)
         hash_search(LocalBufHash, &newTag, HASH_FIND, NULL);
 
+    /*
+     * IO operations on local buffers are only done in IOCONTEXT_NORMAL. Set
+     * io_context here (instead of after a buffer hit would have returned) for
+     * convenience since we don't have to worry about the overhead of calling
+     * IOContextForStrategy().
+     */
+    *io_context = IOCONTEXT_NORMAL;
+
     if (hresult)
     {
         b = hresult->id;
@@ -230,6 +239,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
         buf_state &= ~BM_DIRTY;
         pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 
+        pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
         pgBufferUsage.local_blks_written++;
     }
 
@@ -255,6 +265,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
         ClearBufferTag(&bufHdr->tag);
         buf_state &= ~(BM_VALID | BM_TAG_VALID);
         pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+        pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EVICT);
     }
 
     hresult = (LocalBufferLookupEnt *)
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 60c9905eff9..8da813600c0 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -983,6 +983,15 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
     {
         MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
 
+        /*
+         * fsyncs done through mdimmedsync() should be tracked in a separate
+         * IOContext from those done through mdsyncfiletag() to differentiate
+         * between unavoidable client backend fsyncs (e.g. those done during
+         * index build) and those which ideally would have been done by the
+         * checkpointer. Since other IO operations bypassing the buffer
+         * manager could also be tracked in such an IOContext, wait until
+         * these are also tracked to track immediate fsyncs.
+         */
         if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
             ereport(data_sync_elevel(ERROR),
                     (errcode_for_file_access(),
@@ -1021,6 +1030,19 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
     if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
     {
+        /*
+         * We have no way of knowing if the current IOContext is
+         * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
+         * point, so count the fsync as being in the IOCONTEXT_NORMAL
+         * IOContext. This is probably okay, because the number of backend
+         * fsyncs doesn't say anything about the efficacy of the
+         * BufferAccessStrategy. And counting both fsyncs done in
+         * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
+         * IOCONTEXT_NORMAL is likely clearer when investigating the number of
+         * backend fsyncs.
+         */
+        pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
+
         ereport(DEBUG1,
                 (errmsg_internal("could not forward fsync request because request queue is full")));
 
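register_dirty_segment() can only count the fsync once it knows the forwarding attempt failed, and it always charges IOCONTEXT_NORMAL, as the comment explains. A sketch of that fallback shape, with hypothetical names (backend_fsyncs, register_dirty_segment_sketch()) standing in for the patch's pgstat and sync machinery:

#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-in for pgstat_count_io_op(..., IOOP_FSYNC) */
static long backend_fsyncs;

/*
 * If the checkpointer's request queue cannot absorb the sync request, the
 * backend fsyncs the segment itself, and that fsync is charged to
 * IOCONTEXT_NORMAL no matter what strategy the backend is running under.
 */
static void
register_dirty_segment_sketch(bool request_forwarded)
{
    if (!request_forwarded)
    {
        backend_fsyncs++;       /* counted before the DEBUG1 message */
        /* ... the real code then issues the FileSync() itself ... */
    }
    /* else: the checkpointer fsyncs later; mdsyncfiletag() counts it */
}

int
main(void)
{
    register_dirty_segment_sketch(false);   /* queue full: backend does it */
    printf("backend fsyncs: %ld\n", backend_fsyncs);
    return 0;
}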
@@ -1410,6 +1432,8 @@ mdsyncfiletag(const FileTag *ftag, char *path)
     if (need_to_close)
         FileClose(file);
 
+    pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
+
     errno = save_errno;
     return result;
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index ed8aa2519c0..0b448147407 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -15,6 +15,7 @@
 #ifndef BUFMGR_INTERNALS_H
 #define BUFMGR_INTERNALS_H
 
+#include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf.h"
 #include "storage/bufmgr.h"
@@ -391,11 +392,12 @@ extern void IssuePendingWritebacks(WritebackContext *context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
 
 /* freelist.c */
+extern IOContext IOContextForStrategy(BufferAccessStrategy bas);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
-                                     uint32 *buf_state);
+                                     uint32 *buf_state, bool *from_ring);
 extern void StrategyFreeBuffer(BufferDesc *buf);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
-                                 BufferDesc *buf);
+                                 BufferDesc *buf, bool from_ring);
 extern int  StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
 extern void StrategyNotifyBgWriter(int bgwprocno);
 
@@ -417,7 +419,7 @@ extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
                                                 ForkNumber forkNum,
                                                 BlockNumber blockNum);
 extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
-                                    BlockNumber blockNum, bool *foundPtr);
+                                    BlockNumber blockNum, bool *foundPtr, IOContext *io_context);
 extern void MarkLocalBufferDirty(Buffer buffer);
 extern void DropRelationLocalBuffers(RelFileLocator rlocator,
                                      ForkNumber forkNum,
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 33eadbc1291..b8a18b8081f 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -23,7 +23,12 @@
 
 typedef void *Block;
 
-/* Possible arguments for GetAccessStrategy() */
+/*
+ * Possible arguments for GetAccessStrategy().
+ *
+ * If adding a new BufferAccessStrategyType, also add a new IOContext so
+ * IO statistics using this strategy are tracked.
+ */
 typedef enum BufferAccessStrategyType
 {
     BAS_NORMAL,                 /* Normal random access */
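The new comment on BufferAccessStrategyType could in principle be enforced at compile time. A hypothetical C11 cross-check, assuming *_NUM_TYPES sentinel values that this patch does not itself define:

#include <assert.h>

typedef enum {BAS_NORMAL, BAS_BULKREAD, BAS_BULKWRITE, BAS_VACUUM,
              BAS_NUM_TYPES} BufferAccessStrategyType;
typedef enum {IOCONTEXT_BULKREAD, IOCONTEXT_BULKWRITE, IOCONTEXT_NORMAL,
              IOCONTEXT_VACUUM, IOCONTEXT_NUM_TYPES} IOContext;

/* Trips at compile time if a strategy type is added without a matching
 * IOContext; the two sets currently map one-to-one. */
static_assert((int) BAS_NUM_TYPES == (int) IOCONTEXT_NUM_TYPES,
              "add an IOContext for each new BufferAccessStrategyType");

int
main(void)
{
    return 0;
}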