/*-------------------------------------------------------------------------
*
* freelist.c
* routines for managing the buffer pool's replacement strategy.
*
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/storage/buffer/freelist.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"
#define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var))))
/*
* The shared freelist control information.
*/
typedef struct
{
/* Spinlock: protects the values below */
slock_t buffer_strategy_lock;
/*
* clock-sweep hand: index of next buffer to consider grabbing. Note that
* this isn't a concrete buffer - we only ever increase the value. So, to
* get an actual buffer, it needs to be used modulo NBuffers.
*/
pg_atomic_uint32 nextVictimBuffer;
/*
* Statistics. These counters should be wide enough that they can't
* overflow during a single bgwriter cycle.
*/
uint32 completePasses; /* Complete cycles of the clock-sweep */
pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */
/*
* Bgworker process to be notified upon activity or -1 if none. See
* StrategyNotifyBgWriter.
*/
int bgwprocno;
} BufferStrategyControl;
/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;
/*
* Private (non-shared) state for managing a ring of shared buffers to re-use.
* This is currently the only kind of BufferAccessStrategy object, but someday
* we might have more kinds.
*/
typedef struct BufferAccessStrategyData
{
/* Overall strategy type */
BufferAccessStrategyType btype;
/* Number of elements in buffers[] array */
int nbuffers;
/*
* Index of the "current" slot in the ring, ie, the one most recently
* returned by GetBufferFromRing.
*/
int current;
/*
* Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
* have not yet selected a buffer for this ring slot. For allocation
* simplicity this is palloc'd together with the fixed fields of the
* struct.
*/
Buffer buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;
/* Prototypes for internal functions */
static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
uint32 *buf_state);
static void AddBufferToRing(BufferAccessStrategy strategy,
BufferDesc *buf);
/*
* ClockSweepTick - Helper routine for StrategyGetBuffer()
*
* Move the clock hand one buffer ahead of its current position and return the
* id of the buffer now under the hand.
*/
static inline uint32
ClockSweepTick(void)
{
uint32 victim;
/*
* Atomically move the hand ahead one buffer. If several processes are
* doing this concurrently, buffers can be returned slightly out of
* apparent order.
*/
victim =
pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
if (victim >= NBuffers)
{
uint32 originalVictim = victim;
/* always wrap what we look up in BufferDescriptors */
victim = victim % NBuffers;
/*
* If we're the one that just caused a wraparound, force
* completePasses to be incremented while holding the spinlock. We
* need the spinlock so StrategySyncStart() can return a consistent
* value consisting of nextVictimBuffer and completePasses.
*/
if (victim == 0)
{
uint32 expected;
uint32 wrapped;
bool success = false;
expected = originalVictim + 1;
while (!success)
{
/*
* Acquire the spinlock while increasing completePasses. That
* allows other readers to read nextVictimBuffer and
* completePasses in a consistent manner which is required for
* StrategySyncStart(). In theory delaying the increment
* could lead to an overflow of nextVictimBuffer, but that's
* highly unlikely and wouldn't be particularly harmful.
*/
SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
wrapped = expected % NBuffers;
success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
&expected, wrapped);
if (success)
StrategyControl->completePasses++;
SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}
}
}
return victim;
}
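/*
* To make the wraparound handling above concrete, a worked example (the
* numbers are purely illustrative): assume NBuffers = 16384, i.e. 128MB of
* shared buffers with 8kB blocks. A backend whose fetch-add returns 16384
* computes victim = 16384 % 16384 = 0, so it is the one that caused the
* wraparound. It then tries to CAS nextVictimBuffer from expected = 16385
* down to wrapped = 16385 % 16384 = 1 while holding buffer_strategy_lock,
* and bumps completePasses on success. If other backends ticked
* concurrently, the CAS fails, expected is refreshed with the counter's
* current value, and the loop retries with a recomputed wrapped value.
*/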
/*
* StrategyGetBuffer
*
* Called by the bufmgr to get the next candidate buffer to use in
* GetVictimBuffer(). The only hard requirement GetVictimBuffer() has is that
* the selected buffer must not currently be pinned by anyone.
*
* strategy is a BufferAccessStrategy object, or NULL for default strategy.
*
* It is the caller's responsibility to ensure that buffer ownership can be
* tracked via TrackNewBufferPin().
*
* The buffer is pinned and marked as owned, using TrackNewBufferPin(),
* before returning.
*/
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
{
BufferDesc *buf;
int bgwprocno;
int trycounter;
*from_ring = false;
/*
* If given a strategy object, see whether it can select a buffer. We
* assume strategy objects don't need buffer_strategy_lock.
*/
if (strategy != NULL)
{
buf = GetBufferFromRing(strategy, buf_state);
if (buf != NULL)
{
*from_ring = true;
return buf;
}
}
/*
* If asked, we need to wake the bgwriter. Since we don't want to rely on
* a spinlock for this, we force a single read from shared memory and then
* set the latch based on that value. We have to go to this length because
* otherwise the compiler might reread bgwprocno from memory while/after we
* check it, and the value might have been reset in the meantime.
*
* This can possibly set the latch of the wrong process if the bgwriter
* dies at the wrong moment. But since PGPROC->procLatch is never
* deallocated, the worst consequence is that we set the latch of some
* arbitrary process.
*/
bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
if (bgwprocno != -1)
{
/* reset bgwprocno first, before setting the latch */
StrategyControl->bgwprocno = -1;
/*
* We don't acquire ProcArrayLock here, which is slightly icky. It's
* actually fine because procLatch is never freed, so at worst we might
* set the wrong process' (or no process') latch.
*/
SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
}
/*
* We count buffer allocation requests so that the bgwriter can estimate
* the rate of buffer consumption. Note that buffers recycled by a
* strategy object are intentionally not counted here.
*/
pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
/* Use the "clock sweep" algorithm to find a free buffer */
trycounter = NBuffers;
for (;;)
{
uint32 old_buf_state;
uint32 local_buf_state;
buf = GetBufferDescriptor(ClockSweepTick());
/*
* Check whether the buffer can be used and pin it if so. Do this
* using a CAS loop, to avoid having to lock the buffer header.
*/
old_buf_state = pg_atomic_read_u32(&buf->state);
for (;;)
{
local_buf_state = old_buf_state;
/*
* If the buffer is pinned or has a nonzero usage_count, we cannot
* use it; decrement the usage_count (unless pinned) and keep
* scanning.
*/
if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
{
if (--trycounter == 0)
{
/*
* We've scanned all the buffers without making any state
* changes, so all the buffers are pinned (or were when we
* looked at them). We could hope that someone will free
* one eventually, but it's probably better to fail than
* to risk getting stuck in an infinite loop.
*/
elog(ERROR, "no unpinned buffers available");
}
break;
}
/* See equivalent code in PinBuffer() */
if (unlikely(local_buf_state & BM_LOCKED))
{
old_buf_state = WaitBufHdrUnlocked(buf);
continue;
}
if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
{
local_buf_state -= BUF_USAGECOUNT_ONE;
if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
local_buf_state))
{
trycounter = NBuffers;
break;
}
}
else
{
/* pin the buffer if the CAS succeeds */
local_buf_state += BUF_REFCOUNT_ONE;
if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
local_buf_state))
{
/* Found a usable buffer */
if (strategy != NULL)
AddBufferToRing(strategy, buf);
*buf_state = local_buf_state;
TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
return buf;
}
}
}
}
}
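/*
* To illustrate the loop above with the usual usage-count limit of
* BM_MAX_USAGE_COUNT (5): an unpinned buffer that was recently used up to
* that maximum is swept past five times, each visit decrementing its
* usage_count, before a sixth visit can pin it as the victim. Pinned
* buffers are never victimized; each one the hand passes merely costs one
* tick of trycounter.
*/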
/*
* StrategySyncStart -- tell BgBufferSync where to start syncing
*
* The result is the buffer index of the best buffer to sync first.
* BgBufferSync() will proceed circularly around the buffer array from there.
*
* In addition, we return the completed-pass count (which is effectively
* the higher-order bits of nextVictimBuffer) and the count of recent buffer
* allocs if non-NULL pointers are passed. The alloc count is reset after
* being read.
*/
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
uint32 nextVictimBuffer;
int result;
SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
result = nextVictimBuffer % NBuffers;
if (complete_passes)
{
*complete_passes = StrategyControl->completePasses;
/*
* Additionally add the number of wraparounds that happened before
* completePasses could be incremented. Cf. ClockSweepTick().
*/
*complete_passes += nextVictimBuffer / NBuffers;
}
if (num_buf_alloc)
{
*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
}
SpinLockRelease(&StrategyControl->buffer_strategy_lock);
return result;
}
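/*
* A worked example of the compensation above (numbers are illustrative):
* with NBuffers = 1024, suppose completePasses is 5 and nextVictimBuffer
* reads as 1030 because the backend that crossed the 1024 boundary has not
* yet folded that wraparound into completePasses. We then report a start
* point of 1030 % 1024 = 6 and a pass count of 5 + 1030 / 1024 = 6, the
* same values BgBufferSync() would see once the pending increment lands.
*/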
/*
* StrategyNotifyBgWriter -- set or clear allocation notification latch
*
* If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
* set that latch. Pass -1 to clear the pending notification before it
* happens. This feature is used by the bgwriter process to wake itself up
* from hibernation, and is not meant for anybody else to use.
*/
void
StrategyNotifyBgWriter(int bgwprocno)
{
/*
* We acquire buffer_strategy_lock just to ensure that the store appears
* atomic to StrategyGetBuffer. The bgwriter should call this rather
* infrequently, so there's no performance penalty from being safe.
*/
SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
StrategyControl->bgwprocno = bgwprocno;
SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}
/*
* StrategyShmemSize
*
* estimate the size of shared memory used by the freelist-related structures.
*
* Note: for somewhat historical reasons, the buffer lookup hashtable size
* is also determined here.
*/
Size
StrategyShmemSize(void)
{
Size size = 0;
/* size of lookup hash table ... see comment in StrategyInitialize */
size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
/* size of the shared replacement strategy control block */
size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
return size;
}
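/*
* For instance (sizes purely illustrative): with NBuffers = 16384 and
* NUM_BUFFER_PARTITIONS at its usual value of 128, the lookup hash table is
* sized for 16384 + 128 = 16512 entries, and the strategy control block
* adds a MAXALIGN'd sizeof(BufferStrategyControl) on top of that.
*/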
/*
* StrategyInitialize -- initialize the buffer cache replacement
* strategy.
*
* Only called by postmaster and only during initialization.
*/
void
StrategyInitialize(bool init)
{
bool found;
/*
* Initialize the shared buffer lookup hashtable.
*
* Since we can't tolerate running out of lookup table entries, we must be
* sure to specify an adequate table size here. The maximum steady-state
* usage is of course NBuffers entries, but BufferAlloc() tries to insert
* a new entry before deleting the old. In principle this could be
* happening in each partition concurrently, so we could need as many as
* NBuffers + NUM_BUFFER_PARTITIONS entries.
*/
InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
/*
* Get or create the shared strategy control block
*/
StrategyControl = (BufferStrategyControl *)
ShmemInitStruct("Buffer Strategy Status",
sizeof(BufferStrategyControl),
&found);
if (!found)
{
/*
* Only done once, usually in postmaster
*/
Assert(init);
SpinLockInit(&StrategyControl->buffer_strategy_lock);
/* Initialize the clock-sweep pointer */
pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
/* Clear statistics */
StrategyControl->completePasses = 0;
pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
/* No pending notification */
StrategyControl->bgwprocno = -1;
}
else
Assert(!init);
}
/* ----------------------------------------------------------------
* Backend-private buffer ring management
* ----------------------------------------------------------------
*/
/*
* GetAccessStrategy -- create a BufferAccessStrategy object
*
* The object is allocated in the current memory context.
*/
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
int ring_size_kb;
/*
* Select ring size to use. See buffer/README for rationales.
*
* Note: if you change the ring size for BAS_BULKREAD, see also
* SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
*/
switch (btype)
{
case BAS_NORMAL:
/* if someone asks for NORMAL, just give 'em a "default" object */
return NULL;
case BAS_BULKREAD:
{
int ring_max_kb;
/*
* The ring always needs to be large enough to allow some
* separation in time between providing a buffer to the user
* of the strategy and that buffer being reused. Otherwise the
* user's pin will prevent reuse of the buffer, even without
* concurrent activity.
*
* We also need to ensure the ring always is large enough for
* SYNC_SCAN_REPORT_INTERVAL, as noted above.
*
* Thus we start out at a minimal size and increase it
* further if appropriate.
*/
ring_size_kb = 256;
/*
* There's no point in a larger ring if we won't be allowed to
* pin sufficiently many buffers. But we never limit to less
* than the minimal size above.
*/
ring_max_kb = GetPinLimit() * (BLCKSZ / 1024);
ring_max_kb = Max(ring_size_kb, ring_max_kb);
/*
* We would like the ring to additionally have space for the
* configured degree of IO concurrency. While being read in,
* buffers can obviously not yet be reused.
*
* Each IO can be up to io_combine_limit blocks large, and we
* want to start up to effective_io_concurrency IOs.
*
* Note that effective_io_concurrency may be 0, which disables
* AIO.
*/
ring_size_kb += (BLCKSZ / 1024) *
io_combine_limit * effective_io_concurrency;
if (ring_size_kb > ring_max_kb)
ring_size_kb = ring_max_kb;
break;
}
case BAS_BULKWRITE:
ring_size_kb = 16 * 1024;
break;
case BAS_VACUUM:
ring_size_kb = 2048;
break;
default:
elog(ERROR, "unrecognized buffer access strategy: %d",
(int) btype);
return NULL; /* keep compiler quiet */
}
return GetAccessStrategyWithSize(btype, ring_size_kb);
}
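/*
* As a concrete illustration of the BAS_BULKREAD sizing above (the settings
* are examples, not recommendations): with 8kB blocks, io_combine_limit =
* 16 and effective_io_concurrency = 16, the IO-concurrency allowance is
* 8 * 16 * 16 = 2048kB, so the requested ring is 256 + 2048 = 2304kB, or
* 288 buffers. If GetPinLimit() only allowed, say, 64 pinned buffers,
* ring_max_kb would be Max(256, 64 * 8) = 512kB and the ring would be
* clamped to that instead.
*/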
/*
* GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
* number of buffers equivalent to the passed-in size.
*
* If the given ring size is 0, no BufferAccessStrategy will be created and
* the function will return NULL. ring_size_kb must not be negative.
*/
BufferAccessStrategy
GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
{
int ring_buffers;
BufferAccessStrategy strategy;
Assert(ring_size_kb >= 0);
/* Figure out how many buffers ring_size_kb is */
ring_buffers = ring_size_kb / (BLCKSZ / 1024);
/* 0 means unlimited, so no BufferAccessStrategy required */
if (ring_buffers == 0)
return NULL;
/* Cap to 1/8th of shared_buffers */
ring_buffers = Min(NBuffers / 8, ring_buffers);
/* NBuffers should never be less than 16, so this shouldn't happen */
Assert(ring_buffers > 0);
/* Allocate the object and initialize all elements to zeroes */
strategy = (BufferAccessStrategy)
palloc0(offsetof(BufferAccessStrategyData, buffers) +
ring_buffers * sizeof(Buffer));
/* Set fields that don't start out zero */
strategy->btype = btype;
strategy->nbuffers = ring_buffers;
return strategy;
}
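/*
* For example (values are illustrative): a BAS_VACUUM ring of 2048kB with
* 8kB blocks becomes 2048 / 8 = 256 ring slots; with NBuffers = 16384 the
* cap of NBuffers / 8 = 2048 does not bite, so the allocation is
* offsetof(BufferAccessStrategyData, buffers) plus 256 * sizeof(Buffer).
*/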
/*
* GetAccessStrategyBufferCount -- an accessor for the number of buffers in
* the ring
*
* Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
* returning NULL with 0 size.
*/
int
GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
{
if (strategy == NULL)
return 0;
return strategy->nbuffers;
}
/*
* GetAccessStrategyPinLimit -- get cap of number of buffers that should be pinned
*
* When pinning extra buffers to look ahead, users of a ring-based strategy are
* in danger of pinning too much of the ring at once.
* For some strategies, that means "escaping" from the ring, and in others it
* means forcing dirty data to disk very frequently with associated WAL
* flushing. Since external code has no insight into any of that, allow
* individual strategy types to expose a clamp that should be applied when
* deciding on a maximum number of buffers to pin at once.
*
* Callers should combine this number with other relevant limits and take the
* minimum.
*/
int
GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
{
if (strategy == NULL)
return NBuffers;
switch (strategy->btype)
{
case BAS_BULKREAD:
/*
* Since BAS_BULKREAD uses StrategyRejectBuffer(), dirty buffers
* shouldn't be a problem and the caller is free to pin up to the
* entire ring at once.
*/
return strategy->nbuffers;
default:
/*
* Tell caller not to pin more than half the buffers in the ring.
* This is a trade-off between look-ahead distance and deferring
* writeback and the associated WAL traffic.
*/
return strategy->nbuffers / 2;
}
}
/*
* FreeAccessStrategy -- release a BufferAccessStrategy object
*
* A simple pfree would do at the moment, but we would prefer that callers
* don't assume that much about the representation of BufferAccessStrategy.
*/
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
/* don't crash if called on a "default" strategy */
if (strategy != NULL)
pfree(strategy);
}
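/*
* A typical caller pairs these routines roughly as follows (a sketch only;
* rel and nblocks stand in for whatever the caller is scanning):
*
*	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
*
*	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
*	{
*		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
*											 RBM_NORMAL, strategy);
*
*		... examine the page, then ReleaseBuffer(buf) ...
*	}
*
*	FreeAccessStrategy(strategy);
*/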
/*
* GetBufferFromRing -- returns a buffer from the ring, or NULL if the
* ring is empty / not usable.
*
* The buffer is pinned and marked as owned, using TrackNewBufferPin(), before
* returning.
*/
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
{
BufferDesc *buf;
Buffer bufnum;
uint32 old_buf_state;
uint32 local_buf_state; /* to avoid repeated (de-)referencing */
/* Advance to next ring slot */
if (++strategy->current >= strategy->nbuffers)
strategy->current = 0;
/*
* If the slot hasn't been filled yet, tell the caller to allocate a new
* buffer with the normal allocation strategy. The caller will then fill
* this slot by calling AddBufferToRing with the new buffer.
*/
bufnum = strategy->buffers[strategy->current];
if (bufnum == InvalidBuffer)
return NULL;
buf = GetBufferDescriptor(bufnum - 1);
/*
* Check whether the buffer can be used and pin it if so. Do this using a
* CAS loop, to avoid having to lock the buffer header.
*/
old_buf_state = pg_atomic_read_u32(&buf->state);
for (;;)
{
local_buf_state = old_buf_state;
/*
* If the buffer is pinned we cannot use it under any circumstances.
*
* If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
* since our own previous usage of the ring element would have left it
* there, but it might've been decremented by clock-sweep since then).
* A higher usage_count indicates someone else has touched the buffer,
* so we shouldn't re-use it.
*/
if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
|| BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
break;
/* See equivalent code in PinBuffer() */
if (unlikely(local_buf_state & BM_LOCKED))
{
old_buf_state = WaitBufHdrUnlocked(buf);
continue;
}
/* pin the buffer if the CAS succeeds */
local_buf_state += BUF_REFCOUNT_ONE;
if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
local_buf_state))
{
*buf_state = local_buf_state;
TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
return buf;
}
}
/*
* Tell caller to allocate a new buffer with the normal allocation
* strategy. The caller will then replace this ring element via
* AddBufferToRing.
*/
return NULL;
}
/*
* AddBufferToRing -- add a buffer to the buffer ring
*
* The caller has just pinned the buffer in StrategyGetBuffer(). Since this
* is called on the buffer-allocation hot path, it had better be quite cheap.
*/
static void
AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
{
strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}
/*
* Utility function returning the IOContext of a given BufferAccessStrategy's
* strategy ring.
*/
IOContext
IOContextForStrategy(BufferAccessStrategy strategy)
{
if (!strategy)
return IOCONTEXT_NORMAL;
switch (strategy->btype)
{
case BAS_NORMAL:
/*
* Currently, GetAccessStrategy() returns NULL for
* BufferAccessStrategyType BAS_NORMAL, so this case is
* unreachable.
*/
pg_unreachable();
return IOCONTEXT_NORMAL;
case BAS_BULKREAD:
return IOCONTEXT_BULKREAD;
case BAS_BULKWRITE:
return IOCONTEXT_BULKWRITE;
case BAS_VACUUM:
return IOCONTEXT_VACUUM;
}
elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
pg_unreachable();
}
/*
* StrategyRejectBuffer -- consider rejecting a dirty buffer
*
* When a nondefault strategy is used, the buffer manager calls this function
* when it turns out that the buffer selected by StrategyGetBuffer needs to
* be written out and doing so would require flushing WAL too. This gives us
* a chance to choose a different victim.
*
* Returns true if buffer manager should ask for a new victim, and false
* if this buffer should be written and re-used.
*/
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
{
/* We only do this in bulkread mode */
if (strategy->btype != BAS_BULKREAD)
return false;
/* Don't muck with behavior of normal buffer-replacement strategy */
if (!from_ring ||
strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
return false;
/*
* Remove the dirty buffer from the ring; necessary to prevent infinite
* loop if all ring members are dirty.
*/
strategy->buffers[strategy->current] = InvalidBuffer;
return true;
}