mirror of
https://github.com/postgres/postgres.git
synced 2025-10-28 11:55:03 +03:00
Replace the BufMgrLock with separate locks on the lookup hashtable and
the freelist, plus per-buffer spinlocks that protect access to individual shared buffer headers. This requires abandoning a global freelist (since the freelist is a global contention point), which shoots down ARC and 2Q as well as plain LRU management. Adopt a clock sweep algorithm instead. Preliminary results show substantial improvement in multi-backend situations.
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.76 2005/02/03 23:29:19 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.77 2005/03/04 20:21:07 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -19,24 +19,39 @@
|
||||
#include "storage/buf.h"
|
||||
#include "storage/lwlock.h"
|
||||
#include "storage/shmem.h"
|
||||
#include "storage/spin.h"
|
||||
#include "utils/rel.h"
|
||||
|
||||
|
||||
/*
|
||||
* Flags for buffer descriptors
|
||||
*
|
||||
* Note: BM_TAG_VALID essentially means that there is a buffer hashtable
|
||||
* entry associated with the buffer's tag.
|
||||
*/
|
||||
#define BM_DIRTY (1 << 0) /* data needs writing */
|
||||
#define BM_VALID (1 << 1) /* data is valid */
|
||||
#define BM_IO_IN_PROGRESS (1 << 2) /* read or write in
|
||||
#define BM_TAG_VALID (1 << 2) /* tag is assigned */
|
||||
#define BM_IO_IN_PROGRESS (1 << 3) /* read or write in
|
||||
* progress */
|
||||
#define BM_IO_ERROR (1 << 3) /* previous I/O failed */
|
||||
#define BM_JUST_DIRTIED (1 << 4) /* dirtied since write
|
||||
#define BM_IO_ERROR (1 << 4) /* previous I/O failed */
|
||||
#define BM_JUST_DIRTIED (1 << 5) /* dirtied since write
|
||||
* started */
|
||||
#define BM_PIN_COUNT_WAITER (1 << 5) /* have waiter for sole
|
||||
#define BM_PIN_COUNT_WAITER (1 << 6) /* have waiter for sole
|
||||
* pin */
|
||||
|
||||
typedef bits16 BufFlags;
|
||||
|
||||
/*
|
||||
* The maximum allowed value of usage_count represents a tradeoff between
|
||||
* accuracy and speed of the clock-sweep buffer management algorithm. A
|
||||
* large value (comparable to NBuffers) would approximate LRU semantics.
|
||||
* But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
|
||||
* clock sweeps to find a free buffer, so in practice we don't want the
|
||||
* value to be very large.
|
||||
*/
|
||||
#define BM_MAX_USAGE_COUNT 5
|
||||
|
||||
/*
|
||||
* Buffer tag identifies which disk block the buffer contains.
|
||||
*
|
||||
@@ -77,45 +92,81 @@ typedef struct buftag
|
||||
|
||||
/*
|
||||
* BufferDesc -- shared descriptor/state data for a single shared buffer.
|
||||
*
|
||||
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
|
||||
* usage_count, refcount, or wait_backend_id fields. The buf_id field never
|
||||
* changes after initialization, so it does not need locking. freeNext is
|
||||
* protected by the BufFreelistLock, not buf_hdr_lock. The LWLocks can take
|
||||
* care of themselves. The buf_hdr_lock is *not* used to control access to
|
||||
* the data in the buffer!
|
||||
*
|
||||
* An exception is that if we have the buffer pinned, its tag can't change
|
||||
* underneath us, so we can examine the tag without locking the spinlock.
|
||||
* Also, in places we do one-time reads of the flags without bothering to
|
||||
* lock the spinlock; this is generally for situations where we don't expect
|
||||
* the flag bit being tested to be changing.
|
||||
*
|
||||
* We can't physically remove items from a disk page if another backend has
|
||||
* the buffer pinned. Hence, a backend may need to wait for all other pins
|
||||
* to go away. This is signaled by storing its own backend ID into
|
||||
* wait_backend_id and setting flag bit BM_PIN_COUNT_WAITER. At present,
|
||||
* there can be only one such waiter per buffer.
|
||||
*
|
||||
* We use this same struct for local buffer headers, but the lock fields
|
||||
* are not used and not all of the flag bits are useful either.
|
||||
*/
|
||||
typedef struct sbufdesc
|
||||
{
|
||||
Buffer bufNext; /* link in freelist chain */
|
||||
SHMEM_OFFSET data; /* pointer to data in buf pool */
|
||||
|
||||
/* tag and id must be together for table lookup (still true?) */
|
||||
BufferTag tag; /* file/block identifier */
|
||||
int buf_id; /* buffer's index number (from 0) */
|
||||
|
||||
BufferTag tag; /* ID of page contained in buffer */
|
||||
BufFlags flags; /* see bit definitions above */
|
||||
uint16 usage_count; /* usage counter for clock sweep code */
|
||||
unsigned refcount; /* # of backends holding pins on buffer */
|
||||
BackendId wait_backend_id; /* backend ID of pin-count waiter */
|
||||
|
||||
slock_t buf_hdr_lock; /* protects the above fields */
|
||||
|
||||
int buf_id; /* buffer's index number (from 0) */
|
||||
int freeNext; /* link in freelist chain */
|
||||
|
||||
LWLockId io_in_progress_lock; /* to wait for I/O to complete */
|
||||
LWLockId cntx_lock; /* to lock access to page context */
|
||||
|
||||
bool cntxDirty; /* new way to mark block as dirty */
|
||||
|
||||
/*
|
||||
* We can't physically remove items from a disk page if another
|
||||
* backend has the buffer pinned. Hence, a backend may need to wait
|
||||
* for all other pins to go away. This is signaled by storing its own
|
||||
* backend ID into wait_backend_id and setting flag bit
|
||||
* BM_PIN_COUNT_WAITER. At present, there can be only one such waiter
|
||||
* per buffer.
|
||||
*/
|
||||
BackendId wait_backend_id; /* backend ID of pin-count waiter */
|
||||
LWLockId content_lock; /* to lock access to buffer contents */
|
||||
} BufferDesc;
|
||||
|
||||
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
|
||||
|
||||
/*
|
||||
* The freeNext field is either the index of the next freelist entry,
|
||||
* or one of these special values:
|
||||
*/
|
||||
#define FREENEXT_END_OF_LIST (-1)
|
||||
#define FREENEXT_NOT_IN_LIST (-2)
|
||||
|
||||
/* in bufmgr.c */
|
||||
/*
|
||||
* Macros for acquiring/releasing a buffer header's spinlock. The
|
||||
* NoHoldoff cases may be used when we know that we hold some LWLock
|
||||
* and therefore interrupts are already held off. Do not apply these
|
||||
* to local buffers!
|
||||
*/
|
||||
#define LockBufHdr(bufHdr) \
|
||||
SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
|
||||
#define UnlockBufHdr(bufHdr) \
|
||||
SpinLockRelease(&(bufHdr)->buf_hdr_lock)
|
||||
#define LockBufHdr_NoHoldoff(bufHdr) \
|
||||
SpinLockAcquire_NoHoldoff(&(bufHdr)->buf_hdr_lock)
|
||||
#define UnlockBufHdr_NoHoldoff(bufHdr) \
|
||||
SpinLockRelease_NoHoldoff(&(bufHdr)->buf_hdr_lock)
|
||||
|
||||
|
||||
/* in buf_init.c */
|
||||
extern BufferDesc *BufferDescriptors;
|
||||
|
||||
/* in localbuf.c */
|
||||
extern BufferDesc *LocalBufferDescriptors;
|
||||
|
||||
/* counters in buf_init.c */
|
||||
/* in freelist.c */
|
||||
extern bool strategy_hint_vacuum;
|
||||
|
||||
/* event counters in buf_init.c */
|
||||
extern long int ReadBufferCount;
|
||||
extern long int ReadLocalBufferCount;
|
||||
extern long int BufferHitCount;
|
||||
@@ -129,15 +180,9 @@ extern long int LocalBufferFlushCount;
|
||||
*/
|
||||
|
||||
/* freelist.c */
|
||||
extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
|
||||
int *cdb_found_index);
|
||||
extern BufferDesc *StrategyGetBuffer(int *cdb_replace_index);
|
||||
extern void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
|
||||
int cdb_found_index, int cdb_replace_index);
|
||||
extern void StrategyInvalidateBuffer(BufferDesc *buf);
|
||||
extern void StrategyHintVacuum(bool vacuum_active);
|
||||
extern int StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
|
||||
int max_buffers);
|
||||
extern BufferDesc *StrategyGetBuffer(void);
|
||||
extern void StrategyFreeBuffer(BufferDesc *buf, bool at_head);
|
||||
extern int StrategySyncStart(void);
|
||||
extern int StrategyShmemSize(void);
|
||||
extern void StrategyInitialize(bool init);
|
||||
|
||||
@@ -145,7 +190,7 @@ extern void StrategyInitialize(bool init);
|
||||
extern int BufTableShmemSize(int size);
|
||||
extern void InitBufTable(int size);
|
||||
extern int BufTableLookup(BufferTag *tagPtr);
|
||||
extern void BufTableInsert(BufferTag *tagPtr, int buf_id);
|
||||
extern int BufTableInsert(BufferTag *tagPtr, int buf_id);
|
||||
extern void BufTableDelete(BufferTag *tagPtr);
|
||||
|
||||
/* localbuf.c */
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.89 2004/12/31 22:03:42 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.90 2005/03/04 20:21:07 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -27,21 +27,25 @@ extern DLLIMPORT int NBuffers;
|
||||
|
||||
/* in bufmgr.c */
|
||||
extern bool zero_damaged_pages;
|
||||
extern double bgwriter_lru_percent;
|
||||
extern double bgwriter_all_percent;
|
||||
extern int bgwriter_lru_maxpages;
|
||||
extern int bgwriter_all_maxpages;
|
||||
|
||||
/* in buf_init.c */
|
||||
extern DLLIMPORT Block *BufferBlockPointers;
|
||||
extern int32 *PrivateRefCount;
|
||||
extern DLLIMPORT int32 *PrivateRefCount;
|
||||
|
||||
/* in localbuf.c */
|
||||
extern DLLIMPORT int NLocBuffer;
|
||||
extern DLLIMPORT Block *LocalBufferBlockPointers;
|
||||
extern int32 *LocalRefCount;
|
||||
extern DLLIMPORT int32 *LocalRefCount;
|
||||
|
||||
/* special block number for ReadBuffer() */
|
||||
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
|
||||
|
||||
/*
|
||||
* Buffer context lock modes
|
||||
* Buffer content lock modes (mode argument for LockBuffer())
|
||||
*/
|
||||
#define BUFFER_LOCK_UNLOCK 0
|
||||
#define BUFFER_LOCK_SHARE 1
|
||||
@@ -150,8 +154,12 @@ extern void LockBufferForCleanup(Buffer buffer);
|
||||
extern void AbortBufferIO(void);
|
||||
|
||||
extern void BufmgrCommit(void);
|
||||
extern int BufferSync(int percent, int maxpages);
|
||||
extern void BufferSync(void);
|
||||
extern void BgBufferSync(void);
|
||||
|
||||
extern void InitLocalBuffer(void);
|
||||
|
||||
/* in freelist.c */
|
||||
extern void StrategyHintVacuum(bool vacuum_active);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.16 2004/12/31 22:03:42 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.17 2005/03/04 20:21:07 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -25,7 +25,8 @@
|
||||
*/
|
||||
typedef enum LWLockId
|
||||
{
|
||||
BufMgrLock,
|
||||
BufMappingLock,
|
||||
BufFreelistLock,
|
||||
LockMgrLock,
|
||||
OidGenLock,
|
||||
XidGenLock,
|
||||
|
||||
Reference in New Issue
Block a user