mirror of
https://github.com/postgres/postgres.git
synced 2025-06-26 12:21:12 +03:00
Until now, only selected bulk operations (e.g. COPY) did this. If a given relfilenode received both a WAL-skipping COPY and a WAL-logged operation (e.g. INSERT), recovery could lose tuples from the COPY. See src/backend/access/transam/README section "Skipping WAL for New RelFileNode" for the new coding rules. Maintainers of table access methods should examine that section. To maintain data durability, just before commit, we choose between an fsync of the relfilenode and copying its contents to WAL. A new GUC, wal_skip_threshold, guides that choice. If this change slows a workload that creates small, permanent relfilenodes under wal_level=minimal, try adjusting wal_skip_threshold. Users setting a timeout on COMMIT may need to adjust that timeout, and log_min_duration_statement analysis will reflect time consumption moving to COMMIT from commands like COPY. Internally, this requires a reliable determination of whether RollbackAndReleaseCurrentSubTransaction() would unlink a relation's current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the specification of rd_createSubid such that the field is zero when a new rel has an old rd_node. Make relcache.c retain entries for certain dropped relations until end of transaction. Back-patch to 9.5 (all supported versions). This introduces a new WAL record type, XLOG_GIST_ASSIGN_LSN, without bumping XLOG_PAGE_MAGIC. As always, update standby systems before master systems. This changes sizeof(RelationData) and sizeof(IndexStmt), breaking binary compatibility for affected extensions. (The most recent commit to affect the same class of extensions was 089e4d405d0f3b94c74a2c6a54357a84a681754b.) Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert Haas. Heikki Linnakangas and Michael Paquier implemented earlier designs that materially clarified the problem. Reviewed, in earlier designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane, Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout. Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
284 lines
9.2 KiB
C
284 lines
9.2 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* bufmgr.h
|
|
* POSTGRES buffer manager definitions.
|
|
*
|
|
*
|
|
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* src/include/storage/bufmgr.h
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#ifndef BUFMGR_H
|
|
#define BUFMGR_H
|
|
|
|
#include "storage/block.h"
|
|
#include "storage/buf.h"
|
|
#include "storage/bufpage.h"
|
|
#include "storage/relfilenode.h"
|
|
#include "utils/relcache.h"
|
|
#include "utils/snapmgr.h"
|
|
#include "utils/tqual.h"
|
|
|
|
typedef void *Block;
|
|
|
|
/* Possible arguments for GetAccessStrategy() */
|
|
typedef enum BufferAccessStrategyType
|
|
{
|
|
BAS_NORMAL, /* Normal random access */
|
|
BAS_BULKREAD, /* Large read-only scan (hint bit updates are
|
|
* ok) */
|
|
BAS_BULKWRITE, /* Large multi-block write (e.g. COPY IN) */
|
|
BAS_VACUUM /* VACUUM */
|
|
} BufferAccessStrategyType;
|
|
|
|
/* Possible modes for ReadBufferExtended() */
|
|
typedef enum
|
|
{
|
|
RBM_NORMAL, /* Normal read */
|
|
RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will
|
|
* initialize. Also locks the page. */
|
|
RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page
|
|
* in "cleanup" mode */
|
|
RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
|
|
RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
|
|
* replay; otherwise same as RBM_NORMAL */
|
|
} ReadBufferMode;
|
|
|
|
/* forward declared, to avoid having to expose buf_internals.h here */
|
|
struct WritebackContext;
|
|
|
|
/* forward declared, to avoid including smgr.h here */
|
|
struct SMgrRelationData;
|
|
|
|
/* in globals.c ... this duplicates miscadmin.h */
|
|
extern PGDLLIMPORT int NBuffers;
|
|
|
|
/* in bufmgr.c */
|
|
extern bool zero_damaged_pages;
|
|
extern int bgwriter_lru_maxpages;
|
|
extern double bgwriter_lru_multiplier;
|
|
extern bool track_io_timing;
|
|
extern int target_prefetch_pages;
|
|
|
|
extern int checkpoint_flush_after;
|
|
extern int backend_flush_after;
|
|
extern int bgwriter_flush_after;
|
|
|
|
/* in buf_init.c */
|
|
extern PGDLLIMPORT char *BufferBlocks;
|
|
|
|
/* in guc.c */
|
|
extern int effective_io_concurrency;
|
|
|
|
/* in localbuf.c */
|
|
extern PGDLLIMPORT int NLocBuffer;
|
|
extern PGDLLIMPORT Block *LocalBufferBlockPointers;
|
|
extern PGDLLIMPORT int32 *LocalRefCount;
|
|
|
|
/* upper limit for effective_io_concurrency */
|
|
#define MAX_IO_CONCURRENCY 1000
|
|
|
|
/* special block number for ReadBuffer() */
|
|
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
|
|
|
|
/*
|
|
* Buffer content lock modes (mode argument for LockBuffer())
|
|
*/
|
|
#define BUFFER_LOCK_UNLOCK 0
|
|
#define BUFFER_LOCK_SHARE 1
|
|
#define BUFFER_LOCK_EXCLUSIVE 2
|
|
|
|
/*
|
|
* These routines are beaten on quite heavily, hence the macroization.
|
|
*/
|
|
|
|
/*
|
|
* BufferIsValid
|
|
* True iff the given buffer number is valid (either as a shared
|
|
* or local buffer).
|
|
*
|
|
* Note: For a long time this was defined the same as BufferIsPinned,
|
|
* that is it would say False if you didn't hold a pin on the buffer.
|
|
* I believe this was bogus and served only to mask logic errors.
|
|
* Code should always know whether it has a buffer reference,
|
|
* independently of the pin state.
|
|
*
|
|
* Note: For a further long time this was not quite the inverse of the
|
|
* BufferIsInvalid() macro, in that it also did sanity checks to verify
|
|
* that the buffer number was in range. Most likely, this macro was
|
|
* originally intended only to be used in assertions, but its use has
|
|
* since expanded quite a bit, and the overhead of making those checks
|
|
* even in non-assert-enabled builds can be significant. Thus, we've
|
|
* now demoted the range checks to assertions within the macro itself.
|
|
*/
|
|
#define BufferIsValid(bufnum) \
|
|
( \
|
|
AssertMacro((bufnum) <= NBuffers && (bufnum) >= -NLocBuffer), \
|
|
(bufnum) != InvalidBuffer \
|
|
)
|
|
|
|
/*
|
|
* BufferGetBlock
|
|
* Returns a reference to a disk page image associated with a buffer.
|
|
*
|
|
* Note:
|
|
* Assumes buffer is valid.
|
|
*/
|
|
#define BufferGetBlock(buffer) \
|
|
( \
|
|
AssertMacro(BufferIsValid(buffer)), \
|
|
BufferIsLocal(buffer) ? \
|
|
LocalBufferBlockPointers[-(buffer) - 1] \
|
|
: \
|
|
(Block) (BufferBlocks + ((Size) ((buffer) - 1)) * BLCKSZ) \
|
|
)
|
|
|
|
/*
|
|
* BufferGetPageSize
|
|
* Returns the page size within a buffer.
|
|
*
|
|
* Notes:
|
|
* Assumes buffer is valid.
|
|
*
|
|
* The buffer can be a raw disk block and need not contain a valid
|
|
* (formatted) disk page.
|
|
*/
|
|
/* XXX should dig out of buffer descriptor */
|
|
#define BufferGetPageSize(buffer) \
|
|
( \
|
|
AssertMacro(BufferIsValid(buffer)), \
|
|
(Size)BLCKSZ \
|
|
)
|
|
|
|
/*
|
|
* BufferGetPage
|
|
* Returns the page associated with a buffer.
|
|
*
|
|
* When this is called as part of a scan, there may be a need for a nearby
|
|
* call to TestForOldSnapshot(). See the definition of that for details.
|
|
*/
|
|
#define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer))
|
|
|
|
/*
|
|
* prototypes for functions in bufmgr.c
|
|
*/
|
|
extern bool ComputeIoConcurrency(int io_concurrency, double *target);
|
|
extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
|
|
BlockNumber blockNum);
|
|
extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
|
|
extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
|
|
BlockNumber blockNum, ReadBufferMode mode,
|
|
BufferAccessStrategy strategy);
|
|
extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
|
|
ForkNumber forkNum, BlockNumber blockNum,
|
|
ReadBufferMode mode, BufferAccessStrategy strategy);
|
|
extern void ReleaseBuffer(Buffer buffer);
|
|
extern void UnlockReleaseBuffer(Buffer buffer);
|
|
extern void MarkBufferDirty(Buffer buffer);
|
|
extern void IncrBufferRefCount(Buffer buffer);
|
|
extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
|
|
BlockNumber blockNum);
|
|
|
|
extern void InitBufferPool(void);
|
|
extern void InitBufferPoolAccess(void);
|
|
extern void InitBufferPoolBackend(void);
|
|
extern void AtEOXact_Buffers(bool isCommit);
|
|
extern void PrintBufferLeakWarning(Buffer buffer);
|
|
extern void CheckPointBuffers(int flags);
|
|
extern BlockNumber BufferGetBlockNumber(Buffer buffer);
|
|
extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
|
|
ForkNumber forkNum);
|
|
extern void FlushOneBuffer(Buffer buffer);
|
|
extern void FlushRelationBuffers(Relation rel);
|
|
extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
|
|
extern void FlushDatabaseBuffers(Oid dbid);
|
|
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
|
|
ForkNumber forkNum, BlockNumber firstDelBlock);
|
|
extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
|
|
extern void DropDatabaseBuffers(Oid dbid);
|
|
|
|
#define RelationGetNumberOfBlocks(reln) \
|
|
RelationGetNumberOfBlocksInFork(reln, MAIN_FORKNUM)
|
|
|
|
extern bool BufferIsPermanent(Buffer buffer);
|
|
extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer);
|
|
|
|
#ifdef NOT_USED
|
|
extern void PrintPinnedBufs(void);
|
|
#endif
|
|
extern Size BufferShmemSize(void);
|
|
extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
|
|
ForkNumber *forknum, BlockNumber *blknum);
|
|
|
|
extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
|
|
|
|
extern void UnlockBuffers(void);
|
|
extern void LockBuffer(Buffer buffer, int mode);
|
|
extern bool ConditionalLockBuffer(Buffer buffer);
|
|
extern void LockBufferForCleanup(Buffer buffer);
|
|
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
|
|
extern bool HoldingBufferPinThatDelaysRecovery(void);
|
|
|
|
extern void AbortBufferIO(void);
|
|
|
|
extern void BufmgrCommit(void);
|
|
extern bool BgBufferSync(struct WritebackContext *wb_context);
|
|
|
|
extern void AtProcExit_LocalBuffers(void);
|
|
|
|
extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
|
|
|
|
/* in freelist.c */
|
|
extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
|
|
extern void FreeAccessStrategy(BufferAccessStrategy strategy);
|
|
|
|
|
|
/* inline functions */
|
|
|
|
/*
|
|
* Although this header file is nominally backend-only, certain frontend
|
|
* programs like pg_xlogdump include it. For compilers that emit static
|
|
* inline functions even when they're unused, that leads to unsatisfied
|
|
* external references; hence hide these with #ifndef FRONTEND.
|
|
*/
|
|
|
|
#ifndef FRONTEND
|
|
|
|
/*
|
|
* Check whether the given snapshot is too old to have safely read the given
|
|
* page from the given table. If so, throw a "snapshot too old" error.
|
|
*
|
|
* This test generally needs to be performed after every BufferGetPage() call
|
|
* that is executed as part of a scan. It is not needed for calls made for
|
|
* modifying the page (for example, to position to the right place to insert a
|
|
* new index tuple or for vacuuming). It may also be omitted where calls to
|
|
* lower-level functions will have already performed the test.
|
|
*
|
|
* Note that a NULL snapshot argument is allowed and causes a fast return
|
|
* without error; this is to support call sites which can be called from
|
|
* either scans or index modification areas.
|
|
*
|
|
* For best performance, keep the tests that are fastest and/or most likely to
|
|
* exclude a page from old snapshot testing near the front.
|
|
*/
|
|
static inline void
|
|
TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
|
|
{
|
|
Assert(relation != NULL);
|
|
|
|
if (old_snapshot_threshold >= 0
|
|
&& (snapshot) != NULL
|
|
&& ((snapshot)->satisfies == HeapTupleSatisfiesMVCC
|
|
|| (snapshot)->satisfies == HeapTupleSatisfiesToast)
|
|
&& !XLogRecPtrIsInvalid((snapshot)->lsn)
|
|
&& PageGetLSN(page) > (snapshot)->lsn)
|
|
TestForOldSnapshot_impl(snapshot, relation);
|
|
}
|
|
|
|
#endif /* FRONTEND */
|
|
|
|
#endif /* BUFMGR_H */
|