mirror of
https://github.com/postgres/postgres.git
synced 2025-05-21 15:54:08 +03:00
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out, but not yet locked. If a backend pinned and locked the page in that window, it saw the zeroed page instead of the old page or new page contents, which could lead to missing rows in a result set, or errors. To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins, zeroes, and locks the page, if it's not in the buffer cache already. In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE, to avoid breaking any 3rd party extensions that might use RBM_ZERO. More importantly, this avoids renumbering the other enum values, which would cause even bigger confusion in extensions that use ReadBufferExtended, but haven't been recompiled. Backpatch to all supported versions; this has been racy since hot standby was introduced.
This commit is contained in:
parent
d47fff3d72
commit
7eab804c22
@ -155,9 +155,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
|
|||||||
if (blkno == P_NEW)
|
if (blkno == P_NEW)
|
||||||
elog(ERROR, "hash AM does not use P_NEW");
|
elog(ERROR, "hash AM does not use P_NEW");
|
||||||
|
|
||||||
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
|
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
|
||||||
|
NULL);
|
||||||
LockBuffer(buf, HASH_WRITE);
|
|
||||||
|
|
||||||
/* ref count and lock type are correct */
|
/* ref count and lock type are correct */
|
||||||
|
|
||||||
@ -198,11 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
|
|||||||
if (BufferGetBlockNumber(buf) != blkno)
|
if (BufferGetBlockNumber(buf) != blkno)
|
||||||
elog(ERROR, "unexpected hash relation size: %u, should be %u",
|
elog(ERROR, "unexpected hash relation size: %u, should be %u",
|
||||||
BufferGetBlockNumber(buf), blkno);
|
BufferGetBlockNumber(buf), blkno);
|
||||||
|
LockBuffer(buf, HASH_WRITE);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
|
{
|
||||||
|
buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
|
||||||
LockBuffer(buf, HASH_WRITE);
|
NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/* ref count and lock type are correct */
|
/* ref count and lock type are correct */
|
||||||
|
|
||||||
|
@ -4884,9 +4884,8 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
|
|||||||
* not do anything that assumes we are touching a heap.
|
* not do anything that assumes we are touching a heap.
|
||||||
*/
|
*/
|
||||||
buffer = XLogReadBufferExtended(xlrec->node, xlrec->forknum, xlrec->blkno,
|
buffer = XLogReadBufferExtended(xlrec->node, xlrec->forknum, xlrec->blkno,
|
||||||
RBM_ZERO);
|
RBM_ZERO_AND_LOCK);
|
||||||
Assert(BufferIsValid(buffer));
|
Assert(BufferIsValid(buffer));
|
||||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
|
||||||
page = (Page) BufferGetPage(buffer);
|
page = (Page) BufferGetPage(buffer);
|
||||||
|
|
||||||
Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
|
Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
|
||||||
|
@ -3813,12 +3813,8 @@ RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
|
|||||||
{
|
{
|
||||||
/* Found it, apply the update */
|
/* Found it, apply the update */
|
||||||
buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
|
buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
|
||||||
RBM_ZERO);
|
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
|
||||||
Assert(BufferIsValid(buffer));
|
Assert(BufferIsValid(buffer));
|
||||||
if (get_cleanup_lock)
|
|
||||||
LockBufferForCleanup(buffer);
|
|
||||||
else
|
|
||||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
|
||||||
|
|
||||||
page = (Page) BufferGetPage(buffer);
|
page = (Page) BufferGetPage(buffer);
|
||||||
|
|
||||||
|
@ -257,7 +257,8 @@ XLogCheckInvalidPages(void)
|
|||||||
* The returned buffer is exclusively-locked.
|
* The returned buffer is exclusively-locked.
|
||||||
*
|
*
|
||||||
* For historical reasons, instead of a ReadBufferMode argument, this only
|
* For historical reasons, instead of a ReadBufferMode argument, this only
|
||||||
* supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
|
* supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
|
||||||
|
* modes.
|
||||||
*/
|
*/
|
||||||
Buffer
|
Buffer
|
||||||
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
|
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
|
||||||
@ -265,8 +266,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
|
|||||||
Buffer buf;
|
Buffer buf;
|
||||||
|
|
||||||
buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
|
buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
|
||||||
init ? RBM_ZERO : RBM_NORMAL);
|
init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
|
||||||
if (BufferIsValid(buf))
|
if (BufferIsValid(buf) && !init)
|
||||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||||
|
|
||||||
return buf;
|
return buf;
|
||||||
@ -285,8 +286,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
|
|||||||
* dropped or truncated. If we don't see evidence of that later in the WAL
|
* dropped or truncated. If we don't see evidence of that later in the WAL
|
||||||
* sequence, we'll complain at the end of WAL replay.)
|
* sequence, we'll complain at the end of WAL replay.)
|
||||||
*
|
*
|
||||||
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
|
* In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
|
||||||
* relation is extended with all-zeroes pages up to the given block number.
|
* with all-zeroes pages up to the given block number.
|
||||||
*
|
*
|
||||||
* In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
|
* In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
|
||||||
* exist, and we don't check for all-zeroes. Thus, no log entry is made
|
* exist, and we don't check for all-zeroes. Thus, no log entry is made
|
||||||
@ -340,7 +341,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
|
|||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (buffer != InvalidBuffer)
|
if (buffer != InvalidBuffer)
|
||||||
|
{
|
||||||
|
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
||||||
|
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
||||||
ReleaseBuffer(buffer);
|
ReleaseBuffer(buffer);
|
||||||
|
}
|
||||||
buffer = ReadBufferWithoutRelcache(rnode, forknum,
|
buffer = ReadBufferWithoutRelcache(rnode, forknum,
|
||||||
P_NEW, mode, NULL);
|
P_NEW, mode, NULL);
|
||||||
}
|
}
|
||||||
@ -348,6 +353,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
|
|||||||
/* Handle the corner case that P_NEW returns non-consecutive pages */
|
/* Handle the corner case that P_NEW returns non-consecutive pages */
|
||||||
if (BufferGetBlockNumber(buffer) != blkno)
|
if (BufferGetBlockNumber(buffer) != blkno)
|
||||||
{
|
{
|
||||||
|
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
||||||
|
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
||||||
ReleaseBuffer(buffer);
|
ReleaseBuffer(buffer);
|
||||||
buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
|
buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
|
||||||
mode, NULL);
|
mode, NULL);
|
||||||
|
@ -210,14 +210,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
|
|||||||
* valid, the page is zeroed instead of throwing an error. This is intended
|
* valid, the page is zeroed instead of throwing an error. This is intended
|
||||||
* for non-critical data, where the caller is prepared to repair errors.
|
* for non-critical data, where the caller is prepared to repair errors.
|
||||||
*
|
*
|
||||||
* In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
|
* In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
|
||||||
* with zeros instead of reading it from disk. Useful when the caller is
|
* filled with zeros instead of reading it from disk. Useful when the caller
|
||||||
* going to fill the page from scratch, since this saves I/O and avoids
|
* is going to fill the page from scratch, since this saves I/O and avoids
|
||||||
* unnecessary failure if the page-on-disk has corrupt page headers.
|
* unnecessary failure if the page-on-disk has corrupt page headers.
|
||||||
|
* The page is returned locked to ensure that the caller has a chance to
|
||||||
|
* initialize the page before it's made visible to others.
|
||||||
* Caution: do not use this mode to read a page that is beyond the relation's
|
* Caution: do not use this mode to read a page that is beyond the relation's
|
||||||
* current physical EOF; that is likely to cause problems in md.c when
|
* current physical EOF; that is likely to cause problems in md.c when
|
||||||
* the page is modified and written out. P_NEW is OK, though.
|
* the page is modified and written out. P_NEW is OK, though.
|
||||||
*
|
*
|
||||||
|
* RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
|
||||||
|
* a cleanup-strength lock on the page.
|
||||||
|
*
|
||||||
* RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
|
* RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
|
||||||
*
|
*
|
||||||
* If strategy is not NULL, a nondefault buffer access strategy is used.
|
* If strategy is not NULL, a nondefault buffer access strategy is used.
|
||||||
@ -359,6 +364,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
|
|||||||
isExtend,
|
isExtend,
|
||||||
found);
|
found);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In RBM_ZERO_AND_LOCK mode, the caller expects the buffer to
|
||||||
|
* be already locked on return.
|
||||||
|
*/
|
||||||
|
if (!isLocalBuf)
|
||||||
|
{
|
||||||
|
if (mode == RBM_ZERO_AND_LOCK)
|
||||||
|
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
|
||||||
|
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
||||||
|
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
|
||||||
|
}
|
||||||
|
|
||||||
return BufferDescriptorGetBuffer(bufHdr);
|
return BufferDescriptorGetBuffer(bufHdr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -439,8 +456,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
|
|||||||
* Read in the page, unless the caller intends to overwrite it and
|
* Read in the page, unless the caller intends to overwrite it and
|
||||||
* just wants us to allocate a buffer.
|
* just wants us to allocate a buffer.
|
||||||
*/
|
*/
|
||||||
if (mode == RBM_ZERO)
|
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK ||
|
||||||
|
mode == RBM_DO_NOT_USE)
|
||||||
|
{
|
||||||
MemSet((char *) bufBlock, 0, BLCKSZ);
|
MemSet((char *) bufBlock, 0, BLCKSZ);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
instr_time io_start,
|
instr_time io_start,
|
||||||
@ -481,6 +501,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
|
||||||
|
* the page as valid, to make sure that no other backend sees the zeroed
|
||||||
|
* page before the caller has had a chance to initialize it.
|
||||||
|
*
|
||||||
|
* Since no-one else can be looking at the page contents yet, there is no
|
||||||
|
* difference between an exclusive lock and a cleanup-strength lock.
|
||||||
|
* (Note that we cannot use LockBuffer() of LockBufferForCleanup() here,
|
||||||
|
* because they assert that the buffer is already valid.)
|
||||||
|
*/
|
||||||
|
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
|
||||||
|
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
|
||||||
|
|
||||||
if (isLocalBuf)
|
if (isLocalBuf)
|
||||||
{
|
{
|
||||||
/* Only need to adjust flags */
|
/* Only need to adjust flags */
|
||||||
|
@ -36,11 +36,16 @@ typedef enum BufferAccessStrategyType
|
|||||||
typedef enum
|
typedef enum
|
||||||
{
|
{
|
||||||
RBM_NORMAL, /* Normal read */
|
RBM_NORMAL, /* Normal read */
|
||||||
RBM_ZERO, /* Don't read from disk, caller will
|
RBM_DO_NOT_USE, /* This used to be RBM_ZERO. Only kept for
|
||||||
* initialize */
|
* binary compatibility with 3rd party
|
||||||
|
* extensions. */
|
||||||
RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
|
RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
|
||||||
RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
|
RBM_NORMAL_NO_LOG, /* Don't log page as invalid during WAL
|
||||||
* replay; otherwise same as RBM_NORMAL */
|
* replay; otherwise same as RBM_NORMAL */
|
||||||
|
RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will
|
||||||
|
* initialize. Also locks the page. */
|
||||||
|
RBM_ZERO_AND_CLEANUP_LOCK /* Like RBM_ZERO_AND_LOCK, but locks the page
|
||||||
|
* in "cleanup" mode */
|
||||||
} ReadBufferMode;
|
} ReadBufferMode;
|
||||||
|
|
||||||
/* in globals.c ... this duplicates miscadmin.h */
|
/* in globals.c ... this duplicates miscadmin.h */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user