mirror of
https://github.com/postgres/postgres.git
synced 2025-05-01 01:04:50 +03:00
Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.
If TRUNCATE causes some buffers to be invalidated and thus the checkpoint does not flush them, TRUNCATE must also ensure that the corresponding files are truncated on disk. Otherwise, a replay from the checkpoint might find that the buffers exist but have the wrong contents, which may cause replay to fail. Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design suggestion from Heikki Linnakangas, with some changes to the comments by me. Review of this and a prior patch that approached the issue differently by Heikki Linnakangas, Andres Freund, Álvaro Herrera, Masahiko Sawada, and Tom Lane. Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
This commit is contained in:
parent
81045e1e1c
commit
bbace5697d
@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
|
||||
* crash/basebackup, even though the state of the data directory would
|
||||
* require it.
|
||||
*/
|
||||
Assert(!MyProc->delayChkpt);
|
||||
MyProc->delayChkpt = true;
|
||||
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
|
||||
MyProc->delayChkpt |= DELAY_CHKPT_START;
|
||||
|
||||
/* WAL log truncation */
|
||||
WriteMTruncateXlogRec(newOldestMultiDB,
|
||||
@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
|
||||
/* Then offsets */
|
||||
PerformOffsetsTruncation(oldestMulti, newOldestMulti);
|
||||
|
||||
MyProc->delayChkpt = false;
|
||||
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
|
||||
|
||||
END_CRIT_SECTION();
|
||||
LWLockRelease(MultiXactTruncationLock);
|
||||
|
@ -474,7 +474,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
|
||||
}
|
||||
proc->xid = xid;
|
||||
Assert(proc->xmin == InvalidTransactionId);
|
||||
proc->delayChkpt = false;
|
||||
proc->delayChkpt = 0;
|
||||
proc->statusFlags = 0;
|
||||
proc->pid = 0;
|
||||
proc->databaseId = databaseid;
|
||||
@ -1165,7 +1165,8 @@ EndPrepare(GlobalTransaction gxact)
|
||||
|
||||
START_CRIT_SECTION();
|
||||
|
||||
MyProc->delayChkpt = true;
|
||||
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
|
||||
MyProc->delayChkpt |= DELAY_CHKPT_START;
|
||||
|
||||
XLogBeginInsert();
|
||||
for (record = records.head; record != NULL; record = record->next)
|
||||
@ -1208,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact)
|
||||
* checkpoint starting after this will certainly see the gxact as a
|
||||
* candidate for fsyncing.
|
||||
*/
|
||||
MyProc->delayChkpt = false;
|
||||
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
|
||||
|
||||
/*
|
||||
* Remember that we have this GlobalTransaction entry locked for us. If
|
||||
@ -2275,7 +2276,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
|
||||
START_CRIT_SECTION();
|
||||
|
||||
/* See notes in RecordTransactionCommit */
|
||||
MyProc->delayChkpt = true;
|
||||
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
|
||||
MyProc->delayChkpt |= DELAY_CHKPT_START;
|
||||
|
||||
/*
|
||||
* Emit the XLOG commit record. Note that we mark 2PC commits as
|
||||
@ -2323,7 +2325,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
|
||||
TransactionIdCommitTree(xid, nchildren, children);
|
||||
|
||||
/* Checkpoint can proceed now */
|
||||
MyProc->delayChkpt = false;
|
||||
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
|
||||
|
||||
END_CRIT_SECTION();
|
||||
|
||||
|
@ -1335,8 +1335,9 @@ RecordTransactionCommit(void)
|
||||
* This makes checkpoint's determination of which xacts are delayChkpt
|
||||
* a bit fuzzy, but it doesn't matter.
|
||||
*/
|
||||
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
|
||||
START_CRIT_SECTION();
|
||||
MyProc->delayChkpt = true;
|
||||
MyProc->delayChkpt |= DELAY_CHKPT_START;
|
||||
|
||||
SetCurrentTransactionStopTimestamp();
|
||||
|
||||
@ -1437,7 +1438,7 @@ RecordTransactionCommit(void)
|
||||
*/
|
||||
if (markXidCommitted)
|
||||
{
|
||||
MyProc->delayChkpt = false;
|
||||
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
|
||||
END_CRIT_SECTION();
|
||||
}
|
||||
|
||||
|
@ -9228,18 +9228,30 @@ CreateCheckPoint(int flags)
|
||||
* and we will correctly flush the update below. So we cannot miss any
|
||||
* xacts we need to wait for.
|
||||
*/
|
||||
vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
|
||||
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
|
||||
if (nvxids > 0)
|
||||
{
|
||||
do
|
||||
{
|
||||
pg_usleep(10000L); /* wait for 10 msec */
|
||||
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
|
||||
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
|
||||
DELAY_CHKPT_START));
|
||||
}
|
||||
pfree(vxids);
|
||||
|
||||
CheckPointGuts(checkPoint.redo, flags);
|
||||
|
||||
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
|
||||
if (nvxids > 0)
|
||||
{
|
||||
do
|
||||
{
|
||||
pg_usleep(10000L); /* wait for 10 msec */
|
||||
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
|
||||
DELAY_CHKPT_COMPLETE));
|
||||
}
|
||||
pfree(vxids);
|
||||
|
||||
/*
|
||||
* Take a snapshot of running transactions and write this to WAL. This
|
||||
* allows us to reconstruct the state of running transactions during
|
||||
|
@ -925,7 +925,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
|
||||
/*
|
||||
* Ensure no checkpoint can change our view of RedoRecPtr.
|
||||
*/
|
||||
Assert(MyProc->delayChkpt);
|
||||
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
|
||||
|
||||
/*
|
||||
* Update RedoRecPtr so that we can make the right decision
|
||||
|
@ -325,6 +325,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
|
||||
|
||||
RelationPreTruncate(rel);
|
||||
|
||||
/*
|
||||
* Make sure that a concurrent checkpoint can't complete while truncation
|
||||
* is in progress.
|
||||
*
|
||||
* The truncation operation might drop buffers that the checkpoint
|
||||
* otherwise would have flushed. If it does, then it's essential that
|
||||
* the files actually get truncated on disk before the checkpoint record
|
||||
* is written. Otherwise, if replay begins from that checkpoint, the
|
||||
* to-be-truncated blocks might still exist on disk but have older
|
||||
* contents than expected, which can cause replay to fail. It's OK for
|
||||
* the blocks to not exist on disk at all, but not for them to have the
|
||||
* wrong contents.
|
||||
*/
|
||||
Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
|
||||
MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
|
||||
|
||||
/*
|
||||
* We WAL-log the truncation before actually truncating, which means
|
||||
* trouble if the truncation fails. If we then crash, the WAL replay
|
||||
@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
|
||||
XLogFlush(lsn);
|
||||
}
|
||||
|
||||
/* Do the real work to truncate relation forks */
|
||||
/*
|
||||
* This will first remove any buffers from the buffer pool that should no
|
||||
* longer exist after truncation is complete, and then truncate the
|
||||
* corresponding files on disk.
|
||||
*/
|
||||
smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
|
||||
|
||||
/* We've done all the critical work, so checkpoints are OK now. */
|
||||
MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
|
||||
|
||||
/*
|
||||
* Update upper-level FSM pages to account for the truncation. This is
|
||||
* important because the just-truncated pages were likely marked as
|
||||
* all-free, and would be preferentially selected.
|
||||
*
|
||||
* NB: There's no point in delaying checkpoints until this is done.
|
||||
* Because the FSM is not WAL-logged, we have to be prepared for the
|
||||
* possibility of corruption after a crash anyway.
|
||||
*/
|
||||
if (need_fsm_vacuum)
|
||||
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
|
||||
|
@ -3946,7 +3946,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
|
||||
* essential that CreateCheckpoint waits for virtual transactions
|
||||
* rather than full transactionids.
|
||||
*/
|
||||
MyProc->delayChkpt = delayChkpt = true;
|
||||
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
|
||||
MyProc->delayChkpt |= DELAY_CHKPT_START;
|
||||
delayChkpt = true;
|
||||
lsn = XLogSaveBufferForHint(buffer, buffer_std);
|
||||
}
|
||||
|
||||
@ -3979,7 +3981,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
|
||||
UnlockBufHdr(bufHdr, buf_state);
|
||||
|
||||
if (delayChkpt)
|
||||
MyProc->delayChkpt = false;
|
||||
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
|
||||
|
||||
if (dirtied)
|
||||
{
|
||||
|
@ -689,7 +689,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
|
||||
|
||||
proc->lxid = InvalidLocalTransactionId;
|
||||
proc->xmin = InvalidTransactionId;
|
||||
proc->delayChkpt = false; /* be sure this is cleared in abort */
|
||||
|
||||
/* be sure this is cleared in abort */
|
||||
proc->delayChkpt = 0;
|
||||
|
||||
proc->recoveryConflictPending = false;
|
||||
|
||||
/* must be cleared with xid/xmin: */
|
||||
@ -728,7 +731,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
|
||||
proc->xid = InvalidTransactionId;
|
||||
proc->lxid = InvalidLocalTransactionId;
|
||||
proc->xmin = InvalidTransactionId;
|
||||
proc->delayChkpt = false; /* be sure this is cleared in abort */
|
||||
|
||||
/* be sure this is cleared in abort */
|
||||
proc->delayChkpt = 0;
|
||||
|
||||
proc->recoveryConflictPending = false;
|
||||
|
||||
/* must be cleared with xid/xmin: */
|
||||
@ -3043,7 +3049,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
|
||||
* delaying checkpoint because they have critical actions in progress.
|
||||
*
|
||||
* Constructs an array of VXIDs of transactions that are currently in commit
|
||||
* critical sections, as shown by having delayChkpt set in their PGPROC.
|
||||
* critical sections, as shown by having specified delayChkpt bits set in their
|
||||
* PGPROC.
|
||||
*
|
||||
* Returns a palloc'd array that should be freed by the caller.
|
||||
* *nvxids is the number of valid entries.
|
||||
@ -3057,13 +3064,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
|
||||
* for clearing of delayChkpt to propagate is unimportant for correctness.
|
||||
*/
|
||||
VirtualTransactionId *
|
||||
GetVirtualXIDsDelayingChkpt(int *nvxids)
|
||||
GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
|
||||
{
|
||||
VirtualTransactionId *vxids;
|
||||
ProcArrayStruct *arrayP = procArray;
|
||||
int count = 0;
|
||||
int index;
|
||||
|
||||
Assert(type != 0);
|
||||
|
||||
/* allocate what's certainly enough result space */
|
||||
vxids = (VirtualTransactionId *)
|
||||
palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
|
||||
@ -3075,7 +3084,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
|
||||
int pgprocno = arrayP->pgprocnos[index];
|
||||
PGPROC *proc = &allProcs[pgprocno];
|
||||
|
||||
if (proc->delayChkpt)
|
||||
if ((proc->delayChkpt & type) != 0)
|
||||
{
|
||||
VirtualTransactionId vxid;
|
||||
|
||||
@ -3101,12 +3110,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
|
||||
* those numbers should be small enough for it not to be a problem.
|
||||
*/
|
||||
bool
|
||||
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
|
||||
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
|
||||
{
|
||||
bool result = false;
|
||||
ProcArrayStruct *arrayP = procArray;
|
||||
int index;
|
||||
|
||||
Assert(type != 0);
|
||||
|
||||
LWLockAcquire(ProcArrayLock, LW_SHARED);
|
||||
|
||||
for (index = 0; index < arrayP->numProcs; index++)
|
||||
@ -3117,7 +3128,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
|
||||
|
||||
GET_VXID_FROM_PGPROC(vxid, *proc);
|
||||
|
||||
if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
|
||||
if ((proc->delayChkpt & type) != 0 &&
|
||||
VirtualTransactionIdIsValid(vxid))
|
||||
{
|
||||
int i;
|
||||
|
||||
|
@ -394,7 +394,7 @@ InitProcess(void)
|
||||
MyProc->roleId = InvalidOid;
|
||||
MyProc->tempNamespaceId = InvalidOid;
|
||||
MyProc->isBackgroundWorker = IsBackgroundWorker;
|
||||
MyProc->delayChkpt = false;
|
||||
MyProc->delayChkpt = 0;
|
||||
MyProc->statusFlags = 0;
|
||||
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
|
||||
if (IsAutoVacuumWorkerProcess())
|
||||
@ -579,7 +579,7 @@ InitAuxiliaryProcess(void)
|
||||
MyProc->roleId = InvalidOid;
|
||||
MyProc->tempNamespaceId = InvalidOid;
|
||||
MyProc->isBackgroundWorker = IsBackgroundWorker;
|
||||
MyProc->delayChkpt = false;
|
||||
MyProc->delayChkpt = 0;
|
||||
MyProc->statusFlags = 0;
|
||||
MyProc->lwWaiting = false;
|
||||
MyProc->lwWaitMode = 0;
|
||||
|
@ -86,6 +86,41 @@ struct XidCache
|
||||
*/
|
||||
#define INVALID_PGPROCNO PG_INT32_MAX
|
||||
|
||||
/*
|
||||
* Flags for PGPROC.delayChkpt
|
||||
*
|
||||
* These flags can be used to delay the start or completion of a checkpoint
|
||||
* for short periods. A flag is in effect if the corresponding bit is set in
|
||||
* the PGPROC of any backend.
|
||||
*
|
||||
* For our purposes here, a checkpoint has three phases: (1) determine the
|
||||
* location to which the redo pointer will be moved, (2) write all the
|
||||
* data durably to disk, and (3) WAL-log the checkpoint.
|
||||
*
|
||||
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
|
||||
* to phase 2. This is useful when we are performing a WAL-logged modification
|
||||
* of data that will be flushed to disk in phase 2. By setting this flag
|
||||
* before writing WAL and clearing it after we've both written WAL and
|
||||
* performed the corresponding modification, we ensure that if the WAL record
|
||||
* is inserted prior to the new redo point, the corresponding data changes will
|
||||
* also be flushed to disk before the checkpoint can complete. (In the
|
||||
* extremely common case where the data being modified is in shared buffers
|
||||
* and we acquire an exclusive content lock on the relevant buffers before
|
||||
* writing WAL, this mechanism is not needed, because phase 2 will block
|
||||
* until we release the content lock and then flush the modified data to
|
||||
* disk.)
|
||||
*
|
||||
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
|
||||
* to phase 3. This is useful if we are performing a WAL-logged operation that
|
||||
* might invalidate buffers, such as relation truncation. In this case, we need
|
||||
* to ensure that any buffers which were invalidated and thus not flushed by
|
||||
* the checkpoint are actually destroyed on disk. Replay can cope with a file
|
||||
* or block that doesn't exist, but not with a block that has the wrong
|
||||
* contents.
|
||||
*/
|
||||
#define DELAY_CHKPT_START (1<<0)
|
||||
#define DELAY_CHKPT_COMPLETE (1<<1)
|
||||
|
||||
typedef enum
|
||||
{
|
||||
PROC_WAIT_STATUS_OK,
|
||||
@ -191,7 +226,7 @@ struct PGPROC
|
||||
pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
|
||||
* started */
|
||||
|
||||
bool delayChkpt; /* true if this proc delays checkpoint start */
|
||||
int delayChkpt; /* for DELAY_CHKPT_* flags */
|
||||
|
||||
uint8 statusFlags; /* this backend's status flags, see PROC_*
|
||||
* above. mirrored in
|
||||
|
@ -59,8 +59,9 @@ extern TransactionId GetOldestActiveTransactionId(void);
|
||||
extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
|
||||
extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
|
||||
|
||||
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
|
||||
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
|
||||
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
|
||||
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
|
||||
int nvxids, int type);
|
||||
|
||||
extern PGPROC *BackendPidGetProc(int pid);
|
||||
extern PGPROC *BackendPidGetProcWithLock(int pid);
|
||||
|
Loading…
x
Reference in New Issue
Block a user