diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index 5d0deaba61e..f97373e52e9 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -19,6 +19,7 @@ #include "funcapi.h" #include "miscadmin.h" #include "storage/bufmgr.h" +#include "storage/proc.h" #include "storage/procarray.h" #include "storage/read_stream.h" #include "storage/smgr.h" @@ -390,6 +391,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) Relation rel; ForkNumber fork; BlockNumber block; + BlockNumber old_block; rel = relation_open(relid, AccessExclusiveLock); @@ -399,15 +401,22 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) /* Forcibly reset cached file size */ RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber; + /* Compute new and old size before entering critical section. */ + fork = VISIBILITYMAP_FORKNUM; block = visibilitymap_prepare_truncate(rel, 0); - if (BlockNumberIsValid(block)) - { - fork = VISIBILITYMAP_FORKNUM; - smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block); - } + old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0; + + /* + * WAL-logging, buffer dropping, file truncation must be atomic and all on + * one side of a checkpoint. See RelationTruncate() for discussion. + */ + Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE; + START_CRIT_SECTION(); if (RelationNeedsWAL(rel)) { + XLogRecPtr lsn; xl_smgr_truncate xlrec; xlrec.blkno = 0; @@ -417,9 +426,17 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_SMGR_ID, + XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + XLogFlush(lsn); } + if (BlockNumberIsValid(block)) + smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE); + /* * Release the lock right away, not at commit time. * diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index bb8c4d15612..5b22cf10990 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -291,6 +291,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) bool vm; bool need_fsm_vacuum = false; ForkNumber forks[MAX_FORKNUM]; + BlockNumber old_blocks[MAX_FORKNUM]; BlockNumber blocks[MAX_FORKNUM]; int nforks = 0; SMgrRelation reln; @@ -306,6 +307,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) /* Prepare for truncation of MAIN fork of the relation */ forks[nforks] = MAIN_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM); blocks[nforks] = nblocks; nforks++; @@ -317,6 +319,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) if (BlockNumberIsValid(blocks[nforks])) { forks[nforks] = FSM_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM); nforks++; need_fsm_vacuum = true; } @@ -330,6 +333,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) if (BlockNumberIsValid(blocks[nforks])) { forks[nforks] = VISIBILITYMAP_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM); nforks++; } } @@ -366,14 +370,20 @@ RelationTruncate(Relation rel, BlockNumber nblocks) MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE; /* - * We WAL-log the truncation before actually truncating, which means - * trouble if the truncation fails. If we then crash, the WAL replay - * likely isn't going to succeed in the truncation either, and cause a - * PANIC. It's tempting to put a critical section here, but that cure - * would be worse than the disease. It would turn a usually harmless - * failure to truncate, that might spell trouble at WAL replay, into a - * certain PANIC. + * We WAL-log the truncation first and then truncate in a critical + * section. Truncation drops buffers, even if dirty, and then truncates + * disk files. All of that work needs to complete before the lock is + * released, or else old versions of pages on disk that are missing recent + * changes would become accessible again. We'll try the whole operation + * again in crash recovery if we panic, but even then we can't give up + * because we don't want standbys' relation sizes to diverge and break + * replay or visibility invariants downstream. The critical section also + * suppresses interrupts. + * + * (See also pg_visibilitymap.c if changing this code.) */ + START_CRIT_SECTION(); + if (RelationNeedsWAL(rel)) { /* @@ -397,10 +407,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks) * hit the disk before the WAL record, and the truncation of the FSM * or visibility map. If we crashed during that window, we'd be left * with a truncated heap, but the FSM or visibility map would still - * contain entries for the non-existent heap pages. + * contain entries for the non-existent heap pages, and standbys would + * also never replay the truncation. */ - if (fsm || vm) - XLogFlush(lsn); + XLogFlush(lsn); } /* @@ -408,7 +418,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks) * longer exist after truncation is complete, and then truncate the * corresponding files on disk. */ - smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks); + smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks); + + END_CRIT_SECTION(); /* We've done all the critical work, so checkpoints are OK now. */ MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE); @@ -973,6 +985,7 @@ smgr_redo(XLogReaderState *record) Relation rel; ForkNumber forks[MAX_FORKNUM]; BlockNumber blocks[MAX_FORKNUM]; + BlockNumber old_blocks[MAX_FORKNUM]; int nforks = 0; bool need_fsm_vacuum = false; @@ -1007,6 +1020,7 @@ smgr_redo(XLogReaderState *record) if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0) { forks[nforks] = MAIN_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM); blocks[nforks] = xlrec->blkno; nforks++; @@ -1024,6 +1038,7 @@ smgr_redo(XLogReaderState *record) if (BlockNumberIsValid(blocks[nforks])) { forks[nforks] = FSM_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM); nforks++; need_fsm_vacuum = true; } @@ -1035,13 +1050,18 @@ smgr_redo(XLogReaderState *record) if (BlockNumberIsValid(blocks[nforks])) { forks[nforks] = VISIBILITYMAP_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM); nforks++; } } /* Do the real work to truncate relation forks */ if (nforks > 0) - smgrtruncate(reln, forks, nforks, blocks); + { + START_CRIT_SECTION(); + smgrtruncate(reln, forks, nforks, old_blocks, blocks); + END_CRIT_SECTION(); + } /* * Update upper-level FSM pages to account for the truncation. This is diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 4a897a83593..11fccda475f 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1154,19 +1154,21 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) /* * mdtruncate() -- Truncate relation to specified number of blocks. + * + * Guaranteed not to allocate memory, so it can be used in a critical section. + * Caller must have called smgrnblocks() to obtain curnblk while holding a + * sufficient lock to prevent a change in relation size, and not used any smgr + * functions for this relation or handled interrupts in between. This makes + * sure we have opened all active segments, so that truncate loop will get + * them all! */ void -mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +mdtruncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber curnblk, BlockNumber nblocks) { - BlockNumber curnblk; BlockNumber priorblocks; int curopensegs; - /* - * NOTE: mdnblocks makes sure we have opened all active segments, so that - * truncation loop will get them all! - */ - curnblk = mdnblocks(reln, forknum); if (nblocks > curnblk) { /* Bogus request ... but no complaint if InRecovery */ @@ -1505,7 +1507,7 @@ _fdvec_resize(SMgrRelation reln, reln->md_seg_fds[forknum] = MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg); } - else + else if (nseg > reln->md_num_open_segs[forknum]) { /* * It doesn't seem worthwhile complicating the code to amortize @@ -1517,6 +1519,16 @@ _fdvec_resize(SMgrRelation reln, repalloc(reln->md_seg_fds[forknum], sizeof(MdfdVec) * nseg); } + else + { + /* + * We don't reallocate a smaller array, because we want mdtruncate() + * to be able to promise that it won't allocate memory, so that it is + * allowed in a critical section. This means that a bit of space in + * the array is now wasted, until the next time we add a segment and + * reallocate. + */ + } reln->md_num_open_segs[forknum] = nseg; } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 925728eb6c1..36ad34aa6ac 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -101,7 +101,7 @@ typedef struct f_smgr BlockNumber blocknum, BlockNumber nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); + BlockNumber old_blocks, BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); } f_smgr; @@ -719,10 +719,15 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum) * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends - * before they access any forks of the relation again. + * before they access any forks of the relation again. The current size of + * the forks should be provided in old_nblocks. This function should normally + * be called in a critical section, but the current size must be checked + * outside the critical section, and no interrupts or smgr functions relating + * to this relation should be called in between. */ void -smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks) +smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, + BlockNumber *old_nblocks, BlockNumber *nblocks) { int i; @@ -750,7 +755,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], + old_nblocks[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The diff --git a/src/include/storage/md.h b/src/include/storage/md.h index b72293c79a5..e7671dd6c18 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -43,7 +43,7 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); + BlockNumber old_blocks, BlockNumber nblocks); extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); extern void mdregistersync(SMgrRelation reln, ForkNumber forknum); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 5ab992f5bd5..63a186bd346 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -105,8 +105,9 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum); -extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, - int nforks, BlockNumber *nblocks); +extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, + BlockNumber *old_nblocks, + BlockNumber *nblocks); extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void);