
Allow I/O reliability checks using 16-bit checksums

Checksums are set immediately prior to flush out of shared buffers
and checked when pages are read in again. Hint bit setting will
require a full page write when the block is dirtied, which causes various
infrastructure changes. Extensive comments, docs and README.
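
To make that flow concrete, here is a minimal C sketch of the write and read paths described above. It is purely illustrative: the demo_* names and the assumed pd_checksum offset are inventions of this sketch, not the functions added by this commit, and demo_page_checksum16() is defined in the checksum sketch further below.

    #include <stdint.h>
    #include <stdbool.h>

    /* Defined in the 16-bit checksum sketch further below. */
    uint16_t demo_page_checksum16(const char *page, uint32_t blkno);

    /*
     * Write path: stamp the checksum into the page image immediately before
     * the buffer is flushed out of shared buffers.  (pd_checksum is assumed
     * to sit at byte offset 8, right after an 8-byte page LSN.)
     */
    void
    demo_checksum_before_flush(char *page, uint32_t blkno, bool checksums_enabled)
    {
        if (checksums_enabled)
            *(uint16_t *) (page + 8) = demo_page_checksum16(page, blkno);
    }

    /*
     * Read path: recompute the checksum when the page is read in again and
     * compare it against the stored value.
     */
    bool
    demo_checksum_matches(const char *page, uint32_t blkno, bool checksums_enabled)
    {
        if (!checksums_enabled)
            return true;        /* cluster was initdb'd without checksums */
        return *(const uint16_t *) (page + 8) == demo_page_checksum16(page, blkno);
    }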

WARNING message thrown if checksum fails on a non-all-zeroes page; an ERROR
is then thrown, but that can be disabled with ignore_checksum_failure = on.
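
A sketch of that escalation follows, again with hypothetical demo_* names; only the GUC name ignore_checksum_failure is taken from the message above.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for the ignore_checksum_failure GUC; off by default. */
    static bool ignore_checksum_failure = false;

    /*
     * Called when a checksum mismatch is detected on a page that is not all
     * zeroes.  The WARNING is always emitted; the subsequent ERROR, which
     * aborts the read, is skipped when ignore_checksum_failure is on.
     */
    static void
    demo_handle_checksum_failure(unsigned blkno)
    {
        fprintf(stderr, "WARNING: page verification failed in block %u\n", blkno);

        if (!ignore_checksum_failure)
        {
            fprintf(stderr, "ERROR: invalid page in block %u\n", blkno);
            exit(1);            /* stands in for ereport(ERROR, ...) */
        }
    }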

Feature enabled by an initdb option, since transition from option off
to option on is long and complex and has not yet been implemented.
Default is not to use checksums.

Checksum used is the WAL CRC-32 truncated to 16 bits.
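
A self-contained sketch of such a checksum is shown below. The bitwise CRC-32 uses the standard reflected polynomial; mixing the block number into the calculation, skipping the stored pd_checksum field, and truncating to the low 16 bits are assumptions of this sketch, and the demo_* names and page layout are hypothetical.

    #include <stddef.h>
    #include <stdint.h>

    #define DEMO_BLCKSZ   8192      /* assumed page size */
    #define DEMO_LSN_LEN  8         /* pd_lsn: first field on the page */
    #define DEMO_SUM_LEN  2         /* pd_checksum: excluded from the CRC */

    /* Plain reflected CRC-32 (polynomial 0xEDB88320), computed bit by bit. */
    static uint32_t
    demo_crc32(uint32_t crc, const void *data, size_t len)
    {
        const unsigned char *p = data;

        while (len--)
        {
            crc ^= *p++;
            for (int bit = 0; bit < 8; bit++)
                crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
        }
        return crc;
    }

    /*
     * Compute a 16-bit page checksum: CRC-32 over the block number, the page
     * LSN, and the rest of the page (skipping the stored checksum itself),
     * truncated to the low 16 bits.
     */
    uint16_t
    demo_page_checksum16(const char *page, uint32_t blkno)
    {
        uint32_t    crc = 0xFFFFFFFFu;

        crc = demo_crc32(crc, &blkno, sizeof(blkno));
        crc = demo_crc32(crc, page, DEMO_LSN_LEN);
        crc = demo_crc32(crc, page + DEMO_LSN_LEN + DEMO_SUM_LEN,
                         DEMO_BLCKSZ - DEMO_LSN_LEN - DEMO_SUM_LEN);
        crc ^= 0xFFFFFFFFu;

        return (uint16_t) crc;
    }

Truncating an existing, well-exercised CRC keeps the on-page overhead to two bytes while reusing checksum code of the kind the server already relies on for WAL.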

Simon Riggs, Jeff Davis, Greg Smith
Wide input and assistance from many community members. Thank you.
Commit 96ef3b8ff1 by Simon Riggs, 2013-03-22 13:54:07 +00:00 (parent e4a05c7512)
40 changed files with 766 additions and 146 deletions

src/backend/access/heap/heapam.c

@@ -5754,17 +5754,23 @@ log_heap_freeze(Relation reln, Buffer buffer,
* being marked all-visible, and vm_buffer is the buffer containing the
* corresponding visibility map block. Both should have already been modified
* and dirtied.
*
* If checksums are enabled, we also add the heap_buffer to the chain to
* protect it from being torn.
*/
XLogRecPtr
log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
TransactionId cutoff_xid)
{
xl_heap_visible xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
XLogRecData rdata[3];
Assert(BufferIsValid(heap_buffer));
Assert(BufferIsValid(vm_buffer));
xlrec.node = rnode;
xlrec.block = block;
xlrec.block = BufferGetBlockNumber(heap_buffer);
xlrec.cutoff_xid = cutoff_xid;
rdata[0].data = (char *) &xlrec;
@@ -5778,6 +5784,17 @@ log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
rdata[1].buffer_std = false;
rdata[1].next = NULL;
if (DataChecksumsEnabled())
{
rdata[1].next = &(rdata[2]);
rdata[2].data = NULL;
rdata[2].len = 0;
rdata[2].buffer = heap_buffer;
rdata[2].buffer_std = true;
rdata[2].next = NULL;
}
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE, rdata);
return recptr;
@@ -6139,8 +6156,6 @@ static void
heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
{
xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
Buffer buffer;
Page page;
/*
* If there are any Hot Standby transactions running that have an xmin
@@ -6155,39 +6170,56 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node);
/*
* Read the heap page, if it still exists. If the heap file has been
* dropped or truncated later in recovery, we don't need to update the
* page, but we'd better still update the visibility map.
* If heap block was backed up, restore it. This can only happen with
* checksums enabled.
*/
buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block,
RBM_NORMAL);
if (BufferIsValid(buffer))
if (record->xl_info & XLR_BKP_BLOCK(1))
{
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
page = (Page) BufferGetPage(buffer);
Assert(DataChecksumsEnabled());
(void) RestoreBackupBlock(lsn, record, 1, false, false);
}
else
{
Buffer buffer;
Page page;
/*
* We don't bump the LSN of the heap page when setting the visibility
* map bit, because that would generate an unworkable volume of
* full-page writes. This exposes us to torn page hazards, but since
* we're not inspecting the existing page contents in any way, we
* don't care.
*
* However, all operations that clear the visibility map bit *do* bump
* the LSN, and those operations will only be replayed if the XLOG LSN
* follows the page LSN. Thus, if the page LSN has advanced past our
* XLOG record's LSN, we mustn't mark the page all-visible, because
* the subsequent update won't be replayed to clear the flag.
* Read the heap page, if it still exists. If the heap file has been
* dropped or truncated later in recovery, we don't need to update the
* page, but we'd better still update the visibility map.
*/
if (lsn > PageGetLSN(page))
buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM,
xlrec->block, RBM_NORMAL);
if (BufferIsValid(buffer))
{
PageSetAllVisible(page);
MarkBufferDirty(buffer);
}
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/* Done with heap page. */
UnlockReleaseBuffer(buffer);
page = (Page) BufferGetPage(buffer);
/*
* We don't bump the LSN of the heap page when setting the
* visibility map bit (unless checksums are enabled, in which case
* we must), because that would generate an unworkable volume of
* full-page writes. This exposes us to torn page hazards, but
* since we're not inspecting the existing page contents in any
* way, we don't care.
*
* However, all operations that clear the visibility map bit *do*
* bump the LSN, and those operations will only be replayed if the
* XLOG LSN follows the page LSN. Thus, if the page LSN has
* advanced past our XLOG record's LSN, we mustn't mark the page
* all-visible, because the subsequent update won't be replayed to
* clear the flag.
*/
if (lsn > PageGetLSN(page))
{
PageSetAllVisible(page);
MarkBufferDirty(buffer);
}
/* Done with heap page. */
UnlockReleaseBuffer(buffer);
}
}
/*
@@ -6218,7 +6250,7 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
* real harm is done; and the next VACUUM will fix it.
*/
if (lsn > PageGetLSN(BufferGetPage(vmbuffer)))
visibilitymap_set(reln, xlrec->block, lsn, vmbuffer,
visibilitymap_set(reln, xlrec->block, InvalidBuffer, lsn, vmbuffer,
xlrec->cutoff_xid);
ReleaseBuffer(vmbuffer);

src/backend/access/heap/pruneheap.c

@@ -262,7 +262,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
{
((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
PageClearFull(page);
SetBufferCommitInfoNeedsSave(buffer);
MarkBufferDirtyHint(buffer);
}
}

src/backend/access/heap/rewriteheap.c

@@ -273,6 +273,8 @@ end_heap_rewrite(RewriteState state)
/* Write the last page, if any */
if (state->rs_buffer_valid)
{
PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,
MAIN_FORKNUM,
@@ -614,6 +616,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
{
/* Doesn't fit, so write out the existing page */
PageSetChecksumInplace(page, state->rs_blockno);
/* XLOG stuff */
if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,

src/backend/access/heap/visibilitymap.c

@@ -233,13 +233,18 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
* marked all-visible; it is needed for Hot Standby, and can be
* InvalidTransactionId if the page contains no tuples.
*
* Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
* this function. Except in recovery, caller should also pass the heap
* buffer. When checksums are enabled and we're not in recovery, we must add
* the heap buffer to the WAL chain to protect it from being torn.
*
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
* any I/O.
*/
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
Buffer buf, TransactionId cutoff_xid)
visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
@@ -252,34 +257,55 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
#endif
Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
Assert(InRecovery || BufferIsValid(heapBuf));
/* Check that we have the right page pinned */
if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
elog(ERROR, "wrong buffer passed to visibilitymap_set");
/* Check that we have the right heap page pinned, if present */
if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
elog(ERROR, "wrong heap buffer passed to visibilitymap_set");
page = BufferGetPage(buf);
/* Check that we have the right VM page pinned */
if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
page = BufferGetPage(vmBuf);
map = PageGetContents(page);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
if (!(map[mapByte] & (1 << mapBit)))
{
START_CRIT_SECTION();
map[mapByte] |= (1 << mapBit);
MarkBufferDirty(buf);
MarkBufferDirty(vmBuf);
if (RelationNeedsWAL(rel))
{
if (XLogRecPtrIsInvalid(recptr))
recptr = log_heap_visible(rel->rd_node, heapBlk, buf,
{
Assert(!InRecovery);
recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
cutoff_xid);
/*
* If data checksums are enabled, we need to protect the heap
* page from being torn.
*/
if (DataChecksumsEnabled())
{
Page heapPage = BufferGetPage(heapBuf);
/* caller is expected to set PD_ALL_VISIBLE first */
Assert(PageIsAllVisible(heapPage));
PageSetLSN(heapPage, recptr);
}
}
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
}
/*
@@ -579,6 +605,8 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
/* Now extend the file */
while (vm_nblocks_now < vm_nblocks)
{
PageSetChecksumInplace(pg, vm_nblocks_now);
smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
(char *) pg, false);
vm_nblocks_now++;