mirror of
https://github.com/postgres/postgres.git
synced 2025-12-01 12:18:01 +03:00
Add WAL consistency checking facility.
When the new GUC wal_consistency_checking is set to a non-empty value, it triggers recording of additional full-page images, which are compared on the standby against the results of applying the WAL record (without regard to those full-page images). Allowable differences such as hints are masked out, and the resulting pages are compared; any difference results in a FATAL error on the standby. Kuntal Ghosh, based on earlier patches by Michael Paquier and Heikki Linnakangas. Extensively reviewed and revised by Michael Paquier and by me, with additional reviews and comments from Amit Kapila, Álvaro Herrera, Simon Riggs, and Peter Eisentraut.
This commit is contained in:
@@ -13,6 +13,7 @@
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/bufmask.h"
|
||||
#include "access/generic_xlog.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "miscadmin.h"
|
||||
@@ -533,3 +534,14 @@ generic_redo(XLogReaderState *record)
|
||||
UnlockReleaseBuffer(buffers[block_id]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Mask a generic page before performing consistency checks on it.
|
||||
*/
|
||||
void
|
||||
generic_mask(char *page, BlockNumber blkno)
|
||||
{
|
||||
mask_page_lsn(page);
|
||||
|
||||
mask_unused_space(page);
|
||||
}
|
||||
|
||||
@@ -30,8 +30,8 @@
|
||||
#include "utils/relmapper.h"
|
||||
|
||||
/* must be kept in sync with RmgrData definition in xlog_internal.h */
|
||||
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
|
||||
{ name, redo, desc, identify, startup, cleanup },
|
||||
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
|
||||
{ name, redo, desc, identify, startup, cleanup, mask },
|
||||
|
||||
const RmgrData RmgrTable[RM_MAX_ID + 1] = {
|
||||
#include "access/rmgrlist.h"
|
||||
|
||||
@@ -95,6 +95,8 @@ bool EnableHotStandby = false;
|
||||
bool fullPageWrites = true;
|
||||
bool wal_log_hints = false;
|
||||
bool wal_compression = false;
|
||||
char *wal_consistency_checking_string = NULL;
|
||||
bool *wal_consistency_checking = NULL;
|
||||
bool log_checkpoints = false;
|
||||
int sync_method = DEFAULT_SYNC_METHOD;
|
||||
int wal_level = WAL_LEVEL_MINIMAL;
|
||||
@@ -245,6 +247,10 @@ bool InArchiveRecovery = false;
|
||||
/* Was the last xlog file restored from archive, or local? */
|
||||
static bool restoredFromArchive = false;
|
||||
|
||||
/* Buffers dedicated to consistency checks of size BLCKSZ */
|
||||
static char *replay_image_masked = NULL;
|
||||
static char *master_image_masked = NULL;
|
||||
|
||||
/* options taken from recovery.conf for archive recovery */
|
||||
char *recoveryRestoreCommand = NULL;
|
||||
static char *recoveryEndCommand = NULL;
|
||||
@@ -903,6 +909,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr);
|
||||
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
|
||||
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
|
||||
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
|
||||
static void checkXLogConsistency(XLogReaderState *record);
|
||||
|
||||
static void WALInsertLockAcquire(void);
|
||||
static void WALInsertLockAcquireExclusive(void);
|
||||
@@ -1314,6 +1321,103 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether the current buffer page and backup page stored in the
|
||||
* WAL record are consistent or not. Before comparing the two pages, a
|
||||
* masking can be applied to the pages to ignore certain areas like hint bits,
|
||||
* unused space between pd_lower and pd_upper among other things. This
|
||||
* function should be called once WAL replay has been completed for a
|
||||
* given record.
|
||||
*/
|
||||
static void
|
||||
checkXLogConsistency(XLogReaderState *record)
|
||||
{
|
||||
RmgrId rmid = XLogRecGetRmid(record);
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blkno;
|
||||
int block_id;
|
||||
|
||||
/* Records with no backup blocks have no need for consistency checks. */
|
||||
if (!XLogRecHasAnyBlockRefs(record))
|
||||
return;
|
||||
|
||||
Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
|
||||
|
||||
for (block_id = 0; block_id <= record->max_block_id; block_id++)
|
||||
{
|
||||
Buffer buf;
|
||||
Page page;
|
||||
|
||||
if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
|
||||
{
|
||||
/*
|
||||
* WAL record doesn't contain a block reference with the given id.
|
||||
* Do nothing.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
Assert(XLogRecHasBlockImage(record, block_id));
|
||||
|
||||
/*
|
||||
* Read the contents from the current buffer and store it in a
|
||||
* temporary page.
|
||||
*/
|
||||
buf = XLogReadBufferExtended(rnode, forknum, blkno,
|
||||
RBM_NORMAL_NO_LOG);
|
||||
if (!BufferIsValid(buf))
|
||||
continue;
|
||||
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
/*
|
||||
* Take a copy of the local page where WAL has been applied to have a
|
||||
* comparison base before masking it...
|
||||
*/
|
||||
memcpy(replay_image_masked, page, BLCKSZ);
|
||||
|
||||
/* No need for this page anymore now that a copy is in. */
|
||||
UnlockReleaseBuffer(buf);
|
||||
|
||||
/*
|
||||
* If the block LSN is already ahead of this WAL record, we can't
|
||||
* expect contents to match. This can happen if recovery is restarted.
|
||||
*/
|
||||
if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Read the contents from the backup copy, stored in WAL record and
|
||||
* store it in a temporary page. There is not need to allocate a new
|
||||
* page here, a local buffer is fine to hold its contents and a mask
|
||||
* can be directly applied on it.
|
||||
*/
|
||||
if (!RestoreBlockImage(record, block_id, master_image_masked))
|
||||
elog(ERROR, "failed to restore block image");
|
||||
|
||||
/*
|
||||
* If masking function is defined, mask both the master and replay
|
||||
* images
|
||||
*/
|
||||
if (RmgrTable[rmid].rm_mask != NULL)
|
||||
{
|
||||
RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
|
||||
RmgrTable[rmid].rm_mask(master_image_masked, blkno);
|
||||
}
|
||||
|
||||
/* Time to compare the master and replay images. */
|
||||
if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
|
||||
{
|
||||
elog(FATAL,
|
||||
"inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
|
||||
rnode.spcNode, rnode.dbNode, rnode.relNode,
|
||||
forknum, blkno);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
|
||||
* area in the WAL.
|
||||
@@ -6200,6 +6304,13 @@ StartupXLOG(void)
|
||||
errdetail("Failed while allocating an XLog reading processor.")));
|
||||
xlogreader->system_identifier = ControlFile->system_identifier;
|
||||
|
||||
/*
|
||||
* Allocate pages dedicated to WAL consistency checks, those had better
|
||||
* be aligned.
|
||||
*/
|
||||
replay_image_masked = (char *) palloc(BLCKSZ);
|
||||
master_image_masked = (char *) palloc(BLCKSZ);
|
||||
|
||||
if (read_backup_label(&checkPointLoc, &backupEndRequired,
|
||||
&backupFromStandby))
|
||||
{
|
||||
@@ -7000,6 +7111,15 @@ StartupXLOG(void)
|
||||
/* Now apply the WAL record itself */
|
||||
RmgrTable[record->xl_rmid].rm_redo(xlogreader);
|
||||
|
||||
/*
|
||||
* After redo, check whether the backup pages associated with
|
||||
* the WAL record are consistent with the existing pages. This
|
||||
* check is done only if consistency check is enabled for this
|
||||
* record.
|
||||
*/
|
||||
if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
|
||||
checkXLogConsistency(xlogreader);
|
||||
|
||||
/* Pop the error context stack */
|
||||
error_context_stack = errcallback.previous;
|
||||
|
||||
|
||||
@@ -421,10 +421,12 @@ XLogInsert(RmgrId rmid, uint8 info)
|
||||
elog(ERROR, "XLogBeginInsert was not called");
|
||||
|
||||
/*
|
||||
* The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are
|
||||
* reserved for use by me.
|
||||
* The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and
|
||||
* XLR_CHECK_CONSISTENCY; the rest are reserved for use by me.
|
||||
*/
|
||||
if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0)
|
||||
if ((info & ~(XLR_RMGR_INFO_MASK |
|
||||
XLR_SPECIAL_REL_UPDATE |
|
||||
XLR_CHECK_CONSISTENCY)) != 0)
|
||||
elog(PANIC, "invalid xlog info mask %02X", info);
|
||||
|
||||
TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
|
||||
@@ -504,6 +506,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
|
||||
rdt_datas_last = &hdr_rdt;
|
||||
hdr_rdt.data = hdr_scratch;
|
||||
|
||||
/*
|
||||
* Enforce consistency checks for this record if user is looking for
|
||||
* it. Do this at the beginning of this routine to give the
|
||||
* possibility for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY
|
||||
* directly for a record.
|
||||
*/
|
||||
if (wal_consistency_checking[rmid])
|
||||
info |= XLR_CHECK_CONSISTENCY;
|
||||
|
||||
/*
|
||||
* Make an rdata chain containing all the data portions of all block
|
||||
* references. This includes the data for full-page images. Also append
|
||||
@@ -520,6 +531,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
|
||||
XLogRecordBlockCompressHeader cbimg = {0};
|
||||
bool samerel;
|
||||
bool is_compressed = false;
|
||||
bool include_image;
|
||||
|
||||
if (!regbuf->in_use)
|
||||
continue;
|
||||
@@ -563,7 +575,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
|
||||
if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT)
|
||||
bkpb.fork_flags |= BKPBLOCK_WILL_INIT;
|
||||
|
||||
if (needs_backup)
|
||||
/*
|
||||
* If needs_backup is true or WAL checking is enabled for
|
||||
* current resource manager, log a full-page write for the current
|
||||
* block.
|
||||
*/
|
||||
include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0;
|
||||
|
||||
if (include_image)
|
||||
{
|
||||
Page page = regbuf->page;
|
||||
uint16 compressed_len;
|
||||
@@ -625,6 +644,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
|
||||
|
||||
bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE;
|
||||
|
||||
/*
|
||||
* If WAL consistency checking is enabled for the resource manager of
|
||||
* this WAL record, a full-page image is included in the record
|
||||
* for the block modified. During redo, the full-page is replayed
|
||||
* only if BKPIMAGE_APPLY is set.
|
||||
*/
|
||||
if (needs_backup)
|
||||
bimg.bimg_info |= BKPIMAGE_APPLY;
|
||||
|
||||
if (is_compressed)
|
||||
{
|
||||
bimg.length = compressed_len;
|
||||
@@ -687,7 +715,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
|
||||
/* Ok, copy the header to the scratch buffer */
|
||||
memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader);
|
||||
scratch += SizeOfXLogRecordBlockHeader;
|
||||
if (needs_backup)
|
||||
if (include_image)
|
||||
{
|
||||
memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader);
|
||||
scratch += SizeOfXLogRecordBlockImageHeader;
|
||||
|
||||
@@ -997,6 +997,7 @@ ResetDecoder(XLogReaderState *state)
|
||||
state->blocks[block_id].in_use = false;
|
||||
state->blocks[block_id].has_image = false;
|
||||
state->blocks[block_id].has_data = false;
|
||||
state->blocks[block_id].apply_image = false;
|
||||
}
|
||||
state->max_block_id = -1;
|
||||
}
|
||||
@@ -1089,6 +1090,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
|
||||
|
||||
blk = &state->blocks[block_id];
|
||||
blk->in_use = true;
|
||||
blk->apply_image = false;
|
||||
|
||||
COPY_HEADER_FIELD(&fork_flags, sizeof(uint8));
|
||||
blk->forknum = fork_flags & BKPBLOCK_FORK_MASK;
|
||||
@@ -1120,6 +1122,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
|
||||
COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16));
|
||||
COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16));
|
||||
COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8));
|
||||
|
||||
blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0);
|
||||
|
||||
if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED)
|
||||
{
|
||||
if (blk->bimg_info & BKPIMAGE_HAS_HOLE)
|
||||
@@ -1243,6 +1248,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
|
||||
|
||||
if (!blk->in_use)
|
||||
continue;
|
||||
|
||||
Assert(blk->has_image || !blk->apply_image);
|
||||
|
||||
if (blk->has_image)
|
||||
{
|
||||
blk->bkp_image = ptr;
|
||||
|
||||
@@ -275,9 +275,9 @@ XLogCheckInvalidPages(void)
|
||||
* will complain if we don't have the lock. In hot standby mode it's
|
||||
* definitely necessary.)
|
||||
*
|
||||
* Note: when a backup block is available in XLOG, we restore it
|
||||
* unconditionally, even if the page in the database appears newer. This is
|
||||
* to protect ourselves against database pages that were partially or
|
||||
* Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
|
||||
* set, we restore it, even if the page in the database appears newer. This
|
||||
* is to protect ourselves against database pages that were partially or
|
||||
* incorrectly written during a crash. We assume that the XLOG data must be
|
||||
* good because it has passed a CRC check, while the database page might not
|
||||
* be. This will force us to replay all subsequent modifications of the page
|
||||
@@ -352,9 +352,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
|
||||
if (!willinit && zeromode)
|
||||
elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
|
||||
|
||||
/* If it's a full-page image, restore it. */
|
||||
if (XLogRecHasBlockImage(record, block_id))
|
||||
/* If it has a full-page image and it should be restored, do it. */
|
||||
if (XLogRecBlockImageApply(record, block_id))
|
||||
{
|
||||
Assert(XLogRecHasBlockImage(record, block_id));
|
||||
*buf = XLogReadBufferExtended(rnode, forknum, blkno,
|
||||
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
|
||||
page = BufferGetPage(*buf);
|
||||
|
||||
Reference in New Issue
Block a user