diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d3bfe41485d..935bdb0a0e1 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -817,8 +817,14 @@ static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */ static XLogRecPtr ReadRecPtr; /* start of last record read */ static XLogRecPtr EndRecPtr; /* end+1 of last record read */ -static XLogRecPtr minRecoveryPoint; /* local copy of - * ControlFile->minRecoveryPoint */ +/* + * Local copies of equivalent fields in the control file. When running + * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we + * expect to replay all the WAL available, and updateMinRecoveryPoint is + * switched to false to prevent any updates while replaying records. + * Those values are kept consistent as long as crash recovery runs. + */ +static XLogRecPtr minRecoveryPoint; static TimeLineID minRecoveryPointTLI; static bool updateMinRecoveryPoint = true; @@ -2685,20 +2691,26 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint)) return; + /* + * An invalid minRecoveryPoint means that we need to recover all the WAL, + * i.e., we're doing crash recovery. We never modify the control file's + * value in that case, so we can short-circuit future checks here too. The + * local values of minRecoveryPoint and minRecoveryPointTLI should not be + * updated until crash recovery finishes. + */ + if (XLogRecPtrIsInvalid(minRecoveryPoint)) + { + updateMinRecoveryPoint = false; + return; + } + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); /* update local copy */ minRecoveryPoint = ControlFile->minRecoveryPoint; minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; - /* - * An invalid minRecoveryPoint means that we need to recover all the WAL, - * i.e., we're doing crash recovery. We never modify the control file's - * value in that case, so we can short-circuit future checks here too. - */ - if (minRecoveryPoint == 0) - updateMinRecoveryPoint = false; - else if (force || minRecoveryPoint < lsn) + if (force || minRecoveryPoint < lsn) { XLogRecPtr newMinRecoveryPoint; TimeLineID newMinRecoveryPointTLI; @@ -3083,7 +3095,16 @@ XLogNeedsFlush(XLogRecPtr record) */ if (RecoveryInProgress()) { - /* Quick exit if already known updated */ + /* + * An invalid minRecoveryPoint means that we need to recover all the + * WAL, i.e., we're doing crash recovery. We never modify the control + * file's value in that case, so we can short-circuit future checks + * here too. + */ + if (XLogRecPtrIsInvalid(minRecoveryPoint)) + updateMinRecoveryPoint = false; + + /* Quick exit if already known to be updated or cannot be updated */ if (record <= minRecoveryPoint || !updateMinRecoveryPoint) return false; @@ -3097,20 +3118,8 @@ XLogNeedsFlush(XLogRecPtr record) minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; LWLockRelease(ControlFileLock); - /* - * An invalid minRecoveryPoint means that we need to recover all the - * WAL, i.e., we're doing crash recovery. We never modify the control - * file's value in that case, so we can short-circuit future checks - * here too. - */ - if (minRecoveryPoint == 0) - updateMinRecoveryPoint = false; - /* check again */ - if (record <= minRecoveryPoint || !updateMinRecoveryPoint) - return false; - else - return true; + return record > minRecoveryPoint; } /* Quick exit if already known flushed */ @@ -4258,6 +4267,12 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, minRecoveryPoint = ControlFile->minRecoveryPoint; minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + /* + * The startup process can update its local copy of + * minRecoveryPoint from this point. + */ + updateMinRecoveryPoint = true; + UpdateControlFile(); LWLockRelease(ControlFileLock); @@ -6837,9 +6852,26 @@ StartupXLOG(void) /* No need to hold ControlFileLock yet, we aren't up far enough */ UpdateControlFile(); - /* initialize our local copy of minRecoveryPoint */ - minRecoveryPoint = ControlFile->minRecoveryPoint; - minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + /* + * Initialize our local copy of minRecoveryPoint. When doing crash + * recovery we want to replay up to the end of WAL. Particularly, in + * the case of a promoted standby minRecoveryPoint value in the + * control file is only updated after the first checkpoint. However, + * if the instance crashes before the first post-recovery checkpoint + * is completed then recovery will use a stale location causing the + * startup process to think that there are still invalid page + * references when checking for data consistency. + */ + if (InArchiveRecovery) + { + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + else + { + minRecoveryPoint = InvalidXLogRecPtr; + minRecoveryPointTLI = 0; + } /* * Reset pgstat data, because it may be invalid after recovery. @@ -7806,6 +7838,8 @@ CheckRecoveryConsistency(void) if (XLogRecPtrIsInvalid(minRecoveryPoint)) return; + Assert(InArchiveRecovery); + /* * assume that we are called in the startup process, and hence don't need * a lock to read lastReplayedEndRecPtr @@ -9913,11 +9947,16 @@ xlog_redo(XLogReaderState *record) * Update minRecoveryPoint to ensure that if recovery is aborted, we * recover back up to this point before allowing hot standby again. * This is important if the max_* settings are decreased, to ensure - * you don't run queries against the WAL preceding the change. + * you don't run queries against the WAL preceding the change. The + * local copies cannot be updated as long as crash recovery is + * happening and we expect all the WAL to be replayed. */ - minRecoveryPoint = ControlFile->minRecoveryPoint; - minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; - if (minRecoveryPoint != 0 && minRecoveryPoint < lsn) + if (InArchiveRecovery) + { + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn) { ControlFile->minRecoveryPoint = lsn; ControlFile->minRecoveryPointTLI = ThisTimeLineID; diff --git a/src/test/recovery/t/013_promotion_pages.pl b/src/test/recovery/t/013_promotion_pages.pl new file mode 100644 index 00000000000..48f941b9631 --- /dev/null +++ b/src/test/recovery/t/013_promotion_pages.pl @@ -0,0 +1,87 @@ +# Test for promotion handling with WAL records generated post-promotion +# before the first checkpoint is generated. This test case checks for +# invalid page references at replay based on the minimum consistent +# recovery point defined. +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 1; + +# Initialize primary node +my $alpha = get_new_node('alpha'); +$alpha->init(allows_streaming => 1); +# Setting wal_log_hints to off is important to get invalid page +# references. +$alpha->append_conf("postgresql.conf", <start; + +# setup/start a standby +$alpha->backup('bkp'); +my $bravo = get_new_node('bravo'); +$bravo->init_from_backup($alpha, 'bkp', has_streaming => 1); +$bravo->append_conf('postgresql.conf', <start; + +# Dummy table for the upcoming tests. +$alpha->safe_psql('postgres', 'create table test1 (a int)'); +$alpha->safe_psql('postgres', 'insert into test1 select generate_series(1, 10000)'); + +# take a checkpoint +$alpha->safe_psql('postgres', 'checkpoint'); + +# The following vacuum will set visibility map bits and create +# problematic WAL records. +$alpha->safe_psql('postgres', 'vacuum verbose test1'); +# Wait for last record to have been replayed on the standby. +$alpha->wait_for_catchup($bravo, 'replay', + $alpha->lsn('insert')); + +# Now force a checkpoint on the standby. This seems unnecessary but for "some" +# reason, the previous checkpoint on the primary does not reflect on the standby +# and without an explicit checkpoint, it may start redo recovery from a much +# older point, which includes even create table and initial page additions. +$bravo->safe_psql('postgres', 'checkpoint'); + +# Now just use a dummy table and run some operations to move minRecoveryPoint +# beyond the previous vacuum. +$alpha->safe_psql('postgres', 'create table test2 (a int, b text)'); +$alpha->safe_psql('postgres', 'insert into test2 select generate_series(1,10000), md5(random()::text)'); +$alpha->safe_psql('postgres', 'truncate test2'); + +# Wait again for all records to be replayed. +$alpha->wait_for_catchup($bravo, 'replay', + $alpha->lsn('insert')); + +# Do the promotion, which reinitializes minRecoveryPoint in the control +# file so as WAL is replayed up to the end. +$bravo->promote; + +# Truncate the table on the promoted standby, vacuum and extend it +# again to create new page references. The first post-recovery checkpoint +# has not happened yet. +$bravo->safe_psql('postgres', 'truncate test1'); +$bravo->safe_psql('postgres', 'vacuum verbose test1'); +$bravo->safe_psql('postgres', 'insert into test1 select generate_series(1,1000)'); + +# Now crash-stop the promoted standby and restart. This makes sure that +# replay does not see invalid page references because of an invalid +# minimum consistent recovery point. +$bravo->stop('immediate'); +$bravo->start; + +# Check state of the table after full crash recovery. All its data should +# be here. +my $psql_out; +$bravo->psql( + 'postgres', + "SELECT count(*) FROM test1", + stdout => \$psql_out); +is($psql_out, '1000', "Check that table state is correct");