Wait for WAL summarization to catch up before creating .partial file.

When a standby is promoted, CleanupAfterArchiveRecovery() may decide to rename the final WAL file from the old timeline by adding ".partial" to the name. If WAL summarization is enabled and this file is renamed before its partial contents are summarized, WAL summarization breaks: the summarizer gets stuck at that point in the WAL stream and just errors out. To fix that, first make the startup process wait for WAL summarization to catch up before renaming the file. Generally, this should be quick, and if it's not, the user can shut off summarize_wal and try again. To make this fix work, also teach the WAL summarizer that after a promotion has occurred, no more WAL can appear on the previous timeline: previously, the WAL summarizer wouldn't switch to the new timeline until we actually started writing WAL there, but that meant that when the startup process was waiting for the WAL summarizer, it was waiting for an action that the summarizer wasn't yet prepared to take. In the process of fixing these bugs, I realized that the logic to wait for WAL summarization to catch up was spread out in a way that made it difficult to reuse properly, so this code refactors things to make it easier. Finally, add a test case that would have caught this bug and the previously-fixed bug that WAL summarization sometimes needs to back up when the timeline changes. Discussion: https://postgr.es/m/CA+TgmoZGEsZodXC4f=XZNkAeyuDmWTSkpkjCEOcF19Am0mt_OA@mail.gmail.com
2025-06-30 21:42:05 +03:00 · 2024-07-26 14:50:21 -04:00
parent f2af1f4559
commit 53b327f83e
7 changed files with 241 additions and 110 deletions
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@ -500,6 +500,11 @@ typedef struct XLogCtlData
 	 * If we create a new timeline when the system was started up,
 	 * PrevTimeLineID is the old timeline's ID that we forked off from.
 	 * Otherwise it's equal to InsertTimeLineID.
 	 *
 	 * We set these fields while holding info_lck. Most that reads these
 	 * values knows that recovery is no longer in progress and so can safely
 	 * read the value without a lock, but code that could be run either during
 	 * or after recovery can take info_lck while reading these values.
 	 */
 	TimeLineID	InsertTimeLineID;
 	TimeLineID	PrevTimeLineID;
@ -5317,6 +5322,13 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
 			char		partialfname[MAXFNAMELEN];
 			char		partialpath[MAXPGPATH];
 			/*
 			 * If we're summarizing WAL, we can't rename the partial file
 			 * until the summarizer finishes with it, else it will fail.
 			 */
 			if (summarize_wal)
 				WaitForWalSummarization(EndOfLog);
 			XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
 			snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
 			snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
@ -5947,8 +5959,10 @@ StartupXLOG(void)
 	}
 	/* Save the selected TimeLineID in shared memory, too */
 	SpinLockAcquire(&XLogCtl->info_lck);
 	XLogCtl->InsertTimeLineID = newTLI;
 	XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
 	SpinLockRelease(&XLogCtl->info_lck);
 	/*
 	 * Actually, if WAL ended in an incomplete record, skip the parts that
@ -6483,6 +6497,25 @@ GetWALInsertionTimeLine(void)
 	return XLogCtl->InsertTimeLineID;
 }
 /*
 * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
 * the WAL insertion timeline; else, returns 0. Wherever possible, use
 * GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
 * function decides recovery has ended as soon as the insert TLI is set, which
 * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
 */
 TimeLineID
 GetWALInsertionTimeLineIfSet(void)
 {
 	TimeLineID	insertTLI;
 	SpinLockAcquire(&XLogCtl->info_lck);
 	insertTLI = XLogCtl->InsertTimeLineID;
 	SpinLockRelease(&XLogCtl->info_lck);
 	return insertTLI;
 }
 /*
 * GetLastImportantRecPtr -- Returns the LSN of the last important record
 * inserted. All records not explicitly marked as unimportant are considered
--- a/src/backend/backup/basebackup_incremental.c
+++ b/src/backend/backup/basebackup_incremental.c
@ -277,12 +277,6 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
 	TimeLineID	earliest_wal_range_tli = 0;
 	XLogRecPtr	earliest_wal_range_start_lsn = InvalidXLogRecPtr;
 	TimeLineID	latest_wal_range_tli = 0;
 	XLogRecPtr	summarized_lsn;
 	XLogRecPtr	pending_lsn;
 	XLogRecPtr	prior_pending_lsn = InvalidXLogRecPtr;
 	int			deadcycles = 0;
 	TimestampTz initial_time,
 				current_time;
 	Assert(ib->buf.data == NULL);
@ -458,85 +452,13 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
 	}
 	/*
-	 * Wait for WAL summarization to catch up to the backup start LSN (but
+	 * Wait for WAL summarization to catch up to the backup start LSN. This
-	 * time out if it doesn't do so quickly enough).
+	 * will throw an error if the WAL summarizer appears to be stuck. If WAL
 	 * summarization gets disabled while we're waiting, this will return
 	 * immediately, and we'll error out further down if the WAL summaries are
 	 * incomplete.
 	 */
-	initial_time = current_time = GetCurrentTimestamp();
+	WaitForWalSummarization(backup_state->startpoint);
 	while (1)
 	{
 		long		timeout_in_ms = 10000;
 		long		elapsed_seconds;
 		/*
 		 * Align the wait time to prevent drift. This doesn't really matter,
 		 * but we'd like the warnings about how long we've been waiting to say
 		 * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
 		 * drifting to something that is not a multiple of ten.
 		 */
 		timeout_in_ms -=
 			TimestampDifferenceMilliseconds(initial_time, current_time) %
 			timeout_in_ms;
 		/* Wait for up to 10 seconds. */
 		summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
 												 timeout_in_ms, &pending_lsn);
 		/* If WAL summarization has progressed sufficiently, stop waiting. */
 		if (summarized_lsn >= backup_state->startpoint)
 			break;
 		/*
 		 * Keep track of the number of cycles during which there has been no
 		 * progression of pending_lsn. If pending_lsn is not advancing, that
 		 * means that not only are no new files appearing on disk, but we're
 		 * not even incorporating new records into the in-memory state.
 		 */
 		if (pending_lsn > prior_pending_lsn)
 		{
 			prior_pending_lsn = pending_lsn;
 			deadcycles = 0;
 		}
 		else
 			++deadcycles;
 		/*
 		 * If we've managed to wait for an entire minute without the WAL
 		 * summarizer absorbing a single WAL record, error out; probably
 		 * something is wrong.
 		 *
 		 * We could consider also erroring out if the summarizer is taking too
 		 * long to catch up, but it's not clear what rate of progress would be
 		 * acceptable and what would be too slow. So instead, we just try to
 		 * error out in the case where there's no progress at all. That seems
 		 * likely to catch a reasonable number of the things that can go wrong
 		 * in practice (e.g. the summarizer process is completely hung, say
 		 * because somebody hooked up a debugger to it or something) without
 		 * giving up too quickly when the system is just slow.
 		 */
 		if (deadcycles >= 6)
 			ereport(ERROR,
 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 					 errmsg("WAL summarization is not progressing"),
 					 errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
 							   LSN_FORMAT_ARGS(backup_state->startpoint),
 							   LSN_FORMAT_ARGS(summarized_lsn),
 							   LSN_FORMAT_ARGS(pending_lsn))));
 		/*
 		 * Otherwise, just let the user know what's happening.
 		 */
 		current_time = GetCurrentTimestamp();
 		elapsed_seconds =
 			TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
 		ereport(WARNING,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
 						LSN_FORMAT_ARGS(backup_state->startpoint),
 						elapsed_seconds),
 				 errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
 						   LSN_FORMAT_ARGS(summarized_lsn),
 						   LSN_FORMAT_ARGS(pending_lsn))));
 	}
 	/*
 	 * Retrieve a list of all WAL summaries on any timeline that overlap with
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@ -650,54 +650,132 @@ SetWalSummarizerLatch(void)
 }
 /*
- * Wait until WAL summarization reaches the given LSN, but not longer than
+ * Wait until WAL summarization reaches the given LSN, but time out with an
- * the given timeout.
+ * error if the summarizer seems to be stick.
 *
- * The return value is the first still-unsummarized LSN. If it's greater than
+ * Returns immediately if summarize_wal is turned off while we wait. Caller
- * or equal to the passed LSN, then that LSN was reached. If not, we timed out.
+ * is expected to handle this case, if necessary.
 *
 * Either way, *pending_lsn is set to the value taken from WalSummarizerCtl.
 */
-XLogRecPtr
+void
-WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn)
+WaitForWalSummarization(XLogRecPtr lsn)
 {
-	TimestampTz start_time = GetCurrentTimestamp();
+	TimestampTz initial_time,
-	TimestampTz deadline = TimestampTzPlusMilliseconds(start_time, timeout);
+				cycle_time,
-	XLogRecPtr	summarized_lsn;
+				current_time;
 	XLogRecPtr	prior_pending_lsn = InvalidXLogRecPtr;
 	int			deadcycles = 0;
-	Assert(!XLogRecPtrIsInvalid(lsn));
+	initial_time = cycle_time = GetCurrentTimestamp();
 	Assert(timeout > 0);
 	while (1)
 	{
-		TimestampTz now;
+		long		timeout_in_ms = 10000;
-		long		remaining_timeout;
+		XLogRecPtr	summarized_lsn;
 		XLogRecPtr	pending_lsn;
 		CHECK_FOR_INTERRUPTS();
 		/* If WAL summarization is disabled while we're waiting, give up. */
 		if (!summarize_wal)
 			return;
 		/*
 		 * If the LSN summarized on disk has reached the target value, stop.
 		 */
 		LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
 		summarized_lsn = WalSummarizerCtl->summarized_lsn;
-		*pending_lsn = WalSummarizerCtl->pending_lsn;
+		pending_lsn = WalSummarizerCtl->pending_lsn;
 		LWLockRelease(WALSummarizerLock);
 		/* If WAL summarization has progressed sufficiently, stop waiting. */
 		if (summarized_lsn >= lsn)
 			break;
-		/* Timeout reached? If yes, stop. */
+		/* Recheck current time. */
-		now = GetCurrentTimestamp();
+		current_time = GetCurrentTimestamp();
-		remaining_timeout = TimestampDifferenceMilliseconds(now, deadline);
+
-		if (remaining_timeout <= 0)
+		/* Have we finished the current cycle of waiting? */
-			break;
+		if (TimestampDifferenceMilliseconds(cycle_time,
 											current_time) >= timeout_in_ms)
 		{
 			long		elapsed_seconds;
 			/* Begin new wait cycle. */
 			cycle_time = TimestampTzPlusMilliseconds(cycle_time,
 													 timeout_in_ms);
 			/*
 			 * Keep track of the number of cycles during which there has been
 			 * no progression of pending_lsn. If pending_lsn is not advancing,
 			 * that means that not only are no new files appearing on disk,
 			 * but we're not even incorporating new records into the in-memory
 			 * state.
 			 */
 			if (pending_lsn > prior_pending_lsn)
 			{
 				prior_pending_lsn = pending_lsn;
 				deadcycles = 0;
 			}
 			else
 				++deadcycles;
 			/*
 			 * If we've managed to wait for an entire minute without the WAL
 			 * summarizer absorbing a single WAL record, error out; probably
 			 * something is wrong.
 			 *
 			 * We could consider also erroring out if the summarizer is taking
 			 * too long to catch up, but it's not clear what rate of progress
 			 * would be acceptable and what would be too slow. So instead, we
 			 * just try to error out in the case where there's no progress at
 			 * all. That seems likely to catch a reasonable number of the
 			 * things that can go wrong in practice (e.g. the summarizer
 			 * process is completely hung, say because somebody hooked up a
 			 * debugger to it or something) without giving up too quickly when
 			 * the system is just slow.
 			 */
 			if (deadcycles >= 6)
 				ereport(ERROR,
 						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 						 errmsg("WAL summarization is not progressing"),
 						 errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
 								   LSN_FORMAT_ARGS(lsn),
 								   LSN_FORMAT_ARGS(summarized_lsn),
 								   LSN_FORMAT_ARGS(pending_lsn))));
 			/*
 			 * Otherwise, just let the user know what's happening.
 			 */
 			elapsed_seconds =
 				TimestampDifferenceMilliseconds(initial_time,
 												current_time) / 1000;
 			ereport(WARNING,
 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 					 errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
 							LSN_FORMAT_ARGS(lsn),
 							elapsed_seconds),
 					 errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
 							   LSN_FORMAT_ARGS(summarized_lsn),
 							   LSN_FORMAT_ARGS(pending_lsn))));
 		}
 		/*
 		 * Align the wait time to prevent drift. This doesn't really matter,
 		 * but we'd like the warnings about how long we've been waiting to say
 		 * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
 		 * drifting to something that is not a multiple of ten.
 		 */
 		timeout_in_ms -=
 			TimestampDifferenceMilliseconds(cycle_time, current_time);
 		/* Wait and see. */
 		ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv,
-									remaining_timeout,
+									timeout_in_ms,
 									WAIT_EVENT_WAL_SUMMARY_READY);
 	}
 	ConditionVariableCancelSleep();
 	return summarized_lsn;
 }
 /*
@ -730,6 +808,22 @@ GetLatestLSN(TimeLineID *tli)
 		TimeLineID	flush_tli;
 		XLogRecPtr	replay_lsn;
 		TimeLineID	replay_tli;
 		TimeLineID	insert_tli;
 		/*
 		 * After the insert TLI has been set and before the control file has
 		 * been updated to show the DB in production, RecoveryInProgress()
 		 * will return true, because it's not yet safe for all backends to
 		 * begin writing WAL. However, replay has already ceased, so from our
 		 * point of view, recovery is already over. We should summarize up to
 		 * where replay stopped and then prepare to resume at the start of the
 		 * insert timeline.
 		 */
 		if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
 		{
 			*tli = insert_tli;
 			return GetXLogReplayRecPtr(NULL);
 		}
 		/*
 		 * What we really want to know is how much WAL has been flushed to
--- a/src/bin/pg_combinebackup/meson.build
+++ b/src/bin/pg_combinebackup/meson.build
@ -35,6 +35,7 @@ tests += {
      't/005_integrity.pl',
      't/006_db_file_copy.pl',
      't/007_wal_level_minimal.pl',
      't/008_promote.pl',
    ],
  }
 }
--- a/src/bin/pg_combinebackup/t/008_promote.pl
+++ b/src/bin/pg_combinebackup/t/008_promote.pl
@ -0,0 +1,81 @@
 # Copyright (c) 2021-2024, PostgreSQL Global Development Group
 #
 # Test whether WAL summaries are complete such that incremental backup
 # can be performed after promoting a standby at an arbitrary LSN.
 use strict;
 use warnings FATAL => 'all';
 use File::Compare;
 use PostgreSQL::Test::Cluster;
 use PostgreSQL::Test::Utils;
 use Test::More;
 # Can be changed to test the other modes.
 my $mode = $ENV{PG_TEST_PG_COMBINEBACKUP_MODE} || '--copy';
 note "testing using mode $mode";
 # Set up a new database instance.
 my $node1 = PostgreSQL::Test::Cluster->new('node1');
 $node1->init(has_archiving => 1, allows_streaming => 1);
 $node1->append_conf('postgresql.conf', 'summarize_wal = on');
 $node1->append_conf('postgresql.conf', 'log_min_messages = debug1');
 $node1->start;
 # Create a table and insert a test row into it.
 $node1->safe_psql('postgres', <<EOM);
 CREATE TABLE mytable (a int, b text);
 INSERT INTO mytable VALUES (1, 'avocado');
 EOM
 # Take a full backup.
 my $backup1path = $node1->backup_dir . '/backup1';
 $node1->command_ok(
 	[ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ],
 	"full backup from node1");
 # Checkpoint and record LSN after.
 $node1->safe_psql('postgres', 'CHECKPOINT');
 my $lsn = $node1->safe_psql('postgres', 'SELECT pg_current_wal_insert_lsn()');
 # Insert a second row on the original node.
 $node1->safe_psql('postgres', <<EOM);
 INSERT INTO mytable VALUES (2, 'beetle');
 EOM
 # Now create a second node. We want this to stream from the first node and
 # then stop recovery at some arbitrary LSN, not just when it hits the end of
 # WAL, so use a recovery target.
 my $node2 = PostgreSQL::Test::Cluster->new('node2');
 $node2->init_from_backup($node1, 'backup1', 'has_streaming' => 1);
 $node2->append_conf('postgresql.conf', <<EOM);
 recovery_target_lsn = '$lsn'
 recovery_target_action = 'pause'
 EOM
 $node2->start();
 # Wait until recoveery pauses, then promote.
 $node2->poll_query_until('postgres', "SELECT pg_get_wal_replay_pause_state() = 'paused';");
 $node2->safe_psql('postgres', "SELECT pg_promote()");
 # Once promotion occurs, insert a second row on the new node.
 $node2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';");
 $node2->safe_psql('postgres', <<EOM);
 INSERT INTO mytable VALUES (2, 'blackberry');
 EOM
 # Now take an incremental backup. If WAL summarization didn't follow the
 # timeline cange correctly, something should break at this point.
 my $backup2path = $node1->backup_dir . '/backup2';
 $node2->command_ok(
 	[ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast',
 	  '--incremental', $backup1path . '/backup_manifest' ],
 	"incremental backup from node2");
 # Restore the incremental backup and use it to create a new node.
 my $node3 = PostgreSQL::Test::Cluster->new('node3');
 $node3->init_from_backup($node1, 'backup2',
 						 combine_with_prior => [ 'backup1' ]);
 $node3->start();
 done_testing();
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@ -248,6 +248,7 @@ extern XLogRecPtr GetRedoRecPtr(void);
 extern XLogRecPtr GetInsertRecPtr(void);
 extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI);
 extern TimeLineID GetWALInsertionTimeLine(void);
 extern TimeLineID GetWALInsertionTimeLineIfSet(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
 extern void SetWalWriterSleeping(bool sleeping);
--- a/src/include/postmaster/walsummarizer.h
+++ b/src/include/postmaster/walsummarizer.h
@ -30,7 +30,6 @@ extern void GetWalSummarizerState(TimeLineID *summarized_tli,
 extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli,
 										   bool *lsn_is_exact);
 extern void SetWalSummarizerLatch(void);
-extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout,
+extern void WaitForWalSummarization(XLogRecPtr lsn);
 										  XLogRecPtr *pending_lsn);
 #endif