Follow TLI of last replayed record, not recovery target TLI, in walsenders.

Most of the time, the last replayed record comes from the recovery target timeline, but there is a corner case where it makes a difference. When the startup process scans for a new timeline, and decides to change recovery target timeline, there is a window where the recovery target TLI has already been bumped, but there are no WAL segments from the new timeline in pg_xlog yet. For example, if we have just replayed up to point 0/30002D8, on timeline 1, there is a WAL file called 000000010000000000000003 in pg_xlog that contains the WAL up to that point. When recovery switches recovery target timeline to 2, a walsender can immediately try to read WAL from 0/30002D8, from timeline 2, so it will try to open WAL file 000000020000000000000003. However, that doesn't exist yet - the startup process hasn't copied that file from the archive yet nor has the walreceiver streamed it yet, so walsender fails with error "requested WAL segment 000000020000000000000003 has already been removed". That's harmless, in that the standby will try to reconnect later and by that time the segment will already have been created, but error messages that should be ignored are not good. To fix that, have walsender track the TLI of the last replayed record, instead of the recovery target timeline. That way walsender will not try to read anything from timeline 2, until the WAL segment has been created and at least one record has been replayed from it. The recovery target timeline is now xlog.c's internal affair; it doesn't need to be exposed in shared memory anymore. This fixes the error reported by Thom Brown. depesz reported the same error message, but I'm not sure if this fixes his scenario.
2025-07-02 09:02:37 +03:00 · 2012-12-20 14:23:31 +02:00
parent 1a11d4609e
commit af275a12df
6 changed files with 92 additions and 82 deletions
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@ -370,7 +370,7 @@ WalReceiverMain(void)
 			first_stream = false;

 			/* Initialize LogstreamResult and buffers for processing messages */
-			LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr();
+			LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL);
 			initStringInfo(&reply_message);
 			initStringInfo(&incoming_message);

@ -1026,7 +1026,7 @@ XLogWalRcvSendReply(bool force, bool requestReply)
 	/* Construct a new message */
 	writePtr = LogstreamResult.Write;
 	flushPtr = LogstreamResult.Flush;
-	applyPtr = GetXLogReplayRecPtr();
+	applyPtr = GetXLogReplayRecPtr(NULL);

 	resetStringInfo(&reply_message);
 	pq_sendbyte(&reply_message, 'r');
--- a/src/backend/replication/walreceiverfuncs.c
+++ b/src/backend/replication/walreceiverfuncs.c
@ -324,7 +324,7 @@ GetReplicationApplyDelay(void)
 	receivePtr = walrcv->receivedUpto;
 	SpinLockRelease(&walrcv->mutex);

-	replayPtr = GetXLogReplayRecPtr();
+	replayPtr = GetXLogReplayRecPtr(NULL);

 	if (XLByteEQ(receivePtr, replayPtr))
 		return 0;
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@ -169,6 +169,7 @@ static void WalSndLoop(void);
 static void InitWalSenderSlot(void);
 static void WalSndKill(int code, Datum arg);
 static void XLogSend(bool *caughtup);
+static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID currentTLI);
 static void IdentifySystem(void);
 static void StartReplication(StartReplicationCmd *cmd);
 static void ProcessStandbyMessage(void);
@ -190,12 +191,6 @@ InitWalSender(void)
 	/* Set up resource owner */
 	CurrentResourceOwner = ResourceOwnerCreate(NULL, "walsender top-level resource owner");

-	/*
-	 * Use the recovery target timeline ID during recovery
-	 */
-	if (am_cascading_walsender)
-		ThisTimeLineID = GetRecoveryTargetTLI();
-
 	/*
 	 * Let postmaster know that we're a WAL sender. Once we've declared us as
 	 * a WAL sender process, postmaster will let us outlive the bgwriter and
@ -254,8 +249,8 @@ IdentifySystem(void)
 	am_cascading_walsender = RecoveryInProgress();
 	if (am_cascading_walsender)
 	{
-		logptr = GetStandbyFlushRecPtr();
-		ThisTimeLineID = GetRecoveryTargetTLI();
+		/* this also updates ThisTimeLineID */
+		logptr = GetStandbyFlushRecPtr(0);
 	}
 	else
 		logptr = GetInsertRecPtr();
@ -409,6 +404,7 @@ static void
 StartReplication(StartReplicationCmd *cmd)
 {
 	StringInfoData buf;
+	XLogRecPtr FlushPtr;

 	/*
 	 * We assume here that we're logging enough information in the WAL for
@ -421,8 +417,17 @@ StartReplication(StartReplicationCmd *cmd)

 	/*
 	 * Select the timeline. If it was given explicitly by the client, use
-	 * that. Otherwise use the current ThisTimeLineID.
+	 * that. Otherwise use the timeline of the last replayed record, which
+	 * is kept in ThisTimeLineID.
 	 */
+	if (am_cascading_walsender)
+	{
+		/* this also updates ThisTimeLineID */
+		FlushPtr = GetStandbyFlushRecPtr(0);
+	}
+	else
+		FlushPtr = GetFlushRecPtr();
+
 	if (cmd->timeline != 0)
 	{
 		XLogRecPtr	switchpoint;
@ -494,7 +499,6 @@ StartReplication(StartReplicationCmd *cmd)
 	if (!sendTimeLineIsHistoric ||
 		XLByteLT(cmd->startpoint, sendTimeLineValidUpto))
 	{
-		XLogRecPtr FlushPtr;
 		/*
 		 * When we first start replication the standby will be behind the primary.
 		 * For some applications, for example, synchronous replication, it is
@ -516,10 +520,6 @@ StartReplication(StartReplicationCmd *cmd)
 		 * Don't allow a request to stream from a future point in WAL that
 		 * hasn't been flushed to disk in this server yet.
 		 */
-		if (am_cascading_walsender)
-			FlushPtr = GetStandbyFlushRecPtr();
-		else
-			FlushPtr = GetFlushRecPtr();
 		if (XLByteLT(FlushPtr, cmd->startpoint))
 		{
 			ereport(ERROR,
@ -1330,7 +1330,7 @@ XLogSend(bool *caughtup)
 	 * that gets lost on the master.
 	 */
 	if (am_cascading_walsender)
-		FlushPtr = GetStandbyFlushRecPtr();
+		FlushPtr = GetStandbyFlushRecPtr(sendTimeLine);
 	else
 		FlushPtr = GetFlushRecPtr();

@ -1347,7 +1347,6 @@ XLogSend(bool *caughtup)
 	if (!sendTimeLineIsHistoric && am_cascading_walsender)
 	{
 		bool		becameHistoric = false;
-		TimeLineID	targetTLI;

 		if (!RecoveryInProgress())
 		{
@ -1355,7 +1354,6 @@ XLogSend(bool *caughtup)
 			 * We have been promoted. RecoveryInProgress() updated
 			 * ThisTimeLineID to the new current timeline.
 			 */
-			targetTLI = ThisTimeLineID;
 			am_cascading_walsender = false;
 			becameHistoric = true;
 		}
@ -1363,11 +1361,9 @@ XLogSend(bool *caughtup)
 		{
 			/*
 			 * Still a cascading standby. But is the timeline we're sending
-			 * still the recovery target timeline?
+			 * still the one recovery is recovering from?
 			 */
-			targetTLI = GetRecoveryTargetTLI();
-
-			if (targetTLI != sendTimeLine)
+			if (sendTimeLine != ThisTimeLineID)
 				becameHistoric = true;
 		}

@ -1380,7 +1376,7 @@ XLogSend(bool *caughtup)
 			 */
 			List	   *history;

-			history = readTimeLineHistory(targetTLI);
+			history = readTimeLineHistory(ThisTimeLineID);
 			sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history);
 			Assert(XLByteLE(sentPtr, sendTimeLineValidUpto));
 			list_free_deep(history);
@ -1521,6 +1517,48 @@ XLogSend(bool *caughtup)
 	return;
 }

+/*
+ * Returns the latest point in WAL that has been safely flushed to disk, and
+ * can be sent to the standby. This should only be called when in recovery,
+ * ie. we're streaming to a cascaded standby.
+ *
+ * If currentTLI is non-zero, the function returns the point that the WAL on
+ * the given timeline has been flushed up to. If recovery has already
+ * switched to a different timeline, InvalidXLogRecPtr is returned.
+ *
+ * If currentTLI is zero, no timeline check is made, and the result is the
+ * flush point on the timeline of the last replayed record.
+ *
+ * As a side-effect, ThisTimeLineID is updated to the TLI of the last
+ * replayed WAL record.
+ */
+static XLogRecPtr
+GetStandbyFlushRecPtr(TimeLineID currentTLI)
+{
+	XLogRecPtr replayPtr;
+	TimeLineID replayTLI;
+	XLogRecPtr receivePtr;
+	TimeLineID receiveTLI;
+	XLogRecPtr	result;
+
+	/*
+	 * We can safely send what's already been replayed. Also, if walreceiver
+	 * is streaming WAL from the same timeline, we can send anything that
+	 * it has streamed, but hasn't been replayed yet.
+	 */
+
+	receivePtr = GetWalRcvWriteRecPtr(NULL, &receiveTLI);
+	replayPtr = GetXLogReplayRecPtr(&replayTLI);
+
+	ThisTimeLineID = replayTLI;
+
+	/* The caller named a timeline, but replay has moved off it. */
+	if (currentTLI != replayTLI && currentTLI != 0)
+		return InvalidXLogRecPtr;
+
+	/*
+	 * Compare the walreceiver's timeline against the replay timeline rather
+	 * than currentTLI: callers that pass currentTLI == 0 must still see WAL
+	 * that has been received but not replayed yet. When currentTLI is
+	 * non-zero it equals ThisTimeLineID at this point, so the test is
+	 * unchanged for those callers.
+	 */
+	result = replayPtr;
+	if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
+		result = receivePtr;
+
+	return result;
+}
+
 /*
 * Request walsenders to reload the currently-open WAL file
 */