1
0
mirror of https://github.com/postgres/postgres.git synced 2025-05-06 19:59:18 +03:00

Wait for WAL summarization to catch up before creating .partial file.

When a standby is promoted, CleanupAfterArchiveRecovery() may decide
to rename the final WAL file from the old timeline by adding ".partial"
to the name. If WAL summarization is enabled and this file is renamed
before its partial contents are summarized, WAL summarization breaks:
the summarizer gets stuck at that point in the WAL stream and just
errors out.

To fix that, first make the startup process wait for WAL summarization
to catch up before renaming the file. Generally, this should be quick,
and if it's not, the user can shut off summarize_wal and try again.
To make this fix work, also teach the WAL summarizer that after a
promotion has occurred, no more WAL can appear on the previous
timeline: previously, the WAL summarizer wouldn't switch to the new
timeline until we actually started writing WAL there, but that meant
that when the startup process was waiting for the WAL summarizer, it
was waiting for an action that the summarizer wasn't yet prepared to
take.

In the process of fixing these bugs, I realized that the logic to wait
for WAL summarization to catch up was spread out in a way that made
it difficult to reuse properly, so this code refactors things to make
it easier.

Finally, add a test case that would have caught this bug and the
previously-fixed bug that WAL summarization sometimes needs to back up
when the timeline changes.

Discussion: https://postgr.es/m/CA+TgmoZGEsZodXC4f=XZNkAeyuDmWTSkpkjCEOcF19Am0mt_OA@mail.gmail.com
This commit is contained in:
Robert Haas 2024-07-26 14:50:21 -04:00
parent f2af1f4559
commit 53b327f83e
7 changed files with 241 additions and 110 deletions

View File

@ -500,6 +500,11 @@ typedef struct XLogCtlData
* If we create a new timeline when the system was started up, * If we create a new timeline when the system was started up,
* PrevTimeLineID is the old timeline's ID that we forked off from. * PrevTimeLineID is the old timeline's ID that we forked off from.
* Otherwise it's equal to InsertTimeLineID. * Otherwise it's equal to InsertTimeLineID.
*
* We set these fields while holding info_lck. Most that reads these
* values knows that recovery is no longer in progress and so can safely
* read the value without a lock, but code that could be run either during
* or after recovery can take info_lck while reading these values.
*/ */
TimeLineID InsertTimeLineID; TimeLineID InsertTimeLineID;
TimeLineID PrevTimeLineID; TimeLineID PrevTimeLineID;
@ -5317,6 +5322,13 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
char partialfname[MAXFNAMELEN]; char partialfname[MAXFNAMELEN];
char partialpath[MAXPGPATH]; char partialpath[MAXPGPATH];
/*
* If we're summarizing WAL, we can't rename the partial file
* until the summarizer finishes with it, else it will fail.
*/
if (summarize_wal)
WaitForWalSummarization(EndOfLog);
XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size); XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname); snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
snprintf(partialpath, MAXPGPATH, "%s.partial", origpath); snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
@ -5947,8 +5959,10 @@ StartupXLOG(void)
} }
/* Save the selected TimeLineID in shared memory, too */ /* Save the selected TimeLineID in shared memory, too */
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->InsertTimeLineID = newTLI; XLogCtl->InsertTimeLineID = newTLI;
XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI; XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
SpinLockRelease(&XLogCtl->info_lck);
/* /*
* Actually, if WAL ended in an incomplete record, skip the parts that * Actually, if WAL ended in an incomplete record, skip the parts that
@ -6483,6 +6497,25 @@ GetWALInsertionTimeLine(void)
return XLogCtl->InsertTimeLineID; return XLogCtl->InsertTimeLineID;
} }
/*
* GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
* the WAL insertion timeline; else, returns 0. Wherever possible, use
* GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
* function decides recovery has ended as soon as the insert TLI is set, which
* happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
*/
TimeLineID
GetWALInsertionTimeLineIfSet(void)
{
TimeLineID insertTLI;
SpinLockAcquire(&XLogCtl->info_lck);
insertTLI = XLogCtl->InsertTimeLineID;
SpinLockRelease(&XLogCtl->info_lck);
return insertTLI;
}
/* /*
* GetLastImportantRecPtr -- Returns the LSN of the last important record * GetLastImportantRecPtr -- Returns the LSN of the last important record
* inserted. All records not explicitly marked as unimportant are considered * inserted. All records not explicitly marked as unimportant are considered

View File

@ -277,12 +277,6 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
TimeLineID earliest_wal_range_tli = 0; TimeLineID earliest_wal_range_tli = 0;
XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr; XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr;
TimeLineID latest_wal_range_tli = 0; TimeLineID latest_wal_range_tli = 0;
XLogRecPtr summarized_lsn;
XLogRecPtr pending_lsn;
XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
int deadcycles = 0;
TimestampTz initial_time,
current_time;
Assert(ib->buf.data == NULL); Assert(ib->buf.data == NULL);
@ -458,85 +452,13 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
} }
/* /*
* Wait for WAL summarization to catch up to the backup start LSN (but * Wait for WAL summarization to catch up to the backup start LSN. This
* time out if it doesn't do so quickly enough). * will throw an error if the WAL summarizer appears to be stuck. If WAL
* summarization gets disabled while we're waiting, this will return
* immediately, and we'll error out further down if the WAL summaries are
* incomplete.
*/ */
initial_time = current_time = GetCurrentTimestamp(); WaitForWalSummarization(backup_state->startpoint);
while (1)
{
long timeout_in_ms = 10000;
long elapsed_seconds;
/*
* Align the wait time to prevent drift. This doesn't really matter,
* but we'd like the warnings about how long we've been waiting to say
* 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
* drifting to something that is not a multiple of ten.
*/
timeout_in_ms -=
TimestampDifferenceMilliseconds(initial_time, current_time) %
timeout_in_ms;
/* Wait for up to 10 seconds. */
summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
timeout_in_ms, &pending_lsn);
/* If WAL summarization has progressed sufficiently, stop waiting. */
if (summarized_lsn >= backup_state->startpoint)
break;
/*
* Keep track of the number of cycles during which there has been no
* progression of pending_lsn. If pending_lsn is not advancing, that
* means that not only are no new files appearing on disk, but we're
* not even incorporating new records into the in-memory state.
*/
if (pending_lsn > prior_pending_lsn)
{
prior_pending_lsn = pending_lsn;
deadcycles = 0;
}
else
++deadcycles;
/*
* If we've managed to wait for an entire minute without the WAL
* summarizer absorbing a single WAL record, error out; probably
* something is wrong.
*
* We could consider also erroring out if the summarizer is taking too
* long to catch up, but it's not clear what rate of progress would be
* acceptable and what would be too slow. So instead, we just try to
* error out in the case where there's no progress at all. That seems
* likely to catch a reasonable number of the things that can go wrong
* in practice (e.g. the summarizer process is completely hung, say
* because somebody hooked up a debugger to it or something) without
* giving up too quickly when the system is just slow.
*/
if (deadcycles >= 6)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL summarization is not progressing"),
errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
LSN_FORMAT_ARGS(backup_state->startpoint),
LSN_FORMAT_ARGS(summarized_lsn),
LSN_FORMAT_ARGS(pending_lsn))));
/*
* Otherwise, just let the user know what's happening.
*/
current_time = GetCurrentTimestamp();
elapsed_seconds =
TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
ereport(WARNING,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
LSN_FORMAT_ARGS(backup_state->startpoint),
elapsed_seconds),
errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
LSN_FORMAT_ARGS(summarized_lsn),
LSN_FORMAT_ARGS(pending_lsn))));
}
/* /*
* Retrieve a list of all WAL summaries on any timeline that overlap with * Retrieve a list of all WAL summaries on any timeline that overlap with

View File

@ -650,54 +650,132 @@ SetWalSummarizerLatch(void)
} }
/* /*
* Wait until WAL summarization reaches the given LSN, but not longer than * Wait until WAL summarization reaches the given LSN, but time out with an
* the given timeout. * error if the summarizer seems to be stick.
* *
* The return value is the first still-unsummarized LSN. If it's greater than * Returns immediately if summarize_wal is turned off while we wait. Caller
* or equal to the passed LSN, then that LSN was reached. If not, we timed out. * is expected to handle this case, if necessary.
*
* Either way, *pending_lsn is set to the value taken from WalSummarizerCtl.
*/ */
XLogRecPtr void
WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn) WaitForWalSummarization(XLogRecPtr lsn)
{ {
TimestampTz start_time = GetCurrentTimestamp(); TimestampTz initial_time,
TimestampTz deadline = TimestampTzPlusMilliseconds(start_time, timeout); cycle_time,
XLogRecPtr summarized_lsn; current_time;
XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
int deadcycles = 0;
Assert(!XLogRecPtrIsInvalid(lsn)); initial_time = cycle_time = GetCurrentTimestamp();
Assert(timeout > 0);
while (1) while (1)
{ {
TimestampTz now; long timeout_in_ms = 10000;
long remaining_timeout; XLogRecPtr summarized_lsn;
XLogRecPtr pending_lsn;
CHECK_FOR_INTERRUPTS();
/* If WAL summarization is disabled while we're waiting, give up. */
if (!summarize_wal)
return;
/* /*
* If the LSN summarized on disk has reached the target value, stop. * If the LSN summarized on disk has reached the target value, stop.
*/ */
LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
summarized_lsn = WalSummarizerCtl->summarized_lsn; summarized_lsn = WalSummarizerCtl->summarized_lsn;
*pending_lsn = WalSummarizerCtl->pending_lsn; pending_lsn = WalSummarizerCtl->pending_lsn;
LWLockRelease(WALSummarizerLock); LWLockRelease(WALSummarizerLock);
/* If WAL summarization has progressed sufficiently, stop waiting. */
if (summarized_lsn >= lsn) if (summarized_lsn >= lsn)
break; break;
/* Timeout reached? If yes, stop. */ /* Recheck current time. */
now = GetCurrentTimestamp(); current_time = GetCurrentTimestamp();
remaining_timeout = TimestampDifferenceMilliseconds(now, deadline);
if (remaining_timeout <= 0) /* Have we finished the current cycle of waiting? */
break; if (TimestampDifferenceMilliseconds(cycle_time,
current_time) >= timeout_in_ms)
{
long elapsed_seconds;
/* Begin new wait cycle. */
cycle_time = TimestampTzPlusMilliseconds(cycle_time,
timeout_in_ms);
/*
* Keep track of the number of cycles during which there has been
* no progression of pending_lsn. If pending_lsn is not advancing,
* that means that not only are no new files appearing on disk,
* but we're not even incorporating new records into the in-memory
* state.
*/
if (pending_lsn > prior_pending_lsn)
{
prior_pending_lsn = pending_lsn;
deadcycles = 0;
}
else
++deadcycles;
/*
* If we've managed to wait for an entire minute without the WAL
* summarizer absorbing a single WAL record, error out; probably
* something is wrong.
*
* We could consider also erroring out if the summarizer is taking
* too long to catch up, but it's not clear what rate of progress
* would be acceptable and what would be too slow. So instead, we
* just try to error out in the case where there's no progress at
* all. That seems likely to catch a reasonable number of the
* things that can go wrong in practice (e.g. the summarizer
* process is completely hung, say because somebody hooked up a
* debugger to it or something) without giving up too quickly when
* the system is just slow.
*/
if (deadcycles >= 6)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL summarization is not progressing"),
errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
LSN_FORMAT_ARGS(lsn),
LSN_FORMAT_ARGS(summarized_lsn),
LSN_FORMAT_ARGS(pending_lsn))));
/*
* Otherwise, just let the user know what's happening.
*/
elapsed_seconds =
TimestampDifferenceMilliseconds(initial_time,
current_time) / 1000;
ereport(WARNING,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
LSN_FORMAT_ARGS(lsn),
elapsed_seconds),
errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
LSN_FORMAT_ARGS(summarized_lsn),
LSN_FORMAT_ARGS(pending_lsn))));
}
/*
* Align the wait time to prevent drift. This doesn't really matter,
* but we'd like the warnings about how long we've been waiting to say
* 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
* drifting to something that is not a multiple of ten.
*/
timeout_in_ms -=
TimestampDifferenceMilliseconds(cycle_time, current_time);
/* Wait and see. */ /* Wait and see. */
ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv, ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv,
remaining_timeout, timeout_in_ms,
WAIT_EVENT_WAL_SUMMARY_READY); WAIT_EVENT_WAL_SUMMARY_READY);
} }
ConditionVariableCancelSleep(); ConditionVariableCancelSleep();
return summarized_lsn;
} }
/* /*
@ -730,6 +808,22 @@ GetLatestLSN(TimeLineID *tli)
TimeLineID flush_tli; TimeLineID flush_tli;
XLogRecPtr replay_lsn; XLogRecPtr replay_lsn;
TimeLineID replay_tli; TimeLineID replay_tli;
TimeLineID insert_tli;
/*
* After the insert TLI has been set and before the control file has
* been updated to show the DB in production, RecoveryInProgress()
* will return true, because it's not yet safe for all backends to
* begin writing WAL. However, replay has already ceased, so from our
* point of view, recovery is already over. We should summarize up to
* where replay stopped and then prepare to resume at the start of the
* insert timeline.
*/
if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
{
*tli = insert_tli;
return GetXLogReplayRecPtr(NULL);
}
/* /*
* What we really want to know is how much WAL has been flushed to * What we really want to know is how much WAL has been flushed to

View File

@ -35,6 +35,7 @@ tests += {
't/005_integrity.pl', 't/005_integrity.pl',
't/006_db_file_copy.pl', 't/006_db_file_copy.pl',
't/007_wal_level_minimal.pl', 't/007_wal_level_minimal.pl',
't/008_promote.pl',
], ],
} }
} }

View File

@ -0,0 +1,81 @@
# Copyright (c) 2021-2024, PostgreSQL Global Development Group
#
# Test whether WAL summaries are complete such that incremental backup
# can be performed after promoting a standby at an arbitrary LSN.
use strict;
use warnings FATAL => 'all';
use File::Compare;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
# Can be changed to test the other modes.
my $mode = $ENV{PG_TEST_PG_COMBINEBACKUP_MODE} || '--copy';
note "testing using mode $mode";
# Set up a new database instance.
my $node1 = PostgreSQL::Test::Cluster->new('node1');
$node1->init(has_archiving => 1, allows_streaming => 1);
$node1->append_conf('postgresql.conf', 'summarize_wal = on');
$node1->append_conf('postgresql.conf', 'log_min_messages = debug1');
$node1->start;
# Create a table and insert a test row into it.
$node1->safe_psql('postgres', <<EOM);
CREATE TABLE mytable (a int, b text);
INSERT INTO mytable VALUES (1, 'avocado');
EOM
# Take a full backup.
my $backup1path = $node1->backup_dir . '/backup1';
$node1->command_ok(
[ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ],
"full backup from node1");
# Checkpoint and record LSN after.
$node1->safe_psql('postgres', 'CHECKPOINT');
my $lsn = $node1->safe_psql('postgres', 'SELECT pg_current_wal_insert_lsn()');
# Insert a second row on the original node.
$node1->safe_psql('postgres', <<EOM);
INSERT INTO mytable VALUES (2, 'beetle');
EOM
# Now create a second node. We want this to stream from the first node and
# then stop recovery at some arbitrary LSN, not just when it hits the end of
# WAL, so use a recovery target.
my $node2 = PostgreSQL::Test::Cluster->new('node2');
$node2->init_from_backup($node1, 'backup1', 'has_streaming' => 1);
$node2->append_conf('postgresql.conf', <<EOM);
recovery_target_lsn = '$lsn'
recovery_target_action = 'pause'
EOM
$node2->start();
# Wait until recoveery pauses, then promote.
$node2->poll_query_until('postgres', "SELECT pg_get_wal_replay_pause_state() = 'paused';");
$node2->safe_psql('postgres', "SELECT pg_promote()");
# Once promotion occurs, insert a second row on the new node.
$node2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';");
$node2->safe_psql('postgres', <<EOM);
INSERT INTO mytable VALUES (2, 'blackberry');
EOM
# Now take an incremental backup. If WAL summarization didn't follow the
# timeline cange correctly, something should break at this point.
my $backup2path = $node1->backup_dir . '/backup2';
$node2->command_ok(
[ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast',
'--incremental', $backup1path . '/backup_manifest' ],
"incremental backup from node2");
# Restore the incremental backup and use it to create a new node.
my $node3 = PostgreSQL::Test::Cluster->new('node3');
$node3->init_from_backup($node1, 'backup2',
combine_with_prior => [ 'backup1' ]);
$node3->start();
done_testing();

View File

@ -248,6 +248,7 @@ extern XLogRecPtr GetRedoRecPtr(void);
extern XLogRecPtr GetInsertRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void);
extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI); extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI);
extern TimeLineID GetWALInsertionTimeLine(void); extern TimeLineID GetWALInsertionTimeLine(void);
extern TimeLineID GetWALInsertionTimeLineIfSet(void);
extern XLogRecPtr GetLastImportantRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void);
extern void SetWalWriterSleeping(bool sleeping); extern void SetWalWriterSleeping(bool sleeping);

View File

@ -30,7 +30,6 @@ extern void GetWalSummarizerState(TimeLineID *summarized_tli,
extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli, extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli,
bool *lsn_is_exact); bool *lsn_is_exact);
extern void SetWalSummarizerLatch(void); extern void SetWalSummarizerLatch(void);
extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout, extern void WaitForWalSummarization(XLogRecPtr lsn);
XLogRecPtr *pending_lsn);
#endif #endif