1
0
mirror of https://github.com/postgres/postgres.git synced 2025-05-20 05:13:53 +03:00

pg_rewind: Fix determining TLI when server was just promoted.

If the source server was just promoted, and it hasn't written the
checkpoint record yet, pg_rewind considered the server to be still on
the old timeline. Because of that, it would claim incorrectly that no
rewind is required. Fix that by looking at minRecoveryPointTLI in the
control file in addition to the ThisTimeLineID on the checkpoint.

This has been a known issue since forever, and we had worked around it
in the regression tests by issuing a checkpoint after each promotion,
before running pg_rewind. But that was always quite hacky, so better
to fix this properly. This commit doesn't add any new tests, but
removes the previously-added workarounds from the existing tests, so
that they should occasionally hit this codepath again.

This is arguably a bug fix, but don't backpatch because we haven't
really treated it as a bug so far. Also, the patch didn't apply
cleanly to v13 and below. I'm sure it could be made to work on
v13, but it doesn't seem worth the risk and effort.

Reviewed-by: Kyotaro Horiguchi, Ibrar Ahmed, Aleksander Alekseev
Discussion: https://www.postgresql.org/message-id/9f568c97-87fe-a716-bd39-65299b8a60f4%40iki.fi
This commit is contained in:
Heikki Linnakangas 2023-02-23 15:22:53 +02:00
parent 75c737636b
commit 009eeee746
4 changed files with 64 additions and 59 deletions

View File

@ -45,7 +45,13 @@ static void digestControlFile(ControlFileData *ControlFile,
const char *content, size_t size); const char *content, size_t size);
static void getRestoreCommand(const char *argv0); static void getRestoreCommand(const char *argv0);
static void sanityChecks(void); static void sanityChecks(void);
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex); static TimeLineHistoryEntry *getTimelineHistory(TimeLineID tli, bool is_source,
int *nentries);
static void findCommonAncestorTimeline(TimeLineHistoryEntry *a_history,
int a_nentries,
TimeLineHistoryEntry *b_history,
int b_nentries,
XLogRecPtr *recptr, int *tliIndex);
static void ensureCleanShutdown(const char *argv0); static void ensureCleanShutdown(const char *argv0);
static void disconnect_atexit(void); static void disconnect_atexit(void);
@ -134,6 +140,8 @@ main(int argc, char **argv)
XLogRecPtr chkptrec; XLogRecPtr chkptrec;
TimeLineID chkpttli; TimeLineID chkpttli;
XLogRecPtr chkptredo; XLogRecPtr chkptredo;
TimeLineID source_tli;
TimeLineID target_tli;
XLogRecPtr target_wal_endrec; XLogRecPtr target_wal_endrec;
size_t size; size_t size;
char *buffer; char *buffer;
@ -332,14 +340,28 @@ main(int argc, char **argv)
sanityChecks(); sanityChecks();
/*
* Usually, the TLI can be found in the latest checkpoint record. But if
* the source server is just being promoted (or it's a standby that's
* following a primary that's just being promoted), and the checkpoint
* requested by the promotion hasn't completed yet, the latest timeline is
* in minRecoveryPoint. So we check which is later, the TLI of the
* minRecoveryPoint or the latest checkpoint.
*/
source_tli = Max(ControlFile_source.minRecoveryPointTLI,
ControlFile_source.checkPointCopy.ThisTimeLineID);
/* Similarly for the target. */
target_tli = Max(ControlFile_target.minRecoveryPointTLI,
ControlFile_target.checkPointCopy.ThisTimeLineID);
/* /*
* Find the common ancestor timeline between the clusters. * Find the common ancestor timeline between the clusters.
* *
* If both clusters are already on the same timeline, there's nothing to * If both clusters are already on the same timeline, there's nothing to
* do. * do.
*/ */
if (ControlFile_target.checkPointCopy.ThisTimeLineID == if (target_tli == source_tli)
ControlFile_source.checkPointCopy.ThisTimeLineID)
{ {
pg_log_info("source and target cluster are on the same timeline"); pg_log_info("source and target cluster are on the same timeline");
rewind_needed = false; rewind_needed = false;
@ -348,12 +370,31 @@ main(int argc, char **argv)
else else
{ {
XLogRecPtr chkptendrec; XLogRecPtr chkptendrec;
TimeLineHistoryEntry *sourceHistory;
int sourceNentries;
/*
* Retrieve timelines for both source and target, and find the point
* where they diverged.
*/
sourceHistory = getTimelineHistory(source_tli, true, &sourceNentries);
targetHistory = getTimelineHistory(target_tli, false, &targetNentries);
findCommonAncestorTimeline(sourceHistory, sourceNentries,
targetHistory, targetNentries,
&divergerec, &lastcommontliIndex);
findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
pg_log_info("servers diverged at WAL location %X/%X on timeline %u", pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
LSN_FORMAT_ARGS(divergerec), LSN_FORMAT_ARGS(divergerec),
targetHistory[lastcommontliIndex].tli); targetHistory[lastcommontliIndex].tli);
/*
* Don't need the source history anymore. The target history is still
* needed by the routines in parsexlog.c, when we read the target WAL.
*/
pfree(sourceHistory);
/* /*
* Determine the end-of-WAL on the target. * Determine the end-of-WAL on the target.
* *
@ -654,7 +695,8 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
pg_fatal("source system was in unexpected state at end of rewind"); pg_fatal("source system was in unexpected state at end of rewind");
endrec = source->get_current_wal_insert_lsn(source); endrec = source->get_current_wal_insert_lsn(source);
endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID; endtli = Max(ControlFile_source_after.checkPointCopy.ThisTimeLineID,
ControlFile_source_after.minRecoveryPointTLI);
} }
} }
else else
@ -796,16 +838,12 @@ MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
} }
/* /*
* Retrieve timeline history for given control file which should behold * Retrieve timeline history for the source or target system.
* either source or target.
*/ */
static TimeLineHistoryEntry * static TimeLineHistoryEntry *
getTimelineHistory(ControlFileData *controlFile, int *nentries) getTimelineHistory(TimeLineID tli, bool is_source, int *nentries)
{ {
TimeLineHistoryEntry *history; TimeLineHistoryEntry *history;
TimeLineID tli;
tli = controlFile->checkPointCopy.ThisTimeLineID;
/* /*
* Timeline 1 does not have a history file, so there is no need to check * Timeline 1 does not have a history file, so there is no need to check
@ -826,12 +864,10 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries)
TLHistoryFilePath(path, tli); TLHistoryFilePath(path, tli);
/* Get history file from appropriate source */ /* Get history file from appropriate source */
if (controlFile == &ControlFile_source) if (is_source)
histfile = source->fetch_file(source, path, NULL); histfile = source->fetch_file(source, path, NULL);
else if (controlFile == &ControlFile_target)
histfile = slurpFile(datadir_target, path, NULL);
else else
pg_fatal("invalid control file"); histfile = slurpFile(datadir_target, path, NULL);
history = rewind_parseTimeLineHistory(histfile, tli, nentries); history = rewind_parseTimeLineHistory(histfile, tli, nentries);
pg_free(histfile); pg_free(histfile);
@ -841,12 +877,10 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries)
{ {
int i; int i;
if (controlFile == &ControlFile_source) if (is_source)
pg_log_debug("Source timeline history:"); pg_log_debug("Source timeline history:");
else if (controlFile == &ControlFile_target)
pg_log_debug("Target timeline history:");
else else
Assert(false); pg_log_debug("Target timeline history:");
/* /*
* Print the target timeline history. * Print the target timeline history.
@ -866,28 +900,19 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries)
} }
/* /*
* Determine the TLI of the last common timeline in the timeline history of the * Determine the TLI of the last common timeline in the timeline history of
* two clusters. targetHistory is filled with target timeline history and * two clusters. *tliIndex is set to the index of last common timeline in
* targetNentries is number of items in targetHistory. *tliIndex is set to the * the arrays, and *recptr is set to the position where the timeline history
* index of last common timeline in targetHistory array, and *recptr is set to * diverged (ie. the first WAL record that's not the same in both clusters).
* the position where the timeline history diverged (ie. the first WAL record
* that's not the same in both clusters).
*
* Control files of both clusters must be read into ControlFile_target/source
* before calling this routine.
*/ */
static void static void
findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex) findCommonAncestorTimeline(TimeLineHistoryEntry *a_history, int a_nentries,
TimeLineHistoryEntry *b_history, int b_nentries,
XLogRecPtr *recptr, int *tliIndex)
{ {
TimeLineHistoryEntry *sourceHistory;
int sourceNentries;
int i, int i,
n; n;
/* Retrieve timelines for both source and target */
sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
/* /*
* Trace the history forward, until we hit the timeline diverge. It may * Trace the history forward, until we hit the timeline diverge. It may
* still be possible that the source and target nodes used the same * still be possible that the source and target nodes used the same
@ -896,21 +921,19 @@ findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
* recovery processes. Hence check the start position of the new timeline * recovery processes. Hence check the start position of the new timeline
* as well and move down by one extra timeline entry if they do not match. * as well and move down by one extra timeline entry if they do not match.
*/ */
n = Min(sourceNentries, targetNentries); n = Min(a_nentries, b_nentries);
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
{ {
if (sourceHistory[i].tli != targetHistory[i].tli || if (a_history[i].tli != b_history[i].tli ||
sourceHistory[i].begin != targetHistory[i].begin) a_history[i].begin != b_history[i].begin)
break; break;
} }
if (i > 0) if (i > 0)
{ {
i--; i--;
*recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end); *recptr = MinXLogRecPtr(a_history[i].end, b_history[i].end);
*tliIndex = i; *tliIndex = i;
pg_free(sourceHistory);
return; return;
} }
else else

View File

@ -83,7 +83,6 @@ $node_b->wait_for_catchup('node_c', 'write', $lsn);
# A (primary) <--- B (standby) C (primary) # A (primary) <--- B (standby) C (primary)
$node_c->promote; $node_c->promote;
$node_c->safe_psql('postgres', "checkpoint");
# Insert a row in A. This causes A/B and C to have "diverged", so that it's # Insert a row in A. This causes A/B and C to have "diverged", so that it's

View File

@ -76,13 +76,6 @@ $node_1->wait_for_catchup('node_3');
# #
$node_1->stop('fast'); $node_1->stop('fast');
$node_3->promote; $node_3->promote;
# Force a checkpoint after the promotion. pg_rewind looks at the control
# file to determine what timeline the server is on, and that isn't updated
# immediately at promotion, but only at the next checkpoint. When running
# pg_rewind in remote mode, it's possible that we complete the test steps
# after promotion so quickly that when pg_rewind runs, the standby has not
# performed a checkpoint after promotion yet.
$node_3->safe_psql('postgres', "checkpoint");
# reconfigure node_1 as a standby following node_3 # reconfigure node_1 as a standby following node_3
my $node_3_connstr = $node_3->connstr; my $node_3_connstr = $node_3->connstr;
@ -108,8 +101,6 @@ $node_2->restart();
$node_3->wait_for_catchup('node_1'); $node_3->wait_for_catchup('node_1');
$node_1->promote; $node_1->promote;
# Force a checkpoint after promotion, like earlier.
$node_1->safe_psql('postgres', "checkpoint");
# #
# We now have a split-brain with two primaries. Insert a row on both to # We now have a split-brain with two primaries. Insert a row on both to

View File

@ -198,14 +198,6 @@ sub promote_standby
# the primary out-of-sync with the standby. # the primary out-of-sync with the standby.
$node_standby->promote; $node_standby->promote;
# Force a checkpoint after the promotion. pg_rewind looks at the control
# file to determine what timeline the server is on, and that isn't updated
# immediately at promotion, but only at the next checkpoint. When running
# pg_rewind in remote mode, it's possible that we complete the test steps
# after promotion so quickly that when pg_rewind runs, the standby has not
# performed a checkpoint after promotion yet.
standby_psql("checkpoint");
return; return;
} }