mirror of
https://github.com/postgres/postgres.git
synced 2025-05-01 01:04:50 +03:00
Fix more race conditions in the newly-added pg_rewind test.
pg_rewind looks at the control file to check what timeline a server is on. But promotion doesn't immediately write a checkpoint, it merely writes an end-of-recovery WAL record. If pg_rewind runs immediately after promotion, before the checkpoint has completed, it will think that the server is still on the earlier timeline. We ran into this issue a long time ago already, see commit 484a848a73f. It's a bit bogus that pg_rewind doesn't determine the timeline correctly until the end-of-recovery checkpoint has completed. We probably should fix that. But for now, work around it by waiting for the checkpoint to complete before running pg_rewind, like we did in commit 484a848a73f. In passing, tidy up the new test a little bit. Reorder the INSERTs so that the comments make more sense, remove a spurious CHECKPOINT call after pg_rewind has already run, and add the --debug option, so that if this fails again, we'll have more data. Per buildfarm failure at https://buildfarm.postgresql.org/cgi-bin/show_stage_log.pl?nm=rorqual&dt=2020-12-06%2018%3A32%3A19&stg=pg_rewind-check. Backpatch to all supported versions. Discussion: https://www.postgresql.org/message-id/1713707e-e318-761c-d287-5b6a4aa807e8@iki.fi
This commit is contained in:
parent
45d3631450
commit
d137b14c3c
@ -75,6 +75,13 @@ $node_1->wait_for_catchup('node_3', 'replay', $lsn);
|
|||||||
#
|
#
|
||||||
$node_1->stop('fast');
|
$node_1->stop('fast');
|
||||||
$node_3->promote;
|
$node_3->promote;
|
||||||
|
# Force a checkpoint after the promotion. pg_rewind looks at the control
|
||||||
|
# file to determine what timeline the server is on, and that isn't updated
|
||||||
|
# immediately at promotion, but only at the next checkpoint. When running
|
||||||
|
# pg_rewind in remote mode, it's possible that we complete the test steps
|
||||||
|
# after promotion so quickly that when pg_rewind runs, the standby has not
|
||||||
|
# performed a checkpoint after promotion yet.
|
||||||
|
$node_3->safe_psql('postgres', "checkpoint");
|
||||||
|
|
||||||
# reconfigure node_1 as a standby following node_3
|
# reconfigure node_1 as a standby following node_3
|
||||||
my $node_3_connstr = $node_3->connstr;
|
my $node_3_connstr = $node_3->connstr;
|
||||||
@ -105,6 +112,8 @@ $lsn = $node_3->lsn('insert');
|
|||||||
$node_3->wait_for_catchup('node_1', 'replay', $lsn);
|
$node_3->wait_for_catchup('node_1', 'replay', $lsn);
|
||||||
|
|
||||||
$node_1->promote;
|
$node_1->promote;
|
||||||
|
# Force a checkpoint after promotion, like earlier.
|
||||||
|
$node_1->safe_psql('postgres', "checkpoint");
|
||||||
|
|
||||||
#
|
#
|
||||||
# We now have a split-brain with two primaries. Insert a row on both to
|
# We now have a split-brain with two primaries. Insert a row on both to
|
||||||
@ -112,6 +121,9 @@ $node_1->promote;
|
|||||||
# see the insert on 1, as the insert on node 3 is rewound away.
|
# see the insert on 1, as the insert on node 3 is rewound away.
|
||||||
#
|
#
|
||||||
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')");
|
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')");
|
||||||
|
# 'bar' is unmodified in node 1, so it won't be overwritten by replaying the
|
||||||
|
# WAL from node 1.
|
||||||
|
$node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')");
|
||||||
|
|
||||||
# Insert more rows in node 1, to bump up the XID counter. Otherwise, if
|
# Insert more rows in node 1, to bump up the XID counter. Otherwise, if
|
||||||
# rewind doesn't correctly rewind the changes made on the other node,
|
# rewind doesn't correctly rewind the changes made on the other node,
|
||||||
@ -120,10 +132,6 @@ $node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')"
|
|||||||
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')");
|
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')");
|
||||||
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')");
|
$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')");
|
||||||
|
|
||||||
# Also insert a row in 'bar' on node 3. It is unmodified in node 1, so it won't get
|
|
||||||
# overwritten by replaying the WAL from node 1.
|
|
||||||
$node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')");
|
|
||||||
|
|
||||||
# Wait for node 2 to catch up
|
# Wait for node 2 to catch up
|
||||||
$node_2->poll_query_until('postgres',
|
$node_2->poll_query_until('postgres',
|
||||||
q|SELECT COUNT(*) > 1 FROM public.bar|, 't');
|
q|SELECT COUNT(*) > 1 FROM public.bar|, 't');
|
||||||
@ -145,9 +153,10 @@ command_ok(
|
|||||||
[
|
[
|
||||||
'pg_rewind',
|
'pg_rewind',
|
||||||
"--source-server=$node_1_connstr",
|
"--source-server=$node_1_connstr",
|
||||||
"--target-pgdata=$node_2_pgdata"
|
"--target-pgdata=$node_2_pgdata",
|
||||||
|
"--debug"
|
||||||
],
|
],
|
||||||
'pg_rewind detects rewind needed');
|
'run pg_rewind');
|
||||||
|
|
||||||
# Now move back postgresql.conf with old settings
|
# Now move back postgresql.conf with old settings
|
||||||
move(
|
move(
|
||||||
@ -159,7 +168,6 @@ $node_2->start;
|
|||||||
# Check contents of the test tables after rewind. The rows inserted in node 3
|
# Check contents of the test tables after rewind. The rows inserted in node 3
|
||||||
# before rewind should've been overwritten with the data from node 1.
|
# before rewind should've been overwritten with the data from node 1.
|
||||||
my $result;
|
my $result;
|
||||||
$result = $node_2->safe_psql('postgres', 'checkpoint');
|
|
||||||
$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo');
|
$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo');
|
||||||
is($result, qq(keep this
|
is($result, qq(keep this
|
||||||
and this
|
and this
|
||||||
|
Loading…
x
Reference in New Issue
Block a user