mirror of
https://github.com/postgres/postgres.git
synced 2025-05-05 09:19:17 +03:00
Adjust pg_wal_replay_wait() procedure behavior on promoted standby
pg_wal_replay_wait() is intended to be called on a standby. However, a standby can be promoted to primary at any moment, even concurrently with the pg_wal_replay_wait() call. If recovery is not currently in progress, that doesn't mean the wait was unsuccessful. Thus, we always need to recheck whether the target LSN has been replayed. Reported-by: Kevin Hale Boyes Discussion: https://postgr.es/m/CAPpHfdu5QN%2BZGACS%2B7foxmr8_nekgA2PA%2B-G3BuOUrdBLBFb6Q%40mail.gmail.com Author: Alexander Korotkov
This commit is contained in:
parent
bbf668d66f
commit
867d396ccd
@ -28969,6 +28969,15 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
|
|||||||
connection pooler side.
|
connection pooler side.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
<function>pg_wal_replay_wait</function> should be called on a standby.
|
||||||
|
If a user calls <function>pg_wal_replay_wait</function> on primary, it
|
||||||
|
will error out. However, if <function>pg_wal_replay_wait</function> is
|
||||||
|
called on a primary promoted from a standby and <literal>target_lsn</literal>
|
||||||
|
was already replayed, then <function>pg_wal_replay_wait</function> just
|
||||||
|
exits immediately.
|
||||||
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
You can use <function>pg_wal_replay_wait</function> to wait for
|
You can use <function>pg_wal_replay_wait</function> to wait for
|
||||||
the <type>pg_lsn</type> value. For example, an application could update
|
the <type>pg_lsn</type> value. For example, an application could update
|
||||||
|
@ -230,14 +230,27 @@ WaitForLSNReplay(XLogRecPtr targetLSN, int64 timeout)
|
|||||||
Assert(MyProcNumber >= 0 && MyProcNumber < MaxBackends);
|
Assert(MyProcNumber >= 0 && MyProcNumber < MaxBackends);
|
||||||
|
|
||||||
if (!RecoveryInProgress())
|
if (!RecoveryInProgress())
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Recovery is not in progress. Given that we detected this in the
|
||||||
|
* very first check, this procedure was mistakenly called on primary.
|
||||||
|
* However, it's possible that the standby was promoted concurrently with
|
||||||
|
* the procedure call, while the target LSN was already replayed. So, we still
|
||||||
|
* check the last replay LSN before reporting an error.
|
||||||
|
*/
|
||||||
|
if (targetLSN <= GetXLogReplayRecPtr(NULL))
|
||||||
|
return;
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
errmsg("recovery is not in progress"),
|
errmsg("recovery is not in progress"),
|
||||||
errhint("Waiting for LSN can only be executed during recovery.")));
|
errhint("Waiting for LSN can only be executed during recovery.")));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
/* If target LSN is already replayed, exit immediately */
|
/* If target LSN is already replayed, exit immediately */
|
||||||
if (targetLSN <= GetXLogReplayRecPtr(NULL))
|
if (targetLSN <= GetXLogReplayRecPtr(NULL))
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (timeout > 0)
|
if (timeout > 0)
|
||||||
{
|
{
|
||||||
@ -257,19 +270,30 @@ WaitForLSNReplay(XLogRecPtr targetLSN, int64 timeout)
|
|||||||
int rc;
|
int rc;
|
||||||
long delay_ms = 0;
|
long delay_ms = 0;
|
||||||
|
|
||||||
/* Check if the waited LSN has been replayed */
|
|
||||||
currentLSN = GetXLogReplayRecPtr(NULL);
|
|
||||||
if (targetLSN <= currentLSN)
|
|
||||||
break;
|
|
||||||
|
|
||||||
/* Recheck that recovery is still in-progress */
|
/* Recheck that recovery is still in-progress */
|
||||||
if (!RecoveryInProgress())
|
if (!RecoveryInProgress())
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Recovery was ended, but recheck if target LSN was already
|
||||||
|
* replayed.
|
||||||
|
*/
|
||||||
|
currentLSN = GetXLogReplayRecPtr(NULL);
|
||||||
|
if (targetLSN <= currentLSN)
|
||||||
|
return;
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
errmsg("recovery is not in progress"),
|
errmsg("recovery is not in progress"),
|
||||||
errdetail("Recovery ended before replaying target LSN %X/%X; last replay LSN %X/%X.",
|
errdetail("Recovery ended before replaying target LSN %X/%X; last replay LSN %X/%X.",
|
||||||
LSN_FORMAT_ARGS(targetLSN),
|
LSN_FORMAT_ARGS(targetLSN),
|
||||||
LSN_FORMAT_ARGS(currentLSN))));
|
LSN_FORMAT_ARGS(currentLSN))));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Check if the waited LSN has been replayed */
|
||||||
|
currentLSN = GetXLogReplayRecPtr(NULL);
|
||||||
|
if (targetLSN <= currentLSN)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the timeout value is specified, calculate the number of
|
* If the timeout value is specified, calculate the number of
|
||||||
|
@ -126,12 +126,18 @@ ok(1, 'multiple LSN waiters reported consistent data');
|
|||||||
|
|
||||||
# 5. Check that the standby promotion terminates the wait on LSN. Start
|
# 5. Check that the standby promotion terminates the wait on LSN. Start
|
||||||
# waiting for an unreachable LSN then promote. Check the log for the relevant
|
# waiting for an unreachable LSN then promote. Check the log for the relevant
|
||||||
# error message.
|
# error message. Also, check that waiting for already replayed LSN doesn't
|
||||||
|
# cause an error even after promotion.
|
||||||
|
my $lsn4 =
|
||||||
|
$node_primary->safe_psql('postgres',
|
||||||
|
"SELECT pg_current_wal_insert_lsn() + 10000000000");
|
||||||
|
my $lsn5 =
|
||||||
|
$node_primary->safe_psql('postgres', "SELECT pg_current_wal_insert_lsn()");
|
||||||
my $psql_session = $node_standby1->background_psql('postgres');
|
my $psql_session = $node_standby1->background_psql('postgres');
|
||||||
$psql_session->query_until(
|
$psql_session->query_until(
|
||||||
qr/start/, qq[
|
qr/start/, qq[
|
||||||
\\echo start
|
\\echo start
|
||||||
CALL pg_wal_replay_wait('${lsn3}');
|
CALL pg_wal_replay_wait('${lsn4}');
|
||||||
]);
|
]);
|
||||||
|
|
||||||
$log_offset = -s $node_standby1->logfile;
|
$log_offset = -s $node_standby1->logfile;
|
||||||
@ -140,6 +146,11 @@ $node_standby1->wait_for_log('recovery is not in progress', $log_offset);
|
|||||||
|
|
||||||
ok(1, 'got error after standby promote');
|
ok(1, 'got error after standby promote');
|
||||||
|
|
||||||
|
$node_standby1->safe_psql('postgres', "CALL pg_wal_replay_wait('${lsn5}');");
|
||||||
|
|
||||||
|
ok(1,
|
||||||
|
'wait for already replayed LSN exists immediately even after promotion');
|
||||||
|
|
||||||
$node_standby1->stop;
|
$node_standby1->stop;
|
||||||
$node_primary->stop;
|
$node_primary->stop;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user