1
0
mirror of https://github.com/postgres/postgres.git synced 2025-04-27 22:56:53 +03:00

Make logical WAL sender report streaming state appropriately

WAL senders sending logically-decoded data fail to properly report in
"streaming" state when starting up, hence as long as one extra record is
not replayed, such WAL senders would remain in a "catchup" state, which
is inconsistent with the physical cousin.

This can be easily reproduced by for example using pg_recvlogical and
restarting the upstream server.  The TAP tests have been slightly
modified to detect the failure and strengthened so as future tests also
make sure that a node is in streaming state when waiting for its
catchup.

Backpatch down to 9.4 where this code has been introduced.

Reported-by: Sawada Masahiko
Author: Simon Riggs, Sawada Masahiko
Reviewed-by: Petr Jelinek, Michael Paquier, Vaishnavi Prabakaran
Discussion: https://postgr.es/m/CAD21AoB2ZbCCqOx=bgKMcLrAvs1V0ZMqzs7wBTuDySezTGtMZA@mail.gmail.com
This commit is contained in:
Michael Paquier 2018-07-12 10:19:35 +09:00
parent 39a96512b3
commit 9a7b7adc13
3 changed files with 23 additions and 7 deletions

View File

@ -2169,7 +2169,7 @@ WalSndLoop(WalSndSendDataCallback send_data)
if (MyWalSnd->state == WALSNDSTATE_CATCHUP) if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
{ {
ereport(DEBUG1, ereport(DEBUG1,
(errmsg("standby \"%s\" has now caught up with primary", (errmsg("\"%s\" has now caught up with upstream server",
application_name))); application_name)));
WalSndSetState(WALSNDSTATE_STREAMING); WalSndSetState(WALSNDSTATE_STREAMING);
} }
@ -2758,10 +2758,10 @@ XLogSendLogical(void)
char *errm; char *errm;
/* /*
* Don't know whether we've caught up yet. We'll set it to true in * Don't know whether we've caught up yet. We'll set WalSndCaughtUp to
* WalSndWaitForWal, if we're actually waiting. We also set to true if * true in WalSndWaitForWal, if we're actually waiting. We also set to
* XLogReadRecord() had to stop reading but WalSndWaitForWal didn't wait - * true if XLogReadRecord() had to stop reading but WalSndWaitForWal
* i.e. when we're shutting down. * didn't wait - i.e. when we're shutting down.
*/ */
WalSndCaughtUp = false; WalSndCaughtUp = false;
@ -2774,6 +2774,9 @@ XLogSendLogical(void)
if (record != NULL) if (record != NULL)
{ {
/* XXX: Note that logical decoding cannot be used while in recovery */
XLogRecPtr flushPtr = GetFlushRecPtr();
/* /*
* Note the lack of any call to LagTrackerWrite() which is handled by * Note the lack of any call to LagTrackerWrite() which is handled by
* WalSndUpdateProgress which is called by output plugin through * WalSndUpdateProgress which is called by output plugin through
@ -2782,6 +2785,13 @@ XLogSendLogical(void)
LogicalDecodingProcessRecord(logical_decoding_ctx, logical_decoding_ctx->reader); LogicalDecodingProcessRecord(logical_decoding_ctx, logical_decoding_ctx->reader);
sentPtr = logical_decoding_ctx->reader->EndRecPtr; sentPtr = logical_decoding_ctx->reader->EndRecPtr;
/*
* If we have sent a record that is at or beyond the flushed point, we
* have caught up.
*/
if (sentPtr >= flushPtr)
WalSndCaughtUp = true;
} }
else else
{ {

View File

@ -1535,7 +1535,8 @@ also works for logical subscriptions)
until its replication location in pg_stat_replication equals or passes the until its replication location in pg_stat_replication equals or passes the
upstream's WAL insert point at the time this function is called. By default upstream's WAL insert point at the time this function is called. By default
the replay_lsn is waited for, but 'mode' may be specified to wait for any of the replay_lsn is waited for, but 'mode' may be specified to wait for any of
sent|write|flush|replay. sent|write|flush|replay. The connection catching up must be in a streaming
state.
If there is no active replication connection from this peer, waits until If there is no active replication connection from this peer, waits until
poll_query_until timeout. poll_query_until timeout.
@ -1580,7 +1581,7 @@ sub wait_for_catchup
. $lsn_expr . " on " . $lsn_expr . " on "
. $self->name . "\n"; . $self->name . "\n";
my $query = my $query =
qq[SELECT $lsn_expr <= ${mode}_lsn FROM pg_catalog.pg_stat_replication WHERE application_name = '$standby_name';]; qq[SELECT $lsn_expr <= ${mode}_lsn AND state = 'streaming' FROM pg_catalog.pg_stat_replication WHERE application_name = '$standby_name';];
$self->poll_query_until('postgres', $query) $self->poll_query_until('postgres', $query)
or croak "timed out waiting for catchup"; or croak "timed out waiting for catchup";
print "done\n"; print "done\n";

View File

@ -188,6 +188,11 @@ $node_publisher->safe_psql('postgres',
"INSERT INTO tab_ins SELECT generate_series(1001,1100)"); "INSERT INTO tab_ins SELECT generate_series(1001,1100)");
$node_publisher->safe_psql('postgres', "DELETE FROM tab_rep"); $node_publisher->safe_psql('postgres', "DELETE FROM tab_rep");
# Restart the publisher and check the state of the subscriber which
# should be in a streaming state after catching up.
$node_publisher->stop('fast');
$node_publisher->start;
$node_publisher->wait_for_catchup($appname); $node_publisher->wait_for_catchup($appname);
$result = $node_subscriber->safe_psql('postgres', $result = $node_subscriber->safe_psql('postgres',