Don't fail for > 1 walsenders in 019_replslot_limit, add debug messages.
So far the first of the retries introduced in f28bf667f60 resolves the
issue. But I (Andres) am still suspicious that the start of the failures
might indicate a problem.

To reduce noise, stop reporting a failure if a retry resolves the problem.
To allow figuring out what causes the slow slot drop, add a few more debug
messages to ReplicationSlotDropPtr.

See also commits afdeff10526, fe0972ee5e6 and f28bf667f60.

Discussion: https://postgr.es/m/20220327213219.smdvfkq2fl74flow@alap3.anarazel.de
parent da4b56662f
commit 91c0570a79
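The heart of the test-side change is that Test::More's like() records a
pass/fail result every time it is called, so invoking it inside the retry
loop turned any transient extra walsender into a reported failure even when
a later iteration succeeded. A minimal sketch of the retry-then-assert-once
shape the test moves to ($node, the query, and the iteration cap are
illustrative stand-ins, not the test's exact code):

	use strict;
	use warnings;
	use Test::More;
	use Time::HiRes qw(usleep);

	my $pid;
	foreach my $i (1 .. 150)
	{
		# $node is a stand-in for a PostgreSQL::Test::Cluster node
		$pid = $node->safe_psql('postgres',
			"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");

		# a single numeric pid means walsender shutdown has finished
		last if $pid =~ /^[0-9]+$/;

		diag "multiple walsenders active in iteration $i";
		usleep(100_000);    # wait 100ms before looking again
	}

	# assert exactly once, after the loop, so iterations that merely saw
	# slow process shutdown no longer count as test failures
	like($pid, qr/^[0-9]+$/, "have walsender pid $pid");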
src/backend/replication/slot.c

@@ -702,8 +702,13 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	slot->active_pid = 0;
 	slot->in_use = false;
 	LWLockRelease(ReplicationSlotControlLock);
+
+	elog(DEBUG3, "replication slot drop: %s: marked as not in use", NameStr(slot->data.name));
+
 	ConditionVariableBroadcast(&slot->active_cv);
 
+	elog(DEBUG3, "replication slot drop: %s: notified others", NameStr(slot->data.name));
+
 	/*
 	 * Slot is dead and doesn't prevent resource removal anymore, recompute
 	 * limits.
@@ -711,6 +716,8 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	ReplicationSlotsComputeRequiredXmin(false);
 	ReplicationSlotsComputeRequiredLSN();
 
+	elog(DEBUG3, "replication slot drop: %s: computed required", NameStr(slot->data.name));
+
 	/*
 	 * If removing the directory fails, the worst thing that will happen is
 	 * that the user won't be able to create a new slot with the same name
@@ -720,6 +727,8 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 		ereport(WARNING,
 				(errmsg("could not remove directory \"%s\"", tmppath)));
 
+	elog(DEBUG3, "replication slot drop: %s: removed directory", NameStr(slot->data.name));
+
 	/*
 	 * Send a message to drop the replication slot to the stats collector.
 	 * Since there is no guarantee of the order of message transfer on a UDP
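The new messages are emitted at DEBUG3, so they only reach the server log
when log_min_messages is at least that verbose. A minimal sketch of turning
that on from a TAP test, assuming the current PostgreSQL::Test::Cluster API
(the node name is a stand-in):

	# log_min_messages is reloadable from postgresql.conf, no restart needed
	$node->append_conf('postgresql.conf', 'log_min_messages = debug3');
	$node->reload;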
src/test/recovery/t/019_replslot_limit.pl

@@ -339,8 +339,8 @@ my $senderpid;
 # We've seen occasional cases where multiple walsender pids are active. It
 # could be that we're just observing process shutdown being slow. To collect
 # more information, retry a couple times, print a bit of debugging information
-# each iteration. For now report a test failure even if later iterations
-# succeed.
+# each iteration. Don't fail the test if retries find just one pid, the
+# buildfarm failures are too noisy.
 my $i = 0;
 while (1)
 {
@@ -349,7 +349,9 @@ while (1)
 	$senderpid = $node_primary3->safe_psql('postgres',
 		"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
 
-	last if like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+	last if $senderpid =~ qr/^[0-9]+$/;
+
+	diag "multiple walsenders active in iteration $i";
 
 	# show information about all active connections
 	$node_primary3->psql('postgres',
@@ -370,6 +372,8 @@ while (1)
 	usleep(100_000);
 }
 
+like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
 my $receiverpid = $node_standby3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
 like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
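To chase failures like the buildfarm's locally, this one script can be run
on its own from a configured source tree via the TAP suite's standard
PROVE_TESTS variable:

	make -C src/test/recovery check PROVE_TESTS=t/019_replslot_limit.pl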