Add further debug info to help debug 019_replslot_limit.pl failures.

See also afdeff1052. Failures after that commit provided a few more hints,
but not yet enough to understand what's going on.

In 019_replslot_limit.pl, shut down nodes with fast instead of immediate mode
if we observe the failure mode. That should tell us whether the failures we're
observing are just a timing issue under high load. PGCTLTIMEOUT should prevent
buildfarm animals from hanging endlessly.

Also add a bit more logging to replication slot drop and ShutdownPostgres().

Discussion: https://postgr.es/m/20220225192941.hqnvefgdzaro6gzg@alap3.anarazel.de
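
Note: the elog(DEBUG3) calls added below are only emitted when log_min_messages
is set to debug3 (or more verbose); the commit itself does not change any logging
configuration. As a rough, hypothetical sketch (not part of the commit), using the
standard PostgreSQL::Test::Cluster TAP API, a local reproduction run could surface
the new messages and cap shutdown waits the same way the buildfarm does via
PGCTLTIMEOUT:

use strict;
use warnings;
use PostgreSQL::Test::Cluster;

# Cap how long pg_ctl waits during start/stop; buildfarm animals rely on the
# PGCTLTIMEOUT environment variable for the same purpose.
$ENV{PGCTLTIMEOUT} = 180;

# Hypothetical node name; any cluster that exercises replication slots works.
my $node = PostgreSQL::Test::Cluster->new('slot_debug');
$node->init(allows_streaming => 1);
# Make the DEBUG3 messages added by this commit visible in the server log.
$node->append_conf('postgresql.conf', 'log_min_messages = debug3');
$node->start;

# ... create and drop replication slots here, e.g. via pg_basebackup or
# pg_create_physical_replication_slot() ...

# A fast shutdown lets backends run ShutdownPostgres() and other exit hooks,
# unlike immediate mode, which is why the test now prefers it on failure.
$node->stop('fast');
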
src/backend/replication/slot.c

@@ -569,6 +569,10 @@ restart:
         if (!s->in_use)
             continue;
 
+        /* unlocked read of active_pid is ok for debugging purposes */
+        elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
+             i, s->active_pid);
+
         SpinLockAcquire(&s->mutex);
         if (s->active_pid == MyProcPid)
         {
@@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
     char        path[MAXPGPATH];
     char        tmppath[MAXPGPATH];
 
+    /* temp debugging aid to analyze 019_replslot_limit failures */
+    elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));
+
     /*
      * If some other backend ran this code concurrently with us, we might try
      * to delete a slot with a certain name while someone else was trying to
@@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
                         path, tmppath)));
     }
 
+    elog(DEBUG3, "replication slot drop: %s: removed on-disk",
+         NameStr(slot->data.name));
+
     /*
      * The slot is definitely gone.  Lock out concurrent scans of the array
      * long enough to kill it.  It's OK to clear the active PID here without
@@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
      * a slot while we're still cleaning up the detritus of the old one.
      */
     LWLockRelease(ReplicationSlotAllocationLock);
+
+    elog(DEBUG3, "replication slot drop: %s: done",
+         NameStr(slot->data.name));
 }
 
 /*
src/backend/storage/lmgr/lwlock.c

@@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
     }
     return false;
 }
+
+/* temp debugging aid to analyze 019_replslot_limit failures */
+int
+LWLockHeldCount(void)
+{
+    return num_held_lwlocks;
+}
src/backend/utils/init/postinit.c

@@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
      * them explicitly.
      */
     LockReleaseAll(USER_LOCKMETHOD, true);
+
+    /*
+     * temp debugging aid to analyze 019_replslot_limit failures
+     *
+     * If an error were thrown outside of a transaction nothing up to now
+     * would have released lwlocks. We probably will add an
+     * LWLockReleaseAll(). But for now make it easier to understand such cases
+     * by warning if any lwlocks are held.
+     */
+#ifdef USE_ASSERT_CHECKING
+    {
+        int         held_lwlocks = LWLockHeldCount();
+        if (held_lwlocks)
+            elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
+                 held_lwlocks);
+    }
+#endif
 }
 
 
src/include/storage/lwlock.h

@@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
 extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+extern int  LWLockHeldCount(void);
 
 extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
src/test/recovery/t/019_replslot_limit.pl

@@ -335,7 +335,23 @@ $node_standby3->start;
 $node_primary3->wait_for_catchup($node_standby3);
 my $senderpid = $node_primary3->safe_psql('postgres',
     "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+# We've seen occasional cases where multiple walsender pids are active. An
+# immediate shutdown may hide evidence of a locking bug. So if multiple
+# walsenders are observed, shut down in fast mode, and collect some more
+# information.
+if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+{
+    my ($stdout, $stderr);
+    $node_primary3->psql('postgres',
+                         "\\a\\t\nSELECT * FROM pg_stat_activity",
+                         stdout => \$stdout, stderr => \$stderr);
+    diag $stdout, $stderr;
+    $node_primary3->stop('fast');
+    $node_standby3->stop('fast');
+    die "could not determine walsender pid, can't continue";
+}
+
 my $receiverpid = $node_standby3->safe_psql('postgres',
     "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
 like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");