Add further debug info to help debug 019_replslot_limit.pl failures.

See also afdeff1052. Failures after that commit provided a few more hints,
but not yet enough to understand what's going on.

In 019_replslot_limit.pl, shut down nodes with fast instead of immediate mode
if we observe the failure mode. That should tell us whether the failures we're
observing are just a timing issue under high load. PGCTLTIMEOUT should prevent
buildfarm animals from hanging endlessly.

Also add a bit more logging to replication slot drop and ShutdownPostgres().

Discussion: https://postgr.es/m/20220225192941.hqnvefgdzaro6gzg@alap3.anarazel.de
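
Note: the elog(DEBUG3) calls added below are only emitted when log_min_messages
is set to debug3 (or more verbose); the commit itself does not change any logging
configuration. As a rough, hypothetical sketch (not part of the commit), using the
standard PostgreSQL::Test::Cluster TAP API, a local reproduction run could surface
the new messages and cap shutdown waits the same way the buildfarm does via
PGCTLTIMEOUT:

use strict;
use warnings;
use PostgreSQL::Test::Cluster;

# Cap how long pg_ctl waits during start/stop; buildfarm animals rely on the
# PGCTLTIMEOUT environment variable for the same purpose.
$ENV{PGCTLTIMEOUT} = 180;

# Hypothetical node name; any cluster that exercises replication slots works.
my $node = PostgreSQL::Test::Cluster->new('slot_debug');
$node->init(allows_streaming => 1);
# Make the DEBUG3 messages added by this commit visible in the server log.
$node->append_conf('postgresql.conf', 'log_min_messages = debug3');
$node->start;

# ... create and drop replication slots here, e.g. via pg_basebackup or
# pg_create_physical_replication_slot() ...

# A fast shutdown lets backends run ShutdownPostgres() and other exit hooks,
# unlike immediate mode, which is why the test now prefers it on failure.
$node->stop('fast');
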
src/backend/replication/slot.c

@@ -569,6 +569,10 @@ restart:
         if (!s->in_use)
             continue;
 
+        /* unlocked read of active_pid is ok for debugging purposes */
+        elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
+             i, s->active_pid);
+
         SpinLockAcquire(&s->mutex);
         if (s->active_pid == MyProcPid)
         {
@@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
     char        path[MAXPGPATH];
     char        tmppath[MAXPGPATH];
 
+    /* temp debugging aid to analyze 019_replslot_limit failures */
+    elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));
+
     /*
      * If some other backend ran this code concurrently with us, we might try
      * to delete a slot with a certain name while someone else was trying to
@@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
                         path, tmppath)));
     }
 
+    elog(DEBUG3, "replication slot drop: %s: removed on-disk",
+         NameStr(slot->data.name));
+
     /*
      * The slot is definitely gone.  Lock out concurrent scans of the array
      * long enough to kill it.  It's OK to clear the active PID here without
@@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
      * a slot while we're still cleaning up the detritus of the old one.
      */
     LWLockRelease(ReplicationSlotAllocationLock);
+
+    elog(DEBUG3, "replication slot drop: %s: done",
+         NameStr(slot->data.name));
 }
 
 /*
src/backend/storage/lmgr/lwlock.c

@@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
     }
     return false;
 }
+
+/* temp debugging aid to analyze 019_replslot_limit failures */
+int
+LWLockHeldCount(void)
+{
+    return num_held_lwlocks;
+}
src/backend/utils/init/postinit.c

@@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
      * them explicitly.
      */
     LockReleaseAll(USER_LOCKMETHOD, true);
+
+    /*
+     * temp debugging aid to analyze 019_replslot_limit failures
+     *
+     * If an error were thrown outside of a transaction nothing up to now
+     * would have released lwlocks. We probably will add an
+     * LWLockReleaseAll(). But for now make it easier to understand such cases
+     * by warning if any lwlocks are held.
+     */
+#ifdef USE_ASSERT_CHECKING
+    {
+        int         held_lwlocks = LWLockHeldCount();
+        if (held_lwlocks)
+            elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
+                 held_lwlocks);
+    }
+#endif
 }
 
 
src/include/storage/lwlock.h

@@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
 extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+extern int  LWLockHeldCount(void);
 
 extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
src/test/recovery/t/019_replslot_limit.pl

@@ -335,7 +335,23 @@ $node_standby3->start;
 $node_primary3->wait_for_catchup($node_standby3);
 my $senderpid = $node_primary3->safe_psql('postgres',
     "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+# We've seen occasional cases where multiple walsender pids are active. An
+# immediate shutdown may hide evidence of a locking bug. So if multiple
+# walsenders are observed, shut down in fast mode, and collect some more
+# information.
+if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+{
+    my ($stdout, $stderr);
+    $node_primary3->psql('postgres',
+                         "\\a\\t\nSELECT * FROM pg_stat_activity",
+                         stdout => \$stdout, stderr => \$stderr);
+    diag $stdout, $stderr;
+    $node_primary3->stop('fast');
+    $node_standby3->stop('fast');
+    die "could not determine walsender pid, can't continue";
+}
+
 my $receiverpid = $node_standby3->safe_psql('postgres',
     "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
 like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");