Modify the isolation tester so that multiple sessions can wait.

This allows testing of deadlock scenarios. Scenarios that would previously have been considered invalid are now simply taken as a scenario in which more than one backend will wait.
2025-10-15 05:46:52 +03:00 · 2016-02-11 08:36:30 -05:00
parent c9882c60f4
commit 38f8bdcac4
2 changed files with 200 additions and 151 deletions
--- a/src/test/isolation/README
+++ b/src/test/isolation/README
@@ -107,10 +107,11 @@ Each step may contain commands that block until further action has been taken
 (most likely, some other session runs a step that unblocks it or causes a
 deadlock).  A test that uses this ability must manually specify valid
 permutations, i.e. those that would not expect a blocked session to execute a
-command.  If the test fails to follow that rule, the test is aborted.
+command.  If a test fails to follow that rule, isolationtester will cancel it
-
+after 60 seconds.  If the cancel doesn't work, isolationtester will exit
-Currently, at most one step can be waiting at a time.  As long as one
+uncleanly after a total of 75 seconds of wait time.  Testing invalid
-step is waiting, subsequent steps are run to completion synchronously.
+permutations should be avoided because they can make the isolation tests take
 a very long time to run, and they serve no useful testing purpose.
 Note that isolationtester recognizes that a command has blocked by looking
 to see if it is shown as waiting in the pg_locks view; therefore, only
--- a/src/test/isolation/isolationtester.c
+++ b/src/test/isolation/isolationtester.c
@@ -15,6 +15,7 @@
 #include <sys/select.h>
 #endif
 #include "datatype/timestamp.h"
 #include "libpq-fe.h"
 #include "pqexpbuffer.h"
 #include "pg_getopt.h"
@@ -237,54 +238,6 @@ main(int argc, char **argv)
 	appendPQExpBufferStr(&wait_query,
 						 ") "
 						 "AND holder.mode = ANY (CASE waiter.mode "
 						 "WHEN 'AccessShareLock' THEN ARRAY["
 						 "'AccessExclusiveLock'] "
 						 "WHEN 'RowShareLock' THEN ARRAY["
 						 "'ExclusiveLock',"
 						 "'AccessExclusiveLock'] "
 						 "WHEN 'RowExclusiveLock' THEN ARRAY["
 						 "'ShareLock',"
 						 "'ShareRowExclusiveLock',"
 						 "'ExclusiveLock',"
 						 "'AccessExclusiveLock'] "
 						 "WHEN 'ShareUpdateExclusiveLock' THEN ARRAY["
 						 "'ShareUpdateExclusiveLock',"
 						 "'ShareLock',"
 						 "'ShareRowExclusiveLock',"
 						 "'ExclusiveLock',"
 						 "'AccessExclusiveLock'] "
 						 "WHEN 'ShareLock' THEN ARRAY["
 						 "'RowExclusiveLock',"
 						 "'ShareUpdateExclusiveLock',"
 						 "'ShareRowExclusiveLock',"
 						 "'ExclusiveLock',"
 						 "'AccessExclusiveLock'] "
 						 "WHEN 'ShareRowExclusiveLock' THEN ARRAY["
 						 "'RowExclusiveLock',"
 						 "'ShareUpdateExclusiveLock',"
 						 "'ShareLock',"
 						 "'ShareRowExclusiveLock',"
 						 "'ExclusiveLock',"
 						 "'AccessExclusiveLock'] "
 						 "WHEN 'ExclusiveLock' THEN ARRAY["
 						 "'RowShareLock',"
 						 "'RowExclusiveLock',"
 						 "'ShareUpdateExclusiveLock',"
 						 "'ShareLock',"
 						 "'ShareRowExclusiveLock',"
 						 "'ExclusiveLock',"
 						 "'AccessExclusiveLock'] "
 						 "WHEN 'AccessExclusiveLock' THEN ARRAY["
 						 "'AccessShareLock',"
 						 "'RowShareLock',"
 						 "'RowExclusiveLock',"
 						 "'ShareUpdateExclusiveLock',"
 						 "'ShareLock',"
 						 "'ShareRowExclusiveLock',"
 						 "'ExclusiveLock',"
 						 "'AccessExclusiveLock'] END) "
 				  "AND holder.locktype IS NOT DISTINCT FROM waiter.locktype "
 				  "AND holder.database IS NOT DISTINCT FROM waiter.database "
 				  "AND holder.relation IS NOT DISTINCT FROM waiter.relation "
@@ -467,34 +420,48 @@ report_error_message(Step *step)
 }
 /*
- * As above, but reports messages possibly emitted by two steps.  This is
+ * As above, but reports messages possibly emitted by multiple steps.  This is
 * useful when we have a blocked command awakened by another one; we want to
- * report both messages identically, for the case where we don't care which
+ * report all messages identically, for the case where we don't care which
 * one fails due to a timeout such as deadlock timeout.
 */
 static void
-report_two_error_messages(Step *step1, Step *step2)
+report_multiple_error_messages(Step *step, int nextra, Step **extrastep)
 {
-	char	   *prefix;
+	PQExpBufferData buffer;
 	int		n;
-	prefix = psprintf("%s %s", step1->name, step2->name);
+	if (nextra == 0)
 	if (step1->errormsg)
 	{
-		fprintf(stdout, "error in steps %s: %s\n", prefix,
+		report_error_message(step);
-				step1->errormsg);
+		return;
 		free(step1->errormsg);
 		step1->errormsg = NULL;
 	}
 	if (step2->errormsg)
 	{
 		fprintf(stdout, "error in steps %s: %s\n", prefix,
 				step2->errormsg);
 		free(step2->errormsg);
 		step2->errormsg = NULL;
 	}
-	free(prefix);
+	initPQExpBuffer(&buffer);
 	appendPQExpBufferStr(&buffer, step->name);
 	for (n = 0; n < nextra; ++n)
 		appendPQExpBuffer(&buffer, " %s", extrastep[n]->name);
 	if (step->errormsg)
 	{
 		fprintf(stdout, "error in steps %s: %s\n", buffer.data,
 				step->errormsg);
 		free(step->errormsg);
 		step->errormsg = NULL;
 	}
 	for (n = 0; n < nextra; ++n)
 	{
 		if (extrastep[n]->errormsg == NULL)
 			continue;
 		fprintf(stdout, "error in steps %s: %s\n",
 				buffer.data, extrastep[n]->errormsg);
 		free(extrastep[n]->errormsg);
 		extrastep[n]->errormsg = NULL;
 	}
 	termPQExpBuffer(&buffer);
 }
 /*
@@ -505,7 +472,14 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
 {
 	PGresult   *res;
 	int			i;
-	Step	   *waiting = NULL;
+	int			w;
 	int			nwaiting = 0;
 	int			nerrorstep = 0;
 	Step      **waiting;
 	Step      **errorstep;
 	waiting = malloc(sizeof(Step *) * testspec->nsessions);
 	errorstep = malloc(sizeof(Step *) * testspec->nsessions);
 	/*
 	 * In dry run mode, just display the permutation in the same format used
@@ -566,54 +540,76 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
 	for (i = 0; i < nsteps; i++)
 	{
 		Step	   *step = steps[i];
 		Step       *oldstep = NULL;
 		PGconn	   *conn = conns[1 + step->session];
-
+		bool		mustwait;
 		if (waiting != NULL && step->session == waiting->session)
 		{
 			PGcancel   *cancel;
 			PGresult   *res;
 			int			j;
 		/*
-			 * This permutation is invalid: it can never happen in real life.
+		 * Check whether the session that needs to perform the next step
 		 * is still blocked on an earlier step.  If so, wait for it to finish.
 		 *
-			 * A session is blocked on an earlier step (waiting) and no
+		 * In older versions of this tool, when allowed precisely one session
-			 * further steps from this session can run until it is unblocked,
+		 * to be waiting at a time.  If we reached a step which required that
-			 * but it can only be unblocked by running steps from other
+		 * session to execute the next command, we would declare the whole
-			 * sessions.
+		 * permutation invalid, cancel everything, and move on to the next one.
 		 * Unfortunately, that made it impossible to test the deadlock detector
 		 * using this framework unless the numebr of processes involved in the
 		 * deadlock was precisely two.  We now assume that if we reach a step
 		 * that is still blocked, we need to wait for it to unblock itself.
 		 */
-			fprintf(stderr, "invalid permutation detected\n");
+		for (w = 0; w < nwaiting; ++w)
 			/* Cancel the waiting statement from this session. */
 			cancel = PQgetCancel(conn);
 			if (cancel != NULL)
 		{
-				char		buf[256];
+			if (step->session == waiting[w]->session)
-
+			{
-				if (!PQcancel(cancel, buf, sizeof(buf)))
+				oldstep = waiting[w];
-					fprintf(stderr, "PQcancel failed: %s\n", buf);
+				break;
 				/* Be sure to consume the error message. */
 				while ((res = PQgetResult(conn)) != NULL)
 					PQclear(res);
 				PQfreeCancel(cancel);
 			}
 		}
 		if (oldstep != NULL)
 		{
 			/* Wait for previous step on this connection. */
 			try_complete_step(oldstep, STEP_RETRY);
 			/*
-			 * Now we really have to complete all the running transactions to
+			 * Attempt to complete any steps that were previously waiting.
-			 * make sure teardown doesn't block.
+			 * Remove any that have completed, and also remove the one for
 			 * which we just waited.
 			 */
-			for (j = 1; j < nconns; j++)
+			w = 0;
 			nerrorstep = 0;
 			while (w < nwaiting)
 			{
-				res = PQexec(conns[j], "ROLLBACK");
+				if (waiting[w] == oldstep)
-				if (res != NULL)
+				{
-					PQclear(res);
+					/* We just waited for this; remove it. */
 					if (w + 1 < nwaiting)
 						memcpy(&waiting[w], &waiting[w + 1],
 							   (nwaiting - (w + 1)) * sizeof(Step *));
 					nwaiting--;
 					continue;
 				}
-			goto teardown;
+				if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
 				{
 					/* Still blocked on a lock, leave it alone. */
 					w++;
 				}
 				else
 				{
 					/* This one finished, too! */
 					errorstep[nerrorstep++] = waiting[w];
 					if (w + 1 < nwaiting)
 						memcpy(&waiting[w], &waiting[w + 1],
 							   (nwaiting - (w + 1)) * sizeof(Step *));
 					nwaiting--;
 				}
 			}
 			/* Report all errors together. */
 			report_multiple_error_messages(oldstep, nerrorstep, errorstep);
 		}
 		/* Send the query for this step. */
 		if (!PQsendQuery(conn, step->sql))
 		{
 			fprintf(stdout, "failed to send query for step %s: %s\n",
@@ -621,39 +617,41 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
 			exit_nicely();
 		}
-		if (waiting != NULL)
+		/* Try to complete this step without blocking.  */
-		{
+		mustwait = try_complete_step(step, STEP_NONBLOCK);
 			/* Some other step is already waiting: just block. */
 			try_complete_step(step, 0);
-			/*
+		/* Attempt to complete any steps that were previously waiting. */
-			 * See if this step unblocked the waiting step; report both error
+		w = 0;
-			 * messages together if so.
+		nerrorstep = 0;
-			 */
+		while (w < nwaiting)
 			if (!try_complete_step(waiting, STEP_NONBLOCK | STEP_RETRY))
 		{
-				report_two_error_messages(step, waiting);
+			if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
-				waiting = NULL;
+				w++;
 			}
 			else
 				report_error_message(step);
 		}
 			else
 			{
-			if (try_complete_step(step, STEP_NONBLOCK))
+				errorstep[nerrorstep++] = waiting[w];
-				waiting = step;
+				if (w + 1 < nwaiting)
-			report_error_message(step);
+					memcpy(&waiting[w], &waiting[w + 1],
 						   (nwaiting - (w + 1)) * sizeof(Step *));
 				nwaiting--;
 			}
 		}
 		/* Report any error from this step, and any steps that it unblocked. */
 		report_multiple_error_messages(step, nerrorstep, errorstep);
 		/* If this step is waiting, add it to the array of waiters. */
 		if (mustwait)
 			waiting[nwaiting++] = step;
 	}
 	/* Finish any waiting query. */
-	if (waiting != NULL)
+	for (w = 0; w < nwaiting; ++w)
 	{
-		try_complete_step(waiting, STEP_RETRY);
+		try_complete_step(waiting[w], STEP_RETRY);
-		report_error_message(waiting);
+		report_error_message(waiting[w]);
 	}
 teardown:
 	/* Perform per-session teardown */
 	for (i = 0; i < testspec->nsessions; i++)
 	{
@@ -714,14 +712,17 @@ try_complete_step(Step *step, int flags)
 {
 	PGconn	   *conn = conns[1 + step->session];
 	fd_set		read_set;
 	struct timeval start_time;
 	struct timeval timeout;
 	int			sock = PQsocket(conn);
 	int			ret;
 	PGresult   *res;
 	bool		canceled = false;
 	gettimeofday(&start_time, NULL);
 	FD_ZERO(&read_set);
-	while ((flags & STEP_NONBLOCK) && PQisBusy(conn))
+	while (PQisBusy(conn))
 	{
 		FD_SET(sock, &read_set);
 		timeout.tv_sec = 0;
@@ -736,6 +737,12 @@ try_complete_step(Step *step, int flags)
 			exit_nicely();
 		}
 		else if (ret == 0)		/* select() timeout: check for lock wait */
 		{
 			struct timeval current_time;
 			int64		td;
 			/* If it's OK for the step to block, check whether it has. */
 			if (flags & STEP_NONBLOCK)
 			{
 				int			ntuples;
@@ -758,7 +765,48 @@ try_complete_step(Step *step, int flags)
 							   step->name, step->sql);
 					return true;
 				}
-			/* else, not waiting: give it more time */
+				/* else, not waiting */
 			}
 			/* Figure out how long we've been waiting for this step. */
 			gettimeofday(&current_time, NULL);
 			td = (int64) current_time.tv_sec - (int64) start_time.tv_sec;
 			td *= USECS_PER_SEC;
 			td += (int64) current_time.tv_usec - (int64) start_time.tv_usec;
 			/*
 			 * After 60 seconds, try to cancel the query. 
 			 *
 			 * If the user tries to test an invalid permutation, we don't
 			 * want to hang forever, especially when this is running in the
 			 * buildfarm.  So try to cancel it after a minute.  This will
 			 * presumably lead to this permutation failing, but remaining
 			 * permutations and tests should still be OK.
 			 */
 			if (td > 60 * USECS_PER_SEC && !canceled)
 			{
 				PGcancel *cancel = PQgetCancel(conn);
 				char	buf[256];
 				if (cancel != NULL && !PQcancel(cancel, buf, sizeof(buf)))
 					fprintf(stderr, "PQcancel failed: %s\n", buf);
 				if (cancel != NULL)
 					PQfreeCancel(cancel);
 			}
 			/*
 			 * After 75 seconds, just give up and die.
 			 *
 			 * Since cleanup steps won't be run in this case, this may cause
 			 * later tests to fail.  That stinks, but it's better than waiting
 			 * forever for the server to respond to the cancel.
 			 */
 			if (td > 75 * USECS_PER_SEC)
 			{
 				fprintf(stderr, "step %s timed out after 75 seconds\n",
 						step->name);
 				exit_nicely();
 			}
 		}
 		else if (!PQconsumeInput(conn)) /* select(): data available */
 		{