mirror of
https://github.com/postgres/postgres.git
synced 2025-05-03 22:24:49 +03:00
Modify the isolation tester so that multiple sessions can wait.
This allows testing of deadlock scenarios. Scenarios that would previously have been considered invalid are now simply taken as a scenario in which more than one backend will wait.
This commit is contained in:
parent
c9882c60f4
commit
38f8bdcac4
@ -107,10 +107,11 @@ Each step may contain commands that block until further action has been taken
|
||||
(most likely, some other session runs a step that unblocks it or causes a
|
||||
deadlock). A test that uses this ability must manually specify valid
|
||||
permutations, i.e. those that would not expect a blocked session to execute a
|
||||
command. If the test fails to follow that rule, the test is aborted.
|
||||
|
||||
Currently, at most one step can be waiting at a time. As long as one
|
||||
step is waiting, subsequent steps are run to completion synchronously.
|
||||
command. If a test fails to follow that rule, isolationtester will cancel it
|
||||
after 60 seconds. If the cancel doesn't work, isolationtester will exit
|
||||
uncleanly after a total of 75 seconds of wait time. Testing invalid
|
||||
permutations should be avoided because they can make the isolation tests take
|
||||
a very long time to run, and they serve no useful testing purpose.
|
||||
|
||||
Note that isolationtester recognizes that a command has blocked by looking
|
||||
to see if it is shown as waiting in the pg_locks view; therefore, only
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include <sys/select.h>
|
||||
#endif
|
||||
|
||||
#include "datatype/timestamp.h"
|
||||
#include "libpq-fe.h"
|
||||
#include "pqexpbuffer.h"
|
||||
#include "pg_getopt.h"
|
||||
@ -237,54 +238,6 @@ main(int argc, char **argv)
|
||||
appendPQExpBufferStr(&wait_query,
|
||||
") "
|
||||
|
||||
"AND holder.mode = ANY (CASE waiter.mode "
|
||||
"WHEN 'AccessShareLock' THEN ARRAY["
|
||||
"'AccessExclusiveLock'] "
|
||||
"WHEN 'RowShareLock' THEN ARRAY["
|
||||
"'ExclusiveLock',"
|
||||
"'AccessExclusiveLock'] "
|
||||
"WHEN 'RowExclusiveLock' THEN ARRAY["
|
||||
"'ShareLock',"
|
||||
"'ShareRowExclusiveLock',"
|
||||
"'ExclusiveLock',"
|
||||
"'AccessExclusiveLock'] "
|
||||
"WHEN 'ShareUpdateExclusiveLock' THEN ARRAY["
|
||||
"'ShareUpdateExclusiveLock',"
|
||||
"'ShareLock',"
|
||||
"'ShareRowExclusiveLock',"
|
||||
"'ExclusiveLock',"
|
||||
"'AccessExclusiveLock'] "
|
||||
"WHEN 'ShareLock' THEN ARRAY["
|
||||
"'RowExclusiveLock',"
|
||||
"'ShareUpdateExclusiveLock',"
|
||||
"'ShareRowExclusiveLock',"
|
||||
"'ExclusiveLock',"
|
||||
"'AccessExclusiveLock'] "
|
||||
"WHEN 'ShareRowExclusiveLock' THEN ARRAY["
|
||||
"'RowExclusiveLock',"
|
||||
"'ShareUpdateExclusiveLock',"
|
||||
"'ShareLock',"
|
||||
"'ShareRowExclusiveLock',"
|
||||
"'ExclusiveLock',"
|
||||
"'AccessExclusiveLock'] "
|
||||
"WHEN 'ExclusiveLock' THEN ARRAY["
|
||||
"'RowShareLock',"
|
||||
"'RowExclusiveLock',"
|
||||
"'ShareUpdateExclusiveLock',"
|
||||
"'ShareLock',"
|
||||
"'ShareRowExclusiveLock',"
|
||||
"'ExclusiveLock',"
|
||||
"'AccessExclusiveLock'] "
|
||||
"WHEN 'AccessExclusiveLock' THEN ARRAY["
|
||||
"'AccessShareLock',"
|
||||
"'RowShareLock',"
|
||||
"'RowExclusiveLock',"
|
||||
"'ShareUpdateExclusiveLock',"
|
||||
"'ShareLock',"
|
||||
"'ShareRowExclusiveLock',"
|
||||
"'ExclusiveLock',"
|
||||
"'AccessExclusiveLock'] END) "
|
||||
|
||||
"AND holder.locktype IS NOT DISTINCT FROM waiter.locktype "
|
||||
"AND holder.database IS NOT DISTINCT FROM waiter.database "
|
||||
"AND holder.relation IS NOT DISTINCT FROM waiter.relation "
|
||||
@ -467,34 +420,48 @@ report_error_message(Step *step)
|
||||
}
|
||||
|
||||
/*
|
||||
* As above, but reports messages possibly emitted by two steps. This is
|
||||
* As above, but reports messages possibly emitted by multiple steps. This is
|
||||
* useful when we have a blocked command awakened by another one; we want to
|
||||
* report both messages identically, for the case where we don't care which
|
||||
* report all messages identically, for the case where we don't care which
|
||||
* one fails due to a timeout such as deadlock timeout.
|
||||
*/
|
||||
static void
|
||||
report_two_error_messages(Step *step1, Step *step2)
|
||||
report_multiple_error_messages(Step *step, int nextra, Step **extrastep)
|
||||
{
|
||||
char *prefix;
|
||||
PQExpBufferData buffer;
|
||||
int n;
|
||||
|
||||
prefix = psprintf("%s %s", step1->name, step2->name);
|
||||
|
||||
if (step1->errormsg)
|
||||
if (nextra == 0)
|
||||
{
|
||||
fprintf(stdout, "error in steps %s: %s\n", prefix,
|
||||
step1->errormsg);
|
||||
free(step1->errormsg);
|
||||
step1->errormsg = NULL;
|
||||
}
|
||||
if (step2->errormsg)
|
||||
{
|
||||
fprintf(stdout, "error in steps %s: %s\n", prefix,
|
||||
step2->errormsg);
|
||||
free(step2->errormsg);
|
||||
step2->errormsg = NULL;
|
||||
report_error_message(step);
|
||||
return;
|
||||
}
|
||||
|
||||
free(prefix);
|
||||
initPQExpBuffer(&buffer);
|
||||
appendPQExpBufferStr(&buffer, step->name);
|
||||
|
||||
for (n = 0; n < nextra; ++n)
|
||||
appendPQExpBuffer(&buffer, " %s", extrastep[n]->name);
|
||||
|
||||
if (step->errormsg)
|
||||
{
|
||||
fprintf(stdout, "error in steps %s: %s\n", buffer.data,
|
||||
step->errormsg);
|
||||
free(step->errormsg);
|
||||
step->errormsg = NULL;
|
||||
}
|
||||
|
||||
for (n = 0; n < nextra; ++n)
|
||||
{
|
||||
if (extrastep[n]->errormsg == NULL)
|
||||
continue;
|
||||
fprintf(stdout, "error in steps %s: %s\n",
|
||||
buffer.data, extrastep[n]->errormsg);
|
||||
free(extrastep[n]->errormsg);
|
||||
extrastep[n]->errormsg = NULL;
|
||||
}
|
||||
|
||||
termPQExpBuffer(&buffer);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -505,7 +472,14 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
|
||||
{
|
||||
PGresult *res;
|
||||
int i;
|
||||
Step *waiting = NULL;
|
||||
int w;
|
||||
int nwaiting = 0;
|
||||
int nerrorstep = 0;
|
||||
Step **waiting;
|
||||
Step **errorstep;
|
||||
|
||||
waiting = malloc(sizeof(Step *) * testspec->nsessions);
|
||||
errorstep = malloc(sizeof(Step *) * testspec->nsessions);
|
||||
|
||||
/*
|
||||
* In dry run mode, just display the permutation in the same format used
|
||||
@ -566,54 +540,76 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
|
||||
for (i = 0; i < nsteps; i++)
|
||||
{
|
||||
Step *step = steps[i];
|
||||
Step *oldstep = NULL;
|
||||
PGconn *conn = conns[1 + step->session];
|
||||
bool mustwait;
|
||||
|
||||
if (waiting != NULL && step->session == waiting->session)
|
||||
/*
|
||||
* Check whether the session that needs to perform the next step
|
||||
* is still blocked on an earlier step. If so, wait for it to finish.
|
||||
*
|
||||
* In older versions of this tool, when allowed precisely one session
|
||||
* to be waiting at a time. If we reached a step which required that
|
||||
* session to execute the next command, we would declare the whole
|
||||
* permutation invalid, cancel everything, and move on to the next one.
|
||||
* Unfortunately, that made it impossible to test the deadlock detector
|
||||
* using this framework unless the numebr of processes involved in the
|
||||
* deadlock was precisely two. We now assume that if we reach a step
|
||||
* that is still blocked, we need to wait for it to unblock itself.
|
||||
*/
|
||||
for (w = 0; w < nwaiting; ++w)
|
||||
{
|
||||
PGcancel *cancel;
|
||||
PGresult *res;
|
||||
int j;
|
||||
if (step->session == waiting[w]->session)
|
||||
{
|
||||
oldstep = waiting[w];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (oldstep != NULL)
|
||||
{
|
||||
/* Wait for previous step on this connection. */
|
||||
try_complete_step(oldstep, STEP_RETRY);
|
||||
|
||||
/*
|
||||
* This permutation is invalid: it can never happen in real life.
|
||||
*
|
||||
* A session is blocked on an earlier step (waiting) and no
|
||||
* further steps from this session can run until it is unblocked,
|
||||
* but it can only be unblocked by running steps from other
|
||||
* sessions.
|
||||
* Attempt to complete any steps that were previously waiting.
|
||||
* Remove any that have completed, and also remove the one for
|
||||
* which we just waited.
|
||||
*/
|
||||
fprintf(stderr, "invalid permutation detected\n");
|
||||
|
||||
/* Cancel the waiting statement from this session. */
|
||||
cancel = PQgetCancel(conn);
|
||||
if (cancel != NULL)
|
||||
w = 0;
|
||||
nerrorstep = 0;
|
||||
while (w < nwaiting)
|
||||
{
|
||||
char buf[256];
|
||||
if (waiting[w] == oldstep)
|
||||
{
|
||||
/* We just waited for this; remove it. */
|
||||
if (w + 1 < nwaiting)
|
||||
memcpy(&waiting[w], &waiting[w + 1],
|
||||
(nwaiting - (w + 1)) * sizeof(Step *));
|
||||
nwaiting--;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!PQcancel(cancel, buf, sizeof(buf)))
|
||||
fprintf(stderr, "PQcancel failed: %s\n", buf);
|
||||
|
||||
/* Be sure to consume the error message. */
|
||||
while ((res = PQgetResult(conn)) != NULL)
|
||||
PQclear(res);
|
||||
|
||||
PQfreeCancel(cancel);
|
||||
if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
|
||||
{
|
||||
/* Still blocked on a lock, leave it alone. */
|
||||
w++;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* This one finished, too! */
|
||||
errorstep[nerrorstep++] = waiting[w];
|
||||
if (w + 1 < nwaiting)
|
||||
memcpy(&waiting[w], &waiting[w + 1],
|
||||
(nwaiting - (w + 1)) * sizeof(Step *));
|
||||
nwaiting--;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we really have to complete all the running transactions to
|
||||
* make sure teardown doesn't block.
|
||||
*/
|
||||
for (j = 1; j < nconns; j++)
|
||||
{
|
||||
res = PQexec(conns[j], "ROLLBACK");
|
||||
if (res != NULL)
|
||||
PQclear(res);
|
||||
}
|
||||
|
||||
goto teardown;
|
||||
/* Report all errors together. */
|
||||
report_multiple_error_messages(oldstep, nerrorstep, errorstep);
|
||||
}
|
||||
|
||||
/* Send the query for this step. */
|
||||
if (!PQsendQuery(conn, step->sql))
|
||||
{
|
||||
fprintf(stdout, "failed to send query for step %s: %s\n",
|
||||
@ -621,39 +617,41 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
|
||||
exit_nicely();
|
||||
}
|
||||
|
||||
if (waiting != NULL)
|
||||
{
|
||||
/* Some other step is already waiting: just block. */
|
||||
try_complete_step(step, 0);
|
||||
/* Try to complete this step without blocking. */
|
||||
mustwait = try_complete_step(step, STEP_NONBLOCK);
|
||||
|
||||
/*
|
||||
* See if this step unblocked the waiting step; report both error
|
||||
* messages together if so.
|
||||
*/
|
||||
if (!try_complete_step(waiting, STEP_NONBLOCK | STEP_RETRY))
|
||||
{
|
||||
report_two_error_messages(step, waiting);
|
||||
waiting = NULL;
|
||||
}
|
||||
else
|
||||
report_error_message(step);
|
||||
}
|
||||
else
|
||||
/* Attempt to complete any steps that were previously waiting. */
|
||||
w = 0;
|
||||
nerrorstep = 0;
|
||||
while (w < nwaiting)
|
||||
{
|
||||
if (try_complete_step(step, STEP_NONBLOCK))
|
||||
waiting = step;
|
||||
report_error_message(step);
|
||||
if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
|
||||
w++;
|
||||
else
|
||||
{
|
||||
errorstep[nerrorstep++] = waiting[w];
|
||||
if (w + 1 < nwaiting)
|
||||
memcpy(&waiting[w], &waiting[w + 1],
|
||||
(nwaiting - (w + 1)) * sizeof(Step *));
|
||||
nwaiting--;
|
||||
}
|
||||
}
|
||||
|
||||
/* Report any error from this step, and any steps that it unblocked. */
|
||||
report_multiple_error_messages(step, nerrorstep, errorstep);
|
||||
|
||||
/* If this step is waiting, add it to the array of waiters. */
|
||||
if (mustwait)
|
||||
waiting[nwaiting++] = step;
|
||||
}
|
||||
|
||||
/* Finish any waiting query. */
|
||||
if (waiting != NULL)
|
||||
for (w = 0; w < nwaiting; ++w)
|
||||
{
|
||||
try_complete_step(waiting, STEP_RETRY);
|
||||
report_error_message(waiting);
|
||||
try_complete_step(waiting[w], STEP_RETRY);
|
||||
report_error_message(waiting[w]);
|
||||
}
|
||||
|
||||
teardown:
|
||||
/* Perform per-session teardown */
|
||||
for (i = 0; i < testspec->nsessions; i++)
|
||||
{
|
||||
@ -714,14 +712,17 @@ try_complete_step(Step *step, int flags)
|
||||
{
|
||||
PGconn *conn = conns[1 + step->session];
|
||||
fd_set read_set;
|
||||
struct timeval start_time;
|
||||
struct timeval timeout;
|
||||
int sock = PQsocket(conn);
|
||||
int ret;
|
||||
PGresult *res;
|
||||
bool canceled = false;
|
||||
|
||||
gettimeofday(&start_time, NULL);
|
||||
FD_ZERO(&read_set);
|
||||
|
||||
while ((flags & STEP_NONBLOCK) && PQisBusy(conn))
|
||||
while (PQisBusy(conn))
|
||||
{
|
||||
FD_SET(sock, &read_set);
|
||||
timeout.tv_sec = 0;
|
||||
@ -737,28 +738,75 @@ try_complete_step(Step *step, int flags)
|
||||
}
|
||||
else if (ret == 0) /* select() timeout: check for lock wait */
|
||||
{
|
||||
int ntuples;
|
||||
struct timeval current_time;
|
||||
int64 td;
|
||||
|
||||
res = PQexecPrepared(conns[0], PREP_WAITING, 1,
|
||||
&backend_pids[step->session + 1],
|
||||
NULL, NULL, 0);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
/* If it's OK for the step to block, check whether it has. */
|
||||
if (flags & STEP_NONBLOCK)
|
||||
{
|
||||
fprintf(stderr, "lock wait query failed: %s",
|
||||
PQerrorMessage(conn));
|
||||
int ntuples;
|
||||
|
||||
res = PQexecPrepared(conns[0], PREP_WAITING, 1,
|
||||
&backend_pids[step->session + 1],
|
||||
NULL, NULL, 0);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
fprintf(stderr, "lock wait query failed: %s",
|
||||
PQerrorMessage(conn));
|
||||
exit_nicely();
|
||||
}
|
||||
ntuples = PQntuples(res);
|
||||
PQclear(res);
|
||||
|
||||
if (ntuples >= 1) /* waiting to acquire a lock */
|
||||
{
|
||||
if (!(flags & STEP_RETRY))
|
||||
printf("step %s: %s <waiting ...>\n",
|
||||
step->name, step->sql);
|
||||
return true;
|
||||
}
|
||||
/* else, not waiting */
|
||||
}
|
||||
|
||||
/* Figure out how long we've been waiting for this step. */
|
||||
gettimeofday(¤t_time, NULL);
|
||||
td = (int64) current_time.tv_sec - (int64) start_time.tv_sec;
|
||||
td *= USECS_PER_SEC;
|
||||
td += (int64) current_time.tv_usec - (int64) start_time.tv_usec;
|
||||
|
||||
/*
|
||||
* After 60 seconds, try to cancel the query.
|
||||
*
|
||||
* If the user tries to test an invalid permutation, we don't
|
||||
* want to hang forever, especially when this is running in the
|
||||
* buildfarm. So try to cancel it after a minute. This will
|
||||
* presumably lead to this permutation failing, but remaining
|
||||
* permutations and tests should still be OK.
|
||||
*/
|
||||
if (td > 60 * USECS_PER_SEC && !canceled)
|
||||
{
|
||||
PGcancel *cancel = PQgetCancel(conn);
|
||||
char buf[256];
|
||||
|
||||
if (cancel != NULL && !PQcancel(cancel, buf, sizeof(buf)))
|
||||
fprintf(stderr, "PQcancel failed: %s\n", buf);
|
||||
if (cancel != NULL)
|
||||
PQfreeCancel(cancel);
|
||||
}
|
||||
|
||||
/*
|
||||
* After 75 seconds, just give up and die.
|
||||
*
|
||||
* Since cleanup steps won't be run in this case, this may cause
|
||||
* later tests to fail. That stinks, but it's better than waiting
|
||||
* forever for the server to respond to the cancel.
|
||||
*/
|
||||
if (td > 75 * USECS_PER_SEC)
|
||||
{
|
||||
fprintf(stderr, "step %s timed out after 75 seconds\n",
|
||||
step->name);
|
||||
exit_nicely();
|
||||
}
|
||||
ntuples = PQntuples(res);
|
||||
PQclear(res);
|
||||
|
||||
if (ntuples >= 1) /* waiting to acquire a lock */
|
||||
{
|
||||
if (!(flags & STEP_RETRY))
|
||||
printf("step %s: %s <waiting ...>\n",
|
||||
step->name, step->sql);
|
||||
return true;
|
||||
}
|
||||
/* else, not waiting: give it more time */
|
||||
}
|
||||
else if (!PQconsumeInput(conn)) /* select(): data available */
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user