mirror of
https://github.com/postgres/postgres.git
synced 2025-05-03 22:24:49 +03:00
Modify the isolation tester so that multiple sessions can wait.
This allows testing of deadlock scenarios. Scenarios that would previously have been considered invalid are now simply taken as a scenario in which more than one backend will wait.
This commit is contained in:
parent
c9882c60f4
commit
38f8bdcac4
@ -107,10 +107,11 @@ Each step may contain commands that block until further action has been taken
|
|||||||
(most likely, some other session runs a step that unblocks it or causes a
|
(most likely, some other session runs a step that unblocks it or causes a
|
||||||
deadlock). A test that uses this ability must manually specify valid
|
deadlock). A test that uses this ability must manually specify valid
|
||||||
permutations, i.e. those that would not expect a blocked session to execute a
|
permutations, i.e. those that would not expect a blocked session to execute a
|
||||||
command. If the test fails to follow that rule, the test is aborted.
|
command. If a test fails to follow that rule, isolationtester will cancel it
|
||||||
|
after 60 seconds. If the cancel doesn't work, isolationtester will exit
|
||||||
Currently, at most one step can be waiting at a time. As long as one
|
uncleanly after a total of 75 seconds of wait time. Testing invalid
|
||||||
step is waiting, subsequent steps are run to completion synchronously.
|
permutations should be avoided because they can make the isolation tests take
|
||||||
|
a very long time to run, and they serve no useful testing purpose.
|
||||||
|
|
||||||
Note that isolationtester recognizes that a command has blocked by looking
|
Note that isolationtester recognizes that a command has blocked by looking
|
||||||
to see if it is shown as waiting in the pg_locks view; therefore, only
|
to see if it is shown as waiting in the pg_locks view; therefore, only
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
#include <sys/select.h>
|
#include <sys/select.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "datatype/timestamp.h"
|
||||||
#include "libpq-fe.h"
|
#include "libpq-fe.h"
|
||||||
#include "pqexpbuffer.h"
|
#include "pqexpbuffer.h"
|
||||||
#include "pg_getopt.h"
|
#include "pg_getopt.h"
|
||||||
@ -237,54 +238,6 @@ main(int argc, char **argv)
|
|||||||
appendPQExpBufferStr(&wait_query,
|
appendPQExpBufferStr(&wait_query,
|
||||||
") "
|
") "
|
||||||
|
|
||||||
"AND holder.mode = ANY (CASE waiter.mode "
|
|
||||||
"WHEN 'AccessShareLock' THEN ARRAY["
|
|
||||||
"'AccessExclusiveLock'] "
|
|
||||||
"WHEN 'RowShareLock' THEN ARRAY["
|
|
||||||
"'ExclusiveLock',"
|
|
||||||
"'AccessExclusiveLock'] "
|
|
||||||
"WHEN 'RowExclusiveLock' THEN ARRAY["
|
|
||||||
"'ShareLock',"
|
|
||||||
"'ShareRowExclusiveLock',"
|
|
||||||
"'ExclusiveLock',"
|
|
||||||
"'AccessExclusiveLock'] "
|
|
||||||
"WHEN 'ShareUpdateExclusiveLock' THEN ARRAY["
|
|
||||||
"'ShareUpdateExclusiveLock',"
|
|
||||||
"'ShareLock',"
|
|
||||||
"'ShareRowExclusiveLock',"
|
|
||||||
"'ExclusiveLock',"
|
|
||||||
"'AccessExclusiveLock'] "
|
|
||||||
"WHEN 'ShareLock' THEN ARRAY["
|
|
||||||
"'RowExclusiveLock',"
|
|
||||||
"'ShareUpdateExclusiveLock',"
|
|
||||||
"'ShareRowExclusiveLock',"
|
|
||||||
"'ExclusiveLock',"
|
|
||||||
"'AccessExclusiveLock'] "
|
|
||||||
"WHEN 'ShareRowExclusiveLock' THEN ARRAY["
|
|
||||||
"'RowExclusiveLock',"
|
|
||||||
"'ShareUpdateExclusiveLock',"
|
|
||||||
"'ShareLock',"
|
|
||||||
"'ShareRowExclusiveLock',"
|
|
||||||
"'ExclusiveLock',"
|
|
||||||
"'AccessExclusiveLock'] "
|
|
||||||
"WHEN 'ExclusiveLock' THEN ARRAY["
|
|
||||||
"'RowShareLock',"
|
|
||||||
"'RowExclusiveLock',"
|
|
||||||
"'ShareUpdateExclusiveLock',"
|
|
||||||
"'ShareLock',"
|
|
||||||
"'ShareRowExclusiveLock',"
|
|
||||||
"'ExclusiveLock',"
|
|
||||||
"'AccessExclusiveLock'] "
|
|
||||||
"WHEN 'AccessExclusiveLock' THEN ARRAY["
|
|
||||||
"'AccessShareLock',"
|
|
||||||
"'RowShareLock',"
|
|
||||||
"'RowExclusiveLock',"
|
|
||||||
"'ShareUpdateExclusiveLock',"
|
|
||||||
"'ShareLock',"
|
|
||||||
"'ShareRowExclusiveLock',"
|
|
||||||
"'ExclusiveLock',"
|
|
||||||
"'AccessExclusiveLock'] END) "
|
|
||||||
|
|
||||||
"AND holder.locktype IS NOT DISTINCT FROM waiter.locktype "
|
"AND holder.locktype IS NOT DISTINCT FROM waiter.locktype "
|
||||||
"AND holder.database IS NOT DISTINCT FROM waiter.database "
|
"AND holder.database IS NOT DISTINCT FROM waiter.database "
|
||||||
"AND holder.relation IS NOT DISTINCT FROM waiter.relation "
|
"AND holder.relation IS NOT DISTINCT FROM waiter.relation "
|
||||||
@ -467,34 +420,48 @@ report_error_message(Step *step)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* As above, but reports messages possibly emitted by two steps. This is
|
* As above, but reports messages possibly emitted by multiple steps. This is
|
||||||
* useful when we have a blocked command awakened by another one; we want to
|
* useful when we have a blocked command awakened by another one; we want to
|
||||||
* report both messages identically, for the case where we don't care which
|
* report all messages identically, for the case where we don't care which
|
||||||
* one fails due to a timeout such as deadlock timeout.
|
* one fails due to a timeout such as deadlock timeout.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
report_two_error_messages(Step *step1, Step *step2)
|
report_multiple_error_messages(Step *step, int nextra, Step **extrastep)
|
||||||
{
|
{
|
||||||
char *prefix;
|
PQExpBufferData buffer;
|
||||||
|
int n;
|
||||||
|
|
||||||
prefix = psprintf("%s %s", step1->name, step2->name);
|
if (nextra == 0)
|
||||||
|
|
||||||
if (step1->errormsg)
|
|
||||||
{
|
{
|
||||||
fprintf(stdout, "error in steps %s: %s\n", prefix,
|
report_error_message(step);
|
||||||
step1->errormsg);
|
return;
|
||||||
free(step1->errormsg);
|
|
||||||
step1->errormsg = NULL;
|
|
||||||
}
|
|
||||||
if (step2->errormsg)
|
|
||||||
{
|
|
||||||
fprintf(stdout, "error in steps %s: %s\n", prefix,
|
|
||||||
step2->errormsg);
|
|
||||||
free(step2->errormsg);
|
|
||||||
step2->errormsg = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
free(prefix);
|
initPQExpBuffer(&buffer);
|
||||||
|
appendPQExpBufferStr(&buffer, step->name);
|
||||||
|
|
||||||
|
for (n = 0; n < nextra; ++n)
|
||||||
|
appendPQExpBuffer(&buffer, " %s", extrastep[n]->name);
|
||||||
|
|
||||||
|
if (step->errormsg)
|
||||||
|
{
|
||||||
|
fprintf(stdout, "error in steps %s: %s\n", buffer.data,
|
||||||
|
step->errormsg);
|
||||||
|
free(step->errormsg);
|
||||||
|
step->errormsg = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (n = 0; n < nextra; ++n)
|
||||||
|
{
|
||||||
|
if (extrastep[n]->errormsg == NULL)
|
||||||
|
continue;
|
||||||
|
fprintf(stdout, "error in steps %s: %s\n",
|
||||||
|
buffer.data, extrastep[n]->errormsg);
|
||||||
|
free(extrastep[n]->errormsg);
|
||||||
|
extrastep[n]->errormsg = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -505,7 +472,14 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
|
|||||||
{
|
{
|
||||||
PGresult *res;
|
PGresult *res;
|
||||||
int i;
|
int i;
|
||||||
Step *waiting = NULL;
|
int w;
|
||||||
|
int nwaiting = 0;
|
||||||
|
int nerrorstep = 0;
|
||||||
|
Step **waiting;
|
||||||
|
Step **errorstep;
|
||||||
|
|
||||||
|
waiting = malloc(sizeof(Step *) * testspec->nsessions);
|
||||||
|
errorstep = malloc(sizeof(Step *) * testspec->nsessions);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In dry run mode, just display the permutation in the same format used
|
* In dry run mode, just display the permutation in the same format used
|
||||||
@ -566,54 +540,76 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
|
|||||||
for (i = 0; i < nsteps; i++)
|
for (i = 0; i < nsteps; i++)
|
||||||
{
|
{
|
||||||
Step *step = steps[i];
|
Step *step = steps[i];
|
||||||
|
Step *oldstep = NULL;
|
||||||
PGconn *conn = conns[1 + step->session];
|
PGconn *conn = conns[1 + step->session];
|
||||||
|
bool mustwait;
|
||||||
if (waiting != NULL && step->session == waiting->session)
|
|
||||||
{
|
|
||||||
PGcancel *cancel;
|
|
||||||
PGresult *res;
|
|
||||||
int j;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This permutation is invalid: it can never happen in real life.
|
* Check whether the session that needs to perform the next step
|
||||||
|
* is still blocked on an earlier step. If so, wait for it to finish.
|
||||||
*
|
*
|
||||||
* A session is blocked on an earlier step (waiting) and no
|
* In older versions of this tool, when allowed precisely one session
|
||||||
* further steps from this session can run until it is unblocked,
|
* to be waiting at a time. If we reached a step which required that
|
||||||
* but it can only be unblocked by running steps from other
|
* session to execute the next command, we would declare the whole
|
||||||
* sessions.
|
* permutation invalid, cancel everything, and move on to the next one.
|
||||||
|
* Unfortunately, that made it impossible to test the deadlock detector
|
||||||
|
* using this framework unless the numebr of processes involved in the
|
||||||
|
* deadlock was precisely two. We now assume that if we reach a step
|
||||||
|
* that is still blocked, we need to wait for it to unblock itself.
|
||||||
*/
|
*/
|
||||||
fprintf(stderr, "invalid permutation detected\n");
|
for (w = 0; w < nwaiting; ++w)
|
||||||
|
|
||||||
/* Cancel the waiting statement from this session. */
|
|
||||||
cancel = PQgetCancel(conn);
|
|
||||||
if (cancel != NULL)
|
|
||||||
{
|
{
|
||||||
char buf[256];
|
if (step->session == waiting[w]->session)
|
||||||
|
{
|
||||||
if (!PQcancel(cancel, buf, sizeof(buf)))
|
oldstep = waiting[w];
|
||||||
fprintf(stderr, "PQcancel failed: %s\n", buf);
|
break;
|
||||||
|
|
||||||
/* Be sure to consume the error message. */
|
|
||||||
while ((res = PQgetResult(conn)) != NULL)
|
|
||||||
PQclear(res);
|
|
||||||
|
|
||||||
PQfreeCancel(cancel);
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (oldstep != NULL)
|
||||||
|
{
|
||||||
|
/* Wait for previous step on this connection. */
|
||||||
|
try_complete_step(oldstep, STEP_RETRY);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Now we really have to complete all the running transactions to
|
* Attempt to complete any steps that were previously waiting.
|
||||||
* make sure teardown doesn't block.
|
* Remove any that have completed, and also remove the one for
|
||||||
|
* which we just waited.
|
||||||
*/
|
*/
|
||||||
for (j = 1; j < nconns; j++)
|
w = 0;
|
||||||
|
nerrorstep = 0;
|
||||||
|
while (w < nwaiting)
|
||||||
{
|
{
|
||||||
res = PQexec(conns[j], "ROLLBACK");
|
if (waiting[w] == oldstep)
|
||||||
if (res != NULL)
|
{
|
||||||
PQclear(res);
|
/* We just waited for this; remove it. */
|
||||||
|
if (w + 1 < nwaiting)
|
||||||
|
memcpy(&waiting[w], &waiting[w + 1],
|
||||||
|
(nwaiting - (w + 1)) * sizeof(Step *));
|
||||||
|
nwaiting--;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
goto teardown;
|
if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
|
||||||
|
{
|
||||||
|
/* Still blocked on a lock, leave it alone. */
|
||||||
|
w++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* This one finished, too! */
|
||||||
|
errorstep[nerrorstep++] = waiting[w];
|
||||||
|
if (w + 1 < nwaiting)
|
||||||
|
memcpy(&waiting[w], &waiting[w + 1],
|
||||||
|
(nwaiting - (w + 1)) * sizeof(Step *));
|
||||||
|
nwaiting--;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Report all errors together. */
|
||||||
|
report_multiple_error_messages(oldstep, nerrorstep, errorstep);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Send the query for this step. */
|
||||||
if (!PQsendQuery(conn, step->sql))
|
if (!PQsendQuery(conn, step->sql))
|
||||||
{
|
{
|
||||||
fprintf(stdout, "failed to send query for step %s: %s\n",
|
fprintf(stdout, "failed to send query for step %s: %s\n",
|
||||||
@ -621,39 +617,41 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
|
|||||||
exit_nicely();
|
exit_nicely();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (waiting != NULL)
|
/* Try to complete this step without blocking. */
|
||||||
{
|
mustwait = try_complete_step(step, STEP_NONBLOCK);
|
||||||
/* Some other step is already waiting: just block. */
|
|
||||||
try_complete_step(step, 0);
|
|
||||||
|
|
||||||
/*
|
/* Attempt to complete any steps that were previously waiting. */
|
||||||
* See if this step unblocked the waiting step; report both error
|
w = 0;
|
||||||
* messages together if so.
|
nerrorstep = 0;
|
||||||
*/
|
while (w < nwaiting)
|
||||||
if (!try_complete_step(waiting, STEP_NONBLOCK | STEP_RETRY))
|
|
||||||
{
|
{
|
||||||
report_two_error_messages(step, waiting);
|
if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
|
||||||
waiting = NULL;
|
w++;
|
||||||
}
|
|
||||||
else
|
|
||||||
report_error_message(step);
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (try_complete_step(step, STEP_NONBLOCK))
|
errorstep[nerrorstep++] = waiting[w];
|
||||||
waiting = step;
|
if (w + 1 < nwaiting)
|
||||||
report_error_message(step);
|
memcpy(&waiting[w], &waiting[w + 1],
|
||||||
|
(nwaiting - (w + 1)) * sizeof(Step *));
|
||||||
|
nwaiting--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Report any error from this step, and any steps that it unblocked. */
|
||||||
|
report_multiple_error_messages(step, nerrorstep, errorstep);
|
||||||
|
|
||||||
|
/* If this step is waiting, add it to the array of waiters. */
|
||||||
|
if (mustwait)
|
||||||
|
waiting[nwaiting++] = step;
|
||||||
|
}
|
||||||
|
|
||||||
/* Finish any waiting query. */
|
/* Finish any waiting query. */
|
||||||
if (waiting != NULL)
|
for (w = 0; w < nwaiting; ++w)
|
||||||
{
|
{
|
||||||
try_complete_step(waiting, STEP_RETRY);
|
try_complete_step(waiting[w], STEP_RETRY);
|
||||||
report_error_message(waiting);
|
report_error_message(waiting[w]);
|
||||||
}
|
}
|
||||||
|
|
||||||
teardown:
|
|
||||||
/* Perform per-session teardown */
|
/* Perform per-session teardown */
|
||||||
for (i = 0; i < testspec->nsessions; i++)
|
for (i = 0; i < testspec->nsessions; i++)
|
||||||
{
|
{
|
||||||
@ -714,14 +712,17 @@ try_complete_step(Step *step, int flags)
|
|||||||
{
|
{
|
||||||
PGconn *conn = conns[1 + step->session];
|
PGconn *conn = conns[1 + step->session];
|
||||||
fd_set read_set;
|
fd_set read_set;
|
||||||
|
struct timeval start_time;
|
||||||
struct timeval timeout;
|
struct timeval timeout;
|
||||||
int sock = PQsocket(conn);
|
int sock = PQsocket(conn);
|
||||||
int ret;
|
int ret;
|
||||||
PGresult *res;
|
PGresult *res;
|
||||||
|
bool canceled = false;
|
||||||
|
|
||||||
|
gettimeofday(&start_time, NULL);
|
||||||
FD_ZERO(&read_set);
|
FD_ZERO(&read_set);
|
||||||
|
|
||||||
while ((flags & STEP_NONBLOCK) && PQisBusy(conn))
|
while (PQisBusy(conn))
|
||||||
{
|
{
|
||||||
FD_SET(sock, &read_set);
|
FD_SET(sock, &read_set);
|
||||||
timeout.tv_sec = 0;
|
timeout.tv_sec = 0;
|
||||||
@ -736,6 +737,12 @@ try_complete_step(Step *step, int flags)
|
|||||||
exit_nicely();
|
exit_nicely();
|
||||||
}
|
}
|
||||||
else if (ret == 0) /* select() timeout: check for lock wait */
|
else if (ret == 0) /* select() timeout: check for lock wait */
|
||||||
|
{
|
||||||
|
struct timeval current_time;
|
||||||
|
int64 td;
|
||||||
|
|
||||||
|
/* If it's OK for the step to block, check whether it has. */
|
||||||
|
if (flags & STEP_NONBLOCK)
|
||||||
{
|
{
|
||||||
int ntuples;
|
int ntuples;
|
||||||
|
|
||||||
@ -758,7 +765,48 @@ try_complete_step(Step *step, int flags)
|
|||||||
step->name, step->sql);
|
step->name, step->sql);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
/* else, not waiting: give it more time */
|
/* else, not waiting */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Figure out how long we've been waiting for this step. */
|
||||||
|
gettimeofday(¤t_time, NULL);
|
||||||
|
td = (int64) current_time.tv_sec - (int64) start_time.tv_sec;
|
||||||
|
td *= USECS_PER_SEC;
|
||||||
|
td += (int64) current_time.tv_usec - (int64) start_time.tv_usec;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* After 60 seconds, try to cancel the query.
|
||||||
|
*
|
||||||
|
* If the user tries to test an invalid permutation, we don't
|
||||||
|
* want to hang forever, especially when this is running in the
|
||||||
|
* buildfarm. So try to cancel it after a minute. This will
|
||||||
|
* presumably lead to this permutation failing, but remaining
|
||||||
|
* permutations and tests should still be OK.
|
||||||
|
*/
|
||||||
|
if (td > 60 * USECS_PER_SEC && !canceled)
|
||||||
|
{
|
||||||
|
PGcancel *cancel = PQgetCancel(conn);
|
||||||
|
char buf[256];
|
||||||
|
|
||||||
|
if (cancel != NULL && !PQcancel(cancel, buf, sizeof(buf)))
|
||||||
|
fprintf(stderr, "PQcancel failed: %s\n", buf);
|
||||||
|
if (cancel != NULL)
|
||||||
|
PQfreeCancel(cancel);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* After 75 seconds, just give up and die.
|
||||||
|
*
|
||||||
|
* Since cleanup steps won't be run in this case, this may cause
|
||||||
|
* later tests to fail. That stinks, but it's better than waiting
|
||||||
|
* forever for the server to respond to the cancel.
|
||||||
|
*/
|
||||||
|
if (td > 75 * USECS_PER_SEC)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "step %s timed out after 75 seconds\n",
|
||||||
|
step->name);
|
||||||
|
exit_nicely();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (!PQconsumeInput(conn)) /* select(): data available */
|
else if (!PQconsumeInput(conn)) /* select(): data available */
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user