mirror of https://github.com/postgres/postgres.git
synced 2025-07-15 19:21:59 +03:00
Fix race in parallel hash join batch cleanup, take II.
With unlucky timing and parallel_leader_participation=off (not the
default), PHJ could attempt to access per-batch shared state just as it
was being freed. There was code intended to prevent that by checking
for a cleared pointer, but it was racy. Fix, by introducing an extra
barrier phase. The new phase PHJ_BUILD_RUNNING means that it's safe to
access the per-batch state to find a batch to help with, and
PHJ_BUILD_DONE means that it is too late. The last to detach will free
the array of per-batch state as before, but now it will also atomically
advance the phase, so that late attachers can avoid the hazard. This
mirrors the way per-batch hash tables are freed (see phases
PHJ_BATCH_PROBING and PHJ_BATCH_DONE).
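
For orientation, here is a sketch of the build barrier's phase sequence
after this change. The last four names appear in the patch below; the
first two names and the numeric values are illustrative assumptions
about the surrounding enumeration, not quoted from the tree:

    /* Build barrier phases, in order (sketch; first two names and values assumed). */
    #define PHJ_BUILD_ELECTING        0   /* assumed: elect a backend to set up */
    #define PHJ_BUILD_ALLOCATING      1   /* assumed: allocate the shared hash table */
    #define PHJ_BUILD_HASHING_INNER   2   /* loading the inner side */
    #define PHJ_BUILD_HASHING_OUTER   3   /* partitioning the outer side */
    #define PHJ_BUILD_RUNNING         4   /* new: per-batch state safe to access */
    #define PHJ_BUILD_DONE            5   /* too late: per-batch state may be freed */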
An earlier attempt to fix this (commit 3b8981b6, later reverted) missed
one special case. When the inner side is empty (the "empty inner
optimization"), the build barrier would only make it to the
PHJ_BUILD_HASHING_INNER phase before workers attempted to detach from
the hashtable. In that case, fast-forward the build barrier to
PHJ_BUILD_RUNNING before proceeding, so that our later assertions hold
and we can still negotiate who is cleaning up.
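
As a minimal sketch, that fast-forward can be a loop over the barrier
API used elsewhere in the patch: BarrierPhase() reads the current phase,
and BarrierArriveAndWait() advances one phase once all attached
participants arrive. The variable names and the zero wait-event argument
here are placeholders:

    /* Sketch: step the build barrier forward to PHJ_BUILD_RUNNING. */
    Barrier    *build_barrier = &pstate->build_barrier;

    while (BarrierPhase(build_barrier) < PHJ_BUILD_RUNNING)
        BarrierArriveAndWait(build_barrier, 0);    /* 0: placeholder wait event */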
Revealed by build farm failures, where BarrierAttach() failed a sanity
check assertion, because the memory had been clobbered by dsa_free().
In non-assert builds, the result could be a segmentation fault.
Back-patch to all supported releases.
Author: Thomas Munro <thomas.munro@gmail.com>
Author: Melanie Plageman <melanieplageman@gmail.com>
Reported-by: Michael Paquier <michael@paquier.xyz>
Reported-by: David Geier <geidav.pg@gmail.com>
Tested-by: David Geier <geidav.pg@gmail.com>
Discussion: https://postgr.es/m/20200929061142.GA29096%40paquier.xyz
src/backend/executor/nodeHash.c

@@ -333,14 +333,21 @@ MultiExecParallelHash(HashState *node)
     hashtable->nbuckets = pstate->nbuckets;
     hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
     hashtable->totalTuples = pstate->total_tuples;
-    ExecParallelHashEnsureBatchAccessors(hashtable);
+
+    /*
+     * Unless we're completely done and the batch state has been freed, make
+     * sure we have accessors.
+     */
+    if (BarrierPhase(build_barrier) < PHJ_BUILD_DONE)
+        ExecParallelHashEnsureBatchAccessors(hashtable);

     /*
      * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
-     * case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't
-     * there already).
+     * case, which will bring the build phase to PHJ_BUILD_RUNNING (if it
+     * isn't there already).
      */
     Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
+           BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
            BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
 }

@@ -624,7 +631,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
     /*
      * The next Parallel Hash synchronization point is in
      * MultiExecParallelHash(), which will progress it all the way to
-     * PHJ_BUILD_DONE.  The caller must not return control from this
+     * PHJ_BUILD_RUNNING.  The caller must not return control from this
      * executor node between now and then.
      */
 }

@@ -3019,14 +3026,11 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
     }

     /*
-     * It's possible for a backend to start up very late so that the whole
-     * join is finished and the shm state for tracking batches has already
-     * been freed by ExecHashTableDetach().  In that case we'll just leave
-     * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
-     * up early.
+     * We should never see a state where the batch-tracking array is freed,
+     * because we should have given up sooner if we join when the build
+     * barrier has reached the PHJ_BUILD_DONE phase.
      */
-    if (!DsaPointerIsValid(pstate->batches))
-        return;
+    Assert(DsaPointerIsValid(pstate->batches));

     /* Use hash join memory context. */
     oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);

@@ -3146,9 +3150,18 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
 void
 ExecHashTableDetach(HashJoinTable hashtable)
 {
-    if (hashtable->parallel_state)
+    ParallelHashJoinState *pstate = hashtable->parallel_state;
+
+    /*
+     * If we're involved in a parallel query, we must either have gotten all
+     * the way to PHJ_BUILD_RUNNING, or joined too late and be in
+     * PHJ_BUILD_DONE.
+     */
+    Assert(!pstate ||
+           BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUNNING);
+
+    if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUNNING)
     {
-        ParallelHashJoinState *pstate = hashtable->parallel_state;
         int         i;

         /* Make sure any temporary files are closed. */

@@ -3164,17 +3177,22 @@ ExecHashTableDetach(HashJoinTable hashtable)
         }

         /* If we're last to detach, clean up shared memory. */
-        if (BarrierDetach(&pstate->build_barrier))
+        if (BarrierArriveAndDetach(&pstate->build_barrier))
         {
+            /*
+             * Late joining processes will see this state and give up
+             * immediately.
+             */
+            Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_DONE);
+
             if (DsaPointerIsValid(pstate->batches))
             {
                 dsa_free(hashtable->area, pstate->batches);
                 pstate->batches = InvalidDsaPointer;
             }
         }
-
-        hashtable->parallel_state = NULL;
     }
+    hashtable->parallel_state = NULL;
 }

 /*
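
The pivotal switch in the last hunk is from BarrierDetach() to
BarrierArriveAndDetach(): the last participant to leave now frees the
batch array and advances the phase to PHJ_BUILD_DONE in one atomic step,
so a process that attaches afterwards observes PHJ_BUILD_DONE and gives
up before touching freed memory. A minimal sketch of the late attacher's
side, assuming the barrier API above; the helper name and structure are
illustrative, not the patch's code:

    #include "executor/hashjoin.h"    /* ParallelHashJoinState, PHJ_BUILD_* */
    #include "storage/barrier.h"      /* Barrier, BarrierAttach, BarrierDetach */

    /* Sketch: a late attacher notices that cleanup has already happened. */
    static bool
    phj_too_late_to_join(ParallelHashJoinState *pstate)
    {
        Barrier    *build_barrier = &pstate->build_barrier;

        /* BarrierAttach() returns the phase in effect when we attached. */
        if (BarrierAttach(build_barrier) == PHJ_BUILD_DONE)
        {
            /* The last worker already ran dsa_free() on pstate->batches. */
            BarrierDetach(build_barrier);
            return true;
        }
        return false;    /* per-batch state is still safe to access */
    }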