mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	Revert "Fix race in Parallel Hash Join batch cleanup."
This reverts commit 8fa2478b40.
Discussion: https://postgr.es/m/CA%2BhUKGJmcqAE3MZeDCLLXa62cWM0AJbKmp2JrJYaJ86bz36LFA%40mail.gmail.com
			
			
This commit is contained in:
		@@ -333,21 +333,14 @@ MultiExecParallelHash(HashState *node)
 | 
			
		||||
	hashtable->nbuckets = pstate->nbuckets;
 | 
			
		||||
	hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
 | 
			
		||||
	hashtable->totalTuples = pstate->total_tuples;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Unless we're completely done and the batch state has been freed, make
 | 
			
		||||
	 * sure we have accessors.
 | 
			
		||||
	 */
 | 
			
		||||
	if (BarrierPhase(build_barrier) < PHJ_BUILD_DONE)
 | 
			
		||||
	ExecParallelHashEnsureBatchAccessors(hashtable);
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
 | 
			
		||||
	 * case, which will bring the build phase to PHJ_BUILD_RUNNING (if it isn't
 | 
			
		||||
	 * case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't
 | 
			
		||||
	 * there already).
 | 
			
		||||
	 */
 | 
			
		||||
	Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
 | 
			
		||||
		   BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
 | 
			
		||||
		   BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -631,7 +624,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 | 
			
		||||
		/*
 | 
			
		||||
		 * The next Parallel Hash synchronization point is in
 | 
			
		||||
		 * MultiExecParallelHash(), which will progress it all the way to
 | 
			
		||||
		 * PHJ_BUILD_RUNNING.  The caller must not return control from this
 | 
			
		||||
		 * PHJ_BUILD_DONE.  The caller must not return control from this
 | 
			
		||||
		 * executor node between now and then.
 | 
			
		||||
		 */
 | 
			
		||||
	}
 | 
			
		||||
@@ -3026,11 +3019,14 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * We should never see a state where the batch-tracking array is freed,
 | 
			
		||||
	 * because we should have given up sooner if we join when the build barrier
 | 
			
		||||
	 * has reached the PHJ_BUILD_DONE phase.
 | 
			
		||||
	 * It's possible for a backend to start up very late so that the whole
 | 
			
		||||
	 * join is finished and the shm state for tracking batches has already
 | 
			
		||||
	 * been freed by ExecHashTableDetach().  In that case we'll just leave
 | 
			
		||||
	 * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
 | 
			
		||||
	 * up early.
 | 
			
		||||
	 */
 | 
			
		||||
	Assert(DsaPointerIsValid(pstate->batches));
 | 
			
		||||
	if (!DsaPointerIsValid(pstate->batches))
 | 
			
		||||
		return;
 | 
			
		||||
 | 
			
		||||
	/* Use hash join memory context. */
 | 
			
		||||
	oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
 | 
			
		||||
@@ -3150,17 +3146,9 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
 | 
			
		||||
void
 | 
			
		||||
ExecHashTableDetach(HashJoinTable hashtable)
 | 
			
		||||
{
 | 
			
		||||
	ParallelHashJoinState *pstate = hashtable->parallel_state;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * If we're involved in a parallel query, we must either have got all the
 | 
			
		||||
	 * way to PHJ_BUILD_RUNNING, or joined too late and be in PHJ_BUILD_DONE.
 | 
			
		||||
	 */
 | 
			
		||||
	Assert(!pstate ||
 | 
			
		||||
		   BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUNNING);
 | 
			
		||||
 | 
			
		||||
	if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUNNING)
 | 
			
		||||
	if (hashtable->parallel_state)
 | 
			
		||||
	{
 | 
			
		||||
		ParallelHashJoinState *pstate = hashtable->parallel_state;
 | 
			
		||||
		int			i;
 | 
			
		||||
 | 
			
		||||
		/* Make sure any temporary files are closed. */
 | 
			
		||||
@@ -3176,22 +3164,17 @@ ExecHashTableDetach(HashJoinTable hashtable)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		/* If we're last to detach, clean up shared memory. */
 | 
			
		||||
		if (BarrierArriveAndDetach(&pstate->build_barrier))
 | 
			
		||||
		if (BarrierDetach(&pstate->build_barrier))
 | 
			
		||||
		{
 | 
			
		||||
			/*
 | 
			
		||||
			 * Late joining processes will see this state and give up
 | 
			
		||||
			 * immediately.
 | 
			
		||||
			 */
 | 
			
		||||
			Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_DONE);
 | 
			
		||||
 | 
			
		||||
			if (DsaPointerIsValid(pstate->batches))
 | 
			
		||||
			{
 | 
			
		||||
				dsa_free(hashtable->area, pstate->batches);
 | 
			
		||||
				pstate->batches = InvalidDsaPointer;
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
		hashtable->parallel_state = NULL;
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 
 | 
			
		||||
@@ -45,8 +45,7 @@
 | 
			
		||||
 *   PHJ_BUILD_ALLOCATING            -- one sets up the batches and table 0
 | 
			
		||||
 *   PHJ_BUILD_HASHING_INNER         -- all hash the inner rel
 | 
			
		||||
 *   PHJ_BUILD_HASHING_OUTER         -- (multi-batch only) all hash the outer
 | 
			
		||||
 *   PHJ_BUILD_RUNNING               -- building done, probing can begin
 | 
			
		||||
 *   PHJ_BUILD_DONE                  -- all work complete, one frees batches
 | 
			
		||||
 *   PHJ_BUILD_DONE                  -- building done, probing can begin
 | 
			
		||||
 *
 | 
			
		||||
 * While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may
 | 
			
		||||
 * be used repeatedly as required to coordinate expansions in the number of
 | 
			
		||||
@@ -74,7 +73,7 @@
 | 
			
		||||
 * batches whenever it encounters them while scanning and probing, which it
 | 
			
		||||
 * can do because it processes batches in serial order.
 | 
			
		||||
 *
 | 
			
		||||
 * Once PHJ_BUILD_RUNNING is reached, backends then split up and process
 | 
			
		||||
 * Once PHJ_BUILD_DONE is reached, backends then split up and process
 | 
			
		||||
 * different batches, or gang up and work together on probing batches if there
 | 
			
		||||
 * aren't enough to go around.  For each batch there is a separate barrier
 | 
			
		||||
 * with the following phases:
 | 
			
		||||
@@ -96,16 +95,11 @@
 | 
			
		||||
 *
 | 
			
		||||
 * To avoid deadlocks, we never wait for any barrier unless it is known that
 | 
			
		||||
 * all other backends attached to it are actively executing the node or have
 | 
			
		||||
 * finished.  Practically, that means that we never emit a tuple while attached
 | 
			
		||||
 * to a barrier, unless the barrier has reached a phase that means that no
 | 
			
		||||
 * process will wait on it again.  We emit tuples while attached to the build
 | 
			
		||||
 * barrier in phase PHJ_BUILD_RUNNING, and to a per-batch barrier in phase
 | 
			
		||||
 * PHJ_BATCH_PROBING.  These are advanced to PHJ_BUILD_DONE and PHJ_BATCH_DONE
 | 
			
		||||
 * respectively without waiting, using BarrierArriveAndDetach().  The last to
 | 
			
		||||
 * detach receives a different return value so that it knows that it's safe to
 | 
			
		||||
 * clean up.  Any straggler process that attaches after that phase is reached
 | 
			
		||||
 * will see that it's too late to participate or access the relevant shared
 | 
			
		||||
 * memory objects.
 | 
			
		||||
 * already arrived.  Practically, that means that we never return a tuple
 | 
			
		||||
 * while attached to a barrier, unless the barrier has reached its final
 | 
			
		||||
 * state.  In the slightly special case of the per-batch barrier, we return
 | 
			
		||||
 * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use
 | 
			
		||||
 * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting.
 | 
			
		||||
 *
 | 
			
		||||
 *-------------------------------------------------------------------------
 | 
			
		||||
 */
 | 
			
		||||
@@ -323,7 +317,6 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 | 
			
		||||
 | 
			
		||||
					build_barrier = ¶llel_state->build_barrier;
 | 
			
		||||
					Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
 | 
			
		||||
						   BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
 | 
			
		||||
						   BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
 | 
			
		||||
					if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER)
 | 
			
		||||
					{
 | 
			
		||||
@@ -336,18 +329,9 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 | 
			
		||||
						BarrierArriveAndWait(build_barrier,
 | 
			
		||||
											 WAIT_EVENT_HASH_BUILD_HASHING_OUTER);
 | 
			
		||||
					}
 | 
			
		||||
					else if (BarrierPhase(build_barrier) == PHJ_BUILD_DONE)
 | 
			
		||||
					{
 | 
			
		||||
						/*
 | 
			
		||||
						 * If we attached so late that the job is finished and
 | 
			
		||||
						 * the batch state has been freed, we can return
 | 
			
		||||
						 * immediately.
 | 
			
		||||
						 */
 | 
			
		||||
						return NULL;
 | 
			
		||||
					}
 | 
			
		||||
					Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
 | 
			
		||||
 | 
			
		||||
					/* Each backend should now select a batch to work on. */
 | 
			
		||||
					Assert(BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING);
 | 
			
		||||
					hashtable->curbatch = -1;
 | 
			
		||||
					node->hj_JoinState = HJ_NEED_NEW_BATCH;
 | 
			
		||||
 | 
			
		||||
@@ -1106,6 +1090,14 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
 | 
			
		||||
	int			start_batchno;
 | 
			
		||||
	int			batchno;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * If we started up so late that the batch tracking array has been freed
 | 
			
		||||
	 * already by ExecHashTableDetach(), then we are finished.  See also
 | 
			
		||||
	 * ExecParallelHashEnsureBatchAccessors().
 | 
			
		||||
	 */
 | 
			
		||||
	if (hashtable->batches == NULL)
 | 
			
		||||
		return false;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * If we were already attached to a batch, remember not to bother checking
 | 
			
		||||
	 * it again, and detach from it (possibly freeing the hash table if we are
 | 
			
		||||
 
 | 
			
		||||
@@ -258,8 +258,7 @@ typedef struct ParallelHashJoinState
 | 
			
		||||
#define PHJ_BUILD_ALLOCATING			1
 | 
			
		||||
#define PHJ_BUILD_HASHING_INNER			2
 | 
			
		||||
#define PHJ_BUILD_HASHING_OUTER			3
 | 
			
		||||
#define PHJ_BUILD_RUNNING				4
 | 
			
		||||
#define PHJ_BUILD_DONE					5
 | 
			
		||||
#define PHJ_BUILD_DONE					4
 | 
			
		||||
 | 
			
		||||
/* The phases for probing each batch, used by for batch_barrier. */
 | 
			
		||||
#define PHJ_BATCH_ELECTING				0
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user