Add parallel-aware hash joins.

Introduce parallel-aware hash joins that appear in EXPLAIN plans as Parallel Hash Join with Parallel Hash. While hash joins could already appear in parallel queries, they were previously always parallel-oblivious and had a partial subplan only on the outer side, meaning that the work of the inner subplan was duplicated in every worker. After this commit, the planner will consider using a partial subplan on the inner side too, using the Parallel Hash node to divide the work over the available CPU cores and combine its results in shared memory. If the join needs to be split into multiple batches in order to respect work_mem, then workers process different batches as much as possible and then work together on the remaining batches. The advantages of a parallel-aware hash join over a parallel-oblivious hash join used in a parallel query are that it: * avoids wasting memory on duplicated hash tables * avoids wasting disk space on duplicated batch files * divides the work of building the hash table over the CPUs One disadvantage is that there is some communication between the participating CPUs which might outweigh the benefits of parallelism in the case of small hash tables. This is avoided by the planner's existing reluctance to supply partial plans for small scans, but it may be necessary to estimate synchronization costs in future if that situation changes. Another is that outer batch 0 must be written to disk if multiple batches are required. A potential future advantage of parallel-aware hash joins is that right and full outer joins could be supported, since there is a single set of matched bits for each hashtable, but that is not yet implemented. A new GUC enable_parallel_hash is defined to control the feature, defaulting to on. Author: Thomas Munro Reviewed-By: Andres Freund, Robert Haas Tested-By: Rafia Sabih, Prabhat Sahu Discussion: https://postgr.es/m/CAEepm=2W=cOkiZxcg6qiFQP-dHUe09aqTrEMM7yJDrHMhDv_RA@mail.gmail.com https://postgr.es/m/CAEepm=37HKyJ4U6XOLi=JgfSHM3o6B-GaeO-6hkOmneTDkH+Uw@mail.gmail.com
2025-07-12 21:01:52 +03:00 · 2017-12-20 23:39:21 -08:00
parent f94eec490b
commit 1804284042
30 changed files with 3087 additions and 112 deletions
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@ -747,7 +747,7 @@ try_hashjoin_path(PlannerInfo *root,
 	 * never have any output pathkeys, per comments in create_hashjoin_path.
 	 */
 	initial_cost_hashjoin(root, &workspace, jointype, hashclauses,
-						  outer_path, inner_path, extra);
+						  outer_path, inner_path, extra, false);

 	if (add_path_precheck(joinrel,
 						  workspace.startup_cost, workspace.total_cost,
@ -761,6 +761,7 @@ try_hashjoin_path(PlannerInfo *root,
 									  extra,
 									  outer_path,
 									  inner_path,
+									  false,	/* parallel_hash */
 									  extra->restrictlist,
 									  required_outer,
 									  hashclauses));
@ -776,6 +777,10 @@ try_hashjoin_path(PlannerInfo *root,
 * try_partial_hashjoin_path
 *	  Consider a partial hashjoin join path; if it appears useful, push it into
 *	  the joinrel's partial_pathlist via add_partial_path().
+ *	  The outer side is partial.  If parallel_hash is true, then the inner path
+ *	  must be partial and will be run in parallel to create one or more shared
+ *	  hash tables; otherwise the inner path must be complete and a copy of it
+ *	  is run in every process to create separate identical private hash tables.
 */
 static void
 try_partial_hashjoin_path(PlannerInfo *root,
@ -784,7 +789,8 @@ try_partial_hashjoin_path(PlannerInfo *root,
 						  Path *inner_path,
 						  List *hashclauses,
 						  JoinType jointype,
-						  JoinPathExtraData *extra)
+						  JoinPathExtraData *extra,
+						  bool parallel_hash)
 {
 	JoinCostWorkspace workspace;

@ -808,7 +814,7 @@ try_partial_hashjoin_path(PlannerInfo *root,
 	 * cost.  Bail out right away if it looks terrible.
 	 */
 	initial_cost_hashjoin(root, &workspace, jointype, hashclauses,
-						  outer_path, inner_path, extra);
+						  outer_path, inner_path, extra, true);
 	if (!add_partial_path_precheck(joinrel, workspace.total_cost, NIL))
 		return;

@ -821,6 +827,7 @@ try_partial_hashjoin_path(PlannerInfo *root,
 										  extra,
 										  outer_path,
 										  inner_path,
+										  parallel_hash,
 										  extra->restrictlist,
 										  NULL,
 										  hashclauses));
@ -1839,6 +1846,10 @@ hash_inner_and_outer(PlannerInfo *root,
 		 * able to properly guarantee uniqueness.  Similarly, we can't handle
 		 * JOIN_FULL and JOIN_RIGHT, because they can produce false null
 		 * extended rows.  Also, the resulting path must not be parameterized.
+		 * We would be able to support JOIN_FULL and JOIN_RIGHT for Parallel
+		 * Hash, since in that case we're back to a single hash table with a
+		 * single set of match bits for each batch, but that will require
+		 * figuring out a deadlock-free way to wait for the probe to finish.
 		 */
 		if (joinrel->consider_parallel &&
 			save_jointype != JOIN_UNIQUE_OUTER &&
@ -1848,11 +1859,27 @@ hash_inner_and_outer(PlannerInfo *root,
 			bms_is_empty(joinrel->lateral_relids))
 		{
 			Path	   *cheapest_partial_outer;
+			Path	   *cheapest_partial_inner = NULL;
 			Path	   *cheapest_safe_inner = NULL;

 			cheapest_partial_outer =
 				(Path *) linitial(outerrel->partial_pathlist);

+			/*
+			 * Can we use a partial inner plan too, so that we can build a
+			 * shared hash table in parallel?
+			 */
+			if (innerrel->partial_pathlist != NIL && enable_parallel_hash)
+			{
+				cheapest_partial_inner =
+					(Path *) linitial(innerrel->partial_pathlist);
+				try_partial_hashjoin_path(root, joinrel,
+										  cheapest_partial_outer,
+										  cheapest_partial_inner,
+										  hashclauses, jointype, extra,
+										  true /* parallel_hash */ );
+			}
+
 			/*
 			 * Normally, given that the joinrel is parallel-safe, the cheapest
 			 * total inner path will also be parallel-safe, but if not, we'll
@ -1870,7 +1897,8 @@ hash_inner_and_outer(PlannerInfo *root,
 				try_partial_hashjoin_path(root, joinrel,
 										  cheapest_partial_outer,
 										  cheapest_safe_inner,
-										  hashclauses, jointype, extra);
+										  hashclauses, jointype, extra,
+										  false /* parallel_hash */ );
 		}
 	}
 }