author     Thomas Munro <tmunro@postgresql.org>   2021-03-17 17:46:39 +1300
committer  Thomas Munro <tmunro@postgresql.org>   2021-03-17 18:05:39 +1300
commit     3b8981b6e1a2aea0f18384c803e21e9391de669a (patch)
tree       512960ff7843b409c5031aaf5d47fb44020de258 /src/backend/executor/nodeHash.c
parent     37929599499fe5760a9dbab48a10a898879a0d44 (diff)
download   postgresql-3b8981b6e1a2aea0f18384c803e21e9391de669a.tar.gz
           postgresql-3b8981b6e1a2aea0f18384c803e21e9391de669a.zip
Fix race in Parallel Hash Join batch cleanup.
With very unlucky timing and parallel_leader_participation off, PHJ could
attempt to access per-batch state just as it was being freed.  There was
code intended to prevent that by checking for a cleared pointer, but it was
buggy.

Fix, by introducing an extra barrier phase.  The new phase PHJ_BUILD_RUNNING
means that it's safe to access the per-batch state to find a batch to help
with, and PHJ_BUILD_DONE means that it is too late.  The last to detach will
free the array of per-batch state as before, but now it will also atomically
advance the phase at the same time, so that late attachers can avoid the
hazard, without the data race.  This mirrors the way per-batch hash tables
are freed (see phases PHJ_BATCH_PROBING and PHJ_BATCH_DONE).

Revealed by a one-off build farm failure, where BarrierAttach() failed a
sanity check assertion, because the memory had been clobbered by dsa_free().

Back-patch to 11, where the code arrived.

Reported-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/20200929061142.GA29096%40paquier.xyz
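As an illustration only (not PostgreSQL code), the sketch below models the cleanup protocol with a plain pthread mutex: the last participant to detach advances the phase in the same critical section in which it becomes last, and only then is the shared per-batch array freed, so a late attacher is either counted before that point or observes the "done" phase and never touches freed memory. All names here (toy_barrier, toy_attach, toy_detach) are hypothetical; the real code uses PostgreSQL's Barrier API (BarrierAttach, BarrierArriveAndDetach, BarrierPhase) and dsa_free(), and for simplicity this model performs the free under the lock as well.

/* Hypothetical, self-contained model of the cleanup protocol; not PG code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum { BUILD_RUNNING, BUILD_DONE };

typedef struct toy_barrier
{
	pthread_mutex_t lock;
	int			phase;			/* BUILD_RUNNING, then BUILD_DONE */
	int			participants;	/* currently attached workers */
	int		   *batch_state;	/* shared state, freed by the last worker */
} toy_barrier;

/* Attach unless the join is already over; returns false if we're too late. */
static bool
toy_attach(toy_barrier *b)
{
	bool		attached;

	pthread_mutex_lock(&b->lock);
	attached = (b->phase < BUILD_DONE);
	if (attached)
		b->participants++;
	pthread_mutex_unlock(&b->lock);
	return attached;
}

/*
 * Detach.  The last participant frees the shared batch state and advances
 * the phase while still holding the lock, so a late attacher can never see
 * freed memory: it is either counted before the free, or it observes
 * BUILD_DONE and gives up.
 */
static void
toy_detach(toy_barrier *b)
{
	pthread_mutex_lock(&b->lock);
	if (--b->participants == 0)
	{
		free(b->batch_state);
		b->batch_state = NULL;
		b->phase = BUILD_DONE;	/* advanced together with the free */
	}
	pthread_mutex_unlock(&b->lock);
}

int
main(void)
{
	toy_barrier b;

	pthread_mutex_init(&b.lock, NULL);
	b.phase = BUILD_RUNNING;
	b.participants = 0;
	b.batch_state = calloc(8, sizeof(int));

	if (toy_attach(&b))
		toy_detach(&b);			/* last to detach: free + phase change */
	printf("late attach allowed: %s\n", toy_attach(&b) ? "yes" : "no");
	return 0;
}
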
Diffstat (limited to 'src/backend/executor/nodeHash.c')
-rw-r--r--   src/backend/executor/nodeHash.c   47
1 file changed, 32 insertions(+), 15 deletions(-)
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index c5f2d1d22b1..c41c86ab51e 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -333,14 +333,21 @@ MultiExecParallelHash(HashState *node)
 	hashtable->nbuckets = pstate->nbuckets;
 	hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
 	hashtable->totalTuples = pstate->total_tuples;
-	ExecParallelHashEnsureBatchAccessors(hashtable);
+
+	/*
+	 * Unless we're completely done and the batch state has been freed, make
+	 * sure we have accessors.
+	 */
+	if (BarrierPhase(build_barrier) < PHJ_BUILD_DONE)
+		ExecParallelHashEnsureBatchAccessors(hashtable);

 	/*
 	 * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
-	 * case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't
+	 * case, which will bring the build phase to PHJ_BUILD_RUNNING (if it isn't
 	 * there already).
 	 */
 	Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
+		   BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
 		   BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
 }
@@ -624,7 +631,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 		/*
 		 * The next Parallel Hash synchronization point is in
 		 * MultiExecParallelHash(), which will progress it all the way to
-		 * PHJ_BUILD_DONE.  The caller must not return control from this
+		 * PHJ_BUILD_RUNNING.  The caller must not return control from this
 		 * executor node between now and then.
 		 */
 	}
@@ -3048,14 +3055,11 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
 	}

 	/*
-	 * It's possible for a backend to start up very late so that the whole
-	 * join is finished and the shm state for tracking batches has already
-	 * been freed by ExecHashTableDetach().  In that case we'll just leave
-	 * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
-	 * up early.
+	 * We should never see a state where the batch-tracking array is freed,
+	 * because we should have given up sooner if we join when the build
+	 * barrier has reached the PHJ_BUILD_DONE phase.
 	 */
-	if (!DsaPointerIsValid(pstate->batches))
-		return;
+	Assert(DsaPointerIsValid(pstate->batches));

 	/* Use hash join memory context. */
 	oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
@@ -3175,9 +3179,17 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
 void
 ExecHashTableDetach(HashJoinTable hashtable)
 {
-	if (hashtable->parallel_state)
+	ParallelHashJoinState *pstate = hashtable->parallel_state;
+
+	/*
+	 * If we're involved in a parallel query, we must either have got all the
+	 * way to PHJ_BUILD_RUNNING, or joined too late and be in PHJ_BUILD_DONE.
+	 */
+	Assert(!pstate ||
+		   BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUNNING);
+
+	if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUNNING)
 	{
-		ParallelHashJoinState *pstate = hashtable->parallel_state;
 		int			i;

 		/* Make sure any temporary files are closed. */
@@ -3193,17 +3205,22 @@ ExecHashTableDetach(HashJoinTable hashtable)
 		}

 		/* If we're last to detach, clean up shared memory. */
-		if (BarrierDetach(&pstate->build_barrier))
+		if (BarrierArriveAndDetach(&pstate->build_barrier))
 		{
+			/*
+			 * Late joining processes will see this state and give up
+			 * immediately.
+			 */
+			Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_DONE);
+
 			if (DsaPointerIsValid(pstate->batches))
 			{
 				dsa_free(hashtable->area, pstate->batches);
 				pstate->batches = InvalidDsaPointer;
 			}
 		}
-
-		hashtable->parallel_state = NULL;
 	}
+	hashtable->parallel_state = NULL;
 }

 /*