author     Tomas Vondra <tomas.vondra@postgresql.org>  2023-05-19 16:31:11 +0200
committer  Tomas Vondra <tomas.vondra@postgresql.org>  2023-05-19 17:17:58 +0200
commit     8c4040edf456d9241816176eacb79e4d9a0034fc
tree       c175487d46f9b8133829bb3843f498f52f06dbc6 /src/backend/executor/nodeHash.c
parent     507615fc533b1b65bcecc6218e36436687fe8420
Allocate hash join files in a separate memory context
Should a hash join exceed the memory limit, the hashtable is split up into multiple batches. The number of batches is doubled each time a given batch is determined not to fit in memory. Each batch file is allocated with a block-sized buffer for buffering tuples, and parallel hash join has additional sharedtuplestore accessor buffers.

In some pathological cases requiring a lot of batches, often with skewed data, bad stats, or very large datasets, users can run out of memory solely from the overhead of all the batch files' buffers.

Batch files were allocated in the ExecutorState memory context, making it very hard to identify when this batch explosion was the source of an OOM. This commit allocates the batch files in a dedicated memory context instead, making it easier to identify the cause of an OOM and work to avoid it.

Based on an initial draft by Tomas Vondra, with significant reworks and improvements by Jehan-Guillaume de Rorthais.

Author: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Author: Tomas Vondra <tomas.vondra@enterprisedb.com>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/20190421114618.z3mpgmimc3rmubi4@development
Discussion: https://postgr.es/m/20230504193006.1b5b9622%40karst#273020ff4061fc7a2fbb1ba96b281f17
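To see why the buffers alone can exhaust memory, it helps to run the arithmetic: each open BufFile keeps a block-sized buffer (8 kB with the default BLCKSZ), and there is one inner and one outer batch file per batch. The following standalone C sketch tallies that overhead; it assumes the default block size and deliberately ignores the extra sharedtuplestore accessor buffers that parallel hash join adds on top.

#include <stdio.h>

/* Default PostgreSQL block size; each open BufFile buffers one block. */
#define BLCKSZ 8192

int
main(void)
{
	/* Walk through a few rounds of batch doubling. */
	for (long nbatch = 1024; nbatch <= 1024L * 1024; nbatch *= 2)
	{
		/* One inner and one outer batch file per batch. */
		long	nfiles = 2 * nbatch;
		double	buffers_mb = (double) nfiles * BLCKSZ / (1024.0 * 1024.0);

		printf("nbatch = %7ld -> ~%.0f MB of batch-file buffers\n",
			   nbatch, buffers_mb);
	}
	return 0;
}

By the time nbatch has doubled to around a million, the buffers account for roughly 16 GB, far beyond any plausible work_mem setting, so the bookkeeping itself becomes the OOM source the commit message describes.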
Diffstat (limited to 'src/backend/executor/nodeHash.c')
-rw-r--r--  src/backend/executor/nodeHash.c | 46
1 file changed, 33 insertions(+), 13 deletions(-)
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 5fd1c5553ba..301e4acba3c 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -484,7 +484,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
*
* The hashtable control block is just palloc'd from the executor's
* per-query memory context. Everything else should be kept inside the
- * subsidiary hashCxt or batchCxt.
+ * subsidiary hashCxt, batchCxt or spillCxt.
*/
hashtable = palloc_object(HashJoinTableData);
hashtable->nbuckets = nbuckets;
@@ -538,6 +538,10 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
"HashBatchContext",
ALLOCSET_DEFAULT_SIZES);
+ hashtable->spillCxt = AllocSetContextCreate(hashtable->hashCxt,
+ "HashSpillContext",
+ ALLOCSET_DEFAULT_SIZES);
+
/* Allocate data that will live for the life of the hashjoin */
oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
@@ -570,12 +574,19 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
if (nbatch > 1 && hashtable->parallel_state == NULL)
{
+ MemoryContext oldctx;
+
/*
* allocate and initialize the file arrays in spillCxt (not needed for
* parallel case which uses shared tuplestores instead of raw files)
*/
+ oldctx = MemoryContextSwitchTo(hashtable->spillCxt);
+
hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch);
hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch);
+
+ MemoryContextSwitchTo(oldctx);
+
/* The files will not be opened until needed... */
/* ... but make sure we have temp tablespaces established for them */
PrepareTempTablespaces();
@@ -913,7 +924,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
int oldnbatch = hashtable->nbatch;
int curbatch = hashtable->curbatch;
int nbatch;
- MemoryContext oldcxt;
long ninmemory;
long nfreed;
HashMemoryChunk oldchunks;
@@ -934,13 +944,16 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
hashtable, nbatch, hashtable->spaceUsed);
#endif
- oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
-
if (hashtable->innerBatchFile == NULL)
{
+ MemoryContext oldcxt = MemoryContextSwitchTo(hashtable->spillCxt);
+
/* we had no file arrays before */
hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch);
hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch);
+
+ MemoryContextSwitchTo(oldcxt);
+
/* time to establish the temp tablespaces, too */
PrepareTempTablespaces();
}
@@ -951,8 +964,6 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
hashtable->outerBatchFile = repalloc0_array(hashtable->outerBatchFile, BufFile *, oldnbatch, nbatch);
}
- MemoryContextSwitchTo(oldcxt);
-
hashtable->nbatch = nbatch;
/*
@@ -1024,7 +1035,8 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
Assert(batchno > curbatch);
ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple),
hashTuple->hashvalue,
- &hashtable->innerBatchFile[batchno]);
+ &hashtable->innerBatchFile[batchno],
+ hashtable);
hashtable->spaceUsed -= hashTupleSize;
nfreed++;
@@ -1683,7 +1695,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
Assert(batchno > hashtable->curbatch);
ExecHashJoinSaveTuple(tuple,
hashvalue,
- &hashtable->innerBatchFile[batchno]);
+ &hashtable->innerBatchFile[batchno],
+ hashtable);
}
if (shouldFree)
@@ -2664,7 +2677,8 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
/* Put the tuple into a temp file for later batches */
Assert(batchno > hashtable->curbatch);
ExecHashJoinSaveTuple(tuple, hashvalue,
- &hashtable->innerBatchFile[batchno]);
+ &hashtable->innerBatchFile[batchno],
+ hashtable);
pfree(hashTuple);
hashtable->spaceUsed -= tupleSize;
hashtable->spaceUsedSkew -= tupleSize;
@@ -3093,8 +3107,11 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch)
pstate->nbatch = nbatch;
batches = dsa_get_address(hashtable->area, pstate->batches);
- /* Use hash join memory context. */
- oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+ /*
+ * Use hash join spill memory context to allocate accessors, including
+ * buffers for the temporary files.
+ */
+ oldcxt = MemoryContextSwitchTo(hashtable->spillCxt);
/* Allocate this backend's accessor array. */
hashtable->nbatch = nbatch;
@@ -3196,8 +3213,11 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
*/
Assert(DsaPointerIsValid(pstate->batches));
- /* Use hash join memory context. */
- oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+ /*
+ * Use hash join spill memory context to allocate accessors, including
+ * buffers for the temporary files.
+ */
+ oldcxt = MemoryContextSwitchTo(hashtable->spillCxt);
/* Allocate this backend's accessor array. */
hashtable->nbatch = pstate->nbatch;
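The hashtable argument threaded through the ExecHashJoinSaveTuple() calls above exists so that the function, which lives in nodeHashjoin.c and is therefore outside this file's diff, can create each temp file under the new spill context. A sketch of the shape that side of the change takes, illustrative rather than a verbatim copy of the committed function:

/* In nodeHashjoin.c; assumes the usual backend includes. */
void
ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
					  BufFile **fileptr, HashJoinTable hashtable)
{
	BufFile    *file = *fileptr;

	if (file == NULL)
	{
		/*
		 * First write to this batch file: switch to the dedicated spill
		 * context so the file's buffer is charged to HashSpillContext
		 * rather than ExecutorState.
		 */
		MemoryContext oldctx = MemoryContextSwitchTo(hashtable->spillCxt);

		file = BufFileCreateTemp(false);
		*fileptr = file;

		MemoryContextSwitchTo(oldctx);
	}

	BufFileWrite(file, &hashvalue, sizeof(uint32));
	BufFileWrite(file, tuple, tuple->t_len);
}

Creating the file, and hence its buffer, inside spillCxt is what makes a batch explosion show up as a clearly labeled HashSpillContext in a memory context dump instead of being lost inside ExecutorState.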