Diffstat (limited to 'src/backend/executor/nodeHash.c')
-rw-r--r-- | src/backend/executor/nodeHash.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 132 insertions(+), 0 deletions(-)
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 6f8a379e3b9..8d2201ab67f 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -848,6 +848,93 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		nbatch = pg_nextpower2_32(Max(2, minbatch));
 	}
 
+	/*
+	 * Optimize the total amount of memory consumed by the hash node.
+	 *
+	 * The nbatch calculation above focuses on the size of the in-memory hash
+	 * table, assuming no per-batch overhead. Now adjust the number of batches
+	 * and the size of the hash table to minimize total memory consumed by the
+	 * hash node.
+	 *
+	 * Each batch file has a BLCKSZ buffer, and we may need two files per
+	 * batch (inner and outer side). So with enough batches this can be
+	 * significantly more memory than the hashtable itself.
+	 *
+	 * The total memory usage may be expressed by this formula:
+	 *
+	 * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ) <= hash_table_bytes
+	 *
+	 * where (inner_rel_bytes / nbatch) is the size of the in-memory hash
+	 * table and (2 * nbatch * BLCKSZ) is the amount of memory used by file
+	 * buffers. But for sufficiently large values of inner_rel_bytes there
+	 * may not be any nbatch value that would make both parts fit into
+	 * hash_table_bytes.
+	 *
+	 * In this case we can't enforce the memory limit - we're going to exceed
+	 * it. We can however minimize the impact and use as little memory as
+	 * possible. (We haven't really enforced it before either, as we simply
+	 * ignored the batch files.)
+	 *
+	 * The formula for total memory usage says that given an inner relation of
+	 * size inner_rel_bytes, we may divide it into an arbitrary number of
+	 * batches. This determines both the size of the in-memory hash table and
+	 * the amount of memory needed for batch files. These two terms work in
+	 * opposite ways - when one decreases, the other increases.
+	 *
+	 * For low nbatch values, the hash table takes most of the memory, but at
+	 * some point the batch files start to dominate. If you combine these two
+	 * terms, the memory consumption (for a fixed size of the inner relation)
+	 * has a U-shape, with a minimum at some nbatch value.
+	 *
+	 * Our goal is to find this nbatch value, minimizing the memory usage. We
+	 * calculate the memory usage with half the batches (i.e. nbatch/2), and
+	 * if it's lower than the current memory usage we know it's better to use
+	 * fewer batches. We repeat this until reducing the number of batches does
+	 * not reduce the memory usage - we found the optimum. We know the optimum
+	 * exists, thanks to the U-shape.
+	 *
+	 * We only want to do this when exceeding the memory limit, not every
+	 * time. The goal is not to minimize memory usage in every case, but to
+	 * minimize the memory usage when we can't stay within the memory limit.
+	 *
+	 * For this reason we only consider reducing the number of batches. We
+	 * could try the opposite direction too, but that would save memory only
+	 * when most of the memory is used by the hash table. And the hash table
+	 * was used for the initial sizing, so we shouldn't be exceeding the
+	 * memory limit too much. We might save memory by using more batches, but
+	 * it would result in spilling more batch files, which does not seem like
+	 * a great trade-off.
+	 *
+	 * While growing the hashtable, we also adjust the number of buckets, to
+	 * not have more than one tuple per bucket (load factor 1). We can only do
+	 * this during the initial sizing - once we start building the hash,
+	 * nbuckets is fixed.
+	 */
+	while (nbatch > 0)
+	{
+		/* how much memory are we using with current nbatch value */
+		size_t		current_space = hash_table_bytes + (2 * (size_t) nbatch * BLCKSZ);
+
+		/* how much memory would we use with half the batches */
+		size_t		new_space = hash_table_bytes * 2 + ((size_t) nbatch * BLCKSZ);
+
+		/* If the memory usage would not decrease, we found the optimum. */
+		if (current_space < new_space)
+			break;
+
+		/*
+		 * It's better to use half the batches, so do that and adjust
+		 * nbuckets in the opposite direction, and double the allowance.
+		 */
+		nbatch /= 2;
+		nbuckets *= 2;
+
+		*space_allowed = (*space_allowed) * 2;
+
+		/* halving nbatch doubles the in-memory hash table, so track that */
+		hash_table_bytes *= 2;
+	}
+
 	Assert(nbuckets > 0);
 	Assert(nbatch > 0);
 
@@ -891,6 +978,47 @@ ExecHashTableDestroy(HashJoinTable hashtable)
 }
 
 /*
+ * Consider adjusting the allowed hash table size, depending on the number
+ * of batches, to minimize the overall memory usage (for both the hashtable
+ * and batch files).
+ *
+ * We're adjusting the size of the hash table, not the (optimal) number of
+ * buckets. We can't change that once we start building the hash, due to how
+ * ExecHashGetBucketAndBatch calculates batchno/bucketno from the hash. This
+ * means the load factor may not be optimal, but we're in damage control so
+ * we accept slower lookups. It's still much better than batch explosion.
+ *
+ * Returns true if we chose to increase the batch size (and thus we don't
+ * need to add batches), and false if we should increase nbatch.
+ */
+static bool
+ExecHashIncreaseBatchSize(HashJoinTable hashtable)
+{
+	/*
+	 * How much additional memory would doubling nbatch use? Each batch may
+	 * require two buffered files (inner/outer), with a BLCKSZ buffer.
+	 */
+	size_t		batchSpace = ((size_t) hashtable->nbatch * 2 * BLCKSZ);
+
+	/*
+	 * Compare the new space needed for doubling nbatch and for enlarging the
+	 * in-memory hash table. If doubling the hash table needs less memory,
+	 * just do that. Otherwise, continue with doubling the nbatch.
+	 *
+	 * We're doubling either spaceAllowed or batchSpace, so checking which of
+	 * the two increases the memory usage the least is the same as comparing
+	 * the values directly.
+	 */
+	if (hashtable->spaceAllowed <= batchSpace)
+	{
+		hashtable->spaceAllowed *= 2;
+		return true;
+	}
+
+	return false;
+}
+
+/*
  * ExecHashIncreaseNumBatches
  *		increase the original number of batches in order to reduce
  *		current memory consumption
@@ -913,6 +1041,10 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	if (oldnbatch > Min(INT_MAX / 2, MaxAllocSize / (sizeof(void *) * 2)))
 		return;
 
+	/* consider increasing size of the in-memory hash table instead */
+	if (ExecHashIncreaseBatchSize(hashtable))
+		return;
+
 	nbatch = oldnbatch * 2;
 	Assert(nbatch > 1);
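The U-shaped memory curve and the halving search are easy to check outside the executor. The following standalone C sketch (not part of the patch) mirrors the balancing loop above; the 8GB inner relation, the 4MB starting allowance, and the total_space() helper are made-up values and names for illustration only.

/*
 * Standalone sketch of the nbatch balancing loop in ExecChooseHashTableSize.
 * All sizes are hypothetical; BLCKSZ is the PostgreSQL default block size.
 */
#include <stdio.h>

#define BLCKSZ 8192

/* total memory: in-memory hash table plus two BLCKSZ buffers per batch */
static size_t
total_space(size_t inner_rel_bytes, size_t nbatch)
{
	return (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ);
}

int
main(void)
{
	/* hypothetical: 8GB inner relation, 4MB allowance => 2048 batches */
	size_t		inner_rel_bytes = (size_t) 8192 * 1024 * 1024;
	size_t		hash_table_bytes = (size_t) 4 * 1024 * 1024;
	size_t		nbatch = 2048;

	/* mirror the patch: halve nbatch while that reduces total memory */
	while (nbatch > 0)
	{
		size_t		current_space = hash_table_bytes + (2 * nbatch * BLCKSZ);
		size_t		new_space = hash_table_bytes * 2 + (nbatch * BLCKSZ);

		printf("nbatch=%5zu  current=%10zu  with nbatch/2=%10zu\n",
			   nbatch, current_space, new_space);

		if (current_space < new_space)
			break;

		nbatch /= 2;
		hash_table_bytes *= 2;	/* halving nbatch doubles the hash table */
	}

	printf("chosen nbatch=%zu, total=%zu bytes\n",
		   nbatch, total_space(inner_rel_bytes, nbatch));
	return 0;
}

With these numbers the search stops at nbatch=512 and roughly 24MB total, instead of the roughly 36MB that 2048 batches would cost in file buffers alone. Treating the total as a continuous function of nbatch, the minimum sits at nbatch = sqrt(inner_rel_bytes / (2 * BLCKSZ)), about 724 here, so the power-of-two halving lands within a factor of two of the true optimum.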
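ExecHashIncreaseBatchSize boils down to a single comparison: doubling nbatch costs an extra nbatch * 2 * BLCKSZ of file buffers, while doubling the allowance costs an extra spaceAllowed of hash table, so whichever is smaller wins. A second standalone sketch (again not part of the patch; struct fake_hashtable is a hypothetical stand-in for the HashJoinTable fields involved) shows the decision going both ways.

/*
 * Standalone sketch of the ExecHashIncreaseBatchSize decision.
 * struct fake_hashtable is a made-up stand-in for HashJoinTable.
 */
#include <stdbool.h>
#include <stdio.h>

#define BLCKSZ 8192

struct fake_hashtable
{
	int			nbatch;
	size_t		spaceAllowed;
};

/* grow the hash table iff that adds less memory than doubling nbatch */
static bool
increase_batch_size(struct fake_hashtable *ht)
{
	size_t		batchSpace = (size_t) ht->nbatch * 2 * BLCKSZ;

	if (ht->spaceAllowed <= batchSpace)
	{
		ht->spaceAllowed *= 2;
		return true;
	}
	return false;
}

int
main(void)
{
	/* 4MB allowance, 128 batches: doubling nbatch adds only 2MB of buffers */
	struct fake_hashtable a = {128, 4 * 1024 * 1024};

	/* 4MB allowance, 512 batches: doubling nbatch would add 8MB of buffers */
	struct fake_hashtable b = {512, 4 * 1024 * 1024};

	printf("a: grow hash table? %s\n", increase_batch_size(&a) ? "yes" : "no");
	printf("b: grow hash table? %s\n", increase_batch_size(&b) ? "yes" : "no");
	return 0;
}

Case a prints "no" (2MB of extra buffers is cheaper than 4MB of extra hash table, so the caller proceeds to double nbatch), while case b prints "yes" (8MB of extra buffers would cost more, so the allowance is doubled and ExecHashIncreaseNumBatches returns early).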