Get rid of artificial restriction on hash table sizes on Windows.

The point of introducing the hash_mem_multiplier GUC was to let users reproduce the old behavior of hash aggregation, i.e. that it could use more than work_mem at need. However, the implementation failed to get the job done on Win64, where work_mem is clamped to 2GB to protect various places that calculate memory sizes using "long int". As written, the same clamp was applied to hash_mem. This resulted in severe performance regressions for queries requiring a bit more than 2GB for hash aggregation, as they now spill to disk and there's no way to stop that. Getting rid of the work_mem restriction seems like a good idea, but it's a big job and could not conceivably be back-patched. However, there's only a fairly small number of places that are concerned with the hash_mem value, and it turns out to be possible to remove the restriction there without too much code churn or any ABI breaks. So, let's do that for now to fix the regression, and leave the larger task for another day. This patch does introduce a bit more infrastructure that should help with the larger task, namely pg_bitutils.h support for working with size_t values. Per gripe from Laurent Hasson. Back-patch to v13 where the behavior change came in. Discussion: https://postgr.es/m/997817.1627074924@sss.pgh.pa.us Discussion: https://postgr.es/m/MN2PR15MB25601E80A9B6D1BA6F592B1985E39@MN2PR15MB2560.namprd15.prod.outlook.com
author: Tom Lane <tgl@sss.pgh.pa.us> 2021-07-25 14:02:27 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2021-07-25 14:02:27 -0400
commit: 28d936031a86d94806c6604480ff3f3f169b371c (patch)
tree: 5484c0b625433483dbff26b9da48881c8e3426c0 /src/backend/executor/nodeAgg.c
parent: d9d8aa9bb9aa2a850f53facd2c91e454129af57b (diff)
download: postgresql-28d936031a86d94806c6604480ff3f3f169b371c.tar.gz
postgresql-28d936031a86d94806c6604480ff3f3f169b371c.zip
1 files changed, 23 insertions, 19 deletions
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 914b02ceee4..39bea204d16 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1802,15 +1802,15 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 {
 	int			npartitions;
 	Size		partition_mem;
-	int			hash_mem = get_hash_mem();
+	Size		hash_mem_limit = get_hash_memory_limit();
 
 	/* if not expected to spill, use all of hash_mem */
-	if (input_groups * hashentrysize < hash_mem * 1024L)
+	if (input_groups * hashentrysize <= hash_mem_limit)
 	{
 		if (num_partitions != NULL)
 			*num_partitions = 0;
-		*mem_limit = hash_mem * 1024L;
-		*ngroups_limit = *mem_limit / hashentrysize;
+		*mem_limit = hash_mem_limit;
+		*ngroups_limit = hash_mem_limit / hashentrysize;
 		return;
 	}
 
@@ -1835,10 +1835,10 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 	 * minimum number of partitions, so we aren't going to dramatically exceed
 	 * work mem anyway.
 	 */
-	if (hash_mem * 1024L > 4 * partition_mem)
-		*mem_limit = hash_mem * 1024L - partition_mem;
+	if (hash_mem_limit > 4 * partition_mem)
+		*mem_limit = hash_mem_limit - partition_mem;
 	else
-		*mem_limit = hash_mem * 1024L * 0.75;
+		*mem_limit = hash_mem_limit * 0.75;
 
 	if (*mem_limit > hashentrysize)
 		*ngroups_limit = *mem_limit / hashentrysize;
@@ -1992,32 +1992,36 @@ static int
 hash_choose_num_partitions(double input_groups, double hashentrysize,
 						   int used_bits, int *log2_npartitions)
 {
-	Size		mem_wanted;
-	int			partition_limit;
+	Size		hash_mem_limit = get_hash_memory_limit();
+	double		partition_limit;
+	double		mem_wanted;
+	double		dpartitions;
 	int			npartitions;
 	int			partition_bits;
-	int			hash_mem = get_hash_mem();
 
 	/*
 	 * Avoid creating so many partitions that the memory requirements of the
 	 * open partition files are greater than 1/4 of hash_mem.
 	 */
 	partition_limit =
-		(hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
+		(hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
 		HASHAGG_WRITE_BUFFER_SIZE;
 
 	mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
 
 	/* make enough partitions so that each one is likely to fit in memory */
-	npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
+	dpartitions = 1 + (mem_wanted / hash_mem_limit);
+
+	if (dpartitions > partition_limit)
+		dpartitions = partition_limit;
 
-	if (npartitions > partition_limit)
-		npartitions = partition_limit;
+	if (dpartitions < HASHAGG_MIN_PARTITIONS)
+		dpartitions = HASHAGG_MIN_PARTITIONS;
+	if (dpartitions > HASHAGG_MAX_PARTITIONS)
+		dpartitions = HASHAGG_MAX_PARTITIONS;
 
-	if (npartitions < HASHAGG_MIN_PARTITIONS)
-		npartitions = HASHAGG_MIN_PARTITIONS;
-	if (npartitions > HASHAGG_MAX_PARTITIONS)
-		npartitions = HASHAGG_MAX_PARTITIONS;
+	/* HASHAGG_MAX_PARTITIONS limit makes this safe */
+	npartitions = (int) dpartitions;
 
 	/* ceil(log2(npartitions)) */
 	partition_bits = my_log2(npartitions);
@@ -2030,7 +2034,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
 		*log2_npartitions = partition_bits;
 
 	/* number of partitions will be a power of two */
-	npartitions = 1L << partition_bits;
+	npartitions = 1 << partition_bits;
 
 	return npartitions;
 }
author	Tom Lane <tgl@sss.pgh.pa.us>	2021-07-25 14:02:27 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2021-07-25 14:02:27 -0400
commit	28d936031a86d94806c6604480ff3f3f169b371c (patch)
tree	5484c0b625433483dbff26b9da48881c8e3426c0 /src/backend/executor/nodeAgg.c
parent	d9d8aa9bb9aa2a850f53facd2c91e454129af57b (diff)
download	postgresql-28d936031a86d94806c6604480ff3f3f169b371c.tar.gz postgresql-28d936031a86d94806c6604480ff3f3f169b371c.zip