diff options
Diffstat (limited to 'src/backend/executor')
-rw-r--r-- | src/backend/executor/execGrouping.c | 6 | ||||
-rw-r--r-- | src/backend/executor/nodeAgg.c | 42 | ||||
-rw-r--r-- | src/backend/executor/nodeHash.c | 126 | ||||
-rw-r--r-- | src/backend/executor/nodeMemoize.c | 2 |
4 files changed, 92 insertions, 84 deletions
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 5fd0b26cbc1..c11427a1f66 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -165,14 +165,16 @@ BuildTupleHashTableExt(PlanState *parent,
 {
 	TupleHashTable hashtable;
 	Size entrysize = sizeof(TupleHashEntryData) + additionalsize;
-	int hash_mem = get_hash_mem();
+	Size hash_mem_limit;
 	MemoryContext oldcontext;
 	bool allow_jit;
 
 	Assert(nbuckets > 0);
 
 	/* Limit initial table size request to not more than hash_mem */
-	nbuckets = Min(nbuckets, (long) ((hash_mem * 1024L) / entrysize));
+	hash_mem_limit = get_hash_memory_limit() / entrysize;
+	if (nbuckets > hash_mem_limit)
+		nbuckets = hash_mem_limit;
 
 	oldcontext = MemoryContextSwitchTo(metacxt);
 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 914b02ceee4..39bea204d16 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1802,15 +1802,15 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 {
 	int npartitions;
 	Size partition_mem;
-	int hash_mem = get_hash_mem();
+	Size hash_mem_limit = get_hash_memory_limit();
 
 	/* if not expected to spill, use all of hash_mem */
-	if (input_groups * hashentrysize < hash_mem * 1024L)
+	if (input_groups * hashentrysize <= hash_mem_limit)
 	{
 		if (num_partitions != NULL)
 			*num_partitions = 0;
-		*mem_limit = hash_mem * 1024L;
-		*ngroups_limit = *mem_limit / hashentrysize;
+		*mem_limit = hash_mem_limit;
+		*ngroups_limit = hash_mem_limit / hashentrysize;
 		return;
 	}
 
@@ -1835,10 +1835,10 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 	 * minimum number of partitions, so we aren't going to dramatically exceed
 	 * work mem anyway.
 	 */
-	if (hash_mem * 1024L > 4 * partition_mem)
-		*mem_limit = hash_mem * 1024L - partition_mem;
+	if (hash_mem_limit > 4 * partition_mem)
+		*mem_limit = hash_mem_limit - partition_mem;
 	else
-		*mem_limit = hash_mem * 1024L * 0.75;
+		*mem_limit = hash_mem_limit * 0.75;
 
 	if (*mem_limit > hashentrysize)
 		*ngroups_limit = *mem_limit / hashentrysize;
@@ -1992,32 +1992,36 @@ static int
 hash_choose_num_partitions(double input_groups, double hashentrysize,
 						   int used_bits, int *log2_npartitions)
 {
-	Size mem_wanted;
-	int partition_limit;
+	Size hash_mem_limit = get_hash_memory_limit();
+	double partition_limit;
+	double mem_wanted;
+	double dpartitions;
 	int npartitions;
 	int partition_bits;
-	int hash_mem = get_hash_mem();
 
 	/*
 	 * Avoid creating so many partitions that the memory requirements of the
 	 * open partition files are greater than 1/4 of hash_mem.
 	 */
 	partition_limit =
-		(hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
+		(hash_mem_limit * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
 		HASHAGG_WRITE_BUFFER_SIZE;
 
 	mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
 
 	/* make enough partitions so that each one is likely to fit in memory */
-	npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
+	dpartitions = 1 + (mem_wanted / hash_mem_limit);
+
+	if (dpartitions > partition_limit)
+		dpartitions = partition_limit;
 
-	if (npartitions > partition_limit)
-		npartitions = partition_limit;
+	if (dpartitions < HASHAGG_MIN_PARTITIONS)
+		dpartitions = HASHAGG_MIN_PARTITIONS;
+	if (dpartitions > HASHAGG_MAX_PARTITIONS)
+		dpartitions = HASHAGG_MAX_PARTITIONS;
 
-	if (npartitions < HASHAGG_MIN_PARTITIONS)
-		npartitions = HASHAGG_MIN_PARTITIONS;
-	if (npartitions > HASHAGG_MAX_PARTITIONS)
-		npartitions = HASHAGG_MAX_PARTITIONS;
+	/* HASHAGG_MAX_PARTITIONS limit makes this safe */
+	npartitions = (int) dpartitions;
 
 	/* ceil(log2(npartitions)) */
 	partition_bits = my_log2(npartitions);
@@ -2030,7 +2034,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
 	*log2_npartitions = partition_bits;
 
 	/* number of partitions will be a power of two */
-	npartitions = 1L << partition_bits;
+	npartitions = 1 << partition_bits;
 
 	return npartitions;
 }
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index c5f2d1d22b1..73eb074cbf9 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -675,15 +675,12 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 {
 	int tupsize;
 	double inner_rel_bytes;
-	long bucket_bytes;
-	long hash_table_bytes;
-	long skew_table_bytes;
-	long max_pointers;
-	long mppow2;
+	size_t hash_table_bytes;
+	size_t bucket_bytes;
+	size_t max_pointers;
 	int nbatch = 1;
 	int nbuckets;
 	double dbuckets;
-	int hash_mem = get_hash_mem();
 
 	/* Force a plausible relation size if no info */
 	if (ntuples <= 0.0)
@@ -700,9 +697,9 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	inner_rel_bytes = ntuples * tupsize;
 
 	/*
-	 * Target in-memory hashtable size is hash_mem kilobytes.
+	 * Compute in-memory hashtable size limit from GUCs.
 	 */
-	hash_table_bytes = hash_mem * 1024L;
+	hash_table_bytes = get_hash_memory_limit();
 
 	/*
 	 * Parallel Hash tries to use the combined hash_mem of all workers to
@@ -710,7 +707,14 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	 * per worker and tries to process batches in parallel.
 	 */
 	if (try_combined_hash_mem)
-		hash_table_bytes += hash_table_bytes * parallel_workers;
+	{
+		/* Careful, this could overflow size_t */
+		double newlimit;
+
+		newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1);
+		newlimit = Min(newlimit, (double) SIZE_MAX);
+		hash_table_bytes = (size_t) newlimit;
+	}
 
 	*space_allowed = hash_table_bytes;
 
@@ -730,9 +734,12 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	 */
 	if (useskew)
 	{
-		skew_table_bytes = hash_table_bytes * SKEW_HASH_MEM_PERCENT / 100;
+		size_t bytes_per_mcv;
+		size_t skew_mcvs;
 
 		/*----------
+		 * Compute number of MCVs we could hold in hash_table_bytes
+		 *
 		 * Divisor is:
 		 * size of a hash tuple +
 		 * worst-case size of skewBucket[] per MCV +
@@ -740,12 +747,26 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		 * size of skew bucket struct itself
 		 *----------
 		 */
-		*num_skew_mcvs = skew_table_bytes / (tupsize +
-											 (8 * sizeof(HashSkewBucket *)) +
-											 sizeof(int) +
-											 SKEW_BUCKET_OVERHEAD);
-		if (*num_skew_mcvs > 0)
-			hash_table_bytes -= skew_table_bytes;
+		bytes_per_mcv = tupsize +
+			(8 * sizeof(HashSkewBucket *)) +
+			sizeof(int) +
+			SKEW_BUCKET_OVERHEAD;
+		skew_mcvs = hash_table_bytes / bytes_per_mcv;
+
+		/*
+		 * Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as
+		 * not to worry about size_t overflow in the multiplication)
+		 */
+		skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100;
+
+		/* Now clamp to integer range */
+		skew_mcvs = Min(skew_mcvs, INT_MAX);
+
+		*num_skew_mcvs = (int) skew_mcvs;
+
+		/* Reduce hash_table_bytes by the amount needed for the skew table */
+		if (skew_mcvs > 0)
+			hash_table_bytes -= skew_mcvs * bytes_per_mcv;
 	}
 	else
 		*num_skew_mcvs = 0;
@@ -753,22 +774,20 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	/*
 	 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
 	 * memory is filled, assuming a single batch; but limit the value so that
-	 * the pointer arrays we'll try to allocate do not exceed hash_mem nor
-	 * MaxAllocSize.
+	 * the pointer arrays we'll try to allocate do not exceed hash_table_bytes
+	 * nor MaxAllocSize.
 	 *
 	 * Note that both nbuckets and nbatch must be powers of 2 to make
 	 * ExecHashGetBucketAndBatch fast.
 	 */
-	max_pointers = *space_allowed / sizeof(HashJoinTuple);
+	max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
 	max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
 	/* If max_pointers isn't a power of 2, must round it down to one */
-	mppow2 = 1L << my_log2(max_pointers);
-	if (max_pointers != mppow2)
-		max_pointers = mppow2 / 2;
+	max_pointers = pg_prevpower2_size_t(max_pointers);
 
 	/* Also ensure we avoid integer overflow in nbatch and nbuckets */
 	/* (this step is redundant given the current value of MaxAllocSize) */
-	max_pointers = Min(max_pointers, INT_MAX / 2);
+	max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
 
 	dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
 	dbuckets = Min(dbuckets, max_pointers);
@@ -776,7 +795,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	/* don't let nbuckets be really small, though ... */
 	nbuckets = Max(nbuckets, 1024);
 	/* ... and force it to be a power of 2. */
-	nbuckets = 1 << my_log2(nbuckets);
+	nbuckets = pg_nextpower2_32(nbuckets);
 
 	/*
 	 * If there's not enough space to store the projected number of tuples and
@@ -786,10 +805,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
 	{
 		/* We'll need multiple batches */
-		long lbuckets;
+		size_t sbuckets;
 		double dbatch;
 		int minbatch;
-		long bucket_size;
+		size_t bucket_size;
 
 		/*
 		 * If Parallel Hash with combined hash_mem would still need multiple
@@ -813,10 +832,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		 * overhead for the hash code, pointer to the next tuple, etc.
 		 */
 		bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
-		lbuckets = 1L << my_log2(hash_table_bytes / bucket_size);
-		lbuckets = Min(lbuckets, max_pointers);
-		nbuckets = (int) lbuckets;
-		nbuckets = 1 << my_log2(nbuckets);
+		sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size);
+		sbuckets = Min(sbuckets, max_pointers);
+		nbuckets = (int) sbuckets;
+		nbuckets = pg_nextpower2_32(nbuckets);
 		bucket_bytes = nbuckets * sizeof(HashJoinTuple);
 
 		/*
@@ -1097,14 +1116,12 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
 					/* Figure out how many batches to use. */
 					if (hashtable->nbatch == 1)
 					{
-						int hash_mem = get_hash_mem();
-
 						/*
 						 * We are going from single-batch to multi-batch. We need
 						 * to switch from one large combined memory budget to the
 						 * regular hash_mem budget.
 						 */
-						pstate->space_allowed = hash_mem * 1024L;
+						pstate->space_allowed = get_hash_memory_limit();
 
 						/*
 						 * The combined hash_mem of all participants wasn't
@@ -1113,7 +1130,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
 						 * insufficient. So try two batches per participant,
 						 * rounded up to a power of two.
 						 */
-						new_nbatch = 1 << my_log2(pstate->nparticipants * 2);
+						new_nbatch = pg_nextpower2_32(pstate->nparticipants * 2);
 					}
 					else
 					{
@@ -1152,7 +1169,7 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
 											MaxAllocSize / sizeof(dsa_pointer_atomic));
 						new_nbuckets = (int) dbuckets;
 						new_nbuckets = Max(new_nbuckets, 1024);
-						new_nbuckets = 1 << my_log2(new_nbuckets);
+						new_nbuckets = pg_nextpower2_32(new_nbuckets);
 						dsa_free(hashtable->area, old_batch0->buckets);
 						hashtable->batches[0].shared->buckets =
 							dsa_allocate(hashtable->area,
@@ -3372,39 +3389,24 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
 }
 
 /*
- * Get a hash_mem value by multiplying the work_mem GUC's value by the
- * hash_mem_multiplier GUC's value.
+ * Calculate the limit on how much memory can be used by Hash and similar
+ * plan types. This is work_mem times hash_mem_multiplier, and is
+ * expressed in bytes.
  *
- * Returns a work_mem style KB value that hash-based nodes (including but not
- * limited to hash join) use in place of work_mem. This is subject to the
- * same restrictions as work_mem itself. (There is no such thing as the
- * hash_mem GUC, but it's convenient for our callers to pretend that there
- * is.)
- *
- * Exported for use by the planner, as well as other hash-based executor
+ * Exported for use by the planner, as well as other hash-like executor
  * nodes. This is a rather random place for this, but there is no better
  * place.
  */
-int
-get_hash_mem(void)
+size_t
+get_hash_memory_limit(void)
 {
-	double hash_mem;
+	double mem_limit;
 
-	Assert(hash_mem_multiplier >= 1.0);
+	/* Do initial calculation in double arithmetic */
+	mem_limit = (double) work_mem * hash_mem_multiplier * 1024.0;
 
-	hash_mem = (double) work_mem * hash_mem_multiplier;
-
-	/*
-	 * guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
-	 * support the assumption that raw derived byte values can be stored in
-	 * 'long' variables. The returned hash_mem value must also meet this
-	 * assumption.
-	 *
-	 * We clamp the final value rather than throw an error because it should
-	 * be possible to set work_mem and hash_mem_multiplier independently.
-	 */
-	if (hash_mem < MAX_KILOBYTES)
-		return (int) hash_mem;
+	/* Clamp in case it doesn't fit in size_t */
+	mem_limit = Min(mem_limit, (double) SIZE_MAX);
 
-	return MAX_KILOBYTES;
+	return (size_t) mem_limit;
 }
diff --git a/src/backend/executor/nodeMemoize.c b/src/backend/executor/nodeMemoize.c
index 2fde4ebce69..bec588b3a04 100644
--- a/src/backend/executor/nodeMemoize.c
+++ b/src/backend/executor/nodeMemoize.c
@@ -905,7 +905,7 @@ ExecInitMemoize(Memoize *node, EState *estate, int eflags)
 	mstate->mem_used = 0;
 
 	/* Limit the total memory consumed by the cache to this */
-	mstate->mem_limit = get_hash_mem() * 1024L;
+	mstate->mem_limit = get_hash_memory_limit();
 
 	/* A memory context dedicated for the cache */
 	mstate->tableContext = AllocSetContextCreate(CurrentMemoryContext,