Diffstat (limited to 'src')
-rw-r--r-- | src/backend/executor/execGrouping.c | 5
-rw-r--r-- | src/backend/executor/nodeAgg.c | 30
-rw-r--r-- | src/backend/executor/nodeHash.c | 80
-rw-r--r-- | src/backend/executor/nodeHashjoin.c | 4
-rw-r--r-- | src/backend/optimizer/path/costsize.c | 12
-rw-r--r-- | src/backend/optimizer/plan/planner.c | 15
-rw-r--r-- | src/backend/optimizer/plan/subselect.c | 9
-rw-r--r-- | src/backend/optimizer/prep/prepunion.c | 9
-rw-r--r-- | src/backend/optimizer/util/pathnode.c | 3
-rw-r--r-- | src/backend/utils/adt/ri_triggers.c | 18
-rw-r--r-- | src/backend/utils/init/globals.c | 1
-rw-r--r-- | src/backend/utils/misc/guc.c | 11
-rw-r--r-- | src/backend/utils/misc/postgresql.conf.sample | 1
-rw-r--r-- | src/include/executor/hashjoin.h | 4
-rw-r--r-- | src/include/executor/nodeHash.h | 2
-rw-r--r-- | src/include/miscadmin.h | 4
16 files changed, 144 insertions, 64 deletions
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 321f427e478..90d04f9228a 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -165,13 +165,14 @@ BuildTupleHashTableExt(PlanState *parent,
 {
 	TupleHashTable hashtable;
 	Size		entrysize = sizeof(TupleHashEntryData) + additionalsize;
+	int			hash_mem = get_hash_mem();
 	MemoryContext oldcontext;
 	bool		allow_jit;
 
 	Assert(nbuckets > 0);
 
-	/* Limit initial table size request to not more than work_mem */
-	nbuckets = Min(nbuckets, (long) ((work_mem * 1024L) / entrysize));
+	/* Limit initial table size request to not more than hash_mem */
+	nbuckets = Min(nbuckets, (long) ((hash_mem * 1024L) / entrysize));
 
 	oldcontext = MemoryContextSwitchTo(metacxt);
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 02a9165c694..9776263ae75 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -203,7 +203,7 @@
  * entries (and initialize new transition states), we instead spill them to
  * disk to be processed later. The tuples are spilled in a partitioned
  * manner, so that subsequent batches are smaller and less likely to exceed
- * work_mem (if a batch does exceed work_mem, it must be spilled
+ * hash_mem (if a batch does exceed hash_mem, it must be spilled
  * recursively).
 *
 * Spilled data is written to logical tapes. These provide better control
@@ -212,7 +212,7 @@
 *
 * Note that it's possible for transition states to start small but then
 * grow very large; for instance in the case of ARRAY_AGG. In such cases,
- * it's still possible to significantly exceed work_mem. We try to avoid
+ * it's still possible to significantly exceed hash_mem. We try to avoid
 * this situation by estimating what will fit in the available memory, and
 * imposing a limit on the number of groups separately from the amount of
 * memory consumed.
@@ -1516,7 +1516,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 
 	/*
 	 * Used to make sure initial hash table allocation does not exceed
-	 * work_mem. Note that the estimate does not include space for
+	 * hash_mem. Note that the estimate does not include space for
 	 * pass-by-reference transition data values, nor for the representative
 	 * tuple of each group.
 	 */
@@ -1782,7 +1782,7 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
 }
 
 /*
- * Set limits that trigger spilling to avoid exceeding work_mem. Consider the
+ * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the
 * number of partitions we expect to create (if we do spill).
 *
 * There are two limits: a memory limit, and also an ngroups limit. The
@@ -1796,13 +1796,14 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 {
 	int			npartitions;
 	Size		partition_mem;
+	int			hash_mem = get_hash_mem();
 
-	/* if not expected to spill, use all of work_mem */
-	if (input_groups * hashentrysize < work_mem * 1024L)
+	/* if not expected to spill, use all of hash_mem */
+	if (input_groups * hashentrysize < hash_mem * 1024L)
 	{
 		if (num_partitions != NULL)
 			*num_partitions = 0;
-		*mem_limit = work_mem * 1024L;
+		*mem_limit = hash_mem * 1024L;
 		*ngroups_limit = *mem_limit / hashentrysize;
 		return;
 	}
@@ -1824,14 +1825,14 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 		HASHAGG_WRITE_BUFFER_SIZE * npartitions;
 
 	/*
-	 * Don't set the limit below 3/4 of work_mem. In that case, we are at the
+	 * Don't set the limit below 3/4 of hash_mem. In that case, we are at the
 	 * minimum number of partitions, so we aren't going to dramatically exceed
 	 * work mem anyway.
 	 */
-	if (work_mem * 1024L > 4 * partition_mem)
-		*mem_limit = work_mem * 1024L - partition_mem;
+	if (hash_mem * 1024L > 4 * partition_mem)
+		*mem_limit = hash_mem * 1024L - partition_mem;
 	else
-		*mem_limit = work_mem * 1024L * 0.75;
+		*mem_limit = hash_mem * 1024L * 0.75;
 
 	if (*mem_limit > hashentrysize)
 		*ngroups_limit = *mem_limit / hashentrysize;
@@ -1989,19 +1990,20 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
 	int			partition_limit;
 	int			npartitions;
 	int			partition_bits;
+	int			hash_mem = get_hash_mem();
 
 	/*
 	 * Avoid creating so many partitions that the memory requirements of the
-	 * open partition files are greater than 1/4 of work_mem.
+	 * open partition files are greater than 1/4 of hash_mem.
 	 */
 	partition_limit =
-		(work_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
+		(hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
 		HASHAGG_WRITE_BUFFER_SIZE;
 
 	mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
 
 	/* make enough partitions so that each one is likely to fit in memory */
-	npartitions = 1 + (mem_wanted / (work_mem * 1024L));
+	npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
 
 	if (npartitions > partition_limit)
 		npartitions = partition_limit;
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 45b342011fe..ea69eeb2a1e 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
 #include "port/atomics.h"
 #include "port/pg_bitutils.h"
 #include "utils/dynahash.h"
+#include "utils/guc.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
@@ -506,7 +507,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 	hashtable->spaceAllowed = space_allowed;
 	hashtable->spaceUsedSkew = 0;
 	hashtable->spaceAllowedSkew =
-		hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
+		hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100;
 	hashtable->chunks = NULL;
 	hashtable->current_chunk = NULL;
 	hashtable->parallel_state = state->parallel_state;
@@ -665,7 +666,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 void
 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
-						bool try_combined_work_mem,
+						bool try_combined_hash_mem,
 						int parallel_workers,
 						size_t *space_allowed,
 						int *numbuckets,
@@ -682,6 +683,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	int			nbatch = 1;
 	int			nbuckets;
 	double		dbuckets;
+	int			hash_mem = get_hash_mem();
 
 	/* Force a plausible relation size if no info */
 	if (ntuples <= 0.0)
@@ -698,16 +700,16 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	inner_rel_bytes = ntuples * tupsize;
 
 	/*
-	 * Target in-memory hashtable size is work_mem kilobytes.
+	 * Target in-memory hashtable size is hash_mem kilobytes.
 	 */
-	hash_table_bytes = work_mem * 1024L;
+	hash_table_bytes = hash_mem * 1024L;
 
 	/*
-	 * Parallel Hash tries to use the combined work_mem of all workers to
-	 * avoid the need to batch. If that won't work, it falls back to work_mem
+	 * Parallel Hash tries to use the combined hash_mem of all workers to
+	 * avoid the need to batch. If that won't work, it falls back to hash_mem
 	 * per worker and tries to process batches in parallel.
 	 */
-	if (try_combined_work_mem)
+	if (try_combined_hash_mem)
 		hash_table_bytes += hash_table_bytes * parallel_workers;
 
 	*space_allowed = hash_table_bytes;
@@ -728,7 +730,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	 */
 	if (useskew)
 	{
-		skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
+		skew_table_bytes = hash_table_bytes * SKEW_HASH_MEM_PERCENT / 100;
 
 		/*----------
 		 * Divisor is:
@@ -751,7 +753,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	/*
 	 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
 	 * memory is filled, assuming a single batch; but limit the value so that
-	 * the pointer arrays we'll try to allocate do not exceed work_mem nor
+	 * the pointer arrays we'll try to allocate do not exceed hash_mem nor
 	 * MaxAllocSize.
 	 *
 	 * Note that both nbuckets and nbatch must be powers of 2 to make
@@ -790,10 +792,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		long		bucket_size;
 
 		/*
-		 * If Parallel Hash with combined work_mem would still need multiple
-		 * batches, we'll have to fall back to regular work_mem budget.
+		 * If Parallel Hash with combined hash_mem would still need multiple
+		 * batches, we'll have to fall back to regular hash_mem budget.
 		 */
-		if (try_combined_work_mem)
+		if (try_combined_hash_mem)
 		{
 			ExecChooseHashTableSize(ntuples, tupwidth, useskew, false,
 									parallel_workers,
@@ -805,7 +807,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		}
 
 		/*
-		 * Estimate the number of buckets we'll want to have when work_mem is
+		 * Estimate the number of buckets we'll want to have when hash_mem is
 		 * entirely full. Each bucket will contain a bucket pointer plus
 		 * NTUP_PER_BUCKET tuples, whose projected size already includes
 		 * overhead for the hash code, pointer to the next tuple, etc.
@@ -820,8 +822,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		/*
 		 * Buckets are simple pointers to hashjoin tuples, while tupsize
 		 * includes the pointer, hash code, and MinimalTupleData. So buckets
-		 * should never really exceed 25% of work_mem (even for
-		 * NTUP_PER_BUCKET=1); except maybe for work_mem values that are not
+		 * should never really exceed 25% of hash_mem (even for
+		 * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not
 		 * 2^N bytes, where we might get more because of doubling. So let's
 		 * look for 50% here.
 		 */
@@ -1095,15 +1097,17 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
 				/* Figure out how many batches to use. */
 				if (hashtable->nbatch == 1)
 				{
+					int			hash_mem = get_hash_mem();
+
 					/*
 					 * We are going from single-batch to multi-batch. We need
 					 * to switch from one large combined memory budget to the
-					 * regular work_mem budget.
+					 * regular hash_mem budget.
 					 */
-					pstate->space_allowed = work_mem * 1024L;
+					pstate->space_allowed = hash_mem * 1024L;
 
 					/*
-					 * The combined work_mem of all participants wasn't
+					 * The combined hash_mem of all participants wasn't
 					 * enough. Therefore one batch per participant would be
 					 * approximately equivalent and would probably also be
 					 * insufficient. So try two batches per participant,
@@ -2855,7 +2859,7 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
 
 	/*
 	 * Check if our space limit would be exceeded. To avoid choking on
-	 * very large tuples or very low work_mem setting, we'll always allow
+	 * very large tuples or very low hash_mem setting, we'll always allow
 	 * each backend to allocate at least one chunk.
 	 */
 	if (hashtable->batches[0].at_least_one_chunk &&
@@ -3366,3 +3370,41 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
 
 	return true;
 }
+
+/*
+ * Get a hash_mem value by multiplying the work_mem GUC's value by the
+ * hash_mem_multiplier GUC's value.
+ *
+ * Returns a work_mem style KB value that hash-based nodes (including but not
+ * limited to hash join) use in place of work_mem. This is subject to the
+ * same restrictions as work_mem itself. (There is no such thing as the
+ * hash_mem GUC, but it's convenient for our callers to pretend that there
+ * is.)
+ *
+ * Exported for use by the planner, as well as other hash-based executor
+ * nodes. This is a rather random place for this, but there is no better
+ * place.
+ */
+int
+get_hash_mem(void)
+{
+	double		hash_mem;
+
+	Assert(hash_mem_multiplier >= 1.0);
+
+	hash_mem = (double) work_mem * hash_mem_multiplier;
+
+	/*
+	 * guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
+	 * support the assumption that raw derived byte values can be stored in
+	 * 'long' variables. The returned hash_mem value must also meet this
+	 * assumption.
+	 *
+	 * We clamp the final value rather than throw an error because it should
+	 * be possible to set work_mem and hash_mem_multiplier independently.
+	 */
+	if (hash_mem < MAX_KILOBYTES)
+		return (int) hash_mem;
+
+	return MAX_KILOBYTES;
+}
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 9bb23fef1a6..5532b91a71d 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -89,9 +89,9 @@
 *  PHJ_BUILD_HASHING_INNER so we can skip loading.
 *
 * Initially we try to plan for a single-batch hash join using the combined
- * work_mem of all participants to create a large shared hash table. If that
+ * hash_mem of all participants to create a large shared hash table. If that
 * turns out either at planning or execution time to be impossible then we
- * fall back to regular work_mem sized hash tables.
+ * fall back to regular hash_mem sized hash tables.
 *
 * To avoid deadlocks, we never wait for any barrier unless it is known that
 * all other backends attached to it are actively executing the node or have
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 27ce4cc8069..fda4b2c6e87 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -3525,7 +3525,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	 * Get hash table size that executor would use for inner relation.
 	 *
 	 * XXX for the moment, always assume that skew optimization will be
-	 * performed. As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
+	 * performed. As long as SKEW_HASH_MEM_PERCENT is small, it's not worth
 	 * trying to determine that for sure.
 	 *
 	 * XXX at some point it might be interesting to try to account for skew
@@ -3534,7 +3534,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	ExecChooseHashTableSize(inner_path_rows_total,
 							inner_path->pathtarget->width,
 							true,	/* useskew */
-							parallel_hash,	/* try_combined_work_mem */
+							parallel_hash,	/* try_combined_hash_mem */
 							outer_path->parallel_workers,
 							&space_allowed,
 							&numbuckets,
@@ -3597,6 +3597,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 	Cost		run_cost = workspace->run_cost;
 	int			numbuckets = workspace->numbuckets;
 	int			numbatches = workspace->numbatches;
+	int			hash_mem;
 	Cost		cpu_per_tuple;
 	QualCost	hash_qual_cost;
 	QualCost	qp_qual_cost;
@@ -3715,16 +3716,17 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 	}
 
 	/*
-	 * If the bucket holding the inner MCV would exceed work_mem, we don't
+	 * If the bucket holding the inner MCV would exceed hash_mem, we don't
 	 * want to hash unless there is really no other alternative, so apply
 	 * disable_cost. (The executor normally copes with excessive memory usage
 	 * by splitting batches, but obviously it cannot separate equal values
-	 * that way, so it will be unable to drive the batch size below work_mem
+	 * that way, so it will be unable to drive the batch size below hash_mem
 	 * when this is true.)
 	 */
+	hash_mem = get_hash_mem();
 	if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
 						   inner_path->pathtarget->width) >
-		(work_mem * 1024L))
+		(hash_mem * 1024L))
 		startup_cost += disable_cost;
 
 	/*
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 1345e522dcf..b40a112c25b 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4196,16 +4196,17 @@ consider_groupingsets_paths(PlannerInfo *root,
 							double dNumGroups)
 {
 	Query	   *parse = root->parse;
+	int			hash_mem = get_hash_mem();
 
 	/*
 	 * If we're not being offered sorted input, then only consider plans that
 	 * can be done entirely by hashing.
 	 *
-	 * We can hash everything if it looks like it'll fit in work_mem. But if
+	 * We can hash everything if it looks like it'll fit in hash_mem. But if
 	 * the input is actually sorted despite not being advertised as such, we
 	 * prefer to make use of that in order to use less memory.
 	 *
-	 * If none of the grouping sets are sortable, then ignore the work_mem
+	 * If none of the grouping sets are sortable, then ignore the hash_mem
 	 * limit and generate a path anyway, since otherwise we'll just fail.
 	 */
 	if (!is_sorted)
@@ -4257,10 +4258,10 @@ consider_groupingsets_paths(PlannerInfo *root,
 
 		/*
 		 * gd->rollups is empty if we have only unsortable columns to work
-		 * with. Override work_mem in that case; otherwise, we'll rely on the
+		 * with. Override hash_mem in that case; otherwise, we'll rely on the
 		 * sorted-input case to generate usable mixed paths.
 		 */
-		if (hashsize > work_mem * 1024L && gd->rollups)
+		if (hashsize > hash_mem * 1024L && gd->rollups)
 			return;				/* nope, won't fit */
 
 		/*
@@ -4379,7 +4380,7 @@ consider_groupingsets_paths(PlannerInfo *root,
 	{
 		List	   *rollups = NIL;
 		List	   *hash_sets = list_copy(gd->unsortable_sets);
-		double		availspace = (work_mem * 1024.0);
+		double		availspace = (hash_mem * 1024.0);
 		ListCell   *lc;
 
 		/*
@@ -4400,7 +4401,7 @@ consider_groupingsets_paths(PlannerInfo *root,
 		/*
 		 * We treat this as a knapsack problem: the knapsack capacity
-		 * represents work_mem, the item weights are the estimated memory
+		 * represents hash_mem, the item weights are the estimated memory
 		 * usage of the hashtables needed to implement a single rollup,
 		 * and we really ought to use the cost saving as the item value;
 		 * however, currently the costs assigned to sort nodes don't
@@ -4441,7 +4442,7 @@ consider_groupingsets_paths(PlannerInfo *root,
 												rollup->numGroups);
 
 				/*
-				 * If sz is enormous, but work_mem (and hence scale) is
+				 * If sz is enormous, but hash_mem (and hence scale) is
 				 * small, avoid integer overflow here.
 				 */
 				k_weights[i] = (int) Min(floor(sz / scale),
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index b02fcb9bfe7..9a8f738c9d0 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -200,7 +200,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
 	 * XXX If an ANY subplan is uncorrelated, build_subplan may decide to hash
 	 * its output. In that case it would've been better to specify full
 	 * retrieval. At present, however, we can only check hashability after
-	 * we've made the subplan :-(. (Determining whether it'll fit in work_mem
+	 * we've made the subplan :-(. (Determining whether it'll fit in hash_mem
 	 * is the really hard part.) Therefore, we don't want to be too
 	 * optimistic about the percentage of tuples retrieved, for fear of
 	 * selecting a plan that's bad for the materialization case.
@@ -278,7 +278,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
 
 	plan = create_plan(subroot, best_path);
 
-	/* Now we can check if it'll fit in work_mem */
+	/* Now we can check if it'll fit in hash_mem */
 	/* XXX can we check this at the Path stage? */
 	if (subplan_is_hashable(plan))
 	{
@@ -716,16 +716,17 @@ static bool
 subplan_is_hashable(Plan *plan)
 {
 	double		subquery_size;
+	int			hash_mem = get_hash_mem();
 
 	/*
-	 * The estimated size of the subquery result must fit in work_mem. (Note:
+	 * The estimated size of the subquery result must fit in hash_mem. (Note:
 	 * we use heap tuple overhead here even though the tuples will actually be
 	 * stored as MinimalTuples; this provides some fudge factor for hashtable
 	 * overhead.)
 	 */
 	subquery_size = plan->plan_rows *
 		(MAXALIGN(plan->plan_width) + MAXALIGN(SizeofHeapTupleHeader));
-	if (subquery_size > work_mem * 1024L)
+	if (subquery_size > hash_mem * 1024L)
 		return false;
 
 	return true;
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 6588f83d5ec..2ebd4ea3320 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -1018,6 +1018,7 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 					const char *construct)
 {
 	int			numGroupCols = list_length(groupClauses);
+	int			hash_mem = get_hash_mem();
 	bool		can_sort;
 	bool		can_hash;
 	Size		hashentrysize;
@@ -1049,15 +1050,17 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 
 	/*
 	 * Don't do it if it doesn't look like the hashtable will fit into
-	 * work_mem.
+	 * hash_mem.
 	 */
 	hashentrysize = MAXALIGN(input_path->pathtarget->width) + MAXALIGN(SizeofMinimalTupleHeader);
 
-	if (hashentrysize * dNumGroups > work_mem * 1024L)
+	if (hashentrysize * dNumGroups > hash_mem * 1024L)
 		return false;
 
 	/*
-	 * See if the estimated cost is no more than doing it the other way.
+	 * See if the estimated cost is no more than doing it the other way. We
+	 * deliberately give the hash case more memory when hash_mem exceeds
+	 * standard work mem (i.e. when hash_mem_multiplier exceeds 1.0).
 	 *
 	 * We need to consider input_plan + hashagg versus input_plan + sort +
 	 * group. Note that the actual result plan might involve a SetOp or
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 5110a6b8060..c1fc866cbf9 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1720,8 +1720,9 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 		 * planner.c).
 		 */
 		int			hashentrysize = subpath->pathtarget->width + 64;
+		int			hash_mem = get_hash_mem();
 
-		if (hashentrysize * pathnode->path.rows > work_mem * 1024L)
+		if (hashentrysize * pathnode->path.rows > hash_mem * 1024L)
 		{
 			/*
 			 * We should not try to hash. Hack the SpecialJoinInfo to
diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c
index bb49e80d166..06cf16d9d71 100644
--- a/src/backend/utils/adt/ri_triggers.c
+++ b/src/backend/utils/adt/ri_triggers.c
@@ -1450,7 +1450,9 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 	 * enough to not use a multiple of work_mem, and one typically would not
 	 * have many large foreign-key validations happening concurrently. So
 	 * this seems to meet the criteria for being considered a "maintenance"
-	 * operation, and accordingly we use maintenance_work_mem.
+	 * operation, and accordingly we use maintenance_work_mem. However, we
+	 * must also set hash_mem_multiplier to 1, since it is surely not okay to
+	 * let that get applied to the maintenance_work_mem value.
 	 *
 	 * We use the equivalent of a function SET option to allow the setting to
 	 * persist for exactly the duration of the check query. guc.c also takes
@@ -1462,6 +1464,9 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 	(void) set_config_option("work_mem", workmembuf,
 							 PGC_USERSET, PGC_S_SESSION,
 							 GUC_ACTION_SAVE, true, 0, false);
+	(void) set_config_option("hash_mem_multiplier", "1",
+							 PGC_USERSET, PGC_S_SESSION,
+							 GUC_ACTION_SAVE, true, 0, false);
 
 	if (SPI_connect() != SPI_OK_CONNECT)
 		elog(ERROR, "SPI_connect failed");
@@ -1553,7 +1558,7 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 		elog(ERROR, "SPI_finish failed");
 
 	/*
-	 * Restore work_mem.
+	 * Restore work_mem and hash_mem_multiplier.
 	 */
 	AtEOXact_GUC(true, save_nestlevel);
 
@@ -1685,7 +1690,9 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 	 * enough to not use a multiple of work_mem, and one typically would not
 	 * have many large foreign-key validations happening concurrently. So
 	 * this seems to meet the criteria for being considered a "maintenance"
-	 * operation, and accordingly we use maintenance_work_mem.
+	 * operation, and accordingly we use maintenance_work_mem. However, we
+	 * must also set hash_mem_multiplier to 1, since it is surely not okay to
+	 * let that get applied to the maintenance_work_mem value.
 	 *
 	 * We use the equivalent of a function SET option to allow the setting to
 	 * persist for exactly the duration of the check query. guc.c also takes
@@ -1697,6 +1704,9 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 	(void) set_config_option("work_mem", workmembuf,
 							 PGC_USERSET, PGC_S_SESSION,
 							 GUC_ACTION_SAVE, true, 0, false);
+	(void) set_config_option("hash_mem_multiplier", "1",
+							 PGC_USERSET, PGC_S_SESSION,
+							 GUC_ACTION_SAVE, true, 0, false);
 
 	if (SPI_connect() != SPI_OK_CONNECT)
 		elog(ERROR, "SPI_connect failed");
@@ -1763,7 +1773,7 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
 		elog(ERROR, "SPI_finish failed");
 
 	/*
-	 * Restore work_mem.
+	 * Restore work_mem and hash_mem_multiplier.
 	 */
 	AtEOXact_GUC(true, save_nestlevel);
 }
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 497d7c38ae6..6ab82168398 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -119,6 +119,7 @@ int			IntervalStyle = INTSTYLE_POSTGRES;
 bool		enableFsync = true;
 bool		allowSystemTableMods = false;
 int			work_mem = 4096;
+double		hash_mem_multiplier = 1.0;
 int			maintenance_work_mem = 65536;
 int			max_parallel_maintenance_workers = 2;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index abfa95a2314..c20885e97b2 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3543,6 +3543,17 @@ static struct config_real ConfigureNamesReal[] =
 	},
 
 	{
+		{"hash_mem_multiplier", PGC_USERSET, RESOURCES_MEM,
+			gettext_noop("Multiple of work_mem to use for hash tables."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&hash_mem_multiplier,
+		1.0, 1.0, 1000.0,
+		NULL, NULL, NULL
+	},
+
+	{
 		{"bgwriter_lru_multiplier", PGC_SIGHUP, RESOURCES_BGWRITER,
 			gettext_noop("Multiple of the average buffer usage to free per round."),
 			NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 5a0b8e98217..aa30291ea39 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -130,6 +130,7 @@
 # Caution: it is not advisable to set max_prepared_transactions nonzero unless
 # you actively intend to use prepared transactions.
 #work_mem = 4MB				# min 64kB
+#hash_mem_multiplier = 1.0		# 1-1000.0 multiplier on hash table work_mem
 #maintenance_work_mem = 64MB		# min 1MB
 #autovacuum_work_mem = -1		# min 1MB, or -1 to use maintenance_work_mem
 #logical_decoding_work_mem = 64MB	# min 64kB
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 79b634e8ed1..eb5daba36b0 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -88,7 +88,7 @@ typedef struct HashJoinTupleData
 * outer relation tuples with these hash values are matched against that
 * table instead of the main one. Thus, tuples with these hash values are
 * effectively handled as part of the first batch and will never go to disk.
- * The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory
+ * The skew hashtable is limited to SKEW_HASH_MEM_PERCENT of the total memory
 * allowed for the join; while building the hashtables, we decrease the number
 * of MCVs being specially treated if needed to stay under this limit.
 *
@@ -107,7 +107,7 @@ typedef struct HashSkewBucket
 #define SKEW_BUCKET_OVERHEAD	MAXALIGN(sizeof(HashSkewBucket))
 #define INVALID_SKEW_BUCKET_NO	(-1)
-#define SKEW_WORK_MEM_PERCENT	2
+#define SKEW_HASH_MEM_PERCENT	2
 #define SKEW_MIN_OUTER_FRACTION	0.01
 
 /*
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index 64d2ce693ca..2db4e2f6726 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -61,7 +61,7 @@ extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate,
 extern void ExecHashTableReset(HashJoinTable hashtable);
 extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable);
 extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
-									bool try_combined_work_mem,
+									bool try_combined_hash_mem,
 									int parallel_workers,
 									size_t *space_allowed,
 									int *numbuckets,
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 18bc8a7b904..72e33523984 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -243,6 +243,7 @@ extern PGDLLIMPORT int IntervalStyle;
 extern bool enableFsync;
 extern PGDLLIMPORT bool allowSystemTableMods;
 extern PGDLLIMPORT int work_mem;
+extern PGDLLIMPORT double hash_mem_multiplier;
 extern PGDLLIMPORT int maintenance_work_mem;
 extern PGDLLIMPORT int max_parallel_maintenance_workers;
 
@@ -469,4 +470,7 @@ extern bool has_rolreplication(Oid roleid);
 extern bool BackupInProgress(void);
 extern void CancelBackup(void);
 
+/* in executor/nodeHash.c */
+extern int	get_hash_mem(void);
+
 #endif							/* MISCADMIN_H */
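
For readers skimming the patch, the core of the change is the new get_hash_mem() helper plus the mechanical conversion of hash-based call sites from work_mem to hash_mem. The standalone sketch below mirrors that arithmetic outside the backend so it can be compiled and run on its own; the GUC stand-in variables, the example multiplier value, and the simplified MAX_KILOBYTES definition are illustrative assumptions, not the backend's actual definitions (which live in globals.c and guc.h).

#include <stdio.h>
#include <limits.h>

/* Illustrative stand-ins for the GUC variables touched by this patch. */
static int work_mem = 4096;              /* KB, matching the guc.c default */
static double hash_mem_multiplier = 2.0; /* example setting, not the 1.0 default */

/* Simplified stand-in for guc.h's MAX_KILOBYTES (real value is platform-dependent). */
#define MAX_KILOBYTES (INT_MAX / 1024)

/* Mirrors the arithmetic of get_hash_mem() in nodeHash.c: scale work_mem by
 * hash_mem_multiplier and clamp the result to MAX_KILOBYTES. */
static int
example_hash_mem(void)
{
	double hash_mem = (double) work_mem * hash_mem_multiplier;

	if (hash_mem < MAX_KILOBYTES)
		return (int) hash_mem;
	return MAX_KILOBYTES;
}

int
main(void)
{
	/* Callers in the patch convert the KB value to a byte budget, e.g.
	 * hash_table_bytes = hash_mem * 1024L in ExecChooseHashTableSize(). */
	long hash_table_bytes = (long) example_hash_mem() * 1024L;

	printf("hash_mem = %d kB, hash table budget = %ld bytes\n",
		   example_hash_mem(), hash_table_bytes);
	return 0;
}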