aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobert Haas <rhaas@postgresql.org>2016-12-16 10:03:08 -0500
committerRobert Haas <rhaas@postgresql.org>2016-12-16 10:03:08 -0500
commitb81b5a96f424531b97cdd1dba97d9d1b9c9d372e (patch)
tree9f2a42bcb7413185777dfae93163794838d47211
parent6a4fe1127c5a0ea1515589e416aa29e088170c0e (diff)
downloadpostgresql-b81b5a96f424531b97cdd1dba97d9d1b9c9d372e.tar.gz
postgresql-b81b5a96f424531b97cdd1dba97d9d1b9c9d372e.zip
Unbreak Finalize HashAggregate over Partial HashAggregate.
Commit 5dfc198146b49ce7ecc8a1fc9d5e171fb75f6ba5 introduced the use of a new type of hash table with linear reprobing for hash aggregates. Such a hash table behaves very poorly if keys are inserted in hash order, which does in fact happen in the case where a query uses a Finalize HashAggregate node fed (via Gather) by a Partial HashAggregate node. In fact, queries with this type of plan tend to run effectively forever. Fix that by seeding the hash value differently in each worker (and in the leader, if it participates). Andres Freund and Robert Haas
-rw-r--r--src/backend/executor/execGrouping.c20
-rw-r--r--src/backend/executor/nodeAgg.c3
-rw-r--r--src/backend/executor/nodeRecursiveunion.c3
-rw-r--r--src/backend/executor/nodeSetOp.c3
-rw-r--r--src/backend/executor/nodeSubplan.c6
-rw-r--r--src/include/executor/executor.h2
-rw-r--r--src/include/nodes/execnodes.h1
7 files changed, 30 insertions, 8 deletions
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 94cc59de7e1..3149fbe975c 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -18,6 +18,8 @@
*/
#include "postgres.h"
+#include "access/hash.h"
+#include "access/parallel.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "utils/lsyscache.h"
@@ -289,7 +291,8 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
FmgrInfo *eqfunctions,
FmgrInfo *hashfunctions,
long nbuckets, Size additionalsize,
- MemoryContext tablecxt, MemoryContext tempcxt)
+ MemoryContext tablecxt, MemoryContext tempcxt,
+ bool use_variable_hash_iv)
{
TupleHashTable hashtable;
Size entrysize = sizeof(TupleHashEntryData) + additionalsize;
@@ -314,6 +317,19 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
hashtable->in_hash_funcs = NULL;
hashtable->cur_eq_funcs = NULL;
+ /*
+ * If parallelism is in use, even if the master backend is performing the
+ * scan itself, we don't want to create the hashtable exactly the same way
+ * in all workers. As hashtables are iterated over in keyspace-order,
+ * doing so in all processes in the same way is likely to lead to
+ * "unbalanced" hashtables when the table size initially is
+ * underestimated.
+ */
+ if (use_variable_hash_iv)
+ hashtable->hash_iv = hash_uint32(ParallelWorkerNumber);
+ else
+ hashtable->hash_iv = 0;
+
hashtable->hashtab = tuplehash_create(tablecxt, nbuckets);
hashtable->hashtab->private_data = hashtable;
@@ -450,7 +466,7 @@ TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
TupleHashTable hashtable = (TupleHashTable) tb->private_data;
int numCols = hashtable->numCols;
AttrNumber *keyColIdx = hashtable->keyColIdx;
- uint32 hashkey = 0;
+ uint32 hashkey = hashtable->hash_iv;
TupleTableSlot *slot;
FmgrInfo *hashfunctions;
int i;
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index eefb3d678c6..a093862f34b 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1723,7 +1723,8 @@ build_hash_table(AggState *aggstate)
node->numGroups,
additionalsize,
aggstate->aggcontexts[0]->ecxt_per_tuple_memory,
- tmpmem);
+ tmpmem,
+ !DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
}
/*
diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c
index acded079e24..5b734c05050 100644
--- a/src/backend/executor/nodeRecursiveunion.c
+++ b/src/backend/executor/nodeRecursiveunion.c
@@ -43,7 +43,8 @@ build_hash_table(RecursiveUnionState *rustate)
node->numGroups,
0,
rustate->tableContext,
- rustate->tempContext);
+ rustate->tempContext,
+ false);
}
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index e94555ead89..760b93521a6 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -130,7 +130,8 @@ build_hash_table(SetOpState *setopstate)
node->numGroups,
0,
setopstate->tableContext,
- setopstate->tempContext);
+ setopstate->tempContext,
+ false);
}
/*
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 8ca8fc460ca..d3436000d0f 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -510,7 +510,8 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
nbuckets,
0,
node->hashtablecxt,
- node->hashtempcxt);
+ node->hashtempcxt,
+ false);
if (!subplan->unknownEqFalse)
{
@@ -529,7 +530,8 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
nbuckets,
0,
node->hashtablecxt,
- node->hashtempcxt);
+ node->hashtempcxt,
+ false);
}
/*
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index b4d09f95640..3f649faf2fe 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -143,7 +143,7 @@ extern TupleHashTable BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
FmgrInfo *hashfunctions,
long nbuckets, Size additionalsize,
MemoryContext tablecxt,
- MemoryContext tempcxt);
+ MemoryContext tempcxt, bool use_variable_hash_iv);
extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
TupleTableSlot *slot,
bool *isnew);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 1de5c8196d7..703604ab9d7 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -533,6 +533,7 @@ typedef struct TupleHashTableData
TupleTableSlot *inputslot; /* current input tuple's slot */
FmgrInfo *in_hash_funcs; /* hash functions for input datatype(s) */
FmgrInfo *cur_eq_funcs; /* equality functions for input vs. table */
+ uint32 hash_iv; /* hash-function IV */
} TupleHashTableData;
typedef tuplehash_iterator TupleHashIterator;