Improve bit perturbation in TupleHashTableHash.

The changes in b81b5a96f424531b97cdd1dba97d9d1b9c9d372e did not fully address the issue, because the bit-mixing of the IV into the final hash-key didn't prevent clustering in the input-data survive in the output data. This didn't cause a lot of problems because of the additional growth conditions added d4c62a6b623d6eef88218158e9fa3cf974c6c7e5. But as we want to rein those in due to explosive growth in some edges, this needs to be fixed. Author: Andres Freund Discussion: https://postgr.es/m/20171127185700.1470.20362@wrigleys.postgresql.org Backpatch: 10, where simplehash was introduced
author: Andres Freund <andres@anarazel.de> 2018-01-29 11:02:09 -0800
committer: Andres Freund <andres@anarazel.de> 2018-01-29 11:24:57 -0800
commit: c068f87723ca9cded1f2aceb956ede49de651690 (patch)
tree: 8f573c901557dbb7e909a491c9955e866d9052f5 /src
parent: 15be27460191a9ffb149cc98f6fbf97c369a6b1e (diff)
download: postgresql-c068f87723ca9cded1f2aceb956ede49de651690.tar.gz
postgresql-c068f87723ca9cded1f2aceb956ede49de651690.zip
3 files changed, 30 insertions, 18 deletions
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 058ee688041..8e8dbb1f205 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -23,6 +23,7 @@
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "utils/lsyscache.h"
+#include "utils/hashutils.h"
 #include "utils/memutils.h"
 
 static uint32 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple);
@@ -326,7 +327,7 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 	 * underestimated.
 	 */
 	if (use_variable_hash_iv)
-		hashtable->hash_iv = hash_uint32(ParallelWorkerNumber);
+		hashtable->hash_iv = murmurhash32(ParallelWorkerNumber);
 	else
 		hashtable->hash_iv = 0;
 
@@ -510,7 +511,13 @@ TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
 		}
 	}
 
-	return hashkey;
+	/*
+	 * The way hashes are combined above, among each other and with the IV,
+	 * doesn't lead to good bit perturbation. As the IV's goal is to lead to
+	 * achieve that, perform a round of hashing of the combined hash -
+	 * resulting in near perfect perturbation.
+	 */
+	return murmurhash32(hashkey);
 }
 
 /*
diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out
index cbfdbfd8563..d21a494a9dd 100644
--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -1183,29 +1183,33 @@ explain (costs off)
 -- simple rescan tests
 select a, b, sum(v.x)
   from (values (1),(2)) v(x), gstest_data(v.x)
- group by grouping sets (a,b);
+ group by grouping sets (a,b)
+ order by 1, 2, 3;
  a | b | sum 
 ---+---+-----
- 2 |   |   6
  1 |   |   3
+ 2 |   |   6
+   | 1 |   3
    | 2 |   3
    | 3 |   3
-   | 1 |   3
 (5 rows)
 
 explain (costs off)
   select a, b, sum(v.x)
     from (values (1),(2)) v(x), gstest_data(v.x)
-   group by grouping sets (a,b);
-                QUERY PLAN                
-------------------------------------------
- HashAggregate
-   Hash Key: gstest_data.a
-   Hash Key: gstest_data.b
-   ->  Nested Loop
-         ->  Values Scan on "*VALUES*"
-         ->  Function Scan on gstest_data
-(6 rows)
+   group by grouping sets (a,b)
+   order by 3, 1, 2;
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ Sort
+   Sort Key: (sum("*VALUES*".column1)), gstest_data.a, gstest_data.b
+   ->  HashAggregate
+         Hash Key: gstest_data.a
+         Hash Key: gstest_data.b
+         ->  Nested Loop
+               ->  Values Scan on "*VALUES*"
+               ->  Function Scan on gstest_data
+(8 rows)
 
 select *
   from (values (1),(2)) v(x),
diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql
index b28d8217c12..eb680286030 100644
--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -342,12 +342,13 @@ explain (costs off)
 
 select a, b, sum(v.x)
   from (values (1),(2)) v(x), gstest_data(v.x)
- group by grouping sets (a,b);
+ group by grouping sets (a,b)
+ order by 1, 2, 3;
 explain (costs off)
   select a, b, sum(v.x)
     from (values (1),(2)) v(x), gstest_data(v.x)
-   group by grouping sets (a,b);
-
+   group by grouping sets (a,b)
+   order by 3, 1, 2;
 select *
   from (values (1),(2)) v(x),
        lateral (select a, b, sum(v.x) from gstest_data(v.x) group by grouping sets (a,b)) s;
author	Andres Freund <andres@anarazel.de>	2018-01-29 11:02:09 -0800
committer	Andres Freund <andres@anarazel.de>	2018-01-29 11:24:57 -0800
commit	c068f87723ca9cded1f2aceb956ede49de651690 (patch)
tree	8f573c901557dbb7e909a491c9955e866d9052f5 /src
parent	15be27460191a9ffb149cc98f6fbf97c369a6b1e (diff)
download	postgresql-c068f87723ca9cded1f2aceb956ede49de651690.tar.gz postgresql-c068f87723ca9cded1f2aceb956ede49de651690.zip