path: root/src/backend/nodes/bitmapset.c
author     Tom Lane <tgl@sss.pgh.pa.us>  2007-06-01 15:33:19 +0000
committer  Tom Lane <tgl@sss.pgh.pa.us>  2007-06-01 15:33:19 +0000
commit     1f559b7d3aa411e08d2ad46a4bd7d9213945b103 (patch)
tree       df4325affc882914a2dd6c6e8a8160ffc223a057 /src/backend/nodes/bitmapset.c
parent     397d00af8f0526c78eef5999e85ce6a3d5e9e36f (diff)
Fix several hash functions that were taking chintzy shortcuts instead of
delivering a well-randomized hash value. I got religion on this after observing that performance of multi-batch hash join degrades terribly if the higher-order bits of hash values aren't random, as indeed was true for say hashes of small integer values. It's now expected and documented that hash functions should use hash_any or some comparable method to ensure that all bits of their output are about equally random.

initdb forced because this change invalidates existing hash indexes. For the same reason, this isn't back-patchable; the hash join performance problem will get a band-aid fix in the back branches.
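
The degradation the message describes is easy to reproduce outside the backend. Below is a small standalone C sketch (a hypothetical illustration, not PostgreSQL executor code; the batch-selection formula, the NBATCHES value, and the chintzy_hash name are all assumptions) that hashes small integer keys with an identity-style shortcut and picks the batch from the high-order bits of the hash, so every tuple piles into a single batch:

#include <stdio.h>
#include <stdint.h>

/*
 * Hypothetical sketch, not backend code: if a multi-batch hash join takes
 * the batch number from the high-order bits of the hash, an identity-style
 * "shortcut" hash on small integers leaves those bits constant, and every
 * tuple falls into one batch while the others stay empty.
 */
#define NBATCHES 16                     /* assume the join spilled to 16 batches */

static uint32_t
chintzy_hash(int32_t key)
{
    return (uint32_t) key;              /* shortcut: small keys => high bits all zero */
}

int
main(void)
{
    int         batch_counts[NBATCHES] = {0};
    int32_t     key;
    int         i;

    for (key = 0; key < 100000; key++)
    {
        uint32_t    h = chintzy_hash(key);
        int         batchno = h >> (32 - 4);    /* top 4 bits pick the batch (assumed) */

        batch_counts[batchno]++;
    }

    for (i = 0; i < NBATCHES; i++)
        printf("batch %2d: %d tuples\n", i, batch_counts[i]);
    return 0;                           /* all 100000 tuples land in batch 0 */
}

With hash_any-style output the top bits are as random as the rest, and the same loop spreads the tuples roughly evenly across the 16 batches.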
Diffstat (limited to 'src/backend/nodes/bitmapset.c')
-rw-r--r--  src/backend/nodes/bitmapset.c  38
1 file changed, 13 insertions, 25 deletions
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 86a9fb332c3..bc77363fbf4 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -14,13 +14,14 @@
* Copyright (c) 2003-2007, PostgreSQL Global Development Group
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/nodes/bitmapset.c,v 1.12 2007/01/05 22:19:29 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/nodes/bitmapset.c,v 1.13 2007/06/01 15:33:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "nodes/bitmapset.h"
+#include "access/hash.h"
#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
@@ -769,36 +770,23 @@ bms_first_member(Bitmapset *a)
*
* Note: we must ensure that any two bitmapsets that are bms_equal() will
* hash to the same value; in practice this means that trailing all-zero
- * words cannot affect the result. The circular-shift-and-XOR hash method
- * used here has this property, so long as we work from back to front.
- *
- * Note: you might wonder why we bother with the circular shift; at first
- * glance a straight longitudinal XOR seems as good and much simpler. The
- * reason is empirical: this gives a better distribution of hash values on
- * the bitmapsets actually generated by the planner. A common way to have
- * multiword bitmapsets is "a JOIN b JOIN c JOIN d ...", which gives rise
- * to rangetables in which base tables and JOIN nodes alternate; so
- * bitmapsets of base table RT indexes tend to use only odd-numbered or only
- * even-numbered bits. A straight longitudinal XOR would preserve this
- * property, leading to a much smaller set of possible outputs than if
- * we include a shift.
+ * words must not affect the result. Hence we strip those before applying
+ * hash_any().
*/
uint32
bms_hash_value(const Bitmapset *a)
{
- bitmapword result = 0;
- int wordnum;
+ int lastword;
- if (a == NULL || a->nwords <= 0)
+ if (a == NULL)
return 0; /* All empty sets hash to 0 */
- for (wordnum = a->nwords; --wordnum > 0;)
+ for (lastword = a->nwords; --lastword >= 0;)
{
- result ^= a->words[wordnum];
- if (result & ((bitmapword) 1 << (BITS_PER_BITMAPWORD - 1)))
- result = (result << 1) | 1;
- else
- result = (result << 1);
+ if (a->words[lastword] != 0)
+ break;
}
- result ^= a->words[0];
- return (uint32) result;
+ if (lastword < 0)
+ return 0; /* All empty sets hash to 0 */
+ return DatumGetUInt32(hash_any((const unsigned char *) a->words,
+ (lastword + 1) * sizeof(bitmapword)));
}
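
For reference, the invariant the rewritten bms_hash_value() keeps can be demonstrated in isolation. The sketch below (hypothetical standalone code, not part of the patch; a trivial FNV-1a routine stands in for hash_any) strips trailing all-zero words the same way the patched loop does, so two word arrays representing bms_equal() sets hash identically even when one carries an extra zero word:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

typedef uint32_t bitmapword;            /* stand-in word width for illustration */

/* Trivial FNV-1a byte hash, standing in for hash_any() */
static uint32_t
byte_hash(const unsigned char *k, size_t len)
{
    uint32_t    h = 2166136261u;

    while (len-- > 0)
    {
        h ^= *k++;
        h *= 16777619u;
    }
    return h;
}

static uint32_t
hash_words(const bitmapword *words, int nwords)
{
    int         lastword;

    /* strip trailing all-zero words, mirroring the patched function */
    for (lastword = nwords; --lastword >= 0;)
    {
        if (words[lastword] != 0)
            break;
    }
    if (lastword < 0)
        return 0;                       /* all-empty sets hash to 0 */
    return byte_hash((const unsigned char *) words,
                     (lastword + 1) * sizeof(bitmapword));
}

int
main(void)
{
    bitmapword  a[] = {0x5, 0x9};       /* same members ...              */
    bitmapword  b[] = {0x5, 0x9, 0x0};  /* ... plus a trailing zero word */

    printf("hash(a) = %u\nhash(b) = %u\n",
           (unsigned) hash_words(a, 2), (unsigned) hash_words(b, 3));
    return 0;
}

Without the stripping loop, the second array would feed four extra zero bytes into the hash and the two equal sets would hash differently, which is exactly what the comment in the patch forbids.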