aboutsummaryrefslogtreecommitdiff
path: root/src/include/executor/hashjoin.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/executor/hashjoin.h')
-rw-r--r--src/include/executor/hashjoin.h40
1 files changed, 39 insertions, 1 deletions
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 40a5244ad47..5b18282a646 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.49 2009/01/01 17:23:59 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.50 2009/03/21 00:04:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -72,6 +72,36 @@ typedef struct HashJoinTupleData
#define HJTUPLE_MINTUPLE(hjtup) \
((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
+/*
+ * If the outer relation's distribution is sufficiently nonuniform, we attempt
+ * to optimize the join by treating the hash values corresponding to the outer
+ * relation's MCVs specially. Inner relation tuples matching these hash
+ * values go into the "skew" hashtable instead of the main hashtable, and
+ * outer relation tuples with these hash values are matched against that
+ * table instead of the main one. Thus, tuples with these hash values are
+ * effectively handled as part of the first batch and will never go to disk.
+ * The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory
+ * allowed for the join; while building the hashtables, we decrease the number
+ * of MCVs being specially treated if needed to stay under this limit.
+ *
+ * Note: you might wonder why we look at the outer relation stats for this,
+ * rather than the inner. One reason is that the outer relation is typically
+ * bigger, so we get more I/O savings by optimizing for its most common values.
+ * Also, for similarly-sized relations, the planner prefers to put the more
+ * uniformly distributed relation on the inside, so we're more likely to find
+ * interesting skew in the outer relation.
+ */
+typedef struct HashSkewBucket
+{
+ uint32 hashvalue; /* common hash value */
+ HashJoinTuple tuples; /* linked list of inner-relation tuples */
+} HashSkewBucket;
+
+#define SKEW_BUCKET_OVERHEAD MAXALIGN(sizeof(HashSkewBucket))
+#define INVALID_SKEW_BUCKET_NO (-1)
+#define SKEW_WORK_MEM_PERCENT 2
+#define SKEW_MIN_OUTER_FRACTION 0.01
+
typedef struct HashJoinTableData
{
@@ -82,6 +112,12 @@ typedef struct HashJoinTableData
struct HashJoinTupleData **buckets;
/* buckets array is per-batch storage, as are all the tuples */
+ bool skewEnabled; /* are we using skew optimization? */
+ HashSkewBucket **skewBucket; /* hashtable of skew buckets */
+ int skewBucketLen; /* size of skewBucket array (a power of 2!) */
+ int nSkewBuckets; /* number of active skew buckets */
+ int *skewBucketNums; /* array indexes of active skew buckets */
+
int nbatch; /* number of batches */
int curbatch; /* current batch #; 0 during 1st pass */
@@ -113,6 +149,8 @@ typedef struct HashJoinTableData
Size spaceUsed; /* memory space currently used by tuples */
Size spaceAllowed; /* upper limit for space used */
+ Size spaceUsedSkew; /* skew hash table's current space usage */
+ Size spaceAllowedSkew; /* upper limit for skew hashtable */
MemoryContext hashCxt; /* context for whole-hash-join storage */
MemoryContext batchCxt; /* context for this-batch-only storage */