diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2009-03-21 00:04:40 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2009-03-21 00:04:40 +0000 |
commit | 596efd27edce20bba706f50de99a0f15bcc2a567 (patch) | |
tree | 63c07c3c310e1b072f0a29a79220c81254dba3d8 /src/include/executor/hashjoin.h | |
parent | 249d936bed069877923f0369bd2ce51a6f8f925e (diff) | |
download | postgresql-596efd27edce20bba706f50de99a0f15bcc2a567.tar.gz postgresql-596efd27edce20bba706f50de99a0f15bcc2a567.zip |
Optimize multi-batch hash joins when the outer relation has a nonuniform
distribution, by creating a special fast path for the (first few) most common
values of the outer relation. Tuples having hashvalues matching the MCVs
are effectively forced to be in the first batch, so that we never write
them out to the batch temp files.
Bryce Cutt and Ramon Lawrence, with some editorialization by me.
Diffstat (limited to 'src/include/executor/hashjoin.h')
-rw-r--r-- | src/include/executor/hashjoin.h | 40 |
1 files changed, 39 insertions, 1 deletions
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 40a5244ad47..5b18282a646 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.49 2009/01/01 17:23:59 momjian Exp $ + * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.50 2009/03/21 00:04:40 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -72,6 +72,36 @@ typedef struct HashJoinTupleData #define HJTUPLE_MINTUPLE(hjtup) \ ((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD)) +/* + * If the outer relation's distribution is sufficiently nonuniform, we attempt + * to optimize the join by treating the hash values corresponding to the outer + * relation's MCVs specially. Inner relation tuples matching these hash + * values go into the "skew" hashtable instead of the main hashtable, and + * outer relation tuples with these hash values are matched against that + * table instead of the main one. Thus, tuples with these hash values are + * effectively handled as part of the first batch and will never go to disk. + * The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory + * allowed for the join; while building the hashtables, we decrease the number + * of MCVs being specially treated if needed to stay under this limit. + * + * Note: you might wonder why we look at the outer relation stats for this, + * rather than the inner. One reason is that the outer relation is typically + * bigger, so we get more I/O savings by optimizing for its most common values. + * Also, for similarly-sized relations, the planner prefers to put the more + * uniformly distributed relation on the inside, so we're more likely to find + * interesting skew in the outer relation. + */ +typedef struct HashSkewBucket +{ + uint32 hashvalue; /* common hash value */ + HashJoinTuple tuples; /* linked list of inner-relation tuples */ +} HashSkewBucket; + +#define SKEW_BUCKET_OVERHEAD MAXALIGN(sizeof(HashSkewBucket)) +#define INVALID_SKEW_BUCKET_NO (-1) +#define SKEW_WORK_MEM_PERCENT 2 +#define SKEW_MIN_OUTER_FRACTION 0.01 + typedef struct HashJoinTableData { @@ -82,6 +112,12 @@ typedef struct HashJoinTableData struct HashJoinTupleData **buckets; /* buckets array is per-batch storage, as are all the tuples */ + bool skewEnabled; /* are we using skew optimization? */ + HashSkewBucket **skewBucket; /* hashtable of skew buckets */ + int skewBucketLen; /* size of skewBucket array (a power of 2!) */ + int nSkewBuckets; /* number of active skew buckets */ + int *skewBucketNums; /* array indexes of active skew buckets */ + int nbatch; /* number of batches */ int curbatch; /* current batch #; 0 during 1st pass */ @@ -113,6 +149,8 @@ typedef struct HashJoinTableData Size spaceUsed; /* memory space currently used by tuples */ Size spaceAllowed; /* upper limit for space used */ + Size spaceUsedSkew; /* skew hash table's current space usage */ + Size spaceAllowedSkew; /* upper limit for skew hashtable */ MemoryContext hashCxt; /* context for whole-hash-join storage */ MemoryContext batchCxt; /* context for this-batch-only storage */ |