Diffstat (limited to 'src/backend/executor/nodeHashjoin.c')
-rw-r--r-- | src/backend/executor/nodeHashjoin.c | 393
1 file changed, 228 insertions, 165 deletions
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 07063af84f8..4f4eb701d3c 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.67 2004/12/31 21:59:45 pgsql Exp $
+ *    $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.68 2005/03/06 22:15:04 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,16 +16,19 @@
 #include "postgres.h"
 
 #include "executor/executor.h"
+#include "executor/hashjoin.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
 #include "optimizer/clauses.h"
 #include "utils/memutils.h"
 
 
-static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *node,
-                          HashJoinState *hjstate);
+static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
+                          HashJoinState *hjstate,
+                          uint32 *hashvalue);
 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
                           BufFile *file,
+                          uint32 *hashvalue,
                           TupleTableSlot *tupleSlot);
 static int  ExecHashJoinNewBatch(HashJoinState *hjstate);
 
@@ -34,9 +37,9 @@ static int ExecHashJoinNewBatch(HashJoinState *hjstate);
  *      ExecHashJoin
  *
  *      This function implements the Hybrid Hashjoin algorithm.
- *      recursive partitioning remains to be added.
- *      Note: the relation we build hash table on is the inner
- *            the other one is outer.
+ *
+ *      Note: the relation we build hash table on is the "inner"
+ *            the other one is "outer".
  * ----------------------------------------------------------------
  */
 TupleTableSlot *                /* return: a tuple or NULL */
@@ -45,8 +48,6 @@ ExecHashJoin(HashJoinState *node)
     EState     *estate;
     PlanState  *outerNode;
     HashState  *hashNode;
-    List       *hjclauses;
-    List       *outerkeys;
     List       *joinqual;
     List       *otherqual;
     ScanDirection dir;
@@ -56,12 +57,12 @@ ExecHashJoin(HashJoinState *node)
     HashJoinTable hashtable;
     HeapTuple   curtuple;
     TupleTableSlot *outerTupleSlot;
-    int         i;
+    uint32      hashvalue;
+    int         batchno;
 
     /*
      * get information from HashJoin node
      */
-    hjclauses = node->hashclauses;
     estate = node->js.ps.state;
     joinqual = node->js.joinqual;
     otherqual = node->js.ps.qual;
@@ -73,7 +74,6 @@ ExecHashJoin(HashJoinState *node)
      * get information from HashJoin state
      */
     hashtable = node->hj_HashTable;
-    outerkeys = node->hj_OuterHashKeys;
     econtext = node->js.ps.ps_ExprContext;
 
     /*
@@ -111,12 +111,11 @@ ExecHashJoin(HashJoinState *node)
     /*
      * if this is the first call, build the hash table for inner relation
      */
-    if (!node->hj_hashdone)
+    if (hashtable == NULL)
     {
         /*
          * create the hash table
          */
-        Assert(hashtable == NULL);
         hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
                                         node->hj_HashOperators);
         node->hj_HashTable = hashtable;
@@ -139,13 +138,10 @@ ExecHashJoin(HashJoinState *node)
         }
 
         /*
-         * Open temp files for outer batches, if needed. Note that file
-         * buffers are palloc'd in regular executor context.
+         * need to remember whether nbatch has increased since we began
+         * scanning the outer relation
          */
-        for (i = 0; i < hashtable->nbatch; i++)
-            hashtable->outerBatchFile[i] = BufFileCreateTemp(false);
-
-        node->hj_hashdone = true;
+        hashtable->nbatch_outstart = hashtable->nbatch;
     }
 
     /*
@@ -159,7 +155,8 @@ ExecHashJoin(HashJoinState *node)
         if (node->hj_NeedNewOuter)
         {
             outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
-                                                       node);
+                                                       node,
+                                                       &hashvalue);
             if (TupIsNull(outerTupleSlot))
             {
                 /* end of join */
@@ -175,32 +172,26 @@ ExecHashJoin(HashJoinState *node)
              * now we have an outer tuple, find the corresponding bucket
              * for this tuple from the hash table
              */
-            node->hj_CurBucketNo = ExecHashGetBucket(hashtable, econtext,
-                                                     outerkeys);
+            node->hj_CurHashValue = hashvalue;
+            ExecHashGetBucketAndBatch(hashtable, hashvalue,
+                                      &node->hj_CurBucketNo, &batchno);
             node->hj_CurTuple = NULL;
 
             /*
             * Now we've got an outer tuple and the corresponding hash
             * bucket, but this tuple may not belong to the current batch.
-            * This need only be checked in the first pass.
             */
-            if (hashtable->curbatch == 0)
+            if (batchno != hashtable->curbatch)
             {
-                int         batchno = ExecHashGetBatch(node->hj_CurBucketNo,
-                                                       hashtable);
-
-                if (batchno >= 0)
-                {
-                    /*
-                     * Need to postpone this outer tuple to a later batch.
-                     * Save it in the corresponding outer-batch file.
-                     */
-                    hashtable->outerBatchSize[batchno]++;
-                    ExecHashJoinSaveTuple(outerTupleSlot->val,
-                                          hashtable->outerBatchFile[batchno]);
-                    node->hj_NeedNewOuter = true;
-                    continue;   /* loop around for a new outer tuple */
-                }
+                /*
+                 * Need to postpone this outer tuple to a later batch.
+                 * Save it in the corresponding outer-batch file.
+                 */
+                Assert(batchno > hashtable->curbatch);
+                ExecHashJoinSaveTuple(outerTupleSlot->val, hashvalue,
+                                      &hashtable->outerBatchFile[batchno]);
+                node->hj_NeedNewOuter = true;
+                continue;       /* loop around for a new outer tuple */
             }
         }
 
@@ -209,9 +200,7 @@ ExecHashJoin(HashJoinState *node)
          */
         for (;;)
         {
-            curtuple = ExecScanHashBucket(node,
-                                          hjclauses,
-                                          econtext);
+            curtuple = ExecScanHashBucket(node, econtext);
             if (curtuple == NULL)
                 break;          /* out of matches */
 
@@ -412,10 +401,9 @@ ExecInitHashJoin(HashJoin *node, EState *estate)
     /*
      * initialize hash-specific info
      */
-
-    hjstate->hj_hashdone = false;
     hjstate->hj_HashTable = NULL;
 
+    hjstate->hj_CurHashValue = 0;
     hjstate->hj_CurBucketNo = 0;
     hjstate->hj_CurTuple = NULL;
 
@@ -499,17 +487,21 @@ ExecEndHashJoin(HashJoinState *node)
     ExecEndNode(innerPlanState(node));
 }
 
-/* ----------------------------------------------------------------
- *      ExecHashJoinOuterGetTuple
+/*
+ * ExecHashJoinOuterGetTuple
  *
  *      get the next outer tuple for hashjoin: either by
- *      executing a plan node as in the first pass, or from
- *      the tmp files for the hashjoin batches.
- * ----------------------------------------------------------------
+ *      executing a plan node in the first pass, or from
+ *      the temp files for the hashjoin batches.
+ *
+ * Returns a null slot if no more outer tuples.  On success, the tuple's
+ * hash value is stored at *hashvalue --- this is either originally computed,
+ * or re-read from the temp file.
  */
-
 static TupleTableSlot *
-ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate)
+ExecHashJoinOuterGetTuple(PlanState *outerNode,
+                          HashJoinState *hjstate,
+                          uint32 *hashvalue)
 {
     HashJoinTable hashtable = hjstate->hj_HashTable;
     int         curbatch = hashtable->curbatch;
@@ -517,9 +509,20 @@ ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate)
 
     if (curbatch == 0)
     {                           /* if it is the first pass */
-        slot = ExecProcNode(node);
+        slot = ExecProcNode(outerNode);
         if (!TupIsNull(slot))
+        {
+            /*
+             * We have to compute the tuple's hash value.
+             */
+            ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
+
+            econtext->ecxt_outertuple = slot;
+            *hashvalue = ExecHashGetHashValue(hashtable, econtext,
+                                              hjstate->hj_OuterHashKeys);
+
             return slot;
+        }
 
         /*
         * We have just reached the end of the first pass. Try to switch
@@ -530,12 +533,14 @@ ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate)
 
     /*
     * Try to read from a temp file. Loop allows us to advance to new
-    * batch as needed.
+    * batches as needed.  NOTE: nbatch could increase inside
+    * ExecHashJoinNewBatch, so don't try to optimize this loop.
     */
-    while (curbatch <= hashtable->nbatch)
+    while (curbatch < hashtable->nbatch)
     {
         slot = ExecHashJoinGetSavedTuple(hjstate,
-                                         hashtable->outerBatchFile[curbatch - 1],
+                                         hashtable->outerBatchFile[curbatch],
+                                         hashvalue,
                                          hjstate->hj_OuterTupleSlot);
         if (!TupIsNull(slot))
             return slot;
@@ -546,164 +551,223 @@ ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate)
     return NULL;
 }
 
-/* ----------------------------------------------------------------
- *      ExecHashJoinGetSavedTuple
- *
- *      read the next tuple from a tmp file
- * ----------------------------------------------------------------
- */
-
-static TupleTableSlot *
-ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
-                          BufFile *file,
-                          TupleTableSlot *tupleSlot)
-{
-    HeapTupleData htup;
-    size_t      nread;
-    HeapTuple   heapTuple;
-
-    nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
-    if (nread == 0)
-        return NULL;            /* end of file */
-    if (nread != sizeof(HeapTupleData))
-        ereport(ERROR,
-                (errcode_for_file_access(),
-                 errmsg("could not read from hash-join temporary file: %m")));
-    heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
-    memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
-    heapTuple->t_datamcxt = CurrentMemoryContext;
-    heapTuple->t_data = (HeapTupleHeader)
-        ((char *) heapTuple + HEAPTUPLESIZE);
-    nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
-    if (nread != (size_t) htup.t_len)
-        ereport(ERROR,
-                (errcode_for_file_access(),
-                 errmsg("could not read from hash-join temporary file: %m")));
-    return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
-}
-
-/* ----------------------------------------------------------------
- *      ExecHashJoinNewBatch
- *
+/*
+ * ExecHashJoinNewBatch
  *      switch to a new hashjoin batch
- * ----------------------------------------------------------------
+ *
+ * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
+ * We will never return a batch number that has an empty outer batch file.
  */
 static int
 ExecHashJoinNewBatch(HashJoinState *hjstate)
 {
     HashJoinTable hashtable = hjstate->hj_HashTable;
-    int         nbatch = hashtable->nbatch;
-    int         newbatch = hashtable->curbatch + 1;
-    long       *innerBatchSize = hashtable->innerBatchSize;
-    long       *outerBatchSize = hashtable->outerBatchSize;
+    int         nbatch;
+    int         curbatch;
     BufFile    *innerFile;
     TupleTableSlot *slot;
-    ExprContext *econtext;
-    List       *innerhashkeys;
+    uint32      hashvalue;
 
-    if (newbatch > 1)
+start_over:
+    nbatch = hashtable->nbatch;
+    curbatch = hashtable->curbatch;
+
+    if (curbatch > 0)
     {
         /*
         * We no longer need the previous outer batch file; close it right
         * away to free disk space.
         */
-        BufFileClose(hashtable->outerBatchFile[newbatch - 2]);
-        hashtable->outerBatchFile[newbatch - 2] = NULL;
+        if (hashtable->outerBatchFile[curbatch])
+            BufFileClose(hashtable->outerBatchFile[curbatch]);
+        hashtable->outerBatchFile[curbatch] = NULL;
     }
 
     /*
-     * Normally we can skip over any batches that are empty on either side
-     * --- but for JOIN_LEFT, can only skip when left side is empty.
-     * Release associated temp files right away.
+     * We can always skip over any batches that are completely empty on both
+     * sides.  We can sometimes skip over batches that are empty on only one
+     * side, but there are exceptions:
+     *
+     * 1. In a LEFT JOIN, we have to process outer batches even if the
+     * inner batch is empty.
+     *
+     * 2. If we have increased nbatch since the initial estimate, we have
+     * to scan inner batches since they might contain tuples that need to
+     * be reassigned to later inner batches.
+     *
+     * 3. Similarly, if we have increased nbatch since starting the outer
+     * scan, we have to rescan outer batches in case they contain tuples
+     * that need to be reassigned.
      */
-    while (newbatch <= nbatch &&
-           (outerBatchSize[newbatch - 1] == 0L ||
-            (innerBatchSize[newbatch - 1] == 0L &&
-             hjstate->js.jointype != JOIN_LEFT)))
+    curbatch++;
+    while (curbatch < nbatch &&
+           (hashtable->outerBatchFile[curbatch] == NULL ||
+            hashtable->innerBatchFile[curbatch] == NULL))
     {
-        BufFileClose(hashtable->innerBatchFile[newbatch - 1]);
-        hashtable->innerBatchFile[newbatch - 1] = NULL;
-        BufFileClose(hashtable->outerBatchFile[newbatch - 1]);
-        hashtable->outerBatchFile[newbatch - 1] = NULL;
-        newbatch++;
+        if (hashtable->outerBatchFile[curbatch] &&
+            hjstate->js.jointype == JOIN_LEFT)
+            break;              /* must process due to rule 1 */
+        if (hashtable->innerBatchFile[curbatch] &&
+            nbatch != hashtable->nbatch_original)
+            break;              /* must process due to rule 2 */
+        if (hashtable->outerBatchFile[curbatch] &&
+            nbatch != hashtable->nbatch_outstart)
+            break;              /* must process due to rule 3 */
+        /* We can ignore this batch. */
+        /* Release associated temp files right away. */
+        if (hashtable->innerBatchFile[curbatch])
+            BufFileClose(hashtable->innerBatchFile[curbatch]);
+        hashtable->innerBatchFile[curbatch] = NULL;
+        if (hashtable->outerBatchFile[curbatch])
+            BufFileClose(hashtable->outerBatchFile[curbatch]);
+        hashtable->outerBatchFile[curbatch] = NULL;
+        curbatch++;
     }
 
-    if (newbatch > nbatch)
-        return newbatch;        /* no more batches */
-
-    /*
-     * Rewind inner and outer batch files for this batch, so that we can
-     * start reading them.
-     */
-    if (BufFileSeek(hashtable->outerBatchFile[newbatch - 1], 0, 0L, SEEK_SET))
-        ereport(ERROR,
-                (errcode_for_file_access(),
-                 errmsg("could not rewind hash-join temporary file: %m")));
+    if (curbatch >= nbatch)
+        return curbatch;        /* no more batches */
 
-    innerFile = hashtable->innerBatchFile[newbatch - 1];
-
-    if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
-        ereport(ERROR,
-                (errcode_for_file_access(),
-                 errmsg("could not rewind hash-join temporary file: %m")));
+    hashtable->curbatch = curbatch;
 
     /*
-     * Reload the hash table with the new inner batch
+     * Reload the hash table with the new inner batch (which could be empty)
     */
-    ExecHashTableReset(hashtable, innerBatchSize[newbatch - 1]);
+    ExecHashTableReset(hashtable);
 
-    econtext = hjstate->js.ps.ps_ExprContext;
-    innerhashkeys = hjstate->hj_InnerHashKeys;
+    innerFile = hashtable->innerBatchFile[curbatch];
 
-    while ((slot = ExecHashJoinGetSavedTuple(hjstate,
-                                             innerFile,
-                                             hjstate->hj_HashTupleSlot))
-           && !TupIsNull(slot))
+    if (innerFile != NULL)
     {
-        econtext->ecxt_innertuple = slot;
-        ExecHashTableInsert(hashtable, econtext, innerhashkeys);
+        if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
+            ereport(ERROR,
+                    (errcode_for_file_access(),
+                     errmsg("could not rewind hash-join temporary file: %m")));
+
+        while ((slot = ExecHashJoinGetSavedTuple(hjstate,
+                                                 innerFile,
+                                                 &hashvalue,
+                                                 hjstate->hj_HashTupleSlot)))
+        {
+            /*
+             * NOTE: some tuples may be sent to future batches.  Also,
+             * it is possible for hashtable->nbatch to be increased here!
+             */
+            ExecHashTableInsert(hashtable, slot->val, hashvalue);
+        }
+
+        /*
+         * after we build the hash table, the inner batch file is no longer
+         * needed
+         */
+        BufFileClose(innerFile);
+        hashtable->innerBatchFile[curbatch] = NULL;
     }
 
     /*
-     * after we build the hash table, the inner batch file is no longer
-     * needed
+     * If there's no outer batch file, advance to next batch.
     */
-    BufFileClose(innerFile);
-    hashtable->innerBatchFile[newbatch - 1] = NULL;
+    if (hashtable->outerBatchFile[curbatch] == NULL)
+        goto start_over;
 
-    hashtable->curbatch = newbatch;
-    return newbatch;
+    /*
+     * Rewind outer batch file, so that we can start reading it.
+     */
+    if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not rewind hash-join temporary file: %m")));
+
+    return curbatch;
 }
 
-/* ----------------------------------------------------------------
- *      ExecHashJoinSaveTuple
+/*
+ * ExecHashJoinSaveTuple
+ *      save a tuple to a batch file.
  *
- *      save a tuple to a tmp file.
+ * The data recorded in the file for each tuple is its hash value,
+ * then an image of its HeapTupleData (with meaningless t_data pointer)
+ * followed by the HeapTupleHeader and tuple data.
  *
- * The data recorded in the file for each tuple is an image of its
- * HeapTupleData (with meaningless t_data pointer) followed by the
- * HeapTupleHeader and tuple data.
- * ----------------------------------------------------------------
+ * Note: it is important always to call this in the regular executor
+ * context, not in a shorter-lived context; else the temp file buffers
+ * will get messed up.
 */
-
 void
-ExecHashJoinSaveTuple(HeapTuple heapTuple,
-                      BufFile *file)
+ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
+                      BufFile **fileptr)
 {
+    BufFile    *file = *fileptr;
     size_t      written;
 
+    if (file == NULL)
+    {
+        /* First write to this batch file, so open it. */
+        file = BufFileCreateTemp(false);
+        *fileptr = file;
+    }
+
+    written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
+    if (written != sizeof(uint32))
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not write to hash-join temporary file: %m")));
+
     written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
     if (written != sizeof(HeapTupleData))
         ereport(ERROR,
                 (errcode_for_file_access(),
-                 errmsg("could not write to hash-join temporary file: %m")));
+                 errmsg("could not write to hash-join temporary file: %m")));
+
     written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
     if (written != (size_t) heapTuple->t_len)
         ereport(ERROR,
                 (errcode_for_file_access(),
-                 errmsg("could not write to hash-join temporary file: %m")));
+                 errmsg("could not write to hash-join temporary file: %m")));
 }
 
+/*
+ * ExecHashJoinGetSavedTuple
+ *      read the next tuple from a batch file.  Return NULL if no more.
+ *
+ * On success, *hashvalue is set to the tuple's hash value, and the tuple
+ * itself is stored in the given slot.
+ */
+static TupleTableSlot *
+ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
+                          BufFile *file,
+                          uint32 *hashvalue,
+                          TupleTableSlot *tupleSlot)
+{
+    HeapTupleData htup;
+    size_t      nread;
+    HeapTuple   heapTuple;
+
+    nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
+    if (nread == 0)
+        return NULL;            /* end of file */
+    if (nread != sizeof(uint32))
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not read from hash-join temporary file: %m")));
+    nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
+    if (nread != sizeof(HeapTupleData))
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not read from hash-join temporary file: %m")));
+    heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
+    memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
+    heapTuple->t_datamcxt = CurrentMemoryContext;
+    heapTuple->t_data = (HeapTupleHeader)
+        ((char *) heapTuple + HEAPTUPLESIZE);
+    nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
+    if (nread != (size_t) htup.t_len)
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not read from hash-join temporary file: %m")));
+    return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
+}
+
+
 void
 ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
 {
@@ -711,9 +775,8 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
     * If we haven't yet built the hash table then we can just return;
     * nothing done yet, so nothing to undo.
     */
-    if (!node->hj_hashdone)
+    if (node->hj_HashTable == NULL)
        return;
-    Assert(node->hj_HashTable != NULL);
 
     /*
     * In a multi-batch join, we currently have to do rescans the hard
@@ -722,7 +785,7 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
     * parameter change for the inner subnode, then we can just re-use the
     * existing hash table without rebuilding it.
     */
-    if (node->hj_HashTable->nbatch == 0 &&
+    if (node->hj_HashTable->nbatch == 1 &&
        ((PlanState *) node)->righttree->chgParam == NULL)
     {
        /* okay to reuse the hash table; needn't rescan inner, either */
@@ -730,7 +793,6 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
     else
     {
        /* must destroy and rebuild hash table */
-       node->hj_hashdone = false;
        ExecHashTableDestroy(node->hj_HashTable);
        node->hj_HashTable = NULL;
 
@@ -743,6 +805,7 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
     }
 
     /* Always reset intra-tuple state */
+    node->hj_CurHashValue = 0;
     node->hj_CurBucketNo = 0;
     node->hj_CurTuple = NULL;
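The central on-disk change in this commit is that every tuple spilled to a batch temp file is now preceded by its 32-bit hash value, so a later pass can re-derive the tuple's bucket and batch (via ExecHashGetBucketAndBatch) without re-evaluating the hash functions, even after nbatch has grown. The standalone sketch below is an illustration only, not backend code: it mimics that per-tuple framing with plain stdio in place of BufFile, and a simplified FakeTupleHeader standing in for HeapTupleData; the names save_tuple and read_tuple are invented for this example.

/*
 * Illustration only: mimics the per-tuple record framing used by the
 * patched ExecHashJoinSaveTuple / ExecHashJoinGetSavedTuple, using plain
 * stdio and a simplified header struct instead of BufFile and
 * HeapTupleData.  Record layout: uint32 hash value, fixed-size header
 * carrying the body length, then the variable-length tuple body.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct FakeTupleHeader  /* stand-in for HeapTupleData */
{
    uint32_t    t_len;          /* length of the tuple body in bytes */
} FakeTupleHeader;

static void
save_tuple(FILE *file, uint32_t hashvalue, const void *body, uint32_t len)
{
    FakeTupleHeader hdr;

    hdr.t_len = len;
    /* hash value goes first, so a reader can re-route the tuple cheaply */
    if (fwrite(&hashvalue, sizeof(uint32_t), 1, file) != 1 ||
        fwrite(&hdr, sizeof(hdr), 1, file) != 1 ||
        fwrite(body, 1, len, file) != len)
    {
        perror("write");
        exit(1);
    }
}

/* Returns a malloc'd body (caller frees) or NULL at end of file. */
static void *
read_tuple(FILE *file, uint32_t *hashvalue, uint32_t *len)
{
    FakeTupleHeader hdr;
    void       *body;

    if (fread(hashvalue, sizeof(uint32_t), 1, file) != 1)
        return NULL;            /* clean end of file */
    if (fread(&hdr, sizeof(hdr), 1, file) != 1)
    {
        fprintf(stderr, "unexpected end of batch file\n");
        exit(1);
    }
    body = malloc(hdr.t_len);
    if (body == NULL || fread(body, 1, hdr.t_len, file) != hdr.t_len)
    {
        fprintf(stderr, "unexpected end of batch file\n");
        exit(1);
    }
    *len = hdr.t_len;
    return body;
}

int
main(void)
{
    FILE       *f = tmpfile();
    uint32_t    hash;
    uint32_t    len;
    void       *body;

    if (f == NULL)
    {
        perror("tmpfile");
        return 1;
    }
    save_tuple(f, 0xdeadbeefU, "hello", 5);
    save_tuple(f, 0x12345678U, "hash join", 9);
    rewind(f);
    while ((body = read_tuple(f, &hash, &len)) != NULL)
    {
        printf("hash=%08x len=%u body=%.*s\n",
               (unsigned) hash, (unsigned) len, (int) len, (char *) body);
        free(body);
    }
    fclose(f);
    return 0;
}

As in the patched ExecHashJoinGetSavedTuple, end of file is recognized only at a record boundary (a short read of the leading hash value), while a short read anywhere else inside a record is treated as an error.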