diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2011-10-09 00:21:08 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2011-10-09 00:21:08 -0400 |
commit | cbfa92c23c3924d53889320cdbe26f23ee23e40c (patch) | |
tree | f93756a2e9f1d7e6cbf468b16528f275c04f04e5 /src | |
parent | 45401c1c25fe1ef14bf68089de86bcb5cce9f453 (diff) | |
download | postgresql-cbfa92c23c3924d53889320cdbe26f23ee23e40c.tar.gz postgresql-cbfa92c23c3924d53889320cdbe26f23ee23e40c.zip |
Improve index-only scans to avoid repeated access to the index page.
We copy all the matched tuples off the page during _bt_readpage, instead of
expensively re-locking the page during each subsequent tuple fetch. This
costs a bit more local storage, but not more than 2*BLCKSZ worth, and the
reduction in LWLock traffic is certainly worth that. What's more, this
lets us get rid of the API wart in the original patch that said an index AM
could randomly decline to supply an index tuple despite having asserted
pg_am.amcanreturn. That will be important for future improvements in the
index-only-scan feature, since the executor will now be able to rely on
having the index data available.
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/access/index/indexam.c | 7 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 120 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c | 68 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtutils.c | 39 | ||||
-rw-r--r-- | src/backend/executor/nodeIndexscan.c | 2 | ||||
-rw-r--r-- | src/include/access/nbtree.h | 22 | ||||
-rw-r--r-- | src/include/access/relscan.h | 2 |
7 files changed, 130 insertions, 130 deletions
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 3e0797a5c2e..6d423a7d682 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -443,9 +443,10 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) Assert(TransactionIdIsValid(RecentGlobalXmin)); /* - * The AM's gettuple proc finds the next index entry matching the scan - * keys, and puts the TID in xs_ctup.t_self. It should also set - * scan->xs_recheck, though we pay no attention to that here. + * The AM's amgettuple proc finds the next index entry matching the scan + * keys, and puts the TID into scan->xs_ctup.t_self. It should also set + * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention + * to those fields here. */ found = DatumGetBool(FunctionCall2(procedure, PointerGetDatum(scan), diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 11b57659ee8..80e703b1906 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -73,7 +73,6 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); -static IndexTuple bt_getindextuple(IndexScanDesc scan); /* @@ -311,95 +310,10 @@ btgettuple(PG_FUNCTION_ARGS) else res = _bt_first(scan, dir); - /* Return the whole index tuple if requested */ - if (scan->xs_want_itup) - { - /* First, free the last one ... */ - if (scan->xs_itup != NULL) - { - pfree(scan->xs_itup); - scan->xs_itup = NULL; - } - - if (res) - scan->xs_itup = bt_getindextuple(scan); - } - PG_RETURN_BOOL(res); } /* - * bt_getindextuple - fetch index tuple at current position. - * - * This can fail to find the tuple if new tuples have been inserted on the - * index page since we stepped onto the page. NULL is returned in that case. - * (We could try a bit harder by searching for the TID; but if insertions - * are happening, it's reasonably likely that an index-only scan will fail - * anyway because of visibility. So probably not worth the trouble.) - * - * The tuple returned is a palloc'd copy, so that we don't need to keep a - * lock on the index page. - * - * The caller must have pin on so->currPos.buf. - */ -static IndexTuple -bt_getindextuple(IndexScanDesc scan) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - Page page; - BTPageOpaque opaque; - OffsetNumber minoff; - OffsetNumber maxoff; - int itemIndex; - OffsetNumber offnum; - IndexTuple ituple, - result; - - Assert(BufferIsValid(so->currPos.buf)); - - LockBuffer(so->currPos.buf, BT_READ); - - /* Locate the tuple, being paranoid about possibility the page changed */ - page = BufferGetPage(so->currPos.buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - minoff = P_FIRSTDATAKEY(opaque); - maxoff = PageGetMaxOffsetNumber(page); - - itemIndex = so->currPos.itemIndex; - /* pure paranoia */ - Assert(itemIndex >= so->currPos.firstItem && - itemIndex <= so->currPos.lastItem); - - offnum = so->currPos.items[itemIndex].indexOffset; - if (offnum < minoff || offnum > maxoff) - { - /* should never happen, since we have pin on page, but be careful */ - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); - return NULL; - } - - ituple = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - - if (ItemPointerEquals(&ituple->t_tid, &scan->xs_ctup.t_self)) - { - /* yup, it's the desired tuple, so make a copy */ - Size itupsz = IndexTupleSize(ituple); - - result = palloc(itupsz); - memcpy(result, ituple, itupsz); - } - else - { - /* oops, it got moved */ - result = NULL; - } - - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); - - return result; -} - -/* * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap */ Datum @@ -471,6 +385,15 @@ btbeginscan(PG_FUNCTION_ARGS) so->keyData = NULL; so->killedItems = NULL; /* until needed */ so->numKilled = 0; + + /* + * We don't know yet whether the scan will be index-only, so we do not + * allocate the tuple workspace arrays until btrescan. + */ + so->currTuples = so->markTuples = NULL; + so->currPos.nextTupleOffset = 0; + so->markPos.nextTupleOffset = 0; + scan->opaque = so; PG_RETURN_POINTER(scan); @@ -506,6 +429,18 @@ btrescan(PG_FUNCTION_ARGS) so->markItemIndex = -1; /* + * Allocate tuple workspace arrays, if needed for an index-only scan and + * not already done in a previous rescan call. To save on palloc + * overhead, both workspaces are allocated as one palloc block; only this + * function and btendscan know that. + */ + if (scan->xs_want_itup && so->currTuples == NULL) + { + so->currTuples = (char *) palloc(BLCKSZ * 2); + so->markTuples = so->currTuples + BLCKSZ; + } + + /* * Reset the scan keys. Note that keys ordering stuff moved to _bt_first. * - vadim 05/05/97 */ @@ -544,18 +479,16 @@ btendscan(PG_FUNCTION_ARGS) } so->markItemIndex = -1; + /* Release storage */ if (so->killedItems != NULL) pfree(so->killedItems); if (so->keyData != NULL) pfree(so->keyData); + if (so->currTuples != NULL) + pfree(so->currTuples); + /* so->markTuples should not be pfree'd, see btrescan */ pfree(so); - if (scan->xs_itup != NULL) - { - pfree(scan->xs_itup); - scan->xs_itup = NULL; - } - PG_RETURN_VOID(); } @@ -626,6 +559,9 @@ btrestrpos(PG_FUNCTION_ARGS) memcpy(&so->currPos, &so->markPos, offsetof(BTScanPosData, items[1]) + so->markPos.lastItem * sizeof(BTScanPosItem)); + if (so->currTuples) + memcpy(so->currTuples, so->markTuples, + so->markPos.nextTupleOffset); } } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 313e574f969..f4bea8cd537 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,6 +26,8 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); +static void _bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static Buffer _bt_walk_left(Relation rel, Buffer buf); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); @@ -429,8 +431,9 @@ _bt_compare(Relation rel, * if backwards scan, the last item) in the tree that satisfies the * qualifications in the scan key. On success exit, the page containing * the current index tuple is pinned but not locked, and data about - * the matching tuple(s) on the page has been loaded into so->currPos, - * and scan->xs_ctup.t_self is set to the heap TID of the current tuple. + * the matching tuple(s) on the page has been loaded into so->currPos. + * scan->xs_ctup.t_self is set to the heap TID of the current tuple, + * and if requested, scan->xs_itup points to a copy of the index tuple. * * If there are no matching items in the index, we return FALSE, with no * pins or locks held. @@ -456,6 +459,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) int keysCount = 0; int i; StrategyNumber strat_total; + BTScanPosItem *currItem; pgstat_count_index_scan(rel); @@ -912,7 +916,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); /* OK, itemIndex says what to return */ - scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_ctup.t_self = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); return true; } @@ -925,7 +932,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * previously returned. * * On successful exit, scan->xs_ctup.t_self is set to the TID of the - * next heap tuple, and so->currPos is updated as needed. + * next heap tuple, and if requested, scan->xs_itup points to a copy of + * the index tuple. so->currPos is updated as needed. * * On failure exit (no more tuples), we release pin and set * so->currPos.buf to InvalidBuffer. @@ -934,6 +942,7 @@ bool _bt_next(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPosItem *currItem; /* * Advance to next tuple on current page; or if there's no more, try to @@ -967,7 +976,10 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) } /* OK, itemIndex says what to return */ - scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_ctup.t_self = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); return true; } @@ -996,6 +1008,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) OffsetNumber minoff; OffsetNumber maxoff; int itemIndex; + IndexTuple itup; bool continuescan; /* we must have the buffer pinned and locked */ @@ -1013,6 +1026,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) */ so->currPos.nextPage = opaque->btpo_next; + /* initialize tuple workspace to empty */ + so->currPos.nextTupleOffset = 0; + if (ScanDirectionIsForward(dir)) { /* load items[] in ascending order */ @@ -1022,12 +1038,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) while (offnum <= maxoff) { - if (_bt_checkkeys(scan, page, offnum, dir, &continuescan)) + itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan); + if (itup != NULL) { /* tuple passes all scan key conditions, so remember it */ - /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */ - so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self; - so->currPos.items[itemIndex].indexOffset = offnum; + _bt_saveitem(so, itemIndex, offnum, itup); itemIndex++; } if (!continuescan) @@ -1054,13 +1069,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) while (offnum >= minoff) { - if (_bt_checkkeys(scan, page, offnum, dir, &continuescan)) + itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan); + if (itup != NULL) { /* tuple passes all scan key conditions, so remember it */ - /* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */ itemIndex--; - so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self; - so->currPos.items[itemIndex].indexOffset = offnum; + _bt_saveitem(so, itemIndex, offnum, itup); } if (!continuescan) { @@ -1081,6 +1095,25 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) return (so->currPos.firstItem <= so->currPos.lastItem); } +/* Save an index item into so->currPos.items[itemIndex] */ +static void +_bt_saveitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = itup->t_tid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + Size itupsz = IndexTupleSize(itup); + + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += MAXALIGN(itupsz); + } +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * @@ -1119,6 +1152,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) memcpy(&so->markPos, &so->currPos, offsetof(BTScanPosData, items[1]) + so->currPos.lastItem * sizeof(BTScanPosItem)); + if (so->markTuples) + memcpy(so->markTuples, so->currTuples, + so->currPos.nextTupleOffset); so->markPos.itemIndex = so->markItemIndex; so->markItemIndex = -1; } @@ -1428,6 +1464,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) Page page; BTPageOpaque opaque; OffsetNumber start; + BTScanPosItem *currItem; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified @@ -1505,7 +1542,10 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); /* OK, itemIndex says what to return */ - scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_ctup.t_self = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); return true; } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 3ca75b79ccb..b6fb3867bf0 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -835,8 +835,8 @@ _bt_mark_scankey_required(ScanKey skey) /* * Test whether an indextuple satisfies all the scankey conditions. * - * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE. - * If not, return FALSE (xs_ctup is not changed). + * If so, return the address of the index tuple on the index page. + * If not, return NULL. * * If the tuple fails to pass the qual, we also determine whether there's * any need to continue the scan beyond this tuple, and set *continuescan @@ -848,14 +848,16 @@ _bt_mark_scankey_required(ScanKey skey) * offnum: offset number of index tuple (must be a valid item!) * dir: direction we are scanning in * continuescan: output parameter (will be set correctly in all cases) + * + * Caller must hold pin and lock on the index page. */ -bool +IndexTuple _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan) { ItemId iid = PageGetItemId(page, offnum); - bool tuple_valid; + bool tuple_alive; IndexTuple tuple; TupleDesc tupdesc; BTScanOpaque so; @@ -879,24 +881,24 @@ _bt_checkkeys(IndexScanDesc scan, if (ScanDirectionIsForward(dir)) { if (offnum < PageGetMaxOffsetNumber(page)) - return false; + return NULL; } else { BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (offnum > P_FIRSTDATAKEY(opaque)) - return false; + return NULL; } /* - * OK, we want to check the keys, but we'll return FALSE even if the - * tuple passes the key tests. + * OK, we want to check the keys so we can set continuescan correctly, + * but we'll return NULL even if the tuple passes the key tests. */ - tuple_valid = false; + tuple_alive = false; } else - tuple_valid = true; + tuple_alive = true; tuple = (IndexTuple) PageGetItem(page, iid); @@ -915,7 +917,7 @@ _bt_checkkeys(IndexScanDesc scan, { if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan)) continue; - return false; + return NULL; } datum = index_getattr(tuple, @@ -953,7 +955,7 @@ _bt_checkkeys(IndexScanDesc scan, /* * In any case, this indextuple doesn't match the qual. */ - return false; + return NULL; } if (isNull) @@ -988,7 +990,7 @@ _bt_checkkeys(IndexScanDesc scan, /* * In any case, this indextuple doesn't match the qual. */ - return false; + return NULL; } test = FunctionCall2Coll(&key->sk_func, key->sk_collation, @@ -1016,15 +1018,16 @@ _bt_checkkeys(IndexScanDesc scan, /* * In any case, this indextuple doesn't match the qual. */ - return false; + return NULL; } } - /* If we get here, the tuple passes all index quals. */ - if (tuple_valid) - scan->xs_ctup.t_self = tuple->t_tid; + /* Check for failure due to it being a killed tuple. */ + if (!tuple_alive) + return NULL; - return tuple_valid; + /* If we get here, the tuple passes all index quals. */ + return tuple; } /* diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 32ed65797ae..56b9855094a 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -92,7 +92,7 @@ IndexNext(IndexScanState *node) * scan's xs_cbuf, ie, the previously visited heap page. It's not * clear whether it'd be better to release that pin. */ - if (scandesc->xs_itup != NULL && + if (scandesc->xs_want_itup && visibilitymap_test(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->iss_VMBuffer)) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 4e20c79ca6e..199fc940267 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -472,12 +472,18 @@ typedef BTStackData *BTStack; * items were killed, we re-lock the page to mark them killed, then unlock. * Finally we drop the pin and step to the next page in the appropriate * direction. + * + * If we are doing an index-only scan, we save the entire IndexTuple for each + * matched item, otherwise only its heap TID and offset. The IndexTuples go + * into a separate workspace array; each BTScanPosItem stores its tuple's + * offset within that array. */ typedef struct BTScanPosItem /* what we remember about each match */ { ItemPointerData heapTid; /* TID of referenced heap item */ OffsetNumber indexOffset; /* index item's location within page */ + LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ } BTScanPosItem; typedef struct BTScanPosData @@ -496,6 +502,12 @@ typedef struct BTScanPosData bool moreRight; /* + * If we are doing an index-only scan, nextTupleOffset is the first free + * location in the associated tuple storage workspace. + */ + int nextTupleOffset; + + /* * The items array is always ordered in index order (ie, increasing * indexoffset). When scanning backwards it is convenient to fill the * array back-to-front, so we start at the last slot and fill downwards. @@ -525,6 +537,14 @@ typedef struct BTScanOpaqueData int numKilled; /* number of currently stored items */ /* + * If we are doing an index-only scan, these are the tuple storage + * workspaces for the currPos and markPos respectively. Each is of + * size BLCKSZ, so it can hold as much as a full page's worth of tuples. + */ + char *currTuples; /* tuple storage for currPos */ + char *markTuples; /* tuple storage for markPos */ + + /* * If the marked position is on the same page as current position, we * don't use markPos, but just keep the marked itemIndex in markItemIndex * (all the rest of currPos is valid for the mark position). Hence, to @@ -620,7 +640,7 @@ extern ScanKey _bt_mkscankey_nodata(Relation rel); extern void _bt_freeskey(ScanKey skey); extern void _bt_freestack(BTStack stack); extern void _bt_preprocess_keys(IndexScanDesc scan); -extern bool _bt_checkkeys(IndexScanDesc scan, +extern IndexTuple _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan); extern void _bt_killitems(IndexScanDesc scan, bool haveLock); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 656aefcceec..d48bbf865ea 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -79,7 +79,7 @@ typedef struct IndexScanDescData void *opaque; /* access-method-specific info */ /* in an index-only scan, this is valid after a successful amgettuple */ - IndexTuple xs_itup; /* index tuple returned by AM, or NULL */ + IndexTuple xs_itup; /* index tuple returned by AM */ /* xs_ctup/xs_cbuf/xs_recheck are valid after a successful index_getnext */ HeapTupleData xs_ctup; /* current heap tuple, if any */ |