author     Tom Lane <tgl@sss.pgh.pa.us>  2011-10-07 20:13:02 -0400
committer  Tom Lane <tgl@sss.pgh.pa.us>  2011-10-07 20:14:13 -0400
commit     a2822fb9337a21f98ac4ce850bb4145acf47ca27 (patch)
tree       c239fe9a32ff0225e906711a76348cee1567f0d8 /src
parent     caa1054df8408b165e5f66ff25c87b6dd0a0a1e7 (diff)
Support index-only scans using the visibility map to avoid heap fetches.

When a btree index contains all columns required by the query, and the
visibility map shows that all tuples on a target heap page are
visible-to-all, we don't need to fetch that heap page.  This patch depends
on the previous patches that made the visibility map reliable.

There's a fair amount left to do here, notably trying to figure out a less
chintzy way of estimating the cost of an index-only scan, but the core
functionality seems ready to commit.

Robert Haas and Ibrar Ahmed, with some previous work by Heikki Linnakangas.
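
As a usage illustration (not part of the patch itself): once every column a
query references is available from a btree index and the table's heap pages
are marked all-visible, the planner can pick the new "Index Only Scan" node
and skip the heap fetches, as the regression-test output further down shows
for tenk1.  The psql sketch below is hypothetical; the table and index names
are made up, and the exact plan still depends on costs and on how much of the
heap VACUUM managed to mark all-visible.

    -- Illustrative session, assuming a build with this commit applied.
    CREATE TABLE demo (id integer, payload text);
    CREATE INDEX demo_id_idx ON demo (id);
    INSERT INTO demo SELECT g, 'x' FROM generate_series(1, 100000) g;
    VACUUM ANALYZE demo;   -- sets visibility-map bits and updates statistics

    EXPLAIN (COSTS OFF) SELECT id FROM demo WHERE id < 100;
    --               QUERY PLAN
    -- ------------------------------------------
    --  Index Only Scan using demo_id_idx on demo
    --    Index Cond: (id < 100)

    -- The plan type can be switched off with the GUC added by this patch:
    SET enable_indexonlyscan = off;

If a page is not known all-visible, the executor simply falls back to
fetching the heap tuple for that TID (see the nodeIndexscan.c hunk below), so
an unvacuumed table degrades gracefully to an ordinary index scan.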
Diffstat (limited to 'src')
30 files changed, 643 insertions(+), 199 deletions(-)
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 98832adb8a3..236e48912bb 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -93,6 +93,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) else scan->orderByData = NULL; + scan->xs_want_itup = false; /* may be set later */ + /* * During recovery we ignore killed tuples and don't bother to kill them * either. We do this because the xmin on the primary node could easily be @@ -109,6 +111,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->opaque = NULL; + scan->xs_itup = NULL; + ItemPointerSetInvalid(&scan->xs_ctup.t_self); scan->xs_ctup.t_data = NULL; scan->xs_cbuf = InvalidBuffer; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 230af9bfa3a..3e0797a5c2e 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -20,7 +20,9 @@ * index_insert - insert an index tuple into a relation * index_markpos - mark a scan position * index_restrpos - restore a scan position - * index_getnext - get the next tuple from a scan + * index_getnext_tid - get the next TID from a scan + * index_fetch_heap - get the scan's next heap tuple + * index_getnext - get the next heap tuple from a scan * index_getbitmap - get all tuples from a scan * index_bulk_delete - bulk deletion of index tuples * index_vacuum_cleanup - post-deletion cleanup of an index @@ -423,12 +425,65 @@ index_restrpos(IndexScanDesc scan) } /* ---------------- - * index_getnext - get the next heap tuple from a scan + * index_getnext_tid - get the next TID from a scan * - * The result is the next heap tuple satisfying the scan keys and the - * snapshot, or NULL if no more matching tuples exist. On success, - * the buffer containing the heap tuple is pinned (the pin will be dropped - * at the next index_getnext or index_endscan). + * The result is the next TID satisfying the scan keys, + * or NULL if no more matching tuples exist. + * ---------------- + */ +ItemPointer +index_getnext_tid(IndexScanDesc scan, ScanDirection direction) +{ + FmgrInfo *procedure; + bool found; + + SCAN_CHECKS; + GET_SCAN_PROCEDURE(amgettuple); + + Assert(TransactionIdIsValid(RecentGlobalXmin)); + + /* + * The AM's gettuple proc finds the next index entry matching the scan + * keys, and puts the TID in xs_ctup.t_self. It should also set + * scan->xs_recheck, though we pay no attention to that here. + */ + found = DatumGetBool(FunctionCall2(procedure, + PointerGetDatum(scan), + Int32GetDatum(direction))); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + + /* If we're out of index entries, we're done */ + if (!found) + { + /* ... but first, release any held pin on a heap page */ + if (BufferIsValid(scan->xs_cbuf)) + { + ReleaseBuffer(scan->xs_cbuf); + scan->xs_cbuf = InvalidBuffer; + } + return NULL; + } + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Return the TID of the tuple we found. */ + return &scan->xs_ctup.t_self; +} + +/* ---------------- + * index_fetch_heap - get the scan's next heap tuple + * + * The result is a visible heap tuple associated with the index TID most + * recently fetched by index_getnext_tid, or NULL if no more matching tuples + * exist. (There can be more than one matching tuple because of HOT chains, + * although when using an MVCC snapshot it should be impossible for more than + * one such tuple to exist.) 
+ * + * On success, the buffer containing the heap tup is pinned (the pin will be + * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan + * call). * * Note: caller must check scan->xs_recheck, and perform rechecking of the * scan keys if required. We do not do that here because we don't have @@ -436,22 +491,90 @@ index_restrpos(IndexScanDesc scan) * ---------------- */ HeapTuple -index_getnext(IndexScanDesc scan, ScanDirection direction) +index_fetch_heap(IndexScanDesc scan) { - HeapTuple heapTuple = &scan->xs_ctup; - ItemPointer tid = &heapTuple->t_self; - FmgrInfo *procedure; + ItemPointer tid = &scan->xs_ctup.t_self; bool all_dead = false; + bool got_heap_tuple; - SCAN_CHECKS; - GET_SCAN_PROCEDURE(amgettuple); + /* We can skip the buffer-switching logic if we're in mid-HOT chain. */ + if (!scan->xs_continue_hot) + { + /* Switch to correct buffer if we don't have it already */ + Buffer prev_buf = scan->xs_cbuf; - Assert(TransactionIdIsValid(RecentGlobalXmin)); + scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, + scan->heapRelation, + ItemPointerGetBlockNumber(tid)); - for (;;) + /* + * Prune page, but only if we weren't already on this page + */ + if (prev_buf != scan->xs_cbuf) + heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, + RecentGlobalXmin); + } + + /* Obtain share-lock on the buffer so we can examine visibility */ + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); + got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation, + scan->xs_cbuf, + scan->xs_snapshot, + &scan->xs_ctup, + &all_dead, + !scan->xs_continue_hot); + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + if (got_heap_tuple) { - bool got_heap_tuple; + /* + * Only in a non-MVCC snapshot can more than one member of the + * HOT chain be visible. + */ + scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); + pgstat_count_heap_fetch(scan->indexRelation); + return &scan->xs_ctup; + } + + /* We've reached the end of the HOT chain. */ + scan->xs_continue_hot = false; + + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan(). + */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + return NULL; +} + +/* ---------------- + * index_getnext - get the next heap tuple from a scan + * + * The result is the next heap tuple satisfying the scan keys and the + * snapshot, or NULL if no more matching tuples exist. + * + * On success, the buffer containing the heap tup is pinned (the pin will be + * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan + * call). + * + * Note: caller must check scan->xs_recheck, and perform rechecking of the + * scan keys if required. We do not do that here because we don't have + * enough information to do it efficiently in the general case. + * ---------------- + */ +HeapTuple +index_getnext(IndexScanDesc scan, ScanDirection direction) +{ + HeapTuple heapTuple; + ItemPointer tid; + + for (;;) + { if (scan->xs_continue_hot) { /* @@ -459,86 +582,27 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) * earlier member. Must still hold pin on current heap page. 
*/ Assert(BufferIsValid(scan->xs_cbuf)); - Assert(ItemPointerGetBlockNumber(tid) == + Assert(ItemPointerGetBlockNumber(&scan->xs_ctup.t_self) == BufferGetBlockNumber(scan->xs_cbuf)); } else { - bool found; - Buffer prev_buf; - - /* - * If we scanned a whole HOT chain and found only dead tuples, - * tell index AM to kill its entry for that TID. We do not do this - * when in recovery because it may violate MVCC to do so. see - * comments in RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; - - /* - * The AM's gettuple proc finds the next index entry matching the - * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). It - * should also set scan->xs_recheck, though we pay no attention to - * that here. - */ - found = DatumGetBool(FunctionCall2(procedure, - PointerGetDatum(scan), - Int32GetDatum(direction))); - - /* Reset kill flag immediately for safety */ - scan->kill_prior_tuple = false; + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scan, direction); - /* If we're out of index entries, break out of outer loop */ - if (!found) + /* If we're out of index entries, we're done */ + if (tid == NULL) break; - - pgstat_count_index_tuples(scan->indexRelation, 1); - - /* Switch to correct buffer if we don't have it already */ - prev_buf = scan->xs_cbuf; - scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, - scan->heapRelation, - ItemPointerGetBlockNumber(tid)); - - /* - * Prune page, but only if we weren't already on this page - */ - if (prev_buf != scan->xs_cbuf) - heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, - RecentGlobalXmin); } - /* Obtain share-lock on the buffer so we can examine visibility */ - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); - got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation, - scan->xs_cbuf, - scan->xs_snapshot, - &scan->xs_ctup, - &all_dead, - !scan->xs_continue_hot); - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); - - if (got_heap_tuple) - { - /* - * Only in a non-MVCC snapshot can more than one member of the - * HOT chain be visible. - */ - scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); - pgstat_count_heap_fetch(scan->indexRelation); + /* + * Fetch the next (or only) visible heap tuple for this index entry. + * If we don't find anything, loop around and grab the next TID from + * the index. + */ + heapTuple = index_fetch_heap(scan); + if (heapTuple != NULL) return heapTuple; - } - - /* Loop around to ask index AM for another TID */ - scan->xs_continue_hot = false; - } - - /* Release any held pin on a heap page */ - if (BufferIsValid(scan->xs_cbuf)) - { - ReleaseBuffer(scan->xs_cbuf); - scan->xs_cbuf = InvalidBuffer; } return NULL; /* failure exit */ diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 996611516fe..11b57659ee8 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -73,6 +73,7 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static IndexTuple bt_getindextuple(IndexScanDesc scan); /* @@ -310,10 +311,95 @@ btgettuple(PG_FUNCTION_ARGS) else res = _bt_first(scan, dir); + /* Return the whole index tuple if requested */ + if (scan->xs_want_itup) + { + /* First, free the last one ... 
*/ + if (scan->xs_itup != NULL) + { + pfree(scan->xs_itup); + scan->xs_itup = NULL; + } + + if (res) + scan->xs_itup = bt_getindextuple(scan); + } + PG_RETURN_BOOL(res); } /* + * bt_getindextuple - fetch index tuple at current position. + * + * This can fail to find the tuple if new tuples have been inserted on the + * index page since we stepped onto the page. NULL is returned in that case. + * (We could try a bit harder by searching for the TID; but if insertions + * are happening, it's reasonably likely that an index-only scan will fail + * anyway because of visibility. So probably not worth the trouble.) + * + * The tuple returned is a palloc'd copy, so that we don't need to keep a + * lock on the index page. + * + * The caller must have pin on so->currPos.buf. + */ +static IndexTuple +bt_getindextuple(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int itemIndex; + OffsetNumber offnum; + IndexTuple ituple, + result; + + Assert(BufferIsValid(so->currPos.buf)); + + LockBuffer(so->currPos.buf, BT_READ); + + /* Locate the tuple, being paranoid about possibility the page changed */ + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + itemIndex = so->currPos.itemIndex; + /* pure paranoia */ + Assert(itemIndex >= so->currPos.firstItem && + itemIndex <= so->currPos.lastItem); + + offnum = so->currPos.items[itemIndex].indexOffset; + if (offnum < minoff || offnum > maxoff) + { + /* should never happen, since we have pin on page, but be careful */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + return NULL; + } + + ituple = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + if (ItemPointerEquals(&ituple->t_tid, &scan->xs_ctup.t_self)) + { + /* yup, it's the desired tuple, so make a copy */ + Size itupsz = IndexTupleSize(ituple); + + result = palloc(itupsz); + memcpy(result, ituple, itupsz); + } + else + { + /* oops, it got moved */ + result = NULL; + } + + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + + return result; +} + +/* * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap */ Datum @@ -464,6 +550,12 @@ btendscan(PG_FUNCTION_ARGS) pfree(so->keyData); pfree(so); + if (scan->xs_itup != NULL) + { + pfree(scan->xs_itup); + scan->xs_itup = NULL; + } + PG_RETURN_VOID(); } diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index cd9fc929232..fbcaf6cbe09 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -656,7 +656,10 @@ ExplainNode(PlanState *planstate, List *ancestors, pname = sname = "Seq Scan"; break; case T_IndexScan: - pname = sname = "Index Scan"; + if (((IndexScan *) plan)->indexonly) + pname = sname = "Index Only Scan"; + else + pname = sname = "Index Scan"; break; case T_BitmapIndexScan: pname = sname = "Bitmap Index Scan"; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index da25384e860..32ed65797ae 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -26,6 +26,7 @@ #include "access/nbtree.h" #include "access/relscan.h" +#include "access/visibilitymap.h" #include "executor/execdebug.h" #include "executor/nodeIndexscan.h" #include "optimizer/clauses.h" @@ -36,6 +37,7 @@ static TupleTableSlot *IndexNext(IndexScanState *node); +static void IndexStoreHeapTuple(TupleTableSlot *slot, 
IndexScanDesc scandesc); /* ---------------------------------------------------------------- @@ -54,6 +56,7 @@ IndexNext(IndexScanState *node) IndexScanDesc scandesc; HeapTuple tuple; TupleTableSlot *slot; + ItemPointer tid; /* * extract necessary information from index scan node @@ -73,19 +76,63 @@ IndexNext(IndexScanState *node) slot = node->ss.ss_ScanTupleSlot; /* - * ok, now that we have what we need, fetch the next tuple. + * OK, now that we have what we need, fetch the next TID. */ - while ((tuple = index_getnext(scandesc, direction)) != NULL) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { /* - * Store the scanned tuple in the scan tuple slot of the scan state. - * Note: we pass 'false' because tuples returned by amgetnext are - * pointers onto disk pages and must not be pfree()'d. + * Attempt index-only scan, if possible. For this, we need to have + * gotten an index tuple from the AM, and we need the TID to reference + * a heap page on which all tuples are known visible to everybody. + * If that's the case, we don't need to visit the heap page for tuple + * visibility testing, and we don't need any column values that are + * not available from the index. + * + * Note: in the index-only path, we are still holding pin on the + * scan's xs_cbuf, ie, the previously visited heap page. It's not + * clear whether it'd be better to release that pin. */ - ExecStoreTuple(tuple, /* tuple to store */ - slot, /* slot to store in */ - scandesc->xs_cbuf, /* buffer containing tuple */ - false); /* don't pfree */ + if (scandesc->xs_itup != NULL && + visibilitymap_test(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->iss_VMBuffer)) + { + /* + * Convert index tuple to look like a heap tuple, and store the + * results in the scan tuple slot. + */ + IndexStoreHeapTuple(slot, scandesc); + } + else + { + /* Index-only approach not possible, so fetch heap tuple. */ + tuple = index_fetch_heap(scandesc); + + /* Tuple might not be visible. */ + if (tuple == NULL) + continue; + + /* + * Only MVCC snapshots are supported here, so there should be no + * need to keep following the HOT chain once a visible entry has + * been found. If we did want to allow that, we'd need to keep + * more state to remember not to call index_getnext_tid next time. + */ + if (scandesc->xs_continue_hot) + elog(ERROR, "unsupported use of non-MVCC snapshot in executor"); + + /* + * Store the scanned tuple in the scan tuple slot of the scan + * state. + * + * Note: we pass 'false' because tuples returned by amgetnext are + * pointers onto disk pages and must not be pfree()'d. + */ + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + scandesc->xs_cbuf, /* buffer containing tuple */ + false); /* don't pfree */ + } /* * If the index was lossy, we have to recheck the index quals using @@ -114,6 +161,53 @@ IndexNext(IndexScanState *node) } /* + * IndexStoreHeapTuple + * + * When performing an index-only scan, we build a faux heap tuple + * from the index tuple. Columns not present in the index are set to + * NULL, which is OK because we know they won't be referenced. + * + * The faux tuple is built as a virtual tuple that depends on the + * scandesc's xs_itup, so that must remain valid for as long as we + * need the slot contents. 
+ */ +static void +IndexStoreHeapTuple(TupleTableSlot *slot, IndexScanDesc scandesc) +{ + Form_pg_index indexForm = scandesc->indexRelation->rd_index; + TupleDesc indexDesc = RelationGetDescr(scandesc->indexRelation); + int nindexatts = indexDesc->natts; + int nheapatts = slot->tts_tupleDescriptor->natts; + Datum *values = slot->tts_values; + bool *isnull = slot->tts_isnull; + int i; + + /* We must first set the slot to empty, and mark all columns as null */ + ExecClearTuple(slot); + + memset(isnull, true, nheapatts * sizeof(bool)); + + /* Transpose index tuple into heap tuple. */ + for (i = 0; i < nindexatts; i++) + { + int indexatt = indexForm->indkey.values[i]; + + /* Ignore expression columns, as well as system attributes */ + if (indexatt <= 0) + continue; + + Assert(indexatt <= nheapatts); + + values[indexatt - 1] = index_getattr(scandesc->xs_itup, i + 1, + indexDesc, + &isnull[indexatt - 1]); + } + + /* And now we can mark the slot as holding a virtual tuple. */ + ExecStoreVirtualTuple(slot); +} + +/* * IndexRecheck -- access method routine to recheck a tuple in EvalPlanQual */ static bool @@ -399,6 +493,13 @@ ExecEndIndexScan(IndexScanState *node) indexScanDesc = node->iss_ScanDesc; relation = node->ss.ss_currentRelation; + /* Release VM buffer pin, if any. */ + if (node->iss_VMBuffer != InvalidBuffer) + { + ReleaseBuffer(node->iss_VMBuffer); + node->iss_VMBuffer = InvalidBuffer; + } + /* * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext */ @@ -611,6 +712,10 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->iss_NumScanKeys, indexstate->iss_NumOrderByKeys); + /* Prepare for possible index-only scan */ + indexstate->iss_ScanDesc->xs_want_itup = node->indexonly; + indexstate->iss_VMBuffer = InvalidBuffer; + /* * If no run-time keys to calculate, go ahead and pass the scankeys to the * index AM. 
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 661a5162e63..5100642dd63 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -370,6 +370,7 @@ _copyIndexScan(IndexScan *from) COPY_NODE_FIELD(indexorderby); COPY_NODE_FIELD(indexorderbyorig); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexonly); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 0d0ce3c2034..9f564277747 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -447,6 +447,7 @@ _outIndexScan(StringInfo str, IndexScan *node) WRITE_NODE_FIELD(indexorderby); WRITE_NODE_FIELD(indexorderbyorig); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_BOOL_FIELD(indexonly); } static void @@ -1500,6 +1501,7 @@ _outIndexPath(StringInfo str, IndexPath *node) WRITE_NODE_FIELD(indexorderbys); WRITE_BOOL_FIELD(isjoininner); WRITE_ENUM_FIELD(indexscandir, ScanDirection); + WRITE_BOOL_FIELD(indexonly); WRITE_FLOAT_FIELD(indextotalcost, "%.2f"); WRITE_FLOAT_FIELD(indexselectivity, "%.4f"); WRITE_FLOAT_FIELD(rows, "%.0f"); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 7812a8628fc..e480797ca8e 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -110,6 +110,7 @@ Cost disable_cost = 1.0e10; bool enable_seqscan = true; bool enable_indexscan = true; +bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; @@ -119,6 +120,9 @@ bool enable_material = true; bool enable_mergejoin = true; bool enable_hashjoin = true; +/* Possibly this should become a GUC too */ +static double visibility_fraction = 0.9; + typedef struct { PlannerInfo *root; @@ -210,6 +214,7 @@ cost_seqscan(Path *path, PlannerInfo *root, * 'index' is the index to be used * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics) * 'indexOrderBys' is the list of ORDER BY operators for amcanorderbyop indexes + * 'indexonly' is true if it's an index-only scan * 'outer_rel' is the outer relation when we are considering using the index * scan as the inside of a nestloop join (hence, some of the indexQuals * are join clauses, and we should expect repeated scans of the index); @@ -232,6 +237,7 @@ cost_index(IndexPath *path, PlannerInfo *root, IndexOptInfo *index, List *indexQuals, List *indexOrderBys, + bool indexonly, RelOptInfo *outer_rel) { RelOptInfo *baserel = index->rel; @@ -314,6 +320,12 @@ cost_index(IndexPath *path, PlannerInfo *root, * For partially-correlated indexes, we ought to charge somewhere between * these two estimates. We currently interpolate linearly between the * estimates based on the correlation squared (XXX is that appropriate?). + * + * If it's an index-only scan, then we will not need to fetch any heap + * pages for which the visibility map shows all tuples are visible. + * Unfortunately, we have no stats as to how much of the heap is + * all-visible, and that's likely to be a rather unstable number anyway. + * We use an arbitrary constant visibility_fraction to estimate this. 
*---------- */ if (outer_rel != NULL && outer_rel->rows > 1) @@ -333,6 +345,8 @@ cost_index(IndexPath *path, PlannerInfo *root, (double) index->pages, root); + pages_fetched = ceil(pages_fetched * visibility_fraction); + max_IO_cost = (pages_fetched * spc_random_page_cost) / num_scans; /* @@ -352,6 +366,8 @@ cost_index(IndexPath *path, PlannerInfo *root, (double) index->pages, root); + pages_fetched = ceil(pages_fetched * visibility_fraction); + min_IO_cost = (pages_fetched * spc_random_page_cost) / num_scans; } else @@ -365,11 +381,16 @@ cost_index(IndexPath *path, PlannerInfo *root, (double) index->pages, root); + pages_fetched = ceil(pages_fetched * visibility_fraction); + /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */ max_IO_cost = pages_fetched * spc_random_page_cost; /* min_IO_cost is for the perfectly correlated case (csquared=1) */ pages_fetched = ceil(indexSelectivity * (double) baserel->pages); + + pages_fetched = ceil(pages_fetched * visibility_fraction); + min_IO_cost = spc_random_page_cost; if (pages_fetched > 1) min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index fc96f4f1dac..7090a7e0c0d 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -18,6 +18,7 @@ #include <math.h> #include "access/skey.h" +#include "access/sysattr.h" #include "catalog/pg_am.h" #include "catalog/pg_collation.h" #include "catalog/pg_operator.h" @@ -88,6 +89,7 @@ static PathClauseUsage *classify_index_clause_usage(Path *path, List **clauselist); static void find_indexpath_quals(Path *bitmapqual, List **quals, List **preds); static int find_list_position(Node *node, List **nodelist); +static bool check_index_only(RelOptInfo *rel, IndexOptInfo *index); static List *group_clauses_by_indexkey(IndexOptInfo *index, List *clauses, List *outer_clauses, Relids outer_relids, @@ -314,6 +316,8 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, bool useful_predicate; bool found_clause; bool index_is_ordered; + bool index_only_scan = false; + bool checked_index_only = false; /* * Check that index supports the desired scan type(s) @@ -438,6 +442,10 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, */ if (found_clause || useful_pathkeys != NIL || useful_predicate) { + /* First, detect whether index-only scan is possible */ + index_only_scan = check_index_only(rel, index); + checked_index_only = true; + ipath = create_index_path(root, index, restrictclauses, orderbyclauses, @@ -445,6 +453,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, index_is_ordered ? ForwardScanDirection : NoMovementScanDirection, + index_only_scan, outer_rel); result = lappend(result, ipath); } @@ -462,11 +471,15 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, index_pathkeys); if (useful_pathkeys != NIL) { + if (!checked_index_only) + index_only_scan = check_index_only(rel, index); + ipath = create_index_path(root, index, restrictclauses, NIL, useful_pathkeys, BackwardScanDirection, + index_only_scan, outer_rel); result = lappend(result, ipath); } @@ -1040,6 +1053,82 @@ find_list_position(Node *node, List **nodelist) } +/* + * check_index_only + * Determine whether an index-only scan is possible for this index. 
+ */ +static bool +check_index_only(RelOptInfo *rel, IndexOptInfo *index) +{ + bool result; + Bitmapset *attrs_used = NULL; + Bitmapset *index_attrs = NULL; + ListCell *lc; + int i; + + /* Index-only scans must be enabled, and AM must be capable of it */ + if (!enable_indexonlyscan) + return false; + if (!index->amcanreturn) + return false; + + /* + * Check that all needed attributes of the relation are available from + * the index. + * + * XXX this is overly conservative for partial indexes, since we will + * consider attributes involved in the index predicate as required even + * though the predicate won't need to be checked at runtime. (The same + * is true for attributes used only in index quals, if we are certain + * that the index is not lossy.) However, it would be quite expensive + * to determine that accurately at this point, so for now we take the + * easy way out. + */ + + /* + * Add all the attributes needed for joins or final output. Note: we must + * look at reltargetlist, not the attr_needed data, because attr_needed + * isn't computed for inheritance child rels. + */ + pull_varattnos((Node *) rel->reltargetlist, rel->relid, &attrs_used); + + /* Add all the attributes used by restriction clauses. */ + foreach(lc, rel->baserestrictinfo) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + + pull_varattnos((Node *) rinfo->clause, rel->relid, &attrs_used); + } + + /* Construct a bitmapset of columns stored in the index. */ + for (i = 0; i < index->ncolumns; i++) + { + int attno = index->indexkeys[i]; + + /* + * For the moment, we just ignore index expressions. It might be nice + * to do something with them, later. We also ignore index columns + * that are system columns (such as OID), because the virtual-tuple + * coding used by IndexStoreHeapTuple() can't deal with them. + */ + if (attno <= 0) + continue; + + index_attrs = + bms_add_member(index_attrs, + attno - FirstLowInvalidHeapAttributeNumber); + } + + /* Do we have all the necessary attributes? 
*/ + result = bms_is_subset(attrs_used, index_attrs); + + bms_free(attrs_used); + bms_free(index_attrs); + + return result; +} + + /**************************************************************************** * ---- ROUTINES TO CHECK RESTRICTIONS ---- ****************************************************************************/ diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 0dedebaf705..36ee7c5648a 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -95,7 +95,7 @@ static SeqScan *make_seqscan(List *qptlist, List *qpqual, Index scanrelid); static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, - ScanDirection indexscandir); + ScanDirection indexscandir, bool indexonly); static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig); @@ -1183,7 +1183,8 @@ create_indexscan_plan(PlannerInfo *root, stripped_indexquals, fixed_indexorderbys, indexorderbys, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexonly); copy_path_costsize(&scan_plan->scan.plan, &best_path->path); /* use the indexscan-specific rows estimate, not the parent rel's */ @@ -2841,7 +2842,8 @@ make_indexscan(List *qptlist, List *indexqualorig, List *indexorderby, List *indexorderbyorig, - ScanDirection indexscandir) + ScanDirection indexscandir, + bool indexonly) { IndexScan *node = makeNode(IndexScan); Plan *plan = &node->scan.plan; @@ -2858,6 +2860,7 @@ make_indexscan(List *qptlist, node->indexorderby = indexorderby; node->indexorderbyorig = indexorderbyorig; node->indexorderdir = indexscandir; + node->indexonly = indexonly; return node; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b4982746a2e..5c18b72344d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3297,7 +3297,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) /* Estimate the cost of index scan */ indexScanPath = create_index_path(root, indexInfo, NIL, NIL, NIL, - ForwardScanDirection, NULL); + ForwardScanDirection, false, NULL); return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost); } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 4a1c94affbd..8ed55a3d0e2 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -25,8 +25,8 @@ #include "optimizer/paths.h" #include "optimizer/tlist.h" #include "parser/parsetree.h" -#include "utils/selfuncs.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" static List *translate_sub_tlist(List *tlist, int relid); @@ -418,6 +418,7 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel) * 'indexscandir' is ForwardScanDirection or BackwardScanDirection * for an ordered index, or NoMovementScanDirection for * an unordered index. + * 'indexonly' is true if an index-only scan is wanted. * 'outer_rel' is the outer relation if this is a join inner indexscan path. * (pathkeys and indexscandir are ignored if so.) NULL if not. 
* @@ -430,6 +431,7 @@ create_index_path(PlannerInfo *root, List *indexorderbys, List *pathkeys, ScanDirection indexscandir, + bool indexonly, RelOptInfo *outer_rel) { IndexPath *pathnode = makeNode(IndexPath); @@ -468,6 +470,7 @@ create_index_path(PlannerInfo *root, pathnode->isjoininner = (outer_rel != NULL); pathnode->indexscandir = indexscandir; + pathnode->indexonly = indexonly; if (outer_rel != NULL) { @@ -506,7 +509,8 @@ create_index_path(PlannerInfo *root, pathnode->rows = rel->rows; } - cost_index(pathnode, root, index, indexquals, indexorderbys, outer_rel); + cost_index(pathnode, root, index, indexquals, indexorderbys, + indexonly, outer_rel); return pathnode; } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 8a3a5d85e2a..742e7a880ad 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -210,6 +210,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amcanorderbyop = indexRelation->rd_am->amcanorderbyop; + info->amcanreturn = indexRelation->rd_am->amcanreturn; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index 8ce7ee41a18..888f6f67a84 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -36,6 +36,12 @@ typedef struct typedef struct { + Bitmapset *varattnos; + Index varno; +} pull_varattnos_context; + +typedef struct +{ int var_location; int sublevels_up; } locate_var_of_level_context; @@ -70,7 +76,7 @@ typedef struct static bool pull_varnos_walker(Node *node, pull_varnos_context *context); -static bool pull_varattnos_walker(Node *node, Bitmapset **varattnos); +static bool pull_varattnos_walker(Node *node, pull_varattnos_context *context); static bool contain_var_clause_walker(Node *node, void *context); static bool contain_vars_of_level_walker(Node *node, int *sublevels_up); static bool locate_var_of_level_walker(Node *node, @@ -177,23 +183,31 @@ pull_varnos_walker(Node *node, pull_varnos_context *context) * pull_varattnos * Find all the distinct attribute numbers present in an expression tree, * and add them to the initial contents of *varattnos. - * Only Vars that reference RTE 1 of rtable level zero are considered. + * Only Vars of the given varno and rtable level zero are considered. * * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that * we can include system attributes (e.g., OID) in the bitmap representation. * - * Currently, this does not support subqueries nor expressions containing - * references to multiple tables; not needed since it's only applied to - * index expressions and predicates. + * Currently, this does not support unplanned subqueries; that is not needed + * for current uses. It will handle already-planned SubPlan nodes, though, + * looking into only the "testexpr" and the "args" list. (The subplan cannot + * contain any other references to Vars of the current level.) 
*/ void -pull_varattnos(Node *node, Bitmapset **varattnos) +pull_varattnos(Node *node, Index varno, Bitmapset **varattnos) { - (void) pull_varattnos_walker(node, varattnos); + pull_varattnos_context context; + + context.varattnos = *varattnos; + context.varno = varno; + + (void) pull_varattnos_walker(node, &context); + + *varattnos = context.varattnos; } static bool -pull_varattnos_walker(Node *node, Bitmapset **varattnos) +pull_varattnos_walker(Node *node, pull_varattnos_context *context) { if (node == NULL) return false; @@ -201,17 +215,18 @@ pull_varattnos_walker(Node *node, Bitmapset **varattnos) { Var *var = (Var *) node; - Assert(var->varno == 1); - *varattnos = bms_add_member(*varattnos, - var->varattno - FirstLowInvalidHeapAttributeNumber); + if (var->varno == context->varno && var->varlevelsup == 0) + context->varattnos = + bms_add_member(context->varattnos, + var->varattno - FirstLowInvalidHeapAttributeNumber); return false; } - /* Should not find a subquery or subplan */ + + /* Should not find an unplanned subquery */ Assert(!IsA(node, Query)); - Assert(!IsA(node, SubPlan)); return expression_tree_walker(node, pull_varattnos_walker, - (void *) varattnos); + (void *) context); } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 19d94b252c2..976a832135e 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3065,7 +3065,7 @@ set_debug_options(int debug_flag, GucContext context, GucSource source) bool set_plan_disabling_options(const char *arg, GucContext context, GucSource source) { - char *tmp = NULL; + const char *tmp = NULL; switch (arg[0]) { @@ -3075,6 +3075,9 @@ set_plan_disabling_options(const char *arg, GucContext context, GucSource source case 'i': /* indexscan */ tmp = "enable_indexscan"; break; + case 'o': /* indexonlyscan */ + tmp = "enable_indexonlyscan"; + break; case 'b': /* bitmapscan */ tmp = "enable_bitmapscan"; break; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index b22ad07ca8f..9f6b12707bf 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3684,10 +3684,10 @@ RelationGetIndexAttrBitmap(Relation relation) } /* Collect all attributes used in expressions, too */ - pull_varattnos((Node *) indexInfo->ii_Expressions, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); /* Collect all attributes in the index predicate, too */ - pull_varattnos((Node *) indexInfo->ii_Predicate, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); index_close(indexDesc, AccessShareLock); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 85fdad8996c..106096faee7 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -682,6 +682,15 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-only-scan plans."), + NULL + }, + &enable_indexonlyscan, + true, + NULL, NULL, NULL + }, + { {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of bitmap-scan plans."), NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 5bb7e7117bc..1d8bd3dd235 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -243,6 +243,7 @@ #enable_hashagg = on #enable_hashjoin = on 
#enable_indexscan = on +#enable_indexonlyscan = on #enable_material = on #enable_mergejoin = on #enable_nestloop = on diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 7154ae385ba..dd62680dd54 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,6 +144,9 @@ extern void index_rescan(IndexScanDesc scan, extern void index_endscan(IndexScanDesc scan); extern void index_markpos(IndexScanDesc scan); extern void index_restrpos(IndexScanDesc scan); +extern ItemPointer index_getnext_tid(IndexScanDesc scan, + ScanDirection direction); +extern HeapTuple index_fetch_heap(IndexScanDesc scan); extern HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 57d08b9656e..656aefcceec 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,6 +16,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/itup.h" typedef struct HeapScanDescData @@ -66,6 +67,7 @@ typedef struct IndexScanDescData int numberOfOrderBys; /* number of ordering operators */ ScanKey keyData; /* array of index qualifier descriptors */ ScanKey orderByData; /* array of ordering op descriptors */ + bool xs_want_itup; /* caller requests index tuples */ /* signaling to index AM about killing index tuples */ bool kill_prior_tuple; /* last-returned tuple is dead */ @@ -76,6 +78,9 @@ typedef struct IndexScanDescData /* index access method's private state */ void *opaque; /* access-method-specific info */ + /* in an index-only scan, this is valid after a successful amgettuple */ + IndexTuple xs_itup; /* index tuple returned by AM, or NULL */ + /* xs_ctup/xs_cbuf/xs_recheck are valid after a successful index_getnext */ HeapTupleData xs_ctup; /* current heap tuple, if any */ Buffer xs_cbuf; /* current heap buffer in scan, if any */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index f3c8bb41422..e4eb7b1294f 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201109071 +#define CATALOG_VERSION_NO 201110071 #endif diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index feab4201deb..c3c864f95f5 100644 --- a/src/include/catalog/pg_am.h +++ b/src/include/catalog/pg_am.h @@ -45,6 +45,7 @@ CATALOG(pg_am,2601) bool amcanbackward; /* does AM support backward scan? */ bool amcanunique; /* does AM support UNIQUE indexes? */ bool amcanmulticol; /* does AM support multi-column indexes? */ + bool amcanreturn; /* can AM return IndexTuples? */ bool amoptionalkey; /* can query omit key for the first column? */ bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */ bool amstorage; /* can storage type differ from column type? 
*/ @@ -78,7 +79,7 @@ typedef FormData_pg_am *Form_pg_am; * compiler constants for pg_am * ---------------- */ -#define Natts_pg_am 28 +#define Natts_pg_am 29 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 @@ -87,42 +88,43 @@ typedef FormData_pg_am *Form_pg_am; #define Anum_pg_am_amcanbackward 6 #define Anum_pg_am_amcanunique 7 #define Anum_pg_am_amcanmulticol 8 -#define Anum_pg_am_amoptionalkey 9 -#define Anum_pg_am_amsearchnulls 10 -#define Anum_pg_am_amstorage 11 -#define Anum_pg_am_amclusterable 12 -#define Anum_pg_am_ampredlocks 13 -#define Anum_pg_am_amkeytype 14 -#define Anum_pg_am_aminsert 15 -#define Anum_pg_am_ambeginscan 16 -#define Anum_pg_am_amgettuple 17 -#define Anum_pg_am_amgetbitmap 18 -#define Anum_pg_am_amrescan 19 -#define Anum_pg_am_amendscan 20 -#define Anum_pg_am_ammarkpos 21 -#define Anum_pg_am_amrestrpos 22 -#define Anum_pg_am_ambuild 23 -#define Anum_pg_am_ambuildempty 24 -#define Anum_pg_am_ambulkdelete 25 -#define Anum_pg_am_amvacuumcleanup 26 -#define Anum_pg_am_amcostestimate 27 -#define Anum_pg_am_amoptions 28 +#define Anum_pg_am_amcanreturn 9 +#define Anum_pg_am_amoptionalkey 10 +#define Anum_pg_am_amsearchnulls 11 +#define Anum_pg_am_amstorage 12 +#define Anum_pg_am_amclusterable 13 +#define Anum_pg_am_ampredlocks 14 +#define Anum_pg_am_amkeytype 15 +#define Anum_pg_am_aminsert 16 +#define Anum_pg_am_ambeginscan 17 +#define Anum_pg_am_amgettuple 18 +#define Anum_pg_am_amgetbitmap 19 +#define Anum_pg_am_amrescan 20 +#define Anum_pg_am_amendscan 21 +#define Anum_pg_am_ammarkpos 22 +#define Anum_pg_am_amrestrpos 23 +#define Anum_pg_am_ambuild 24 +#define Anum_pg_am_ambuildempty 25 +#define Anum_pg_am_ambulkdelete 26 +#define Anum_pg_am_amvacuumcleanup 27 +#define Anum_pg_am_amcostestimate 28 +#define Anum_pg_am_amoptions 29 /* ---------------- * initial contents of pg_am * ---------------- */ -DATA(insert OID = 403 ( btree 5 1 t f t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions )); +DATA(insert OID = 403 ( btree 5 1 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 -DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); +DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); DESCR("hash index access method"); #define HASH_AM_OID 405 -DATA(insert OID = 783 ( gist 0 8 f t f f t t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); +DATA(insert OID = 783 ( gist 0 8 f t f f t f t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 -DATA(insert OID = 2742 ( gin 0 5 f f f 
f t t f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); +DATA(insert OID = 2742 ( gin 0 5 f f f f t f t f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index c8a0b598645..3885fa0099d 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1226,6 +1226,7 @@ typedef struct * RuntimeContext expr context for evaling runtime Skeys * RelationDesc index relation descriptor * ScanDesc index scan descriptor + * VMBuffer buffer in use for visibility map testing, if any * ---------------- */ typedef struct IndexScanState @@ -1242,6 +1243,7 @@ typedef struct IndexScanState ExprContext *iss_RuntimeContext; Relation iss_RelationDesc; IndexScanDesc iss_ScanDesc; + Buffer iss_VMBuffer; } IndexScanState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 535eca77a7e..60467f52769 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -300,6 +300,10 @@ typedef Scan SeqScan; * that the sort ordering is fully determinable from the top-level operators. * indexorderbyorig is unused at run time, but is needed for EXPLAIN. * (Note these fields are used for amcanorderbyop cases, not amcanorder cases.) + * + * indexorderdir specifies the scan ordering, for indexscans on amcanorder + * indexes (for other indexes it should be "don't care"). indexonly specifies + * an index-only scan, for indexscans on amcanreturn indexes. * ---------------- */ typedef struct IndexScan @@ -311,6 +315,7 @@ typedef struct IndexScan List *indexorderby; /* list of index ORDER BY exprs */ List *indexorderbyorig; /* the same in original form */ ScanDirection indexorderdir; /* forward or backward or don't care */ + bool indexonly; /* attempt to skip heap fetches? */ } IndexScan; /* ---------------- diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index ecbbc1cd39a..cf48ba433c8 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -482,6 +482,7 @@ typedef struct IndexOptInfo bool unique; /* true if a unique index */ bool hypothetical; /* true if index doesn't really exist */ bool amcanorderbyop; /* does AM support order by operator result? */ + bool amcanreturn; /* does AM know how to return tuples? */ bool amoptionalkey; /* can query omit key for the first column? */ bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */ bool amhasgettuple; /* does AM have amgettuple interface? */ @@ -672,6 +673,10 @@ typedef struct Path * NoMovementScanDirection for an indexscan, but the planner wants to * distinguish ordered from unordered indexes for building pathkeys.) * + * 'indexonly' is TRUE for an index-only scan, that is, the index's access + * method has amcanreturn = TRUE and we only need columns available from the + * index. + * * 'indextotalcost' and 'indexselectivity' are saved in the IndexPath so that * we need not recompute them when considering using the same index in a * bitmap index/heap scan (see BitmapHeapPath). 
The costs of the IndexPath @@ -693,6 +698,7 @@ typedef struct IndexPath List *indexorderbys; bool isjoininner; ScanDirection indexscandir; + bool indexonly; Cost indextotalcost; Selectivity indexselectivity; double rows; /* estimated number of result tuples */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 604df335d2d..125808ae980 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -52,6 +52,7 @@ extern PGDLLIMPORT int effective_cache_size; extern Cost disable_cost; extern bool enable_seqscan; extern bool enable_indexscan; +extern bool enable_indexonlyscan; extern bool enable_bitmapscan; extern bool enable_tidscan; extern bool enable_sort; @@ -67,7 +68,8 @@ extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, double index_pages, PlannerInfo *root); extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo *baserel); extern void cost_index(IndexPath *path, PlannerInfo *root, IndexOptInfo *index, - List *indexQuals, List *indexOrderBys, RelOptInfo *outer_rel); + List *indexQuals, List *indexOrderBys, + bool indexonly, RelOptInfo *outer_rel); extern void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, RelOptInfo *outer_rel); extern void cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index ee02732fe1c..38c8c1c9a35 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -34,6 +34,7 @@ extern IndexPath *create_index_path(PlannerInfo *root, List *indexorderbys, List *pathkeys, ScanDirection indexscandir, + bool indexonly, RelOptInfo *outer_rel); extern BitmapHeapPath *create_bitmap_heap_path(PlannerInfo *root, RelOptInfo *rel, diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 5d7e2d93e91..4fd0052f2df 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -31,7 +31,7 @@ typedef enum } PVCPlaceHolderBehavior; extern Relids pull_varnos(Node *node); -extern void pull_varattnos(Node *node, Bitmapset **varattnos); +extern void pull_varattnos(Node *node, Index varno, Bitmapset **varattnos); extern bool contain_var_clause(Node *node); extern bool contain_vars_of_level(Node *node, int levelsup); extern int locate_var_of_level(Node *node, int levelsup); diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 69926f7b79e..2324c7cda82 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -449,12 +449,12 @@ analyze tenk1; -- ensure we get consistent plans here -- Basic cases explain (costs off) select min(unique1) from tenk1; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Result InitPlan 1 (returns $0) -> Limit - -> Index Scan using tenk1_unique1 on tenk1 + -> Index Only Scan using tenk1_unique1 on tenk1 Index Cond: (unique1 IS NOT NULL) (5 rows) @@ -466,12 +466,12 @@ select min(unique1) from tenk1; explain (costs off) select max(unique1) from tenk1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: (unique1 IS NOT NULL) (5 rows) @@ 
-488,7 +488,7 @@ explain (costs off) Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: ((unique1 IS NOT NULL) AND (unique1 < 42)) (5 rows) @@ -505,7 +505,7 @@ explain (costs off) Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: ((unique1 IS NOT NULL) AND (unique1 > 42)) (5 rows) @@ -522,7 +522,7 @@ explain (costs off) Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: ((unique1 IS NOT NULL) AND (unique1 > 42000)) (5 rows) @@ -535,12 +535,12 @@ select max(unique1) from tenk1 where unique1 > 42000; -- multi-column index (uses tenk1_thous_tenthous) explain (costs off) select max(tenthous) from tenk1 where thousand = 33; - QUERY PLAN --------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------- Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_thous_tenthous on tenk1 + -> Index Only Scan Backward using tenk1_thous_tenthous on tenk1 Index Cond: ((thousand = 33) AND (tenthous IS NOT NULL)) (5 rows) @@ -557,7 +557,7 @@ explain (costs off) Result InitPlan 1 (returns $0) -> Limit - -> Index Scan using tenk1_thous_tenthous on tenk1 + -> Index Only Scan using tenk1_thous_tenthous on tenk1 Index Cond: ((thousand = 33) AND (tenthous IS NOT NULL)) (5 rows) @@ -578,7 +578,7 @@ explain (costs off) -> Result InitPlan 1 (returns $1) -> Limit - -> Index Scan using tenk1_unique1 on tenk1 + -> Index Only Scan using tenk1_unique1 on tenk1 Index Cond: ((unique1 IS NOT NULL) AND (unique1 > int4_tbl.f1)) (7 rows) @@ -596,12 +596,12 @@ select f1, (select min(unique1) from tenk1 where unique1 > f1) AS gt -- check some cases that were handled incorrectly in 8.3.0 explain (costs off) select distinct max(unique2) from tenk1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- HashAggregate InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (6 rows) @@ -614,13 +614,13 @@ select distinct max(unique2) from tenk1; explain (costs off) select max(unique2) from tenk1 order by 1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: ($0) InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (7 rows) @@ -633,13 +633,13 @@ select max(unique2) from tenk1 order by 1; explain (costs off) select max(unique2) from tenk1 order by max(unique2); - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: ($0) InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (7 rows) @@ -652,13 +652,13 @@ select max(unique2) from 
tenk1 order by max(unique2); explain (costs off) select max(unique2) from tenk1 order by max(unique2)+1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: (($0 + 1)) InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (7 rows) @@ -671,13 +671,13 @@ select max(unique2) from tenk1 order by max(unique2)+1; explain (costs off) select max(unique2), generate_series(1,3) as g from tenk1 order by g desc; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Sort Sort Key: (generate_series(1, 3)) InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique2 on tenk1 + -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) -> Result (7 rows) @@ -705,32 +705,32 @@ insert into minmaxtest2 values(15), (16); insert into minmaxtest3 values(17), (18); explain (costs off) select min(f1), max(f1) from minmaxtest; - QUERY PLAN --------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------- Result InitPlan 1 (returns $0) -> Limit -> Merge Append Sort Key: public.minmaxtest.f1 - -> Index Scan using minmaxtesti on minmaxtest + -> Index Only Scan using minmaxtesti on minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan using minmaxtest1i on minmaxtest1 minmaxtest + -> Index Only Scan using minmaxtest1i on minmaxtest1 minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan Backward using minmaxtest2i on minmaxtest2 minmaxtest + -> Index Only Scan Backward using minmaxtest2i on minmaxtest2 minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan using minmaxtest3i on minmaxtest3 minmaxtest + -> Index Only Scan using minmaxtest3i on minmaxtest3 minmaxtest Index Cond: (f1 IS NOT NULL) InitPlan 2 (returns $1) -> Limit -> Merge Append Sort Key: public.minmaxtest.f1 - -> Index Scan Backward using minmaxtesti on minmaxtest + -> Index Only Scan Backward using minmaxtesti on minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan Backward using minmaxtest1i on minmaxtest1 minmaxtest + -> Index Only Scan Backward using minmaxtest1i on minmaxtest1 minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan using minmaxtest2i on minmaxtest2 minmaxtest + -> Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest Index Cond: (f1 IS NOT NULL) - -> Index Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest + -> Index Only Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest Index Cond: (f1 IS NOT NULL) (25 rows) diff --git a/src/test/regress/expected/rangefuncs.out b/src/test/regress/expected/rangefuncs.out index 51d561b28fa..5f20c932497 100644 --- a/src/test/regress/expected/rangefuncs.out +++ b/src/test/regress/expected/rangefuncs.out @@ -1,17 +1,18 @@ SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%'; - name | setting --------------------+--------- - enable_bitmapscan | on - enable_hashagg | on - enable_hashjoin | on - enable_indexscan | on - enable_material | on - enable_mergejoin | on - enable_nestloop | on - enable_seqscan | on - enable_sort | on - enable_tidscan | on -(10 rows) + name | setting +----------------------+--------- + enable_bitmapscan 
| on + enable_hashagg | on + enable_hashjoin | on + enable_indexonlyscan | on + enable_indexscan | on + enable_material | on + enable_mergejoin | on + enable_nestloop | on + enable_seqscan | on + enable_sort | on + enable_tidscan | on +(11 rows) CREATE TABLE foo2(fooid int, f2 int); INSERT INTO foo2 VALUES(1, 11); |