author | Tom Lane <tgl@sss.pgh.pa.us> | 2006-05-07 01:21:30 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2006-05-07 01:21:30 +0000 |
commit | 09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272 (patch) | |
tree | 650b81f13757ed3c99638c65329475e682337646 /src/backend/access/nbtree/nbtutils.c | |
parent | 88d94a11bb2b0d3dac95325d8d6056e4bac8b4b8 (diff) | |
download | postgresql-09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272.tar.gz postgresql-09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272.zip | |
Rewrite btree index scans to work a page at a time in all cases (both
btgettuple and btgetmulti). This eliminates the problem of "re-finding" the
exact stopping point, since the stopping point is effectively always a page
boundary, and index items are never moved across pre-existing page boundaries.
A small penalty is that the keys_are_unique optimization is effectively
disabled (and, therefore, is removed in this patch), causing us to apply
_bt_checkkeys() to at least one more tuple than necessary when looking up a
unique key. However, the advantages for non-unique cases seem great enough to
accept this tradeoff. Aside from simplifying and (sometimes) speeding up the
indexscan code, this will allow us to reimplement btbulkdelete as a largely
sequential scan instead of index-order traversal, thereby significantly
reducing the cost of VACUUM. Those changes will come in a separate patch.
Original patch by Heikki Linnakangas, rework by Tom Lane.
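To make the page-at-a-time idea concrete, here is a minimal standalone C sketch (not PostgreSQL code; every type and name below is invented for illustration). While a "page" is held, the scan copies all matching items into a private batch array; the lock can then be dropped and the caller drains the batch, so the scan never has to re-find its exact position inside a page — only the page boundary (the right sibling) is remembered.

```c
#include <stdbool.h>
#include <stdio.h>

#define MAX_ITEMS_PER_PAGE 8

/* Hypothetical, simplified stand-ins for an index page and scan state. */
typedef struct Page
{
    int     nitems;
    int     keys[MAX_ITEMS_PER_PAGE];   /* index keys, in sorted order */
    int     next_page;                  /* -1 means no right sibling */
} Page;

typedef struct ScanPos
{
    int     batch[MAX_ITEMS_PER_PAGE];  /* matches copied out of the page */
    int     nbatch;
    int     next_item;                  /* next batch slot to hand back */
    int     next_page;                  /* where to continue afterwards */
} ScanPos;

/*
 * Read one whole page while "holding the lock": copy every key in
 * [lo, hi] into the private batch, then remember only the right
 * sibling.  Nothing on this page ever needs to be re-found later.
 */
static void
read_page_batch(const Page *page, int lo, int hi, ScanPos *pos)
{
    pos->nbatch = 0;
    pos->next_item = 0;
    for (int i = 0; i < page->nitems; i++)
    {
        if (page->keys[i] >= lo && page->keys[i] <= hi)
            pos->batch[pos->nbatch++] = page->keys[i];
    }
    pos->next_page = page->next_page;   /* resume at a page boundary */
}

/* Return the next batched match, or false when the batch is drained. */
static bool
get_next(ScanPos *pos, int *key)
{
    if (pos->next_item >= pos->nbatch)
        return false;
    *key = pos->batch[pos->next_item++];
    return true;
}

int
main(void)
{
    Page    pages[2] = {
        {4, {10, 20, 30, 40, 0, 0, 0, 0}, 1},
        {3, {50, 60, 70, 0, 0, 0, 0, 0}, -1},
    };
    ScanPos pos;
    int     page_no = 0;

    /* Scan for keys in [20, 60], one page at a time. */
    while (page_no >= 0)
    {
        int     key;

        read_page_batch(&pages[page_no], 20, 60, &pos);
        while (get_next(&pos, &key))
            printf("match: %d\n", key);
        page_no = pos.next_page;
    }
    return 0;
}
```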
Diffstat (limited to 'src/backend/access/nbtree/nbtutils.c')
-rw-r--r-- | src/backend/access/nbtree/nbtutils.c | 110 |
1 file changed, 87 insertions, 23 deletions
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index b8b9ff780ea..74d06052587 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.72 2006/03/05 15:58:21 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.73 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -196,11 +196,6 @@ _bt_freestack(BTStack stack)
  * Again though, only keys with RHS datatype equal to the index datatype
  * can be checked for contradictions.
  *
- * Furthermore, we detect the case where the index is unique and we have
- * equality quals for all columns.  In this case there can be at most one
- * (visible) matching tuple.  index_getnext uses this to avoid uselessly
- * continuing the scan after finding one match.
- *
  * Row comparison keys are treated the same as comparisons to nondefault
  * datatypes: we just transfer them into the preprocessed array without any
  * editorialization.  We can treat them the same as an ordinary inequality
@@ -216,7 +211,6 @@ _bt_freestack(BTStack stack)
 void
 _bt_preprocess_keys(IndexScanDesc scan)
 {
-    Relation    relation = scan->indexRelation;
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     int         numberOfKeys = scan->numberOfKeys;
     int         new_numberOfKeys;
@@ -234,7 +228,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
     /* initialize result variables */
     so->qual_ok = true;
     so->numberOfKeys = 0;
-    scan->keys_are_unique = false;
 
     if (numberOfKeys < 1)
         return;                 /* done if qual-less scan */
@@ -256,13 +249,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
          */
         if (cur->sk_flags & SK_ISNULL)
             so->qual_ok = false;
-        else if (relation->rd_index->indisunique &&
-                 relation->rd_rel->relnatts == 1)
-        {
-            /* it's a unique index, do we have an equality qual? */
-            if (cur->sk_strategy == BTEqualStrategyNumber)
-                scan->keys_are_unique = true;
-        }
         memcpy(outkeys, inkeys, sizeof(ScanKeyData));
         so->numberOfKeys = 1;
         /* We can mark the qual as required if it's for first index col */
@@ -464,14 +450,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
     }
 
     so->numberOfKeys = new_numberOfKeys;
-
-    /*
-     * If unique index and we have equality keys for all columns, set
-     * keys_are_unique flag for higher levels.
-     */
-    if (relation->rd_index->indisunique &&
-        relation->rd_rel->relnatts == numberOfEqualCols)
-        scan->keys_are_unique = true;
 }
 
 /*
@@ -826,3 +804,89 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
 
     return result;
 }
+
+/*
+ * _bt_killitems - set LP_DELETE bit for items an indexscan caller has
+ * told us were killed
+ *
+ * scan->so contains information about the current page and killed tuples
+ * thereon (generally, this should only be called if so->numKilled > 0).
+ *
+ * The caller must have pin on so->currPos.buf, but may or may not have
+ * read-lock, as indicated by haveLock.  Note that we assume read-lock
+ * is sufficient for setting LP_DELETE hint bits.
+ *
+ * We match items by heap TID before assuming they are the right ones to
+ * delete.  We cope with cases where items have moved right due to insertions.
+ * If an item has moved off the current page due to a split, we'll fail to
+ * find it and do nothing (this is not an error case --- we assume the item
+ * will eventually get marked in a future indexscan).  Likewise, if the item
+ * has moved left due to deletions or disappeared itself, we'll not find it,
+ * but these cases are not worth optimizing.  (Since deletions are only done
+ * by VACUUM, any deletion makes it highly likely that the dead item has been
+ * removed itself, and therefore searching left is not worthwhile.)
+ */
+void
+_bt_killitems(IndexScanDesc scan, bool haveLock)
+{
+    BTScanOpaque so = (BTScanOpaque) scan->opaque;
+    Page        page;
+    BTPageOpaque opaque;
+    OffsetNumber minoff;
+    OffsetNumber maxoff;
+    int         i;
+    bool        killedsomething = false;
+
+    Assert(BufferIsValid(so->currPos.buf));
+
+    if (!haveLock)
+        LockBuffer(so->currPos.buf, BT_READ);
+
+    page = BufferGetPage(so->currPos.buf);
+    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+    minoff = P_FIRSTDATAKEY(opaque);
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    for (i = 0; i < so->numKilled; i++)
+    {
+        int         itemIndex = so->killedItems[i];
+        BTScanPosItem *kitem = &so->currPos.items[itemIndex];
+        OffsetNumber offnum = kitem->indexOffset;
+
+        Assert(itemIndex >= so->currPos.firstItem &&
+               itemIndex <= so->currPos.lastItem);
+        if (offnum < minoff)
+            continue;           /* pure paranoia */
+        while (offnum <= maxoff)
+        {
+            ItemId      iid = PageGetItemId(page, offnum);
+            IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);
+
+            if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+            {
+                /* found the item */
+                iid->lp_flags |= LP_DELETE;
+                killedsomething = true;
+                break;          /* out of inner search loop */
+            }
+            offnum = OffsetNumberNext(offnum);
+        }
+    }
+
+    /*
+     * Since this can be redone later if needed, it's treated the same
+     * as a commit-hint-bit status update for heap tuples: we mark the
+     * buffer dirty but don't make a WAL log entry.
+     */
+    if (killedsomething)
+        SetBufferCommitInfoNeedsSave(so->currPos.buf);
+
+    if (!haveLock)
+        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+
+    /*
+     * Always reset the scan state, so we don't look for same items
+     * on other pages.
+     */
+    so->numKilled = 0;
+}
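The heart of _bt_killitems is the re-matching loop: insertions can shift items to the right within a page, so the remembered offset is only a starting point, and an item is confirmed by comparing heap TIDs before its line pointer is flagged. The standalone sketch below (simplified types and invented names, not PostgreSQL code) illustrates that search-right-and-match-by-TID pattern; if the item was moved to another page by a split, it is simply not found and nothing is marked.

```c
#include <stdbool.h>
#include <stdio.h>

#define FLAG_DEAD 0x01

/* Hypothetical, simplified stand-in for index items on one page. */
typedef struct IndexItem
{
    int             heap_tid;   /* pretend TID: just an integer here */
    unsigned int    flags;
} IndexItem;

/*
 * Mark the item with the given heap TID as dead, starting the search at
 * the offset remembered by the scan.  Items may have moved right since
 * that offset was recorded (insertions), so walk right until the TID
 * matches.  If the item moved off the page entirely (page split), find
 * nothing and do nothing.
 */
static bool
kill_item(IndexItem *items, int nitems, int remembered_off, int heap_tid)
{
    for (int off = remembered_off; off < nitems; off++)
    {
        if (items[off].heap_tid == heap_tid)
        {
            items[off].flags |= FLAG_DEAD;  /* hint only; redoable later */
            return true;
        }
    }
    return false;               /* moved to another page; not an error */
}

int
main(void)
{
    /* TID 42 was remembered at offset 1, but an insertion shifted it right. */
    IndexItem   items[] = {
        {10, 0}, {17, 0}, {42, 0}, {58, 0},
    };
    int         nitems = 4;

    if (kill_item(items, nitems, 1, 42))
        printf("marked TID 42 dead at its new position\n");
    if (!kill_item(items, nitems, 0, 99))
        printf("TID 99 not on this page; nothing marked\n");
    return 0;
}
```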