author | Tom Lane <tgl@sss.pgh.pa.us> | 2006-05-07 01:21:30 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2006-05-07 01:21:30 +0000 |
commit | 09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272 (patch) | |
tree | 650b81f13757ed3c99638c65329475e682337646 /src/backend/access/nbtree/nbtutils.c | |
parent | 88d94a11bb2b0d3dac95325d8d6056e4bac8b4b8 (diff) | |
download | postgresql-09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272.tar.gz postgresql-09cb5c0e7d6fbc9dee26dc429e4fc0f2a88e5272.zip | |
Rewrite btree index scans to work a page at a time in all cases (both
btgettuple and btgetmulti). This eliminates the problem of "re-finding" the
exact stopping point, since the stopping point is effectively always a page
boundary, and index items are never moved across pre-existing page boundaries.
A small penalty is that the keys_are_unique optimization is effectively
disabled (and, therefore, is removed in this patch), causing us to apply
_bt_checkkeys() to at least one more tuple than necessary when looking up a
unique key. However, the advantages for non-unique cases seem great enough to
accept this tradeoff. Aside from simplifying and (sometimes) speeding up the
indexscan code, this will allow us to reimplement btbulkdelete as a largely
sequential scan instead of index-order traversal, thereby significantly
reducing the cost of VACUUM. Those changes will come in a separate patch.
Original patch by Heikki Linnakangas, rework by Tom Lane.
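To make the page-at-a-time idea concrete, here is a minimal standalone C sketch (not PostgreSQL code; every type and name below is invented for illustration). While a "page" is held, the scan copies all matching items into a private batch array; the lock can then be dropped and the caller drains the batch, so the scan never has to re-find its exact position inside a page — only the page boundary (the right sibling) is remembered.

```c
#include <stdbool.h>
#include <stdio.h>

#define MAX_ITEMS_PER_PAGE 8

/* Hypothetical, simplified stand-ins for an index page and scan state. */
typedef struct Page
{
    int     nitems;
    int     keys[MAX_ITEMS_PER_PAGE];   /* index keys, in sorted order */
    int     next_page;                  /* -1 means no right sibling */
} Page;

typedef struct ScanPos
{
    int     batch[MAX_ITEMS_PER_PAGE];  /* matches copied out of the page */
    int     nbatch;
    int     next_item;                  /* next batch slot to hand back */
    int     next_page;                  /* where to continue afterwards */
} ScanPos;

/*
 * Read one whole page while "holding the lock": copy every key in
 * [lo, hi] into the private batch, then remember only the right
 * sibling.  Nothing on this page ever needs to be re-found later.
 */
static void
read_page_batch(const Page *page, int lo, int hi, ScanPos *pos)
{
    pos->nbatch = 0;
    pos->next_item = 0;
    for (int i = 0; i < page->nitems; i++)
    {
        if (page->keys[i] >= lo && page->keys[i] <= hi)
            pos->batch[pos->nbatch++] = page->keys[i];
    }
    pos->next_page = page->next_page;   /* resume at a page boundary */
}

/* Return the next batched match, or false when the batch is drained. */
static bool
get_next(ScanPos *pos, int *key)
{
    if (pos->next_item >= pos->nbatch)
        return false;
    *key = pos->batch[pos->next_item++];
    return true;
}

int
main(void)
{
    Page    pages[2] = {
        {4, {10, 20, 30, 40, 0, 0, 0, 0}, 1},
        {3, {50, 60, 70, 0, 0, 0, 0, 0}, -1},
    };
    ScanPos pos;
    int     page_no = 0;

    /* Scan for keys in [20, 60], one page at a time. */
    while (page_no >= 0)
    {
        int     key;

        read_page_batch(&pages[page_no], 20, 60, &pos);
        while (get_next(&pos, &key))
            printf("match: %d\n", key);
        page_no = pos.next_page;
    }
    return 0;
}
```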
Diffstat (limited to 'src/backend/access/nbtree/nbtutils.c')
-rw-r--r-- | src/backend/access/nbtree/nbtutils.c | 110 |
1 file changed, 87 insertions, 23 deletions
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index b8b9ff780ea..74d06052587 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.72 2006/03/05 15:58:21 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.73 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -196,11 +196,6 @@ _bt_freestack(BTStack stack)
  * Again though, only keys with RHS datatype equal to the index datatype
  * can be checked for contradictions.
  *
- * Furthermore, we detect the case where the index is unique and we have
- * equality quals for all columns.  In this case there can be at most one
- * (visible) matching tuple.  index_getnext uses this to avoid uselessly
- * continuing the scan after finding one match.
- *
  * Row comparison keys are treated the same as comparisons to nondefault
  * datatypes: we just transfer them into the preprocessed array without any
  * editorialization.  We can treat them the same as an ordinary inequality
@@ -216,7 +211,6 @@ _bt_freestack(BTStack stack)
 void
 _bt_preprocess_keys(IndexScanDesc scan)
 {
-    Relation    relation = scan->indexRelation;
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     int         numberOfKeys = scan->numberOfKeys;
     int         new_numberOfKeys;
@@ -234,7 +228,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
     /* initialize result variables */
     so->qual_ok = true;
     so->numberOfKeys = 0;
-    scan->keys_are_unique = false;
 
     if (numberOfKeys < 1)
         return;                 /* done if qual-less scan */
@@ -256,13 +249,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
          */
         if (cur->sk_flags & SK_ISNULL)
             so->qual_ok = false;
-        else if (relation->rd_index->indisunique &&
-                 relation->rd_rel->relnatts == 1)
-        {
-            /* it's a unique index, do we have an equality qual? */
-            if (cur->sk_strategy == BTEqualStrategyNumber)
-                scan->keys_are_unique = true;
-        }
         memcpy(outkeys, inkeys, sizeof(ScanKeyData));
         so->numberOfKeys = 1;
         /* We can mark the qual as required if it's for first index col */
@@ -464,14 +450,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
     }
 
     so->numberOfKeys = new_numberOfKeys;
-
-    /*
-     * If unique index and we have equality keys for all columns, set
-     * keys_are_unique flag for higher levels.
-     */
-    if (relation->rd_index->indisunique &&
-        relation->rd_rel->relnatts == numberOfEqualCols)
-        scan->keys_are_unique = true;
 }
 
 /*
@@ -826,3 +804,89 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
 
     return result;
 }
+
+/*
+ * _bt_killitems - set LP_DELETE bit for items an indexscan caller has
+ * told us were killed
+ *
+ * scan->so contains information about the current page and killed tuples
+ * thereon (generally, this should only be called if so->numKilled > 0).
+ *
+ * The caller must have pin on so->currPos.buf, but may or may not have
+ * read-lock, as indicated by haveLock.  Note that we assume read-lock
+ * is sufficient for setting LP_DELETE hint bits.
+ *
+ * We match items by heap TID before assuming they are the right ones to
+ * delete.  We cope with cases where items have moved right due to insertions.
+ * If an item has moved off the current page due to a split, we'll fail to
+ * find it and do nothing (this is not an error case --- we assume the item
+ * will eventually get marked in a future indexscan).  Likewise, if the item
+ * has moved left due to deletions or disappeared itself, we'll not find it,
+ * but these cases are not worth optimizing.  (Since deletions are only done
+ * by VACUUM, any deletion makes it highly likely that the dead item has been
+ * removed itself, and therefore searching left is not worthwhile.)
+ */
+void
+_bt_killitems(IndexScanDesc scan, bool haveLock)
+{
+    BTScanOpaque so = (BTScanOpaque) scan->opaque;
+    Page        page;
+    BTPageOpaque opaque;
+    OffsetNumber minoff;
+    OffsetNumber maxoff;
+    int         i;
+    bool        killedsomething = false;
+
+    Assert(BufferIsValid(so->currPos.buf));
+
+    if (!haveLock)
+        LockBuffer(so->currPos.buf, BT_READ);
+
+    page = BufferGetPage(so->currPos.buf);
+    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+    minoff = P_FIRSTDATAKEY(opaque);
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    for (i = 0; i < so->numKilled; i++)
+    {
+        int         itemIndex = so->killedItems[i];
+        BTScanPosItem *kitem = &so->currPos.items[itemIndex];
+        OffsetNumber offnum = kitem->indexOffset;
+
+        Assert(itemIndex >= so->currPos.firstItem &&
+               itemIndex <= so->currPos.lastItem);
+        if (offnum < minoff)
+            continue;           /* pure paranoia */
+        while (offnum <= maxoff)
+        {
+            ItemId      iid = PageGetItemId(page, offnum);
+            IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);
+
+            if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+            {
+                /* found the item */
+                iid->lp_flags |= LP_DELETE;
+                killedsomething = true;
+                break;          /* out of inner search loop */
+            }
+            offnum = OffsetNumberNext(offnum);
+        }
+    }
+
+    /*
+     * Since this can be redone later if needed, it's treated the same
+     * as a commit-hint-bit status update for heap tuples: we mark the
+     * buffer dirty but don't make a WAL log entry.
+     */
+    if (killedsomething)
+        SetBufferCommitInfoNeedsSave(so->currPos.buf);
+
+    if (!haveLock)
+        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+
+    /*
+     * Always reset the scan state, so we don't look for same items
+     * on other pages.
+     */
+    so->numKilled = 0;
+}
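The heart of _bt_killitems is the re-matching loop: insertions can shift items to the right within a page, so the remembered offset is only a starting point, and an item is confirmed by comparing heap TIDs before its line pointer is flagged. The standalone sketch below (simplified types and invented names, not PostgreSQL code) illustrates that search-right-and-match-by-TID pattern; if the item was moved to another page by a split, it is simply not found and nothing is marked.

```c
#include <stdbool.h>
#include <stdio.h>

#define FLAG_DEAD 0x01

/* Hypothetical, simplified stand-in for index items on one page. */
typedef struct IndexItem
{
    int             heap_tid;   /* pretend TID: just an integer here */
    unsigned int    flags;
} IndexItem;

/*
 * Mark the item with the given heap TID as dead, starting the search at
 * the offset remembered by the scan.  Items may have moved right since
 * that offset was recorded (insertions), so walk right until the TID
 * matches.  If the item moved off the page entirely (page split), find
 * nothing and do nothing.
 */
static bool
kill_item(IndexItem *items, int nitems, int remembered_off, int heap_tid)
{
    for (int off = remembered_off; off < nitems; off++)
    {
        if (items[off].heap_tid == heap_tid)
        {
            items[off].flags |= FLAG_DEAD;  /* hint only; redoable later */
            return true;
        }
    }
    return false;               /* moved to another page; not an error */
}

int
main(void)
{
    /* TID 42 was remembered at offset 1, but an insertion shifted it right. */
    IndexItem   items[] = {
        {10, 0}, {17, 0}, {42, 0}, {58, 0},
    };
    int         nitems = 4;

    if (kill_item(items, nitems, 1, 42))
        printf("marked TID 42 dead at its new position\n");
    if (!kill_item(items, nitems, 0, 99))
        printf("TID 99 not on this page; nothing marked\n");
    return 0;
}
```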