Diffstat (limited to 'src/backend/access/nbtree/nbtree.c')
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 504
1 file changed, 252 insertions, 252 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b714296c8f7..b1426456241 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.81 2001/05/18 21:24:17 momjian Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.82 2001/07/15 22:48:16 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,11 +28,27 @@
 #include "storage/sinval.h"
 #include "access/xlogutils.h"
 
-bool        BuildingBtree = false;  /* see comment in btbuild() */
-bool        FastBuild = true;       /* use sort/build instead */
-                                    /* of insertion build */
+/* Working state for btbuild and its callback */
+typedef struct
+{
+    bool        usefast;
+    bool        isUnique;
+    bool        haveDead;
+    Relation    heapRel;
+    BTSpool    *spool;
+
+    /*
+     * spool2 is needed only when the index is an unique index. Dead
+     * tuples are put into spool2 instead of spool in order to avoid
+     * uniqueness check.
+     */
+    BTSpool    *spool2;
+    double      indtuples;
+} BTBuildState;
+
+bool        BuildingBtree = false;  /* see comment in btbuild() */
+bool        FastBuild = true;       /* use SORT instead of insertion build */
 
 /*
  * TEMPORARY FLAG FOR TESTING NEW FIX TREE
@@ -41,6 +57,29 @@ bool        FastBuild = true;       /* use sort/build instead */
 bool        FixBTree = true;
 
 static void _bt_restscan(IndexScanDesc scan);
+static void btbuildCallback(Relation index,
+                            HeapTuple htup,
+                            Datum *attdata,
+                            char *nulls,
+                            bool tupleIsAlive,
+                            void *state);
+
+
+/*
+ * AtEOXact_nbtree() --- clean up nbtree subsystem at xact abort or commit.
+ */
+void
+AtEOXact_nbtree(void)
+{
+    /*
+     * Note: these actions should only be necessary during xact abort; but
+     * they can't hurt during a commit.
+     */
+
+    /* If we were building a btree, we ain't anymore. */
+    BuildingBtree = false;
+}
+
 
 /*
  * btbuild() -- build a new btree index.
@@ -56,42 +95,10 @@ btbuild(PG_FUNCTION_ARGS)
     Relation    heap = (Relation) PG_GETARG_POINTER(0);
     Relation    index = (Relation) PG_GETARG_POINTER(1);
     IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
-    Node       *oldPred = (Node *) PG_GETARG_POINTER(3);
-#ifdef NOT_USED
-    IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4);
-#endif
-    HeapScanDesc hscan;
-    HeapTuple   htup;
-    IndexTuple  itup;
-    TupleDesc   htupdesc,
-                itupdesc;
-    Datum       attdata[INDEX_MAX_KEYS];
-    char        nulls[INDEX_MAX_KEYS];
-    double      nhtups,
-                nitups;
-    Node       *pred = indexInfo->ii_Predicate;
-#ifndef OMIT_PARTIAL_INDEX
-    TupleTable  tupleTable;
-    TupleTableSlot *slot;
-#endif
-    ExprContext *econtext;
-    InsertIndexResult res = NULL;
-    BTSpool    *spool = NULL;
-    BTItem      btitem;
-    bool        usefast;
-    Snapshot    snapshot;
-    TransactionId XmaxRecent;
+    double      reltuples;
+    BTBuildState buildstate;
 
-    /*
-     * spool2 is needed only when the index is an unique index. Dead
-     * tuples are put into spool2 instead of spool in order to avoid
-     * uniqueness check.
-     */
-    BTSpool    *spool2 = NULL;
-    bool        tupleIsAlive;
-    int         dead_count;
-
-    /* note that this is a new btree */
+    /* set flag to disable locking */
     BuildingBtree = true;
 
     /*
@@ -100,220 +107,63 @@ btbuild(PG_FUNCTION_ARGS)
      * look harder at this.  (there is some kind of incremental processing
      * going on there.) -- pma 08/29/95
      */
-    usefast = (FastBuild && IsNormalProcessingMode());
+    buildstate.usefast = (FastBuild && IsNormalProcessingMode());
+    buildstate.isUnique = indexInfo->ii_Unique;
+    buildstate.haveDead = false;
+    buildstate.heapRel = heap;
+    buildstate.spool = NULL;
+    buildstate.spool2 = NULL;
+    buildstate.indtuples = 0;
 
 #ifdef BTREE_BUILD_STATS
     if (Show_btree_build_stats)
         ResetUsage();
 #endif   /* BTREE_BUILD_STATS */
 
-    /* initialize the btree index metadata page (if this is a new index) */
-    if (oldPred == NULL)
-        _bt_metapinit(index);
-
-    /* get tuple descriptors for heap and index relations */
-    htupdesc = RelationGetDescr(heap);
-    itupdesc = RelationGetDescr(index);
-
     /*
-     * If this is a predicate (partial) index, we will need to evaluate
-     * the predicate using ExecQual, which requires the current tuple to
-     * be in a slot of a TupleTable.  In addition, ExecQual must have an
-     * ExprContext referring to that slot.  Here, we initialize dummy
-     * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92
-     *
-     * We construct the ExprContext anyway since we need a per-tuple
-     * temporary memory context for function evaluation -- tgl July 00
+     * We expect to be called exactly once for any index relation. If
+     * that's not the case, big trouble's what we have.
      */
-#ifndef OMIT_PARTIAL_INDEX
-    if (pred != NULL || oldPred != NULL)
-    {
-        tupleTable = ExecCreateTupleTable(1);
-        slot = ExecAllocTableSlot(tupleTable);
-        ExecSetSlotDescriptor(slot, htupdesc, false);
-
-        /*
-         * we never want to use sort/build if we are extending an existing
-         * partial index -- it works by inserting the newly-qualifying
-         * tuples into the existing index. (sort/build would overwrite the
-         * existing index with one consisting of the newly-qualifying
-         * tuples.)
-         */
-        usefast = false;
-    }
-    else
-    {
-        tupleTable = NULL;
-        slot = NULL;
-    }
-    econtext = MakeExprContext(slot, TransactionCommandContext);
-#else
-    econtext = MakeExprContext(NULL, TransactionCommandContext);
-#endif   /* OMIT_PARTIAL_INDEX */
+    if (RelationGetNumberOfBlocks(index) != 0)
+        elog(ERROR, "%s already contains data",
+             RelationGetRelationName(index));
 
-    /* build the index */
-    nhtups = nitups = 0.0;
+    /* initialize the btree index metadata page */
+    _bt_metapinit(index);
 
-    if (usefast)
+    if (buildstate.usefast)
     {
-        spool = _bt_spoolinit(index, indexInfo->ii_Unique);
-
+        buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique);
+
         /*
-         * Different from spool,the uniqueness isn't checked for spool2.
+         * Different from spool, the uniqueness isn't checked for spool2.
          */
         if (indexInfo->ii_Unique)
-            spool2 = _bt_spoolinit(index, false);
+            buildstate.spool2 = _bt_spoolinit(index, false);
     }
 
-    /* start a heap scan */
-    dead_count = 0;
-    snapshot = (IsBootstrapProcessingMode() ? SnapshotNow : SnapshotAny);
-    hscan = heap_beginscan(heap, 0, snapshot, 0, (ScanKey) NULL);
-    XmaxRecent = 0;
-    if (snapshot == SnapshotAny)
-        GetXmaxRecent(&XmaxRecent);
-
-    while (HeapTupleIsValid(htup = heap_getnext(hscan, 0)))
-    {
-        if (snapshot == SnapshotAny)
-        {
-            tupleIsAlive = HeapTupleSatisfiesNow(htup->t_data);
-            if (!tupleIsAlive)
-            {
-                if ((htup->t_data->t_infomask & HEAP_XMIN_INVALID) != 0)
-                    continue;
-                if (htup->t_data->t_infomask & HEAP_XMAX_COMMITTED &&
-                    htup->t_data->t_xmax < XmaxRecent)
-                    continue;
-            }
-        }
-        else
-            tupleIsAlive = true;
-
-        MemoryContextReset(econtext->ecxt_per_tuple_memory);
-
-        nhtups += 1.0;
-
-#ifndef OMIT_PARTIAL_INDEX
-
-        /*
-         * If oldPred != NULL, this is an EXTEND INDEX command, so skip
-         * this tuple if it was already in the existing partial index
-         */
-        if (oldPred != NULL)
-        {
-            slot->val = htup;
-            if (ExecQual((List *) oldPred, econtext, false))
-            {
-                nitups += 1.0;
-                continue;
-            }
-        }
-
-        /*
-         * Skip this tuple if it doesn't satisfy the partial-index
-         * predicate
-         */
-        if (pred != NULL)
-        {
-            slot->val = htup;
-            if (!ExecQual((List *) pred, econtext, false))
-                continue;
-        }
-#endif   /* OMIT_PARTIAL_INDEX */
-
-        nitups += 1.0;
-
-        /*
-         * For the current heap tuple, extract all the attributes we use
-         * in this index, and note which are null.
-         */
-        FormIndexDatum(indexInfo,
-                       htup,
-                       htupdesc,
-                       econtext->ecxt_per_tuple_memory,
-                       attdata,
-                       nulls);
-
-        /* form an index tuple and point it at the heap tuple */
-        itup = index_formtuple(itupdesc, attdata, nulls);
-
-        /*
-         * If the single index key is null, we don't insert it into the
-         * index.  Btrees support scans on <, <=, =, >=, and >. Relational
-         * algebra says that A op B (where op is one of the operators
-         * above) returns null if either A or B is null.  This means that
-         * no qualification used in an index scan could ever return true
-         * on a null attribute.  It also means that indices can't be used
-         * by ISNULL or NOTNULL scans, but that's an artifact of the
-         * strategy map architecture chosen in 1986, not of the way nulls
-         * are handled here.
-         */
-
-        /*
-         * New comments: NULLs handling. While we can't do NULL
-         * comparison, we can follow simple rule for ordering items on
-         * btree pages - NULLs greater NOT_NULLs and NULL = NULL is TRUE.
-         * Sure, it's just rule for placing/finding items and no more -
-         * keytest'll return FALSE for a = 5 for items having 'a' isNULL.
-         * Look at _bt_compare for how it works. - vadim 03/23/97
-         *
-         * if (itup->t_info & INDEX_NULL_MASK) { pfree(itup); continue; }
-         */
-
-        itup->t_tid = htup->t_self;
-        btitem = _bt_formitem(itup);
-
-        /*
-         * if we are doing bottom-up btree build, we insert the index into
-         * a spool file for subsequent processing.  otherwise, we insert
-         * into the btree.
-         */
-        if (usefast)
-        {
-            if (tupleIsAlive || !spool2)
-                _bt_spool(btitem, spool);
-            else
-/* dead tuples are put into spool2 */
-            {
-                dead_count++;
-                _bt_spool(btitem, spool2);
-            }
-        }
-        else
-            res = _bt_doinsert(index, btitem, indexInfo->ii_Unique, heap);
-
-        pfree(btitem);
-        pfree(itup);
-        if (res)
-            pfree(res);
-    }
+    /* do the heap scan */
+    reltuples = IndexBuildHeapScan(heap, index, indexInfo,
+                                   btbuildCallback, (void *) &buildstate);
 
     /* okay, all heap tuples are indexed */
-    heap_endscan(hscan);
-
-    if (spool2 && !dead_count) /* spool2 was found to be unnecessary */
+    if (buildstate.spool2 && !buildstate.haveDead)
     {
-        _bt_spooldestroy(spool2);
-        spool2 = NULL;
+        /* spool2 turns out to be unnecessary */
+        _bt_spooldestroy(buildstate.spool2);
+        buildstate.spool2 = NULL;
     }
 
-#ifndef OMIT_PARTIAL_INDEX
-    if (pred != NULL || oldPred != NULL)
-        ExecDropTupleTable(tupleTable, true);
-#endif   /* OMIT_PARTIAL_INDEX */
-    FreeExprContext(econtext);
-
     /*
      * if we are doing bottom-up btree build, finish the build by (1)
      * completing the sort of the spool file, (2) inserting the sorted
     * tuples into btree pages and (3) building the upper levels.
      */
-    if (usefast)
+    if (buildstate.usefast)
     {
-        _bt_leafbuild(spool, spool2);
-        _bt_spooldestroy(spool);
-        if (spool2)
-            _bt_spooldestroy(spool2);
+        _bt_leafbuild(buildstate.spool, buildstate.spool2);
+        _bt_spooldestroy(buildstate.spool);
+        if (buildstate.spool2)
+            _bt_spooldestroy(buildstate.spool2);
     }
 
 #ifdef BTREE_BUILD_STATS
@@ -325,6 +175,9 @@ btbuild(PG_FUNCTION_ARGS)
     }
 #endif   /* BTREE_BUILD_STATS */
 
+    /* all done */
+    BuildingBtree = false;
+
     /*
     * Since we just counted the tuples in the heap, we update its stats
     * in pg_class to guarantee that the planner takes advantage of the
@@ -343,20 +196,63 @@ btbuild(PG_FUNCTION_ARGS)
         heap_close(heap, NoLock);
         index_close(index);
 
-        UpdateStats(hrelid, nhtups);
-        UpdateStats(irelid, nitups);
-        if (oldPred != NULL)
+        UpdateStats(hrelid, reltuples);
+        UpdateStats(irelid, buildstate.indtuples);
+    }
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * Per-tuple callback from IndexBuildHeapScan
+ */
+static void
+btbuildCallback(Relation index,
+                HeapTuple htup,
+                Datum *attdata,
+                char *nulls,
+                bool tupleIsAlive,
+                void *state)
+{
+    BTBuildState *buildstate = (BTBuildState *) state;
+    IndexTuple  itup;
+    BTItem      btitem;
+    InsertIndexResult res;
+
+    /* form an index tuple and point it at the heap tuple */
+    itup = index_formtuple(RelationGetDescr(index), attdata, nulls);
+    itup->t_tid = htup->t_self;
+
+    btitem = _bt_formitem(itup);
+
+    /*
+     * if we are doing bottom-up btree build, we insert the index into
+     * a spool file for subsequent processing.  otherwise, we insert
+     * into the btree.
+     */
+    if (buildstate->usefast)
+    {
+        if (tupleIsAlive || buildstate->spool2 == NULL)
+            _bt_spool(btitem, buildstate->spool);
+        else
         {
-            if (nitups == nhtups)
-                pred = NULL;
-            UpdateIndexPredicate(irelid, oldPred, pred);
+            /* dead tuples are put into spool2 */
+            buildstate->haveDead = true;
+            _bt_spool(btitem, buildstate->spool2);
         }
     }
+    else
+    {
+        res = _bt_doinsert(index, btitem,
+                           buildstate->isUnique, buildstate->heapRel);
+        if (res)
+            pfree(res);
+    }
 
-    /* all done */
-    BuildingBtree = false;
+    buildstate->indtuples += 1;
 
-    PG_RETURN_VOID();
+    pfree(btitem);
+    pfree(itup);
 }
 
 /*
@@ -423,8 +319,10 @@ btgettuple(PG_FUNCTION_ARGS)
 
     /*
      * Save heap TID to use it in _bt_restscan.  Then release the read
-     * lock on the buffer so that we aren't blocking other backends. NOTE:
-     * we do keep the pin on the buffer!
+     * lock on the buffer so that we aren't blocking other backends.
+     *
+     * NOTE: we do keep the pin on the buffer!  This is essential to ensure
+     * that someone else doesn't delete the index entry we are stopped on.
      */
     if (res)
     {
@@ -451,9 +349,6 @@ btbeginscan(PG_FUNCTION_ARGS)
     /* get the scan */
     scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
 
-    /* register scan in case we change pages it's using */
-    _bt_regscan(scan);
-
     PG_RETURN_POINTER(scan);
 }
 
@@ -571,8 +466,6 @@ btendscan(PG_FUNCTION_ARGS)
         pfree(so->keyData);
     pfree(so);
 
-    _bt_dropscan(scan);
-
     PG_RETURN_VOID();
 }
 
@@ -640,20 +533,127 @@ btrestrpos(PG_FUNCTION_ARGS)
     PG_RETURN_VOID();
 }
 
-/* stubs */
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
 Datum
-btdelete(PG_FUNCTION_ARGS)
+btbulkdelete(PG_FUNCTION_ARGS)
 {
     Relation    rel = (Relation) PG_GETARG_POINTER(0);
-    ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1);
+    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
+    void       *callback_state = (void *) PG_GETARG_POINTER(2);
+    IndexBulkDeleteResult *result;
+    BlockNumber num_pages;
+    double      tuples_removed;
+    double      num_index_tuples;
+    RetrieveIndexResult res;
+    IndexScanDesc scan;
+    BTScanOpaque so;
+    ItemPointer current;
+
+    tuples_removed = 0;
+    num_index_tuples = 0;
+
+    /*
+     * We use a standard IndexScanDesc scan object, but to speed up the
+     * loop, we skip most of the wrapper layers of index_getnext and
+     * instead call _bt_step directly.  This implies holding buffer lock
+     * on a target page throughout the loop over the page's tuples.
+     * Initially, we have a read lock acquired by _bt_step when we stepped
+     * onto the page.  If we find a tuple we need to delete, we trade in
+     * the read lock for an exclusive write lock; after that, we hold the
+     * write lock until we step off the page (fortunately, _bt_relbuf
+     * doesn't care which kind of lock it's releasing).  This should
+     * minimize the amount of work needed per page.
+     */
+    scan = index_beginscan(rel, false, 0, (ScanKey) NULL);
+    so = (BTScanOpaque) scan->opaque;
+    current = &(scan->currentItemData);
 
-    /* adjust any active scans that will be affected by this deletion */
-    _bt_adjscans(rel, tid);
+    /* Use _bt_first to get started, then _bt_step to remaining tuples */
+    res = _bt_first(scan, ForwardScanDirection);
 
-    /* delete the data from the page */
-    _bt_pagedel(rel, tid);
+    if (res != NULL)
+    {
+        Buffer      buf;
+        BlockNumber lockedBlock = InvalidBlockNumber;
 
-    PG_RETURN_VOID();
+        pfree(res);
+        /* we have the buffer pinned and locked */
+        buf = so->btso_curbuf;
+        Assert(BufferIsValid(buf));
+
+        do
+        {
+            Page        page;
+            BlockNumber blkno;
+            OffsetNumber offnum;
+            BTItem      btitem;
+            IndexTuple  itup;
+            ItemPointer htup;
+
+            /* current is the next index tuple */
+            blkno = ItemPointerGetBlockNumber(current);
+            offnum = ItemPointerGetOffsetNumber(current);
+            page = BufferGetPage(buf);
+            btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+            itup = &btitem->bti_itup;
+            htup = &(itup->t_tid);
+
+            if (callback(htup, callback_state))
+            {
+                /*
+                 * If this is first deletion on this page, trade in read
+                 * lock for a really-exclusive write lock.  Then, step
+                 * back one and re-examine the item, because someone else
+                 * might have inserted an item while we weren't holding
+                 * the lock!
+                 */
+                if (blkno != lockedBlock)
+                {
+                    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                    LockBufferForCleanup(buf);
+                    lockedBlock = blkno;
+                }
+                else
+                {
+                    /* Delete the item from the page */
+                    _bt_itemdel(rel, buf, current);
+
+                    /* Mark buffer dirty, but keep the lock and pin */
+                    WriteNoReleaseBuffer(buf);
+
+                    tuples_removed += 1;
+                }
+
+                /*
+                 * We need to back up the scan one item so that the next
+                 * cycle will re-examine the same offnum on this page.
+                 *
+                 * For now, just hack the current-item index.  Will need
+                 * to be smarter when deletion includes removal of empty
+                 * index pages.
+                 */
+                current->ip_posid--;
+            }
+            else
+                num_index_tuples += 1;
+        } while (_bt_step(scan, &buf, ForwardScanDirection));
+    }
+
+    index_endscan(scan);
+
+    /* return statistics */
+    num_pages = RelationGetNumberOfBlocks(rel);
+
+    result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
+    result->num_pages = num_pages;
+    result->tuples_removed = tuples_removed;
+    result->num_index_tuples = num_index_tuples;
+
+    PG_RETURN_POINTER(result);
 }
 
 /*
@@ -676,7 +676,7 @@ _bt_restscan(IndexScanDesc scan)
 
     /*
     * Get back the read lock we were holding on the buffer. (We still
-     * have a reference-count pin on it, though.)
+     * have a reference-count pin on it, so need not get that.)
     */
     LockBuffer(buf, BT_READ);
 
@@ -729,7 +729,7 @@
                  "\n\tRecreate index %s.", RelationGetRelationName(rel));
 
         blkno = opaque->btpo_next;
-        _bt_relbuf(rel, buf, BT_READ);
+        _bt_relbuf(rel, buf);
         buf = _bt_getbuf(rel, blkno, BT_READ);
         page = BufferGetPage(buf);
         maxoff = PageGetMaxOffsetNumber(page);
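The heart of this commit is an inversion of control: btbuild no longer runs its own heap scan (with hand-rolled visibility checks and partial-index handling), but instead passes a per-tuple callback and an opaque state pointer to the generic IndexBuildHeapScan routine. Below is a minimal, self-contained C sketch of that pattern, including the spool2 idea of routing known-dead tuples around the uniqueness check; the names in it (build_state, scan_heap, build_cb) are invented for illustration and are not PostgreSQL APIs.

#include <stdio.h>
#include <stdbool.h>

typedef void (*build_callback) (int tuple_id, bool is_alive, void *state);

/* Analogue of BTBuildState: per-build state threaded through the callback */
typedef struct
{
    bool        have_dead;      /* saw any dead tuples? */
    double      indtuples;      /* tuples handed to the index */
} build_state;

/* Analogue of IndexBuildHeapScan: owns the scan loop and the visibility
 * verdict, and invokes the AM-supplied callback once per tuple. */
static double
scan_heap(const bool *alive, int ntuples, build_callback cb, void *state)
{
    for (int i = 0; i < ntuples; i++)
        cb(i, alive[i], state);
    return (double) ntuples;    /* heap tuple count, like reltuples */
}

/* Analogue of btbuildCallback: dead tuples are routed to a second spool,
 * mirroring how nbtree sends them to spool2 to bypass the uniqueness check. */
static void
build_cb(int tuple_id, bool is_alive, void *state)
{
    build_state *bs = (build_state *) state;

    if (is_alive)
        printf("spool  <- tuple %d\n", tuple_id);
    else
    {
        bs->have_dead = true;
        printf("spool2 <- tuple %d (dead, bypasses uniqueness check)\n",
               tuple_id);
    }
    bs->indtuples += 1;
}

int
main(void)
{
    bool        alive[] = {true, false, true, true};
    build_state bs = {false, 0.0};
    double      reltuples = scan_heap(alive, 4, build_cb, &bs);

    printf("heap tuples %.0f, index tuples %.0f, had dead: %s\n",
           reltuples, bs.indtuples, bs.have_dead ? "yes" : "no");
    return 0;
}

The payoff is visible in the diff itself: all the removed scan, snapshot, and predicate code in btbuild collapses into the single IndexBuildHeapScan call, and every other index AM can now share the same scan loop.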
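btgettuple's expanded comment distinguishes two levels of buffer protection: the content lock, which is released between calls so other backends are not blocked, and the pin (a reference count), which is kept so the page stays valid and, because btbulkdelete takes its exclusive lock via LockBufferForCleanup (which waits out other pins), the entry the scan is stopped on cannot be deleted from under it. A rough sketch of that pin/lock split, with an explicit reference count standing in for the pin and a POSIX rwlock for the content lock (all names invented):

#include <pthread.h>
#include <stdio.h>

/* A toy "buffer": refcount models the pin (in the real buffer manager the
 * page can't be recycled while it is > 0); the rwlock models the content
 * lock that serializes readers and writers. */
typedef struct
{
    int         refcount;
    pthread_rwlock_t lock;
    pthread_mutex_t meta;       /* protects refcount itself */
} buffer;

static void
pin(buffer *b)
{
    pthread_mutex_lock(&b->meta);
    b->refcount++;
    pthread_mutex_unlock(&b->meta);
}

static void
unpin(buffer *b)
{
    pthread_mutex_lock(&b->meta);
    b->refcount--;
    pthread_mutex_unlock(&b->meta);
}

/* Pause a scan: drop the content lock, keep the pin, as btgettuple does
 * between calls so other backends aren't blocked. */
static void
pause_scan(buffer *b)
{
    pthread_rwlock_unlock(&b->lock);    /* pin (refcount) still held */
}

/* Resume: reacquire the read lock, as _bt_restscan does. */
static void
resume_scan(buffer *b)
{
    pthread_rwlock_rdlock(&b->lock);
}

int
main(void)
{
    buffer      b = {0, PTHREAD_RWLOCK_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};

    pin(&b);
    pthread_rwlock_rdlock(&b.lock);     /* scan reads the page */
    pause_scan(&b);                     /* between btgettuple calls */
    resume_scan(&b);
    pthread_rwlock_unlock(&b.lock);
    unpin(&b);
    printf("refcount back to %d\n", b.refcount);
    return 0;
}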
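btbulkdelete's locking dance is worth spelling out: a shared buffer lock cannot be upgraded atomically, so on the first deletable item of each page the code drops the read lock, acquires the exclusive lock, and then backs the scan up one position (current->ip_posid--) so the same item is re-examined under the new lock, since the page may have changed during the unlocked window. The following self-contained C sketch reproduces that control flow with a POSIX rwlock over an in-memory "page"; everything in it is invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t page_lock = PTHREAD_RWLOCK_INITIALIZER;
static int  page_items[] = {10, 20, 30, 40};
static int  page_nitems = 4;

/* stand-in for the VACUUM "is this tuple being deleted?" callback */
static bool
deletable(int item)
{
    return item % 20 == 0;
}

static void
bulk_delete_page(void)
{
    bool        have_write_lock = false;

    pthread_rwlock_rdlock(&page_lock);
    for (int off = 0; off < page_nitems; off++)
    {
        if (!deletable(page_items[off]))
            continue;

        if (!have_write_lock)
        {
            /* First deletion on this page: trade the read lock in for a
             * write lock.  Like buffer locks, rwlocks cannot be upgraded
             * atomically, so there is a window with no lock held... */
            pthread_rwlock_unlock(&page_lock);
            pthread_rwlock_wrlock(&page_lock);
            have_write_lock = true;
            /* ...therefore back up and re-examine this offset, in case
             * the page changed while it was unlocked. */
            off--;
            continue;
        }

        /* Holding the write lock: delete the item (shift the rest down) */
        printf("deleting item %d at offset %d\n", page_items[off], off);
        for (int i = off; i < page_nitems - 1; i++)
            page_items[i] = page_items[i + 1];
        page_nitems--;
        off--;                  /* next iteration re-checks this offset */
    }
    pthread_rwlock_unlock(&page_lock);
}

int
main(void)
{
    bulk_delete_page();
    printf("%d items remain\n", page_nitems);
    return 0;
}

Compiled with -pthread, this deletes items 20 and 40 and leaves two items; the off-- after both the lock upgrade and the deletion is the analogue of the unconditional ip_posid-- hack in the patch.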