Diffstat (limited to 'src/backend/access/nbtree/nbtree.c')
-rw-r--r--  src/backend/access/nbtree/nbtree.c  504
1 file changed, 252 insertions(+), 252 deletions(-)
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b714296c8f7..b1426456241 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.81 2001/05/18 21:24:17 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.82 2001/07/15 22:48:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -28,11 +28,27 @@
#include "storage/sinval.h"
#include "access/xlogutils.h"
-bool BuildingBtree = false; /* see comment in btbuild() */
-bool FastBuild = true; /* use sort/build instead */
- /* of insertion build */
+/* Working state for btbuild and its callback */
+typedef struct
+{
+ bool usefast;
+ bool isUnique;
+ bool haveDead;
+ Relation heapRel;
+ BTSpool *spool;
+ /*
+ * spool2 is needed only when the index is a unique index. Dead
+ * tuples are put into spool2 instead of spool in order to avoid
+ * the uniqueness check.
+ */
+ BTSpool *spool2;
+ double indtuples;
+} BTBuildState;
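
The struct above replaces file-level globals with per-build working state that travels through the new callback interface as a void pointer. A minimal sketch of that pattern, with invented names standing in for the real declarations:

    #include <stdbool.h>
    #include <stdio.h>

    /* invented working state, shaped like BTBuildState above */
    typedef struct
    {
        bool        haveDead;       /* seen any dead tuples yet? */
        double      indtuples;      /* tuples accepted into the index */
    } DemoBuildState;

    /* per-tuple callback: the scan driver passes our state back as void * */
    static void
    demo_callback(int tupleId, bool tupleIsAlive, void *state)
    {
        DemoBuildState *bs = (DemoBuildState *) state;

        if (!tupleIsAlive)
            bs->haveDead = true;
        bs->indtuples += 1;
        printf("indexed tuple %d (alive=%d)\n", tupleId, (int) tupleIsAlive);
    }

    int
    main(void)
    {
        DemoBuildState buildstate = {false, 0};

        /* stand-in for IndexBuildHeapScan driving the callback per tuple */
        for (int tupleId = 1; tupleId <= 3; tupleId++)
            demo_callback(tupleId, tupleId != 2, &buildstate);

        printf("total=%.0f haveDead=%d\n", buildstate.indtuples,
               (int) buildstate.haveDead);
        return 0;
    }
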
+
+bool BuildingBtree = false; /* see comment in btbuild() */
+bool FastBuild = true; /* use SORT instead of insertion build */
/*
* TEMPORARY FLAG FOR TESTING NEW FIX TREE
@@ -41,6 +57,29 @@ bool FastBuild = true; /* use sort/build instead */
bool FixBTree = true;
static void _bt_restscan(IndexScanDesc scan);
+static void btbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state);
+
+
+/*
+ * AtEOXact_nbtree() --- clean up nbtree subsystem at xact abort or commit.
+ */
+void
+AtEOXact_nbtree(void)
+{
+ /*
+ * Note: these actions should only be necessary during xact abort; but
+ * they can't hurt during a commit.
+ */
+
+ /* If we were building a btree, we ain't anymore. */
+ BuildingBtree = false;
+}
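
This hook matters because elog(ERROR) leaves btbuild via longjmp, so the function's own closing "BuildingBtree = false" may never run; the xact-abort path has to reset the flag instead. A toy model of that control flow, using setjmp/longjmp directly rather than PostgreSQL's real error machinery (all names invented):

    #include <setjmp.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool building = false;   /* stand-in for BuildingBtree */
    static jmp_buf abort_env;       /* stand-in for the xact abort path */

    static void
    toy_elog_error(const char *msg)
    {
        printf("ERROR: %s\n", msg);
        longjmp(abort_env, 1);      /* never returns to the caller */
    }

    static void
    toy_build(bool fail)
    {
        building = true;            /* set flag to disable locking */
        if (fail)
            toy_elog_error("index already contains data");
        building = false;           /* normal completion resets the flag */
    }

    static void
    toy_AtEOXact(void)
    {
        /* harmless at commit, essential after an aborted build */
        building = false;
    }

    int
    main(void)
    {
        if (setjmp(abort_env) != 0)
            toy_AtEOXact();         /* abort path: clean up module state */
        else
            toy_build(true);

        printf("building=%d after abort cleanup\n", (int) building);
        return 0;
    }
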
+
/*
* btbuild() -- build a new btree index.
@@ -56,42 +95,10 @@ btbuild(PG_FUNCTION_ARGS)
Relation heap = (Relation) PG_GETARG_POINTER(0);
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
- Node *oldPred = (Node *) PG_GETARG_POINTER(3);
-#ifdef NOT_USED
- IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4);
-#endif
- HeapScanDesc hscan;
- HeapTuple htup;
- IndexTuple itup;
- TupleDesc htupdesc,
- itupdesc;
- Datum attdata[INDEX_MAX_KEYS];
- char nulls[INDEX_MAX_KEYS];
- double nhtups,
- nitups;
- Node *pred = indexInfo->ii_Predicate;
-#ifndef OMIT_PARTIAL_INDEX
- TupleTable tupleTable;
- TupleTableSlot *slot;
-#endif
- ExprContext *econtext;
- InsertIndexResult res = NULL;
- BTSpool *spool = NULL;
- BTItem btitem;
- bool usefast;
- Snapshot snapshot;
- TransactionId XmaxRecent;
+ double reltuples;
+ BTBuildState buildstate;
- /*
- * spool2 is needed only when the index is an unique index. Dead
- * tuples are put into spool2 instead of spool in order to avoid
- * uniqueness check.
- */
- BTSpool *spool2 = NULL;
- bool tupleIsAlive;
- int dead_count;
-
- /* note that this is a new btree */
+ /* set flag to disable locking */
BuildingBtree = true;
/*
@@ -100,220 +107,63 @@ btbuild(PG_FUNCTION_ARGS)
* look harder at this. (there is some kind of incremental processing
* going on there.) -- pma 08/29/95
*/
- usefast = (FastBuild && IsNormalProcessingMode());
+ buildstate.usefast = (FastBuild && IsNormalProcessingMode());
+ buildstate.isUnique = indexInfo->ii_Unique;
+ buildstate.haveDead = false;
+ buildstate.heapRel = heap;
+ buildstate.spool = NULL;
+ buildstate.spool2 = NULL;
+ buildstate.indtuples = 0;
#ifdef BTREE_BUILD_STATS
if (Show_btree_build_stats)
ResetUsage();
#endif /* BTREE_BUILD_STATS */
- /* initialize the btree index metadata page (if this is a new index) */
- if (oldPred == NULL)
- _bt_metapinit(index);
-
- /* get tuple descriptors for heap and index relations */
- htupdesc = RelationGetDescr(heap);
- itupdesc = RelationGetDescr(index);
-
/*
- * If this is a predicate (partial) index, we will need to evaluate
- * the predicate using ExecQual, which requires the current tuple to
- * be in a slot of a TupleTable. In addition, ExecQual must have an
- * ExprContext referring to that slot. Here, we initialize dummy
- * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92
- *
- * We construct the ExprContext anyway since we need a per-tuple
- * temporary memory context for function evaluation -- tgl July 00
+ * We expect to be called exactly once for any index relation. If
+ * that's not the case, big trouble's what we have.
*/
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- {
- tupleTable = ExecCreateTupleTable(1);
- slot = ExecAllocTableSlot(tupleTable);
- ExecSetSlotDescriptor(slot, htupdesc, false);
-
- /*
- * we never want to use sort/build if we are extending an existing
- * partial index -- it works by inserting the newly-qualifying
- * tuples into the existing index. (sort/build would overwrite the
- * existing index with one consisting of the newly-qualifying
- * tuples.)
- */
- usefast = false;
- }
- else
- {
- tupleTable = NULL;
- slot = NULL;
- }
- econtext = MakeExprContext(slot, TransactionCommandContext);
-#else
- econtext = MakeExprContext(NULL, TransactionCommandContext);
-#endif /* OMIT_PARTIAL_INDEX */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "%s already contains data",
+ RelationGetRelationName(index));
- /* build the index */
- nhtups = nitups = 0.0;
+ /* initialize the btree index metadata page */
+ _bt_metapinit(index);
- if (usefast)
+ if (buildstate.usefast)
{
- spool = _bt_spoolinit(index, indexInfo->ii_Unique);
-
+ buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique);
/*
- * Different from spool,the uniqueness isn't checked for spool2.
+ * Unlike spool, spool2 isn't checked for uniqueness.
*/
if (indexInfo->ii_Unique)
- spool2 = _bt_spoolinit(index, false);
+ buildstate.spool2 = _bt_spoolinit(index, false);
}
- /* start a heap scan */
- dead_count = 0;
- snapshot = (IsBootstrapProcessingMode() ? SnapshotNow : SnapshotAny);
- hscan = heap_beginscan(heap, 0, snapshot, 0, (ScanKey) NULL);
- XmaxRecent = 0;
- if (snapshot == SnapshotAny)
- GetXmaxRecent(&XmaxRecent);
-
- while (HeapTupleIsValid(htup = heap_getnext(hscan, 0)))
- {
- if (snapshot == SnapshotAny)
- {
- tupleIsAlive = HeapTupleSatisfiesNow(htup->t_data);
- if (!tupleIsAlive)
- {
- if ((htup->t_data->t_infomask & HEAP_XMIN_INVALID) != 0)
- continue;
- if (htup->t_data->t_infomask & HEAP_XMAX_COMMITTED &&
- htup->t_data->t_xmax < XmaxRecent)
- continue;
- }
- }
- else
- tupleIsAlive = true;
-
- MemoryContextReset(econtext->ecxt_per_tuple_memory);
-
- nhtups += 1.0;
-
-#ifndef OMIT_PARTIAL_INDEX
-
- /*
- * If oldPred != NULL, this is an EXTEND INDEX command, so skip
- * this tuple if it was already in the existing partial index
- */
- if (oldPred != NULL)
- {
- slot->val = htup;
- if (ExecQual((List *) oldPred, econtext, false))
- {
- nitups += 1.0;
- continue;
- }
- }
-
- /*
- * Skip this tuple if it doesn't satisfy the partial-index
- * predicate
- */
- if (pred != NULL)
- {
- slot->val = htup;
- if (!ExecQual((List *) pred, econtext, false))
- continue;
- }
-#endif /* OMIT_PARTIAL_INDEX */
-
- nitups += 1.0;
-
- /*
- * For the current heap tuple, extract all the attributes we use
- * in this index, and note which are null.
- */
- FormIndexDatum(indexInfo,
- htup,
- htupdesc,
- econtext->ecxt_per_tuple_memory,
- attdata,
- nulls);
-
- /* form an index tuple and point it at the heap tuple */
- itup = index_formtuple(itupdesc, attdata, nulls);
-
- /*
- * If the single index key is null, we don't insert it into the
- * index. Btrees support scans on <, <=, =, >=, and >. Relational
- * algebra says that A op B (where op is one of the operators
- * above) returns null if either A or B is null. This means that
- * no qualification used in an index scan could ever return true
- * on a null attribute. It also means that indices can't be used
- * by ISNULL or NOTNULL scans, but that's an artifact of the
- * strategy map architecture chosen in 1986, not of the way nulls
- * are handled here.
- */
-
- /*
- * New comments: NULLs handling. While we can't do NULL
- * comparison, we can follow simple rule for ordering items on
- * btree pages - NULLs greater NOT_NULLs and NULL = NULL is TRUE.
- * Sure, it's just rule for placing/finding items and no more -
- * keytest'll return FALSE for a = 5 for items having 'a' isNULL.
- * Look at _bt_compare for how it works. - vadim 03/23/97
- *
- * if (itup->t_info & INDEX_NULL_MASK) { pfree(itup); continue; }
- */
-
- itup->t_tid = htup->t_self;
- btitem = _bt_formitem(itup);
-
- /*
- * if we are doing bottom-up btree build, we insert the index into
- * a spool file for subsequent processing. otherwise, we insert
- * into the btree.
- */
- if (usefast)
- {
- if (tupleIsAlive || !spool2)
- _bt_spool(btitem, spool);
- else
-/* dead tuples are put into spool2 */
- {
- dead_count++;
- _bt_spool(btitem, spool2);
- }
- }
- else
- res = _bt_doinsert(index, btitem, indexInfo->ii_Unique, heap);
-
- pfree(btitem);
- pfree(itup);
- if (res)
- pfree(res);
- }
+ /* do the heap scan */
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo,
+ btbuildCallback, (void *) &buildstate);
/* okay, all heap tuples are indexed */
- heap_endscan(hscan);
- if (spool2 && !dead_count) /* spool2 was found to be unnecessary */
+ if (buildstate.spool2 && !buildstate.haveDead)
{
- _bt_spooldestroy(spool2);
- spool2 = NULL;
+ /* spool2 turns out to be unnecessary */
+ _bt_spooldestroy(buildstate.spool2);
+ buildstate.spool2 = NULL;
}
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL)
- ExecDropTupleTable(tupleTable, true);
-#endif /* OMIT_PARTIAL_INDEX */
- FreeExprContext(econtext);
-
/*
* if we are doing bottom-up btree build, finish the build by (1)
* completing the sort of the spool file, (2) inserting the sorted
* tuples into btree pages and (3) building the upper levels.
*/
- if (usefast)
+ if (buildstate.usefast)
{
- _bt_leafbuild(spool, spool2);
- _bt_spooldestroy(spool);
- if (spool2)
- _bt_spooldestroy(spool2);
+ _bt_leafbuild(buildstate.spool, buildstate.spool2);
+ _bt_spooldestroy(buildstate.spool);
+ if (buildstate.spool2)
+ _bt_spooldestroy(buildstate.spool2);
}
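
To recap the two-spool design: the main spool has uniqueness enforced while it sorts, spool2 (dead tuples only) does not, and the leaf build merges the two sorted streams. A rough, array-based model of that rule, assuming the real work is done by tuplesort spools rather than arrays:

    #include <stdio.h>
    #include <stdlib.h>

    static int
    cmp_int(const void *a, const void *b)
    {
        return (*(const int *) a > *(const int *) b) -
               (*(const int *) a < *(const int *) b);
    }

    int
    main(void)
    {
        int     live[] = {42, 7, 19};   /* "spool": uniqueness enforced */
        int     dead[] = {7, 42};       /* "spool2": duplicates allowed */
        int     nlive = 3, ndead = 2;

        qsort(live, nlive, sizeof(int), cmp_int);
        qsort(dead, ndead, sizeof(int), cmp_int);

        /* the uniqueness check applies to live tuples only */
        for (int i = 1; i < nlive; i++)
            if (live[i] == live[i - 1])
            {
                fprintf(stderr, "ERROR: duplicate key %d\n", live[i]);
                return 1;
            }

        /* merge both sorted streams into the "leaf level" */
        int     i = 0, j = 0;
        printf("leaf order:");
        while (i < nlive || j < ndead)
        {
            if (j >= ndead || (i < nlive && live[i] <= dead[j]))
                printf(" %d", live[i++]);
            else
                printf(" %d", dead[j++]);
        }
        printf("\n");
        return 0;
    }
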
#ifdef BTREE_BUILD_STATS
@@ -325,6 +175,9 @@ btbuild(PG_FUNCTION_ARGS)
}
#endif /* BTREE_BUILD_STATS */
+ /* all done */
+ BuildingBtree = false;
+
/*
* Since we just counted the tuples in the heap, we update its stats
* in pg_class to guarantee that the planner takes advantage of the
@@ -343,20 +196,63 @@ btbuild(PG_FUNCTION_ARGS)
heap_close(heap, NoLock);
index_close(index);
- UpdateStats(hrelid, nhtups);
- UpdateStats(irelid, nitups);
- if (oldPred != NULL)
+ UpdateStats(hrelid, reltuples);
+ UpdateStats(irelid, buildstate.indtuples);
+ }
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Per-tuple callback from IndexBuildHeapScan
+ */
+static void
+btbuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *attdata,
+ char *nulls,
+ bool tupleIsAlive,
+ void *state)
+{
+ BTBuildState *buildstate = (BTBuildState *) state;
+ IndexTuple itup;
+ BTItem btitem;
+ InsertIndexResult res;
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(RelationGetDescr(index), attdata, nulls);
+ itup->t_tid = htup->t_self;
+
+ btitem = _bt_formitem(itup);
+
+ /*
+ * if we are doing bottom-up btree build, we insert the index tuple
+ * into a spool file for subsequent processing. otherwise, we insert
+ * it directly into the btree.
+ */
+ if (buildstate->usefast)
+ {
+ if (tupleIsAlive || buildstate->spool2 == NULL)
+ _bt_spool(btitem, buildstate->spool);
+ else
{
- if (nitups == nhtups)
- pred = NULL;
- UpdateIndexPredicate(irelid, oldPred, pred);
+ /* dead tuples are put into spool2 */
+ buildstate->haveDead = true;
+ _bt_spool(btitem, buildstate->spool2);
}
}
+ else
+ {
+ res = _bt_doinsert(index, btitem,
+ buildstate->isUnique, buildstate->heapRel);
+ if (res)
+ pfree(res);
+ }
- /* all done */
- BuildingBtree = false;
+ buildstate->indtuples += 1;
- PG_RETURN_VOID();
+ pfree(btitem);
+ pfree(itup);
}
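
The usefast branch above is what the FastBuild flag buys: spool everything and sort once instead of paying a full tree descent per tuple. The contrast in miniature, with flat arrays standing in for btree pages (illustrative only, not the actual nbtree algorithms):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NKEYS 8

    static int
    cmp_int(const void *a, const void *b)
    {
        return (*(const int *) a > *(const int *) b) -
               (*(const int *) a < *(const int *) b);
    }

    /* "fast build": spool everything, sort once, emit sequentially */
    static void
    fast_build(const int *keys, int n, int *out)
    {
        memcpy(out, keys, n * sizeof(int));
        qsort(out, n, sizeof(int), cmp_int);    /* one big sort */
    }

    /* "insertion build": find each key's slot and shift, one at a time */
    static void
    insertion_build(const int *keys, int n, int *out)
    {
        for (int i = 0; i < n; i++)
        {
            int     pos = 0;

            while (pos < i && out[pos] < keys[i])
                pos++;                           /* descend to the slot */
            memmove(out + pos + 1, out + pos, (i - pos) * sizeof(int));
            out[pos] = keys[i];                  /* insert, shifting the rest */
        }
    }

    int
    main(void)
    {
        int     keys[NKEYS] = {5, 3, 8, 1, 9, 2, 7, 4};
        int     a[NKEYS], b[NKEYS];

        fast_build(keys, NKEYS, a);
        insertion_build(keys, NKEYS, b);

        for (int i = 0; i < NKEYS; i++)
            printf("%d %d\n", a[i], b[i]);   /* both yield sorted output */
        return 0;
    }
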
/*
@@ -423,8 +319,10 @@ btgettuple(PG_FUNCTION_ARGS)
/*
* Save heap TID to use it in _bt_restscan. Then release the read
- * lock on the buffer so that we aren't blocking other backends. NOTE:
- * we do keep the pin on the buffer!
+ * lock on the buffer so that we aren't blocking other backends.
+ *
+ * NOTE: we do keep the pin on the buffer! This is essential to ensure
+ * that someone else doesn't delete the index entry we are stopped on.
*/
if (res)
{
@@ -451,9 +349,6 @@ btbeginscan(PG_FUNCTION_ARGS)
/* get the scan */
scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
- /* register scan in case we change pages it's using */
- _bt_regscan(scan);
-
PG_RETURN_POINTER(scan);
}
@@ -571,8 +466,6 @@ btendscan(PG_FUNCTION_ARGS)
pfree(so->keyData);
pfree(so);
- _bt_dropscan(scan);
-
PG_RETURN_VOID();
}
@@ -640,20 +533,127 @@ btrestrpos(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
-/* stubs */
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
Datum
-btdelete(PG_FUNCTION_ARGS)
+btbulkdelete(PG_FUNCTION_ARGS)
{
Relation rel = (Relation) PG_GETARG_POINTER(0);
- ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1);
+ IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
+ void *callback_state = (void *) PG_GETARG_POINTER(2);
+ IndexBulkDeleteResult *result;
+ BlockNumber num_pages;
+ double tuples_removed;
+ double num_index_tuples;
+ RetrieveIndexResult res;
+ IndexScanDesc scan;
+ BTScanOpaque so;
+ ItemPointer current;
+
+ tuples_removed = 0;
+ num_index_tuples = 0;
+
+ /*
+ * We use a standard IndexScanDesc scan object, but to speed up the loop,
+ * we skip most of the wrapper layers of index_getnext and instead call
+ * _bt_step directly. This implies holding buffer lock on a target page
+ * throughout the loop over the page's tuples. Initially, we have a read
+ * lock acquired by _bt_step when we stepped onto the page. If we find
+ * a tuple we need to delete, we trade in the read lock for an exclusive
+ * write lock; after that, we hold the write lock until we step off the
+ * page (fortunately, _bt_relbuf doesn't care which kind of lock it's
+ * releasing). This should minimize the amount of work needed per page.
+ */
+ scan = index_beginscan(rel, false, 0, (ScanKey) NULL);
+ so = (BTScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
- /* adjust any active scans that will be affected by this deletion */
- _bt_adjscans(rel, tid);
+ /* Use _bt_first to get started, then _bt_step through the remaining tuples */
+ res = _bt_first(scan, ForwardScanDirection);
- /* delete the data from the page */
- _bt_pagedel(rel, tid);
+ if (res != NULL)
+ {
+ Buffer buf;
+ BlockNumber lockedBlock = InvalidBlockNumber;
- PG_RETURN_VOID();
+ pfree(res);
+ /* we have the buffer pinned and locked */
+ buf = so->btso_curbuf;
+ Assert(BufferIsValid(buf));
+
+ do
+ {
+ Page page;
+ BlockNumber blkno;
+ OffsetNumber offnum;
+ BTItem btitem;
+ IndexTuple itup;
+ ItemPointer htup;
+
+ /* current is the next index tuple */
+ blkno = ItemPointerGetBlockNumber(current);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &btitem->bti_itup;
+ htup = &(itup->t_tid);
+
+ if (callback(htup, callback_state))
+ {
+ /*
+ * If this is first deletion on this page, trade in read
+ * lock for a really-exclusive write lock. Then, step back
+ * one and re-examine the item, because someone else might
+ * have inserted an item while we weren't holding the lock!
+ */
+ if (blkno != lockedBlock)
+ {
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBufferForCleanup(buf);
+ lockedBlock = blkno;
+ }
+ else
+ {
+ /* Delete the item from the page */
+ _bt_itemdel(rel, buf, current);
+
+ /* Mark buffer dirty, but keep the lock and pin */
+ WriteNoReleaseBuffer(buf);
+
+ tuples_removed += 1;
+ }
+
+ /*
+ * We need to back up the scan one item so that the next
+ * cycle will re-examine the same offnum on this page.
+ *
+ * For now, just hack the current-item index. Will need
+ * to be smarter when deletion includes removal of empty
+ * index pages.
+ */
+ current->ip_posid--;
+ }
+ else
+ num_index_tuples += 1;
+ } while (_bt_step(scan, &buf, ForwardScanDirection));
+ }
+
+ index_endscan(scan);
+
+ /* return statistics */
+ num_pages = RelationGetNumberOfBlocks(rel);
+
+ result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
+ result->num_pages = num_pages;
+ result->tuples_removed = tuples_removed;
+ result->num_index_tuples = num_index_tuples;
+
+ PG_RETURN_POINTER(result);
}
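
The IndexBulkDeleteCallback contract is the heart of this function: for each index tuple, ask the caller whether the heap TID is being deleted. A plausible caller-side sketch, probing a sorted dead-TID list by binary search; the names are invented and this is not the actual VACUUM code:

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-in for ItemPointer: just an integer heap TID */
    typedef bool (*DemoBulkDeleteCallback) (int tid, void *state);

    typedef struct
    {
        const int  *dead_tids;      /* sorted array of dead heap TIDs */
        int         ndead;
    } DemoVacState;

    static bool
    demo_tid_reaped(int tid, void *state)
    {
        DemoVacState *vs = (DemoVacState *) state;
        int     lo = 0, hi = vs->ndead - 1;

        while (lo <= hi)            /* binary search the sorted list */
        {
            int     mid = lo + (hi - lo) / 2;

            if (vs->dead_tids[mid] == tid)
                return true;
            if (vs->dead_tids[mid] < tid)
                lo = mid + 1;
            else
                hi = mid - 1;
        }
        return false;
    }

    int
    main(void)
    {
        int     dead[] = {3, 17, 25};
        DemoVacState vs = {dead, 3};
        DemoBulkDeleteCallback callback = demo_tid_reaped;
        int     removed = 0, kept = 0;

        /* the shape of btbulkdelete's loop: ask about every index tuple */
        for (int tid = 1; tid <= 30; tid++)
        {
            if (callback(tid, &vs))
                removed++;          /* btbulkdelete would _bt_itemdel here */
            else
                kept++;
        }
        printf("removed=%d kept=%d\n", removed, kept);
        return 0;
    }
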
/*
@@ -676,7 +676,7 @@ _bt_restscan(IndexScanDesc scan)
/*
* Get back the read lock we were holding on the buffer. (We still
- * have a reference-count pin on it, though.)
+ * have a reference-count pin on it, so need not get that.)
*/
LockBuffer(buf, BT_READ);
@@ -729,7 +729,7 @@ _bt_restscan(IndexScanDesc scan)
"\n\tRecreate index %s.", RelationGetRelationName(rel));
blkno = opaque->btpo_next;
- _bt_relbuf(rel, buf, BT_READ);
+ _bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
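
The move-right loop that this hunk touches relies on an nbtree invariant: while we hold the pin, our item can only migrate rightward via page splits, so rescanning the remembered page and then chasing btpo_next links must refind it. A toy version over a linked list of fake pages (structure invented for illustration):

    #include <stdio.h>

    #define PAGE_CAP 4

    typedef struct DemoPage
    {
        int     items[PAGE_CAP];    /* heap TIDs stored on this "page" */
        int     nitems;
        struct DemoPage *next;      /* stand-in for btpo_next */
    } DemoPage;

    /* search the remembered page, then move right like _bt_restscan */
    static DemoPage *
    find_item(DemoPage *start, int target, int *offnum)
    {
        for (DemoPage *p = start; p != NULL; p = p->next)
            for (int i = 0; i < p->nitems; i++)
                if (p->items[i] == target)
                {
                    *offnum = i;
                    return p;
                }
        return NULL;                /* caller would elog(FATAL): corrupt index */
    }

    int
    main(void)
    {
        /* simulate a split: item 9 moved from page1 to its new right sibling */
        DemoPage page2 = {{9, 11}, 2, NULL};
        DemoPage page1 = {{2, 5, 7}, 3, &page2};
        int     offnum;

        DemoPage *p = find_item(&page1, 9, &offnum);
        if (p)
            printf("refound item 9 at offset %d on sibling page\n", offnum);
        return 0;
    }
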