aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/nbtree/nbtpage.c45
-rw-r--r--src/backend/access/nbtree/nbtree.c155
-rw-r--r--src/backend/access/nbtree/nbtxlog.c27
-rw-r--r--src/include/access/nbtree.h15
4 files changed, 135 insertions, 107 deletions
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 16439972024..a9bd393440c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.61 2003/02/23 06:17:13 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.62 2003/02/23 22:43:08 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
}
/*
- * Delete an item from a btree page.
+ * Delete item(s) from a btree page.
*
* This must only be used for deleting leaf items. Deleting an item on a
* non-leaf page has to be done as part of an atomic action that includes
* deleting the page it points to.
*
* This routine assumes that the caller has pinned and locked the buffer,
- * and will write the buffer afterwards.
+ * and will write the buffer afterwards. Also, the given itemnos *must*
+ * appear in increasing order in the array.
*/
void
-_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
+_bt_delitems(Relation rel, Buffer buf,
+ OffsetNumber *itemnos, int nitems)
{
Page page = BufferGetPage(buf);
- OffsetNumber offno;
-
- offno = ItemPointerGetOffsetNumber(tid);
+ int i;
+ /* No elog(ERROR) until changes are logged */
START_CRIT_SECTION();
- PageIndexTupleDelete(page, offno);
+ /*
+ * Delete the items in reverse order so we don't have to think about
+ * adjusting item numbers for previous deletions.
+ */
+ for (i = nitems - 1; i >= 0; i--)
+ {
+ PageIndexTupleDelete(page, itemnos[i]);
+ }
/* XLOG stuff */
if (!rel->rd_istemp)
@@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
XLogRecPtr recptr;
XLogRecData rdata[2];
- xlrec.target.node = rel->rd_node;
- xlrec.target.tid = *tid;
+ xlrec.node = rel->rd_node;
+ xlrec.block = BufferGetBlockNumber(buf);
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeDelete;
rdata[0].next = &(rdata[1]);
+ /*
+ * The target-offsets array is not in the buffer, but pretend
+ * that it is. When XLogInsert stores the whole buffer, the offsets
+ * array need not be stored too.
+ */
rdata[1].buffer = buf;
- rdata[1].data = NULL;
- rdata[1].len = 0;
+ if (nitems > 0)
+ {
+ rdata[1].data = (char *) itemnos;
+ rdata[1].len = nitems * sizeof(OffsetNumber);
+ }
+ else
+ {
+ rdata[1].data = NULL;
+ rdata[1].len = 0;
+ }
rdata[1].next = NULL;
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b7a1e7ada1c..d73af9e6782 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS)
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
void *callback_state = (void *) PG_GETARG_POINTER(2);
IndexBulkDeleteResult *result;
- BlockNumber num_pages;
double tuples_removed;
double num_index_tuples;
- IndexScanDesc scan;
- BTScanOpaque so;
- ItemPointer current;
+ OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)];
+ int ndeletable;
+ Buffer buf;
+ BlockNumber num_pages;
tuples_removed = 0;
num_index_tuples = 0;
/*
- * We use a standard IndexScanDesc scan object, but to speed up the
- * loop, we skip most of the wrapper layers of index_getnext and
- * instead call _bt_step directly. This implies holding buffer lock
- * on a target page throughout the loop over the page's tuples.
- *
- * Whenever we step onto a new page, we have to trade in the read
- * lock acquired by _bt_first or _bt_step for an exclusive write lock
- * (fortunately, _bt_relbuf doesn't care which kind of lock it's
- * releasing when it comes time for _bt_step to release our lock).
+ * The outer loop iterates over index leaf pages, the inner over items
+ * on a leaf page. We issue just one _bt_delitems() call per page,
+ * so as to minimize WAL traffic.
*
- * Note that we exclusive-lock every leaf page, or at least every one
- * containing data items. It sounds attractive to only exclusive-lock
+ * Note that we exclusive-lock every leaf page containing data items,
+ * in sequence left to right. It sounds attractive to only exclusive-lock
* those containing items we need to delete, but unfortunately that
* is not safe: we could then pass a stopped indexscan, which could
* in rare cases lead to deleting the item it needs to find when it
* resumes. (See _bt_restscan --- this could only happen if an indexscan
* stops on a deletable item and then a page split moves that item
* into a page further to its right, which the indexscan will have no
- * pin on.)
+ * pin on.) We can skip obtaining exclusive lock on empty pages
+ * though, since no indexscan could be stopped on those.
*/
- scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
- so = (BTScanOpaque) scan->opaque;
- current = &(scan->currentItemData);
-
- /* Use _bt_first to get started, then _bt_step to remaining tuples */
- if (_bt_first(scan, ForwardScanDirection))
+ buf = _bt_get_endpoint(rel, 0, false);
+ if (BufferIsValid(buf)) /* check for empty index */
{
- Buffer buf;
- BlockNumber lockedBlock = InvalidBlockNumber;
-
- /* we have the buffer pinned and read-locked */
- buf = so->btso_curbuf;
- Assert(BufferIsValid(buf));
-
- do
+ for (;;)
{
Page page;
- BlockNumber blkno;
- OffsetNumber offnum;
- BTItem btitem;
BTPageOpaque opaque;
- IndexTuple itup;
- ItemPointer htup;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ BlockNumber nextpage;
CHECK_FOR_INTERRUPTS();
- /* current is the next index tuple */
+ ndeletable = 0;
page = BufferGetPage(buf);
- blkno = ItemPointerGetBlockNumber(current);
-
- /*
- * Make sure we have a super-exclusive write lock on this page.
- *
- * We assume that only concurrent insertions, not deletions,
- * can occur while we're not holding the page lock (the
- * caller should hold a suitable relation lock to ensure
- * this). Therefore, no items can escape being scanned because
- * of this temporary lock release.
- */
- if (blkno != lockedBlock)
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ /* We probably cannot see deleted pages, but skip 'em if so */
+ if (minoff <= maxoff && !P_ISDELETED(opaque))
{
+ /*
+ * Trade in the initial read lock for a super-exclusive
+ * write lock on this page.
+ */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf);
- lockedBlock = blkno;
/*
- * If the page was formerly rightmost but was split while we
- * didn't hold the lock, and ip_posid is pointing to item
- * 1, then ip_posid now points at the high key not a valid
- * data item. In this case we need to step forward.
+ * Recompute minoff/maxoff, both of which could have changed
+ * while we weren't holding the lock.
*/
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (current->ip_posid < P_FIRSTDATAKEY(opaque))
- current->ip_posid = P_FIRSTDATAKEY(opaque);
- }
-
- offnum = ItemPointerGetOffsetNumber(current);
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &btitem->bti_itup;
- htup = &(itup->t_tid);
-
- if (callback(htup, callback_state))
- {
- /* Okay to delete the item from the page */
- _bt_itemdel(rel, buf, current);
-
- /* Mark buffer dirty, but keep the lock and pin */
- WriteNoReleaseBuffer(buf);
-
- tuples_removed += 1;
-
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
/*
- * We now need to back up the scan one item, so that the next
- * cycle will re-examine the same offnum on this page (which
- * now holds the next item).
+ * Scan over all items to see which ones need deleted
+ * according to the callback function.
*/
- current->ip_posid--;
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ BTItem btitem;
+ ItemPointer htup;
+
+ btitem = (BTItem) PageGetItem(page,
+ PageGetItemId(page, offnum));
+ htup = &(btitem->bti_itup.t_tid);
+ if (callback(htup, callback_state))
+ {
+ deletable[ndeletable++] = offnum;
+ tuples_removed += 1;
+ }
+ else
+ num_index_tuples += 1;
+ }
+ }
+ /*
+ * If we need to delete anything, do it and write the buffer;
+ * else just release the buffer.
+ */
+ nextpage = opaque->btpo_next;
+ if (ndeletable > 0)
+ {
+ _bt_delitems(rel, buf, deletable, ndeletable);
+ _bt_wrtbuf(rel, buf);
}
else
- num_index_tuples += 1;
- } while (_bt_step(scan, &buf, ForwardScanDirection));
+ {
+ _bt_relbuf(rel, buf);
+ }
+ /* And advance to next page, if any */
+ if (nextpage == P_NONE)
+ break;
+ buf = _bt_getbuf(rel, nextpage, BT_READ);
+ }
}
- index_endscan(scan);
-
/* return statistics */
num_pages = RelationGetNumberOfBlocks(rel);
@@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
}
}
else if ((opaque->btpo_flags & BTP_HALF_DEAD) ||
- P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
+ P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page))
{
/* Empty, try to delete */
int ndel;
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 058b13b6a43..a1a52571fe1 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.2 2003/02/23 06:17:13 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.3 2003/02/23 22:43:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
return;
xlrec = (xl_btree_delete *) XLogRecGetData(record);
- reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
+ reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
if (!RelationIsValid(reln))
return;
- buffer = XLogReadBuffer(false, reln,
- ItemPointerGetBlockNumber(&(xlrec->target.tid)));
+ buffer = XLogReadBuffer(false, reln, xlrec->block);
if (!BufferIsValid(buffer))
elog(PANIC, "btree_delete_redo: block unfound");
page = (Page) BufferGetPage(buffer);
@@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
return;
}
- PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
+ if (record->xl_len > SizeOfBtreeDelete)
+ {
+ OffsetNumber *unused;
+ OffsetNumber *unend;
+
+ unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
+ unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+
+ /* be careful to delete from back to front */
+ while (unused < unend)
+ {
+ unend--;
+ PageIndexTupleDelete(page, *unend);
+ }
+ }
PageSetLSN(page, lsn);
PageSetSUI(page, ThisStartUpID);
@@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec)
{
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
- strcat(buf, "delete: ");
- out_target(buf, &(xlrec->target));
+ sprintf(buf + strlen(buf), "delete: node %u/%u; blk %u",
+ xlrec->node.tblNode, xlrec->node.relNode, xlrec->block);
break;
}
case XLOG_BTREE_DELETE_PAGE:
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 35b6c94e7e2..d6ea70cd7d1 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: nbtree.h,v 1.66 2003/02/23 06:17:13 tgl Exp $
+ * $Id: nbtree.h,v 1.67 2003/02/23 22:43:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -263,14 +263,18 @@ typedef struct xl_btree_split
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
/*
- * This is what we need to know about delete of an individual leaf btitem
+ * This is what we need to know about delete of individual leaf btitems.
+ * The WAL record can represent deletion of any number of btitems on a
+ * single index page.
*/
typedef struct xl_btree_delete
{
- xl_btreetid target; /* deleted tuple id */
+ RelFileNode node;
+ BlockNumber block;
+ /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;
-#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
+#define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
/*
* This is what we need to know about deletion of a btree page. The target
@@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
-extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
+extern void _bt_delitems(Relation rel, Buffer buf,
+ OffsetNumber *itemnos, int nitems);
extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
/*