aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/nbtree/README53
-rw-r--r--src/backend/access/nbtree/nbtinsert.c286
-rw-r--r--src/backend/access/nbtree/nbtpage.c91
-rw-r--r--src/backend/access/nbtree/nbtsearch.c68
-rw-r--r--src/backend/access/nbtree/nbtxlog.c323
-rw-r--r--src/include/access/nbtree.h26
-rw-r--r--src/include/access/rmgrlist.h2
-rw-r--r--src/include/access/xlog_internal.h2
8 files changed, 561 insertions, 290 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index cd110df332c..4820f76e3bb 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -404,12 +404,41 @@ an additional insertion above that, etc).
For a root split, the followon WAL entry is a "new root" entry rather than
an "insertion" entry, but details are otherwise much the same.
-Because insertion involves multiple atomic actions, the WAL replay logic
-has to detect the case where a page split isn't followed by a matching
-insertion on the parent level, and then do that insertion on its own (and
-recursively for any subsequent parent insertion, of course). This is
-feasible because the WAL entry for the split contains enough info to know
-what must be inserted in the parent level.
+Because splitting involves multiple atomic actions, it's possible that the
+system crashes between splitting a page and inserting the downlink for the
+new half to the parent. After recovery, the downlink for the new page will
+be missing. The search algorithm works correctly, as the page will be found
+by following the right-link from its left sibling, although if a lot of
+downlinks in the tree are missing, performance will suffer. A more serious
+consequence is that if the page without a downlink gets split again, the
+insertion algorithm will fail to find the location in the parent level to
+insert the downlink.
+
+Our approach is to create any missing downlinks on-the-fly, when searching
+the tree for a new insertion. It could be done during searches, too, but
+it seems best not to put any extra updates in what would otherwise be a
+read-only operation (updating is not possible in hot standby mode anyway).
+It would seem natural to add the missing downlinks in VACUUM, but since
+inserting a downlink might require splitting a page, it might fail if you
+run out of disk space. That would be bad during VACUUM - the reason for
+running VACUUM in the first place might be that you run out of disk space,
+and now VACUUM won't finish because you're out of disk space. In contrast,
+an insertion can require enlarging the physical file anyway.
+
+To identify missing downlinks, when a page is split, the left page is
+flagged to indicate that the split is not yet complete (INCOMPLETE_SPLIT).
+When the downlink is inserted to the parent, the flag is cleared atomically
+with the insertion. The child page is kept locked until the insertion in
+the parent is finished and the flag in the child cleared, but can be
+released immediately after that, before recursing up the tree if the parent
+also needs to be split. This ensures that incompletely split pages should
+not be seen under normal circumstances; only if insertion to the parent
+has failed for some reason.
+
+We flag the left page, even though it's the right page that's missing the
+downlink, beacuse it's more convenient to know already when following the
+right-link from the left page to the right page that it will need to have
+its downlink inserted to the parent.
When splitting a non-root page that is alone on its level, the required
metapage update (of the "fast root" link) is performed and logged as part
@@ -419,8 +448,16 @@ metapage update is handled as part of the "new root" action.
Each step in page deletion are logged as separate WAL entries: marking the
leaf as half-dead and removing the downlink is one record, and unlinking a
page is a second record. If vacuum is interrupted for some reason, or the
-system crashes, the tree is consistent for searches and insertions. The next
-VACUUM will find the half-dead leaf page and continue the deletion.
+system crashes, the tree is consistent for searches and insertions. The
+next VACUUM will find the half-dead leaf page and continue the deletion.
+
+Before 9.4, we used to keep track of incomplete splits and page deletions
+during recovery and finish them immediately at end of recovery, instead of
+doing it lazily at the next insertion or vacuum. However, that made the
+recovery much more complicated, and only fixed the problem when crash
+recovery was performed. An incomplete split can also occur if an otherwise
+recoverable error, like out-of-memory or out-of-disk-space, happens while
+inserting the downlink to the parent.
Scans during Recovery
---------------------
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 5f7953f5b38..3cd70ad4cfe 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -57,15 +57,18 @@ static void _bt_findinsertloc(Relation rel,
int keysz,
ScanKey scankey,
IndexTuple newtup,
+ BTStack stack,
Relation heapRel);
-static void _bt_insertonpg(Relation rel, Buffer buf,
+static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
bool split_only_page);
-static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
- OffsetNumber newitemoff, Size newitemsz,
+static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf,
+ OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
IndexTuple newitem, bool newitemonleft);
+static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
+ BTStack stack, bool is_root, bool is_only);
static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
OffsetNumber newitemoff,
Size newitemsz,
@@ -129,7 +132,8 @@ top:
* move right in the tree. See Lehman and Yao for an excruciatingly
* precise description.
*/
- buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
+ buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
+ true, stack, BT_WRITE);
/*
* If we're not allowing duplicates, make sure the key isn't already in
@@ -182,8 +186,9 @@ top:
*/
CheckForSerializableConflictIn(rel, NULL, buf);
/* do the insertion */
- _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
- _bt_insertonpg(rel, buf, stack, itup, offset, false);
+ _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup,
+ stack, heapRel);
+ _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false);
}
else
{
@@ -507,6 +512,7 @@ _bt_findinsertloc(Relation rel,
int keysz,
ScanKey scankey,
IndexTuple newtup,
+ BTStack stack,
Relation heapRel)
{
Buffer buf = *bufptr;
@@ -568,6 +574,7 @@ _bt_findinsertloc(Relation rel,
while (PageGetFreeSpace(page) < itemsz)
{
Buffer rbuf;
+ BlockNumber rblkno;
/*
* before considering moving right, see if we can obtain enough space
@@ -605,18 +612,33 @@ _bt_findinsertloc(Relation rel,
*/
rbuf = InvalidBuffer;
+ rblkno = lpageop->btpo_next;
for (;;)
{
- BlockNumber rblkno = lpageop->btpo_next;
-
rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
page = BufferGetPage(rbuf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If this page was incompletely split, finish the split now.
+ * We do this while holding a lock on the left sibling, which
+ * is not good because finishing the split could be a fairly
+ * lengthy operation. But this should happen very seldom.
+ */
+ if (P_INCOMPLETE_SPLIT(lpageop))
+ {
+ _bt_finish_split(rel, rbuf, stack);
+ rbuf = InvalidBuffer;
+ continue;
+ }
+
if (!P_IGNORE(lpageop))
break;
if (P_RIGHTMOST(lpageop))
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
+
+ rblkno = lpageop->btpo_next;
}
_bt_relbuf(rel, buf);
buf = rbuf;
@@ -658,10 +680,14 @@ _bt_findinsertloc(Relation rel,
* child page on the parent.
* + updates the metapage if a true root or fast root is split.
*
- * On entry, we must have the right buffer in which to do the
+ * On entry, we must have the correct buffer in which to do the
* insertion, and the buffer must be pinned and write-locked. On return,
* we will have dropped both the pin and the lock on the buffer.
*
+ * When inserting to a non-leaf page, 'cbuf' is the left-sibling of the
+ * page we're inserting the downlink for. This function will clear the
+ * INCOMPLETE_SPLIT flag on it, and release the buffer.
+ *
* The locking interactions in this code are critical. You should
* grok Lehman and Yao's paper before making any changes. In addition,
* you need to understand how we disambiguate duplicate keys in this
@@ -675,6 +701,7 @@ _bt_findinsertloc(Relation rel,
static void
_bt_insertonpg(Relation rel,
Buffer buf,
+ Buffer cbuf,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
@@ -688,6 +715,14 @@ _bt_insertonpg(Relation rel,
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* child buffer must be given iff inserting on an internal page */
+ Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
+
+ /* The caller should've finished any incomplete splits already. */
+ if (P_INCOMPLETE_SPLIT(lpageop))
+ elog(ERROR, "cannot insert to incompletely split page %u",
+ BufferGetBlockNumber(buf));
+
itemsz = IndexTupleDSize(*itup);
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
@@ -712,7 +747,7 @@ _bt_insertonpg(Relation rel,
&newitemonleft);
/* split the buffer into left and right halves */
- rbuf = _bt_split(rel, buf, firstright,
+ rbuf = _bt_split(rel, buf, cbuf, firstright,
newitemoff, itemsz, itup, newitemonleft);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
@@ -786,11 +821,22 @@ _bt_insertonpg(Relation rel,
MarkBufferDirty(metabuf);
}
+ /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
+ if (BufferIsValid(cbuf))
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+
+ Assert(P_INCOMPLETE_SPLIT(cpageop));
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
+
/* XLOG stuff */
if (RelationNeedsWAL(rel))
{
xl_btree_insert xlrec;
- BlockNumber xldownlink;
+ BlockNumber xlleftchild;
xl_btree_metadata xlmeta;
uint8 xlinfo;
XLogRecPtr recptr;
@@ -810,12 +856,15 @@ _bt_insertonpg(Relation rel,
xlinfo = XLOG_BTREE_INSERT_LEAF;
else
{
- xldownlink = ItemPointerGetBlockNumber(&(itup->t_tid));
- Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
-
- nextrdata->data = (char *) &xldownlink;
+ /*
+ * Include the block number of the left child, whose
+ * INCOMPLETE_SPLIT flag was cleared.
+ */
+ xlleftchild = BufferGetBlockNumber(cbuf);
+ nextrdata->data = (char *) &xlleftchild;
nextrdata->len = sizeof(BlockNumber);
- nextrdata->buffer = InvalidBuffer;
+ nextrdata->buffer = cbuf;
+ nextrdata->buffer_std = true;
nextrdata->next = nextrdata + 1;
nextrdata++;
@@ -870,7 +919,8 @@ _bt_insertonpg(Relation rel,
/* release buffers */
if (BufferIsValid(metabuf))
_bt_relbuf(rel, metabuf);
-
+ if (BufferIsValid(cbuf))
+ _bt_relbuf(rel, cbuf);
_bt_relbuf(rel, buf);
}
}
@@ -883,11 +933,15 @@ _bt_insertonpg(Relation rel,
* new right page. newitemoff etc. tell us about the new item that
* must be inserted along with the data from the old page.
*
+ * When splitting a non-leaf page, 'cbuf' is the left-sibling of the
+ * page we're inserting the downlink for. This function will clear the
+ * INCOMPLETE_SPLIT flag on it, and release the buffer.
+ *
* Returns the new right sibling of buf, pinned and write-locked.
* The pin and lock on buf are maintained.
*/
static Buffer
-_bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
+_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
bool newitemonleft)
{
@@ -911,6 +965,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
OffsetNumber maxoff;
OffsetNumber i;
bool isroot;
+ bool isleaf;
/* Acquire a new page to split into */
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
@@ -949,12 +1004,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
isroot = P_ISROOT(oopaque);
+ isleaf = P_ISLEAF(oopaque);
/* if we're splitting this page, it won't be the root when we're done */
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
lopaque->btpo_flags = oopaque->btpo_flags;
lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_flags = lopaque->btpo_flags;
+ /* set flag in left page indicating that the right page has no downlink */
+ lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
lopaque->btpo_prev = oopaque->btpo_prev;
lopaque->btpo_next = rightpagenumber;
ropaque->btpo_prev = origpagenumber;
@@ -1178,6 +1236,19 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
MarkBufferDirty(sbuf);
}
+ /*
+ * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
+ * a split.
+ */
+ if (!isleaf)
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
+
/* XLOG stuff */
if (RelationNeedsWAL(rel))
{
@@ -1186,6 +1257,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
XLogRecPtr recptr;
XLogRecData rdata[7];
XLogRecData *lastrdata;
+ BlockNumber cblkno;
xlrec.node = rel->rd_node;
xlrec.leftsib = origpagenumber;
@@ -1200,34 +1272,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata = &rdata[0];
- if (ropaque->btpo.level > 0)
- {
- /* Log downlink on non-leaf pages */
- lastrdata->next = lastrdata + 1;
- lastrdata++;
-
- lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
- lastrdata->len = sizeof(BlockIdData);
- lastrdata->buffer = InvalidBuffer;
-
- /*
- * We must also log the left page's high key, because the right
- * page's leftmost key is suppressed on non-leaf levels. Show it
- * as belonging to the left page buffer, so that it is not stored
- * if XLogInsert decides it needs a full-page image of the left
- * page.
- */
- lastrdata->next = lastrdata + 1;
- lastrdata++;
-
- itemid = PageGetItemId(origpage, P_HIKEY);
- item = (IndexTuple) PageGetItem(origpage, itemid);
- lastrdata->data = (char *) item;
- lastrdata->len = MAXALIGN(IndexTupleSize(item));
- lastrdata->buffer = buf; /* backup block 1 */
- lastrdata->buffer_std = true;
- }
-
/*
* Log the new item and its offset, if it was inserted on the left
* page. (If it was put on the right page, we don't need to explicitly
@@ -1254,17 +1298,40 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
- else if (ropaque->btpo.level == 0)
+
+ /* Log left page */
+ if (!isleaf)
{
+ lastrdata->next = lastrdata + 1;
+ lastrdata++;
+
/*
- * Although we don't need to WAL-log the new item, we still need
- * XLogInsert to consider storing a full-page image of the left
- * page, so make an empty entry referencing that buffer. This also
- * ensures that the left page is always backup block 1.
+ * We must also log the left page's high key, because the right
+ * page's leftmost key is suppressed on non-leaf levels. Show it
+ * as belonging to the left page buffer, so that it is not stored
+ * if XLogInsert decides it needs a full-page image of the left
+ * page.
*/
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ item = (IndexTuple) PageGetItem(origpage, itemid);
+ lastrdata->data = (char *) item;
+ lastrdata->len = MAXALIGN(IndexTupleSize(item));
+ lastrdata->buffer = buf; /* backup block 1 */
+ lastrdata->buffer_std = true;
+ }
+
+ if (isleaf && !newitemonleft)
+ {
lastrdata->next = lastrdata + 1;
lastrdata++;
+ /*
+ * Although we don't need to WAL-log anything on the left page,
+ * we still need XLogInsert to consider storing a full-page image
+ * of the left page, so make an empty entry referencing that
+ * buffer. This also ensures that the left page is always backup
+ * block 1.
+ */
lastrdata->data = NULL;
lastrdata->len = 0;
lastrdata->buffer = buf; /* backup block 1 */
@@ -1272,6 +1339,22 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
}
/*
+ * Log block number of left child, whose INCOMPLETE_SPLIT flag this
+ * insertion clears.
+ */
+ if (!isleaf)
+ {
+ lastrdata->next = lastrdata + 1;
+ lastrdata++;
+
+ cblkno = BufferGetBlockNumber(cbuf);
+ lastrdata->data = (char *) &cblkno;
+ lastrdata->len = sizeof(BlockNumber);
+ lastrdata->buffer = cbuf; /* backup block 2 */
+ lastrdata->buffer_std = true;
+ }
+
+ /*
* Log the contents of the right page in the format understood by
* _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
* because we're going to recreate the whole page anyway, so it should
@@ -1300,7 +1383,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata->data = NULL;
lastrdata->len = 0;
- lastrdata->buffer = sbuf; /* backup block 2 */
+ lastrdata->buffer = sbuf; /* bkp block 2 (leaf) or 3 (non-leaf) */
lastrdata->buffer_std = true;
}
@@ -1327,6 +1410,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
if (!P_RIGHTMOST(ropaque))
_bt_relbuf(rel, sbuf);
+ /* release the child */
+ if (!isleaf)
+ _bt_relbuf(rel, cbuf);
+
/* split's done */
return rbuf;
}
@@ -1597,10 +1684,8 @@ _bt_checksplitloc(FindSplitData *state,
* have to be efficient (concurrent ROOT split, WAL recovery)
* is_root - we split the true root
* is_only - we split a page alone on its level (might have been fast root)
- *
- * This is exported so it can be called by nbtxlog.c.
*/
-void
+static void
_bt_insert_parent(Relation rel,
Buffer buf,
Buffer rbuf,
@@ -1619,8 +1704,7 @@ _bt_insert_parent(Relation rel,
*
* If we have to search for the parent level, we do so by re-descending
* from the root. This is not super-efficient, but it's rare enough not
- * to matter. (This path is also taken when called from WAL recovery ---
- * we have no stack in that case.)
+ * to matter.
*/
if (is_root)
{
@@ -1679,12 +1763,13 @@ _bt_insert_parent(Relation rel,
* 05/27/97
*/
ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);
-
pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
- /* Now we can unlock the children */
+ /*
+ * Now we can unlock the right child. The left child will be unlocked
+ * by _bt_insertonpg().
+ */
_bt_relbuf(rel, rbuf);
- _bt_relbuf(rel, buf);
/* Check for error only after writing children */
if (pbuf == InvalidBuffer)
@@ -1692,7 +1777,7 @@ _bt_insert_parent(Relation rel,
RelationGetRelationName(rel), bknum, rbknum);
/* Recursively update the parent */
- _bt_insertonpg(rel, pbuf, stack->bts_parent,
+ _bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
new_item, stack->bts_offset + 1,
is_only);
@@ -1702,6 +1787,62 @@ _bt_insert_parent(Relation rel,
}
/*
+ * _bt_finish_split() -- Finish an incomplete split
+ *
+ * A crash or other failure can leave a split incomplete. The insertion
+ * routines won't allow to insert on a page that is incompletely split.
+ * Before inserting on such a page, call _bt_finish_split().
+ *
+ * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
+ * and unpinned.
+ */
+void
+_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
+{
+ Page lpage = BufferGetPage(lbuf);
+ BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque rpageop;
+ bool was_root;
+ bool was_only;
+
+ Assert(P_INCOMPLETE_SPLIT(lpageop));
+
+ /* Lock right sibling, the one missing the downlink */
+ rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* Could this be a root split? */
+ if (!stack)
+ {
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *metad;
+
+ /* acquire lock on the metapage */
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
+
+ was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
+
+ _bt_relbuf(rel, metabuf);
+ }
+ else
+ was_root = false;
+
+ /* Was this the only page on the level before split? */
+ was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
+
+ elog(DEBUG1, "finishing incomplete split of %u/%u",
+ BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
+
+ _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
+}
+
+/*
* _bt_getstackbuf() -- Walk back up the tree one step, and find the item
* we last looked at in the parent.
*
@@ -1733,6 +1874,12 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque))
+ {
+ _bt_finish_split(rel, buf, stack->bts_parent);
+ continue;
+ }
+
if (!P_IGNORE(opaque))
{
OffsetNumber offnum,
@@ -1837,6 +1984,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
rbkno;
BlockNumber rootblknum;
BTPageOpaque rootopaque;
+ BTPageOpaque lopaque;
ItemId itemid;
IndexTuple item;
Size itemsz;
@@ -1848,6 +1996,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
lbkno = BufferGetBlockNumber(lbuf);
rbkno = BufferGetBlockNumber(rbuf);
lpage = BufferGetPage(lbuf);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
/* get a new root page */
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
@@ -1921,6 +2070,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
pfree(new_item);
+ /* Clear the incomplete-split flag in the left child */
+ Assert(P_INCOMPLETE_SPLIT(lopaque));
+ lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(lbuf);
+
MarkBufferDirty(rootbuf);
MarkBufferDirty(metabuf);
@@ -1929,7 +2083,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
{
xl_btree_newroot xlrec;
XLogRecPtr recptr;
- XLogRecData rdata[2];
+ XLogRecData rdata[3];
xlrec.node = rel->rd_node;
xlrec.rootblk = rootblknum;
@@ -1948,7 +2102,13 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
rdata[1].len = ((PageHeader) rootpage)->pd_special -
((PageHeader) rootpage)->pd_upper;
rdata[1].buffer = InvalidBuffer;
- rdata[1].next = NULL;
+ rdata[1].next = &(rdata[2]);
+
+ /* Make a full-page image of the left child if needed */
+ rdata[2].data = NULL;
+ rdata[2].len = 0;
+ rdata[2].buffer = lbuf;
+ rdata[2].next = NULL;
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata);
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 14e422a1d36..87ac5f4aafb 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -992,6 +992,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
Buffer pbuf;
Page page;
BTPageOpaque opaque;
+ BlockNumber leftsib;
/* Locate the parent's downlink (updating the stack entry if needed) */
ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY);
@@ -1020,7 +1021,8 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
* We have to check the parent itself, and then recurse to test
* the conditions at the parent's parent.
*/
- if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
+ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
+ P_INCOMPLETE_SPLIT(opaque))
{
_bt_relbuf(rel, pbuf);
return false;
@@ -1028,8 +1030,41 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
*target = parent;
*rightsib = opaque->btpo_next;
+ leftsib = opaque->btpo_prev;
_bt_relbuf(rel, pbuf);
+
+ /*
+ * Like in _bt_pagedel, check that the left sibling is not marked
+ * with INCOMPLETE_SPLIT flag. That would mean that there is no
+ * downlink to the page to be deleted, and the page deletion
+ * algorithm isn't prepared to handle that.
+ */
+ if (leftsib != P_NONE)
+ {
+ Buffer lbuf;
+ Page lpage;
+ BTPageOpaque lopaque;
+
+ lbuf = _bt_getbuf(rel, leftsib, BT_READ);
+ lpage = BufferGetPage(lbuf);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ /*
+ * If the left sibling was concurrently split, so that its
+ * next-pointer doesn't point to the current page anymore,
+ * the split that created the current page must be completed.
+ * (We don't allow splitting an incompletely split page again
+ * until the previous split has been completed)
+ */
+ if (lopaque->btpo_next == parent &&
+ P_INCOMPLETE_SPLIT(lopaque))
+ {
+ _bt_relbuf(rel, lbuf);
+ return false;
+ }
+ _bt_relbuf(rel, lbuf);
+ }
+
return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
topparent, topoff, target, rightsib);
}
@@ -1081,6 +1116,10 @@ _bt_pagedel(Relation rel, Buffer buf)
* "stack" is a search stack leading (approximately) to the target page.
* It is initially NULL, but when iterating, we keep it to avoid
* duplicated search effort.
+ *
+ * Also, when "stack" is not NULL, we have already checked that the
+ * current page is not the right half of an incomplete split, i.e. the
+ * left sibling does not have its INCOMPLETE_SPLIT flag set.
*/
BTStack stack = NULL;
@@ -1117,11 +1156,25 @@ _bt_pagedel(Relation rel, Buffer buf)
}
/*
- * We can never delete rightmost pages nor root pages. While at it,
- * check that page is not already deleted and is empty.
+ * We can never delete rightmost pages nor root pages. While at
+ * it, check that page is not already deleted and is empty.
+ *
+ * To keep the algorithm simple, we also never delete an incompletely
+ * split page (they should be rare enough that this doesn't make any
+ * meaningful difference to disk usage):
+ *
+ * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
+ * left half of an incomplete split, but ensuring that it's not the
+ * right half is more complicated. For that, we have to check that
+ * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On
+ * the first iteration, we temporarily release the lock on the
+ * current page, and check the left sibling and also construct a
+ * search stack to. On subsequent iterations, we know we stepped right
+ * from a page that passed these tests, so it's OK.
*/
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
- P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
+ P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
+ P_INCOMPLETE_SPLIT(opaque))
{
/* Should never fail to delete a half-dead page */
Assert(!P_ISHALFDEAD(opaque));
@@ -1142,6 +1195,9 @@ _bt_pagedel(Relation rel, Buffer buf)
* use the standard search mechanism to search for the page's high
* key; this will give us a link to either the current parent or
* someplace to its left (if there are multiple equal high keys).
+ *
+ * Also check if this is the right-half of an incomplete split
+ * (see comment above).
*/
if (!stack)
{
@@ -1149,16 +1205,43 @@ _bt_pagedel(Relation rel, Buffer buf)
ItemId itemid;
IndexTuple targetkey;
Buffer lbuf;
+ BlockNumber leftsib;
itemid = PageGetItemId(page, P_HIKEY);
targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
+ leftsib = opaque->btpo_prev;
+
/*
* To avoid deadlocks, we'd better drop the leaf page lock
* before going further.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ /*
+ * Fetch the left sibling, to check that it's not marked
+ * with INCOMPLETE_SPLIT flag. That would mean that the
+ * page to-be-deleted doesn't have a downlink, and the page
+ * deletion algorithm isn't prepared to handle that.
+ */
+ if (!P_LEFTMOST(opaque))
+ {
+ BTPageOpaque lopaque;
+ Page lpage;
+
+ lbuf = _bt_getbuf(rel, leftsib, BT_READ);
+ lpage = BufferGetPage(lbuf);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
+ P_INCOMPLETE_SPLIT(lopaque))
+ {
+ ReleaseBuffer(buf);
+ _bt_relbuf(rel, lbuf);
+ return ndeleted;
+ }
+ _bt_relbuf(rel, lbuf);
+ }
+
/* we need an insertion scan key for the search, so build one */
itup_scankey = _bt_mkscankey(rel, targetkey);
/* find the leftmost leaf page containing this key */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index bc62fbc02e6..0bf12f0e107 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -51,7 +51,8 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
* NOTE that the returned buffer is read-locked regardless of the access
* parameter. However, access = BT_WRITE will allow an empty root page
* to be created and returned. When access = BT_READ, an empty index
- * will result in *bufP being set to InvalidBuffer.
+ * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode,
+ * any incomplete splits encountered during the search will be finished.
*/
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
@@ -82,8 +83,17 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* Race -- the page we just grabbed may have split since we read its
* pointer in the parent (or metapage). If it has, we may need to
* move right to its new sibling. Do that.
+ *
+ * In write-mode, allow _bt_moveright to finish any incomplete splits
+ * along the way. Strictly speaking, we'd only need to finish an
+ * incomplete split on the leaf page we're about to insert to, not on
+ * any of the upper levels (they is taken care of in _bt_getstackbuf,
+ * if the leaf page is split and we insert to the parent page). But
+ * this is a good opportunity to finish splits of internal pages too.
*/
- *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
+ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+ (access == BT_WRITE), stack_in,
+ BT_READ);
/* if this is a leaf page, we're done */
page = BufferGetPage(*bufP);
@@ -148,6 +158,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* item >= scankey. When nextkey is true, we are looking for the first
* item strictly greater than scankey.
*
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter. This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed. 'stack' is only used if forupdate is true.
+ *
* On entry, we have the buffer pinned and a lock of the type specified by
* 'access'. If we move right, we release the buffer and lock and acquire
* the same on the right sibling. Return value is the buffer we stop at.
@@ -158,15 +173,14 @@ _bt_moveright(Relation rel,
int keysz,
ScanKey scankey,
bool nextkey,
+ bool forupdate,
+ BTStack stack,
int access)
{
Page page;
BTPageOpaque opaque;
int32 cmpval;
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
/*
* When nextkey = false (normal case): if the scan key that brought us to
* this page is > the high key stored on the page, then the page has split
@@ -184,16 +198,46 @@ _bt_moveright(Relation rel,
*/
cmpval = nextkey ? 0 : 1;
- while (!P_RIGHTMOST(opaque) &&
- (P_IGNORE(opaque) ||
- _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
+ for (;;)
{
- /* step right one page */
- BlockNumber rblkno = opaque->btpo_next;
-
- buf = _bt_relandgetbuf(rel, buf, rblkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_RIGHTMOST(opaque))
+ break;
+
+ /*
+ * Finish any incomplete splits we encounter along the way.
+ */
+ if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+ {
+ BlockNumber blkno = BufferGetBlockNumber(buf);
+
+ /* upgrade our lock if necessary */
+ if (access == BT_READ)
+ {
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buf, BT_WRITE);
+ }
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ _bt_finish_split(rel, buf, stack);
+ else
+ _bt_relbuf(rel, buf);
+
+ /* re-acquire the lock in the right mode, and re-check */
+ buf = _bt_getbuf(rel, blkno, access);
+ continue;
+ }
+
+ if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
+ {
+ /* step right one page */
+ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+ continue;
+ }
+ else
+ break;
}
if (P_IGNORE(opaque))
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index bc376fd4094..665e60b7bd2 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -21,61 +21,6 @@
#include "miscadmin.h"
/*
- * We must keep track of expected insertions due to page splits, and apply
- * them manually if they are not seen in the WAL log during replay. This
- * makes it safe for page insertion to be a multiple-WAL-action process.
- *
- * The data structure is a simple linked list --- this should be good enough,
- * since we don't expect a page split to remain incomplete for long. In any
- * case we need to respect the order of operations.
- */
-typedef struct bt_incomplete_split
-{
- RelFileNode node; /* the index */
- bool is_root; /* we split the root */
- BlockNumber leftblk; /* left half of split */
- BlockNumber rightblk; /* right half of split */
-} bt_incomplete_split;
-
-static List *incomplete_splits;
-
-
-static void
-log_incomplete_split(RelFileNode node, BlockNumber leftblk,
- BlockNumber rightblk, bool is_root)
-{
- bt_incomplete_split *action = palloc(sizeof(bt_incomplete_split));
-
- action->node = node;
- action->is_root = is_root;
- action->leftblk = leftblk;
- action->rightblk = rightblk;
- incomplete_splits = lappend(incomplete_splits, action);
-}
-
-static void
-forget_matching_split(RelFileNode node, BlockNumber downlink, bool is_root)
-{
- ListCell *l;
-
- foreach(l, incomplete_splits)
- {
- bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l);
-
- if (RelFileNodeEquals(node, action->node) &&
- downlink == action->rightblk)
- {
- if (is_root != action->is_root)
- elog(LOG, "forget_matching_split: fishy is_root data (expected %d, got %d)",
- action->is_root, is_root);
- incomplete_splits = list_delete_ptr(incomplete_splits, action);
- pfree(action);
- break; /* need not look further */
- }
- }
-}
-
-/*
* _bt_restore_page -- re-enter all the index tuples on a page
*
* The page is freshly init'd, and *from (length len) is a copy of what
@@ -149,23 +94,60 @@ _bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn,
UnlockReleaseBuffer(metabuf);
}
+/*
+ * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
+ *
+ * This is a common subroutine of the redo functions of all the WAL record
+ * types that can insert a downlink: insert, split, and newroot.
+ */
+static void
+_bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record,
+ RelFileNode rnode, BlockNumber cblock)
+{
+ Buffer buf;
+
+ buf = XLogReadBuffer(rnode, cblock, false);
+ if (BufferIsValid(buf))
+ {
+ Page page = (Page) BufferGetPage(buf);
+
+ if (lsn > PageGetLSN(page))
+ {
+ BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert((pageop->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0);
+ pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buf);
+ }
+ UnlockReleaseBuffer(buf);
+ }
+}
+
static void
btree_xlog_insert(bool isleaf, bool ismeta,
XLogRecPtr lsn, XLogRecord *record)
{
xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
Buffer buffer;
+ Buffer cbuffer = InvalidBuffer;
Page page;
char *datapos;
int datalen;
xl_btree_metadata md;
- BlockNumber downlink = 0;
+ BlockNumber cblkno = 0;
+ int main_blk_index;
datapos = (char *) xlrec + SizeOfBtreeInsert;
datalen = record->xl_len - SizeOfBtreeInsert;
- if (!isleaf)
+ /*
+ * if this insert finishes a split at lower level, extract the block
+ * number of the (left) child.
+ */
+ if (!isleaf && (record->xl_info & XLR_BKP_BLOCK(0)) == 0)
{
- memcpy(&downlink, datapos, sizeof(BlockNumber));
+ memcpy(&cblkno, datapos, sizeof(BlockNumber));
+ Assert(cblkno != 0);
datapos += sizeof(BlockNumber);
datalen -= sizeof(BlockNumber);
}
@@ -176,8 +158,19 @@ btree_xlog_insert(bool isleaf, bool ismeta,
datalen -= sizeof(xl_btree_metadata);
}
- if (record->xl_info & XLR_BKP_BLOCK(0))
- (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ if (!isleaf)
+ {
+ if (record->xl_info & XLR_BKP_BLOCK(0))
+ (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ else
+ _bt_clear_incomplete_split(lsn, record, xlrec->target.node, cblkno);
+ main_blk_index = 1;
+ }
+ else
+ main_blk_index = 0;
+
+ if (record->xl_info & XLR_BKP_BLOCK(main_blk_index))
+ (void) RestoreBackupBlock(lsn, record, main_blk_index, false, false);
else
{
buffer = XLogReadBuffer(xlrec->target.node,
@@ -187,11 +180,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
{
page = (Page) BufferGetPage(buffer);
- if (lsn <= PageGetLSN(page))
- {
- UnlockReleaseBuffer(buffer);
- }
- else
+ if (lsn > PageGetLSN(page))
{
if (PageAddItem(page, (Item) datapos, datalen,
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
@@ -200,11 +189,14 @@ btree_xlog_insert(bool isleaf, bool ismeta,
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
- UnlockReleaseBuffer(buffer);
}
+ UnlockReleaseBuffer(buffer);
}
}
+ if (BufferIsValid(cbuffer))
+ UnlockReleaseBuffer(cbuffer);
+
/*
* Note: in normal operation, we'd update the metapage while still holding
* lock on the page we inserted into. But during replay it's not
@@ -216,10 +208,6 @@ btree_xlog_insert(bool isleaf, bool ismeta,
_bt_restore_meta(xlrec->target.node, lsn,
md.root, md.level,
md.fastroot, md.fastlevel);
-
- /* Forget any split this insertion completes */
- if (!isleaf)
- forget_matching_split(xlrec->target.node, downlink, false);
}
static void
@@ -227,6 +215,8 @@ btree_xlog_split(bool onleft, bool isroot,
XLogRecPtr lsn, XLogRecord *record)
{
xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
+ bool isleaf = (xlrec->level == 0);
+ Buffer lbuf;
Buffer rbuf;
Page rpage;
BTPageOpaque ropaque;
@@ -237,42 +227,18 @@ btree_xlog_split(bool onleft, bool isroot,
Size newitemsz = 0;
Item left_hikey = NULL;
Size left_hikeysz = 0;
+ BlockNumber cblkno = InvalidBlockNumber;
datapos = (char *) xlrec + SizeOfBtreeSplit;
datalen = record->xl_len - SizeOfBtreeSplit;
- /* Forget any split this insertion completes */
- if (xlrec->level > 0)
- {
- /* we assume SizeOfBtreeSplit is at least 16-bit aligned */
- BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos);
-
- datapos += sizeof(BlockIdData);
- datalen -= sizeof(BlockIdData);
-
- forget_matching_split(xlrec->node, downlink, false);
-
- /* Extract left hikey and its size (still assuming 16-bit alignment) */
- if (!(record->xl_info & XLR_BKP_BLOCK(0)))
- {
- /* We assume 16-bit alignment is enough for IndexTupleSize */
- left_hikey = (Item) datapos;
- left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
-
- datapos += left_hikeysz;
- datalen -= left_hikeysz;
- }
- }
-
- /* Extract newitem and newitemoff, if present */
+ /* Extract newitemoff and newitem, if present */
if (onleft)
{
- /* Extract the offset (still assuming 16-bit alignment) */
memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
datapos += sizeof(OffsetNumber);
datalen -= sizeof(OffsetNumber);
}
-
if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0)))
{
/*
@@ -286,6 +252,37 @@ btree_xlog_split(bool onleft, bool isroot,
datalen -= newitemsz;
}
+ /* Extract left hikey and its size (still assuming 16-bit alignment) */
+ if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(0)))
+ {
+ left_hikey = (Item) datapos;
+ left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
+ datapos += left_hikeysz;
+ datalen -= left_hikeysz;
+ }
+ /*
+ * If this insertion finishes an incomplete split, get the block number
+ * of the child.
+ */
+ if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(1)))
+ {
+ memcpy(&cblkno, datapos, sizeof(BlockNumber));
+ datapos += sizeof(BlockNumber);
+ datalen -= sizeof(BlockNumber);
+ }
+
+ /*
+ * Clear the incomplete split flag on the left sibling of the child page
+ * this is a downlink for.
+ */
+ if (!isleaf)
+ {
+ if (record->xl_info & XLR_BKP_BLOCK(2))
+ (void) RestoreBackupBlock(lsn, record, 2, false, false);
+ else
+ _bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno);
+ }
+
/* Reconstruct right (new) sibling page from scratch */
rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true);
Assert(BufferIsValid(rbuf));
@@ -297,7 +294,7 @@ btree_xlog_split(bool onleft, bool isroot,
ropaque->btpo_prev = xlrec->leftsib;
ropaque->btpo_next = xlrec->rnext;
ropaque->btpo.level = xlrec->level;
- ropaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
+ ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
ropaque->btpo_cycleid = 0;
_bt_restore_page(rpage, datapos, datalen);
@@ -306,7 +303,7 @@ btree_xlog_split(bool onleft, bool isroot,
* On leaf level, the high key of the left page is equal to the first key
* on the right page.
*/
- if (xlrec->level == 0)
+ if (isleaf)
{
ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
@@ -321,10 +318,10 @@ btree_xlog_split(bool onleft, bool isroot,
/* Now reconstruct left (original) sibling page */
if (record->xl_info & XLR_BKP_BLOCK(0))
- (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ lbuf = RestoreBackupBlock(lsn, record, 0, false, true);
else
{
- Buffer lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false);
+ lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false);
if (BufferIsValid(lbuf))
{
@@ -381,19 +378,21 @@ btree_xlog_split(bool onleft, bool isroot,
elog(PANIC, "failed to add high key to left page after split");
/* Fix opaque fields */
- lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
+ lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
+ if (isleaf)
+ lopaque->btpo_flags |= BTP_LEAF;
lopaque->btpo_next = xlrec->rightsib;
lopaque->btpo_cycleid = 0;
PageSetLSN(lpage, lsn);
MarkBufferDirty(lbuf);
}
-
- UnlockReleaseBuffer(lbuf);
}
}
- /* We no longer need the right buffer */
+ /* We no longer need the buffers */
+ if (BufferIsValid(lbuf))
+ UnlockReleaseBuffer(lbuf);
UnlockReleaseBuffer(rbuf);
/*
@@ -404,32 +403,39 @@ btree_xlog_split(bool onleft, bool isroot,
* replay, because no other index update can be in progress, and readers
* will cope properly when following an obsolete left-link.
*/
- if (record->xl_info & XLR_BKP_BLOCK(1))
- (void) RestoreBackupBlock(lsn, record, 1, false, false);
- else if (xlrec->rnext != P_NONE)
+ if (xlrec->rnext != P_NONE)
{
- Buffer buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false);
+ /*
+ * the backup block containing right sibling is 2 or 3, depending
+ * whether this was a leaf or internal page.
+ */
+ int rnext_index = isleaf ? 2 : 3;
- if (BufferIsValid(buffer))
+ if (record->xl_info & XLR_BKP_BLOCK(rnext_index))
+ (void) RestoreBackupBlock(lsn, record, rnext_index, false, false);
+ else
{
- Page page = (Page) BufferGetPage(buffer);
+ Buffer buffer;
- if (lsn > PageGetLSN(page))
+ buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false);
+
+ if (BufferIsValid(buffer))
{
- BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ Page page = (Page) BufferGetPage(buffer);
- pageop->btpo_prev = xlrec->rightsib;
+ if (lsn > PageGetLSN(page))
+ {
+ BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
- PageSetLSN(page, lsn);
- MarkBufferDirty(buffer);
+ pageop->btpo_prev = xlrec->rightsib;
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
}
- UnlockReleaseBuffer(buffer);
}
}
-
- /* The job ain't done till the parent link is inserted... */
- log_incomplete_split(xlrec->node,
- xlrec->leftsib, xlrec->rightsib, isroot);
}
static void
@@ -1003,10 +1009,6 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
Buffer buffer;
Page page;
BTPageOpaque pageop;
- BlockNumber downlink = 0;
-
- /* Backup blocks are not used in newroot records */
- Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true);
Assert(BufferIsValid(buffer));
@@ -1025,14 +1027,21 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
if (record->xl_len > SizeOfBtreeNewroot)
{
IndexTuple itup;
+ BlockNumber cblkno;
_bt_restore_page(page,
(char *) xlrec + SizeOfBtreeNewroot,
record->xl_len - SizeOfBtreeNewroot);
- /* extract downlink to the right-hand split page */
- itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY));
- downlink = ItemPointerGetBlockNumber(&(itup->t_tid));
+ /* extract block number of the left-hand split page */
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY));
+ cblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
+
+ /* Clear the incomplete-split flag in left child */
+ if (record->xl_info & XLR_BKP_BLOCK(0))
+ (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ else
+ _bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno);
}
PageSetLSN(page, lsn);
@@ -1042,10 +1051,6 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
_bt_restore_meta(xlrec->node, lsn,
xlrec->rootblk, xlrec->level,
xlrec->rootblk, xlrec->level);
-
- /* Check to see if this satisfies any incomplete insertions */
- if (record->xl_len > SizeOfBtreeNewroot)
- forget_matching_split(xlrec->node, downlink, true);
}
static void
@@ -1125,59 +1130,3 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
elog(PANIC, "btree_redo: unknown op code %u", info);
}
}
-
-void
-btree_xlog_startup(void)
-{
- incomplete_splits = NIL;
-}
-
-void
-btree_xlog_cleanup(void)
-{
- ListCell *l;
-
- foreach(l, incomplete_splits)
- {
- /* finish an incomplete split */
- bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l);
- Buffer lbuf,
- rbuf;
- Page lpage,
- rpage;
- BTPageOpaque lpageop,
- rpageop;
- bool is_only;
- Relation reln;
-
- lbuf = XLogReadBuffer(action->node, action->leftblk, false);
- /* failure is impossible because we wrote this page earlier */
- if (!BufferIsValid(lbuf))
- elog(PANIC, "btree_xlog_cleanup: left block unfound");
- lpage = (Page) BufferGetPage(lbuf);
- lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
- rbuf = XLogReadBuffer(action->node, action->rightblk, false);
- /* failure is impossible because we wrote this page earlier */
- if (!BufferIsValid(rbuf))
- elog(PANIC, "btree_xlog_cleanup: right block unfound");
- rpage = (Page) BufferGetPage(rbuf);
- rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
-
- /* if the pages are all of their level, it's a only-page split */
- is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
-
- reln = CreateFakeRelcacheEntry(action->node);
- _bt_insert_parent(reln, lbuf, rbuf, NULL,
- action->is_root, is_only);
- FreeFakeRelcacheEntry(reln);
- }
- incomplete_splits = NIL;
-}
-
-bool
-btree_safe_restartpoint(void)
-{
- if (incomplete_splits)
- return false;
- return true;
-}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 779a42229b7..64c6982f50e 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -73,6 +73,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples */
+#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
/*
* The max allowed value of a cycle ID is a bit less than 64K. This is
@@ -178,6 +179,7 @@ typedef struct BTMetaPageData
#define P_ISHALFDEAD(opaque) ((opaque)->btpo_flags & BTP_HALF_DEAD)
#define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
#define P_HAS_GARBAGE(opaque) ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
+#define P_INCOMPLETE_SPLIT(opaque) ((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT)
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@@ -253,7 +255,7 @@ typedef struct xl_btree_metadata
typedef struct xl_btree_insert
{
xl_btreetid target; /* inserted tuple id */
- /* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
+ /* BlockNumber finishes_split field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
/* INDEX TUPLE FOLLOWS AT END OF STRUCT */
} xl_btree_insert;
@@ -286,19 +288,18 @@ typedef struct xl_btree_split
OffsetNumber firstright; /* first item moved to right page */
/*
- * If level > 0, BlockIdData downlink follows. (We use BlockIdData rather
- * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
- * aligned.)
+ * In the _L variants, next are OffsetNumber newitemoff and the new item.
+ * (In the _R variants, the new item is one of the right page's tuples.)
+ * The new item, but not newitemoff, is suppressed if XLogInsert chooses
+ * to store the left page's whole page image.
*
* If level > 0, an IndexTuple representing the HIKEY of the left page
* follows. We don't need this on leaf pages, because it's the same as
* the leftmost key in the new right page. Also, it's suppressed if
* XLogInsert chooses to store the left page's whole page image.
*
- * In the _L variants, next are OffsetNumber newitemoff and the new item.
- * (In the _R variants, the new item is one of the right page's tuples.)
- * The new item, but not newitemoff, is suppressed if XLogInsert chooses
- * to store the left page's whole page image.
+ * If level > 0, BlockNumber of the page whose incomplete-split flag
+ * this insertion clears. (not aligned)
*
* Last are the right page's tuples in the form used by _bt_restore_page.
*/
@@ -642,8 +643,7 @@ extern Datum btoptions(PG_FUNCTION_ARGS);
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
IndexUniqueCheck checkUnique, Relation heapRel);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
-extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
- BTStack stack, bool is_root, bool is_only);
+extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
/*
* prototypes for functions in nbtpage.c
@@ -673,7 +673,8 @@ extern BTStack _bt_search(Relation rel,
int keysz, ScanKey scankey, bool nextkey,
Buffer *bufP, int access);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
- ScanKey scankey, bool nextkey, int access);
+ ScanKey scankey, bool nextkey, bool forupdate, BTStack stack,
+ int access);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
@@ -722,8 +723,5 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
*/
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
-extern void btree_xlog_startup(void);
-extern void btree_xlog_cleanup(void);
-extern bool btree_safe_restartpoint(void);
#endif /* NBTREE_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index c968101e8a2..d9ee57d6da2 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, NULL, NULL, NULL)
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, NULL, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, NULL, NULL, NULL)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, NULL, NULL, NULL)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, NULL, NULL, NULL)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, NULL)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 297e4505db9..ec021fe29ae 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -55,7 +55,7 @@ typedef struct BkpBlock
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD07C /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD07D /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{