aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/backend/access/gist/gist.c 23
-rw-r--r-- src/backend/access/gist/gistbuild.c 7
-rw-r--r-- src/backend/access/gist/gistxlog.c 243
-rw-r--r-- src/backend/access/rmgrdesc/gistdesc.c 11
-rw-r--r-- src/include/access/gist_private.h 10
-rw-r--r-- src/include/access/gistxlog.h 17
6 files changed, 298 insertions, 13 deletions
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7a..a2cb84800e8 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -38,7 +38,8 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
bool unlockbuf, bool unlockleftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
+static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+ Relation heapRel);
#define ROTATEDIST(d) do { \
@@ -172,7 +173,7 @@ gistinsert(Relation r, Datum *values, bool *isnull,
values, isnull, true /* size is currently bogus */ );
itup->t_tid = *ht_ctid;
- gistdoinsert(r, itup, 0, giststate);
+ gistdoinsert(r, itup, 0, giststate, heapRel);
/* cleanup */
MemoryContextSwitchTo(oldCxt);
@@ -218,7 +219,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markfollowright)
+ bool markfollowright,
+ Relation heapRel)
{
BlockNumber blkno = BufferGetBlockNumber(buffer);
Page page = BufferGetPage(buffer);
@@ -259,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
*/
if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
{
- gistvacuumpage(rel, page, buffer);
+ gistvacuumpage(rel, page, buffer, heapRel);
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
}
@@ -604,7 +606,8 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
* so it does not bother releasing palloc'd allocations.
*/
void
-gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
+gistdoinsert(Relation r, IndexTuple itup, Size freespace,
+ GISTSTATE *giststate, Relation heapRel)
{
ItemId iid;
IndexTuple idxtuple;
@@ -616,6 +619,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
memset(&state, 0, sizeof(GISTInsertState));
state.freespace = freespace;
state.r = r;
+ state.heapRel = heapRel;
/* Start from the root */
firststack.blkno = GIST_ROOT_BLKNO;
@@ -1232,7 +1236,8 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
oldoffnum, NULL,
leftchild,
&splitinfo,
- true);
+ true,
+ state->heapRel);
/*
* Before recursing up in case the page was split, release locks on the
@@ -1543,7 +1548,7 @@ freeGISTstate(GISTSTATE *giststate)
* Function assumes that buffer is exclusively locked.
*/
static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer)
+gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
@@ -1589,9 +1594,9 @@ gistvacuumpage(Relation rel, Page page, Buffer buffer)
{
XLogRecPtr recptr;
- recptr = gistXLogUpdate(buffer,
+ recptr = gistXLogDelete(buffer,
deletable, ndeletable,
- NULL, 0, InvalidBuffer);
+ heapRel->rd_node);
PageSetLSN(page, recptr);
}
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 434f15f0148..b9c4e27e1a5 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -56,6 +56,7 @@ typedef enum
typedef struct
{
Relation indexrel;
+ Relation heaprel;
GISTSTATE *giststate;
int64 indtuples; /* number of tuples indexed */
@@ -122,6 +123,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
int fillfactor;
buildstate.indexrel = index;
+ buildstate.heaprel = heap;
if (index->rd_options)
{
/* Get buffering mode from the options string */
@@ -484,7 +486,7 @@ gistBuildCallback(Relation index,
* locked, we call gistdoinsert directly.
*/
gistdoinsert(index, itup, buildstate->freespace,
- buildstate->giststate);
+ buildstate->giststate, buildstate->heaprel);
}
/* Update tuple count and total size. */
@@ -690,7 +692,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
itup, ntup, oldoffnum, &placed_to_blk,
InvalidBuffer,
&splitinfo,
- false);
+ false,
+ buildstate->heaprel);
/*
* If this is a root split, update the root path item kept in memory. This
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e091269785..01e025d5fdb 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -16,8 +16,12 @@
#include "access/bufmask.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/procarray.h"
#include "utils/memutils.h"
static MemoryContext opCtx; /* working memory for operations */
@@ -161,6 +165,210 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
}
/*
+ * Derive the latestRemovedXid for a GiST delete record from the heap tuple
+ * headers that the to-be-deleted index tuples point at. Modelled on
+ * btree_xlog_delete_get_latestRemovedXid; may return InvalidTransactionId.
+ */
+static TransactionId
+gistRedoDeleteRecordGetLatestRemovedXid(XLogReaderState *record)
+{
+ gistxlogDelete *xlrec = (gistxlogDelete *) XLogRecGetData(record);
+ OffsetNumber *todelete;
+ Buffer ibuffer,
+ hbuffer;
+ Page ipage,
+ hpage;
+ RelFileNode rnode;
+ BlockNumber blkno;
+ ItemId iitemid,
+ hitemid;
+ IndexTuple itup;
+ HeapTupleHeader htuphdr;
+ BlockNumber hblkno;
+ OffsetNumber hoffnum;
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ int i;
+
+ /*
+ * If there's nothing running on the standby we don't need to derive a
+ * full latestRemovedXid value, so use a fast path out of here. This
+ * returns InvalidTransactionId, and so will conflict with all HS
+ * transactions; but since we just worked out that that's zero people,
+ * it's OK.
+ *
+ * XXX There is a race condition here, which is that a new backend might
+ * start just after we look. If so, it cannot need to conflict, but this
+ * coding will result in throwing a conflict anyway.
+ */
+ if (CountDBBackends(InvalidOid) == 0)
+ return latestRemovedXid;
+
+ /*
+ * In what follows, we have to examine the previous state of the index
+ * page, as well as the heap page(s) it points to. This is only valid if
+ * WAL replay has reached a consistent database state; which means that
+ * the preceding check is not just an optimization, but is *necessary*. We
+ * won't have let in any user sessions before we reach consistency.
+ */
+ if (!reachedConsistency)
+ elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data");
+
+ /*
+ * Get index page. If the DB is consistent, this should not fail, nor
+ * should any of the heap page fetches below. If one does, we return
+ * InvalidTransactionId to cancel all HS transactions. That's probably
+ * overkill, but it's safe, and certainly better than panicking here.
+ */
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
+ ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
+ if (!BufferIsValid(ibuffer))
+ return InvalidTransactionId;
+ LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE);
+ ipage = (Page) BufferGetPage(ibuffer);
+
+ /*
+ * Loop through the to-be-deleted index items, whose offsets are stored right
+ * after the gistxlogDelete header, and inspect the heap items they point to.
+ */
+ todelete = (OffsetNumber *) ((char *) xlrec + SizeOfGistxlogDelete);
+
+ for (i = 0; i < xlrec->ntodelete; i++)
+ {
+ /*
+ * Identify the index tuple about to be deleted
+ */
+ iitemid = PageGetItemId(ipage, todelete[i]);
+ itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+ /*
+ * Locate the heap page that the index tuple points at
+ */
+ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
+ if (!BufferIsValid(hbuffer))
+ {
+ UnlockReleaseBuffer(ibuffer);
+ return InvalidTransactionId;
+ }
+ LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
+ hpage = (Page) BufferGetPage(hbuffer);
+
+ /*
+ * Look up the heap tuple header that the index tuple points at by
+ * using the heap node supplied with the xlrec. We can't use
+ * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+ * Note that we are not looking at tuple data here, just headers.
+ */
+ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ hitemid = PageGetItemId(hpage, hoffnum);
+
+ /*
+ * Follow any redirections until we find something useful.
+ */
+ while (ItemIdIsRedirected(hitemid))
+ {
+ hoffnum = ItemIdGetRedirect(hitemid);
+ hitemid = PageGetItemId(hpage, hoffnum);
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If the heap item has storage, then read the header and use that to
+ * set latestRemovedXid.
+ *
+ * Some LP_DEAD items may not be accessible, so we ignore them.
+ */
+ if (ItemIdHasStorage(hitemid))
+ {
+ htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+ HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+ }
+ else if (ItemIdIsDead(hitemid))
+ {
+ /*
+ * Conjecture: if hitemid is dead then it had xids before the xids
+ * marked on LP_NORMAL items. So we just ignore this item and move
+ * onto the next, for the purposes of calculating
+ * latestRemovedXid.
+ */
+ }
+ else
+ Assert(!ItemIdIsUsed(hitemid));
+
+ UnlockReleaseBuffer(hbuffer);
+ }
+
+ UnlockReleaseBuffer(ibuffer);
+
+ /*
+ * If all heap tuples were LP_DEAD then we will be returning
+ * InvalidTransactionId here, which avoids conflicts. This matches
+ * existing logic which assumes that LP_DEAD tuples must already be older
+ * than the latestRemovedXid on the cleanup record that set them as
+ * LP_DEAD, hence must already have generated a conflict.
+ */
+ return latestRemovedXid;
+}
+
+/*
+ * Replay an XLOG_GIST_DELETE record: remove the index tuples that were found
+ * marked as DEAD on a gist index page during index tuple insertion
+ */
+static void
+gistRedoDeleteRecord(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page.
+ *
+ * GiST delete records can conflict with standby queries. You might think
+ * that vacuum records would conflict as well, but we've handled that
+ * already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+ * cleaned by the vacuum of the heap and so we can resolve any conflicts
+ * just once when that arrives. After that we know that no conflicts
+ * exist from individual gist vacuum records on that index.
+ */
+ if (InHotStandby)
+ {
+ TransactionId latestRemovedXid = gistRedoDeleteRecordGetLatestRemovedXid(record);
+ RelFileNode rnode;
+
+ XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+ }
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete)
+ {
+ OffsetNumber *todelete;
+
+ todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete);
+
+ PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+ }
+
+ GistClearPageHasGarbage(page);
+ GistMarkTuplesDeleted(page);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
* Returns an array of index pointers.
*/
static IndexTuple *
@@ -318,6 +526,9 @@ gist_redo(XLogReaderState *record)
case XLOG_GIST_PAGE_UPDATE:
gistRedoPageUpdateRecord(record);
break;
+ case XLOG_GIST_DELETE:
+ gistRedoDeleteRecord(record);
+ break;
case XLOG_GIST_PAGE_SPLIT:
gistRedoPageSplitRecord(record);
break;
@@ -487,3 +698,35 @@ gistXLogUpdate(Buffer buffer,
return recptr;
}
+
+/*
+ * Write an XLOG_GIST_DELETE record for the ntodelete offsets in todelete[]
+ * removed from the leaf page in buffer; hnode identifies the heap. Returns
+ * XLogInsert()'s result. Not folded into gistXLogUpdate() because deleting
+ * index tuples might conflict with standby queries and needs special handling.
+ */
+XLogRecPtr
+gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
+ RelFileNode hnode)
+{
+ gistxlogDelete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.hnode = hnode;
+ xlrec.ntodelete = ntodelete;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfGistxlogDelete);
+
+ /*
+ * We need the target-offsets array whether or not we store the whole
+ * buffer, to allow us to find the latestRemovedXid on a standby server.
+ */
+ XLogRegisterData((char *) todelete, ntodelete * sizeof(OffsetNumber));
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_DELETE);
+
+ return recptr;
+}
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index e5e925e0c5a..b79ed1dfdc8 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -24,6 +24,11 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
}
static void
+out_gistxlogDelete(StringInfo buf, gistxlogPageUpdate *xlrec)
+{ /* prints nothing; NOTE(review): record is a gistxlogDelete, retype xlrec together with gist_desc's cast */
+}
+
+static void
out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
{
appendStringInfo(buf, "page_split: splits to %d pages",
@@ -41,6 +46,9 @@ gist_desc(StringInfo buf, XLogReaderState *record)
case XLOG_GIST_PAGE_UPDATE:
out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec);
break;
+ case XLOG_GIST_DELETE:
+ out_gistxlogDelete(buf, (gistxlogPageUpdate *) rec);
+ break;
case XLOG_GIST_PAGE_SPLIT:
out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec);
break;
@@ -59,6 +67,9 @@ gist_identify(uint8 info)
case XLOG_GIST_PAGE_UPDATE:
id = "PAGE_UPDATE";
break;
+ case XLOG_GIST_DELETE:
+ id = "DELETE";
+ break;
case XLOG_GIST_PAGE_SPLIT:
id = "PAGE_SPLIT";
break;
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba0..a73716d6eaa 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -240,6 +240,7 @@ typedef struct GistSplitVector
typedef struct
{
Relation r;
+ Relation heapRel;
Size freespace; /* free space to be left */
GISTInsertStack *stack;
@@ -389,7 +390,8 @@ extern void freeGISTstate(GISTSTATE *giststate);
extern void gistdoinsert(Relation r,
IndexTuple itup,
Size freespace,
- GISTSTATE *GISTstate);
+ GISTSTATE *GISTstate,
+ Relation heapRel);
/* A List of these is returned from gistplacetopage() in *splitinfo */
typedef struct
@@ -404,7 +406,8 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
OffsetNumber oldoffnum, BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
- bool markleftchild);
+ bool markleftchild,
+ Relation heapRel);
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
@@ -414,6 +417,9 @@ extern XLogRecPtr gistXLogUpdate(Buffer buffer,
IndexTuple *itup, int ntup,
Buffer leftchild);
+extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+ int ntodelete, RelFileNode hnode);
+
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN oldnsn,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 1a2b9496d0d..b67c7100500 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -18,6 +18,7 @@
#include "lib/stringinfo.h"
#define XLOG_GIST_PAGE_UPDATE 0x00
+#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a page */
/* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */
#define XLOG_GIST_PAGE_SPLIT 0x30
/* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
@@ -41,6 +42,22 @@ typedef struct gistxlogPageUpdate
} gistxlogPageUpdate;
/*
+ * Backup Blk 0: Leaf page, whose index tuples are deleted.
+ */
+typedef struct gistxlogDelete
+{
+ RelFileNode hnode; /* RelFileNode of the heap the index currently
+ * points at */
+ uint16 ntodelete; /* number of deleted offsets */
+
+ /*
+ * ntodelete OffsetNumbers follow this struct in the record data
+ */
+} gistxlogDelete;
+
+#define SizeOfGistxlogDelete (offsetof(gistxlogDelete, ntodelete) + sizeof(uint16))
+
+/*
* Backup Blk 0: If this operation completes a page split, by inserting a
* downlink for the split page, the left half of the split
* Backup Blk 1 - npage: split pages (1 is the original page)