aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/nbtree/README24
-rw-r--r--src/backend/access/nbtree/nbtree.c23
-rw-r--r--src/backend/access/nbtree/nbtxlog.c18
-rw-r--r--src/backend/access/rmgrdesc/nbtdesc.c2
-rw-r--r--src/include/access/nbtree.h6
5 files changed, 67 insertions, 6 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 7055c242d20..3c703948448 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -520,6 +520,30 @@ normal running after recovery has completed. This is a key capability
because it allows running applications to continue while the standby
changes state into a normally running server.
+The interlocking required to avoid returning incorrect results from
+MVCC scans is not required on standby nodes. That is because
+HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(),
+HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only
+ever used during write transactions, which cannot exist on the standby.
+This leaves HeapTupleSatisfiesMVCC() and HeapTupleSatisfiesToast(), so
+HeapTupleSatisfiesToast() is the only non-MVCC scan type used on standbys.
+There is one minor exception, which is that the optimizer sometimes
+looks at the boundaries of value ranges using SnapshotDirty, which
+could result in returning a newer value for query statistics; this
+would affect the query plan in rare cases, but not the correctness.
+The risk window is small since the stats look at the min and max values
+in the index, so the scan retrieves a tid then immediately uses it
+to look in the heap. It is unlikely that the tid could have been
+deleted, vacuumed and re-inserted in the time taken to look in the heap
+via direct tid access. So we ignore that scan type as a problem.
+This means that if we re-check the results of any scan of a toast index we
+will be able to completely avoid performing the "pin scan" operation
+during replay of VACUUM WAL records.
+
+XXX FIXME: Toast re-checks are not yet added, so we still perform the
+pin scan when replaying vacuum records of toast indexes.
+
+
Other Things That Are Handy to Know
-----------------------------------
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 712385b3bfc..752e3b5dd12 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -22,6 +22,7 @@
#include "access/relscan.h"
#include "access/xlog.h"
#include "catalog/index.h"
+#include "catalog/pg_namespace.h"
#include "commands/vacuum.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
@@ -823,6 +824,11 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
}
/*
+ * Check to see if we need to issue one final WAL record for this index,
+ * which may be needed for correctness on a hot standby node when
* non-MVCC index scans could take place. This now only occurs when we
* perform a TOAST scan, so it only occurs for TOAST indexes.
+ *
* If the WAL is replayed in hot standby, the replay process needs to get
* cleanup locks on all index leaf pages, just as we've been doing here.
* However, we won't issue any WAL records about pages that have no items
@@ -833,6 +839,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
* against the last leaf page in the index, if that one wasn't vacuumed.
*/
if (XLogStandbyInfoActive() &&
+ rel->rd_rel->relnamespace == PG_TOAST_NAMESPACE &&
vstate.lastBlockVacuumed < vstate.lastBlockLocked)
{
Buffer buf;
@@ -1031,6 +1038,20 @@ restart:
*/
if (ndeletable > 0)
{
+ BlockNumber lastBlockVacuumed = InvalidBlockNumber;
+
+ /*
+ * We may need to record the lastBlockVacuumed for use when
+ * non-MVCC scans might be performed on the index on a
+ * hot standby. See explanation in btree_xlog_vacuum().
+ *
+ * On a hot standby, a non-MVCC scan can only take place
+ * when we access a Toast Index, so we need only record
+ * the lastBlockVacuumed if we are vacuuming a Toast Index.
+ */
+ if (rel->rd_rel->relnamespace == PG_TOAST_NAMESPACE)
+ lastBlockVacuumed = vstate->lastBlockVacuumed;
+
/*
* Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
* instruction to the replay code to get cleanup lock on all pages
@@ -1043,7 +1064,7 @@ restart:
* that.
*/
_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
- vstate->lastBlockVacuumed);
+ lastBlockVacuumed);
/*
* Remember highest leaf page number we've issued a
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index bba4840da05..0d094ca7faa 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -391,6 +391,19 @@ btree_xlog_vacuum(XLogReaderState *record)
BTPageOpaque opaque;
/*
+ * If we are running non-MVCC scans using this index we need to do some
+ * additional work to ensure correctness, which is known as a "pin scan"
+ * described in more detail in next paragraphs. We used to do the extra
+ * work in all cases, whereas we now avoid that work except when the index
+ * is a toast index, since toast scans aren't fully MVCC compliant.
+ * If lastBlockVacuumed is set to InvalidBlockNumber then we skip the
+ * additional work required for the pin scan.
+ *
+ * Avoiding this extra work is important since it requires us to touch
+ * every page in the index, so it is an O(N) operation. Worse, it is an
+ * operation performed in the foreground during redo, so it delays
+ * replication directly.
+ *
* If queries might be active then we need to ensure every leaf page is
* unpinned between the lastBlockVacuumed and the current block, if there
* are any. This prevents replay of the VACUUM from reaching the stage of
@@ -412,7 +425,7 @@ btree_xlog_vacuum(XLogReaderState *record)
* isn't yet consistent; so we need not fear reading still-corrupt blocks
* here during crash recovery.
*/
- if (HotStandbyActiveInReplay())
+ if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed))
{
RelFileNode thisrnode;
BlockNumber thisblkno;
@@ -433,7 +446,8 @@ btree_xlog_vacuum(XLogReaderState *record)
* XXX we don't actually need to read the block, we just need to
* confirm it is unpinned. If we had a special call into the
* buffer manager we could optimise this so that if the block is
- * not in shared_buffers we confirm it as unpinned.
+ * not in shared_buffers we confirm it as unpinned. Optimizing
+ * this is now moot, since in most cases we avoid the scan.
*/
buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno,
RBM_NORMAL_NO_LOG);
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index 7631cb5c73f..68afc2e09bd 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -48,7 +48,7 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
- appendStringInfo(buf, "lastBlockVacuumed %u",
+ appendStringInfo(buf, "lastBlockVacuumed %d",
xlrec->lastBlockVacuumed);
break;
}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9ebf446693b..b76083323b7 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -331,8 +331,10 @@ typedef struct xl_btree_reuse_page
* The WAL record can represent deletion of any number of index tuples on a
* single index page when executed by VACUUM.
*
- * The correctness requirement for applying these changes during recovery is
- * that we must do one of these two things for every block in the index:
+ * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber.
+ * For non-MVCC index scans there is an additional correctness requirement
+ * for applying these changes during recovery, which is that we must do one
+ * of these two things for every block in the index:
* * lock the block for cleanup and apply any required changes
* * EnsureBlockUnpinned()
* The purpose of this is to ensure that no index scans started before we