author      Simon Riggs <simon@2ndQuadrant.com>    2009-12-19 01:32:45 +0000
committer   Simon Riggs <simon@2ndQuadrant.com>    2009-12-19 01:32:45 +0000
commit      efc16ea520679d713d98a2c7bf1453c4ff7b91ec (patch)
tree        6a39d2af0704a36281dc7df3ec10823eb3e6de75 /src/backend/access
parent      78a09145e0f8322e625bbc7d69fcb865ce4f3034 (diff)
Allow read-only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read-only queries. Recovery must enter a consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict with, and in some cases deadlock against, queries during recovery; these conflicts result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though they introduce four new types of WAL record.
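To make the conflict rule concrete, here is a minimal standalone C sketch, not the actual PostgreSQL internals: a standby backend conflicts with a cleanup record when its snapshot xmin is no newer than the record's latestRemovedXid. In the patch itself the redo routines walk the procarray via GetConflictingVirtualXIDs() and cancel via ResolveRecoveryConflictWithVirtualXIDs(), as the diff below shows, and XIDs are compared with the wraparound-aware TransactionIdPrecedes() rather than plain integer comparison.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

typedef uint32_t TransactionId;

static bool
conflicts_with_cleanup(TransactionId backend_xmin, TransactionId latest_removed_xid)
{
    /*
     * The real code uses modulo-2^32 XID comparison; plain <= is enough
     * for illustration with small, non-wrapped values.
     */
    return backend_xmin <= latest_removed_xid;
}

int
main(void)
{
    TransactionId latest_removed_xid = 1000;    /* from the WAL cleanup record */
    TransactionId xmins[] = {990, 1000, 1001, 1200};

    for (int i = 0; i < 4; i++)
        printf("backend xmin %u: %s\n", (unsigned) xmins[i],
               conflicts_with_cleanup(xmins[i], latest_removed_xid)
                   ? "conflicts; cancel after max_standby_delay"
                   : "unaffected");
    return 0;
}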
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port-specific behaviours have been used, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
Diffstat (limited to 'src/backend/access')
21 files changed, 1383 insertions, 194 deletions
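As orientation for the diff that follows, here is a minimal standalone C sketch (simplified stand-ins, not the real structs) of the two standby-side read rules the patch introduces, mirroring the heapgetpage() change in heapam.c and the RelationGetIndexScan() change in genam.c: a snapshot taken during recovery must ignore the page-level all-visible hint, and an index scan started during recovery must ignore LP_DEAD killed-tuple hints set on the master, since the master's xmin horizon may be newer than the standby's.

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    bool takenDuringRecovery;   /* snapshot taken while in recovery? */
} Snapshot;

typedef struct
{
    bool xactStartedInRecovery;
    bool ignore_killed_tuples;
} IndexScan;

/* The page-level visibility hint can only be trusted outside recovery. */
static bool
can_skip_tuple_visibility(bool page_is_all_visible, const Snapshot *snap)
{
    return page_is_all_visible && !snap->takenDuringRecovery;
}

/* Killed-tuple hints from the master may be wrong for standby snapshots. */
static void
init_index_scan(IndexScan *scan, bool started_in_recovery)
{
    scan->xactStartedInRecovery = started_in_recovery;
    scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
}

int
main(void)
{
    Snapshot snap = { .takenDuringRecovery = true };
    IndexScan scan;

    init_index_scan(&scan, true);
    printf("skip per-tuple checks: %d, honour killed bits: %d\n",
           can_skip_tuple_visibility(true, &snap),
           scan.ignore_killed_tuples);
    return 0;
}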
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 1f008b727f0..186805b1249 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.19 2009/06/11 14:48:53 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.20 2009/12/19 01:32:31 sriggs Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -621,6 +621,10 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; + /* + * GIN indexes do not require any conflict processing. + */ + RestoreBkpBlocks(lsn, record, false); topCtx = MemoryContextSwitchTo(opCtx); diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 672d714e014..7a9f8934cf4 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistxlog.c,v 1.32 2009/01/20 18:59:36 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistxlog.c,v 1.33 2009/12/19 01:32:32 sriggs Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -396,6 +396,12 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) uint8 info = record->xl_info & ~XLR_INFO_MASK; MemoryContext oldCxt; + /* + * GIST indexes do not require any conflict processing. NB: If we ever + * implement a similar optimization we have in b-tree, and remove killed + * tuples outside VACUUM, we'll need to handle that here. + */ + RestoreBkpBlocks(lsn, record, false); oldCxt = MemoryContextSwitchTo(opCtx); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 148d88ba274..4b85b127a7d 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.278 2009/08/24 02:18:31 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.279 2009/12/19 01:32:32 sriggs Exp $ * * * INTERFACE ROUTINES @@ -59,6 +59,7 @@ #include "storage/lmgr.h" #include "storage/procarray.h" #include "storage/smgr.h" +#include "storage/standby.h" #include "utils/datum.h" #include "utils/inval.h" #include "utils/lsyscache.h" @@ -248,8 +249,11 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) /* * If the all-visible flag indicates that all tuples on the page are * visible to everyone, we can skip the per-tuple visibility tests. + * But not in hot standby mode. A tuple that's already visible to all + * transactions in the master might still be invisible to a read-only + * transaction in the standby. */ - all_visible = PageIsAllVisible(dp); + all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery; for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff); lineoff <= lines; @@ -3770,19 +3774,77 @@ heap_restrpos(HeapScanDesc scan) } /* + * If 'tuple' contains any XID greater than latestRemovedXid, update + * latestRemovedXid to the greatest one found. 
+ */ +void +HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, + TransactionId *latestRemovedXid) +{ + TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + TransactionId xmax = HeapTupleHeaderGetXmax(tuple); + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (tuple->t_infomask & HEAP_MOVED_OFF || + tuple->t_infomask & HEAP_MOVED_IN) + { + if (TransactionIdPrecedes(*latestRemovedXid, xvac)) + *latestRemovedXid = xvac; + } + + if (TransactionIdPrecedes(*latestRemovedXid, xmax)) + *latestRemovedXid = xmax; + + if (TransactionIdPrecedes(*latestRemovedXid, xmin)) + *latestRemovedXid = xmin; + + Assert(TransactionIdIsValid(*latestRemovedXid)); +} + +/* + * Perform XLogInsert to register a heap cleanup info message. These + * messages are sent once per VACUUM and are required because + * of the phasing of removal operations during a lazy VACUUM. + * see comments for vacuum_log_cleanup_info(). + */ +XLogRecPtr +log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid) +{ + xl_heap_cleanup_info xlrec; + XLogRecPtr recptr; + XLogRecData rdata; + + xlrec.node = rnode; + xlrec.latestRemovedXid = latestRemovedXid; + + rdata.data = (char *) &xlrec; + rdata.len = SizeOfHeapCleanupInfo; + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata); + + return recptr; +} + +/* * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. * * Note: prior to Postgres 8.3, the entries in the nowunused[] array were * zero-based tuple indexes. Now they are one-based like other uses * of OffsetNumber. + * + * We also include latestRemovedXid, which is the greatest XID present in + * the removed tuples. That allows recovery processing to cancel or wait + * for long standby queries that can still see these tuples. */ XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, - bool redirect_move) + TransactionId latestRemovedXid, bool redirect_move) { xl_heap_clean xlrec; uint8 info; @@ -3794,6 +3856,7 @@ log_heap_clean(Relation reln, Buffer buffer, xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); + xlrec.latestRemovedXid = latestRemovedXid; xlrec.nredirected = nredirected; xlrec.ndead = ndead; @@ -4068,6 +4131,33 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, } /* + * Handles CLEANUP_INFO + */ +static void +heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record) +{ + xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record); + + if (InHotStandby) + { + VirtualTransactionId *backends; + + backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid, + InvalidOid, + true); + ResolveRecoveryConflictWithVirtualXIDs(backends, + "VACUUM index cleanup", + CONFLICT_MODE_ERROR); + } + + /* + * Actual operation is a no-op. Record type exists to provide a means + * for conflict processing to occur before we begin index vacuum actions. + * see vacuumlazy.c and also comments in btvacuumpage() + */ +} + +/* * Handles CLEAN and CLEAN_MOVE record types */ static void @@ -4085,12 +4175,31 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move) int nunused; Size freespace; + /* + * We're about to remove tuples. In Hot Standby mode, ensure that there's + * no queries running for which the removed tuples are still visible. 
+ */ + if (InHotStandby) + { + VirtualTransactionId *backends; + + backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid, + InvalidOid, + true); + ResolveRecoveryConflictWithVirtualXIDs(backends, + "VACUUM heap cleanup", + CONFLICT_MODE_ERROR); + } + + RestoreBkpBlocks(lsn, record, true); + if (record->xl_info & XLR_BKP_BLOCK_1) return; - buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); + buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL); if (!BufferIsValid(buffer)) return; + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (XLByteLE(lsn, PageGetLSN(page))) @@ -4145,12 +4254,40 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; + /* + * In Hot Standby mode, ensure that there's no queries running which still + * consider the frozen xids as running. + */ + if (InHotStandby) + { + VirtualTransactionId *backends; + + /* + * XXX: Using cutoff_xid is overly conservative. Even if cutoff_xid + * is recent enough to conflict with a backend, the actual values + * being frozen might not be. With a typical vacuum_freeze_min_age + * setting in the ballpark of millions of transactions, it won't make + * a difference, but it might if you run a manual VACUUM FREEZE. + * Typically the cutoff is much earlier than any recently deceased + * tuple versions removed by this vacuum, so don't worry too much. + */ + backends = GetConflictingVirtualXIDs(cutoff_xid, + InvalidOid, + true); + ResolveRecoveryConflictWithVirtualXIDs(backends, + "VACUUM heap freeze", + CONFLICT_MODE_ERROR); + } + + RestoreBkpBlocks(lsn, record, false); + if (record->xl_info & XLR_BKP_BLOCK_1) return; - buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); + buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL); if (!BufferIsValid(buffer)) return; + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (XLByteLE(lsn, PageGetLSN(page))) @@ -4740,6 +4877,11 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; + /* + * These operations don't overwrite MVCC data so no conflict + * processing is required. The ones in heap2 rmgr do. + */ + RestoreBkpBlocks(lsn, record, false); switch (info & XLOG_HEAP_OPMASK) @@ -4778,20 +4920,25 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; + /* + * Note that RestoreBkpBlocks() is called after conflict processing + * within each record type handling function. 
+ */ + switch (info & XLOG_HEAP_OPMASK) { case XLOG_HEAP2_FREEZE: - RestoreBkpBlocks(lsn, record, false); heap_xlog_freeze(lsn, record); break; case XLOG_HEAP2_CLEAN: - RestoreBkpBlocks(lsn, record, true); heap_xlog_clean(lsn, record, false); break; case XLOG_HEAP2_CLEAN_MOVE: - RestoreBkpBlocks(lsn, record, true); heap_xlog_clean(lsn, record, true); break; + case XLOG_HEAP2_CLEANUP_INFO: + heap_xlog_cleanup_info(lsn, record); + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } @@ -4921,17 +5068,26 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; - appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u", + appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u", xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block); + xlrec->node.relNode, xlrec->block, + xlrec->latestRemovedXid); } else if (info == XLOG_HEAP2_CLEAN_MOVE) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; - appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u", + appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u remxid %u", xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block); + xlrec->node.relNode, xlrec->block, + xlrec->latestRemovedXid); + } + else if (info == XLOG_HEAP2_CLEANUP_INFO) + { + xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; + + appendStringInfo(buf, "cleanup info: remxid %u", + xlrec->latestRemovedXid); } else appendStringInfo(buf, "UNKNOWN"); diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 71ea689d0e6..1ea0899acc8 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.18 2009/06/11 14:48:53 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.19 2009/12/19 01:32:32 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -30,7 +30,8 @@ typedef struct { TransactionId new_prune_xid; /* new prune hint value for page */ - int nredirected; /* numbers of entries in arrays below */ + TransactionId latestRemovedXid; /* latest xid to be removed by this prune */ + int nredirected; /* numbers of entries in arrays below */ int ndead; int nunused; /* arrays that accumulate indexes of items to be changed */ @@ -85,6 +86,14 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) return; /* + * We can't write WAL in recovery mode, so there's no point trying to + * clean the page. The master will likely issue a cleaning WAL record + * soon anyway, so this is no particular loss. + */ + if (RecoveryInProgress()) + return; + + /* * We prune when a previous UPDATE failed to find enough space on the page * for a new tuple version, or when free space falls below the relation's * fill-factor target (but not less than 10%). @@ -176,6 +185,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, * of our working state. 
*/ prstate.new_prune_xid = InvalidTransactionId; + prstate.latestRemovedXid = InvalidTransactionId; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); @@ -257,7 +267,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, - redirect_move); + prstate.latestRemovedXid, redirect_move); PageSetLSN(BufferGetPage(buffer), recptr); PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); @@ -395,6 +405,8 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); ndeleted++; } @@ -520,7 +532,11 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, * find another DEAD tuple is a fairly unusual corner case.) */ if (tupdead) + { latestdead = offnum; + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + } else if (!recent_dead) break; diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index f07996a3d46..3bbbf3b06da 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.77 2009/12/07 05:22:21 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.78 2009/12/19 01:32:32 sriggs Exp $ * * NOTES * many of the old access method routines have been turned into @@ -91,8 +91,19 @@ RelationGetIndexScan(Relation indexRelation, else scan->keyData = NULL; + /* + * During recovery we ignore killed tuples and don't bother to kill them + * either. We do this because the xmin on the primary node could easily + * be later than the xmin on the standby node, so that what the primary + * thinks is killed is supposed to be visible on standby. So for correct + * MVCC for queries during recovery we must ignore these hints and check + * all tuples. Do *not* set ignore_killed_tuples to true when running + * in a transaction that was started during recovery. + * xactStartedInRecovery should not be altered by index AMs. + */ scan->kill_prior_tuple = false; - scan->ignore_killed_tuples = true; /* default setting */ + scan->xactStartedInRecovery = TransactionStartedDuringRecovery(); + scan->ignore_killed_tuples = !scan->xactStartedInRecovery; scan->opaque = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index f4ffeccd328..d71b26a5540 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.115 2009/07/29 20:56:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.116 2009/12/19 01:32:32 sriggs Exp $ * * INTERFACE ROUTINES * index_open - open an index relation by relation OID @@ -455,9 +455,12 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) /* * If we scanned a whole HOT chain and found only dead tuples, - * tell index AM to kill its entry for that TID. + * tell index AM to kill its entry for that TID. We do not do + * this when in recovery because it may violate MVCC to do so. + * see comments in RelationGetIndexScan(). 
*/ - scan->kill_prior_tuple = scan->xs_hot_dead; + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = scan->xs_hot_dead; /* * The AM's gettuple proc finds the next index entry matching the diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 9fe84e320e2..e53315a83fb 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.20 2008/03/21 13:23:27 momjian Exp $ +$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.21 2009/12/19 01:32:32 sriggs Exp $ Btree Indexing ============== @@ -401,6 +401,33 @@ of the WAL entry.) If the parent page becomes half-dead but is not immediately deleted due to a subsequent crash, there is no loss of consistency, and the empty page will be picked up by the next VACUUM. +Scans during Recovery +--------------------- + +The btree index type can be safely used during recovery. During recovery +we have at most one writer and potentially many readers. In that +situation the locking requirements can be relaxed and we do not need +double locking during block splits. Each WAL record makes changes to a +single level of the btree using the correct locking sequence and so +is safe for concurrent readers. Some readers may observe a block split +in progress as they descend the tree, but they will simply move right +onto the correct page. + +During recovery all index scans start with ignore_killed_tuples = false +and we never set kill_prior_tuple. We do this because the oldest xmin +on the standby server can be older than the oldest xmin on the master +server, which means tuples can be marked as killed even when they are +still visible on the standby. We don't WAL log tuple killed bits, but +they can still appear in the standby because of full page writes. So +we must always ignore them in standby, and that means it's not worth +setting them either. + +Note that we talk about scans that are started during recovery. We go to +a little trouble to allow a scan to start during recovery and end during +normal running after recovery has completed. This is a key capability +because it allows running applications to continue while the standby +changes state into a normally running server. 
+ Other Things That Are Handy to Know ----------------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index a1dadfb6923..3263d5846a6 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.174 2009/10/02 21:14:04 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.175 2009/12/19 01:32:32 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -2025,7 +2025,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer) } if (ndeletable > 0) - _bt_delitems(rel, buffer, deletable, ndeletable); + _bt_delitems(rel, buffer, deletable, ndeletable, false, 0); /* * Note: if we didn't find any LP_DEAD items, then the page's diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 0dd4fdae79a..85f352d343f 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.113 2009/05/05 19:02:22 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.114 2009/12/19 01:32:33 sriggs Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -653,19 +653,33 @@ _bt_page_recyclable(Page page) * * This routine assumes that the caller has pinned and locked the buffer. * Also, the given itemnos *must* appear in increasing order in the array. + * + * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby + * we need to be able to pin all of the blocks in the btree in physical + * order when replaying the effects of a VACUUM, just as we do for the + * original VACUUM itself. lastBlockVacuumed allows us to tell whether an + * intermediate range of blocks has had no changes at all by VACUUM, + * and so must be scanned anyway during replay. We always write a WAL record + * for the last block in the index, whether or not it contained any items + * to be removed. This allows us to scan right up to end of index to + * ensure correct locking. */ void _bt_delitems(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems) + OffsetNumber *itemnos, int nitems, bool isVacuum, + BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + Assert(isVacuum || lastBlockVacuumed == 0); + /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* Fix the page */ - PageIndexMultiDelete(page, itemnos, nitems); + if (nitems > 0) + PageIndexMultiDelete(page, itemnos, nitems); /* * We can clear the vacuum cycle ID since this page has certainly been @@ -688,15 +702,36 @@ _bt_delitems(Relation rel, Buffer buf, /* XLOG stuff */ if (!rel->rd_istemp) { - xl_btree_delete xlrec; XLogRecPtr recptr; XLogRecData rdata[2]; - xlrec.node = rel->rd_node; - xlrec.block = BufferGetBlockNumber(buf); + if (isVacuum) + { + xl_btree_vacuum xlrec_vacuum; + xlrec_vacuum.node = rel->rd_node; + xlrec_vacuum.block = BufferGetBlockNumber(buf); + + xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + rdata[0].data = (char *) &xlrec_vacuum; + rdata[0].len = SizeOfBtreeVacuum; + } + else + { + xl_btree_delete xlrec_delete; + xlrec_delete.node = rel->rd_node; + xlrec_delete.block = BufferGetBlockNumber(buf); + + /* + * XXX: We would like to set an accurate latestRemovedXid, but + * there is no easy way of obtaining a useful value. 
So we punt + * and store InvalidTransactionId, which forces the standby to + * wait for/cancel all currently running transactions. + */ + xlrec_delete.latestRemovedXid = InvalidTransactionId; + rdata[0].data = (char *) &xlrec_delete; + rdata[0].len = SizeOfBtreeDelete; + } - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeDelete; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); @@ -719,7 +754,10 @@ _bt_delitems(Relation rel, Buffer buf, rdata[1].buffer_std = true; rdata[1].next = NULL; - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); + if (isVacuum) + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata); + else + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 87a8a225dbf..d166a811b80 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.172 2009/07/29 20:56:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.173 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -57,7 +57,8 @@ typedef struct IndexBulkDeleteCallback callback; void *callback_state; BTCycleId cycleid; - BlockNumber lastUsedPage; + BlockNumber lastBlockVacuumed; /* last blkno reached by Vacuum scan */ + BlockNumber lastUsedPage; /* blkno of last non-recyclable page */ BlockNumber totFreePages; /* true total # of free pages */ MemoryContext pagedelcontext; } BTVacState; @@ -629,6 +630,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; + vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastUsedPage = BTREE_METAPAGE; vstate.totFreePages = 0; @@ -705,6 +707,32 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, num_pages = new_pages; } + /* + * InHotStandby we need to scan right up to the end of the index for + * correct locking, so we may need to write a WAL record for the final + * block in the index if it was not vacuumed. It's possible that VACUUMing + * has actually removed zeroed pages at the end of the index so we need to + * take care to issue the record for last actual block and not for the + * last block that was scanned. Ignore empty indexes. + */ + if (XLogStandbyInfoActive() && + num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1)) + { + Buffer buf; + + /* + * We can't use _bt_getbuf() here because it always applies + * _bt_checkpage(), which will barf on an all-zero page. We want to + * recycle all-zero pages, not fail. Also, we want to use a nondefault + * buffer access strategy. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL, + info->strategy); + LockBufferForCleanup(buf); + _bt_delitems(rel, buf, NULL, 0, true, vstate.lastBlockVacuumed); + _bt_relbuf(rel, buf); + } + MemoryContextDelete(vstate.pagedelcontext); /* update statistics */ @@ -847,6 +875,26 @@ restart: itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); htup = &(itup->t_tid); + + /* + * During Hot Standby we currently assume that XLOG_BTREE_VACUUM + * records do not produce conflicts. 
That is only true as long + * as the callback function depends only upon whether the index + * tuple refers to heap tuples removed in the initial heap scan. + * When vacuum starts it derives a value of OldestXmin. Backends + * taking later snapshots could have a RecentGlobalXmin with a + * later xid than the vacuum's OldestXmin, so it is possible that + * row versions deleted after OldestXmin could be marked as killed + * by other backends. The callback function *could* look at the + * index tuple state in isolation and decide to delete the index + * tuple, though currently it does not. If it ever did, we would + * need to reconsider whether XLOG_BTREE_VACUUM records should + * cause conflicts. If they did cause conflicts they would be + * fairly harsh conflicts, since we haven't yet worked out a way + * to pass a useful value for latestRemovedXid on the + * XLOG_BTREE_VACUUM records. This applies to *any* type of index + * that marks index tuples as killed. + */ if (callback(htup, callback_state)) deletable[ndeletable++] = offnum; } @@ -858,7 +906,19 @@ restart: */ if (ndeletable > 0) { - _bt_delitems(rel, buf, deletable, ndeletable); + BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); + + _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed); + + /* + * Keep track of the block number of the lastBlockVacuumed, so + * we can scan those blocks as well during WAL replay. This then + * provides concurrency protection and allows btrees to be used + * while in recovery. + */ + if (lastBlockVacuumed > vstate->lastBlockVacuumed) + vstate->lastBlockVacuumed = lastBlockVacuumed; + stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index d132d6bdee1..418eec162d9 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.55 2009/06/11 14:48:54 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.56 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -16,7 +16,11 @@ #include "access/nbtree.h" #include "access/transam.h" +#include "access/xact.h" #include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "storage/standby.h" +#include "miscadmin.h" /* * We must keep track of expected insertions due to page splits, and apply @@ -459,6 +463,97 @@ btree_xlog_split(bool onleft, bool isroot, } static void +btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) +{ + xl_btree_vacuum *xlrec; + Buffer buffer; + Page page; + BTPageOpaque opaque; + + xlrec = (xl_btree_vacuum *) XLogRecGetData(record); + + /* + * If queries might be active then we need to ensure every block is unpinned + * between the lastBlockVacuumed and the current block, if there are any. + * This ensures that every block in the index is touched during VACUUM as + * required to ensure scans work correctly. + */ + if (standbyState == STANDBY_SNAPSHOT_READY && + (xlrec->lastBlockVacuumed + 1) != xlrec->block) + { + BlockNumber blkno = xlrec->lastBlockVacuumed + 1; + + for (; blkno < xlrec->block; blkno++) + { + /* + * XXX we don't actually need to read the block, we + * just need to confirm it is unpinned. 
If we had a special call + * into the buffer manager we could optimise this so that + * if the block is not in shared_buffers we confirm it as unpinned. + * + * Another simple optimization would be to check if there's any + * backends running; if not, we could just skip this. + */ + buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL); + if (BufferIsValid(buffer)) + { + LockBufferForCleanup(buffer); + UnlockReleaseBuffer(buffer); + } + } + } + + /* + * If the block was restored from a full page image, nothing more to do. + * The RestoreBkpBlocks() call already pinned and took cleanup lock on + * it. XXX: Perhaps we should call RestoreBkpBlocks() *after* the loop + * above, to make the disk access more sequential. + */ + if (record->xl_info & XLR_BKP_BLOCK_1) + return; + + /* + * Like in btvacuumpage(), we need to take a cleanup lock on every leaf + * page. See nbtree/README for details. + */ + buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL); + if (!BufferIsValid(buffer)) + return; + LockBufferForCleanup(buffer); + page = (Page) BufferGetPage(buffer); + + if (XLByteLE(lsn, PageGetLSN(page))) + { + UnlockReleaseBuffer(buffer); + return; + } + + if (record->xl_len > SizeOfBtreeVacuum) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum); + unend = (OffsetNumber *) ((char *) xlrec + record->xl_len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + /* + * Mark the page as not containing any LP_DEAD items --- see comments in + * _bt_delitems(). + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); +} + +static void btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) { xl_btree_delete *xlrec; @@ -470,6 +565,11 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) return; xlrec = (xl_btree_delete *) XLogRecGetData(record); + + /* + * We don't need to take a cleanup lock to apply these changes. + * See nbtree/README for details. + */ buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); if (!BufferIsValid(buffer)) return; @@ -714,7 +814,43 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - RestoreBkpBlocks(lsn, record, false); + /* + * Btree delete records can conflict with standby queries. You might + * think that vacuum records would conflict as well, but we've handled + * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid + * cleaned by the vacuum of the heap and so we can resolve any conflicts + * just once when that arrives. After that any we know that no conflicts + * exist from individual btree vacuum records on that index. + */ + if (InHotStandby) + { + if (info == XLOG_BTREE_DELETE) + { + xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); + VirtualTransactionId *backends; + + /* + * XXX Currently we put everybody on death row, because + * currently _bt_delitems() supplies InvalidTransactionId. + * This can be fairly painful, so providing a better value + * here is worth some thought and possibly some effort to + * improve. 
+ */ + backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid, + InvalidOid, + true); + + ResolveRecoveryConflictWithVirtualXIDs(backends, + "b-tree delete", + CONFLICT_MODE_ERROR); + } + } + + /* + * Vacuum needs to pin and take cleanup lock on every leaf page, + * a regular exclusive lock is enough for all other purposes. + */ + RestoreBkpBlocks(lsn, record, (info == XLOG_BTREE_VACUUM)); switch (info) { @@ -739,6 +875,9 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record) case XLOG_BTREE_SPLIT_R_ROOT: btree_xlog_split(false, true, lsn, record); break; + case XLOG_BTREE_VACUUM: + btree_xlog_vacuum(lsn, record); + break; case XLOG_BTREE_DELETE: btree_xlog_delete(lsn, record); break; @@ -843,13 +982,24 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->level, xlrec->firstright); break; } + case XLOG_BTREE_VACUUM: + { + xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; + + appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block, + xlrec->lastBlockVacuumed); + break; + } case XLOG_BTREE_DELETE: { xl_btree_delete *xlrec = (xl_btree_delete *) rec; - appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u", + appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u", xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block); + xlrec->node.relNode, xlrec->block, + xlrec->latestRemovedXid); break; } case XLOG_BTREE_DELETE_PAGE: diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 2edac9d088f..05c41d487c7 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.12 2008/10/20 19:18:18 alvherre Exp $ +$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.13 2009/12/19 01:32:33 sriggs Exp $ The Transaction System ====================== @@ -649,3 +649,34 @@ fsync it down to disk without any sort of interlock, as soon as it finishes the bulk update. However, all these paths are designed to write data that no other transaction can see until after T1 commits. The situation is thus not different from ordinary WAL-logged updates. + +Transaction Emulation during Recovery +------------------------------------- + +During Recovery we replay transaction changes in the order they occurred. +As part of this replay we emulate some transactional behaviour, so that +read only backends can take MVCC snapshots. We do this by maintaining a +list of XIDs belonging to transactions that are being replayed, so that +each transaction that has recorded WAL records for database writes exist +in the array until it commits. Further details are given in comments in +procarray.c. + +Many actions write no WAL records at all, for example read only transactions. +These have no effect on MVCC in recovery and we can pretend they never +occurred at all. Subtransaction commit does not write a WAL record either +and has very little effect, since lock waiters need to wait for the +parent transaction to complete. + +Not all transactional behaviour is emulated, for example we do not insert +a transaction entry into the lock table, nor do we maintain the transaction +stack in memory. Clog entries are made normally. Multitrans is not maintained +because its purpose is to record tuple level locks that an application has +requested to prevent write locks. 
Since write locks cannot be obtained at all, +there is never any conflict and so there is no reason to update multitrans. +Subtrans is maintained during recovery but the details of the transaction +tree are ignored and all subtransactions reference the top-level TransactionId +directly. Since commit is atomic this provides correct lock wait behaviour +yet simplifies emulation of subtransactions considerably. + +Further details on locking mechanics in recovery are given in comments +with the Lock rmgr code. diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 8544725abb4..d94c09424a9 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -26,7 +26,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.53 2009/06/11 14:48:54 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.54 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -574,7 +574,7 @@ ExtendCLOG(TransactionId newestXact) LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); /* Zero the page and make an XLOG entry about it */ - ZeroCLOGPage(pageno, true); + ZeroCLOGPage(pageno, !InRecovery); LWLockRelease(CLogControlLock); } diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 46eca9c9834..b272e9886bb 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -42,7 +42,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.32 2009/11/23 09:58:36 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.33 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -59,6 +59,7 @@ #include "storage/backendid.h" #include "storage/lmgr.h" #include "storage/procarray.h" +#include "utils/builtins.h" #include "utils/memutils.h" @@ -220,7 +221,6 @@ static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset); static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids); static int mXactCacheGetById(MultiXactId multi, TransactionId **xids); static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids); -static int xidComparator(const void *arg1, const void *arg2); #ifdef MULTIXACT_DEBUG static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids); @@ -1221,27 +1221,6 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids) MXactCache = entry; } -/* - * xidComparator - * qsort comparison function for XIDs - * - * We don't need to use wraparound comparison for XIDs, and indeed must - * not do so since that does not respect the triangle inequality! Any - * old sort order will do. 
- */ -static int -xidComparator(const void *arg1, const void *arg2) -{ - TransactionId xid1 = *(const TransactionId *) arg1; - TransactionId xid2 = *(const TransactionId *) arg2; - - if (xid1 > xid2) - return 1; - if (xid1 < xid2) - return -1; - return 0; -} - #ifdef MULTIXACT_DEBUG static char * mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids) @@ -2051,11 +2030,18 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) if (TransactionIdPrecedes(max_xid, xids[i])) max_xid = xids[i]; } + + /* We don't expect anyone else to modify nextXid, hence startup process + * doesn't need to hold a lock while checking this. We still acquire + * the lock to modify it, though. + */ if (TransactionIdFollowsOrEquals(max_xid, ShmemVariableCache->nextXid)) { + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); ShmemVariableCache->nextXid = max_xid; TransactionIdAdvance(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); } } else diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample index 1ef80ac60fd..cdbb49295fd 100644 --- a/src/backend/access/transam/recovery.conf.sample +++ b/src/backend/access/transam/recovery.conf.sample @@ -79,3 +79,10 @@ # # #--------------------------------------------------------------------------- +# HOT STANDBY PARAMETERS +#--------------------------------------------------------------------------- +# +# If you want to enable read-only connections during recovery, enable +# recovery_connections in postgresql.conf +# +#--------------------------------------------------------------------------- diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 44c3cd7769a..7e1e0f60fc3 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -3,7 +3,7 @@ * * Resource managers definition * - * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.27 2008/11/19 10:34:50 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.28 2009/12/19 01:32:33 sriggs Exp $ */ #include "postgres.h" @@ -21,6 +21,7 @@ #include "commands/sequence.h" #include "commands/tablespace.h" #include "storage/freespace.h" +#include "storage/standby.h" const RmgrData RmgrTable[RM_MAX_ID + 1] = { @@ -32,7 +33,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL}, {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL}, {"Reserved 7", NULL, NULL, NULL, NULL, NULL}, - {"Reserved 8", NULL, NULL, NULL, NULL, NULL}, + {"Standby", standby_redo, standby_desc, NULL, NULL, NULL}, {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL}, {"Heap", heap_redo, heap_desc, NULL, NULL, NULL}, {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint}, diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 9c74e995db0..2b9db48f3b0 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -22,7 +22,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.24 2009/01/01 17:23:36 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.25 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -68,15 +68,19 @@ static bool SubTransPagePrecedes(int page1, int page2); /* * Record the 
parent of a subtransaction in the subtrans log. + * + * In some cases we may need to overwrite an existing value. */ void -SubTransSetParent(TransactionId xid, TransactionId parent) +SubTransSetParent(TransactionId xid, TransactionId parent, bool overwriteOK) { int pageno = TransactionIdToPage(xid); int entryno = TransactionIdToEntry(xid); int slotno; TransactionId *ptr; + Assert(TransactionIdIsValid(parent)); + LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid); @@ -84,7 +88,8 @@ SubTransSetParent(TransactionId xid, TransactionId parent) ptr += entryno; /* Current state should be 0 */ - Assert(*ptr == InvalidTransactionId); + Assert(*ptr == InvalidTransactionId || + (*ptr == parent && overwriteOK)); *ptr = parent; diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index db5795324b2..4c3a1b901cb 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.56 2009/11/23 09:58:36 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.57 2009/12/19 01:32:33 sriggs Exp $ * * NOTES * Each global transaction is associated with a global transaction @@ -57,6 +57,7 @@ #include "pgstat.h" #include "storage/fd.h" #include "storage/procarray.h" +#include "storage/sinvaladt.h" #include "storage/smgr.h" #include "utils/builtins.h" #include "utils/memutils.h" @@ -144,7 +145,10 @@ static void RecordTransactionCommitPrepared(TransactionId xid, int nchildren, TransactionId *children, int nrels, - RelFileNode *rels); + RelFileNode *rels, + int ninvalmsgs, + SharedInvalidationMessage *invalmsgs, + bool initfileinval); static void RecordTransactionAbortPrepared(TransactionId xid, int nchildren, TransactionId *children, @@ -736,10 +740,11 @@ TwoPhaseGetDummyProc(TransactionId xid) * 2. TransactionId[] (subtransactions) * 3. RelFileNode[] (files to be deleted at commit) * 4. RelFileNode[] (files to be deleted at abort) - * 5. TwoPhaseRecordOnDisk - * 6. ... - * 7. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID) - * 8. CRC32 + * 5. SharedInvalidationMessage[] (inval messages to be sent at commit) + * 6. TwoPhaseRecordOnDisk + * 7. ... + * 8. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID) + * 9. CRC32 * * Each segment except the final CRC32 is MAXALIGN'd. */ @@ -760,6 +765,8 @@ typedef struct TwoPhaseFileHeader int32 nsubxacts; /* number of following subxact XIDs */ int32 ncommitrels; /* number of delete-on-commit rels */ int32 nabortrels; /* number of delete-on-abort rels */ + int32 ninvalmsgs; /* number of cache invalidation messages */ + bool initfileinval; /* does relcache init file need invalidation? 
*/ char gid[GIDSIZE]; /* GID for transaction */ } TwoPhaseFileHeader; @@ -835,6 +842,7 @@ StartPrepare(GlobalTransaction gxact) TransactionId *children; RelFileNode *commitrels; RelFileNode *abortrels; + SharedInvalidationMessage *invalmsgs; /* Initialize linked list */ records.head = palloc0(sizeof(XLogRecData)); @@ -859,11 +867,16 @@ StartPrepare(GlobalTransaction gxact) hdr.nsubxacts = xactGetCommittedChildren(&children); hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels, NULL); hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels, NULL); + hdr.ninvalmsgs = xactGetCommittedInvalidationMessages(&invalmsgs, + &hdr.initfileinval); StrNCpy(hdr.gid, gxact->gid, GIDSIZE); save_state_data(&hdr, sizeof(TwoPhaseFileHeader)); - /* Add the additional info about subxacts and deletable files */ + /* + * Add the additional info about subxacts, deletable files and + * cache invalidation messages. + */ if (hdr.nsubxacts > 0) { save_state_data(children, hdr.nsubxacts * sizeof(TransactionId)); @@ -880,6 +893,12 @@ StartPrepare(GlobalTransaction gxact) save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileNode)); pfree(abortrels); } + if (hdr.ninvalmsgs > 0) + { + save_state_data(invalmsgs, + hdr.ninvalmsgs * sizeof(SharedInvalidationMessage)); + pfree(invalmsgs); + } } /* @@ -1071,7 +1090,7 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, * contents of the file. Otherwise return NULL. */ static char * -ReadTwoPhaseFile(TransactionId xid) +ReadTwoPhaseFile(TransactionId xid, bool give_warnings) { char path[MAXPGPATH]; char *buf; @@ -1087,10 +1106,11 @@ ReadTwoPhaseFile(TransactionId xid) fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); if (fd < 0) { - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not open two-phase state file \"%s\": %m", - path))); + if (give_warnings) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not open two-phase state file \"%s\": %m", + path))); return NULL; } @@ -1103,10 +1123,11 @@ ReadTwoPhaseFile(TransactionId xid) if (fstat(fd, &stat)) { close(fd); - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not stat two-phase state file \"%s\": %m", - path))); + if (give_warnings) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not stat two-phase state file \"%s\": %m", + path))); return NULL; } @@ -1134,10 +1155,11 @@ ReadTwoPhaseFile(TransactionId xid) if (read(fd, buf, stat.st_size) != stat.st_size) { close(fd); - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not read two-phase state file \"%s\": %m", - path))); + if (give_warnings) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not read two-phase state file \"%s\": %m", + path))); pfree(buf); return NULL; } @@ -1166,6 +1188,30 @@ ReadTwoPhaseFile(TransactionId xid) return buf; } +/* + * Confirms an xid is prepared, during recovery + */ +bool +StandbyTransactionIdIsPrepared(TransactionId xid) +{ + char *buf; + TwoPhaseFileHeader *hdr; + bool result; + + Assert(TransactionIdIsValid(xid)); + + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, false); + if (buf == NULL) + return false; + + /* Check header also */ + hdr = (TwoPhaseFileHeader *) buf; + result = TransactionIdEquals(hdr->xid, xid); + pfree(buf); + + return result; +} /* * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED @@ -1184,6 +1230,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) RelFileNode *abortrels; RelFileNode *delrels; int ndelrels; + SharedInvalidationMessage *invalmsgs; int i; /* @@ 
-1196,7 +1243,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) /* * Read and validate the state file */ - buf = ReadTwoPhaseFile(xid); + buf = ReadTwoPhaseFile(xid, true); if (buf == NULL) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1215,6 +1262,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit) bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); abortrels = (RelFileNode *) bufptr; bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + invalmsgs = (SharedInvalidationMessage *) bufptr; + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); /* compute latestXid among all children */ latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children); @@ -1230,7 +1279,9 @@ FinishPreparedTransaction(const char *gid, bool isCommit) if (isCommit) RecordTransactionCommitPrepared(xid, hdr->nsubxacts, children, - hdr->ncommitrels, commitrels); + hdr->ncommitrels, commitrels, + hdr->ninvalmsgs, invalmsgs, + hdr->initfileinval); else RecordTransactionAbortPrepared(xid, hdr->nsubxacts, children, @@ -1277,6 +1328,18 @@ FinishPreparedTransaction(const char *gid, bool isCommit) smgrclose(srel); } + /* + * Handle cache invalidation messages. + * + * Relcache init file invalidation requires processing both + * before and after we send the SI messages. See AtEOXact_Inval() + */ + if (hdr->initfileinval) + RelationCacheInitFileInvalidate(true); + SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs); + if (hdr->initfileinval) + RelationCacheInitFileInvalidate(false); + /* And now do the callbacks */ if (isCommit) ProcessRecords(bufptr, xid, twophase_postcommit_callbacks); @@ -1528,14 +1591,21 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) * Our other responsibility is to determine and return the oldest valid XID * among the prepared xacts (if none, return ShmemVariableCache->nextXid). * This is needed to synchronize pg_subtrans startup properly. + * + * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all + * top-level xids is stored in *xids_p. The number of entries in the array + * is returned in *nxids_p. 
*/ TransactionId -PrescanPreparedTransactions(void) +PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) { TransactionId origNextXid = ShmemVariableCache->nextXid; TransactionId result = origNextXid; DIR *cldir; struct dirent *clde; + TransactionId *xids = NULL; + int nxids = 0; + int allocsize = 0; cldir = AllocateDir(TWOPHASE_DIR); while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) @@ -1567,7 +1637,7 @@ PrescanPreparedTransactions(void) */ /* Read and validate file */ - buf = ReadTwoPhaseFile(xid); + buf = ReadTwoPhaseFile(xid, true); if (buf == NULL) { ereport(WARNING, @@ -1615,11 +1685,36 @@ PrescanPreparedTransactions(void) } } + + if (xids_p) + { + if (nxids == allocsize) + { + if (nxids == 0) + { + allocsize = 10; + xids = palloc(allocsize * sizeof(TransactionId)); + } + else + { + allocsize = allocsize * 2; + xids = repalloc(xids, allocsize * sizeof(TransactionId)); + } + } + xids[nxids++] = xid; + } + pfree(buf); } } FreeDir(cldir); + if (xids_p) + { + *xids_p = xids; + *nxids_p = nxids; + } + return result; } @@ -1636,6 +1731,7 @@ RecoverPreparedTransactions(void) char dir[MAXPGPATH]; DIR *cldir; struct dirent *clde; + bool overwriteOK = false; snprintf(dir, MAXPGPATH, "%s", TWOPHASE_DIR); @@ -1666,7 +1762,7 @@ RecoverPreparedTransactions(void) } /* Read and validate file */ - buf = ReadTwoPhaseFile(xid); + buf = ReadTwoPhaseFile(xid, true); if (buf == NULL) { ereport(WARNING, @@ -1687,6 +1783,15 @@ RecoverPreparedTransactions(void) bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); + + /* + * It's possible that SubTransSetParent has been set before, if the + * prepared transaction generated xid assignment records. Test + * here must match one used in AssignTransactionId(). + */ + if (InHotStandby && hdr->nsubxacts >= PGPROC_MAX_CACHED_SUBXIDS) + overwriteOK = true; /* * Reconstruct subtrans state for the transaction --- needed @@ -1696,7 +1801,7 @@ RecoverPreparedTransactions(void) * hierarchy, but there's no need to restore that exactly. */ for (i = 0; i < hdr->nsubxacts; i++) - SubTransSetParent(subxids[i], xid); + SubTransSetParent(subxids[i], xid, overwriteOK); /* * Recreate its GXACT and dummy PGPROC @@ -1719,6 +1824,14 @@ RecoverPreparedTransactions(void) */ ProcessRecords(bufptr, xid, twophase_recover_callbacks); + /* + * Release locks held by the standby process after we process each + * prepared transaction. As a result, we don't need too many + * additional locks at any one time. + */ + if (InHotStandby) + StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + pfree(buf); } } @@ -1739,9 +1852,12 @@ RecordTransactionCommitPrepared(TransactionId xid, int nchildren, TransactionId *children, int nrels, - RelFileNode *rels) + RelFileNode *rels, + int ninvalmsgs, + SharedInvalidationMessage *invalmsgs, + bool initfileinval) { - XLogRecData rdata[3]; + XLogRecData rdata[4]; int lastrdata = 0; xl_xact_commit_prepared xlrec; XLogRecPtr recptr; @@ -1754,8 +1870,12 @@ RecordTransactionCommitPrepared(TransactionId xid, /* Emit the XLOG commit record */ xlrec.xid = xid; xlrec.crec.xact_time = GetCurrentTimestamp(); + xlrec.crec.xinfo = initfileinval ? 
XACT_COMPLETION_UPDATE_RELCACHE_FILE : 0; + xlrec.crec.nmsgs = 0; xlrec.crec.nrels = nrels; xlrec.crec.nsubxacts = nchildren; + xlrec.crec.nmsgs = ninvalmsgs; + rdata[0].data = (char *) (&xlrec); rdata[0].len = MinSizeOfXactCommitPrepared; rdata[0].buffer = InvalidBuffer; @@ -1777,6 +1897,15 @@ RecordTransactionCommitPrepared(TransactionId xid, rdata[2].buffer = InvalidBuffer; lastrdata = 2; } + /* dump cache invalidation messages */ + if (ninvalmsgs > 0) + { + rdata[lastrdata].next = &(rdata[3]); + rdata[3].data = (char *) invalmsgs; + rdata[3].len = ninvalmsgs * sizeof(SharedInvalidationMessage); + rdata[3].buffer = InvalidBuffer; + lastrdata = 3; + } rdata[lastrdata].next = NULL; recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_PREPARED, rdata); diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c index d1f7ac7aba7..1bd83e043b2 100644 --- a/src/backend/access/transam/twophase_rmgr.c +++ b/src/backend/access/transam/twophase_rmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/twophase_rmgr.c,v 1.10 2009/11/23 09:58:36 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/twophase_rmgr.c,v 1.11 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -19,14 +19,12 @@ #include "commands/async.h" #include "pgstat.h" #include "storage/lock.h" -#include "utils/inval.h" const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = { NULL, /* END ID */ lock_twophase_recover, /* Lock */ - NULL, /* Inval */ NULL, /* notify/listen */ NULL, /* pgstat */ multixact_twophase_recover /* MultiXact */ @@ -36,7 +34,6 @@ const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] = { NULL, /* END ID */ lock_twophase_postcommit, /* Lock */ - inval_twophase_postcommit, /* Inval */ notify_twophase_postcommit, /* notify/listen */ pgstat_twophase_postcommit, /* pgstat */ multixact_twophase_postcommit /* MultiXact */ @@ -46,8 +43,16 @@ const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] = { NULL, /* END ID */ lock_twophase_postabort, /* Lock */ - NULL, /* Inval */ NULL, /* notify/listen */ pgstat_twophase_postabort, /* pgstat */ multixact_twophase_postabort /* MultiXact */ }; + +const TwoPhaseCallback twophase_standby_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_standby_recover, /* Lock */ + NULL, /* notify/listen */ + NULL, /* pgstat */ + NULL /* MultiXact */ +}; diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index f9a71760d38..a1656922779 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.277 2009/12/09 21:57:50 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.278 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -42,6 +42,7 @@ #include "storage/procarray.h" #include "storage/sinvaladt.h" #include "storage/smgr.h" +#include "storage/standby.h" #include "utils/combocid.h" #include "utils/guc.h" #include "utils/inval.h" @@ -139,6 +140,7 @@ typedef struct TransactionStateData Oid prevUser; /* previous CurrentUserId setting */ int prevSecContext; /* previous SecurityRestrictionContext */ bool prevXactReadOnly; /* entry-time xact r/o state */ + bool startedInRecovery; /* did we start in recovery? 
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index f9a71760d38..a1656922779 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.277 2009/12/09 21:57:50 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.278 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */
@@ -42,6 +42,7 @@ #include "storage/procarray.h" #include "storage/sinvaladt.h" #include "storage/smgr.h" +#include "storage/standby.h" #include "utils/combocid.h" #include "utils/guc.h" #include "utils/inval.h"
@@ -139,6 +140,7 @@ typedef struct TransactionStateData Oid prevUser; /* previous CurrentUserId setting */ int prevSecContext; /* previous SecurityRestrictionContext */ bool prevXactReadOnly; /* entry-time xact r/o state */ + bool startedInRecovery; /* did we start in recovery? */ struct TransactionStateData *parent; /* back link to parent */ } TransactionStateData;
@@ -167,9 +169,17 @@ static TransactionStateData TopTransactionStateData = { InvalidOid, /* previous CurrentUserId setting */ 0, /* previous SecurityRestrictionContext */ false, /* entry-time xact r/o state */ + false, /* startedInRecovery */ NULL /* link to parent state block */ }; +/* + * unreportedXids holds XIDs of all subtransactions that have not yet been + * reported in an XLOG_XACT_ASSIGNMENT record. + */ +static int nUnreportedXids; +static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS]; + static TransactionState CurrentTransactionState = &TopTransactionStateData; /*
@@ -392,6 +402,9 @@ AssignTransactionId(TransactionState s) bool isSubXact = (s->parent != NULL); ResourceOwner currentOwner; + if (RecoveryInProgress()) + elog(ERROR, "cannot assign TransactionIds during recovery"); + /* Assert that caller didn't screw up */ Assert(!TransactionIdIsValid(s->transactionId)); Assert(s->state == TRANS_INPROGRESS);
@@ -414,7 +427,7 @@ AssignTransactionId(TransactionState s) s->transactionId = GetNewTransactionId(isSubXact); if (isSubXact) - SubTransSetParent(s->transactionId, s->parent->transactionId); + SubTransSetParent(s->transactionId, s->parent->transactionId, false); /* * Acquire lock on the transaction XID. (We assume this cannot block.) We
@@ -435,8 +448,57 @@ AssignTransactionId(TransactionState s) } PG_END_TRY(); CurrentResourceOwner = currentOwner; -} + /* + * Once every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within + * each top-level transaction, we issue a WAL record for the assignment. + * We include the top-level xid and all the subxids that have not yet been + * reported using XLOG_XACT_ASSIGNMENT records. + * + * This is required to limit the amount of shared memory required in a + * hot standby server to keep track of in-progress XIDs. See notes for + * RecordKnownAssignedTransactionIds(). + * + * We don't keep track of the immediate parent of each subxid, + * only the top-level transaction that each subxact belongs to. This + * is correct in recovery only because aborted subtransactions are + * separately WAL logged. + */ + if (isSubXact && XLogStandbyInfoActive()) + { + unreportedXids[nUnreportedXids] = s->transactionId; + nUnreportedXids++; + + /* ensure this test matches similar one in RecoverPreparedTransactions() */ + if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS) + { + XLogRecData rdata[2]; + xl_xact_assignment xlrec; + + /* + * xtop is always set by now because we recurse up the transaction + * stack to the highest unassigned xid and then come back down + */ + xlrec.xtop = GetTopTransactionId(); + Assert(TransactionIdIsValid(xlrec.xtop)); + xlrec.nsubxacts = nUnreportedXids; + + rdata[0].data = (char *) &xlrec; + rdata[0].len = MinSizeOfXactAssignment; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &rdata[1]; + + rdata[1].data = (char *) unreportedXids; + rdata[1].len = PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId); + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT, rdata); + + nUnreportedXids = 0; + } + } +} /* * GetCurrentSubTransactionId
@@ -596,6 +658,18 @@ TransactionIdIsCurrentTransactionId(TransactionId xid) return false; } +/* + * TransactionStartedDuringRecovery + * + * Returns true if the current transaction started while recovery was still + * in progress. Recovery might have ended since then, so RecoveryInProgress() + * might already return false.
+ */ +bool +TransactionStartedDuringRecovery(void) +{ + return CurrentTransactionState->startedInRecovery; +} /* * CommandCounterIncrement @@ -811,7 +885,7 @@ AtSubStart_ResourceOwner(void) * This is exported only to support an ugly hack in VACUUM FULL. */ TransactionId -RecordTransactionCommit(void) +RecordTransactionCommit(bool isVacuumFull) { TransactionId xid = GetTopTransactionIdIfAny(); bool markXidCommitted = TransactionIdIsValid(xid); @@ -821,11 +895,15 @@ RecordTransactionCommit(void) bool haveNonTemp; int nchildren; TransactionId *children; + int nmsgs; + SharedInvalidationMessage *invalMessages = NULL; + bool RelcacheInitFileInval; /* Get data needed for commit record */ nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp); nchildren = xactGetCommittedChildren(&children); - + nmsgs = xactGetCommittedInvalidationMessages(&invalMessages, + &RelcacheInitFileInval); /* * If we haven't been assigned an XID yet, we neither can, nor do we want * to write a COMMIT record. @@ -859,7 +937,7 @@ RecordTransactionCommit(void) /* * Begin commit critical section and insert the commit XLOG record. */ - XLogRecData rdata[3]; + XLogRecData rdata[4]; int lastrdata = 0; xl_xact_commit xlrec; @@ -867,6 +945,17 @@ RecordTransactionCommit(void) BufmgrCommit(); /* + * Set flags required for recovery processing of commits. + */ + xlrec.xinfo = 0; + if (RelcacheInitFileInval) + xlrec.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE; + if (isVacuumFull) + xlrec.xinfo |= XACT_COMPLETION_VACUUM_FULL; + if (forceSyncCommit) + xlrec.xinfo |= XACT_COMPLETION_FORCE_SYNC_COMMIT; + + /* * Mark ourselves as within our "commit critical section". This * forces any concurrent checkpoint to wait until we've updated * pg_clog. Without this, it is possible for the checkpoint to set @@ -890,6 +979,7 @@ RecordTransactionCommit(void) xlrec.xact_time = xactStopTimestamp; xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; + xlrec.nmsgs = nmsgs; rdata[0].data = (char *) (&xlrec); rdata[0].len = MinSizeOfXactCommit; rdata[0].buffer = InvalidBuffer; @@ -911,6 +1001,15 @@ RecordTransactionCommit(void) rdata[2].buffer = InvalidBuffer; lastrdata = 2; } + /* dump shared cache invalidation messages */ + if (nmsgs > 0) + { + rdata[lastrdata].next = &(rdata[3]); + rdata[3].data = (char *) invalMessages; + rdata[3].len = nmsgs * sizeof(SharedInvalidationMessage); + rdata[3].buffer = InvalidBuffer; + lastrdata = 3; + } rdata[lastrdata].next = NULL; (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata); @@ -1352,6 +1451,13 @@ AtSubAbort_childXids(void) s->childXids = NULL; s->nChildXids = 0; s->maxChildXids = 0; + + /* + * We could prune the unreportedXids array here. But we don't bother. + * That would potentially reduce number of XLOG_XACT_ASSIGNMENT records + * but it would likely introduce more CPU time into the more common + * paths, so we choose not to do that. + */ } /* ---------------------------------------------------------------- @@ -1461,9 +1567,23 @@ StartTransaction(void) /* * Make sure we've reset xact state variables + * + * If recovery is still in progress, mark this transaction as read-only. + * We have lower level defences in XLogInsert and elsewhere to stop us + * from modifying data during recovery, but this gives the normal + * indication to the user that the transaction is read-only. 
*/ + if (RecoveryInProgress()) + { + s->startedInRecovery = true; + XactReadOnly = true; + } + else + { + s->startedInRecovery = false; + XactReadOnly = DefaultXactReadOnly; + } XactIsoLevel = DefaultXactIsoLevel; - XactReadOnly = DefaultXactReadOnly; forceSyncCommit = false; MyXactAccessedTempRel = false;
@@ -1476,6 +1596,11 @@ StartTransaction(void) currentCommandIdUsed = false; /* + * initialize reported xid accounting + */ + nUnreportedXids = 0; + + /* * must initialize resource-management stuff first */ AtStart_Memory();
@@ -1619,7 +1744,7 @@ CommitTransaction(void) /* * Here is where we really truly commit. */ - latestXid = RecordTransactionCommit(); + latestXid = RecordTransactionCommit(false); TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid);
@@ -1853,7 +1978,6 @@ PrepareTransaction(void) StartPrepare(gxact); AtPrepare_Notify(); - AtPrepare_Inval(); AtPrepare_Locks(); AtPrepare_PgStat(); AtPrepare_MultiXact();
@@ -4199,29 +4323,108 @@ xactGetCommittedChildren(TransactionId **ptr) * XLOG support routines */ +/* + * Before 8.5 this was a fairly short function, but now it performs many + * actions for which the order of execution is critical. + */ static void -xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid) +xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, XLogRecPtr lsn) { TransactionId *sub_xids; + SharedInvalidationMessage *inval_msgs; TransactionId max_xid; int i; - /* Mark the transaction committed in pg_clog */ + /* subxid array follows relfilenodes */ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); - TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids); + /* invalidation messages array follows subxids */ + inval_msgs = (SharedInvalidationMessage *) &(sub_xids[xlrec->nsubxacts]); - /* Make sure nextXid is beyond any XID mentioned in the record */ - max_xid = xid; - for (i = 0; i < xlrec->nsubxacts; i++) - { - if (TransactionIdPrecedes(max_xid, sub_xids[i])) - max_xid = sub_xids[i]; - } + max_xid = TransactionIdLatest(xid, xlrec->nsubxacts, sub_xids); + + /* + * Make sure nextXid is beyond any XID mentioned in the record. + * + * We don't expect anyone else to modify nextXid, hence we + * don't need to hold a lock while checking this. We still acquire + * the lock to modify it, though. + */ if (TransactionIdFollowsOrEquals(max_xid, ShmemVariableCache->nextXid)) { + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); ShmemVariableCache->nextXid = max_xid; TransactionIdAdvance(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + }
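In hot standby (the else branch that follows), the order of operations is the substance of the change; condensed into a checklist of the calls the code below makes, in order:

/*
 * Hot-standby commit redo, in the order the code below performs it:
 *
 *   1. RecordKnownAssignedTransactionIds(max_xid)  -- account for any subxids
 *      in this record that replay has not yet observed
 *   2. TransactionIdAsyncCommitTree(...)           -- mark pg_clog, async
 *      flavour so hint bits wait for minRecoveryPoint
 *   3. ExpireTreeKnownAssignedTransactionIds(...)  -- clog strictly before
 *      the ProcArray update
 *   4. SendSharedInvalidMessages(...)              -- then cache invalidations
 *   5. StandbyReleaseLockTree(...)                 -- locks released last
 */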
+ + if (!InHotStandby || XactCompletionVacuumFull(xlrec)) + { + /* + * Mark the transaction committed in pg_clog. + * + * If InHotStandby and this is the first commit of a VACUUM FULL INPLACE, + * we perform only the actual commit to clog. Strangely, there are two + * commits that share the same xid for every VFI, so we need to skip + * some steps for the first commit. It's OK to repeat the clog update + * when we see the second commit on a VFI. + */ + TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids); + } + else + { + /* + * If a transaction completion record arrives that has as-yet unobserved + * subtransactions, then this will not have been fully handled by the call + * to RecordKnownAssignedTransactionIds() in the main recovery loop in + * xlog.c. So we need to do bookkeeping again to cover that case. This is + * confusing and it is easy to think this call is irrelevant, which has + * happened three times in development already. Leave it in. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* + * Mark the transaction committed in pg_clog. We use async commit + * protocol during recovery to provide information on database + * consistency for when users try to set hint bits. It is important + * that we do not set hint bits until the minRecoveryPoint is past + * this commit record. This ensures that if we crash we don't see + * hint bits set on changes made by transactions that haven't yet + * recovered. It's unlikely but it's good to be safe. + */ + TransactionIdAsyncCommitTree(xid, xlrec->nsubxacts, sub_xids, lsn); + + /* + * We must mark clog before we update the ProcArray. + */ + ExpireTreeKnownAssignedTransactionIds(xid, xlrec->nsubxacts, sub_xids); + + /* + * Send any cache invalidations attached to the commit. We must + * maintain the same order of invalidation followed by lock release + * as occurs in CommitTransaction(). + */ + if (xlrec->nmsgs > 0) + { + /* + * Relcache init file invalidation requires processing both + * before and after we send the SI messages. See AtEOXact_Inval() + */ + if (XactCompletionRelcacheInitFileInval(xlrec)) + RelationCacheInitFileInvalidate(true); + + SendSharedInvalidMessages(inval_msgs, xlrec->nmsgs); + + if (XactCompletionRelcacheInitFileInval(xlrec)) + RelationCacheInitFileInvalidate(false); + } + + /* + * Release locks, if any. We do this for both two phase and normal + * one phase transactions. In effect we are ignoring the prepare + * phase and just going straight to lock release. + */ + StandbyReleaseLockTree(xid, xlrec->nsubxacts, sub_xids); } /* Make sure files supposed to be dropped are dropped */
@@ -4240,8 +4443,31 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid) } smgrclose(srel); } + + /* + * We issue an XLogFlush() for the same reason we emit ForceSyncCommit() in + * normal operation. For example, in DROP DATABASE, we delete all the files + * belonging to the database, and then commit the transaction. If we crash + * after all the files have been deleted but before the commit, you have an + * entry in pg_database without any files. To minimize the window for that, + * we use ForceSyncCommit() to rush the commit record to disk as quickly as + * possible. We have the same window during recovery, and forcing an + * XLogFlush() (which updates minRecoveryPoint during recovery) helps + * to reduce that problem window, for any user that requested ForceSyncCommit(). + */ + if (XactCompletionForceSyncCommit(xlrec)) + XLogFlush(lsn); } +/* + * Be careful with the order of execution, as with xact_redo_commit(). + * The two functions are similar but differ in key places. + * + * Note also that an abort can be for a subtransaction and its children, + * not just for a top-level abort. That means we have to consider + * topxid != xid, whereas in commit we would find topxid == xid always + * because subtransaction commit is never WAL logged.
+ */ static void xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid) { @@ -4249,22 +4475,55 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid) TransactionId max_xid; int i; - /* Mark the transaction aborted in pg_clog */ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); - TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids); + max_xid = TransactionIdLatest(xid, xlrec->nsubxacts, sub_xids); /* Make sure nextXid is beyond any XID mentioned in the record */ - max_xid = xid; - for (i = 0; i < xlrec->nsubxacts; i++) - { - if (TransactionIdPrecedes(max_xid, sub_xids[i])) - max_xid = sub_xids[i]; - } + /* We don't expect anyone else to modify nextXid, hence we + * don't need to hold a lock while checking this. We still acquire + * the lock to modify it, though. + */ if (TransactionIdFollowsOrEquals(max_xid, ShmemVariableCache->nextXid)) { + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); ShmemVariableCache->nextXid = max_xid; TransactionIdAdvance(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + } + + if (InHotStandby) + { + /* + * If a transaction completion record arrives that has as-yet unobserved + * subtransactions then this will not have been fully handled by the call + * to RecordKnownAssignedTransactionIds() in the main recovery loop in + * xlog.c. So we need to do bookkeeping again to cover that case. This is + * confusing and it is easy to think this call is irrelevant, which has + * happened three times in development already. Leave it in. + */ + RecordKnownAssignedTransactionIds(max_xid); + } + + /* Mark the transaction aborted in pg_clog, no need for async stuff */ + TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids); + + if (InHotStandby) + { + /* + * We must mark clog before we update the ProcArray. + */ + ExpireTreeKnownAssignedTransactionIds(xid, xlrec->nsubxacts, sub_xids); + + /* + * There are no flat files that need updating, nor invalidation + * messages to send or undo. + */ + + /* + * Release locks, if any. There are no invalidations to send. 
+ */ + StandbyReleaseLockTree(xid, xlrec->nsubxacts, sub_xids); } /* Make sure files supposed to be dropped are dropped */
@@ -4297,7 +4556,7 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record) { xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); - xact_redo_commit(xlrec, record->xl_xid); + xact_redo_commit(xlrec, record->xl_xid, lsn); } else if (info == XLOG_XACT_ABORT) {
@@ -4315,7 +4574,7 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record) { xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record); - xact_redo_commit(&xlrec->crec, xlrec->xid); + xact_redo_commit(&xlrec->crec, xlrec->xid, lsn); RemoveTwoPhaseFile(xlrec->xid, false); } else if (info == XLOG_XACT_ABORT_PREPARED) {
@@ -4325,6 +4584,14 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record) xact_redo_abort(&xlrec->arec, xlrec->xid); RemoveTwoPhaseFile(xlrec->xid, false); } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record); + + if (InHotStandby) + ProcArrayApplyXidAssignment(xlrec->xtop, + xlrec->nsubxacts, xlrec->xsub); + } else elog(PANIC, "xact_redo: unknown op code %u", info); }
@@ -4333,6 +4600,14 @@ static void xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec) { int i; + TransactionId *xacts; + SharedInvalidationMessage *msgs; + + xacts = (TransactionId *) &xlrec->xnodes[xlrec->nrels]; + msgs = (SharedInvalidationMessage *) &xacts[xlrec->nsubxacts]; appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); + + if (XactCompletionRelcacheInitFileInval(xlrec)) + appendStringInfo(buf, "; relcache init file inval"); if (xlrec->nrels > 0)
@@ -4348,13 +4623,25 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec) } if (xlrec->nsubxacts > 0) { - TransactionId *xacts = (TransactionId *) - &xlrec->xnodes[xlrec->nrels]; - appendStringInfo(buf, "; subxacts:"); for (i = 0; i < xlrec->nsubxacts; i++) appendStringInfo(buf, " %u", xacts[i]); } + if (xlrec->nmsgs > 0) + { + appendStringInfo(buf, "; inval msgs:"); + for (i = 0; i < xlrec->nmsgs; i++) + { + SharedInvalidationMessage *msg = &msgs[i]; + + if (msg->id >= 0) + appendStringInfo(buf, "catcache id %d ", msg->id); + else if (msg->id == SHAREDINVALRELCACHE_ID) + appendStringInfo(buf, "relcache "); + else if (msg->id == SHAREDINVALSMGR_ID) + appendStringInfo(buf, "smgr "); + } + } } static void
@@ -4385,6 +4672,17 @@ xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec) } } +static void +xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) +{ + int i; + + appendStringInfo(buf, "subxacts:"); + + for (i = 0; i < xlrec->nsubxacts; i++) + appendStringInfo(buf, " %u", xlrec->xsub[i]); +} + void xact_desc(StringInfo buf, uint8 xl_info, char *rec) {
@@ -4412,16 +4710,28 @@ xact_desc(StringInfo buf, uint8 xl_info, char *rec) { xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) rec; - appendStringInfo(buf, "commit %u: ", xlrec->xid); + appendStringInfo(buf, "commit prepared %u: ", xlrec->xid); xact_desc_commit(buf, &xlrec->crec); } else if (info == XLOG_XACT_ABORT_PREPARED) { xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) rec; - appendStringInfo(buf, "abort %u: ", xlrec->xid); + appendStringInfo(buf, "abort prepared %u: ", xlrec->xid); xact_desc_abort(buf, &xlrec->arec); } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; + + /* + * Note that we ignore the WAL record's xid, since we're more + * interested in the top-level xid that issued the record + * and which xids are being reported
here. + */ + appendStringInfo(buf, "xid assignment xtop %u: ", xlrec->xtop); + xact_desc_assignment(buf, xlrec); + } else appendStringInfo(buf, "UNKNOWN"); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 331809a3b91..b861a76ee4f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.353 2009/09/13 18:32:07 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.354 2009/12/19 01:32:33 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -67,6 +67,8 @@ int XLOGbuffers = 8; int XLogArchiveTimeout = 0; bool XLogArchiveMode = false; char *XLogArchiveCommand = NULL; +bool XLogRequestRecoveryConnections = true; +int MaxStandbyDelay = 30; bool fullPageWrites = true; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; @@ -129,10 +131,16 @@ TimeLineID ThisTimeLineID = 0; * recovery mode". It should be examined primarily by functions that need * to act differently when called from a WAL redo function (e.g., to skip WAL * logging). To check whether the system is in recovery regardless of which - * process you're running in, use RecoveryInProgress(). + * process you're running in, use RecoveryInProgress() but only after shared + * memory startup and lock initialization. */ bool InRecovery = false; +/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */ +HotStandbyState standbyState = STANDBY_DISABLED; + +static XLogRecPtr LastRec; + /* * Local copy of SharedRecoveryInProgress variable. True actually means "not * known, need to check the shared state". @@ -359,6 +367,8 @@ typedef struct XLogCtlData /* end+1 of the last record replayed (or being replayed) */ XLogRecPtr replayEndRecPtr; + /* timestamp of last record replayed (or being replayed) */ + TimestampTz recoveryLastXTime; slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -463,6 +473,7 @@ static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg); static bool recoveryStopsHere(XLogRecord *record, bool *includeThis); +static void CheckRequiredParameterValues(CheckPoint checkPoint); static void LocalSetXLogInsertAllowed(void); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); @@ -2103,9 +2114,40 @@ XLogAsyncCommitFlush(void) bool XLogNeedsFlush(XLogRecPtr record) { - /* XLOG doesn't need flushing during recovery */ + /* + * During recovery, we don't flush WAL but update minRecoveryPoint + * instead. So "needs flush" is taken to mean whether minRecoveryPoint + * would need to be updated. + */ if (RecoveryInProgress()) - return false; + { + /* Quick exit if already known updated */ + if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint) + return false; + + /* + * Update local copy of minRecoveryPoint. But if the lock is busy, + * just return a conservative guess. + */ + if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED)) + return true; + minRecoveryPoint = ControlFile->minRecoveryPoint; + LWLockRelease(ControlFileLock); + + /* + * An invalid minRecoveryPoint means that we need to recover all the WAL, + * i.e., we're doing crash recovery. 
We never modify the control file's + * value in that case, so we can short-circuit future checks here too. + */ + if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0) + updateMinRecoveryPoint = false; + + /* check again */ + if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint) + return false; + else + return true; + } /* Quick exit if already known flushed */ if (XLByteLE(record, LogwrtResult.Flush))
@@ -3259,10 +3301,11 @@ CleanupBackupHistory(void) * ignoring them as already applied, but that's not a huge drawback. * * If 'cleanup' is true, a cleanup lock is used when restoring blocks. - * Otherwise, a normal exclusive lock is used. At the moment, that's just - * pro forma, because there can't be any regular backends in the system - * during recovery. The 'cleanup' argument applies to all backup blocks - * in the WAL record, that suffices for now. + * Otherwise, a normal exclusive lock is used. During crash recovery, that's + * just pro forma because there can't be any regular backends in the system, + * but in hot standby mode the distinction is important. The 'cleanup' + * argument applies to all backup blocks in the WAL record; that suffices for + * now. */ void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
@@ -4679,6 +4722,7 @@ BootStrapXLOG(void) checkPoint.oldestXid = FirstNormalTransactionId; checkPoint.oldestXidDB = TemplateDbOid; checkPoint.time = (pg_time_t) time(NULL); + checkPoint.oldestActiveXid = InvalidTransactionId; ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid;
@@ -5117,22 +5161,43 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) TimestampTz recordXtime; /* We only consider stopping at COMMIT or ABORT records */ - if (record->xl_rmid != RM_XACT_ID) - return false; - record_info = record->xl_info & ~XLR_INFO_MASK; - if (record_info == XLOG_XACT_COMMIT) + if (record->xl_rmid == RM_XACT_ID) { - xl_xact_commit *recordXactCommitData; + record_info = record->xl_info & ~XLR_INFO_MASK; + if (record_info == XLOG_XACT_COMMIT) + { + xl_xact_commit *recordXactCommitData; + + recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); + recordXtime = recordXactCommitData->xact_time; + } + else if (record_info == XLOG_XACT_ABORT) + { + xl_xact_abort *recordXactAbortData; - recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); - recordXtime = recordXactCommitData->xact_time; + recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); + recordXtime = recordXactAbortData->xact_time; + } + else + return false; } - else if (record_info == XLOG_XACT_ABORT) + else if (record->xl_rmid == RM_XLOG_ID) { - xl_xact_abort *recordXactAbortData; + record_info = record->xl_info & ~XLR_INFO_MASK; + if (record_info == XLOG_CHECKPOINT_SHUTDOWN || + record_info == XLOG_CHECKPOINT_ONLINE) + { + CheckPoint checkPoint; + + memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + recoveryLastXTime = checkPoint.time; + } - recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); - recordXtime = recordXactAbortData->xact_time; + /* + * We don't want to stop recovery on a checkpoint record, but we do + * want to update recoveryLastXTime. So the return here is + * unconditional. + */ + return false; } else return false;
@@ -5217,6 +5282,67 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) }
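The minRecoveryPoint refresh in the XLogNeedsFlush() hunk above is a try-lock-and-recheck idiom: consult a locally cached copy, refresh it under the lock only if the lock is free, and otherwise answer conservatively. A distilled standalone sketch of the idiom, with hypothetical names and a pthreads mutex standing in for the LWLock:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t ctl_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long shared_min_recovery_point;	/* protected by ctl_lock */
static unsigned long local_min_recovery_point;	/* this process's cached copy */

/* Does 'record' lie beyond the minimum recovery point?  If the lock is
 * contended, answer true (conservative) instead of waiting for it. */
static bool
needs_update(unsigned long record)
{
	if (record <= local_min_recovery_point)
		return false;		/* fast path: cached copy already suffices */

	if (pthread_mutex_trylock(&ctl_lock) != 0)
		return true;		/* lock busy: return a conservative guess */
	local_min_recovery_point = shared_min_recovery_point;
	pthread_mutex_unlock(&ctl_lock);

	return record > local_min_recovery_point;	/* re-check with fresh copy */
}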
/* + * Returns true if the server is still in recovery, which is a global state. + */ +Datum +pg_is_in_recovery(PG_FUNCTION_ARGS) +{ + PG_RETURN_BOOL(RecoveryInProgress()); +} + +/* + * Returns timestamp of last recovered commit/abort record. + */ +TimestampTz +GetLatestXLogTime(void) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + recoveryLastXTime = xlogctl->recoveryLastXTime; + SpinLockRelease(&xlogctl->info_lck); + + return recoveryLastXTime; +} + +/* + * Note that the text field supplied is a parameter name and so does not + * require translation + */ +#define RecoveryRequiresIntParameter(param_name, currValue, checkpointValue) \ +{ \ + if (currValue < checkpointValue) \ + ereport(ERROR, \ + (errmsg("recovery connections cannot continue because " \ + "%s = %u is a lower setting than on WAL source server (value was %u)", \ + param_name, \ + currValue, \ + checkpointValue))); \ +} + +/* + * Check to see if required parameters are set high enough on this server + * for various aspects of recovery operation. + */ +static void +CheckRequiredParameterValues(CheckPoint checkPoint) +{ + /* We ignore autovacuum_max_workers when we make this test. */ + RecoveryRequiresIntParameter("max_connections", + MaxConnections, checkPoint.MaxConnections); + + RecoveryRequiresIntParameter("max_prepared_xacts", + max_prepared_xacts, checkPoint.max_prepared_xacts); + RecoveryRequiresIntParameter("max_locks_per_xact", + max_locks_per_xact, checkPoint.max_locks_per_xact); + + if (!checkPoint.XLogStandbyInfoMode) + ereport(ERROR, + (errmsg("recovery connections cannot start because the recovery_connections " + "parameter is disabled on the WAL source server"))); +} + +/* * This must be called ONCE during postmaster or standalone-backend startup */ void
@@ -5228,7 +5354,6 @@ StartupXLOG(void) bool reachedStopPoint = false; bool haveBackupLabel = false; XLogRecPtr RecPtr, - LastRec, checkPointLoc, backupStopLoc, EndOfLog;
@@ -5238,6 +5363,7 @@ StartupXLOG(void) uint32 freespace; TransactionId oldestActiveXID; bool bgwriterLaunched = false; + bool backendsAllowed = false; /* * Read control file and check XLOG status looks valid.
@@ -5506,6 +5632,38 @@ StartupXLOG(void) BACKUP_LABEL_FILE, BACKUP_LABEL_OLD))); } + /* + * Initialize recovery connections, if enabled. We won't let backends + * in yet, not until we've reached the min recovery point specified + * in the control file and we've established a recovery snapshot from + * a running-xacts WAL record. + */ + if (InArchiveRecovery && XLogRequestRecoveryConnections) + { + TransactionId *xids; + int nxids; + + CheckRequiredParameterValues(checkPoint); + + ereport(LOG, + (errmsg("initializing recovery connections"))); + + InitRecoveryTransactionEnvironment(); + + if (wasShutdown) + oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + else + oldestActiveXID = checkPoint.oldestActiveXid; + Assert(TransactionIdIsValid(oldestActiveXID)); + + /* Start up the commit log and related stuff */ + StartupCLOG(); + StartupSUBTRANS(oldestActiveXID); + StartupMultiXact(); + + ProcArrayInitRecoveryInfo(oldestActiveXID); + } + /* Initialize resource managers */ for (rmid = 0; rmid <= RM_MAX_ID; rmid++) {
@@ -5580,7 +5738,9 @@ StartupXLOG(void) do { #ifdef WAL_DEBUG - if (XLOG_DEBUG) + if (XLOG_DEBUG || + (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) || + (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3)) { StringInfoData buf;
@@ -5608,27 +5768,29 @@ StartupXLOG(void) }
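To see what the RecoveryRequiresIntParameter() macro introduced above produces, its first use in CheckRequiredParameterValues() expands to essentially this block (whitespace rearranged):

{
	if (MaxConnections < checkPoint.MaxConnections)
		ereport(ERROR,
				(errmsg("recovery connections cannot continue because "
						"%s = %u is a lower setting than on WAL source server (value was %u)",
						"max_connections",
						MaxConnections,
						checkPoint.MaxConnections)));
}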
+ * Have we passed our safe starting point? */ - if (shutdown_requested) - proc_exit(1); + if (!reachedMinRecoveryPoint && + XLByteLE(minRecoveryPoint, EndRecPtr)) + { + reachedMinRecoveryPoint = true; + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + EndRecPtr.xlogid, EndRecPtr.xrecoff))); + } /* - * Have we passed our safe starting point? If so, we can tell - * postmaster that the database is consistent now. + * Have we got a valid starting snapshot that will allow + * queries to be run? If so, we can tell postmaster that + * the database is consistent now, enabling connections. */ - if (!reachedMinRecoveryPoint && - XLByteLT(minRecoveryPoint, EndRecPtr)) + if (standbyState == STANDBY_SNAPSHOT_READY && + !backendsAllowed && + reachedMinRecoveryPoint && + IsUnderPostmaster) { - reachedMinRecoveryPoint = true; - if (InArchiveRecovery) - { - ereport(LOG, - (errmsg("consistent recovery state reached"))); - if (IsUnderPostmaster) - SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); - } + backendsAllowed = true; + SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); } /* @@ -5662,8 +5824,13 @@ StartupXLOG(void) */ SpinLockAcquire(&xlogctl->info_lck); xlogctl->replayEndRecPtr = EndRecPtr; + xlogctl->recoveryLastXTime = recoveryLastXTime; SpinLockRelease(&xlogctl->info_lck); + /* In Hot Standby mode, keep track of XIDs we've seen */ + if (InHotStandby && TransactionIdIsValid(record->xl_xid)) + RecordKnownAssignedTransactionIds(record->xl_xid); + RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record); /* Pop the error context stack */ @@ -5810,7 +5977,7 @@ StartupXLOG(void) } /* Pre-scan prepared transactions to find out the range of XIDs present */ - oldestActiveXID = PrescanPreparedTransactions(); + oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); if (InRecovery) { @@ -5891,14 +6058,27 @@ StartupXLOG(void) ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; TransactionIdRetreat(ShmemVariableCache->latestCompletedXid); - /* Start up the commit log and related stuff, too */ - StartupCLOG(); - StartupSUBTRANS(oldestActiveXID); - StartupMultiXact(); + /* + * Start up the commit log and related stuff, too. In hot standby mode + * we did this already before WAL replay. + */ + if (standbyState == STANDBY_DISABLED) + { + StartupCLOG(); + StartupSUBTRANS(oldestActiveXID); + StartupMultiXact(); + } /* Reload shared-memory state for prepared transactions */ RecoverPreparedTransactions(); + /* + * Shutdown the recovery environment. This must occur after + * RecoverPreparedTransactions(), see notes for lock_twophase_recover() + */ + if (standbyState != STANDBY_DISABLED) + ShutdownRecoveryTransactionEnvironment(); + /* Shut down readFile facility, free space */ if (readFile >= 0) { @@ -5964,8 +6144,9 @@ RecoveryInProgress(void) /* * Initialize TimeLineID and RedoRecPtr when we discover that recovery - * is finished. (If you change this, see also - * LocalSetXLogInsertAllowed.) + * is finished. InitPostgres() relies upon this behaviour to ensure + * that InitXLOGAccess() is called at backend startup. (If you change + * this, see also LocalSetXLogInsertAllowed.) 
@@ -5964,8 +6144,9 @@ RecoveryInProgress(void) /* * Initialize TimeLineID and RedoRecPtr when we discover that recovery - * is finished. (If you change this, see also - * LocalSetXLogInsertAllowed.) + * is finished. InitPostgres() relies upon this behaviour to ensure + * that InitXLOGAccess() is called at backend startup. (If you change + * this, see also LocalSetXLogInsertAllowed.) */ if (!LocalRecoveryInProgress) InitXLOGAccess();
@@ -6151,7 +6332,7 @@ InitXLOGAccess(void) { /* ThisTimeLineID doesn't change so we need no lock to copy it */ ThisTimeLineID = XLogCtl->ThisTimeLineID; - Assert(ThisTimeLineID != 0); + Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode()); /* Use GetRedoRecPtr to copy the RedoRecPtr safely */ (void) GetRedoRecPtr();
@@ -6449,6 +6630,12 @@ CreateCheckPoint(int flags) MemSet(&checkPoint, 0, sizeof(checkPoint)); checkPoint.time = (pg_time_t) time(NULL); + /* Set important parameter values for use when replaying WAL */ + checkPoint.MaxConnections = MaxConnections; + checkPoint.max_prepared_xacts = max_prepared_xacts; + checkPoint.max_locks_per_xact = max_locks_per_xact; + checkPoint.XLogStandbyInfoMode = XLogStandbyInfoActive(); + /* * We must hold WALInsertLock while examining insert state to determine * the checkpoint REDO pointer.
@@ -6624,6 +6811,21 @@ CreateCheckPoint(int flags) CheckPointGuts(checkPoint.redo, flags); + /* + * Take a snapshot of running transactions and write this to WAL. + * This allows us to reconstruct the state of running transactions + * during archive recovery, if required. Skip this if the info is + * disabled. + * + * If we are shutting down, or if the startup process is completing + * crash recovery, we don't need to write running-xact data. + * + * Update checkPoint.nextXid since we now have a later value. + */ + if (!shutdown && XLogStandbyInfoActive()) + LogStandbySnapshot(&checkPoint.oldestActiveXid, &checkPoint.nextXid); + else + checkPoint.oldestActiveXid = InvalidTransactionId; + START_CRIT_SECTION(); /*
@@ -6791,7 +6993,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint) if (RmgrTable[rmid].rm_safe_restartpoint != NULL) if (!(RmgrTable[rmid].rm_safe_restartpoint())) { - elog(DEBUG2, "RM %d not safe to record restart point at %X/%X", + elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X", rmid, checkPoint->redo.xlogid, checkPoint->redo.xrecoff);
@@ -6923,14 +7125,9 @@ CreateRestartPoint(int flags) LogCheckpointEnd(true); ereport((log_checkpoints ? LOG : DEBUG2), - (errmsg("recovery restart point at %X/%X", - lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff))); - - /* XXX this is currently BROKEN because we are in the wrong process */ - if (recoveryLastXTime) - ereport((log_checkpoints ? LOG : DEBUG2), - (errmsg("last completed transaction was at log time %s", - timestamptz_to_str(recoveryLastXTime)))); + (errmsg("recovery restart point at %X/%X with latest known log time %s", + lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff, + timestamptz_to_str(GetLatestXLogTime())))); LWLockRelease(CheckpointLock); return true;
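For reference, the standby-related payload that CreateCheckPoint() above now embeds in every checkpoint record amounts to a handful of fields; a simplified sketch of just those additions, not the real CheckPoint layout:

#include <stdbool.h>

typedef unsigned int TransactionId;

/* Simplified: only the fields this patch adds to the checkpoint record. */
typedef struct CheckPointStandbyFields
{
	int			MaxConnections;			/* master's max_connections */
	int			max_prepared_xacts;		/* master's max_prepared_transactions */
	int			max_locks_per_xact;		/* master's max_locks_per_transaction */
	bool		XLogStandbyInfoMode;	/* was standby info WAL-logged? */
	TransactionId oldestActiveXid;		/* oldest running xid at checkpoint time */
} CheckPointStandbyFields;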
@@ -7036,6 +7233,19 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) ShmemVariableCache->oldestXid = checkPoint.oldestXid; ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB; + /* Check to see if any parameter changes give problems for recovery connections */ + if (standbyState != STANDBY_DISABLED) + CheckRequiredParameterValues(checkPoint); + + if (standbyState >= STANDBY_INITIALIZED) + { + /* + * Remove stale transactions, if any. + */ + ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid); + StandbyReleaseOldLocks(checkPoint.nextXid); + } + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
@@ -7114,7 +7324,7 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfo(buf, "checkpoint: redo %X/%X; " "tli %u; xid %u/%u; oid %u; multi %u; offset %u; " - "oldest xid %u in DB %u; %s", + "oldest xid %u in DB %u; oldest running xid %u; %s", checkpoint->redo.xlogid, checkpoint->redo.xrecoff, checkpoint->ThisTimeLineID, checkpoint->nextXidEpoch, checkpoint->nextXid, checkpoint->nextOid, checkpoint->nextMulti, checkpoint->nextMultiOffset, checkpoint->oldestXid, checkpoint->oldestXidDB, + checkpoint->oldestActiveXid, (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online"); } else if (info == XLOG_NOOP)
@@ -7155,6 +7366,9 @@ xlog_outrec(StringInfo buf, XLogRecord *record) record->xl_prev.xlogid, record->xl_prev.xrecoff, record->xl_xid); + appendStringInfo(buf, "; len %u", + record->xl_len); + for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) { if (record->xl_info & XLR_SET_BKP_BLOCK(i))
@@ -7311,6 +7525,12 @@ pg_start_backup(PG_FUNCTION_ARGS) (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to run a backup"))); + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + if (!XLogArchivingActive()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -7498,6 +7718,12 @@ pg_stop_backup(PG_FUNCTION_ARGS) (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to run a backup")))); + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + if (!XLogArchivingActive()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -7659,6 +7885,12 @@ pg_switch_xlog(PG_FUNCTION_ARGS) (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to switch transaction log files")))); + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + switchpoint = RequestXLogSwitch(); /*
@@ -7681,6 +7913,12 @@ pg_current_xlog_location(PG_FUNCTION_ARGS) { char location[MAXFNAMELEN]; + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + /* Make sure we have an up-to-date local LogwrtResult */ { /* use volatile pointer to prevent code rearrangement */
@@ -7708,6 +7946,12 @@ pg_current_xlog_insert_location(PG_FUNCTION_ARGS) XLogRecPtr current_recptr; char location[MAXFNAMELEN]; + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + /* * Get the current end-of-WAL position ... shared lock is sufficient */
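The RecoveryInProgress() guard is repeated verbatim in each of the WAL control functions above. A hypothetical consolidation, using the same ereport() call as the patch (the helper name is invented here and is not part of the commit):

/* Hypothetical helper: shared guard for WAL control functions that must
 * not run while recovery is in progress. */
static void
CheckNotInRecovery(void)
{
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));
}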