Diffstat (limited to 'src')
-rw-r--r--  src/Makefile.global.in  2
-rw-r--r--  src/backend/access/heap/heapam.c  18
-rw-r--r--  src/backend/access/heap/pruneheap.c  37
-rw-r--r--  src/backend/access/heap/rewriteheap.c  606
-rw-r--r--  src/backend/access/heap/tuptoaster.c  26
-rw-r--r--  src/backend/access/index/indexam.c  6
-rw-r--r--  src/backend/access/rmgrdesc/heapdesc.c  4
-rw-r--r--  src/backend/access/transam/xact.c  10
-rw-r--r--  src/backend/access/transam/xlog.c  36
-rw-r--r--  src/backend/catalog/index.c  2
-rw-r--r--  src/backend/catalog/system_views.sql  34
-rw-r--r--  src/backend/commands/analyze.c  3
-rw-r--r--  src/backend/commands/cluster.c  4
-rw-r--r--  src/backend/commands/dbcommands.c  15
-rw-r--r--  src/backend/commands/vacuum.c  8
-rw-r--r--  src/backend/commands/vacuumlazy.c  5
-rw-r--r--  src/backend/executor/nodeBitmapHeapscan.c  3
-rw-r--r--  src/backend/replication/Makefile  2
-rw-r--r--  src/backend/replication/logical/Makefile  19
-rw-r--r--  src/backend/replication/logical/decode.c  826
-rw-r--r--  src/backend/replication/logical/logical.c  920
-rw-r--r--  src/backend/replication/logical/logicalfuncs.c  509
-rw-r--r--  src/backend/replication/logical/reorderbuffer.c  3059
-rw-r--r--  src/backend/replication/logical/snapbuild.c  1885
-rw-r--r--  src/backend/replication/slot.c  289
-rw-r--r--  src/backend/replication/slotfuncs.c  98
-rw-r--r--  src/backend/replication/walreceiver.c  2
-rw-r--r--  src/backend/replication/walsender.c  13
-rw-r--r--  src/backend/storage/ipc/procarray.c  222
-rw-r--r--  src/backend/storage/ipc/standby.c  30
-rw-r--r--  src/backend/storage/lmgr/proc.c  8
-rw-r--r--  src/backend/tcop/postgres.c  11
-rw-r--r--  src/backend/utils/cache/inval.c  4
-rw-r--r--  src/backend/utils/cache/relcache.c  57
-rw-r--r--  src/backend/utils/time/snapmgr.c  102
-rw-r--r--  src/backend/utils/time/tqual.c  164
-rw-r--r--  src/bin/initdb/initdb.c  5
-rw-r--r--  src/include/access/heapam.h  3
-rw-r--r--  src/include/access/heapam_xlog.h  14
-rw-r--r--  src/include/access/rewriteheap.h  29
-rw-r--r--  src/include/access/transam.h  5
-rw-r--r--  src/include/access/tuptoaster.h  27
-rw-r--r--  src/include/access/xlog.h  1
-rw-r--r--  src/include/catalog/catversion.h  2
-rw-r--r--  src/include/catalog/pg_proc.h  12
-rw-r--r--  src/include/commands/vacuum.h  4
-rw-r--r--  src/include/replication/decode.h  19
-rw-r--r--  src/include/replication/logical.h  100
-rw-r--r--  src/include/replication/logicalfuncs.h  24
-rw-r--r--  src/include/replication/output_plugin.h  98
-rw-r--r--  src/include/replication/reorderbuffer.h  351
-rw-r--r--  src/include/replication/slot.h  64
-rw-r--r--  src/include/replication/snapbuild.h  83
-rw-r--r--  src/include/storage/itemptr.h  3
-rw-r--r--  src/include/storage/proc.h  6
-rw-r--r--  src/include/storage/procarray.h  10
-rw-r--r--  src/include/storage/sinval.h  2
-rw-r--r--  src/include/utils/inval.h  1
-rw-r--r--  src/include/utils/snapmgr.h  11
-rw-r--r--  src/include/utils/snapshot.h  32
-rw-r--r--  src/include/utils/tqual.h  15
-rw-r--r--  src/test/regress/expected/rules.out  4
-rw-r--r--  src/tools/pgindent/typedefs.list  33
63 files changed, 9804 insertions, 193 deletions
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 209d1bdf4dc..cdddf492f45 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -468,6 +468,8 @@ pg_regress_installcheck = $(top_builddir)/src/test/regress/pg_regress --inputdir
pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ log/
+pg_isolation_regress_check = $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) --temp-install=./tmp_check --top-builddir=$(top_builddir) $(pg_regress_locale_flags)
+pg_isolation_regress_installcheck = $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) --top-builddir=$(top_builddir) $(pg_regress_locale_flags)
##########################################################################
#
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index de4befa93f4..71ec74015cd 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -347,8 +347,7 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
/*
* Prune and repair fragmentation for the whole page, if possible.
*/
- Assert(TransactionIdIsValid(RecentGlobalXmin));
- heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+ heap_page_prune_opt(scan->rs_rd, buffer);
/*
* We must hold share lock on the buffer content while examining tuple
@@ -1750,10 +1749,22 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
*/
if (!skip)
{
+ /*
+ * For the benefit of logical decoding, have t_self point at the
+ * element of the HOT chain we're currently investigating instead
+ * of the root tuple of the HOT chain. This is important because
+ * the *Satisfies routine for historical MVCC snapshots needs the
+ * correct tid to decide about visibility in some cases.
+ */
+ ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
+
/* If it's visible per the snapshot, we must return it */
valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
CheckForSerializableConflictOut(valid, relation, heapTuple,
buffer, snapshot);
+ /* reset to original, non-redirected, tid */
+ heapTuple->t_self = *tid;
+
if (valid)
{
ItemPointerSetOffsetNumber(tid, offnum);
@@ -8207,6 +8218,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
* decoding.
*/
break;
+ case XLOG_HEAP2_REWRITE:
+ heap_xlog_logical_rewrite(lsn, record);
+ break;
default:
elog(PANIC, "heap2_redo: unknown op code %u", info);
}
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 27cbac85256..3c69e1badac 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -18,13 +18,14 @@
#include "access/heapam_xlog.h"
#include "access/transam.h"
#include "access/htup_details.h"
+#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
+#include "utils/snapmgr.h"
#include "utils/rel.h"
#include "utils/tqual.h"
-
/* Working data for heap_page_prune and subroutines */
typedef struct
{
@@ -70,10 +71,34 @@ static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
* or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
*/
void
-heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
+heap_page_prune_opt(Relation relation, Buffer buffer)
{
Page page = BufferGetPage(buffer);
Size minfree;
+ TransactionId OldestXmin;
+
+ /*
+ * We can't write WAL in recovery mode, so there's no point trying to
+ * clean the page. The master will likely issue a cleaning WAL record soon
+ * anyway, so this is no particular loss.
+ */
+ if (RecoveryInProgress())
+ return;
+
+ /*
+ * Use the appropriate xmin horizon for this relation. If it's a proper
+ * catalog relation, or a user-defined additional catalog relation, we
+ * need to use the horizon that includes slots; otherwise the data-only
+ * horizon can be used. Note that the toast relations of user-defined
+ * relations are *not* considered catalog relations.
+ */
+ if (IsCatalogRelation(relation) ||
+ RelationIsAccessibleInLogicalDecoding(relation))
+ OldestXmin = RecentGlobalXmin;
+ else
+ OldestXmin = RecentGlobalDataXmin;
+
+ Assert(TransactionIdIsValid(OldestXmin));
/*
* Let's see if we really need pruning.
@@ -85,14 +110,6 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
return;
/*
- * We can't write WAL in recovery mode, so there's no point trying to
- * clean the page. The master will likely issue a cleaning WAL record soon
- * anyway, so this is no particular loss.
- */
- if (RecoveryInProgress())
- return;
-
- /*
* We prune when a previous UPDATE failed to find enough space on the page
* for a new tuple version, or when free space falls below the relation's
* fill-factor target (but not less than 10%).
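The hunk above is the heart of the pruning change: catalog relations (and user-defined relations treated as catalog tables for logical decoding) must be pruned with the slot-aware horizon, while ordinary data can use the potentially more aggressive data-only horizon. A minimal sketch of that rule in isolation, using only symbols already referenced in the hunk; the helper name is illustrative and not part of the patch:

    /* Illustrative helper, not part of the patch: which prune horizon applies? */
    static TransactionId
    prune_horizon_for(Relation relation)
    {
        if (IsCatalogRelation(relation) ||
            RelationIsAccessibleInLogicalDecoding(relation))
            return RecentGlobalXmin;        /* slot-aware horizon, keeps catalog rows */
        else
            return RecentGlobalDataXmin;    /* data-only horizon */
    }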
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index c34ab9865f8..239c7dad0c9 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -102,17 +102,34 @@
*/
#include "postgres.h"
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "miscadmin.h"
+
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/rewriteheap.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
+#include "access/xact.h"
+
+#include "catalog/catalog.h"
+
+#include "lib/ilist.h"
+
+#include "replication/logical.h"
+#include "replication/slot.h"
+
#include "storage/bufmgr.h"
+#include "storage/fd.h"
#include "storage/smgr.h"
+
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tqual.h"
+#include "storage/procarray.h"
/*
* State associated with a rewrite operation. This is opaque to the user
@@ -120,21 +137,28 @@
*/
typedef struct RewriteStateData
{
+ Relation rs_old_rel; /* source heap */
Relation rs_new_rel; /* destination heap */
Page rs_buffer; /* page currently being built */
BlockNumber rs_blockno; /* block where page will go */
bool rs_buffer_valid; /* T if any tuples in buffer */
bool rs_use_wal; /* must we WAL-log inserts? */
+ bool rs_logical_rewrite; /* do we need to do logical rewriting */
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to
* determine tuple visibility */
TransactionId rs_freeze_xid;/* Xid that will be used as freeze cutoff
* point */
+ TransactionId rs_logical_xmin; /* Xid that will be used as cutoff
+ * point for logical rewrites */
MultiXactId rs_cutoff_multi;/* MultiXactId that will be used as cutoff
* point for multixacts */
MemoryContext rs_cxt; /* for hash tables and entries and tuples in
* them */
+ XLogRecPtr rs_begin_lsn; /* XLogInsertLsn when starting the rewrite */
HTAB *rs_unresolved_tups; /* unmatched A tuples */
HTAB *rs_old_new_tid_map; /* unmatched B tuples */
+ HTAB *rs_logical_mappings; /* logical remapping files */
+ uint32 rs_num_rewrite_mappings; /* # in memory mappings */
} RewriteStateData;
/*
@@ -169,14 +193,45 @@ typedef struct
typedef OldToNewMappingData *OldToNewMapping;
+/*
+ * In-Memory data for a xid that might need logical remapping entries
+ * to be logged.
+ */
+typedef struct RewriteMappingFile
+{
+ TransactionId xid; /* xid that might need to see the row */
+ int vfd; /* fd of mappings file */
+ off_t off; /* how far have we written yet */
+ uint32 num_mappings; /* number of in-memory mappings */
+ dlist_head mappings; /* list of in-memory mappings */
+ char path[MAXPGPATH]; /* path, for error messages */
+} RewriteMappingFile;
+
+/*
+ * A single in-memory logical rewrite mapping, hanging off
+ * RewriteMappingFile->mappings.
+ */
+typedef struct RewriteMappingDataEntry
+{
+ LogicalRewriteMappingData map; /* map between old and new location of
+ * the tuple */
+ dlist_node node;
+} RewriteMappingDataEntry;
+
/* prototypes for internal functions */
static void raw_heap_insert(RewriteState state, HeapTuple tup);
+/* internal logical remapping prototypes */
+static void logical_begin_heap_rewrite(RewriteState state);
+static void logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, HeapTuple new_tuple);
+static void logical_end_heap_rewrite(RewriteState state);
+
/*
* Begin a rewrite of a table
*
+ * old_heap old, locked heap relation tuples will be read from
* new_heap new, locked heap relation to insert tuples to
* oldest_xmin xid used by the caller to determine which tuples are dead
* freeze_xid xid before which tuples will be frozen
@@ -187,7 +242,7 @@ static void raw_heap_insert(RewriteState state, HeapTuple tup);
* to be used in subsequent calls to the other functions.
*/
RewriteState
-begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin,
+begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
TransactionId freeze_xid, MultiXactId cutoff_multi,
bool use_wal)
{
@@ -210,6 +265,7 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin,
/* Create and fill in the state struct */
state = palloc0(sizeof(RewriteStateData));
+ state->rs_old_rel = old_heap;
state->rs_new_rel = new_heap;
state->rs_buffer = (Page) palloc(BLCKSZ);
/* new_heap needn't be empty, just locked */
@@ -244,6 +300,8 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin,
MemoryContextSwitchTo(old_cxt);
+ logical_begin_heap_rewrite(state);
+
return state;
}
@@ -301,6 +359,8 @@ end_heap_rewrite(RewriteState state)
if (RelationNeedsWAL(state->rs_new_rel))
heap_sync(state->rs_new_rel);
+ logical_end_heap_rewrite(state);
+
/* Deleting the context frees everything */
MemoryContextDelete(state->rs_cxt);
}
@@ -429,6 +489,8 @@ rewrite_heap_tuple(RewriteState state,
raw_heap_insert(state, new_tuple);
new_tid = new_tuple->t_self;
+ logical_rewrite_heap_tuple(state, old_tid, new_tuple);
+
/*
* If the tuple is the updated version of a row, and the prior version
* wouldn't be DEAD yet, then we need to either resolve the prior
@@ -678,3 +740,545 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
if (heaptup != tup)
heap_freetuple(heaptup);
}
+
+/* ------------------------------------------------------------------------
+ * Logical rewrite support
+ *
+ * When doing logical decoding - which relies on using cmin/cmax of catalog
+ * tuples, via xl_heap_new_cid records - heap rewrites have to log enough
+ * information to allow the decoding backend to updates its internal mapping
+ * of (relfilenode,ctid) => (cmin, cmax) to be correct for the rewritten heap.
+ *
+ * For that, every time we find a tuple that's been modified in a catalog
+ * relation within the xmin horizon of any decoding slot, we log a mapping
+ * from the old to the new location.
+ *
+ * To deal with rewrites that abort, the filename of a mapping file contains
+ * the xid of the transaction performing the rewrite, which then can be
+ * checked before being read in.
+ *
+ * For efficiency we don't immediately spill every single mapping for a
+ * row to disk but only do so in batches when we've collected several of them
+ * in memory or when end_heap_rewrite() has been called.
+ *
+ * Crash-Safety: This module diverts from the usual patterns of doing WAL
+ * since it cannot rely on checkpoint flushing out all buffers and thus
+ * waiting for exclusive locks on buffers. Usually the XLogInsert() covering
+ * buffer modifications is performed while the buffer(s) that are being
+ * modified are exclusively locked, guaranteeing that both the WAL record and
+ * the modified heap are on either side of the checkpoint. But since the
+ * mapping files we log aren't in shared_buffers that interlock doesn't work.
+ *
+ * Instead we simply write the mapping files out to disk, *before* the
+ * XLogInsert() is performed. That guarantees that either the XLogInsert() is
+ * inserted after the checkpoint's redo pointer or that the checkpoint (via
+ * LogicalRewriteHeapCheckpoint()) has flushed the (partial) mapping file to
+ * disk. That leaves the tail end that has not yet been flushed open to
+ * corruption, which is solved by including the current offset in the
+ * xl_heap_rewrite_mapping records and truncating the mapping file to it
+ * during replay. Every time a rewrite is finished all generated mapping files
+ * are synced to disk.
+ *
+ * Note that if we were only concerned about crash safety we wouldn't have to
+ * deal with WAL logging at all - an fsync() at the end of a rewrite would be
+ * sufficient for crash safety. Any mapping that hasn't been safely flushed to
+ * disk must belong to an aborted (explicitly or via a crash) transaction and is
+ * ignored by virtue of the xid in its name being subject to a
+ * TransactionDidCommit() check. But we want to support having standbys via
+ * physical replication, both for availability and to do logical decoding
+ * there.
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * Do preparations for logging logical mappings during a rewrite if
+ * necessary. If we detect that we don't need to log anything we'll prevent
+ * any further action by the various logical rewrite functions.
+ */
+static void
+logical_begin_heap_rewrite(RewriteState state)
+{
+ HASHCTL hash_ctl;
+ TransactionId logical_xmin;
+
+ /*
+ * We only need to persist these mappings if the rewritten table can be
+ * accessed during logical decoding, if not, we can skip doing any
+ * additional work.
+ */
+ state->rs_logical_rewrite =
+ RelationIsAccessibleInLogicalDecoding(state->rs_old_rel);
+
+ if (!state->rs_logical_rewrite)
+ return;
+
+ Assert(ReplicationSlotCtl != NULL);
+
+ ProcArrayGetReplicationSlotXmin(NULL, &logical_xmin);
+
+ /*
+ * If there are no logical slots in progress we don't need to do anything,
+ * there cannot be any remappings for relevant rows yet. The relation's
+ * lock protects us against races.
+ */
+ if (logical_xmin == InvalidTransactionId)
+ {
+ state->rs_logical_rewrite = false;
+ return;
+ }
+
+ state->rs_logical_xmin = logical_xmin;
+ state->rs_begin_lsn = GetXLogInsertRecPtr();
+ state->rs_num_rewrite_mappings = 0;
+
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(TransactionId);
+ hash_ctl.entrysize = sizeof(RewriteMappingFile);
+ hash_ctl.hcxt = state->rs_cxt;
+ hash_ctl.hash = tag_hash;
+
+ state->rs_logical_mappings =
+ hash_create("Logical rewrite mapping",
+ 128, /* arbitrary initial size */
+ &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+}
+
+/*
+ * Flush all logical in-memory mappings to disk, but don't fsync them yet.
+ */
+static void
+logical_heap_rewrite_flush_mappings(RewriteState state)
+{
+ HASH_SEQ_STATUS seq_status;
+ RewriteMappingFile *src;
+ dlist_mutable_iter iter;
+
+ Assert(state->rs_logical_rewrite);
+
+ /* no logical rewrite in progress, no need to iterate over mappings */
+ if (state->rs_num_rewrite_mappings == 0)
+ return;
+
+ elog(DEBUG1, "flushing %u logical rewrite mapping entries",
+ state->rs_num_rewrite_mappings);
+
+ hash_seq_init(&seq_status, state->rs_logical_mappings);
+ while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
+ {
+ XLogRecData rdata[2];
+ char *waldata;
+ char *waldata_start;
+ xl_heap_rewrite_mapping xlrec;
+ Oid dboid;
+ uint32 len;
+ int written;
+
+ /* this file hasn't got any new mappings */
+ if (src->num_mappings == 0)
+ continue;
+
+ if (state->rs_old_rel->rd_rel->relisshared)
+ dboid = InvalidOid;
+ else
+ dboid = MyDatabaseId;
+
+ xlrec.num_mappings = src->num_mappings;
+ xlrec.mapped_rel = RelationGetRelid(state->rs_old_rel);
+ xlrec.mapped_xid = src->xid;
+ xlrec.mapped_db = dboid;
+ xlrec.offset = src->off;
+ xlrec.start_lsn = state->rs_begin_lsn;
+
+ rdata[0].data = (char *) (&xlrec);
+ rdata[0].len = sizeof(xlrec);
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].next = &(rdata[1]);
+
+ /* write all mappings consecutively */
+ len = src->num_mappings * sizeof(LogicalRewriteMappingData);
+ waldata = palloc(len);
+ waldata_start = waldata;
+
+ /*
+ * collect data we need to write out, but don't modify ondisk data yet
+ */
+ dlist_foreach_modify(iter, &src->mappings)
+ {
+ RewriteMappingDataEntry *pmap;
+
+ pmap = dlist_container(RewriteMappingDataEntry, node, iter.cur);
+
+ memcpy(waldata, &pmap->map, sizeof(pmap->map));
+ waldata += sizeof(pmap->map);
+
+ /* remove from the list and free */
+ dlist_delete(&pmap->node);
+ pfree(pmap);
+
+ /* update bookkeeping */
+ state->rs_num_rewrite_mappings--;
+ src->num_mappings--;
+ }
+
+ /*
+ * Note that we deviate from the usual WAL coding practices here,
+ * check the above "Logical rewrite support" comment for reasoning.
+ */
+ written = FileWrite(src->vfd, waldata_start, len);
+ if (written != len)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path,
+ written, len)));
+ src->off += len;
+
+ Assert(src->num_mappings == 0);
+
+ rdata[1].data = waldata_start;
+ rdata[1].len = len;
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].next = NULL;
+
+ /* write xlog record */
+ XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE, rdata);
+
+ }
+ Assert(state->rs_num_rewrite_mappings == 0);
+}
+
+/*
+ * Logical remapping part of end_heap_rewrite().
+ */
+static void
+logical_end_heap_rewrite(RewriteState state)
+{
+ HASH_SEQ_STATUS seq_status;
+ RewriteMappingFile *src;
+
+ /* done, no logical rewrite in progress */
+ if (!state->rs_logical_rewrite)
+ return;
+
+ /* writeout remaining in-memory entries */
+ if (state->rs_num_rewrite_mappings > 0)
+ logical_heap_rewrite_flush_mappings(state);
+
+ /* Iterate over all mappings we have written and fsync the files. */
+ hash_seq_init(&seq_status, state->rs_logical_mappings);
+ while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
+ {
+ if (FileSync(src->vfd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", src->path)));
+ FileClose(src->vfd);
+ }
+ /* memory context cleanup will deal with the rest */
+}
+
+/*
+ * Log a single (old->new) mapping for 'xid'.
+ */
+static void
+logical_rewrite_log_mapping(RewriteState state, TransactionId xid,
+ LogicalRewriteMappingData *map)
+{
+ RewriteMappingFile *src;
+ RewriteMappingDataEntry *pmap;
+ Oid relid;
+ bool found;
+
+ relid = RelationGetRelid(state->rs_old_rel);
+
+ /* look for existing mappings for this 'mapped' xid */
+ src = hash_search(state->rs_logical_mappings, &xid,
+ HASH_ENTER, &found);
+
+ /*
+ * We haven't yet had the need to map anything for this xid, create
+ * per-xid data structures.
+ */
+ if (!found)
+ {
+ char path[MAXPGPATH];
+ Oid dboid;
+
+ if (state->rs_old_rel->rd_rel->relisshared)
+ dboid = InvalidOid;
+ else
+ dboid = MyDatabaseId;
+
+ snprintf(path, MAXPGPATH,
+ "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT,
+ dboid, relid,
+ (uint32) (state->rs_begin_lsn >> 32),
+ (uint32) state->rs_begin_lsn,
+ xid, GetCurrentTransactionId());
+
+ dlist_init(&src->mappings);
+ src->num_mappings = 0;
+ src->off = 0;
+ memcpy(src->path, path, sizeof(path));
+ src->vfd = PathNameOpenFile(path,
+ O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (src->vfd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", path)));
+ }
+
+ pmap = MemoryContextAlloc(state->rs_cxt,
+ sizeof(RewriteMappingDataEntry));
+ memcpy(&pmap->map, map, sizeof(LogicalRewriteMappingData));
+ dlist_push_tail(&src->mappings, &pmap->node);
+ src->num_mappings++;
+ state->rs_num_rewrite_mappings++;
+
+ /*
+ * Write out the buffer every time we have too many in-memory entries across all
+ * mapping files.
+ */
+ if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */)
+ logical_heap_rewrite_flush_mappings(state);
+}
+
+/*
+ * Perform logical remapping for a tuple that's mapped from old_tid to
+ * new_tuple->t_self by rewrite_heap_tuple() iff necessary for the tuple.
+ */
+static void
+logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid,
+ HeapTuple new_tuple)
+{
+ ItemPointerData new_tid = new_tuple->t_self;
+ TransactionId cutoff = state->rs_logical_xmin;
+ TransactionId xmin;
+ TransactionId xmax;
+ bool do_log_xmin = false;
+ bool do_log_xmax = false;
+ LogicalRewriteMappingData map;
+
+ /* no logical rewrite in progress, we don't need to log anything */
+ if (!state->rs_logical_rewrite)
+ return;
+
+ xmin = HeapTupleHeaderGetXmin(new_tuple->t_data);
+ /* use *GetUpdateXid to correctly deal with multixacts */
+ xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data);
+
+ /*
+ * Log the mapping iff the tuple has been created recently.
+ */
+ if (TransactionIdIsNormal(xmin) && !TransactionIdPrecedes(xmin, cutoff))
+ do_log_xmin = true;
+
+ if (!TransactionIdIsNormal(xmax))
+ {
+ /*
+ * no xmax is set, can't have any permanent ones, so this check is
+ * sufficient
+ */
+ }
+ else if (HEAP_XMAX_IS_LOCKED_ONLY(new_tuple->t_data->t_infomask))
+ {
+ /* only locked, we don't care */
+ }
+ else if (!TransactionIdPrecedes(xmax, cutoff))
+ {
+ /* tuple has been deleted recently, log */
+ do_log_xmax = true;
+ }
+
+ /* if neither needs to be logged, we're done */
+ if (!do_log_xmin && !do_log_xmax)
+ return;
+
+ /* fill out mapping information */
+ map.old_node = state->rs_old_rel->rd_node;
+ map.old_tid = old_tid;
+ map.new_node = state->rs_new_rel->rd_node;
+ map.new_tid = new_tid;
+
+ /* ---
+ * Now persist the mapping for the individual xids that are affected. We
+ * need to log for both xmin and xmax if they aren't the same transaction
+ * since the mapping files are per "affected" xid.
+ * We don't muster all that much effort detecting whether xmin and xmax
+ * are actually the same transaction, we just check whether the xid is the
+ * same disregarding subtransactions. Logging too much is relatively
+ * harmless and we could never do the check fully since subtransaction
+ * data is thrown away during restarts.
+ * ---
+ */
+ if (do_log_xmin)
+ logical_rewrite_log_mapping(state, xmin, &map);
+ /* separately log mapping for xmax unless it'd be redundant */
+ if (do_log_xmax && !TransactionIdEquals(xmin, xmax))
+ logical_rewrite_log_mapping(state, xmax, &map);
+}
+
+/*
+ * Replay XLOG_HEAP2_REWRITE records
+ */
+void
+heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r)
+{
+ char path[MAXPGPATH];
+ int fd;
+ xl_heap_rewrite_mapping *xlrec;
+ uint32 len;
+ char *data;
+
+ xlrec = (xl_heap_rewrite_mapping *) XLogRecGetData(r);
+
+ snprintf(path, MAXPGPATH,
+ "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT,
+ xlrec->mapped_db, xlrec->mapped_rel,
+ (uint32) (xlrec->start_lsn >> 32),
+ (uint32) xlrec->start_lsn,
+ xlrec->mapped_xid, r->xl_xid);
+
+ fd = OpenTransientFile(path,
+ O_CREAT | O_WRONLY | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", path)));
+ /*
+ * Truncate all data that's not guaranteed to have been safely fsynced (by
+ * previous record or by the last checkpoint).
+ */
+ if (ftruncate(fd, xlrec->offset) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\" to %u: %m",
+ path, (uint32) xlrec->offset)));
+
+ /* now seek to the position we want to write our data to */
+ if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to the end of file \"%s\": %m",
+ path)));
+
+ data = XLogRecGetData(r) + sizeof(*xlrec);
+
+ len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData);
+
+ /* write out tail end of mapping file (again) */
+ if (write(fd, data, len) != len)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", path)));
+ /*
+ * Now fsync all previously written data. We could improve things and only
+ * do this for the last write to a file, but the required bookkeeping
+ * doesn't seem worth the trouble.
+ */
+ if (pg_fsync(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", path)));
+
+ CloseTransientFile(fd);
+}
+
+/* ---
+ * Perform a checkpoint for logical rewrite mappings
+ *
+ * This serves two tasks:
+ * 1) Remove all mappings not needed anymore based on the logical restart LSN
+ * 2) Flush all remaining mappings to disk, so that replay after a checkpoint
+ * only has to deal with the parts of a mapping that have been written out
+ * after the checkpoint started.
+ * ---
+ */
+void
+CheckPointLogicalRewriteHeap(void)
+{
+ XLogRecPtr cutoff;
+ XLogRecPtr redo;
+ DIR *mappings_dir;
+ struct dirent *mapping_de;
+ char path[MAXPGPATH];
+
+ /*
+ * We start out with a minimum of the last redo pointer. No new decoding
+ * slot will start before that, so that's a safe upper bound for removal.
+ */
+ redo = GetRedoRecPtr();
+
+ /* now check for the restart ptrs from existing slots */
+ cutoff = ReplicationSlotsComputeLogicalRestartLSN();
+
+ /* don't start earlier than the restart lsn */
+ if (cutoff != InvalidXLogRecPtr && redo < cutoff)
+ cutoff = redo;
+
+ mappings_dir = AllocateDir("pg_llog/mappings");
+ while ((mapping_de = ReadDir(mappings_dir, "pg_llog/mappings")) != NULL)
+ {
+ struct stat statbuf;
+ Oid dboid;
+ Oid relid;
+ XLogRecPtr lsn;
+ TransactionId rewrite_xid;
+ TransactionId create_xid;
+ uint32 hi, lo;
+
+ if (strcmp(mapping_de->d_name, ".") == 0 ||
+ strcmp(mapping_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(path, MAXPGPATH, "pg_llog/mappings/%s", mapping_de->d_name);
+ if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+ continue;
+
+ /* Skip over files that cannot be ours. */
+ if (strncmp(mapping_de->d_name, "map-", 4) != 0)
+ continue;
+
+ if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
+ &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6)
+ elog(ERROR,"could not parse filename \"%s\"", mapping_de->d_name);
+
+ lsn = ((uint64) hi) << 32 | lo;
+
+ if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
+ {
+ elog(DEBUG1, "removing logical rewrite file \"%s\"", path);
+ if (unlink(path) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m", path)));
+ }
+ else
+ {
+ int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+
+ /*
+ * The file cannot vanish due to concurrency since this function
+ * is the only one removing logical mappings and it's run while
+ * CheckpointLock is held exclusively.
+ */
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ /*
+ * We could try to avoid fsyncing files that either haven't
+ * changed or have only been created since the checkpoint's start,
+ * but it's currently not deemed worth the effort.
+ */
+ else if (pg_fsync(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", path)));
+ CloseTransientFile(fd);
+ }
+ }
+ FreeDir(mappings_dir);
+}
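The functions above only ever append fixed-size LogicalRewriteMappingData entries to a mapping file, so the decoding-side consumer (which lives in the reorderbuffer code, not in this file) can treat each file as a flat array of mappings, with the responsible xid taken from the filename. A simplified, hedged sketch of such a reader; the function name is invented and error handling is trimmed to the essentials:

    /* Illustrative only: dump the (old tid -> new tid) pairs in one mapping file. */
    static void
    show_logical_mapping_file(char *path)
    {
        LogicalRewriteMappingData map;
        int         fd;
        ssize_t     nread;

        fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
        if (fd < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\": %m", path)));

        /* the file is a plain sequence of fixed-size mapping structs */
        while ((nread = read(fd, &map, sizeof(map))) == (ssize_t) sizeof(map))
            elog(DEBUG1, "tuple (%u,%u) of relfilenode %u rewritten to (%u,%u)",
                 ItemPointerGetBlockNumber(&map.old_tid),
                 ItemPointerGetOffsetNumber(&map.old_tid),
                 map.old_node.relNode,
                 ItemPointerGetBlockNumber(&map.new_tid),
                 ItemPointerGetOffsetNumber(&map.new_tid));

        if (nread != 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read file \"%s\": %m", path)));

        CloseTransientFile(fd);
    }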
diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c
index 97c9f238a7b..9a821d3e1cf 100644
--- a/src/backend/access/heap/tuptoaster.c
+++ b/src/backend/access/heap/tuptoaster.c
@@ -44,32 +44,6 @@
#undef TOAST_DEBUG
-/*
- * Testing whether an externally-stored value is compressed now requires
- * comparing extsize (the actual length of the external data) to rawsize
- * (the original uncompressed datum's size). The latter includes VARHDRSZ
- * overhead, the former doesn't. We never use compression unless it actually
- * saves space, so we expect either equality or less-than.
- */
-#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \
- ((toast_pointer).va_extsize < (toast_pointer).va_rawsize - VARHDRSZ)
-
-/*
- * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum
- * into a local "struct varatt_external" toast pointer. This should be
- * just a memcpy, but some versions of gcc seem to produce broken code
- * that assumes the datum contents are aligned. Introducing an explicit
- * intermediate "varattrib_1b_e *" variable seems to fix it.
- */
-#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \
-do { \
- varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \
- Assert(VARATT_IS_EXTERNAL(attre)); \
- Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \
- memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \
-} while (0)
-
-
static void toast_delete_datum(Relation rel, Datum value);
static Datum toast_save_datum(Relation rel, Datum value,
struct varlena * oldexternal, int options);
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 1aba2f04cc4..a4b5f3d698e 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -67,7 +67,10 @@
#include "access/relscan.h"
#include "access/transam.h"
+#include "access/xlog.h"
+
#include "catalog/index.h"
+#include "catalog/catalog.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
@@ -520,8 +523,7 @@ index_fetch_heap(IndexScanDesc scan)
* Prune page, but only if we weren't already on this page
*/
if (prev_buf != scan->xs_cbuf)
- heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
- RecentGlobalXmin);
+ heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf);
}
/* Obtain share-lock on the buffer so we can examine visibility */
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index 89ba09a206f..c8a61669dd2 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -149,6 +149,10 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
xlrec->node.relNode, xlrec->block,
xlrec->cutoff_xid, xlrec->ntuples);
}
+ else if (info == XLOG_HEAP2_REWRITE)
+ {
+ appendStringInfoString(buf, "heap rewrite:");
+ }
else if (info == XLOG_HEAP2_CLEANUP_INFO)
{
xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 0487be17df7..b20d9732e78 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1074,8 +1074,16 @@ RecordTransactionCommit(void)
/*
* Do we need the long commit record? If not, use the compact format.
+ *
+ * For now always use the non-compact version if wal_level=logical, so
+ * we can hide commits from other databases. TODO: In the future we
+ * should merge compact and non-compact commits and use a flags
+ * variable to determine if it contains subxacts, relations or
+ * invalidation messages, that's more extensible and degrades more
+ * gracefully. Till then, it's just 20 bytes of overhead.
*/
- if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit)
+ if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit ||
+ XLogLogicalInfoActive())
{
XLogRecData rdata[4];
int lastrdata = 0;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ad46eb0cebf..53a20b1e606 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -23,6 +23,7 @@
#include "access/clog.h"
#include "access/multixact.h"
+#include "access/rewriteheap.h"
#include "access/subtrans.h"
#include "access/timeline.h"
#include "access/transam.h"
@@ -39,7 +40,9 @@
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
+#include "replication/logical.h"
#include "replication/slot.h"
+#include "replication/snapbuild.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/barrier.h"
@@ -4016,6 +4019,27 @@ CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
}
/*
+ * Return the last WAL segment removed, or 0 if no segment has been removed
+ * since startup.
+ *
+ * NB: the result can be out of date arbitrarily fast, the caller has to deal
+ * with that.
+ */
+XLogSegNo
+XLogGetLastRemovedSegno(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+ XLogSegNo lastRemovedSegNo;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ lastRemovedSegNo = xlogctl->lastRemovedSegNo;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ return lastRemovedSegNo;
+}
+
+/*
* Update the last removed segno pointer in shared memory, to reflect
* that the given XLOG file has been removed.
*/
@@ -6559,6 +6583,12 @@ StartupXLOG(void)
StartupReplicationSlots(checkPoint.redo);
/*
+ * Startup logical state; needs to be set up now so we have proper data
+ * during crash recovery.
+ */
+ StartupReorderBuffer();
+
+ /*
* Startup MultiXact. We need to do this early for two reasons: one
* is that we might try to access multixacts when we do tuple freezing,
* and the other is we need its state initialized because we attempt
@@ -8589,7 +8619,7 @@ CreateCheckPoint(int flags)
* StartupSUBTRANS hasn't been called yet.
*/
if (!RecoveryInProgress())
- TruncateSUBTRANS(GetOldestXmin(true, false));
+ TruncateSUBTRANS(GetOldestXmin(NULL, false));
/* Real work is done, but log and update stats before releasing lock. */
LogCheckpointEnd(false);
@@ -8674,6 +8704,8 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
CheckPointPredicate();
CheckPointRelationMap();
CheckPointReplicationSlots();
+ CheckPointSnapBuild();
+ CheckPointLogicalRewriteHeap();
CheckPointBuffers(flags); /* performs all required fsyncs */
/* We deliberately delay 2PC checkpointing as long as possible */
CheckPointTwoPhase(checkPointRedo);
@@ -8965,7 +8997,7 @@ CreateRestartPoint(int flags)
* this because StartupSUBTRANS hasn't been called yet.
*/
if (EnableHotStandby)
- TruncateSUBTRANS(GetOldestXmin(true, false));
+ TruncateSUBTRANS(GetOldestXmin(NULL, false));
/* Real work is done, but log and update before releasing lock. */
LogCheckpointEnd(true);
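The new XLogGetLastRemovedSegno() gives callers such as the replication slot code a cheap way to notice that WAL they still depend on may already have been recycled; as its comment warns, the value can advance at any moment, so it is only good for a conservative check. A hedged caller sketch, assuming xlog_internal.h's XLByteToSeg() macro; the helper name is illustrative:

    /*
     * Illustrative only: true if the segment containing restart_lsn may have
     * been removed already.  The answer can become stale immediately, so the
     * caller must still cope with failures when actually opening the WAL.
     */
    static bool
    wal_possibly_removed(XLogRecPtr restart_lsn)
    {
        XLogSegNo   needed_segno;

        XLByteToSeg(restart_lsn, needed_segno);

        return XLogGetLastRemovedSegno() >= needed_segno;
    }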
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index cebca95ac8d..877d7678f7a 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -2156,7 +2156,7 @@ IndexBuildHeapScan(Relation heapRelation,
{
snapshot = SnapshotAny;
/* okay to ignore lazy VACUUMs here */
- OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared, true);
+ OldestXmin = GetOldestXmin(heapRelation, true);
}
scan = heap_beginscan_strat(heapRelation, /* relation */
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 04dfbb0ee54..0500a73e1ba 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -619,11 +619,13 @@ CREATE VIEW pg_stat_replication AS
CREATE VIEW pg_replication_slots AS
SELECT
L.slot_name,
+ L.plugin,
L.slot_type,
L.datoid,
D.datname AS database,
L.active,
L.xmin,
+ L.catalog_xmin,
L.restart_lsn
FROM pg_get_replication_slots() AS L
LEFT JOIN pg_database D ON (L.datoid = D.oid);
@@ -822,3 +824,35 @@ CREATE OR REPLACE FUNCTION
CREATE OR REPLACE FUNCTION
json_populate_recordset(base anyelement, from_json json, use_json_as_text boolean DEFAULT false)
RETURNS SETOF anyelement LANGUAGE internal STABLE ROWS 100 AS 'json_populate_recordset';
+
+CREATE OR REPLACE FUNCTION pg_logical_slot_get_changes(
+ IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+ OUT location pg_lsn, OUT xid xid, OUT data text)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_get_changes';
+
+CREATE OR REPLACE FUNCTION pg_logical_slot_peek_changes(
+ IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+ OUT location pg_lsn, OUT xid xid, OUT data text)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_peek_changes';
+
+CREATE OR REPLACE FUNCTION pg_logical_slot_get_binary_changes(
+ IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+ OUT location pg_lsn, OUT xid xid, OUT data bytea)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_get_binary_changes';
+
+CREATE OR REPLACE FUNCTION pg_logical_slot_peek_binary_changes(
+ IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+ OUT location pg_lsn, OUT xid xid, OUT data bytea)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_peek_binary_changes';
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index e7fcb558684..a04adeaac75 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -22,6 +22,7 @@
#include "access/tuptoaster.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
+#include "catalog/catalog.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/pg_collation.h"
@@ -1081,7 +1082,7 @@ acquire_sample_rows(Relation onerel, int elevel,
totalblocks = RelationGetNumberOfBlocks(onerel);
/* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
- OldestXmin = GetOldestXmin(onerel->rd_rel->relisshared, true);
+ OldestXmin = GetOldestXmin(onerel, true);
/* Prepare for sampling block numbers */
BlockSampler_Init(&bs, totalblocks, targrows);
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 8b18e4acb72..b6b40e724e7 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -850,7 +850,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
* Since we're going to rewrite the whole table anyway, there's no reason
* not to be aggressive about this.
*/
- vacuum_set_xid_limits(0, 0, 0, 0, OldHeap->rd_rel->relisshared,
+ vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
&OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
NULL);
@@ -869,7 +869,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
is_system_catalog = IsSystemRelation(OldHeap);
/* Initialize the rewrite operation */
- rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid,
+ rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
MultiXactCutoff, use_wal);
/*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 5d540aa3a01..4996a2e7cd2 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -45,6 +45,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+#include "replication/slot.h"
#include "storage/copydir.h"
#include "storage/fd.h"
#include "storage/lmgr.h"
@@ -750,6 +751,7 @@ dropdb(const char *dbname, bool missing_ok)
HeapTuple tup;
int notherbackends;
int npreparedxacts;
+ int nslots, nslots_active;
/*
* Look up the target database's OID, and get exclusive lock on it. We
@@ -807,6 +809,19 @@ dropdb(const char *dbname, bool missing_ok)
errmsg("cannot drop the currently open database")));
/*
+ * Check whether there are, possibly unconnected, logical slots that refer
+ * to the to-be-dropped database. The database lock we are holding
+ * prevents the creation of new slots using the database.
+ */
+ if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active))
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_IN_USE),
+ errmsg("database \"%s\" is used by a logical decoding slot",
+ dbname),
+ errdetail("There are %d slot(s), %d of them active",
+ nslots, nslots_active)));
+
+ /*
* Check for other backends in the target database. (Because we hold the
* database lock, no new ones can start after this.)
*
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 5ae7763534b..ded1841dc65 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -398,11 +398,11 @@ get_rel_oids(Oid relid, const RangeVar *vacrel)
* not interested.
*/
void
-vacuum_set_xid_limits(int freeze_min_age,
+vacuum_set_xid_limits(Relation rel,
+ int freeze_min_age,
int freeze_table_age,
int multixact_freeze_min_age,
int multixact_freeze_table_age,
- bool sharedRel,
TransactionId *oldestXmin,
TransactionId *freezeLimit,
TransactionId *xidFullScanLimit,
@@ -425,7 +425,7 @@ vacuum_set_xid_limits(int freeze_min_age,
* working on a particular table at any time, and that each vacuum is
* always an independent transaction.
*/
- *oldestXmin = GetOldestXmin(sharedRel, true);
+ *oldestXmin = GetOldestXmin(rel, true);
Assert(TransactionIdIsNormal(*oldestXmin));
@@ -795,7 +795,7 @@ vac_update_datfrozenxid(void)
* committed pg_class entries for new tables; see AddNewRelationTuple().
* So we cannot produce a wrong minimum by starting with this.
*/
- newFrozenXid = GetOldestXmin(true, true);
+ newFrozenXid = GetOldestXmin(NULL, true);
/*
* Similarly, initialize the MultiXact "min" with the value that would be
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index d77892ee7f8..d5db917d97f 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -44,6 +44,7 @@
#include "access/multixact.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
+#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
@@ -204,10 +205,10 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
vac_strategy = bstrategy;
- vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
+ vacuum_set_xid_limits(onerel,
+ vacstmt->freeze_min_age, vacstmt->freeze_table_age,
vacstmt->multixact_freeze_min_age,
vacstmt->multixact_freeze_table_age,
- onerel->rd_rel->relisshared,
&OldestXmin, &FreezeLimit, &xidFullScanLimit,
&MultiXactCutoff, &mxactFullScanLimit);
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 1a8d4e51430..7d8a3f2c248 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -336,8 +336,7 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
/*
* Prune and repair fragmentation for the whole page, if possible.
*/
- Assert(TransactionIdIsValid(RecentGlobalXmin));
- heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+ heap_page_prune_opt(scan->rs_rd, buffer);
/*
* We must hold share lock on the buffer content while examining tuple
diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile
index 7941cb8d5e7..6f17b08a6a5 100644
--- a/src/backend/replication/Makefile
+++ b/src/backend/replication/Makefile
@@ -17,6 +17,8 @@ override CPPFLAGS := -I$(srcdir) $(CPPFLAGS)
OBJS = walsender.o walreceiverfuncs.o walreceiver.o basebackup.o \
repl_gram.o slot.o slotfuncs.o syncrep.o
+SUBDIRS = logical
+
include $(top_srcdir)/src/backend/common.mk
# repl_scanner is compiled as part of repl_gram
diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile
new file mode 100644
index 00000000000..310a45c5c05
--- /dev/null
+++ b/src/backend/replication/logical/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for src/backend/replication/logical
+#
+# IDENTIFICATION
+# src/backend/replication/logical/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/replication/logical
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -I$(srcdir) $(CPPFLAGS)
+
+OBJS = decode.o logical.o logicalfuncs.o reorderbuffer.o snapbuild.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
new file mode 100644
index 00000000000..e8949aab324
--- /dev/null
+++ b/src/backend/replication/logical/decode.c
@@ -0,0 +1,826 @@
+/* -------------------------------------------------------------------------
+ *
+ * decode.c
+ * This module decodes WAL records read using xlogreader.h's APIs for the
+ * purpose of logical decoding by passing information to the
+ * reorderbuffer module (containing the actual changes) and to the
+ * snapbuild module to build a fitting catalog snapshot (to be able to
+ * properly decode the changes in the reorderbuffer).
+ *
+ * NOTE:
+ * This basically tries to handle all low level xlog stuff for
+ * reorderbuffer.c and snapbuild.c. There's some minor leakage where a
+ * specific record's struct is used to pass data along, but those just
+ * happen to contain the right amount of data in a convenient
+ * format. There isn't and shouldn't be much intelligence about the
+ * contents of records in here except turning them into a more usable
+ * format.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/decode.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "access/xlogreader.h"
+
+#include "catalog/pg_control.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "storage/standby.h"
+
+typedef struct XLogRecordBuffer
+{
+ XLogRecPtr origptr;
+ XLogRecPtr endptr;
+ XLogRecord record;
+ char *record_data;
+} XLogRecordBuffer;
+
+/* RMGR Handlers */
+static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+
+/* individual record(group)'s handlers */
+static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
+ TransactionId xid, Oid dboid,
+ TimestampTz commit_time,
+ int nsubxacts, TransactionId *sub_xids,
+ int ninval_msgs, SharedInvalidationMessage *msg);
+static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn,
+ TransactionId xid, TransactionId *sub_xids, int nsubxacts);
+
+/* common function to decode tuples */
+static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tup);
+
+/*
+ * Take every XLogReadRecord()ed record and perform the actions required to
+ * decode it using the output plugin already set up in the logical decoding
+ * context.
+ */
+void
+LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record)
+{
+ XLogRecordBuffer buf;
+
+ buf.origptr = ctx->reader->ReadRecPtr;
+ buf.endptr = ctx->reader->EndRecPtr;
+ buf.record = *record;
+ buf.record_data = XLogRecGetData(record);
+
+ /* cast so we get a warning when new rmgrs are added */
+ switch ((RmgrIds) buf.record.xl_rmid)
+ {
+ /*
+ * Rmgrs we care about for logical decoding. Add new rmgrs in
+ * rmgrlist.h's order.
+ */
+ case RM_XLOG_ID:
+ DecodeXLogOp(ctx, &buf);
+ break;
+
+ case RM_XACT_ID:
+ DecodeXactOp(ctx, &buf);
+ break;
+
+ case RM_STANDBY_ID:
+ DecodeStandbyOp(ctx, &buf);
+ break;
+
+ case RM_HEAP2_ID:
+ DecodeHeap2Op(ctx, &buf);
+ break;
+
+ case RM_HEAP_ID:
+ DecodeHeapOp(ctx, &buf);
+ break;
+
+ /*
+ * Rmgrs irrelevant for logical decoding; they describe stuff not
+ * represented in logical decoding. Add new rmgrs in rmgrlist.h's
+ * order.
+ */
+ case RM_SMGR_ID:
+ case RM_CLOG_ID:
+ case RM_DBASE_ID:
+ case RM_TBLSPC_ID:
+ case RM_MULTIXACT_ID:
+ case RM_RELMAP_ID:
+ case RM_BTREE_ID:
+ case RM_HASH_ID:
+ case RM_GIN_ID:
+ case RM_GIST_ID:
+ case RM_SEQ_ID:
+ case RM_SPGIST_ID:
+ break;
+ case RM_NEXT_ID:
+ elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid);
+ }
+}
+
+/*
+ * Handle rmgr XLOG_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ SnapBuild *builder = ctx->snapshot_builder;
+ uint8 info = buf->record.xl_info & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ /* this is also used in END_OF_RECOVERY checkpoints */
+ case XLOG_CHECKPOINT_SHUTDOWN:
+ case XLOG_END_OF_RECOVERY:
+ SnapBuildSerializationPoint(builder, buf->origptr);
+
+ break;
+ case XLOG_CHECKPOINT_ONLINE:
+ /*
+ * a RUNNING_XACTS record will have been logged near this, so we
+ * can restart from there.
+ */
+ break;
+ case XLOG_NOOP:
+ case XLOG_NEXTOID:
+ case XLOG_SWITCH:
+ case XLOG_BACKUP_END:
+ case XLOG_PARAMETER_CHANGE:
+ case XLOG_RESTORE_POINT:
+ case XLOG_FPW_CHANGE:
+ case XLOG_FPI:
+ break;
+ default:
+ elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
+ }
+}
+
+/*
+ * Handle rmgr XACT_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ SnapBuild *builder = ctx->snapshot_builder;
+ ReorderBuffer *reorder = ctx->reorder;
+ XLogRecord *r = &buf->record;
+ uint8 info = r->xl_info & ~XLR_INFO_MASK;
+
+ /* no point in doing anything yet, data could not be decoded anyway */
+ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+ return;
+
+ switch (info)
+ {
+ case XLOG_XACT_COMMIT:
+ {
+ xl_xact_commit *xlrec;
+ TransactionId *subxacts = NULL;
+ SharedInvalidationMessage *invals = NULL;
+
+ xlrec = (xl_xact_commit *) buf->record_data;
+
+ subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+ invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]);
+
+ DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId,
+ xlrec->xact_time,
+ xlrec->nsubxacts, subxacts,
+ xlrec->nmsgs, invals);
+
+ break;
+ }
+ case XLOG_XACT_COMMIT_PREPARED:
+ {
+ xl_xact_commit_prepared *prec;
+ xl_xact_commit *xlrec;
+ TransactionId *subxacts;
+ SharedInvalidationMessage *invals = NULL;
+
+ /* Prepared commits contain a normal commit record... */
+ prec = (xl_xact_commit_prepared *) buf->record_data;
+ xlrec = &prec->crec;
+
+ subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+ invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]);
+
+ DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId,
+ xlrec->xact_time,
+ xlrec->nsubxacts, subxacts,
+ xlrec->nmsgs, invals);
+
+ break;
+ }
+ case XLOG_XACT_COMMIT_COMPACT:
+ {
+ xl_xact_commit_compact *xlrec;
+
+ xlrec = (xl_xact_commit_compact *) buf->record_data;
+
+ DecodeCommit(ctx, buf, r->xl_xid, InvalidOid,
+ xlrec->xact_time,
+ xlrec->nsubxacts, xlrec->subxacts,
+ 0, NULL);
+ break;
+ }
+ case XLOG_XACT_ABORT:
+ {
+ xl_xact_abort *xlrec;
+ TransactionId *sub_xids;
+
+ xlrec = (xl_xact_abort *) buf->record_data;
+
+ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+ DecodeAbort(ctx, buf->origptr, r->xl_xid,
+ sub_xids, xlrec->nsubxacts);
+ break;
+ }
+ case XLOG_XACT_ABORT_PREPARED:
+ {
+ xl_xact_abort_prepared *prec;
+ xl_xact_abort *xlrec;
+ TransactionId *sub_xids;
+
+ /* Prepared aborts contain a normal abort record... */
+ prec = (xl_xact_abort_prepared *) buf->record_data;
+ xlrec = &prec->arec;
+
+ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+ /* r->xl_xid is committed in a separate record */
+ DecodeAbort(ctx, buf->origptr, prec->xid,
+ sub_xids, xlrec->nsubxacts);
+ break;
+ }
+
+ case XLOG_XACT_ASSIGNMENT:
+ {
+ xl_xact_assignment *xlrec;
+ int i;
+ TransactionId *sub_xid;
+
+ xlrec = (xl_xact_assignment *) buf->record_data;
+
+ sub_xid = &xlrec->xsub[0];
+
+ for (i = 0; i < xlrec->nsubxacts; i++)
+ {
+ ReorderBufferAssignChild(reorder, xlrec->xtop,
+ *(sub_xid++), buf->origptr);
+ }
+ break;
+ }
+ case XLOG_XACT_PREPARE:
+ /*
+ * Currently decoding ignores PREPARE TRANSACTION and will just
+ * decode the transaction when the COMMIT PREPARED is sent or
+ * throw away the transaction's contents when a ROLLBACK PREPARED
+ * is received. In the future we could add code to expose prepared
+ * transactions in the changestream allowing for a kind of
+ * distributed 2PC.
+ */
+ break;
+ default:
+ elog(ERROR, "unexpected RM_XACT_ID record type: %u", info);
+ }
+}
+
+/*
+ * Handle rmgr STANDBY_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ SnapBuild *builder = ctx->snapshot_builder;
+ XLogRecord *r = &buf->record;
+ uint8 info = r->xl_info & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ case XLOG_RUNNING_XACTS:
+ {
+ xl_running_xacts *running = (xl_running_xacts *) buf->record_data;
+ SnapBuildProcessRunningXacts(builder, buf->origptr, running);
+ /*
+ * Abort all transactions that we keep track of, that are
+ * older than the record's oldestRunningXid. This is the most
+ * convenient spot for doing so since, in contrast to shutdown
+ * or end-of-recovery checkpoints, we have information about
+ * all running transactions which includes prepared ones,
+ * while shutdown checkpoints just know that no non-prepared
+ * transactions are in progress.
+ */
+ ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid);
+ }
+ break;
+ case XLOG_STANDBY_LOCK:
+ break;
+ default:
+ elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info);
+ }
+}
+
+/*
+ * Handle rmgr HEAP2_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK;
+ TransactionId xid = buf->record.xl_xid;
+ SnapBuild *builder = ctx->snapshot_builder;
+
+ /* no point in doing anything yet */
+ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+ return;
+
+ switch (info)
+ {
+ case XLOG_HEAP2_MULTI_INSERT:
+ if (SnapBuildProcessChange(builder, xid, buf->origptr))
+ DecodeMultiInsert(ctx, buf);
+ break;
+ case XLOG_HEAP2_NEW_CID:
+ {
+ xl_heap_new_cid *xlrec;
+ xlrec = (xl_heap_new_cid *) buf->record_data;
+ SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec);
+
+ break;
+ }
+ case XLOG_HEAP2_REWRITE:
+ /*
+ * Although these records only exist to serve the needs of logical
+ * decoding, all the work happens as part of crash or archive
+ * recovery, so we don't need to do anything here.
+ */
+ break;
+ /*
+ * Everything else here is just low level physical stuff we're
+ * not interested in.
+ */
+ case XLOG_HEAP2_FREEZE_PAGE:
+ case XLOG_HEAP2_CLEAN:
+ case XLOG_HEAP2_CLEANUP_INFO:
+ case XLOG_HEAP2_VISIBLE:
+ case XLOG_HEAP2_LOCK_UPDATED:
+ break;
+ default:
+ elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info);
+ }
+}
+
+/*
+ * Handle rmgr HEAP_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK;
+ TransactionId xid = buf->record.xl_xid;
+ SnapBuild *builder = ctx->snapshot_builder;
+
+ /* no point in doing anything yet */
+ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+ return;
+
+ switch (info)
+ {
+ case XLOG_HEAP_INSERT:
+ if (SnapBuildProcessChange(builder, xid, buf->origptr))
+ DecodeInsert(ctx, buf);
+ break;
+
+ /*
+ * Treat HOT updates as normal updates. There is no useful
+ * information in the fact that we could make it a HOT update
+ * locally and the WAL layout is compatible.
+ */
+ case XLOG_HEAP_HOT_UPDATE:
+ case XLOG_HEAP_UPDATE:
+ if (SnapBuildProcessChange(builder, xid, buf->origptr))
+ DecodeUpdate(ctx, buf);
+ break;
+
+ case XLOG_HEAP_DELETE:
+ if (SnapBuildProcessChange(builder, xid, buf->origptr))
+ DecodeDelete(ctx, buf);
+ break;
+
+ case XLOG_HEAP_NEWPAGE:
+ /*
+ * This is only used in places like indexams and CLUSTER which
+ * don't contain changes relevant for logical replication.
+ */
+ break;
+
+ case XLOG_HEAP_INPLACE:
+ /*
+ * Inplace updates are only ever performed on catalog tuples and
+ * can, per definition, not change tuple visibility. Since we
+ * don't decode catalog tuples, we're not interested in the
+ * record's contents.
+ *
+ * In-place updates can be used either by XID-bearing transactions
+ * (e.g. in CREATE INDEX CONCURRENTLY) or by XID-less
+ * transactions (e.g. VACUUM). In the former case, the commit
+ * record will include cache invalidations, so we mark the
+ * transaction as catalog modifying here. Currently that's
+ * redundant because the commit will do that as well, but once we
+ * support decoding in-progress relations, this will be important.
+ */
+ if (!TransactionIdIsValid(xid))
+ break;
+
+ SnapBuildProcessChange(builder, xid, buf->origptr);
+ ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr);
+ break;
+
+ case XLOG_HEAP_LOCK:
+ /* we don't care about row level locks for now */
+ break;
+
+ default:
+ elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info);
+ break;
+ }
+}
+
+/*
+ * Consolidated commit record handling for the different forms of commit
+ * records.
+ */
+static void
+DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
+ TransactionId xid, Oid dboid,
+ TimestampTz commit_time,
+ int nsubxacts, TransactionId *sub_xids,
+ int ninval_msgs, SharedInvalidationMessage *msgs)
+{
+ int i;
+
+ /*
+ * Process invalidation messages, even if we're not interested in the
+ * transaction's contents, since the various caches need to always be
+ * consistent.
+ */
+ if (ninval_msgs > 0)
+ {
+ ReorderBufferAddInvalidations(ctx->reorder, xid, buf->origptr,
+ ninval_msgs, msgs);
+ ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr);
+ }
+
+ SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid,
+ nsubxacts, sub_xids);
+
+ /* ----
+ * Check whether we are interested in this specific transaction, and tell
+ * the reorderbuffer to forget the content of the (sub-)transactions
+ * if not.
+ *
+ * There are basically two reasons we might not be interested in this
+ * transaction:
+ * 1) We might not be interested in decoding transactions up to this
+ *    LSN. This can happen because we previously decoded it and are now
+ *    just restarting, or because we haven't assembled a consistent
+ *    snapshot yet.
+ * 2) The transaction happened in another database.
+ *
+ * We can't just use ReorderBufferAbort() here, because we need to execute
+ * the transaction's invalidations. This currently won't be needed if
+ * we're just skipping over the transaction because currently we only do
+ * so during startup, to get to the first transaction the client needs. As
+ * we have reset the catalog caches before starting to read WAL, and we
+ * haven't yet touched any catalogs, there can't be anything to invalidate.
+ * But if we're "forgetting" this commit because it happened in another
+ * database, the invalidations might be important, because they could be
+ * for shared catalogs and we might have loaded data into the relevant
+ * syscaches.
+ * ----
+ */
+ if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr) ||
+ (dboid != InvalidOid && dboid != ctx->slot->data.database))
+ {
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferForget(ctx->reorder, *sub_xids, buf->origptr);
+ sub_xids++;
+ }
+ ReorderBufferForget(ctx->reorder, xid, buf->origptr);
+
+ return;
+ }
+
+ /* tell the reorderbuffer about the surviving subtransactions */
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferCommitChild(ctx->reorder, xid, *sub_xids,
+ buf->origptr, buf->endptr);
+ sub_xids++;
+ }
+
+ /* replay actions of all transaction + subtransactions in order */
+ ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr,
+ commit_time);
+}
+
+/*
+ * Get the data from the various forms of abort records and pass it on to
+ * snapbuild.c and reorderbuffer.c
+ */
+static void
+DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ TransactionId *sub_xids, int nsubxacts)
+{
+ int i;
+
+ SnapBuildAbortTxn(ctx->snapshot_builder, lsn, xid, nsubxacts, sub_xids);
+
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferAbort(ctx->reorder, *sub_xids, lsn);
+ sub_xids++;
+ }
+
+ ReorderBufferAbort(ctx->reorder, xid, lsn);
+}
+
+/*
+ * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
+ *
+ * Inserts can contain the new tuple.
+ */
+static void
+DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_insert *xlrec;
+ ReorderBufferChange *change;
+
+ xlrec = (xl_heap_insert *) buf->record_data;
+
+ /* only interested in our database */
+ if (xlrec->target.node.dbNode != ctx->slot->data.database)
+ return;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_INSERT;
+ memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+ {
+ Assert(r->xl_len > (SizeOfHeapInsert + SizeOfHeapHeader));
+
+ change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ DecodeXLogTuple((char *) xlrec + SizeOfHeapInsert,
+ r->xl_len - SizeOfHeapInsert,
+ change->tp.newtuple);
+ }
+
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
+/*
+ * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
+ * in the record, from WAL into proper tuplebufs.
+ *
+ * Updates can possibly contain a new tuple and the old primary key.
+ */
+static void
+DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_update *xlrec;
+ xl_heap_header_len *xlhdr;
+ ReorderBufferChange *change;
+ char *data;
+
+ xlrec = (xl_heap_update *) buf->record_data;
+ xlhdr = (xl_heap_header_len *) (buf->record_data + SizeOfHeapUpdate);
+
+ /* only interested in our database */
+ if (xlrec->target.node.dbNode != ctx->slot->data.database)
+ return;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_UPDATE;
+ memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+ data = (char *) &xlhdr->header;
+
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+ {
+ Assert(r->xl_len > (SizeOfHeapUpdate + SizeOfHeapHeaderLen));
+
+ change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ DecodeXLogTuple(data,
+ xlhdr->t_len + SizeOfHeapHeader,
+ change->tp.newtuple);
+ /* skip over the rest of the tuple header */
+ data += SizeOfHeapHeader;
+ /* skip over the tuple data */
+ data += xlhdr->t_len;
+ }
+
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD)
+ {
+ xlhdr = (xl_heap_header_len *) data;
+ change->tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+ DecodeXLogTuple((char *) &xlhdr->header,
+ xlhdr->t_len + SizeOfHeapHeader,
+ change->tp.oldtuple);
+ data = (char *) &xlhdr->header;
+ data += SizeOfHeapHeader;
+ data += xlhdr->t_len;
+ }
+
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
+/*
+ * Parse XLOG_HEAP_DELETE from WAL into proper tuplebufs.
+ *
+ * Deletes can possibly contain the old primary key.
+ */
+static void
+DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_delete *xlrec;
+ ReorderBufferChange *change;
+
+ xlrec = (xl_heap_delete *) buf->record_data;
+
+ /* only interested in our database */
+ if (xlrec->target.node.dbNode != ctx->slot->data.database)
+ return;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_DELETE;
+
+ memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+ /* old primary key stored */
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD)
+ {
+ Assert(r->xl_len > (SizeOfHeapDelete + SizeOfHeapHeader));
+
+ change->tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete,
+ r->xl_len - SizeOfHeapDelete,
+ change->tp.oldtuple);
+ }
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
+/*
+ * Decode XLOG_HEAP2_MULTI_INSERT records into multiple tuplebufs.
+ *
+ * Currently MULTI_INSERT will always contain the full tuples.
+ */
+static void
+DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_multi_insert *xlrec;
+ int i;
+ char *data;
+ bool isinit = (r->xl_info & XLOG_HEAP_INIT_PAGE) != 0;
+
+ xlrec = (xl_heap_multi_insert *) buf->record_data;
+
+ /* only interested in our database */
+ if (xlrec->node.dbNode != ctx->slot->data.database)
+ return;
+
+ data = buf->record_data + SizeOfHeapMultiInsert;
+
+ /*
+ * OffsetNumbers (which are not of interest to us) are stored when
+ * XLOG_HEAP_INIT_PAGE is not set -- skip over them.
+ */
+ if (!isinit)
+ data += sizeof(OffsetNumber) * xlrec->ntuples;
+
+ for (i = 0; i < xlrec->ntuples; i++)
+ {
+ ReorderBufferChange *change;
+ xl_multi_insert_tuple *xlhdr;
+ int datalen;
+ ReorderBufferTupleBuf *tuple;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_INSERT;
+ memcpy(&change->tp.relnode, &xlrec->node, sizeof(RelFileNode));
+
+ /*
+ * CONTAINS_NEW_TUPLE will always be set currently as multi_insert
+ * isn't used for catalogs, but better be future proof.
+ *
+ * We decode the tuple in pretty much the same way as DecodeXLogTuple,
+ * but since the layout is slightly different, we can't use it here.
+ */
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+ {
+ change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ tuple = change->tp.newtuple;
+
+ /* not a disk based tuple */
+ ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+ xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data);
+ data = ((char *) xlhdr) + SizeOfMultiInsertTuple;
+ datalen = xlhdr->datalen;
+
+ /*
+ * We can only figure this out after reassembling the
+ * transactions.
+ */
+ tuple->tuple.t_tableOid = InvalidOid;
+ tuple->tuple.t_data = &tuple->header;
+ tuple->tuple.t_len = datalen
+ + offsetof(HeapTupleHeaderData, t_bits);
+
+ memset(&tuple->header, 0, sizeof(HeapTupleHeaderData));
+
+ memcpy((char *) &tuple->header
+ + offsetof(HeapTupleHeaderData, t_bits),
+ (char *) data,
+ datalen);
+ data += datalen;
+
+ tuple->header.t_infomask = xlhdr->t_infomask;
+ tuple->header.t_infomask2 = xlhdr->t_infomask2;
+ tuple->header.t_hoff = xlhdr->t_hoff;
+ }
+
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid,
+ buf->origptr, change);
+ }
+}
+
+/*
+ * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
+ * (but not by heap_multi_insert) into a tuplebuf.
+ *
+ * The size 'len' and the pointer 'data' in the record need to be
+ * computed outside as they are record specific.
+ */
+static void
+DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
+{
+ xl_heap_header xlhdr;
+ int datalen = len - SizeOfHeapHeader;
+
+ Assert(datalen >= 0);
+ Assert(datalen <= MaxHeapTupleSize);
+
+ tuple->tuple.t_len = datalen + offsetof(HeapTupleHeaderData, t_bits);
+
+ /* not a disk based tuple */
+ ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+ /* we can only figure this out after reassembling the transactions */
+ tuple->tuple.t_tableOid = InvalidOid;
+ tuple->tuple.t_data = &tuple->header;
+
+ /* data is not stored aligned, copy to aligned storage */
+ memcpy((char *) &xlhdr,
+ data,
+ SizeOfHeapHeader);
+
+ memset(&tuple->header, 0, sizeof(HeapTupleHeaderData));
+
+ memcpy((char *) &tuple->header + offsetof(HeapTupleHeaderData, t_bits),
+ data + SizeOfHeapHeader,
+ datalen);
+
+ tuple->header.t_infomask = xlhdr.t_infomask;
+ tuple->header.t_infomask2 = xlhdr.t_infomask2;
+ tuple->header.t_hoff = xlhdr.t_hoff;
+}
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
new file mode 100644
index 00000000000..4fb0974f297
--- /dev/null
+++ b/src/backend/replication/logical/logical.c
@@ -0,0 +1,920 @@
+/*-------------------------------------------------------------------------
+ * logical.c
+ * PostgreSQL logical decoding coordination
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/logical.c
+ *
+ * NOTES
+ * This file coordinates interaction between the various modules that
+ * together provide logical decoding, primarily by providing so
+ * called LogicalDecodingContexts. The goal is to encapsulate most of the
+ * internal complexity for consumers of logical decoding, so they can
+ * create and consume a changestream with a low amount of code.
+ *
+ * The idea is that a consumer provides three callbacks, one to read WAL,
+ * one to prepare a data write, and a final one for actually writing, since
+ * their implementation depends on the type of consumer. Check
+ * logicalfuncs.c for an example implementation of a fairly simple consumer
+ * and an implementation of a WAL reading callback that's suitable for
+ * simpler consumers.
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+
+#include "access/xact.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "storage/proc.h"
+#include "storage/procarray.h"
+
+#include "utils/memutils.h"
+
+/* data for errcontext callback */
+typedef struct LogicalErrorCallbackState
+{
+ LogicalDecodingContext *ctx;
+ const char *callback_name;
+ XLogRecPtr report_location;
+} LogicalErrorCallbackState;
+
+/* wrappers around output plugin callbacks */
+static void output_plugin_error_callback(void *arg);
+static void startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
+ bool is_init);
+static void shutdown_cb_wrapper(LogicalDecodingContext *ctx);
+static void begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn);
+static void commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn);
+static void change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change);
+
+static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin);
+
+/*
+ * Make sure the current settings & environment are capable of doing logical
+ * decoding.
+ */
+void
+CheckLogicalDecodingRequirements(void)
+{
+ CheckSlotRequirements();
+
+ if (wal_level < WAL_LEVEL_LOGICAL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("logical decoding requires wal_level >= logical")));
+
+ if (MyDatabaseId == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("logical decoding requires a database connection")));
+
+ /* ----
+ * TODO: We got to change that someday soon...
+ *
+ * There's basically three things missing to allow this:
+ * 1) We need to be able to correctly and quickly identify the timeline a
+ * LSN belongs to
+ * 2) We need to force hot_standby_feedback to be enabled at all times so
+ * the primary cannot remove rows we need.
+ * 3) support dropping replication slots referring to a database, in
+ * dbase_redo. There can't be any active ones due to HS recovery
+ * conflicts, so that should be relatively easy.
+ * ----
+ */
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("logical decoding cannot be used while in recovery")));
+}
+
+/*
+ * Helper function for CreateInitialDecodingContext() and
+ * CreateDecodingContext() performing common tasks.
+ */
+static LogicalDecodingContext *
+StartupDecodingContext(List *output_plugin_options,
+ XLogRecPtr start_lsn,
+ TransactionId xmin_horizon,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write)
+{
+ ReplicationSlot *slot;
+ MemoryContext context, old_context;
+ LogicalDecodingContext *ctx;
+
+ /* shorter lines... */
+ slot = MyReplicationSlot;
+
+ context = AllocSetContextCreate(CurrentMemoryContext,
+ "Changeset Extraction Context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ old_context = MemoryContextSwitchTo(context);
+ ctx = palloc0(sizeof(LogicalDecodingContext));
+
+ ctx->context = context;
+
+ /* (re-)load the output plugin, so we detect a bad (removed) plugin now */
+ LoadOutputPlugin(&ctx->callbacks, NameStr(slot->data.plugin));
+
+ /*
+ * Now that the slot's xmin has been set, we can announce ourselves as a
+ * logical decoding backend which doesn't need to be checked individually
+ * when computing the xmin horizon because the xmin is enforced via
+ * replication slots.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ MyPgXact->vacuumFlags |= PROC_IN_LOGICAL_DECODING;
+ LWLockRelease(ProcArrayLock);
+
+ ctx->slot = slot;
+
+ ctx->reader = XLogReaderAllocate(read_page, ctx);
+ ctx->reader->private_data = ctx;
+
+ ctx->reorder = ReorderBufferAllocate();
+ ctx->snapshot_builder =
+ AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn);
+
+ ctx->reorder->private_data = ctx;
+
+ /* wrap output plugin callbacks, so we can add error context information */
+ ctx->reorder->begin = begin_cb_wrapper;
+ ctx->reorder->apply_change = change_cb_wrapper;
+ ctx->reorder->commit = commit_cb_wrapper;
+
+ ctx->out = makeStringInfo();
+ ctx->prepare_write = prepare_write;
+ ctx->write = do_write;
+
+ ctx->output_plugin_options = output_plugin_options;
+
+ MemoryContextSwitchTo(old_context);
+
+ return ctx;
+}
+
+/*
+ * Create a new decoding context, for a new logical slot.
+ *
+ * plugin contains the name of the output plugin
+ * output_plugin_options contains options passed to the output plugin
+ * read_page, prepare_write, do_write are callbacks that have to be filled to
+ * perform the use-case dependent, actual, work.
+ *
+ * Needs to be called while in a memory context that's at least as long lived
+ * as the decoding context because further memory contexts will be created
+ * inside it.
+ *
+ * Returns an initialized decoding context after calling the output plugin's
+ * startup function.
+ */
+LogicalDecodingContext *
+CreateInitDecodingContext(char *plugin,
+ List *output_plugin_options,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write)
+{
+ TransactionId xmin_horizon = InvalidTransactionId;
+ ReplicationSlot *slot;
+ LogicalDecodingContext *ctx;
+ MemoryContext old_context;
+
+ /* shorter lines... */
+ slot = MyReplicationSlot;
+
+ /* first some sanity checks that are unlikely to be violated */
+ if (slot == NULL)
+ elog(ERROR, "cannot perform logical decoding without a acquired slot");
+
+ if (plugin == NULL)
+ elog(ERROR, "cannot initialize logical decoding without a specified plugin");
+
+ /* Make sure the passed slot is suitable. These are user facing errors. */
+ if (slot->data.database == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot use physical replication slot created for logical decoding")));
+
+ if (slot->data.database != MyDatabaseId)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("replication slot \"%s\" was not created in this database",
+ NameStr(slot->data.name))));
+
+ if (IsTransactionState() &&
+ GetTopTransactionIdIfAny() != InvalidTransactionId)
+ ereport(ERROR,
+ (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+ errmsg("cannot create logical replication slot in transaction that has performed writes")));
+
+ /* register output plugin name with slot */
+ SpinLockAcquire(&slot->mutex);
+ strncpy(NameStr(slot->data.plugin), plugin,
+ NAMEDATALEN);
+ NameStr(slot->data.plugin)[NAMEDATALEN - 1] = '\0';
+ SpinLockRelease(&slot->mutex);
+
+ /*
+ * The replication slot mechanism is used to prevent removal of required
+ * WAL. As there is no interlock between this and checkpoints required WAL
+ * could be removed before ReplicationSlotsComputeRequiredLSN() has been
+ * called to prevent that. In the very unlikely case that this happens
+ * we'll just retry.
+ */
+ while (true)
+ {
+ XLogSegNo segno;
+
+ /*
+ * Let's start with enough information if we can, so log a standby
+ * snapshot and start decoding at exactly that position.
+ */
+ if (!RecoveryInProgress())
+ {
+ XLogRecPtr flushptr;
+
+ /* start at current insert position */
+ slot->data.restart_lsn = GetXLogInsertRecPtr();
+
+ /* make sure we have enough information to start */
+ flushptr = LogStandbySnapshot();
+
+ /* and make sure it's fsynced to disk */
+ XLogFlush(flushptr);
+ }
+ else
+ slot->data.restart_lsn = GetRedoRecPtr();
+
+ /* prevent WAL removal as fast as possible */
+ ReplicationSlotsComputeRequiredLSN();
+
+ /*
+ * If all required WAL is still there, great, otherwise retry. The
+ * slot should prevent further removal of WAL, unless there's a
+ * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
+ * the new restart_lsn above, so normally we should never need to loop
+ * more than twice.
+ */
+ XLByteToSeg(slot->data.restart_lsn, segno);
+ if (XLogGetLastRemovedSegno() < segno)
+ break;
+ }
+
+
+ /* ----
+ * This is a bit tricky: We need to determine a safe xmin horizon to start
+ * decoding from, to avoid starting from a running xacts record referring
+ * to xids whose rows have been vacuumed or pruned
+ * already. GetOldestSafeDecodingTransactionId() returns such a value, but
+ * without further interlock its return value might immediately be out of
+ * date.
+ *
+ * So we have to acquire the ProcArrayLock to prevent computation of new
+ * xmin horizons by other backends, get the safe decoding xid, and inform
+ * the slot machinery about the new limit. Once that's done the
+ * ProcArrayLock can be released as the slot machinery now is
+ * protecting against vacuum.
+ * ----
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ slot->effective_catalog_xmin = GetOldestSafeDecodingTransactionId();
+ slot->data.catalog_xmin = slot->effective_catalog_xmin;
+
+ ReplicationSlotsComputeRequiredXmin(true);
+
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * tell the snapshot builder to only assemble a snapshot once it reaches
+ * a running_xacts record with the respective xmin.
+ */
+ xmin_horizon = slot->data.catalog_xmin;
+
+ ReplicationSlotMarkDirty();
+ ReplicationSlotSave();
+
+ ctx = StartupDecodingContext(NIL, InvalidXLogRecPtr, xmin_horizon,
+ read_page, prepare_write, do_write);
+
+ /* call output plugin initialization callback */
+ old_context = MemoryContextSwitchTo(ctx->context);
+ if (ctx->callbacks.startup_cb != NULL)
+ startup_cb_wrapper(ctx, &ctx->options, true);
+ MemoryContextSwitchTo(old_context);
+
+ return ctx;
+}
+
+/*
+ * Create a new decoding context, for a logical slot that has previously been
+ * used already.
+ *
+ * start_lsn contains the LSN of the last received data or InvalidXLogRecPtr
+ * output_plugin_options contains options passed to the output plugin
+ * read_page, prepare_write, do_write are callbacks that have to be filled to
+ * perform the use-case dependent, actual, work.
+ *
+ * Needs to be called while in a memory context that's at least as long lived
+ * as the decoding context because further memory contexts will be created
+ * inside it.
+ *
+ * Returns an initialized decoding context after calling the output plugin's
+ * startup function.
+ */
+LogicalDecodingContext *
+CreateDecodingContext(XLogRecPtr start_lsn,
+ List *output_plugin_options,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write)
+{
+ LogicalDecodingContext *ctx;
+ ReplicationSlot *slot;
+ MemoryContext old_context;
+
+ /* shorter lines... */
+ slot = MyReplicationSlot;
+
+ /* first some sanity checks that are unlikely to be violated */
+ if (slot == NULL)
+ elog(ERROR, "cannot perform logical decoding without a acquired slot");
+
+ /* make sure the passed slot is suitable, these are user facing errors */
+ if (slot->data.database == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ (errmsg("cannot use physical replication slot for logical decoding"))));
+
+ if (slot->data.database != MyDatabaseId)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ (errmsg("replication slot \"%s\" was not created in this database",
+ NameStr(slot->data.name)))));
+
+ if (start_lsn == InvalidXLogRecPtr)
+ {
+ /* continue from last position */
+ start_lsn = slot->data.confirmed_flush;
+ }
+ else if (start_lsn < slot->data.confirmed_flush)
+ {
+ /*
+ * It might seem like we should error out in this case, but it's
+ * pretty common for a client to acknowledge an LSN it doesn't have to
+ * do anything for, and thus didn't store persistently, because the
+ * xlog records didn't result in anything relevant for logical
+ * decoding. Clients have to be able to do that to support
+ * synchronous replication.
+ */
+ start_lsn = slot->data.confirmed_flush;
+ elog(DEBUG1, "cannot stream from %X/%X, minimum is %X/%X, forwarding",
+ (uint32)(start_lsn >> 32), (uint32)start_lsn,
+ (uint32)(slot->data.confirmed_flush >> 32),
+ (uint32)slot->data.confirmed_flush);
+ }
+
+ ctx = StartupDecodingContext(output_plugin_options,
+ start_lsn, InvalidTransactionId,
+ read_page, prepare_write, do_write);
+
+ /* call output plugin initialization callback */
+ old_context = MemoryContextSwitchTo(ctx->context);
+ if (ctx->callbacks.startup_cb != NULL)
+ startup_cb_wrapper(ctx, &ctx->options, false);
+ MemoryContextSwitchTo(old_context);
+
+ ereport(LOG,
+ (errmsg("starting logical decoding for slot %s",
+ NameStr(slot->data.name)),
+ errdetail("streaming transactions committing after %X/%X, reading WAL from %X/%X",
+ (uint32)(slot->data.confirmed_flush >> 32),
+ (uint32)slot->data.confirmed_flush,
+ (uint32)(slot->data.restart_lsn >> 32),
+ (uint32)slot->data.restart_lsn)));
+
+ return ctx;
+}
+
+/*
+ * Returns true if a consistent initial decoding snapshot has been built.
+ */
+bool
+DecodingContextReady(LogicalDecodingContext *ctx)
+{
+ return SnapBuildCurrentState(ctx->snapshot_builder) == SNAPBUILD_CONSISTENT;
+}
+
+/*
+ * Read from the decoding slot, until it is ready to start extracting changes.
+ */
+void
+DecodingContextFindStartpoint(LogicalDecodingContext *ctx)
+{
+ XLogRecPtr startptr;
+
+ /* Initialize from where to start reading WAL. */
+ startptr = ctx->slot->data.restart_lsn;
+
+ elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X",
+ (uint32)(ctx->slot->data.restart_lsn >> 32),
+ (uint32)ctx->slot->data.restart_lsn);
+
+ /* Wait for a consistent starting point */
+ for (;;)
+ {
+ XLogRecord *record;
+ char *err = NULL;
+
+ /*
+ * If the caller requires that interrupts be checked, the read_page
+ * callback should do so, as those will often wait.
+ */
+
+ /* the read_page callback waits for new WAL */
+ record = XLogReadRecord(ctx->reader, startptr, &err);
+ if (err)
+ elog(ERROR, "%s", err);
+
+ Assert(record);
+
+ startptr = InvalidXLogRecPtr;
+
+ LogicalDecodingProcessRecord(ctx, record);
+
+ /* only continue till we found a consistent spot */
+ if (DecodingContextReady(ctx))
+ break;
+ }
+
+ ctx->slot->data.confirmed_flush = ctx->reader->EndRecPtr;
+}
+
+/*
+ * Free a previously allocated decoding context, invoking the shutdown
+ * callback if necessary.
+ */
+void
+FreeDecodingContext(LogicalDecodingContext *ctx)
+{
+ if (ctx->callbacks.shutdown_cb != NULL)
+ shutdown_cb_wrapper(ctx);
+
+ ReorderBufferFree(ctx->reorder);
+ FreeSnapshotBuilder(ctx->snapshot_builder);
+ XLogReaderFree(ctx->reader);
+ MemoryContextDelete(ctx->context);
+}
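Taken together, the functions above yield the typical lifecycle for a caller that initializes a brand-new logical slot. The following is only a sketch: the plugin name is hypothetical, the demo_* callbacks are the illustrative ones sketched near the top of this file, and the slot is assumed to have been created and acquired already.

    LogicalDecodingContext *ctx;

    /* slot must already be created and acquired by this backend */
    ctx = CreateInitDecodingContext("my_output_plugin",   /* hypothetical name */
                                    NIL,
                                    logical_read_local_xlog_page,
                                    demo_prepare_write, demo_write);

    /* consume WAL until the snapshot builder reports a consistent state */
    DecodingContextFindStartpoint(ctx);

    /* persist the slot's starting point, then release everything */
    ReplicationSlotMarkDirty();
    ReplicationSlotSave();
    FreeDecodingContext(ctx);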
+
+/*
+ * Prepare a write using the context's output routine.
+ */
+void
+OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write)
+{
+ if (!ctx->accept_writes)
+ elog(ERROR, "writes are only accepted in commit, begin and change callbacks");
+
+ ctx->prepare_write(ctx, ctx->write_location, ctx->write_xid, last_write);
+ ctx->prepared_write = true;
+}
+
+/*
+ * Perform a write using the context's output routine.
+ */
+void
+OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write)
+{
+ if (!ctx->prepared_write)
+ elog(ERROR, "OutputPluginPrepareWrite needs to be called before OutputPluginWrite");
+
+ ctx->write(ctx, ctx->write_location, ctx->write_xid, last_write);
+ ctx->prepared_write = false;
+}
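For context, an output plugin is expected to bracket everything it appends to ctx->out with these two calls; a hypothetical change callback body might therefore do something like the following (the payload text is made up):

    OutputPluginPrepareWrite(ctx, true);
    appendStringInfoString(ctx->out, "table changed");   /* plugin-specific payload */
    OutputPluginWrite(ctx, true);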
+
+/*
+ * Load the output plugin, lookup its output plugin init function, and check
+ * that it provides the required callbacks.
+ */
+static void
+LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin)
+{
+ LogicalOutputPluginInit plugin_init;
+
+ plugin_init = (LogicalOutputPluginInit)
+ load_external_function(plugin, "_PG_output_plugin_init", false, NULL);
+
+ if (plugin_init == NULL)
+ elog(ERROR, "output plugins have to declare the _PG_output_plugin_init symbol");
+
+ /* ask the output plugin to fill the callback struct */
+ plugin_init(callbacks);
+
+ if (callbacks->begin_cb == NULL)
+ elog(ERROR, "output plugins have to register a begin callback");
+ if (callbacks->change_cb == NULL)
+ elog(ERROR, "output plugins have to register a change callback");
+ if (callbacks->commit_cb == NULL)
+ elog(ERROR, "output plugins have to register a commit callback");
+}
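Put differently, the smallest plugin that satisfies these checks looks roughly like the sketch below. Only the three mandatory callbacks are registered, their bodies are placeholders, and the demo_* names are invented for illustration.

static void
demo_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
}

static void
demo_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
            Relation relation, ReorderBufferChange *change)
{
}

static void
demo_commit(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
            XLogRecPtr commit_lsn)
{
}

void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
    cb->begin_cb = demo_begin;
    cb->change_cb = demo_change;
    cb->commit_cb = demo_commit;
    /* startup_cb and shutdown_cb may be left NULL, see the checks above */
}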
+
+static void
+output_plugin_error_callback(void *arg)
+{
+ LogicalErrorCallbackState *state = (LogicalErrorCallbackState *) arg;
+ /* not all callbacks have an associated LSN */
+ if (state->report_location != InvalidXLogRecPtr)
+ errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X",
+ NameStr(state->ctx->slot->data.name),
+ NameStr(state->ctx->slot->data.plugin),
+ state->callback_name,
+ (uint32)(state->report_location >> 32),
+ (uint32)state->report_location);
+ else
+ errcontext("slot \"%s\", output plugin \"%s\", in the %s callback",
+ NameStr(state->ctx->slot->data.name),
+ NameStr(state->ctx->slot->data.plugin),
+ state->callback_name);
+}
+
+static void
+startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init)
+{
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "startup";
+ state.report_location = InvalidXLogRecPtr;
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = false;
+
+ /* do the actual work: call callback */
+ ctx->callbacks.startup_cb(ctx, opt, is_init);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+static void
+shutdown_cb_wrapper(LogicalDecodingContext *ctx)
+{
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "shutdown";
+ state.report_location = InvalidXLogRecPtr;
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = false;
+
+ /* do the actual work: call callback */
+ ctx->callbacks.shutdown_cb(ctx);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+
+/*
+ * Callbacks for ReorderBuffer which add in some more information and then call
+ * the output plugin callbacks (see output_plugin.h).
+ */
+static void
+begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn)
+{
+ LogicalDecodingContext *ctx = cache->private_data;
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "begin";
+ state.report_location = txn->first_lsn;
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = true;
+ ctx->write_xid = txn->xid;
+ ctx->write_location = txn->first_lsn;
+
+ /* do the actual work: call callback */
+ ctx->callbacks.begin_cb(ctx, txn);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+static void
+commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn)
+{
+ LogicalDecodingContext *ctx = cache->private_data;
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "commit";
+ state.report_location = txn->final_lsn; /* beginning of commit record */
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = true;
+ ctx->write_xid = txn->xid;
+ ctx->write_location = txn->end_lsn; /* points to the end of the record */
+
+ /* do the actual work: call callback */
+ ctx->callbacks.commit_cb(ctx, txn, commit_lsn);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+static void
+change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change)
+{
+ LogicalDecodingContext *ctx = cache->private_data;
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "change";
+ state.report_location = change->lsn;
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = true;
+ ctx->write_xid = txn->xid;
+ /*
+ * report this change's lsn so replies from clients can give an up-to-date
+ * answer. This won't ever be enough (and shouldn't be!) to confirm
+ * receipt of this transaction, but it might allow another transaction's
+ * commit to be confirmed with one message.
+ */
+ ctx->write_location = change->lsn;
+
+ ctx->callbacks.change_cb(ctx, txn, relation, change);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+/*
+ * Set the required catalog xmin horizon for historic snapshots in the current
+ * replication slot.
+ *
+ * Note that in most cases, we won't be able to immediately use the xmin
+ * to increase the xmin horizon; we need to wait until the client has confirmed
+ * receiving current_lsn with LogicalConfirmReceivedLocation().
+ */
+void
+LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin)
+{
+ bool updated_xmin = false;
+ ReplicationSlot *slot;
+
+ slot = MyReplicationSlot;
+
+ Assert(slot != NULL);
+
+ SpinLockAcquire(&slot->mutex);
+
+ /*
+ * don't overwrite if we already have a newer xmin. This can
+ * happen if we restart decoding in a slot.
+ */
+ if (TransactionIdPrecedesOrEquals(xmin, slot->data.catalog_xmin))
+ {
+ }
+ /*
+ * If the client has already confirmed up to this lsn, we directly
+ * can mark this as accepted. This can happen if we restart
+ * decoding in a slot.
+ */
+ else if (current_lsn <= slot->data.confirmed_flush)
+ {
+ slot->candidate_catalog_xmin = xmin;
+ slot->candidate_xmin_lsn = current_lsn;
+
+ /* our candidate can directly be used */
+ updated_xmin = true;
+ }
+ /*
+ * Only increase if the previous values have been applied, otherwise we
+ * might never end up updating if the receiver acks too slowly.
+ */
+ else if (slot->candidate_xmin_lsn == InvalidXLogRecPtr)
+ {
+ slot->candidate_catalog_xmin = xmin;
+ slot->candidate_xmin_lsn = current_lsn;
+ }
+ SpinLockRelease(&slot->mutex);
+
+ /* candidate already valid with the current flush position, apply */
+ if (updated_xmin)
+ LogicalConfirmReceivedLocation(slot->data.confirmed_flush);
+}
+
+/*
+ * Mark the minimal LSN (restart_lsn) we need to read to replay all
+ * transactions that have not yet committed at current_lsn.
+ *
+ * Just like LogicalIncreaseXminForSlot(), this only takes effect when the
+ * client has confirmed having received current_lsn.
+ */
+void
+LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn)
+{
+ bool updated_lsn = false;
+ ReplicationSlot *slot;
+
+ slot = MyReplicationSlot;
+
+ Assert(slot != NULL);
+ Assert(restart_lsn != InvalidXLogRecPtr);
+ Assert(current_lsn != InvalidXLogRecPtr);
+
+ SpinLockAcquire(&slot->mutex);
+
+ /* don't overwrite if we already have a newer restart_lsn */
+ if (restart_lsn <= slot->data.restart_lsn)
+ {
+ }
+ /*
+ * We might have already flushed far enough to directly accept this lsn, in
+ * this case there is no need to check for existing candidate LSNs
+ */
+ else if (current_lsn <= slot->data.confirmed_flush)
+ {
+ slot->candidate_restart_valid = current_lsn;
+ slot->candidate_restart_lsn = restart_lsn;
+
+ /* our candidate can directly be used */
+ updated_lsn = true;
+ }
+ /*
+ * Only increase if the previous values have been applied, otherwise we
+ * might never end up updating if the receiver acks too slowly. A missed
+ * value here will just cause some extra effort after reconnecting.
+ */
+ else if (slot->candidate_restart_valid == InvalidXLogRecPtr)
+ {
+ slot->candidate_restart_valid = current_lsn;
+ slot->candidate_restart_lsn = restart_lsn;
+
+ elog(DEBUG1, "got new restart lsn %X/%X at %X/%X",
+ (uint32) (restart_lsn >> 32), (uint32) restart_lsn,
+ (uint32) (current_lsn >> 32), (uint32) current_lsn);
+ }
+ else
+ {
+ elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X",
+ (uint32) (restart_lsn >> 32), (uint32) restart_lsn,
+ (uint32) (current_lsn >> 32), (uint32) current_lsn,
+ (uint32) (slot->candidate_restart_lsn >> 32),
+ (uint32) slot->candidate_restart_lsn,
+ (uint32) (slot->candidate_restart_valid >> 32),
+ (uint32) slot->candidate_restart_valid,
+ (uint32) (slot->data.confirmed_flush >> 32),
+ (uint32) slot->data.confirmed_flush
+ );
+ }
+ SpinLockRelease(&slot->mutex);
+
+ /* candidates are already valid with the current flush position, apply */
+ if (updated_lsn)
+ LogicalConfirmReceivedLocation(slot->data.confirmed_flush);
+}
+
+/*
+ * Handle a consumer's confirmation of having received all changes up to lsn.
+ */
+void
+LogicalConfirmReceivedLocation(XLogRecPtr lsn)
+{
+ Assert(lsn != InvalidXLogRecPtr);
+
+ /* Do an unlocked check for candidate_lsn first. */
+ if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr ||
+ MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr)
+ {
+ bool updated_xmin = false;
+ bool updated_restart = false;
+
+ /* use volatile pointer to prevent code rearrangement */
+ volatile ReplicationSlot *slot = MyReplicationSlot;
+
+ SpinLockAcquire(&slot->mutex);
+
+ slot->data.confirmed_flush = lsn;
+
+ /* if we're past the location required for bumping xmin, do so */
+ if (slot->candidate_xmin_lsn != InvalidXLogRecPtr &&
+ slot->candidate_xmin_lsn <= lsn)
+ {
+ /*
+ * We have to write the changed xmin to disk *before* we change
+ * the in-memory value, otherwise after a crash we wouldn't know
+ * that some catalog tuples might have been removed already.
+ *
+ * Ensure that by first writing to ->xmin and only updating
+ * ->effective_xmin once the new state is synced to disk. After a
+ * crash ->effective_xmin is set to ->xmin.
+ */
+ if (TransactionIdIsValid(slot->candidate_catalog_xmin) &&
+ slot->data.catalog_xmin != slot->candidate_catalog_xmin)
+ {
+ slot->data.catalog_xmin = slot->candidate_catalog_xmin;
+ slot->candidate_catalog_xmin = InvalidTransactionId;
+ slot->candidate_xmin_lsn = InvalidXLogRecPtr;
+ updated_xmin = true;
+ }
+ }
+
+ if (slot->candidate_restart_valid != InvalidXLogRecPtr &&
+ slot->candidate_restart_valid <= lsn)
+ {
+ Assert(slot->candidate_restart_lsn != InvalidXLogRecPtr);
+
+ slot->data.restart_lsn = slot->candidate_restart_lsn;
+ slot->candidate_restart_lsn = InvalidXLogRecPtr;
+ slot->candidate_restart_valid = InvalidXLogRecPtr;
+ updated_restart = true;
+ }
+
+ SpinLockRelease(&slot->mutex);
+
+ /* first write new xmin to disk, so we know what's up after a crash */
+ if (updated_xmin || updated_restart)
+ {
+ ReplicationSlotMarkDirty();
+ ReplicationSlotSave();
+ elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart);
+ }
+ /*
+ * Now that the new xmin is safely on disk, we can let the global value
+ * advance. We do not take ProcArrayLock or similar since we only
+ * advance xmin here and there's not much harm done by a concurrent
+ * computation missing that.
+ */
+ if (updated_xmin)
+ {
+ SpinLockAcquire(&slot->mutex);
+ slot->effective_catalog_xmin = slot->data.catalog_xmin;
+ SpinLockRelease(&slot->mutex);
+
+ ReplicationSlotsComputeRequiredXmin(false);
+ ReplicationSlotsComputeRequiredLSN();
+ }
+ }
+ else
+ {
+ volatile ReplicationSlot *slot = MyReplicationSlot;
+
+ SpinLockAcquire(&slot->mutex);
+ slot->data.confirmed_flush = lsn;
+ SpinLockRelease(&slot->mutex);
+ }
+}
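To summarize the protocol implemented by the three functions above: while decoding, the snapshot builder and reorder buffer machinery propose new horizons keyed by the LSN they were derived from, and those proposals only take effect once the client acknowledges a flush position past that LSN. A compressed, purely illustrative sequence:

    /* decoding proposes new values, keyed by the LSN they were derived from */
    LogicalIncreaseXminForSlot(current_lsn, xmin);
    LogicalIncreaseRestartDecodingForSlot(current_lsn, restart_lsn);

    /*
     * Once the client reports having flushed everything up to (at least)
     * current_lsn, the pending candidates are written to disk and only then
     * made effective.
     */
    LogicalConfirmReceivedLocation(flush_lsn);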
diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c
new file mode 100644
index 00000000000..3b8ae3853ba
--- /dev/null
+++ b/src/backend/replication/logical/logicalfuncs.c
@@ -0,0 +1,509 @@
+/*-------------------------------------------------------------------------
+ *
+ * logicalfuncs.c
+ *
+ * Support functions for using logical decoding and management of
+ * logical replication slots via SQL.
+ *
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/logicalfuncs.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+
+#include "catalog/pg_type.h"
+
+#include "nodes/makefuncs.h"
+
+#include "mb/pg_wchar.h"
+
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+#include "utils/pg_lsn.h"
+#include "utils/resowner.h"
+#include "utils/lsyscache.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/logicalfuncs.h"
+
+#include "storage/fd.h"
+
+/* private data for writing out data */
+typedef struct DecodingOutputState
+{
+ Tuplestorestate *tupstore;
+ TupleDesc tupdesc;
+ bool binary_output;
+ int64 returned_rows;
+} DecodingOutputState;
+
+/*
+ * Prepare for an output plugin write.
+ */
+static void
+LogicalOutputPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ bool last_write)
+{
+ resetStringInfo(ctx->out);
+}
+
+/*
+ * Perform output plugin write into tuplestore.
+ */
+static void
+LogicalOutputWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ bool last_write)
+{
+ Datum values[3];
+ bool nulls[3];
+ DecodingOutputState *p;
+
+ /* SQL Datums can only be of a limited length... */
+ if (ctx->out->len > MaxAllocSize - VARHDRSZ)
+ elog(ERROR, "too much output for sql interface");
+
+ p = (DecodingOutputState *) ctx->output_writer_private;
+
+ memset(nulls, 0, sizeof(nulls));
+ values[0] = LSNGetDatum(lsn);
+ values[1] = TransactionIdGetDatum(xid);
+
+ /*
+ * Assert ctx->out is in database encoding when we're writing textual
+ * output.
+ */
+ if (!p->binary_output)
+ Assert(pg_verify_mbstr(GetDatabaseEncoding(),
+ ctx->out->data, ctx->out->len,
+ false));
+
+ /* ick, but cstring_to_text_with_len works for bytea perfectly fine */
+ values[2] = PointerGetDatum(
+ cstring_to_text_with_len(ctx->out->data, ctx->out->len));
+
+ tuplestore_putvalues(p->tupstore, p->tupdesc, values, nulls);
+ p->returned_rows++;
+}
+
+/*
+ * TODO: This is duplicate code with pg_xlogdump, similar to walsender.c, but
+ * we currently don't have the infrastructure (elog!) to share it.
+ */
+static void
+XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
+{
+ char *p;
+ XLogRecPtr recptr;
+ Size nbytes;
+
+ static int sendFile = -1;
+ static XLogSegNo sendSegNo = 0;
+ static uint32 sendOff = 0;
+
+ p = buf;
+ recptr = startptr;
+ nbytes = count;
+
+ while (nbytes > 0)
+ {
+ uint32 startoff;
+ int segbytes;
+ int readbytes;
+
+ startoff = recptr % XLogSegSize;
+
+ if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
+ {
+ char path[MAXPGPATH];
+
+ /* Switch to another logfile segment */
+ if (sendFile >= 0)
+ close(sendFile);
+
+ XLByteToSeg(recptr, sendSegNo);
+
+ XLogFilePath(path, tli, sendSegNo);
+
+ sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
+
+ if (sendFile < 0)
+ {
+ if (errno == ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("requested WAL segment %s has already been removed",
+ path)));
+ else
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ path)));
+ }
+ sendOff = 0;
+ }
+
+ /* Need to seek in the file? */
+ if (sendOff != startoff)
+ {
+ if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
+ {
+ char path[MAXPGPATH];
+
+ XLogFilePath(path, tli, sendSegNo);
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek in log segment %s to offset %u: %m",
+ path, startoff)));
+ }
+ sendOff = startoff;
+ }
+
+ /* How many bytes are within this segment? */
+ if (nbytes > (XLogSegSize - startoff))
+ segbytes = XLogSegSize - startoff;
+ else
+ segbytes = nbytes;
+
+ readbytes = read(sendFile, p, segbytes);
+ if (readbytes <= 0)
+ {
+ char path[MAXPGPATH];
+
+ XLogFilePath(path, tli, sendSegNo);
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from log segment %s, offset %u, length %lu: %m",
+ path, sendOff, (unsigned long) segbytes)));
+ }
+
+ /* Update state for read */
+ recptr += readbytes;
+
+ sendOff += readbytes;
+ nbytes -= readbytes;
+ p += readbytes;
+ }
+}
+
+static void
+check_permissions(void)
+{
+ if (!superuser() && !has_rolreplication(GetUserId()))
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ (errmsg("must be superuser or replication role to use replication slots"))));
+}
+
+/*
+ * read_page callback for logical decoding contexts.
+ *
+ * Public because it would likely be very helpful for someone writing another
+ * output method outside walsender, e.g. in a bgworker.
+ *
+ * TODO: The walsender has its own version of this, but it relies on the
+ * walsender's latch being set whenever WAL is flushed. No such infrastructure
+ * exists for normal backends, so we have to do a check/sleep/repeat style of
+ * loop for now.
+ */
+int
+logical_read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr, char *cur_page, TimeLineID *pageTLI)
+{
+ XLogRecPtr flushptr,
+ loc;
+ int count;
+
+ loc = targetPagePtr + reqLen;
+ while (1)
+ {
+ /*
+ * TODO: we're going to have to do something more intelligent about
+ * timelines on standbys. Use readTimeLineHistory() and
+ * tliOfPointInHistory() to get the proper LSN? For now we'll catch
+ * that case earlier, but the code and TODO is left in here for when
+ * that changes.
+ */
+ if (!RecoveryInProgress())
+ {
+ *pageTLI = ThisTimeLineID;
+ flushptr = GetFlushRecPtr();
+ }
+ else
+ flushptr = GetXLogReplayRecPtr(pageTLI);
+
+ if (loc <= flushptr)
+ break;
+
+ CHECK_FOR_INTERRUPTS();
+ pg_usleep(1000L);
+ }
+
+ /* more than one block available */
+ if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
+ count = XLOG_BLCKSZ;
+ /* not enough data there */
+ else if (targetPagePtr + reqLen > flushptr)
+ return -1;
+ /* part of the page available */
+ else
+ count = flushptr - targetPagePtr;
+
+ XLogRead(cur_page, *pageTLI, targetPagePtr, XLOG_BLCKSZ);
+
+ return count;
+}
+
+/*
+ * Helper function for the various SQL callable logical decoding functions.
+ */
+static Datum
+pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary)
+{
+ Name name = PG_GETARG_NAME(0);
+ XLogRecPtr upto_lsn;
+ int32 upto_nchanges;
+
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+
+ XLogRecPtr end_of_wal;
+ XLogRecPtr startptr;
+
+ LogicalDecodingContext *ctx;
+
+ ResourceOwner old_resowner = CurrentResourceOwner;
+ ArrayType *arr;
+ Size ndim;
+ List *options = NIL;
+ DecodingOutputState *p;
+
+ if (PG_ARGISNULL(1))
+ upto_lsn = InvalidXLogRecPtr;
+ else
+ upto_lsn = PG_GETARG_LSN(1);
+
+ if (PG_ARGISNULL(2))
+ upto_nchanges = 0;
+ else
+ upto_nchanges = PG_GETARG_INT32(2);
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+
+ /* state to write output to */
+ p = palloc0(sizeof(DecodingOutputState));
+
+ p->binary_output = binary;
+
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &p->tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ check_permissions();
+
+ CheckLogicalDecodingRequirements();
+
+ arr = PG_GETARG_ARRAYTYPE_P(3);
+ ndim = ARR_NDIM(arr);
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ if (ndim > 1)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("array must be one-dimensional")));
+ }
+ else if (array_contains_nulls(arr))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("array must not contain nulls")));
+ }
+ else if (ndim == 1)
+ {
+ int nelems;
+ Datum *datum_opts;
+ int i;
+
+ Assert(ARR_ELEMTYPE(arr) == TEXTOID);
+
+ deconstruct_array(arr, TEXTOID, -1, false, 'i',
+ &datum_opts, NULL, &nelems);
+
+ if (nelems % 2 != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("array must have even number of elements")));
+
+ for (i = 0; i < nelems; i += 2)
+ {
+ char *name = TextDatumGetCString(datum_opts[i]);
+ char *opt = TextDatumGetCString(datum_opts[i + 1]);
+
+ options = lappend(options, makeDefElem(name, (Node *) makeString(opt)));
+ }
+ }
+
+ p->tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = p->tupstore;
+ rsinfo->setDesc = p->tupdesc;
+
+ /* compute the current end-of-wal */
+ if (!RecoveryInProgress())
+ end_of_wal = GetFlushRecPtr();
+ else
+ end_of_wal = GetXLogReplayRecPtr(NULL);
+
+ CheckLogicalDecodingRequirements();
+ ReplicationSlotAcquire(NameStr(*name));
+
+ PG_TRY();
+ {
+ ctx = CreateDecodingContext(InvalidXLogRecPtr,
+ options,
+ logical_read_local_xlog_page,
+ LogicalOutputPrepareWrite,
+ LogicalOutputWrite);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * Check whether the output plugin writes textual output if that's
+ * what we need.
+ */
+ if (!binary &&
+ ctx->options.output_type != OUTPUT_PLUGIN_TEXTUAL_OUTPUT)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("output plugin cannot produce text output")));
+
+ ctx->output_writer_private = p;
+
+ startptr = MyReplicationSlot->data.restart_lsn;
+
+ CurrentResourceOwner = ResourceOwnerCreate(CurrentResourceOwner, "logical decoding");
+
+ /* invalidate non-timetravel entries */
+ InvalidateSystemCaches();
+
+ while ((startptr != InvalidXLogRecPtr && startptr < end_of_wal) ||
+ (ctx->reader->EndRecPtr && ctx->reader->EndRecPtr < end_of_wal))
+ {
+ XLogRecord *record;
+ char *errm = NULL;
+
+ record = XLogReadRecord(ctx->reader, startptr, &errm);
+ if (errm)
+ elog(ERROR, "%s", errm);
+
+ startptr = InvalidXLogRecPtr;
+
+ /*
+ * The output plugin callbacks (invoked via the *_cb_wrapper functions
+ * in logical.c) will store the description into our tuplestore.
+ */
+ if (record != NULL)
+ LogicalDecodingProcessRecord(ctx, record);
+
+ /* check limits */
+ if (upto_lsn != InvalidXLogRecPtr &&
+ upto_lsn <= ctx->reader->EndRecPtr)
+ break;
+ if (upto_nchanges != 0 &&
+ upto_nchanges <= p->returned_rows)
+ break;
+ }
+ }
+ PG_CATCH();
+ {
+ /* clear all timetravel entries */
+ InvalidateSystemCaches();
+
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ tuplestore_donestoring(p->tupstore);
+
+ CurrentResourceOwner = old_resowner;
+
+ /*
+ * Next time, start where we left off. (Hunting things, the family
+ * business..)
+ */
+ if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm)
+ LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr);
+
+ /* free context, call shutdown callback */
+ FreeDecodingContext(ctx);
+
+ ReplicationSlotRelease();
+ InvalidateSystemCaches();
+
+ return (Datum) 0;
+}
+
+/*
+ * SQL function returning the changestream as text, consuming the data.
+ */
+Datum
+pg_logical_slot_get_changes(PG_FUNCTION_ARGS)
+{
+ Datum ret = pg_logical_slot_get_changes_guts(fcinfo, true, false);
+ return ret;
+}
+
+/*
+ * SQL function returning the changestream as text, only peeking ahead.
+ */
+Datum
+pg_logical_slot_peek_changes(PG_FUNCTION_ARGS)
+{
+ Datum ret = pg_logical_slot_get_changes_guts(fcinfo, false, false);
+ return ret;
+}
+
+/*
+ * SQL function returning the changestream in binary, consuming the data.
+ */
+Datum
+pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS)
+{
+ Datum ret = pg_logical_slot_get_changes_guts(fcinfo, true, true);
+ return ret;
+}
+
+/*
+ * SQL function returning the changestream in binary, only peeking ahead.
+ */
+Datum
+pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS)
+{
+ Datum ret = pg_logical_slot_get_changes_guts(fcinfo, false, true);
+ return ret;
+}
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
new file mode 100644
index 00000000000..e7182338b89
--- /dev/null
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -0,0 +1,3059 @@
+/*-------------------------------------------------------------------------
+ *
+ * reorderbuffer.c
+ * PostgreSQL logical replay/reorder buffer management
+ *
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/reorderbuffer.c
+ *
+ * NOTES
+ * This module gets handed individual pieces of transactions in the order
+ * they are written to the WAL and is responsible for reassembling them into
+ * toplevel transaction sized pieces. When a transaction is completely
+ * reassembled - signalled by reading the transaction commit record - it
+ * will then call the output plugin (c.f. ReorderBufferCommit()) with the
+ * individual changes. The output plugins rely on snapshots built by
+ * snapbuild.c which hands them to us.
+ *
+ * Transactions and subtransactions/savepoints in postgres are not
+ * immediately linked to each other from outside the performing
+ * backend. Only at commit/abort (or via special xact_assignment records) are
+ * they linked together, which means that we will have to splice together a
+ * toplevel transaction from its subtransactions. To do that efficiently we
+ * build a binary heap indexed by the smallest current lsn of the individual
+ * subtransactions' changestreams. As the individual streams are inherently
+ * ordered by LSN - since that is where we build them from - the transaction
+ * can easily be reassembled by always using the subtransaction with the
+ * smallest current LSN from the heap.
+ *
+ * In order to cope with large transactions - which can be several times as
+ * big as the available memory - this module supports spooling the contents
+ * of large transactions to disk. When the transaction is replayed the
+ * contents of individual (sub-)transactions will be read from disk in
+ * chunks.
+ *
+ * This module also has to deal with reassembling toast records from the
+ * individual chunks stored in WAL. When a new (or initial) version of a
+ * tuple is stored in WAL it will always be preceded by the toast chunks
+ * emitted for the columns stored out of line. Within a single toplevel
+ * transaction there will be no other data carrying records between a row's
+ * toast chunks and the row data itself. See ReorderBufferToast* for
+ * details.
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+
+#include "access/rewriteheap.h"
+#include "access/transam.h"
+#include "access/tuptoaster.h"
+#include "access/xact.h"
+
+#include "catalog/catalog.h"
+
+#include "common/relpath.h"
+
+#include "lib/binaryheap.h"
+
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/slot.h"
+#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
+
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/sinval.h"
+
+#include "utils/builtins.h"
+#include "utils/combocid.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/relcache.h"
+#include "utils/relfilenodemap.h"
+#include "utils/tqual.h"
+
+/*
+ * For efficiency and simplicity reasons we want to keep Snapshots, CommandIds
+ * and ComboCids in the same list with the user visible INSERT/UPDATE/DELETE
+ * changes. We don't want to leak those internal values to external users
+ * though (they would just use switch()...default:) because that would make it
+ * harder to add new user-visible values.
+ *
+ * This needs to be synchronized with ReorderBufferChangeType! Adjust the
+ * StaticAssertExpr's in ReorderBufferAllocate if you add anything!
+ */
+typedef enum
+{
+ REORDER_BUFFER_CHANGE_INTERNAL_INSERT,
+ REORDER_BUFFER_CHANGE_INTERNAL_UPDATE,
+ REORDER_BUFFER_CHANGE_INTERNAL_DELETE,
+ REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT,
+ REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID,
+ REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID
+} ReorderBufferChangeTypeInternal;
+
+/* entry for a hash table we use to map from xid to our transaction state */
+typedef struct ReorderBufferTXNByIdEnt
+{
+ TransactionId xid;
+ ReorderBufferTXN *txn;
+} ReorderBufferTXNByIdEnt;
+
+/* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */
+typedef struct ReorderBufferTupleCidKey
+{
+ RelFileNode relnode;
+ ItemPointerData tid;
+} ReorderBufferTupleCidKey;
+
+typedef struct ReorderBufferTupleCidEnt
+{
+ ReorderBufferTupleCidKey key;
+ CommandId cmin;
+ CommandId cmax;
+ CommandId combocid; /* just for debugging */
+} ReorderBufferTupleCidEnt;
+
+/* k-way in-order change iteration support structures */
+typedef struct ReorderBufferIterTXNEntry
+{
+ XLogRecPtr lsn;
+ ReorderBufferChange *change;
+ ReorderBufferTXN *txn;
+ int fd;
+ XLogSegNo segno;
+} ReorderBufferIterTXNEntry;
+
+typedef struct ReorderBufferIterTXNState
+{
+ binaryheap *heap;
+ Size nr_txns;
+ dlist_head old_change;
+ ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
+} ReorderBufferIterTXNState;
+
+/* toast datastructures */
+typedef struct ReorderBufferToastEnt
+{
+ Oid chunk_id; /* toast_table.chunk_id */
+ int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
+ * have seen */
+ Size num_chunks; /* number of chunks we've already seen */
+ Size size; /* combined size of chunks seen */
+ dlist_head chunks; /* linked list of chunks */
+ struct varlena *reconstructed; /* reconstructed varlena now pointed
+ * to in main tup */
+} ReorderBufferToastEnt;
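For orientation: toast chunks of one column arrive strictly in sequence order within a transaction, and the entry above tracks how many chunks and bytes have been seen so the value can be reconstructed before the row change is handed to the output plugin. Below is an illustrative, self-contained sketch of that bookkeeping with hypothetical names; it is not code from this patch (the real code keeps a dlist of chunk changes and concatenates them later, in ReorderBufferToastReplace).

    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct ToastEntSketch
    {
        int     last_chunk_seq;     /* chunk_seq of the last chunk appended */
        size_t  num_chunks;         /* number of chunks seen so far */
        size_t  size;               /* combined size of all chunks */
        char   *reconstructed;      /* concatenated chunk data */
    } ToastEntSketch;

    static void
    toast_append_chunk(ToastEntSketch *ent, int chunk_seq,
                       const char *data, size_t len)
    {
        /* chunks arrive in order and without gaps */
        assert(ent->num_chunks == 0 || chunk_seq == ent->last_chunk_seq + 1);

        ent->reconstructed = realloc(ent->reconstructed, ent->size + len);
        memcpy(ent->reconstructed + ent->size, data, len);

        ent->size += len;
        ent->num_chunks++;
        ent->last_chunk_seq = chunk_seq;
    }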
+
+/* Disk serialization support datastructures */
+typedef struct ReorderBufferDiskChange
+{
+ Size size;
+ ReorderBufferChange change;
+ /* data follows */
+} ReorderBufferDiskChange;
+
+/*
+ * Maximum number of changes kept in memory, per transaction. After that,
+ * changes are spooled to disk.
+ *
+ * The current value should be sufficient to decode the entire transaction
+ * without hitting disk in OLTP workloads, while starting to spool to disk in
+ * other workloads reasonably fast.
+ *
+ * At some point in the future it probably makes sense to have a more elaborate
+ * resource management here, but it's not entirely clear what that would look
+ * like.
+ */
+static const Size max_changes_in_memory = 4096;
+
+/*
+ * We use a very simple form of a slab allocator for frequently allocated
+ * objects, simply keeping a fixed number in a linked list when unused,
+ * instead of pfree()ing them. Without that, in many workloads aset.c becomes a
+ * major bottleneck, especially when spilling to disk while decoding batch
+ * workloads.
+ */
+static const Size max_cached_changes = 4096 * 2;
+static const Size max_cached_tuplebufs = 4096 * 2; /* ~8MB */
+static const Size max_cached_transactions = 512;
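As a point of reference, the caching these constants bound follows the usual freelist pattern: returned objects are pushed onto a list up to a fixed cap and handed out again before any fresh allocation is made. A minimal, self-contained sketch with hypothetical names (the module itself uses dlist/slist nodes and palloc within rb->context rather than malloc):

    #include <stdlib.h>

    typedef struct FreeNodeSketch
    {
        struct FreeNodeSketch *next;
    } FreeNodeSketch;

    typedef struct SlabCacheSketch
    {
        FreeNodeSketch *head;       /* singly linked list of unused objects */
        size_t          nr_cached;
        size_t          max_cached; /* cf. max_cached_changes above */
        size_t          obj_size;
    } SlabCacheSketch;

    static void *
    slab_get(SlabCacheSketch *c)
    {
        if (c->head != NULL)
        {
            FreeNodeSketch *n = c->head;

            c->head = n->next;
            c->nr_cached--;
            return n;               /* reuse a cached object */
        }
        return malloc(c->obj_size); /* cache empty, allocate a fresh one */
    }

    static void
    slab_return(SlabCacheSketch *c, void *p)
    {
        if (c->nr_cached < c->max_cached)
        {
            FreeNodeSketch *n = (FreeNodeSketch *) p;

            n->next = c->head;
            c->head = n;
            c->nr_cached++;         /* keep it around for reuse */
        }
        else
            free(p);                /* cache full, really release it */
    }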
+
+
+/* ---------------------------------------
+ * primary reorderbuffer support routines
+ * ---------------------------------------
+ */
+static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
+static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
+ TransactionId xid, bool create, bool *is_new,
+ XLogRecPtr lsn, bool create_as_top);
+
+static void AssertTXNLsnOrder(ReorderBuffer *rb);
+
+/* ---------------------------------------
+ * support functions for lsn-order iterating over the ->changes of a
+ * transaction and its subtransactions
+ *
+ * used for iteration over the k-way heap merge of a transaction and its
+ * subtransactions
+ * ---------------------------------------
+ */
+static ReorderBufferIterTXNState *ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static ReorderBufferChange *
+ ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
+static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
+ ReorderBufferIterTXNState *state);
+static void ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn);
+
+/*
+ * ---------------------------------------
+ * Disk serialization support functions
+ * ---------------------------------------
+ */
+static void ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ int fd, ReorderBufferChange *change);
+static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ int *fd, XLogSegNo *segno);
+static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ char *change);
+static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
+
+static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
+static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
+ ReorderBufferTXN *txn, CommandId cid);
+
+/* ---------------------------------------
+ * toast reassembly support
+ * ---------------------------------------
+ */
+static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change);
+static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change);
+
+
+/*
+ * Allocate a new ReorderBuffer
+ */
+ReorderBuffer *
+ReorderBufferAllocate(void)
+{
+ ReorderBuffer *buffer;
+ HASHCTL hash_ctl;
+ MemoryContext new_ctx;
+
+ StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_INSERT == (int) REORDER_BUFFER_CHANGE_INSERT, "out of sync enums");
+ StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_UPDATE == (int) REORDER_BUFFER_CHANGE_UPDATE, "out of sync enums");
+ StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_DELETE == (int) REORDER_BUFFER_CHANGE_DELETE, "out of sync enums");
+
+ /* allocate memory in own context, to have better accountability */
+ new_ctx = AllocSetContextCreate(CurrentMemoryContext,
+ "ReorderBuffer",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
+ buffer =
+ (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
+
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+
+ buffer->context = new_ctx;
+
+ hash_ctl.keysize = sizeof(TransactionId);
+ hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = buffer->context;
+
+ buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+ buffer->by_txn_last_xid = InvalidTransactionId;
+ buffer->by_txn_last_txn = NULL;
+
+ buffer->nr_cached_transactions = 0;
+ buffer->nr_cached_changes = 0;
+ buffer->nr_cached_tuplebufs = 0;
+
+ buffer->outbuf = NULL;
+ buffer->outbufsize = 0;
+
+ buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
+
+ dlist_init(&buffer->toplevel_by_lsn);
+ dlist_init(&buffer->cached_transactions);
+ dlist_init(&buffer->cached_changes);
+ slist_init(&buffer->cached_tuplebufs);
+
+ return buffer;
+}
+
+/*
+ * Free a ReorderBuffer
+ */
+void
+ReorderBufferFree(ReorderBuffer *rb)
+{
+ MemoryContext context = rb->context;
+
+ /*
+ * We free separately allocated data by entirely scrapping reorderbuffer's
+ * memory context.
+ */
+ MemoryContextDelete(context);
+}
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferTXN.
+ */
+static ReorderBufferTXN *
+ReorderBufferGetTXN(ReorderBuffer *rb)
+{
+ ReorderBufferTXN *txn;
+
+ /* check the slab cache */
+ if (rb->nr_cached_transactions > 0)
+ {
+ rb->nr_cached_transactions--;
+ txn = (ReorderBufferTXN *)
+ dlist_container(ReorderBufferTXN, node,
+ dlist_pop_head_node(&rb->cached_transactions));
+ }
+ else
+ {
+ txn = (ReorderBufferTXN *)
+ MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
+ }
+
+ memset(txn, 0, sizeof(ReorderBufferTXN));
+
+ dlist_init(&txn->changes);
+ dlist_init(&txn->tuplecids);
+ dlist_init(&txn->subtxns);
+
+ return txn;
+}
+
+/*
+ * Free a ReorderBufferTXN.
+ *
+ * Deallocation might be delayed for efficiency purposes, for details check
+ * the comments above max_cached_changes's definition.
+ */
+void
+ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ /* clean the lookup cache if we were cached (quite likely) */
+ if (rb->by_txn_last_xid == txn->xid)
+ {
+ rb->by_txn_last_xid = InvalidTransactionId;
+ rb->by_txn_last_txn = NULL;
+ }
+
+ /* free data that's contained */
+
+ if (txn->tuplecid_hash != NULL)
+ {
+ hash_destroy(txn->tuplecid_hash);
+ txn->tuplecid_hash = NULL;
+ }
+
+ if (txn->invalidations)
+ {
+ pfree(txn->invalidations);
+ txn->invalidations = NULL;
+ }
+
+ /* check whether to put into the slab cache */
+ if (rb->nr_cached_transactions < max_cached_transactions)
+ {
+ rb->nr_cached_transactions++;
+ dlist_push_head(&rb->cached_transactions, &txn->node);
+ VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
+ VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
+ }
+ else
+ {
+ pfree(txn);
+ }
+}
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferChange.
+ */
+ReorderBufferChange *
+ReorderBufferGetChange(ReorderBuffer *rb)
+{
+ ReorderBufferChange *change;
+
+ /* check the slab cache */
+ if (rb->nr_cached_changes)
+ {
+ rb->nr_cached_changes--;
+ change = (ReorderBufferChange *)
+ dlist_container(ReorderBufferChange, node,
+ dlist_pop_head_node(&rb->cached_changes));
+ }
+ else
+ {
+ change = (ReorderBufferChange *)
+ MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
+ }
+
+ memset(change, 0, sizeof(ReorderBufferChange));
+ return change;
+}
+
+/*
+ * Free a ReorderBufferChange.
+ *
+ * Deallocation might be delayed for efficiency purposes, for details check
+ * the comments above max_cached_changes's definition.
+ */
+void
+ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
+{
+ /* free contained data */
+ switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+ {
+ case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+ case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+ case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+ if (change->tp.newtuple)
+ {
+ ReorderBufferReturnTupleBuf(rb, change->tp.newtuple);
+ change->tp.newtuple = NULL;
+ }
+
+ if (change->tp.oldtuple)
+ {
+ ReorderBufferReturnTupleBuf(rb, change->tp.oldtuple);
+ change->tp.oldtuple = NULL;
+ }
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+ if (change->snapshot)
+ {
+ ReorderBufferFreeSnap(rb, change->snapshot);
+ change->snapshot = NULL;
+ }
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+ break;
+ }
+
+ /* check whether to put into the slab cache */
+ if (rb->nr_cached_changes < max_cached_changes)
+ {
+ rb->nr_cached_changes++;
+ dlist_push_head(&rb->cached_changes, &change->node);
+ VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
+ VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
+ }
+ else
+ {
+ pfree(change);
+ }
+}
+
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferTupleBuf
+ */
+ReorderBufferTupleBuf *
+ReorderBufferGetTupleBuf(ReorderBuffer *rb)
+{
+ ReorderBufferTupleBuf *tuple;
+
+ /* check the slab cache */
+ if (rb->nr_cached_tuplebufs)
+ {
+ rb->nr_cached_tuplebufs--;
+ tuple = slist_container(ReorderBufferTupleBuf, node,
+ slist_pop_head_node(&rb->cached_tuplebufs));
+#ifdef USE_ASSERT_CHECKING
+ memset(tuple, 0xdeadbeef, sizeof(ReorderBufferTupleBuf));
+#endif
+ }
+ else
+ {
+ tuple = (ReorderBufferTupleBuf *)
+ MemoryContextAlloc(rb->context, sizeof(ReorderBufferTupleBuf));
+ }
+
+ return tuple;
+}
+
+/*
+ * Free a ReorderBufferTupleBuf.
+ *
+ * Deallocation might be delayed for efficiency purposes, for details check
+ * the comments above max_cached_changes's definition.
+ */
+void
+ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
+{
+ /* check whether to put into the slab cache */
+ if (rb->nr_cached_tuplebufs < max_cached_tuplebufs)
+ {
+ rb->nr_cached_tuplebufs++;
+ slist_push_head(&rb->cached_tuplebufs, &tuple->node);
+ VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
+ VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
+ }
+ else
+ {
+ pfree(tuple);
+ }
+}
+
+/*
+ * Return the ReorderBufferTXN from the given buffer, specified by Xid.
+ * If create is true, and a transaction doesn't already exist, create it
+ * (with the given LSN, and as top transaction if that's specified);
+ * when this happens, is_new is set to true.
+ */
+static ReorderBufferTXN *
+ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
+ bool *is_new, XLogRecPtr lsn, bool create_as_top)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferTXNByIdEnt *ent;
+ bool found;
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(!create || lsn != InvalidXLogRecPtr);
+
+ /*
+ * Check the one-entry lookup cache first
+ */
+ if (TransactionIdIsValid(rb->by_txn_last_xid) &&
+ rb->by_txn_last_xid == xid)
+ {
+ txn = rb->by_txn_last_txn;
+
+ if (txn != NULL)
+ {
+ /* found it, and it's valid */
+ if (is_new)
+ *is_new = false;
+ return txn;
+ }
+
+ /*
+ * Cached as non-existent, and asked not to create? Then there's nothing
+ * else to do.
+ */
+ if (!create)
+ return NULL;
+ /* otherwise fall through to create it */
+ }
+
+ /*
+ * The cache wasn't hit, or it yielded a "does not exist" result and we
+ * want to create an entry, so do a full lookup.
+ */
+
+ /* search the lookup table */
+ ent = (ReorderBufferTXNByIdEnt *)
+ hash_search(rb->by_txn,
+ (void *) &xid,
+ create ? HASH_ENTER : HASH_FIND,
+ &found);
+ if (found)
+ txn = ent->txn;
+ else if (create)
+ {
+ /* initialize the new entry, if creation was requested */
+ Assert(ent != NULL);
+
+ ent->txn = ReorderBufferGetTXN(rb);
+ ent->txn->xid = xid;
+ txn = ent->txn;
+ txn->first_lsn = lsn;
+ txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
+
+ if (create_as_top)
+ {
+ dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
+ AssertTXNLsnOrder(rb);
+ }
+ }
+ else
+ txn = NULL; /* not found and not asked to create */
+
+ /* update cache */
+ rb->by_txn_last_xid = xid;
+ rb->by_txn_last_txn = txn;
+
+ if (is_new)
+ *is_new = !found;
+
+ Assert(!create || !!txn);
+ return txn;
+}
+
+/*
+ * Queue a change into a transaction so it can be replayed upon commit.
+ */
+void
+ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
+ ReorderBufferChange *change)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+ change->lsn = lsn;
+ Assert(InvalidXLogRecPtr != lsn);
+ dlist_push_tail(&txn->changes, &change->node);
+ txn->nentries++;
+ txn->nentries_mem++;
+
+ ReorderBufferCheckSerializeTXN(rb, txn);
+}
+
+static void
+AssertTXNLsnOrder(ReorderBuffer *rb)
+{
+#ifdef USE_ASSERT_CHECKING
+ dlist_iter iter;
+ XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
+
+ dlist_foreach(iter, &rb->toplevel_by_lsn)
+ {
+ ReorderBufferTXN *cur_txn;
+
+ cur_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
+ Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
+
+ if (cur_txn->end_lsn != InvalidXLogRecPtr)
+ Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
+
+ if (prev_first_lsn != InvalidXLogRecPtr)
+ Assert(prev_first_lsn < cur_txn->first_lsn);
+
+ Assert(!cur_txn->is_known_as_subxact);
+ prev_first_lsn = cur_txn->first_lsn;
+ }
+#endif
+}
+
+ReorderBufferTXN *
+ReorderBufferGetOldestTXN(ReorderBuffer *rb)
+{
+ ReorderBufferTXN *txn;
+
+ if (dlist_is_empty(&rb->toplevel_by_lsn))
+ return NULL;
+
+ AssertTXNLsnOrder(rb);
+
+ txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
+
+ Assert(!txn->is_known_as_subxact);
+ Assert(txn->first_lsn != InvalidXLogRecPtr);
+ return txn;
+}
+
+void
+ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
+{
+ rb->current_restart_decoding_lsn = ptr;
+}
+
+void
+ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
+ TransactionId subxid, XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferTXN *subtxn;
+ bool new_top;
+ bool new_sub;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
+ subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
+
+ if (new_sub)
+ {
+ /*
+ * We assign subtransactions to the toplevel transaction even if we don't
+ * have data for them yet; assignment records frequently reference xids
+ * that have not yet produced any records. Knowing those aren't toplevel
+ * xids allows us to make processing cheaper in some places.
+ */
+ dlist_push_tail(&txn->subtxns, &subtxn->node);
+ txn->nsubtxns++;
+ }
+ else if (!subtxn->is_known_as_subxact)
+ {
+ subtxn->is_known_as_subxact = true;
+ Assert(subtxn->nsubtxns == 0);
+
+ /* remove from lsn order list of top-level transactions */
+ dlist_delete(&subtxn->node);
+
+ /* add to toplevel transaction */
+ dlist_push_tail(&txn->subtxns, &subtxn->node);
+ txn->nsubtxns++;
+ }
+ else if (new_top)
+ {
+ elog(ERROR, "existing subxact assigned to unknown toplevel xact");
+ }
+}
+
+/*
+ * Associate a subtransaction with its toplevel transaction at commit
+ * time. There may be no further changes added after this.
+ */
+void
+ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
+ TransactionId subxid, XLogRecPtr commit_lsn,
+ XLogRecPtr end_lsn)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferTXN *subtxn;
+
+ subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
+ InvalidXLogRecPtr, false);
+
+ /*
+ * No need to do anything if that subtxn didn't contain any changes
+ */
+ if (!subtxn)
+ return;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, true);
+
+ if (txn == NULL)
+ elog(ERROR, "subxact logged without previous toplevel record");
+
+ /*
+ * Pass our base snapshot to the parent transaction if it doesn't have
+ * one, or if ours is older. That can happen if there are no changes in the
+ * toplevel transaction but there are in one of the child transactions. This
+ * allows the parent to simply use its base snapshot initially.
+ */
+ if (txn->base_snapshot == NULL ||
+ txn->base_snapshot_lsn > subtxn->base_snapshot_lsn)
+ {
+ txn->base_snapshot = subtxn->base_snapshot;
+ txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
+ subtxn->base_snapshot = NULL;
+ subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
+ }
+
+ subtxn->final_lsn = commit_lsn;
+ subtxn->end_lsn = end_lsn;
+
+ if (!subtxn->is_known_as_subxact)
+ {
+ subtxn->is_known_as_subxact = true;
+ Assert(subtxn->nsubtxns == 0);
+
+ /* remove from lsn order list of top-level transactions */
+ dlist_delete(&subtxn->node);
+
+ /* add to subtransaction list */
+ dlist_push_tail(&txn->subtxns, &subtxn->node);
+ txn->nsubtxns++;
+ }
+}
+
+
+/*
+ * Support for efficiently iterating over a transaction's and its
+ * subtransactions' changes.
+ *
+ * We do this by doing a k-way merge between transactions/subtransactions. For that
+ * we model the current heads of the different transactions as a binary heap
+ * so we easily know which (sub-)transaction has the change with the smallest
+ * lsn next.
+ *
+ * We assume the changes in individual transactions are already sorted by LSN.
+ */
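To make the scheme concrete, here is an illustrative, self-contained k-way merge over already-sorted change streams, with hypothetical names; it is not part of the patch. The real code below keys a binary heap on each stream's head LSN so the minimum can be found in O(log n), whereas this sketch just uses a linear scan for brevity:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct StreamSketch
    {
        const uint64_t *lsns;       /* ascending LSNs of one (sub-)transaction */
        int             nleft;      /* changes remaining in this stream */
    } StreamSketch;

    static void
    kway_merge(StreamSketch *streams, int nstreams)
    {
        for (;;)
        {
            int     best = -1;
            int     i;

            /* pick the stream whose next change has the smallest LSN */
            for (i = 0; i < nstreams; i++)
                if (streams[i].nleft > 0 &&
                    (best == -1 || streams[i].lsns[0] < streams[best].lsns[0]))
                    best = i;

            if (best == -1)
                break;              /* every stream is exhausted */

            printf("replay change at LSN %llu\n",
                   (unsigned long long) streams[best].lsns[0]);
            streams[best].lsns++;
            streams[best].nleft--;
        }
    }

    int
    main(void)
    {
        uint64_t    a[] = {10, 40, 70};
        uint64_t    b[] = {20, 30};
        StreamSketch streams[] = {{a, 3}, {b, 2}};

        kway_merge(streams, 2);     /* prints 10, 20, 30, 40, 70 */
        return 0;
    }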
+
+/*
+ * Binary heap comparison function.
+ */
+static int
+ReorderBufferIterCompare(Datum a, Datum b, void *arg)
+{
+ ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
+ XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
+ XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
+
+ if (pos_a < pos_b)
+ return 1;
+ else if (pos_a == pos_b)
+ return 0;
+ return -1;
+}
+
+/*
+ * Allocate & initialize an iterator which iterates in lsn order over a
+ * transaction and all its subtransactions.
+ */
+static ReorderBufferIterTXNState *
+ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ Size nr_txns = 0;
+ ReorderBufferIterTXNState *state;
+ dlist_iter cur_txn_i;
+ int32 off;
+
+ /*
+ * Calculate the size of our heap: one element for every transaction that
+ * contains changes. (Besides the transactions already in the reorder
+ * buffer, we count the one we were directly passed.)
+ */
+ if (txn->nentries > 0)
+ nr_txns++;
+
+ dlist_foreach(cur_txn_i, &txn->subtxns)
+ {
+ ReorderBufferTXN *cur_txn;
+
+ cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
+
+ if (cur_txn->nentries > 0)
+ nr_txns++;
+ }
+
+ /*
+ * TODO: Consider adding fastpath for the rather common nr_txns=1 case, no
+ * need to allocate/build a heap then.
+ */
+
+ /* allocate iteration state */
+ state = (ReorderBufferIterTXNState *)
+ MemoryContextAllocZero(rb->context,
+ sizeof(ReorderBufferIterTXNState) +
+ sizeof(ReorderBufferIterTXNEntry) * nr_txns);
+
+ state->nr_txns = nr_txns;
+ dlist_init(&state->old_change);
+
+ for (off = 0; off < state->nr_txns; off++)
+ {
+ state->entries[off].fd = -1;
+ state->entries[off].segno = 0;
+ }
+
+ /* allocate heap */
+ state->heap = binaryheap_allocate(state->nr_txns,
+ ReorderBufferIterCompare,
+ state);
+
+ /*
+ * Now insert items into the binary heap, in an unordered fashion. (We
+ * will run a heap assembly step at the end; this is more efficient.)
+ */
+
+ off = 0;
+
+ /* add toplevel transaction if it contains changes */
+ if (txn->nentries > 0)
+ {
+ ReorderBufferChange *cur_change;
+
+ if (txn->nentries != txn->nentries_mem)
+ ReorderBufferRestoreChanges(rb, txn, &state->entries[off].fd,
+ &state->entries[off].segno);
+
+ cur_change = dlist_head_element(ReorderBufferChange, node,
+ &txn->changes);
+
+ state->entries[off].lsn = cur_change->lsn;
+ state->entries[off].change = cur_change;
+ state->entries[off].txn = txn;
+
+ binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
+ }
+
+ /* add subtransactions if they contain changes */
+ dlist_foreach(cur_txn_i, &txn->subtxns)
+ {
+ ReorderBufferTXN *cur_txn;
+
+ cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
+
+ if (cur_txn->nentries > 0)
+ {
+ ReorderBufferChange *cur_change;
+
+ if (cur_txn->nentries != cur_txn->nentries_mem)
+ ReorderBufferRestoreChanges(rb, cur_txn,
+ &state->entries[off].fd,
+ &state->entries[off].segno);
+
+ cur_change = dlist_head_element(ReorderBufferChange, node,
+ &cur_txn->changes);
+
+ state->entries[off].lsn = cur_change->lsn;
+ state->entries[off].change = cur_change;
+ state->entries[off].txn = cur_txn;
+
+ binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
+ }
+ }
+
+ /* assemble a valid binary heap */
+ binaryheap_build(state->heap);
+
+ return state;
+}
+
+/*
+ * Return the next change when iterating over a transaction and its
+ * subtransactions.
+ *
+ * Returns NULL when no further changes exist.
+ */
+static ReorderBufferChange *
+ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
+{
+ ReorderBufferChange *change;
+ ReorderBufferIterTXNEntry *entry;
+ int32 off;
+
+ /* nothing there anymore */
+ if (state->heap->bh_size == 0)
+ return NULL;
+
+ off = DatumGetInt32(binaryheap_first(state->heap));
+ entry = &state->entries[off];
+
+ /* free memory we might have "leaked" in the previous *Next call */
+ if (!dlist_is_empty(&state->old_change))
+ {
+ change = dlist_container(ReorderBufferChange, node,
+ dlist_pop_head_node(&state->old_change));
+ ReorderBufferReturnChange(rb, change);
+ Assert(dlist_is_empty(&state->old_change));
+ }
+
+ change = entry->change;
+
+ /*
+ * update heap with information about which transaction has the next
+ * relevant change in LSN order
+ */
+
+ /* there are in-memory changes */
+ if (dlist_has_next(&entry->txn->changes, &entry->change->node))
+ {
+ dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
+ ReorderBufferChange *next_change =
+ dlist_container(ReorderBufferChange, node, next);
+
+ /* txn stays the same */
+ state->entries[off].lsn = next_change->lsn;
+ state->entries[off].change = next_change;
+
+ binaryheap_replace_first(state->heap, Int32GetDatum(off));
+ return change;
+ }
+
+ /* try to load changes from disk */
+ if (entry->txn->nentries != entry->txn->nentries_mem)
+ {
+ /*
+ * Ugly: restoring changes will reuse *Change records, so delete the
+ * current one from the per-tx list and only free it in the next call.
+ */
+ dlist_delete(&change->node);
+ dlist_push_tail(&state->old_change, &change->node);
+
+ if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->fd,
+ &state->entries[off].segno))
+ {
+ /* successfully restored changes from disk */
+ ReorderBufferChange *next_change =
+ dlist_head_element(ReorderBufferChange, node,
+ &entry->txn->changes);
+
+ elog(DEBUG2, "restored %u/%u changes from disk",
+ (uint32) entry->txn->nentries_mem,
+ (uint32) entry->txn->nentries);
+
+ Assert(entry->txn->nentries_mem);
+ /* txn stays the same */
+ state->entries[off].lsn = next_change->lsn;
+ state->entries[off].change = next_change;
+ binaryheap_replace_first(state->heap, Int32GetDatum(off));
+
+ return change;
+ }
+ }
+
+ /* ok, no changes there anymore, remove */
+ binaryheap_remove_first(state->heap);
+
+ return change;
+}
+
+/*
+ * Deallocate the iterator
+ */
+static void
+ReorderBufferIterTXNFinish(ReorderBuffer *rb,
+ ReorderBufferIterTXNState *state)
+{
+ int32 off;
+
+ for (off = 0; off < state->nr_txns; off++)
+ {
+ if (state->entries[off].fd != -1)
+ CloseTransientFile(state->entries[off].fd);
+ }
+
+ /* free memory we might have "leaked" in the last *Next call */
+ if (!dlist_is_empty(&state->old_change))
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node,
+ dlist_pop_head_node(&state->old_change));
+ ReorderBufferReturnChange(rb, change);
+ Assert(dlist_is_empty(&state->old_change));
+ }
+
+ binaryheap_free(state->heap);
+ pfree(state);
+}
+
+/*
+ * Cleanup the contents of a transaction, usually after the transaction
+ * committed or aborted.
+ */
+static void
+ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ bool found;
+ dlist_mutable_iter iter;
+
+ /* cleanup subtransactions & their changes */
+ dlist_foreach_modify(iter, &txn->subtxns)
+ {
+ ReorderBufferTXN *subtxn;
+
+ subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
+
+ /*
+ * Subtransactions are always associated with the toplevel TXN, even if
+ * they originally were happening inside another subtxn, so we won't
+ * ever recurse more than one level deep here.
+ */
+ Assert(subtxn->is_known_as_subxact);
+ Assert(subtxn->nsubtxns == 0);
+
+ ReorderBufferCleanupTXN(rb, subtxn);
+ }
+
+ /* cleanup changes in the toplevel txn */
+ dlist_foreach_modify(iter, &txn->changes)
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, iter.cur);
+
+ ReorderBufferReturnChange(rb, change);
+ }
+
+ /*
+ * Cleanup the tuplecids we stored for decoding catalog snapshot
+ * access. They are always stored in the toplevel transaction.
+ */
+ dlist_foreach_modify(iter, &txn->tuplecids)
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, iter.cur);
+ Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
+ ReorderBufferReturnChange(rb, change);
+ }
+
+ if (txn->base_snapshot != NULL)
+ {
+ SnapBuildSnapDecRefcount(txn->base_snapshot);
+ txn->base_snapshot = NULL;
+ txn->base_snapshot_lsn = InvalidXLogRecPtr;
+ }
+
+ /* delete from list of known subxacts */
+ if (txn->is_known_as_subxact)
+ {
+ /* NB: nsubxacts count of parent will be too high now */
+ dlist_delete(&txn->node);
+ }
+ /* delete from LSN ordered list of toplevel TXNs */
+ else
+ {
+ dlist_delete(&txn->node);
+ }
+
+ /* now remove reference from buffer */
+ hash_search(rb->by_txn,
+ (void *) &txn->xid,
+ HASH_REMOVE,
+ &found);
+ Assert(found);
+
+ /* remove entries spilled to disk */
+ if (txn->nentries != txn->nentries_mem)
+ ReorderBufferRestoreCleanup(rb, txn);
+
+ /* deallocate */
+ ReorderBufferReturnTXN(rb, txn);
+}
+
+/*
+ * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by
+ * tqual.c's HeapTupleSatisfiesHistoricMVCC.
+ */
+static void
+ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ dlist_iter iter;
+ HASHCTL hash_ctl;
+
+ if (!txn->has_catalog_changes || dlist_is_empty(&txn->tuplecids))
+ return;
+
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+
+ hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
+ hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = rb->context;
+
+ /*
+ * create the hash with the exact number of to-be-stored tuplecids from
+ * the start
+ */
+ txn->tuplecid_hash =
+ hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+ dlist_foreach(iter, &txn->tuplecids)
+ {
+ ReorderBufferTupleCidKey key;
+ ReorderBufferTupleCidEnt *ent;
+ bool found;
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, iter.cur);
+
+ Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
+
+ /* be careful about padding */
+ memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
+
+ key.relnode = change->tuplecid.node;
+
+ ItemPointerCopy(&change->tuplecid.tid,
+ &key.tid);
+
+ ent = (ReorderBufferTupleCidEnt *)
+ hash_search(txn->tuplecid_hash,
+ (void *) &key,
+ HASH_ENTER | HASH_FIND,
+ &found);
+ if (!found)
+ {
+ ent->cmin = change->tuplecid.cmin;
+ ent->cmax = change->tuplecid.cmax;
+ ent->combocid = change->tuplecid.combocid;
+ }
+ else
+ {
+ Assert(ent->cmin == change->tuplecid.cmin);
+ Assert(ent->cmax == InvalidCommandId ||
+ ent->cmax == change->tuplecid.cmax);
+
+ /*
+ * If the tuple became valid in this transaction and now got deleted,
+ * we already have a valid cmin stored. The cmax will be
+ * InvalidCommandId though.
+ */
+ ent->cmax = change->tuplecid.cmax;
+ }
+ }
+}
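The "be careful about padding" step above matters because ReorderBufferTupleCidKey is hashed and compared as raw bytes by dynahash: any compiler-inserted padding must be zeroed before the fields are assigned, or two logically equal keys can hash differently. A minimal, self-contained illustration of the pattern, with hypothetical names and a stand-in hash function:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    typedef struct KeySketch
    {
        uint16_t    a;              /* typically followed by 2 padding bytes */
        uint32_t    b;
    } KeySketch;

    /* stand-in for tag_hash: FNV-1a over the raw key bytes */
    static uint32_t
    hash_bytes_sketch(const void *p, size_t len)
    {
        const unsigned char *c = (const unsigned char *) p;
        uint32_t    h = 2166136261u;
        size_t      i;

        for (i = 0; i < len; i++)
            h = (h ^ c[i]) * 16777619u;
        return h;
    }

    static uint32_t
    make_key_hash(uint16_t a, uint32_t b)
    {
        KeySketch   key;

        memset(&key, 0, sizeof(key));   /* zero any padding bytes first */
        key.a = a;
        key.b = b;
        return hash_bytes_sketch(&key, sizeof(key));
    }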
+
+/*
+ * Copy a provided snapshot so we can modify it privately. This is needed so
+ * that catalog modifying transactions can look into intermediate catalog
+ * states.
+ */
+static Snapshot
+ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
+ ReorderBufferTXN *txn, CommandId cid)
+{
+ Snapshot snap;
+ dlist_iter iter;
+ int i = 0;
+ Size size;
+
+ size = sizeof(SnapshotData) +
+ sizeof(TransactionId) * orig_snap->xcnt +
+ sizeof(TransactionId) * (txn->nsubtxns + 1);
+
+ snap = MemoryContextAllocZero(rb->context, size);
+ memcpy(snap, orig_snap, sizeof(SnapshotData));
+
+ snap->copied = true;
+ snap->active_count = 0;
+ snap->regd_count = 1;
+ snap->xip = (TransactionId *) (snap + 1);
+
+ memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
+
+ /*
+ * snap->subxip contains all txids that belong to our transaction which we
+ * need to check via cmin/cmax. That's why we store the toplevel
+ * transaction in there as well.
+ */
+ snap->subxip = snap->xip + snap->xcnt;
+ snap->subxip[i++] = txn->xid;
+
+ /*
+ * nsubxcnt isn't decreased when subtransactions abort, so count
+ * manually. Since it's an upper boundary it is safe to use it for the
+ * allocation above.
+ */
+ snap->subxcnt = 1;
+
+ dlist_foreach(iter, &txn->subtxns)
+ {
+ ReorderBufferTXN *sub_txn;
+
+ sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
+ snap->subxip[i++] = sub_txn->xid;
+ snap->subxcnt++;
+ }
+
+ /* sort so we can bsearch() later */
+ qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
+
+ /* store the specified current CommandId */
+ snap->curcid = cid;
+
+ return snap;
+}
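One detail worth noting in the copy above: the snapshot and its xid arrays live in a single allocation, with snap->xip pointed just past the SnapshotData struct, so a plain pfree() of the copied snapshot releases everything at once. A minimal, self-contained sketch of that layout with hypothetical names:

    #include <stdlib.h>
    #include <string.h>

    typedef unsigned int XidSketch;

    typedef struct SnapSketch
    {
        int         xcnt;
        XidSketch  *xip;            /* points just past the struct itself */
    } SnapSketch;

    static SnapSketch *
    snap_copy(const XidSketch *xids, int xcnt)
    {
        /* one allocation holds the struct plus its trailing xid array
         * (error handling omitted for brevity) */
        SnapSketch *snap = calloc(1, sizeof(SnapSketch) +
                                  sizeof(XidSketch) * xcnt);

        snap->xcnt = xcnt;
        snap->xip = (XidSketch *) (snap + 1);
        memcpy(snap->xip, xids, sizeof(XidSketch) * xcnt);
        return snap;                /* a single free(snap) releases everything */
    }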
+
+/*
+ * Free a previously ReorderBufferCopySnap'ed snapshot
+ */
+static void
+ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
+{
+ if (snap->copied)
+ pfree(snap);
+ else
+ SnapBuildSnapDecRefcount(snap);
+}
+
+/*
+ * Perform the replay of a transaction and its non-aborted subtransactions.
+ *
+ * Subtransactions previously have to be processed by
+ * ReorderBufferCommitChild(), even if previously assigned to the toplevel
+ * transaction with ReorderBufferAssignChild.
+ *
+ * We currently can only decode a transaction's contents when its commit
+ * record is read, because that's currently the only place where we know about
+ * cache invalidations. Thus, once a toplevel commit is read, we iterate over
+ * the top and subtransactions (using a k-way merge) and replay the changes in
+ * lsn order.
+ */
+void
+ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
+ TimestampTz commit_time)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferIterTXNState *iterstate = NULL;
+ ReorderBufferChange *change;
+
+ volatile CommandId command_id = FirstCommandId;
+ volatile Snapshot snapshot_now = NULL;
+ volatile bool txn_started = false;
+ volatile bool subtxn_started = false;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* unknown transaction, nothing to replay */
+ if (txn == NULL)
+ return;
+
+ txn->final_lsn = commit_lsn;
+ txn->end_lsn = end_lsn;
+ txn->commit_time = commit_time;
+
+ /* serialize the remaining changes if we already had to spill some earlier */
+ if (txn->nentries_mem != txn->nentries)
+ ReorderBufferSerializeTXN(rb, txn);
+
+ /*
+ * If this transaction didn't have any real changes in our database, it's
+ * OK not to have a snapshot. Note that ReorderBufferCommitChild will have
+ * transferred its snapshot to this transaction if it had one and the
+ * toplevel tx didn't.
+ */
+ if (txn->base_snapshot == NULL)
+ {
+ Assert(txn->ninvalidations == 0);
+ ReorderBufferCleanupTXN(rb, txn);
+ return;
+ }
+
+ snapshot_now = txn->base_snapshot;
+
+ /* build data to be able to lookup the CommandIds of catalog tuples */
+ ReorderBufferBuildTupleCidHash(rb, txn);
+
+ /* setup the initial snapshot */
+ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
+
+ PG_TRY();
+ {
+ txn_started = false;
+
+ /*
+ * Decoding needs access to syscaches et al., which in turn use
+ * heavyweight locks and such. Thus we need to have enough state around
+ * to keep track of those. The easiest way is to simply use a
+ * transaction internally. That also allows us to easily enforce that
+ * nothing writes to the database by checking for xid assignments.
+ *
+ * When we're called via the SQL SRF there's already a transaction
+ * started, so start an explicit subtransaction there.
+ */
+ if (IsTransactionOrTransactionBlock())
+ {
+ BeginInternalSubTransaction("replay");
+ subtxn_started = true;
+ }
+ else
+ {
+ StartTransactionCommand();
+ txn_started = true;
+ }
+
+ rb->begin(rb, txn);
+
+ iterstate = ReorderBufferIterTXNInit(rb, txn);
+ while ((change = ReorderBufferIterTXNNext(rb, iterstate)))
+ {
+ Relation relation = NULL;
+ Oid reloid;
+
+ switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+ {
+ case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+ case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+ case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+ Assert(snapshot_now);
+
+ reloid = RelidByRelfilenode(change->tp.relnode.spcNode,
+ change->tp.relnode.relNode);
+
+ /*
+ * Catalog tuple without data, emitted while catalog was
+ * in the process of being rewritten.
+ */
+ if (reloid == InvalidOid &&
+ change->tp.newtuple == NULL &&
+ change->tp.oldtuple == NULL)
+ continue;
+ else if (reloid == InvalidOid)
+ elog(ERROR, "could not lookup relation %s",
+ relpathperm(change->tp.relnode, MAIN_FORKNUM));
+
+ relation = RelationIdGetRelation(reloid);
+
+ if (relation == NULL)
+ elog(ERROR, "could open relation descriptor %s",
+ relpathperm(change->tp.relnode, MAIN_FORKNUM));
+
+ if (RelationIsLogicallyLogged(relation))
+ {
+ /*
+ * For now ignore sequence changes entirely. Most of
+ * the time they don't log changes using records we
+ * understand, so it doesn't make sense to handle the
+ * few cases we do.
+ */
+ if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
+ {
+ }
+ /* user-triggered change */
+ else if (!IsToastRelation(relation))
+ {
+ ReorderBufferToastReplace(rb, txn, relation, change);
+ rb->apply_change(rb, txn, relation, change);
+ ReorderBufferToastReset(rb, txn);
+ }
+ /* we're not interested in toast deletions */
+ else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
+ {
+ /*
+ * Need to reassemble the full toasted Datum in
+ * memory, to ensure the chunks don't get reused
+ * till we're done; remove it from the list of
+ * this transaction's changes. Otherwise it will
+ * get freed/reused while restoring spooled data
+ * from disk.
+ */
+ dlist_delete(&change->node);
+ ReorderBufferToastAppendChunk(rb, txn, relation,
+ change);
+ }
+
+ }
+ RelationClose(relation);
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+ /* get rid of the old */
+ TeardownHistoricSnapshot(false);
+
+ if (snapshot_now->copied)
+ {
+ ReorderBufferFreeSnap(rb, snapshot_now);
+ snapshot_now =
+ ReorderBufferCopySnap(rb, change->snapshot,
+ txn, command_id);
+ }
+ /*
+ * Restored from disk, need to be careful not to double
+ * free. We could introduce refcounting for that, but for
+ * now this seems infrequent enough not to care.
+ */
+ else if (change->snapshot->copied)
+ {
+ snapshot_now =
+ ReorderBufferCopySnap(rb, change->snapshot,
+ txn, command_id);
+ }
+ else
+ {
+ snapshot_now = change->snapshot;
+ }
+
+
+ /* and continue with the new one */
+ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
+ break;
+
+ case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+ Assert(change->command_id != InvalidCommandId);
+
+ if (command_id < change->command_id)
+ {
+ command_id = change->command_id;
+
+ if (!snapshot_now->copied)
+ {
+ /* we don't use the global one anymore */
+ snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
+ txn, command_id);
+ }
+
+ snapshot_now->curcid = command_id;
+
+ TeardownHistoricSnapshot(false);
+ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
+
+ /*
+ * Every time the CommandId is incremented, we could
+ * see new catalog contents, so execute all
+ * invalidations.
+ */
+ ReorderBufferExecuteInvalidations(rb, txn);
+ }
+
+ break;
+
+ case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+ elog(ERROR, "tuplecid value in changequeue");
+ break;
+ }
+ }
+
+ ReorderBufferIterTXNFinish(rb, iterstate);
+
+ /* call commit callback */
+ rb->commit(rb, txn, commit_lsn);
+
+ /* this is just a sanity check against bad output plugin behaviour */
+ if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
+ elog(ERROR, "output plugin used xid %u",
+ GetCurrentTransactionId());
+
+ /* make sure there's no cache pollution */
+ ReorderBufferExecuteInvalidations(rb, txn);
+
+ /* cleanup */
+ TeardownHistoricSnapshot(false);
+
+ /*
+ * Aborting the current (sub-)transaction as a whole has the right
+ * semantics. We want all locks acquired in here to be released, not
+ * reassigned to the parent, and we do not want any database access to
+ * have persistent effects.
+ */
+ if (subtxn_started)
+ RollbackAndReleaseCurrentSubTransaction();
+ else if (txn_started)
+ AbortCurrentTransaction();
+
+ if (snapshot_now->copied)
+ ReorderBufferFreeSnap(rb, snapshot_now);
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+ }
+ PG_CATCH();
+ {
+ /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
+ if (iterstate)
+ ReorderBufferIterTXNFinish(rb, iterstate);
+
+ TeardownHistoricSnapshot(true);
+
+ if (snapshot_now->copied)
+ ReorderBufferFreeSnap(rb, snapshot_now);
+
+ if (subtxn_started)
+ RollbackAndReleaseCurrentSubTransaction();
+ else if (txn_started)
+ AbortCurrentTransaction();
+
+ /*
+ * Invalidations in aborted transactions aren't allowed to do catalog
+ * access, so we don't need to still have the snapshot set up.
+ */
+ ReorderBufferExecuteInvalidations(rb, txn);
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+}
+
+/*
+ * Abort a transaction that possibly has previous changes. Needs to be first
+ * called for subtransactions and then for the toplevel xid.
+ *
+ * NB: Transactions handled here have to have actively aborted (i.e. have
+ * produced an abort record). Implicitly aborted transactions are handled via
+ * ReorderBufferAbortOld(); transactions we're just not interested in, but
+ * which have committed are handled in ReorderBufferForget().
+ *
+ * This function purges this transaction and its contents from memory and
+ * disk.
+ */
+void
+ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* unknown, nothing to remove */
+ if (txn == NULL)
+ return;
+
+ /* cosmetic... */
+ txn->final_lsn = lsn;
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+}
+
+/*
+ * Abort all transactions that aren't actually running anymore because the
+ * server restarted.
+ *
+ * NB: These really have to be transactions that have aborted due to a server
+ * crash/immediate restart, as we don't deal with invalidations here.
+ */
+void
+ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
+{
+ dlist_mutable_iter it;
+
+ /*
+ * Iterate through all (potential) toplevel TXNs and abort all that are
+ * older than what possibly can be running. Once we've found the first one
+ * that is still alive we stop; there might be some that acquired an xid
+ * earlier but started writing later, but that's unlikely and they will be
+ * cleaned up in a later call to ReorderBufferAbortOld().
+ */
+ dlist_foreach_modify(it, &rb->toplevel_by_lsn)
+ {
+ ReorderBufferTXN * txn;
+
+ txn = dlist_container(ReorderBufferTXN, node, it.cur);
+
+ if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
+ {
+ elog(DEBUG1, "aborting old transaction %u", txn->xid);
+
+ /* remove potential on-disk data, and deallocate this tx */
+ ReorderBufferCleanupTXN(rb, txn);
+ }
+ else
+ return;
+ }
+}
+
+/*
+ * Forget the contents of a transaction if we aren't interested in its
+ * contents. Needs to be first called for subtransactions and then for the
+ * toplevel xid.
+ *
+ * This is significantly different from ReorderBufferAbort() because
+ * transactions that have committed need to be treated differently from aborted
+ * ones since they may have modified the catalog.
+ *
+ * Note that this is only allowed to be called in the moment a transaction
+ * commit has just been read, not earlier; otherwise later records referring
+ * to this xid might re-create the transaction incompletely.
+ */
+void
+ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* unknown, nothing to forget */
+ if (txn == NULL)
+ return;
+
+ /* cosmetic... */
+ txn->final_lsn = lsn;
+
+ /*
+ * Process cache invalidation messages if there are any. Even if we're
+ * not interested in the transaction's contents, it could have manipulated
+ * the catalog and we need to update the caches according to that.
+ */
+ if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
+ {
+ /* setup snapshot to perform the invalidations in */
+ SetupHistoricSnapshot(txn->base_snapshot, txn->tuplecid_hash);
+ PG_TRY();
+ {
+ ReorderBufferExecuteInvalidations(rb, txn);
+ TeardownHistoricSnapshot(false);
+ }
+ PG_CATCH();
+ {
+ /* cleanup */
+ TeardownHistoricSnapshot(true);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+ }
+ else
+ Assert(txn->ninvalidations == 0);
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+}
+
+
+/*
+ * Check whether a transaction is already known to this module.
+ */
+bool
+ReorderBufferIsXidKnown(ReorderBuffer *rb, TransactionId xid)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+ return txn != NULL;
+}
+
+/*
+ * Add a new snapshot to this transaction that may only be used after lsn 'lsn'
+ * because the previous snapshot doesn't describe the catalog correctly for
+ * following rows.
+ */
+void
+ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, Snapshot snap)
+{
+ ReorderBufferChange *change = ReorderBufferGetChange(rb);
+
+ change->snapshot = snap;
+ change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
+
+ ReorderBufferQueueChange(rb, xid, lsn, change);
+}
+
+/*
+ * Setup the base snapshot of a transaction. The base snapshot is the snapshot
+ * that is used to decode all changes until either this transaction modifies
+ * the catalog or another catalog modifying transaction commits.
+ *
+ * Needs to be called before any changes are added with
+ * ReorderBufferQueueChange().
+ */
+void
+ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, Snapshot snap)
+{
+ ReorderBufferTXN *txn;
+ bool is_new;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
+ Assert(txn->base_snapshot == NULL);
+ Assert(snap != NULL);
+
+ txn->base_snapshot = snap;
+ txn->base_snapshot_lsn = lsn;
+}
+
+/*
+ * Access the catalog with this CommandId at this point in the changestream.
+ *
+ * May only be called for command ids > 1
+ */
+void
+ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, CommandId cid)
+{
+ ReorderBufferChange *change = ReorderBufferGetChange(rb);
+
+ change->command_id = cid;
+ change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
+
+ ReorderBufferQueueChange(rb, xid, lsn, change);
+}
+
+
+/*
+ * Add new (relfilenode, tid) -> (cmin, cmax) mappings.
+ */
+void
+ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, RelFileNode node,
+ ItemPointerData tid, CommandId cmin,
+ CommandId cmax, CommandId combocid)
+{
+ ReorderBufferChange *change = ReorderBufferGetChange(rb);
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+ change->tuplecid.node = node;
+ change->tuplecid.tid = tid;
+ change->tuplecid.cmin = cmin;
+ change->tuplecid.cmax = cmax;
+ change->tuplecid.combocid = combocid;
+ change->lsn = lsn;
+ change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
+
+ dlist_push_tail(&txn->tuplecids, &change->node);
+ txn->ntuplecids++;
+}
+
+/*
+ * Setup the invalidation of the toplevel transaction.
+ *
+ * This needs to be done before ReorderBufferCommit is called!
+ */
+void
+ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, Size nmsgs,
+ SharedInvalidationMessage *msgs)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+ if (txn->ninvalidations != 0)
+ elog(ERROR, "only ever add one set of invalidations");
+
+ Assert(nmsgs > 0);
+
+ txn->ninvalidations = nmsgs;
+ txn->invalidations = (SharedInvalidationMessage *)
+ MemoryContextAlloc(rb->context,
+ sizeof(SharedInvalidationMessage) * nmsgs);
+ memcpy(txn->invalidations, msgs,
+ sizeof(SharedInvalidationMessage) * nmsgs);
+}
+
+/*
+ * Apply all invalidations we know. Possibly we only need parts at this point
+ * in the changestream but we don't know which those are.
+ */
+static void
+ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ int i;
+
+ for (i = 0; i < txn->ninvalidations; i++)
+ LocalExecuteInvalidationMessage(&txn->invalidations[i]);
+}
+
+/*
+ * Mark a transaction as containing catalog changes
+ */
+void
+ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+ txn->has_catalog_changes = true;
+}
+
+/*
+ * Query whether a transaction is already *known* to contain catalog
+ * changes. This can be wrong until directly before the commit!
+ */
+bool
+ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+ if (txn == NULL)
+ return false;
+
+ return txn->has_catalog_changes;
+}
+
+/*
+ * Have we already added the first snapshot?
+ */
+bool
+ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* transaction isn't known yet, ergo no snapshot */
+ if (txn == NULL)
+ return false;
+
+ /*
+ * TODO: It would be a nice improvement if we would check the toplevel
+ * transaction in subtransactions, but we'd need to keep track of a bit
+ * more state.
+ */
+ return txn->base_snapshot != NULL;
+}
+
+
+/*
+ * ---------------------------------------
+ * Disk serialization support
+ * ---------------------------------------
+ */
+
+/*
+ * Ensure the IO buffer is >= sz.
+ */
+static void
+ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
+{
+ if (!rb->outbufsize)
+ {
+ rb->outbuf = MemoryContextAlloc(rb->context, sz);
+ rb->outbufsize = sz;
+ }
+ else if (rb->outbufsize < sz)
+ {
+ rb->outbuf = repalloc(rb->outbuf, sz);
+ rb->outbufsize = sz;
+ }
+}
+
+/*
+ * Check whether the transaction tx should spill its data to disk.
+ */
+static void
+ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ /*
+ * TODO: improve accounting so we cheaply can take subtransactions into
+ * account here.
+ */
+ if (txn->nentries_mem >= max_changes_in_memory)
+ {
+ ReorderBufferSerializeTXN(rb, txn);
+ Assert(txn->nentries_mem == 0);
+ }
+}
+
+/*
+ * Spill data of a large transaction (and its subtransactions) to disk.
+ */
+static void
+ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ dlist_iter subtxn_i;
+ dlist_mutable_iter change_i;
+ int fd = -1;
+ XLogSegNo curOpenSegNo = 0;
+ Size spilled = 0;
+ char path[MAXPGPATH];
+
+ elog(DEBUG2, "spill %u changes in tx %u to disk",
+ (uint32) txn->nentries_mem, txn->xid);
+
+ /* do the same to all child TXs */
+ dlist_foreach(subtxn_i, &txn->subtxns)
+ {
+ ReorderBufferTXN *subtxn;
+
+ subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
+ ReorderBufferSerializeTXN(rb, subtxn);
+ }
+
+ /* serialize changestream */
+ dlist_foreach_modify(change_i, &txn->changes)
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, change_i.cur);
+
+ /*
+ * Store the change in the segment to which it belongs by start LSN;
+ * don't split a change over multiple segments though.
+ */
+ if (fd == -1 || !XLByteInSeg(change->lsn, curOpenSegNo))
+ {
+ XLogRecPtr recptr;
+
+ if (fd != -1)
+ CloseTransientFile(fd);
+
+ XLByteToSeg(change->lsn, curOpenSegNo);
+ XLogSegNoOffsetToRecPtr(curOpenSegNo, 0, recptr);
+
+ /*
+ * No need to care about TLIs here, only used during a single run,
+ * so each LSN only maps to a specific WAL record.
+ */
+ sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap",
+ NameStr(MyReplicationSlot->data.name), txn->xid,
+ (uint32) (recptr >> 32), (uint32) recptr);
+
+ /* open segment, create it if necessary */
+ fd = OpenTransientFile(path,
+ O_CREAT | O_WRONLY | O_APPEND | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ path)));
+ }
+
+ ReorderBufferSerializeChange(rb, txn, fd, change);
+ dlist_delete(&change->node);
+ ReorderBufferReturnChange(rb, change);
+
+ spilled++;
+ }
+
+ Assert(spilled == txn->nentries_mem);
+ Assert(dlist_is_empty(&txn->changes));
+ txn->nentries_mem = 0;
+
+ if (fd != -1)
+ CloseTransientFile(fd);
+}
+
+/*
+ * Serialize individual change to disk.
+ */
+static void
+ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ int fd, ReorderBufferChange *change)
+{
+ ReorderBufferDiskChange *ondisk;
+ Size sz = sizeof(ReorderBufferDiskChange);
+
+ ReorderBufferSerializeReserve(rb, sz);
+
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+ memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
+
+ switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+ {
+ case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+ /* fall through */
+ case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+ /* fall through */
+ case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+ {
+ char *data;
+ Size oldlen = 0;
+ Size newlen = 0;
+
+ if (change->tp.oldtuple)
+ oldlen = offsetof(ReorderBufferTupleBuf, data)
+ + change->tp.oldtuple->tuple.t_len
+ - offsetof(HeapTupleHeaderData, t_bits);
+
+ if (change->tp.newtuple)
+ newlen = offsetof(ReorderBufferTupleBuf, data)
+ + change->tp.newtuple->tuple.t_len
+ - offsetof(HeapTupleHeaderData, t_bits);
+
+ sz += oldlen;
+ sz += newlen;
+
+ /* make sure we have enough space */
+ ReorderBufferSerializeReserve(rb, sz);
+
+ data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
+ /* might have been reallocated above */
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+ if (oldlen)
+ {
+ memcpy(data, change->tp.oldtuple, oldlen);
+ data += oldlen;
+ Assert(&change->tp.oldtuple->header == change->tp.oldtuple->tuple.t_data);
+ }
+
+ if (newlen)
+ {
+ memcpy(data, change->tp.newtuple, newlen);
+ data += newlen;
+ Assert(&change->tp.newtuple->header == change->tp.newtuple->tuple.t_data);
+ }
+ break;
+ }
+ case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+ {
+ char *data;
+
+ sz += sizeof(SnapshotData) +
+ sizeof(TransactionId) * change->snapshot->xcnt +
+ sizeof(TransactionId) * change->snapshot->subxcnt
+ ;
+
+ /* make sure we have enough space */
+ ReorderBufferSerializeReserve(rb, sz);
+ data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
+ /* might have been reallocated above */
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+ memcpy(data, change->snapshot, sizeof(SnapshotData));
+ data += sizeof(SnapshotData);
+
+ if (change->snapshot->xcnt)
+ {
+ memcpy(data, change->snapshot->xip,
+ sizeof(TransactionId) * change->snapshot->xcnt);
+ data += sizeof(TransactionId) * change->snapshot->xcnt;
+ }
+
+ if (change->snapshot->subxcnt)
+ {
+ memcpy(data, change->snapshot->subxip,
+ sizeof(TransactionId) * change->snapshot->subxcnt);
+ data += sizeof(TransactionId) * change->snapshot->subxcnt;
+ }
+ break;
+ }
+ case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+ /* ReorderBufferChange contains everything important */
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+ /* ReorderBufferChange contains everything important */
+ break;
+ }
+
+ ondisk->size = sz;
+
+ if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to xid %u's data file: %m",
+ txn->xid)));
+ }
+
+ Assert(ondisk->change.action_internal == change->action_internal);
+}
+
+/*
+ * Restore a number of changes spilled to disk back into memory.
+ */
+static Size
+ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ int *fd, XLogSegNo *segno)
+{
+ Size restored = 0;
+ XLogSegNo last_segno;
+ dlist_mutable_iter cleanup_iter;
+
+ Assert(txn->first_lsn != InvalidXLogRecPtr);
+ Assert(txn->final_lsn != InvalidXLogRecPtr);
+
+ /* free current entries, so we have memory for more */
+ dlist_foreach_modify(cleanup_iter, &txn->changes)
+ {
+ ReorderBufferChange *cleanup =
+ dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
+
+ dlist_delete(&cleanup->node);
+ ReorderBufferReturnChange(rb, cleanup);
+ }
+ txn->nentries_mem = 0;
+ Assert(dlist_is_empty(&txn->changes));
+
+ XLByteToSeg(txn->final_lsn, last_segno);
+
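+	/*
+	 * Changes were spilled to one file per WAL segment the transaction
+	 * spans; read them back file by file until we either have restored
+	 * max_changes_in_memory changes or run out of files.
+	 */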
+ while (restored < max_changes_in_memory && *segno <= last_segno)
+ {
+ int readBytes;
+ ReorderBufferDiskChange *ondisk;
+
+ if (*fd == -1)
+ {
+ XLogRecPtr recptr;
+ char path[MAXPGPATH];
+
+ /* first time in */
+ if (*segno == 0)
+ {
+ XLByteToSeg(txn->first_lsn, *segno);
+ }
+
+ Assert(*segno != 0 || dlist_is_empty(&txn->changes));
+ XLogSegNoOffsetToRecPtr(*segno, 0, recptr);
+
+ /*
+ * No need to care about TLIs here, only used during a single run,
+ * so each LSN only maps to a specific WAL record.
+ */
+ sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap",
+ NameStr(MyReplicationSlot->data.name), txn->xid,
+ (uint32) (recptr >> 32), (uint32) recptr);
+
+ *fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+ if (*fd < 0 && errno == ENOENT)
+ {
+ *fd = -1;
+ (*segno)++;
+ continue;
+ }
+ else if (*fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ path)));
+
+ }
+
+ ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
+
+
+ /*
+ * Read the statically sized part of a change which has information
+ * about the total size. If we couldn't read a record, we're at the
+ * end of this file.
+ */
+
+ readBytes = read(*fd, rb->outbuf, sizeof(ReorderBufferDiskChange));
+
+ /* eof */
+ if (readBytes == 0)
+ {
+ CloseTransientFile(*fd);
+ *fd = -1;
+ (*segno)++;
+ continue;
+ }
+ else if (readBytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from reorderbuffer spill file: %m")));
+ else if (readBytes != sizeof(ReorderBufferDiskChange))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("incomplete read from reorderbuffer spill file: read %d instead of %u",
+ readBytes,
+ (uint32) sizeof(ReorderBufferDiskChange))));
+
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+ ReorderBufferSerializeReserve(rb,
+ sizeof(ReorderBufferDiskChange) + ondisk->size);
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+ readBytes = read(*fd, rb->outbuf + sizeof(ReorderBufferDiskChange),
+ ondisk->size - sizeof(ReorderBufferDiskChange));
+
+ if (readBytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from reorderbuffer spill file: %m")));
+ else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from reorderbuffer spill file: read %d instead of %u",
+ readBytes,
+ (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
+
+ /*
+ * ok, read a full change from disk, now restore it into proper
+ * in-memory format
+ */
+ ReorderBufferRestoreChange(rb, txn, rb->outbuf);
+ restored++;
+ }
+
+ return restored;
+}
+
+/*
+ * Convert change from its on-disk format to in-memory format and queue it onto
+ * the TXN's ->changes list.
+ */
+static void
+ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ char *data)
+{
+ ReorderBufferDiskChange *ondisk;
+ ReorderBufferChange *change;
+
+ ondisk = (ReorderBufferDiskChange *) data;
+
+ change = ReorderBufferGetChange(rb);
+
+ /* copy static part */
+ memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
+
+ data += sizeof(ReorderBufferDiskChange);
+
+ /* restore individual stuff */
+ switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+ {
+ case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+ /* fall through */
+ case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+ /* fall through */
+ case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+ if (change->tp.newtuple)
+ {
+ Size len = offsetof(ReorderBufferTupleBuf, data)
+ +((ReorderBufferTupleBuf *) data)->tuple.t_len
+ - offsetof(HeapTupleHeaderData, t_bits);
+
+ change->tp.newtuple = ReorderBufferGetTupleBuf(rb);
+ memcpy(change->tp.newtuple, data, len);
+ change->tp.newtuple->tuple.t_data = &change->tp.newtuple->header;
+
+ data += len;
+ }
+
+ if (change->tp.oldtuple)
+ {
+ Size len = offsetof(ReorderBufferTupleBuf, data)
+ +((ReorderBufferTupleBuf *) data)->tuple.t_len
+ - offsetof(HeapTupleHeaderData, t_bits);
+
+ change->tp.oldtuple = ReorderBufferGetTupleBuf(rb);
+ memcpy(change->tp.oldtuple, data, len);
+ change->tp.oldtuple->tuple.t_data = &change->tp.oldtuple->header;
+ data += len;
+ }
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+ {
+ Snapshot oldsnap = (Snapshot) data;
+				Size		size = sizeof(SnapshotData) +
+					sizeof(TransactionId) * oldsnap->xcnt +
+					sizeof(TransactionId) * oldsnap->subxcnt;
+
+ Assert(change->snapshot != NULL);
+
+ change->snapshot = MemoryContextAllocZero(rb->context, size);
+
+ memcpy(change->snapshot, data, size);
+ change->snapshot->xip = (TransactionId *)
+ (((char *) change->snapshot) + sizeof(SnapshotData));
+ change->snapshot->subxip =
+					change->snapshot->xip + change->snapshot->xcnt;
+ change->snapshot->copied = true;
+ break;
+ }
+ /* the base struct contains all the data, easy peasy */
+ case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+ case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+ break;
+ }
+
+ dlist_push_tail(&txn->changes, &change->node);
+ txn->nentries_mem++;
+}
+
+/*
+ * Remove all on-disk data stored for the passed-in transaction.
+ */
+static void
+ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ XLogSegNo first;
+ XLogSegNo cur;
+ XLogSegNo last;
+
+ Assert(txn->first_lsn != InvalidXLogRecPtr);
+ Assert(txn->final_lsn != InvalidXLogRecPtr);
+
+ XLByteToSeg(txn->first_lsn, first);
+ XLByteToSeg(txn->final_lsn, last);
+
+ /* iterate over all possible filenames, and delete them */
+ for (cur = first; cur <= last; cur++)
+ {
+ char path[MAXPGPATH];
+ XLogRecPtr recptr;
+
+ XLogSegNoOffsetToRecPtr(cur, 0, recptr);
+
+ sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap",
+ NameStr(MyReplicationSlot->data.name), txn->xid,
+ (uint32) (recptr >> 32), (uint32) recptr);
+ if (unlink(path) != 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m", path)));
+ }
+}
+
+/*
+ * Delete all data spilled to disk after we've restarted/crashed. It will be
+ * recreated when the respective slots are reused.
+ */
+void
+StartupReorderBuffer(void)
+{
+ DIR *logical_dir;
+ struct dirent *logical_de;
+
+ DIR *spill_dir;
+ struct dirent *spill_de;
+
+ logical_dir = AllocateDir("pg_replslot");
+ while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
+ {
+ struct stat statbuf;
+ char path[MAXPGPATH];
+
+ if (strcmp(logical_de->d_name, ".") == 0 ||
+ strcmp(logical_de->d_name, "..") == 0)
+ continue;
+
+ /* if it cannot be a slot, skip the directory */
+ if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
+ continue;
+
+ /*
+ * ok, has to be a surviving logical slot, iterate and delete
+		 * everything starting with xid-*
+ */
+ sprintf(path, "pg_replslot/%s", logical_de->d_name);
+
+		/* we only create directories here, skip if it's not one of ours */
+ if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
+ continue;
+
+ spill_dir = AllocateDir(path);
+ while ((spill_de = ReadDir(spill_dir, path)) != NULL)
+ {
+ if (strcmp(spill_de->d_name, ".") == 0 ||
+ strcmp(spill_de->d_name, "..") == 0)
+ continue;
+
+ /* only look at names that can be ours */
+ if (strncmp(spill_de->d_name, "xid", 3) == 0)
+ {
+ sprintf(path, "pg_replslot/%s/%s", logical_de->d_name,
+ spill_de->d_name);
+
+ if (unlink(path) != 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m",
+ path)));
+ }
+ }
+ FreeDir(spill_dir);
+ }
+ FreeDir(logical_dir);
+}
+
+/* ---------------------------------------
+ * toast reassembly support
+ * ---------------------------------------
+ */
+
+/*
+ * Initialize per tuple toast reconstruction support.
+ */
+static void
+ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ HASHCTL hash_ctl;
+
+ Assert(txn->toast_hash == NULL);
+
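+	/* one hash entry per toast value, keyed by its chunk_id */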
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(Oid);
+ hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = rb->context;
+ txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+}
+
+/*
+ * Per toast-chunk handling for toast reconstruction
+ *
+ * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
+ * toasted Datum comes along.
+ */
+static void
+ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change)
+{
+ ReorderBufferToastEnt *ent;
+ bool found;
+ int32 chunksize;
+ bool isnull;
+ Pointer chunk;
+ TupleDesc desc = RelationGetDescr(relation);
+ Oid chunk_id;
+ Oid chunk_seq;
+
+ if (txn->toast_hash == NULL)
+ ReorderBufferToastInitHash(rb, txn);
+
+ Assert(IsToastRelation(relation));
+
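+	/* toast relations have the columns chunk_id, chunk_seq and chunk_data */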
+ chunk_id = DatumGetObjectId(fastgetattr(&change->tp.newtuple->tuple, 1, desc, &isnull));
+ Assert(!isnull);
+ chunk_seq = DatumGetInt32(fastgetattr(&change->tp.newtuple->tuple, 2, desc, &isnull));
+ Assert(!isnull);
+
+ ent = (ReorderBufferToastEnt *)
+ hash_search(txn->toast_hash,
+ (void *) &chunk_id,
+ HASH_ENTER,
+ &found);
+
+ if (!found)
+ {
+ Assert(ent->chunk_id == chunk_id);
+ ent->num_chunks = 0;
+ ent->last_chunk_seq = 0;
+ ent->size = 0;
+ ent->reconstructed = NULL;
+ dlist_init(&ent->chunks);
+
+ if (chunk_seq != 0)
+ elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
+ chunk_seq, chunk_id);
+ }
+ else if (found && chunk_seq != ent->last_chunk_seq + 1)
+ elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
+ chunk_seq, chunk_id, ent->last_chunk_seq + 1);
+
+ chunk = DatumGetPointer(fastgetattr(&change->tp.newtuple->tuple, 3, desc, &isnull));
+ Assert(!isnull);
+
+ /* calculate size so we can allocate the right size at once later */
+ if (!VARATT_IS_EXTENDED(chunk))
+ chunksize = VARSIZE(chunk) - VARHDRSZ;
+ else if (VARATT_IS_SHORT(chunk))
+ /* could happen due to heap_form_tuple doing its thing */
+ chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
+ else
+ elog(ERROR, "unexpected type of toast chunk");
+
+ ent->size += chunksize;
+ ent->last_chunk_seq = chunk_seq;
+ ent->num_chunks++;
+ dlist_push_tail(&ent->chunks, &change->node);
+}
+
+/*
+ * Rejigger change->newtuple to point to in-memory toast tuples instead of
+ * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
+ *
+ * We cannot replace unchanged toast tuples though, so those will still point
+ * to on-disk toast data.
+ */
+static void
+ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change)
+{
+ TupleDesc desc;
+ int natt;
+ Datum *attrs;
+ bool *isnull;
+ bool *free;
+ HeapTuple newtup;
+ Relation toast_rel;
+ TupleDesc toast_desc;
+ MemoryContext oldcontext;
+
+ /* no toast tuples changed */
+ if (txn->toast_hash == NULL)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(rb->context);
+
+ /* we should only have toast tuples in an INSERT or UPDATE */
+ Assert(change->tp.newtuple);
+
+ desc = RelationGetDescr(relation);
+
+ toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
+ toast_desc = RelationGetDescr(toast_rel);
+
+ /* should we allocate from stack instead? */
+ attrs = palloc0(sizeof(Datum) * desc->natts);
+ isnull = palloc0(sizeof(bool) * desc->natts);
+ free = palloc0(sizeof(bool) * desc->natts);
+
+ heap_deform_tuple(&change->tp.newtuple->tuple, desc,
+ attrs, isnull);
+
+ for (natt = 0; natt < desc->natts; natt++)
+ {
+ Form_pg_attribute attr = desc->attrs[natt];
+ ReorderBufferToastEnt *ent;
+ struct varlena *varlena;
+
+ /* va_rawsize is the size of the original datum -- including header */
+ struct varatt_external toast_pointer;
+ struct varatt_indirect redirect_pointer;
+ struct varlena *new_datum = NULL;
+ struct varlena *reconstructed;
+ dlist_iter it;
+ Size data_done = 0;
+
+ /* system columns aren't toasted */
+ if (attr->attnum < 0)
+ continue;
+
+ if (attr->attisdropped)
+ continue;
+
+ /* not a varlena datatype */
+ if (attr->attlen != -1)
+ continue;
+
+ /* no data */
+ if (isnull[natt])
+ continue;
+
+ /* ok, we know we have a toast datum */
+ varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
+
+ /* no need to do anything if the tuple isn't external */
+ if (!VARATT_IS_EXTERNAL(varlena))
+ continue;
+
+ VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
+
+ /*
+ * Check whether the toast tuple changed, replace if so.
+ */
+ ent = (ReorderBufferToastEnt *)
+ hash_search(txn->toast_hash,
+ (void *) &toast_pointer.va_valueid,
+ HASH_FIND,
+ NULL);
+ if (ent == NULL)
+ continue;
+
+ new_datum =
+ (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
+
+ free[natt] = true;
+
+ reconstructed = palloc0(toast_pointer.va_rawsize);
+
+ ent->reconstructed = reconstructed;
+
+ /* stitch toast tuple back together from its parts */
+ dlist_foreach(it, &ent->chunks)
+ {
+ bool isnull;
+ ReorderBufferTupleBuf *tup =
+ dlist_container(ReorderBufferChange, node, it.cur)->tp.newtuple;
+ Pointer chunk =
+ DatumGetPointer(fastgetattr(&tup->tuple, 3, toast_desc, &isnull));
+
+ Assert(!isnull);
+ Assert(!VARATT_IS_EXTERNAL(chunk));
+ Assert(!VARATT_IS_SHORT(chunk));
+
+ memcpy(VARDATA(reconstructed) + data_done,
+ VARDATA(chunk),
+ VARSIZE(chunk) - VARHDRSZ);
+ data_done += VARSIZE(chunk) - VARHDRSZ;
+ }
+ Assert(data_done == toast_pointer.va_extsize);
+
+		/* make sure it's marked as compressed or not */
+ if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
+ SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
+ else
+ SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
+
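+		/*
+		 * Point an indirect toast pointer at the reconstructed datum, so the
+		 * output plugin can detoast it even though it only exists in memory.
+		 */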
+ memset(&redirect_pointer, 0, sizeof(redirect_pointer));
+ redirect_pointer.pointer = reconstructed;
+
+ SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
+ memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
+ sizeof(redirect_pointer));
+
+ attrs[natt] = PointerGetDatum(new_datum);
+ }
+
+ /*
+ * Build tuple in separate memory & copy tuple back into the tuplebuf
+ * passed to the output plugin. We can't directly heap_fill_tuple() into
+ * the tuplebuf because attrs[] will point back into the current content.
+ */
+ newtup = heap_form_tuple(desc, attrs, isnull);
+ Assert(change->tp.newtuple->tuple.t_len <= MaxHeapTupleSize);
+ Assert(&change->tp.newtuple->header == change->tp.newtuple->tuple.t_data);
+
+ memcpy(change->tp.newtuple->tuple.t_data,
+ newtup->t_data,
+ newtup->t_len);
+ change->tp.newtuple->tuple.t_len = newtup->t_len;
+
+ /*
+	 * free resources we no longer need; more persistent allocations will be
+	 * freed in ReorderBufferToastReset().
+ */
+ RelationClose(toast_rel);
+ pfree(newtup);
+ for (natt = 0; natt < desc->natts; natt++)
+ {
+ if (free[natt])
+ pfree(DatumGetPointer(attrs[natt]));
+ }
+ pfree(attrs);
+ pfree(free);
+ pfree(isnull);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Free all resources allocated for toast reconstruction.
+ */
+static void
+ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ HASH_SEQ_STATUS hstat;
+ ReorderBufferToastEnt *ent;
+
+ if (txn->toast_hash == NULL)
+ return;
+
+ /* sequentially walk over the hash and free everything */
+ hash_seq_init(&hstat, txn->toast_hash);
+ while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ dlist_mutable_iter it;
+
+ if (ent->reconstructed != NULL)
+ pfree(ent->reconstructed);
+
+ dlist_foreach_modify(it, &ent->chunks)
+ {
+ ReorderBufferChange *change =
+ dlist_container(ReorderBufferChange, node, it.cur);
+
+ dlist_delete(&change->node);
+ ReorderBufferReturnChange(rb, change);
+ }
+ }
+
+ hash_destroy(txn->toast_hash);
+ txn->toast_hash = NULL;
+}
+
+
+/* ---------------------------------------
+ * Visibility support for logical decoding
+ *
+ *
+ * Lookup actual cmin/cmax values when using decoding snapshot. We can't
+ * always rely on stored cmin/cmax values because of two scenarios:
+ *
+ * * A tuple got changed multiple times during a single transaction and thus
+ * has got a combocid. Combocids are only valid for the duration of a
+ * single transaction.
+ * * A tuple with a cmin but no cmax (and thus no combocid) got
+ * deleted/updated in a different transaction than the one which created it,
+ * which is the one we are looking at right now. As only one of cmin, cmax
+ * or combocid is actually stored in the heap we don't have access to the
+ * value we need anymore.
+ *
+ * To resolve those problems we have a per-transaction hash of (cmin,
+ * cmax) tuples keyed by (relfilenode, ctid) which contains the actual
+ * (cmin, cmax) values. That also takes care of combocids by simply
+ * not caring about them at all. As we have the real cmin/cmax values
+ * combocids aren't interesting.
+ *
+ * As we only care about catalog tuples here the overhead of this
+ * hashtable should be acceptable.
+ *
+ * Heap rewrites complicate this a bit, check rewriteheap.c for
+ * details.
+ * -------------------------------------------------------------------------
+ */
+
+/* struct for qsort()ing mapping files by lsn somewhat efficiently */
+typedef struct RewriteMappingFile
+{
+ XLogRecPtr lsn;
+ char fname[MAXPGPATH];
+} RewriteMappingFile;
+
+#if NOT_USED
+static void
+DisplayMapping(HTAB *tuplecid_data)
+{
+ HASH_SEQ_STATUS hstat;
+ ReorderBufferTupleCidEnt *ent;
+
+ hash_seq_init(&hstat, tuplecid_data);
+ while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
+ ent->key.relnode.dbNode,
+ ent->key.relnode.spcNode,
+ ent->key.relnode.relNode,
+ BlockIdGetBlockNumber(&ent->key.tid.ip_blkid),
+ ent->key.tid.ip_posid,
+ ent->cmin,
+ ent->cmax
+ );
+ }
+}
+#endif
+
+/*
+ * Apply a single mapping file to tuplecid_data.
+ *
+ * The mapping file has to have been verified to be a) committed and b) for
+ * our transaction; mapping files must be applied in LSN order.
+ */
+static void
+ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
+{
+ char path[MAXPGPATH];
+ int fd;
+ int readBytes;
+ LogicalRewriteMappingData map;
+
+ sprintf(path, "pg_llog/mappings/%s", fname);
+ fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+ if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", path)));
+
+ while (true)
+ {
+ ReorderBufferTupleCidKey key;
+ ReorderBufferTupleCidEnt *ent;
+ ReorderBufferTupleCidEnt *new_ent;
+ bool found;
+
+ /* be careful about padding */
+ memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
+
+ /* read all mappings till the end of the file */
+ readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
+
+ if (readBytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ path)));
+ else if (readBytes == 0) /* EOF */
+ break;
+ else if (readBytes != sizeof(LogicalRewriteMappingData))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d instead of %d",
+ path, readBytes,
+ (int32) sizeof(LogicalRewriteMappingData))));
+
+ key.relnode = map.old_node;
+ ItemPointerCopy(&map.old_tid,
+ &key.tid);
+
+
+ ent = (ReorderBufferTupleCidEnt *)
+ hash_search(tuplecid_data,
+ (void *) &key,
+ HASH_FIND,
+ NULL);
+
+ /* no existing mapping, no need to update */
+ if (!ent)
+ continue;
+
+ key.relnode = map.new_node;
+ ItemPointerCopy(&map.new_tid,
+ &key.tid);
+
+ new_ent = (ReorderBufferTupleCidEnt *)
+ hash_search(tuplecid_data,
+ (void *) &key,
+ HASH_ENTER,
+ &found);
+
+ if (found)
+ {
+ /*
+			 * Make sure the existing mapping makes sense. We sometimes update
+ * old records that did not yet have a cmax (e.g. pg_class' own
+ * entry while rewriting it) during rewrites, so allow that.
+ */
+ Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
+ Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
+ }
+ else
+ {
+ /* update mapping */
+ new_ent->cmin = ent->cmin;
+ new_ent->cmax = ent->cmax;
+ new_ent->combocid = ent->combocid;
+ }
+ }
+}
+
+
+/*
+ * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
+ */
+static bool
+TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
+{
+ return bsearch(&xid, xip, num,
+ sizeof(TransactionId), xidComparator) != NULL;
+}
+
+/*
+ * qsort() comparator for sorting RewriteMappingFiles in LSN order.
+ */
+static int
+file_sort_by_lsn(const void *a_p, const void *b_p)
+{
+ RewriteMappingFile *a = *(RewriteMappingFile **)a_p;
+ RewriteMappingFile *b = *(RewriteMappingFile **)b_p;
+
+ if (a->lsn < b->lsn)
+ return -1;
+ else if (a->lsn > b->lsn)
+ return 1;
+ return 0;
+}
+
+/*
+ * Apply any existing logical remapping files that are targeted at our
+ * transaction for relid.
+ */
+static void
+UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
+{
+ DIR *mapping_dir;
+ struct dirent *mapping_de;
+ List *files = NIL;
+ ListCell *file;
+ RewriteMappingFile **files_a;
+ size_t off;
+ Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
+
+ mapping_dir = AllocateDir("pg_llog/mappings");
+ while ((mapping_de = ReadDir(mapping_dir, "pg_llog/mappings")) != NULL)
+ {
+ Oid f_dboid;
+ Oid f_relid;
+ TransactionId f_mapped_xid;
+ TransactionId f_create_xid;
+ XLogRecPtr f_lsn;
+ uint32 f_hi, f_lo;
+ RewriteMappingFile *f;
+
+ if (strcmp(mapping_de->d_name, ".") == 0 ||
+ strcmp(mapping_de->d_name, "..") == 0)
+ continue;
+
+		/* Ignore files that aren't ours */
+ if (strncmp(mapping_de->d_name, "map-", 4) != 0)
+ continue;
+
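+		/*
+		 * The file name encodes the database, relation, an LSN (used below
+		 * to apply files in order) and the mapped and creating xids; see
+		 * LOGICAL_REWRITE_FORMAT.
+		 */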
+ if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
+ &f_dboid, &f_relid, &f_hi, &f_lo,
+ &f_mapped_xid, &f_create_xid) != 6)
+ elog(ERROR, "could not parse fname %s", mapping_de->d_name);
+
+ f_lsn = ((uint64) f_hi) << 32 | f_lo;
+
+ /* mapping for another database */
+ if (f_dboid != dboid)
+ continue;
+
+ /* mapping for another relation */
+ if (f_relid != relid)
+ continue;
+
+ /* did the creating transaction abort? */
+ if (!TransactionIdDidCommit(f_create_xid))
+ continue;
+
+ /* not for our transaction */
+ if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
+ continue;
+
+ /* ok, relevant, queue for apply */
+ f = palloc(sizeof(RewriteMappingFile));
+ f->lsn = f_lsn;
+ strcpy(f->fname, mapping_de->d_name);
+ files = lappend(files, f);
+ }
+ FreeDir(mapping_dir);
+
+ /* build array we can easily sort */
+ files_a = palloc(list_length(files) * sizeof(RewriteMappingFile *));
+ off = 0;
+ foreach(file, files)
+ {
+ files_a[off++] = lfirst(file);
+ }
+
+ /* sort files so we apply them in LSN order */
+ qsort(files_a, list_length(files), sizeof(RewriteMappingFile *),
+ file_sort_by_lsn);
+
+	for (off = 0; off < list_length(files); off++)
+	{
+		RewriteMappingFile *f = files_a[off];
+
+		elog(DEBUG1, "applying mapping: %s in %u", f->fname,
+ snapshot->subxip[0]);
+ ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
+ pfree(f);
+ }
+}
+
+/*
+ * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
+ * combocids.
+ */
+bool
+ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
+ Snapshot snapshot,
+ HeapTuple htup, Buffer buffer,
+ CommandId *cmin, CommandId *cmax)
+{
+ ReorderBufferTupleCidKey key;
+ ReorderBufferTupleCidEnt *ent;
+ ForkNumber forkno;
+ BlockNumber blockno;
+ bool updated_mapping = false;
+
+ /* be careful about padding */
+ memset(&key, 0, sizeof(key));
+
+ Assert(!BufferIsLocal(buffer));
+
+ /*
+ * get relfilenode from the buffer, no convenient way to access it other
+ * than that.
+ */
+ BufferGetTag(buffer, &key.relnode, &forkno, &blockno);
+
+ /* tuples can only be in the main fork */
+ Assert(forkno == MAIN_FORKNUM);
+ Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
+
+ ItemPointerCopy(&htup->t_self,
+ &key.tid);
+
+restart:
+ ent = (ReorderBufferTupleCidEnt *)
+ hash_search(tuplecid_data,
+ (void *) &key,
+ HASH_FIND,
+ NULL);
+
+ /*
+ * failed to find a mapping, check whether the table was rewritten and
+ * apply mapping if so, but only do that once - there can be no new
+ * mappings while we are in here since we have to hold a lock on the
+ * relation.
+ */
+ if (ent == NULL && !updated_mapping)
+ {
+ UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
+ /* now check but don't update for a mapping again */
+ updated_mapping = true;
+ goto restart;
+ }
+ else if (ent == NULL)
+ return false;
+
+ if (cmin)
+ *cmin = ent->cmin;
+ if (cmax)
+ *cmax = ent->cmax;
+ return true;
+}
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
new file mode 100644
index 00000000000..28f9a8a1a6f
--- /dev/null
+++ b/src/backend/replication/logical/snapbuild.c
@@ -0,0 +1,1885 @@
+/*-------------------------------------------------------------------------
+ *
+ * snapbuild.c
+ *
+ * Infrastructure for building historic catalog snapshots based on contents
+ * of the WAL, for the purpose of decoding heapam.c style values in the
+ * WAL.
+ *
+ * NOTES:
+ *
+ * We build snapshots which can *only* be used to read catalog contents and we
+ * do so by reading and interpreting the WAL stream. The aim is to build a
+ * snapshot that behaves the same as a freshly taken MVCC snapshot would have
+ * at the time the XLogRecord was generated.
+ *
+ * To build the snapshots we reuse the infrastructure built for Hot
+ * Standby. The in-memory snapshots we build look different than HS' because
+ * we have different needs. To successfully decode data from the WAL we only
+ * need to access catalog tables and (sys|rel|cat)cache, not the actual user
+ * tables since the data we decode is wholly contained in the WAL
+ * records. Also, our snapshots need to be different in comparison to normal
+ * MVCC ones because in contrast to those we cannot fully rely on the clog and
+ * pg_subtrans for information about committed transactions because they might
+ * commit in the future from the POV of the WAL entry we're currently
+ * decoding. This definition has the advantage that we only need to prevent
+ * removal of catalog rows, while normal table's rows can still be
+ * removed. This is achieved by using the replication slot mechanism.
+ *
+ * As the percentage of transactions modifying the catalog normally is fairly
+ * small in comparisons to ones only manipulating user data, we keep track of
+ * the committed catalog modifying ones inside (xmin, xmax) instead of keeping
+ * track of all running transactions like it's done in a normal snapshot. Note
+ * that we're generally only looking at transactions that have acquired an
+ * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
+ * that we consider committed, everything else is considered aborted/in
+ * progress. That also allows us not to care about subtransactions before they
+ * have committed, which means this module, in contrast to HS, doesn't have to
+ * care about suboverflowed subtransactions and similar.
+ *
+ * One complexity of doing this is that to e.g. handle mixed DDL/DML
+ * transactions we need Snapshots that see intermediate versions of the
+ * catalog in a transaction. During normal operation this is achieved by using
+ * CommandIds/cmin/cmax. The problem with that however is that for space
+ * efficiency reasons only one of those values is stored on disk
+ * (cf. combocid.c). Since ComboCids are only available in memory we log
+ * additional information which allows us to get the original (cmin, cmax)
+ * pair during visibility checks. Check the reorderbuffer.c's comment above
+ * ResolveCminCmaxDuringDecoding() for details.
+ *
+ * To facilitate all this we need our own visibility routine, as the normal
+ * ones are optimized for different use cases.
+ *
+ * To replace the normal catalog snapshots with decoding ones use the
+ * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
+ *
+ *
+ *
+ * The snapbuild machinery starts up in several stages, as illustrated
+ * by the following graph:
+ * +-------------------------+
+ * +----|SNAPBUILD_START |-------------+
+ * | +-------------------------+ |
+ * | | |
+ * | | |
+ * | running_xacts with running xacts |
+ * | | |
+ * | | |
+ * | v |
+ * | +-------------------------+ v
+ * | |SNAPBUILD_FULL_SNAPSHOT |------------>|
+ * | +-------------------------+ |
+ * running_xacts | saved snapshot
+ * with zero xacts | at running_xacts's lsn
+ * | | |
+ * | all running toplevel TXNs finished |
+ * | | |
+ * | v |
+ * | +-------------------------+ |
+ * +--->|SNAPBUILD_CONSISTENT |<------------+
+ * +-------------------------+
+ *
+ * Initially the machinery is in the START stage. When an xl_running_xacts
+ * record is read that is sufficiently new (above the safe xmin horizon),
+ * there's a state transition. If there were no running xacts when the
+ * running_xacts record was generated, we'll directly go into CONSISTENT
+ * state, otherwise we'll switch to the FULL_SNAPSHOT state. Having a full
+ * snapshot means that all transactions that start henceforth can be decoded
+ * in their entirety, but transactions that started previously can't. In
+ * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
+ * running transactions have committed or aborted.
+ *
+ * Only transactions that commit after CONSISTENT state has been reached will
+ * be replayed, even though they might have started while still in
+ * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
+ * changes have been exported, but all the following ones will be. That point
+ * is a convenient point to initialize replication from, which is why we
+ * export a snapshot at that point, which *can* be used to read normal data.
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/snapbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "miscadmin.h"
+
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
+#include "access/xact.h"
+
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/snapshot.h"
+#include "utils/snapmgr.h"
+#include "utils/tqual.h"
+
+#include "storage/block.h" /* debugging output */
+#include "storage/fd.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/standby.h"
+
+/*
+ * This struct contains the current state of the snapshot building
+ * machinery. Besides a forward declaration in the header, it is not exposed
+ * to the public, so we can easily change its contents.
+ */
+struct SnapBuild
+{
+ /* how far are we along building our first full snapshot */
+ SnapBuildState state;
+
+ /* private memory context used to allocate memory for this module. */
+ MemoryContext context;
+
+	/* all transactions < this have committed/aborted */
+ TransactionId xmin;
+
+	/* all transactions >= this are uncommitted */
+ TransactionId xmax;
+
+ /*
+ * Don't replay commits from an LSN <= this LSN. This can be set
+ * externally but it will also be advanced (never retreat) from within
+ * snapbuild.c.
+ */
+ XLogRecPtr transactions_after;
+
+ /*
+ * Don't start decoding WAL until the "xl_running_xacts" information
+	 * indicates there is no running transaction with an xid smaller than this.
+ */
+ TransactionId initial_xmin_horizon;
+
+ /*
+ * Snapshot that's valid to see the catalog state seen at this moment.
+ */
+ Snapshot snapshot;
+
+ /*
+ * LSN of the last location we are sure a snapshot has been serialized to.
+ */
+ XLogRecPtr last_serialized_snapshot;
+
+ /*
+ * The reorderbuffer we need to update with usable snapshots et al.
+ */
+ ReorderBuffer *reorder;
+
+ /*
+ * Information about initially running transactions
+ *
+ * When we start building a snapshot there already may be transactions in
+ * progress. Those are stored in running.xip. We don't have enough
+ * information about those to decode their contents, so until they are
+ * finished (xcnt=0) we cannot switch to a CONSISTENT state.
+ */
+ struct
+ {
+ /*
+		 * As long as running.xcnt is non-zero, all XIDs > running.xmin and
+		 * < running.xmax have to be checked whether they still are running.
+ */
+ TransactionId xmin;
+ TransactionId xmax;
+
+ size_t xcnt; /* number of used xip entries */
+ size_t xcnt_space; /* allocated size of xip */
+ TransactionId *xip; /* running xacts array, xidComparator-sorted */
+ } running;
+
+ /*
+ * Array of transactions which could have catalog changes that committed
+ * between xmin and xmax.
+ */
+ struct
+ {
+ /* number of committed transactions */
+ size_t xcnt;
+
+ /* available space for committed transactions */
+ size_t xcnt_space;
+
+ /*
+ * Until we reach a CONSISTENT state, we record commits of all
+ * transactions, not just the catalog changing ones. Record when that
+ * changes so we know we cannot export a snapshot safely anymore.
+ */
+ bool includes_all_transactions;
+
+ /*
+ * Array of committed transactions that have modified the catalog.
+ *
+ * As this array is frequently modified we do *not* keep it in
+ * xidComparator order. Instead we sort the array when building &
+ * distributing a snapshot.
+ *
+ * TODO: It's unclear whether that reasoning has much merit. Every
+ * time we add something here after becoming consistent will also
+ * require distributing a snapshot. Storing them sorted would
+ * potentially also make it easier to purge (but more complicated wrt
+ * wraparound?). Should be improved if sorting while building the
+ * snapshot shows up in profiles.
+ */
+ TransactionId *xip;
+ } committed;
+};
+
+/*
+ * Starting a transaction -- which we need to do while exporting a snapshot --
+ * removes knowledge about the previously used resowner, so we save it here.
+ */
+ResourceOwner SavedResourceOwnerDuringExport = NULL;
+bool ExportInProgress = false;
+
+/* transaction state manipulation functions */
+static void SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid);
+
+/* ->running manipulation */
+static bool SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid);
+
+/* ->committed manipulation */
+static void SnapBuildPurgeCommittedTxn(SnapBuild *builder);
+
+/* snapshot building/manipulation/distribution functions */
+static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid);
+
+static void SnapBuildFreeSnapshot(Snapshot snap);
+
+static void SnapBuildSnapIncRefcount(Snapshot snap);
+
+static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);
+
+/* xlog reading helper functions for SnapBuildProcessRecord */
+static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
+
+/* serialization functions */
+static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
+static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
+
+
+/*
+ * Allocate a new snapshot builder.
+ *
+ * xmin_horizon is the xid >= which we can be sure no catalog rows have been
+ * removed; start_lsn is the LSN >= which we want to replay commits.
+ */
+SnapBuild *
+AllocateSnapshotBuilder(ReorderBuffer *reorder,
+ TransactionId xmin_horizon,
+ XLogRecPtr start_lsn)
+{
+ MemoryContext context;
+ MemoryContext oldcontext;
+ SnapBuild *builder;
+
+ /* allocate memory in own context, to have better accountability */
+ context = AllocSetContextCreate(CurrentMemoryContext,
+ "snapshot builder context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldcontext = MemoryContextSwitchTo(context);
+
+ builder = palloc0(sizeof(SnapBuild));
+
+ builder->state = SNAPBUILD_START;
+ builder->context = context;
+ builder->reorder = reorder;
+ /* Other struct members initialized by zeroing via palloc0 above */
+
+	builder->committed.xcnt = 0;
+	builder->committed.xcnt_space = 128;	/* arbitrary number */
+	builder->committed.xip =
+		palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
+	builder->committed.includes_all_transactions = true;
+ builder->initial_xmin_horizon = xmin_horizon;
+ builder->transactions_after = start_lsn;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return builder;
+}
+
+/*
+ * Free a snapshot builder.
+ */
+void
+FreeSnapshotBuilder(SnapBuild *builder)
+{
+ MemoryContext context = builder->context;
+
+	/* free the snapshot explicitly; that path contains some error checking */
+ if (builder->snapshot != NULL)
+ {
+ SnapBuildSnapDecRefcount(builder->snapshot);
+ builder->snapshot = NULL;
+ }
+
+ /* other resources are deallocated via memory context reset */
+ MemoryContextDelete(context);
+}
+
+/*
+ * Free an unreferenced snapshot that has previously been built by us.
+ */
+static void
+SnapBuildFreeSnapshot(Snapshot snap)
+{
+ /* make sure we don't get passed an external snapshot */
+ Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC);
+
+ /* make sure nobody modified our snapshot */
+ Assert(snap->curcid == FirstCommandId);
+ Assert(!snap->suboverflowed);
+ Assert(!snap->takenDuringRecovery);
+ Assert(snap->regd_count == 1);
+
+ /* slightly more likely, so it's checked even without c-asserts */
+ if (snap->copied)
+ elog(ERROR, "cannot free a copied snapshot");
+
+ if (snap->active_count)
+ elog(ERROR, "cannot free an active snapshot");
+
+ pfree(snap);
+}
+
+/*
+ * In which state of snapshot building are we?
+ */
+SnapBuildState
+SnapBuildCurrentState(SnapBuild *builder)
+{
+ return builder->state;
+}
+
+/*
+ * Return true if the contents of the transaction ending at 'ptr' should be
+ * skipped rather than decoded.
+ */
+bool
+SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
+{
+ return ptr <= builder->transactions_after;
+}
+
+/*
+ * Increase refcount of a snapshot.
+ *
+ * This is used when handing out a snapshot to some external resource or when
+ * adding a Snapshot as builder->snapshot.
+ */
+static void
+SnapBuildSnapIncRefcount(Snapshot snap)
+{
+ snap->active_count++;
+}
+
+/*
+ * Decrease refcount of a snapshot and free if the refcount reaches zero.
+ *
+ * Externally visible, so that external resources that have been handed an
+ * IncRef'ed Snapshot can adjust its refcount easily.
+ */
+void
+SnapBuildSnapDecRefcount(Snapshot snap)
+{
+ /* make sure we don't get passed an external snapshot */
+ Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC);
+
+ /* make sure nobody modified our snapshot */
+ Assert(snap->curcid == FirstCommandId);
+ Assert(!snap->suboverflowed);
+ Assert(!snap->takenDuringRecovery);
+
+ Assert(snap->regd_count == 1);
+
+ Assert(snap->active_count);
+
+	/* slightly more likely, so it's checked even without c-asserts */
+ if (snap->copied)
+ elog(ERROR, "cannot free a copied snapshot");
+
+ snap->active_count--;
+ if (!snap->active_count)
+ SnapBuildFreeSnapshot(snap);
+}
+
+/*
+ * Build a new snapshot, based on currently committed catalog-modifying
+ * transactions.
+ *
+ * In-progress transactions with catalog access are *not* allowed to modify
+ * these snapshots; they have to copy them and fill in appropriate ->curcid
+ * and ->subxip/subxcnt values.
+ */
+static Snapshot
+SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid)
+{
+ Snapshot snapshot;
+ Size ssize;
+
+ Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
+
+ ssize = sizeof(SnapshotData)
+ + sizeof(TransactionId) * builder->committed.xcnt
+ + sizeof(TransactionId) * 1 /* toplevel xid */ ;
+
+ snapshot = MemoryContextAllocZero(builder->context, ssize);
+
+ snapshot->satisfies = HeapTupleSatisfiesHistoricMVCC;
+
+ /*
+ * We misuse the original meaning of SnapshotData's xip and subxip fields
+	 * to make them more fitting for our needs.
+ *
+ * In the 'xip' array we store transactions that have to be treated as
+ * committed. Since we will only ever look at tuples from transactions
+	 * that have modified the catalog it's more efficient to store those few
+ * that exist between xmin and xmax (frequently there are none).
+ *
+ * Snapshots that are used in transactions that have modified the catalog
+ * also use the 'subxip' array to store their toplevel xid and all the
+ * subtransaction xids so we can recognize when we need to treat rows as
+ * visible that are not in xip but still need to be visible. Subxip only
+ * gets filled when the transaction is copied into the context of a
+ * catalog modifying transaction since we otherwise share a snapshot
+ * between transactions. As long as a txn hasn't modified the catalog it
+ * doesn't need to treat any uncommitted rows as visible, so there is no
+ * need for those xids.
+ *
+ * Both arrays are qsort'ed so that we can use bsearch() on them.
+ */
+ Assert(TransactionIdIsNormal(builder->xmin));
+ Assert(TransactionIdIsNormal(builder->xmax));
+
+ snapshot->xmin = builder->xmin;
+ snapshot->xmax = builder->xmax;
+
+ /* store all transactions to be treated as committed by this snapshot */
+ snapshot->xip =
+ (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
+ snapshot->xcnt = builder->committed.xcnt;
+ memcpy(snapshot->xip,
+ builder->committed.xip,
+ builder->committed.xcnt * sizeof(TransactionId));
+
+ /* sort so we can bsearch() */
+ qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
+
+ /*
+ * Initially, subxip is empty, i.e. it's a snapshot to be used by
+ * transactions that don't modify the catalog. Will be filled by
+ * ReorderBufferCopySnap() if necessary.
+ */
+ snapshot->subxcnt = 0;
+ snapshot->subxip = NULL;
+
+ snapshot->suboverflowed = false;
+ snapshot->takenDuringRecovery = false;
+ snapshot->copied = false;
+ snapshot->curcid = FirstCommandId;
+ snapshot->active_count = 0;
+ snapshot->regd_count = 1; /* mark as registered so nobody frees it */
+
+ return snapshot;
+}
+
+/*
+ * Export a snapshot so it can be set in another session with SET TRANSACTION
+ * SNAPSHOT.
+ *
+ * For that we need to start a transaction in the current backend as the
+ * importing side checks whether the source transaction is still open to make
+ * sure the xmin horizon hasn't advanced since then.
+ *
+ * After that we convert a locally built snapshot into the normal variant
+ * understood by HeapTupleSatisfiesMVCC et al.
+ */
+const char *
+SnapBuildExportSnapshot(SnapBuild *builder)
+{
+ Snapshot snap;
+ char *snapname;
+ TransactionId xid;
+ TransactionId *newxip;
+ int newxcnt = 0;
+
+ if (builder->state != SNAPBUILD_CONSISTENT)
+ elog(ERROR, "cannot export a snapshot before reaching a consistent state");
+
+ if (!builder->committed.includes_all_transactions)
+ elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore");
+
+ /* so we don't overwrite the existing value */
+ if (TransactionIdIsValid(MyPgXact->xmin))
+ elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid");
+
+ if (IsTransactionOrTransactionBlock())
+ elog(ERROR, "cannot export a snapshot from within a transaction");
+
+ if (SavedResourceOwnerDuringExport)
+ elog(ERROR, "can only export one snapshot at a time");
+
+ SavedResourceOwnerDuringExport = CurrentResourceOwner;
+ ExportInProgress = true;
+
+ StartTransactionCommand();
+
+ Assert(!FirstSnapshotSet);
+
+	/* There doesn't seem to be a nice API to set these */
+ XactIsoLevel = XACT_REPEATABLE_READ;
+ XactReadOnly = true;
+
+ snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId());
+
+ /*
+ * We know that snap->xmin is alive, enforced by the logical xmin
+ * mechanism. Due to that we can do this without locks, we're only
+ * changing our own value.
+ */
+ MyPgXact->xmin = snap->xmin;
+
+ /* allocate in transaction context */
+ newxip = (TransactionId *)
+ palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
+
+ /*
+	 * snapbuild.c builds snapshots in an "inverted" manner, which means it
+ * stores committed transactions in ->xip, not ones in progress. Build a
+ * classical snapshot by marking all non-committed transactions as
+ * in-progress. This can be expensive.
+ */
+ for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
+ {
+ void *test;
+
+ /*
+ * Check whether transaction committed using the decoding snapshot
+ * meaning of ->xip.
+ */
+ test = bsearch(&xid, snap->xip, snap->xcnt,
+ sizeof(TransactionId), xidComparator);
+
+ if (test == NULL)
+ {
+ if (newxcnt >= GetMaxSnapshotXidCount())
+ elog(ERROR, "snapshot too large");
+
+ newxip[newxcnt++] = xid;
+ }
+
+ TransactionIdAdvance(xid);
+ }
+
+ snap->xcnt = newxcnt;
+ snap->xip = newxip;
+
+ /*
+ * now that we've built a plain snapshot, use the normal mechanisms for
+ * exporting it
+ */
+ snapname = ExportSnapshot(snap);
+
+ ereport(LOG,
+ (errmsg("exported logical decoding snapshot: \"%s\" with %u xids",
+ snapname, snap->xcnt)));
+ return snapname;
+}
+
+/*
+ * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
+ * any. Aborts the previously started transaction and resets the resource
+ * owner back to its original value.
+ */
+void
+SnapBuildClearExportedSnapshot(void)
+{
+	/* nothing exported, that's the usual case */
+ if (!ExportInProgress)
+ return;
+
+ if (!IsTransactionState())
+ elog(ERROR, "clearing exported snapshot in wrong transaction state");
+
+ /* make sure nothing could have ever happened */
+ AbortCurrentTransaction();
+
+ CurrentResourceOwner = SavedResourceOwnerDuringExport;
+ SavedResourceOwnerDuringExport = NULL;
+ ExportInProgress = false;
+}
+
+/*
+ * Handle the effects of a single heap change, appropriate to the current state
+ * of the snapshot builder, and return whether changes made at (xid, lsn) can
+ * be decoded.
+ */
+bool
+SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
+{
+ bool is_old_tx;
+
+ /*
+ * We can't handle data in transactions if we haven't built a snapshot
+ * yet, so don't store them.
+ */
+ if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
+ return false;
+
+ /*
+ * No point in keeping track of changes in transactions that we don't have
+ * enough information about to decode. This means that they started before
+ * we got into the SNAPBUILD_FULL_SNAPSHOT state.
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT &&
+ SnapBuildTxnIsRunning(builder, xid))
+ return false;
+
+ /*
+ * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
+ * be needed to decode the change we're currently processing.
+ */
+ is_old_tx = ReorderBufferIsXidKnown(builder->reorder, xid);
+
+ if (!is_old_tx || !ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
+ {
+ /* only build a new snapshot if we don't have a prebuilt one */
+ if (builder->snapshot == NULL)
+ {
+ builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
+			/* increase refcount for the snapshot builder */
+ SnapBuildSnapIncRefcount(builder->snapshot);
+ }
+
+ /*
+ * Increase refcount for the transaction we're handing the snapshot
+ * out to.
+ */
+ SnapBuildSnapIncRefcount(builder->snapshot);
+ ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
+ builder->snapshot);
+ }
+
+ return true;
+}
+
+/*
+ * Do CommandId/ComboCid handling after reading an xl_heap_new_cid record. This
+ * implies that a transaction has done some form of write to system catalogs.
+ */
+void
+SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
+ XLogRecPtr lsn, xl_heap_new_cid *xlrec)
+{
+ CommandId cid;
+
+ /*
+ * we only log new_cid's if a catalog tuple was modified, so mark
+ * the transaction as containing catalog modifications
+ */
+	ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
+
+ ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
+ xlrec->target.node, xlrec->target.tid,
+ xlrec->cmin, xlrec->cmax,
+ xlrec->combocid);
+
+ /* figure out new command id */
+ if (xlrec->cmin != InvalidCommandId &&
+ xlrec->cmax != InvalidCommandId)
+ cid = Max(xlrec->cmin, xlrec->cmax);
+ else if (xlrec->cmax != InvalidCommandId)
+ cid = xlrec->cmax;
+ else if (xlrec->cmin != InvalidCommandId)
+ cid = xlrec->cmin;
+ else
+ {
+ cid = InvalidCommandId; /* silence compiler */
+ elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
+ }
+
+ ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
+}
+
+/*
+ * Check whether `xid` is currently 'running'.
+ *
+ * Running transactions in our parlance are transactions which we didn't
+ * observe from the start so we can't properly decode their contents. They
+ * only exist after we freshly started from an < CONSISTENT snapshot.
+ */
+static bool
+SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid)
+{
+ Assert(builder->state < SNAPBUILD_CONSISTENT);
+ Assert(TransactionIdIsNormal(builder->running.xmin));
+ Assert(TransactionIdIsNormal(builder->running.xmax));
+
+ if (builder->running.xcnt &&
+ NormalTransactionIdFollows(xid, builder->running.xmin) &&
+ NormalTransactionIdPrecedes(xid, builder->running.xmax))
+ {
+ TransactionId *search =
+ bsearch(&xid, builder->running.xip, builder->running.xcnt_space,
+ sizeof(TransactionId), xidComparator);
+
+ if (search != NULL)
+ {
+ Assert(*search == xid);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * Add a new Snapshot to all transactions we're decoding that currently are
+ * in-progress so they can see new catalog contents made by the transaction
+ * that just committed. This is necessary because those in-progress
+ * transactions will use the new catalog's contents from here on (at the very
+ * least everything they do needs to be compatible with newer catalog
+ * contents).
+ */
+static void
+SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
+{
+ dlist_iter txn_i;
+ ReorderBufferTXN *txn;
+
+ /*
+ * Iterate through all toplevel transactions. This can include
+ * subtransactions which we just don't yet know to be that, but that's
+	 * fine, they will just get an unnecessary snapshot queued.
+ */
+ dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
+ {
+ txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
+
+ Assert(TransactionIdIsValid(txn->xid));
+
+ /*
+ * If we don't have a base snapshot yet, there are no changes in this
+ * transaction which in turn implies we don't yet need a snapshot at
+		 * all. We'll add a snapshot when the first change gets queued.
+ *
+ * NB: This works correctly even for subtransactions because
+ * ReorderBufferCommitChild() takes care to pass the parent the base
+ * snapshot, and while iterating the changequeue we'll get the change
+ * from the subtxn.
+ */
+ if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
+ continue;
+
+ elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
+ txn->xid, (uint32) (lsn >> 32), (uint32) lsn);
+
+ /*
+ * increase the snapshot's refcount for the transaction we are handing
+ * it out to
+ */
+ SnapBuildSnapIncRefcount(builder->snapshot);
+ ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
+ builder->snapshot);
+ }
+}
+
+/*
+ * Keep track of a new catalog changing transaction that has committed.
+ */
+static void
+SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
+{
+ Assert(TransactionIdIsValid(xid));
+
+ if (builder->committed.xcnt == builder->committed.xcnt_space)
+ {
+ builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
+
+ elog(DEBUG1, "increasing space for committed transactions to %u",
+ (uint32) builder->committed.xcnt_space);
+
+ builder->committed.xip = repalloc(builder->committed.xip,
+ builder->committed.xcnt_space * sizeof(TransactionId));
+ }
+
+ /*
+ * TODO: It might make sense to keep the array sorted here instead of
+ * doing it every time we build a new snapshot. On the other hand this
+ * gets called repeatedly when a transaction with subtransactions commits.
+ */
+ builder->committed.xip[builder->committed.xcnt++] = xid;
+}
+
+/*
+ * Remove knowledge about transactions we treat as committed that are smaller
+ * than ->xmin. Those won't ever get checked via the ->committed array but via
+ * the clog machinery, so we don't need to waste memory on them.
+ */
+static void
+SnapBuildPurgeCommittedTxn(SnapBuild *builder)
+{
+ int off;
+ TransactionId *workspace;
+ int surviving_xids = 0;
+
+ /* not ready yet */
+ if (!TransactionIdIsNormal(builder->xmin))
+ return;
+
+ /* TODO: Neater algorithm than just copying and iterating? */
+ workspace =
+ MemoryContextAlloc(builder->context,
+ builder->committed.xcnt * sizeof(TransactionId));
+
+ /* copy xids that still are interesting to workspace */
+ for (off = 0; off < builder->committed.xcnt; off++)
+ {
+ if (NormalTransactionIdPrecedes(builder->committed.xip[off],
+ builder->xmin))
+ ; /* remove */
+ else
+ workspace[surviving_xids++] = builder->committed.xip[off];
+ }
+
+ /* copy workspace back to persistent state */
+ memcpy(builder->committed.xip, workspace,
+ surviving_xids * sizeof(TransactionId));
+
+ elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
+ (uint32) builder->committed.xcnt, (uint32) surviving_xids,
+ builder->xmin, builder->xmax);
+ builder->committed.xcnt = surviving_xids;
+
+ pfree(workspace);
+}
+
+/*
+ * Common logic for SnapBuildAbortTxn and SnapBuildCommitTxn dealing with
+ * keeping track of the number of running transactions.
+ */
+static void
+SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid)
+{
+ if (builder->state == SNAPBUILD_CONSISTENT)
+ return;
+
+ /*
+ * NB: This handles subtransactions correctly even if we started from
+ * suboverflowed xl_running_xacts because we only keep track of toplevel
+	 * transactions. Since the latter are always allocated before their
+ * subxids and since they end at the same time it's sufficient to deal
+ * with them here.
+ */
+ if (SnapBuildTxnIsRunning(builder, xid))
+ {
+ Assert(builder->running.xcnt > 0);
+
+ if (!--builder->running.xcnt)
+ {
+ /*
+			 * None of the originally running transactions is running anymore,
+			 * so our incrementally built snapshot now is consistent.
+ */
+ ereport(LOG,
+ (errmsg("logical decoding found consistent point at %X/%X",
+ (uint32)(lsn >> 32), (uint32)lsn),
+ errdetail("xid %u finished, no running transactions anymore",
+ xid)));
+ builder->state = SNAPBUILD_CONSISTENT;
+ }
+ }
+}
+
+/*
+ * Abort a transaction, throw away all state we kept.
+ */
+void
+SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn,
+ TransactionId xid,
+ int nsubxacts, TransactionId *subxacts)
+{
+ int i;
+
+ for (i = 0; i < nsubxacts; i++)
+ {
+ TransactionId subxid = subxacts[i];
+
+ SnapBuildEndTxn(builder, lsn, subxid);
+ }
+
+ SnapBuildEndTxn(builder, lsn, xid);
+}
+
+/*
+ * Handle everything that needs to be done when a transaction commits
+ */
+void
+SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
+ int nsubxacts, TransactionId *subxacts)
+{
+ int nxact;
+
+ bool forced_timetravel = false;
+ bool sub_needs_timetravel = false;
+ bool top_needs_timetravel = false;
+
+ TransactionId xmax = xid;
+
+ /*
+ * If we couldn't observe every change of a transaction because it was
+	 * already running at the point we started to observe, we have to assume it
+ * made catalog changes.
+ *
+	 * This has the benefit that we afterwards have enough
+ * information to build an exportable snapshot that's usable by pg_dump et
+ * al.
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ {
+ /* ensure that only commits after this are getting replayed */
+ if (builder->transactions_after < lsn)
+ builder->transactions_after = lsn;
+
+ /*
+ * We could avoid treating !SnapBuildTxnIsRunning transactions as
+ * timetravel ones, but we want to be able to export a snapshot when
+ * we reached consistency.
+ */
+ forced_timetravel = true;
+		elog(DEBUG1, "forced to assume catalog changes for xid %u because it was running too early", xid);
+ }
+
+ for (nxact = 0; nxact < nsubxacts; nxact++)
+ {
+ TransactionId subxid = subxacts[nxact];
+
+ /*
+		 * make sure txn is not tracked in running txns anymore, switch state
+ */
+ SnapBuildEndTxn(builder, lsn, subxid);
+
+ /*
+ * If we're forcing timetravel we also need visibility information
+		 * about subtransactions, so keep track of the subtransactions' state.
+ */
+ if (forced_timetravel)
+ {
+ SnapBuildAddCommittedTxn(builder, subxid);
+ if (NormalTransactionIdFollows(subxid, xmax))
+ xmax = subxid;
+ }
+
+ /*
+ * Add subtransaction to base snapshot if it has done DDL; we don't
+ * distinguish it from toplevel transactions there.
+ */
+ else if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid))
+ {
+ sub_needs_timetravel = true;
+
+ elog(DEBUG1, "found subtransaction %u:%u with catalog changes.",
+ xid, subxid);
+
+ SnapBuildAddCommittedTxn(builder, subxid);
+
+ if (NormalTransactionIdFollows(subxid, xmax))
+ xmax = subxid;
+ }
+ }
+
+ /*
+ * Make sure toplevel txn is not tracked in running txns anymore, switch
+ * state to consistent if possible.
+ */
+ SnapBuildEndTxn(builder, lsn, xid);
+
+ if (forced_timetravel)
+ {
+ elog(DEBUG2, "forced transaction %u to do timetravel.", xid);
+
+ SnapBuildAddCommittedTxn(builder, xid);
+ }
+ /* add toplevel transaction to base snapshot */
+ else if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
+ {
+ elog(DEBUG2, "found top level transaction %u, with catalog changes!",
+ xid);
+
+ top_needs_timetravel = true;
+ SnapBuildAddCommittedTxn(builder, xid);
+ }
+ else if (sub_needs_timetravel)
+ {
+ /* mark toplevel txn as timetravel as well */
+ SnapBuildAddCommittedTxn(builder, xid);
+ }
+
+ /* if there's any reason to build a historic snapshot, do so now */
+ if (forced_timetravel || top_needs_timetravel || sub_needs_timetravel)
+ {
+ /*
+ * Adjust xmax of the snapshot builder; we only do that for committed,
+ * catalog-modifying transactions. Everything else isn't interesting
+ * for us since we'll never look at the respective rows.
+ */
+ if (!TransactionIdIsValid(builder->xmax) ||
+ TransactionIdFollowsOrEquals(xmax, builder->xmax))
+ {
+ builder->xmax = xmax;
+ TransactionIdAdvance(builder->xmax);
+ }
+
+ /*
+ * If we haven't built a complete snapshot yet there's no need to hand
+ * it out, it wouldn't (and couldn't) be used anyway.
+ */
+ if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
+ return;
+
+ /*
+ * Decrease the snapshot builder's refcount of the old snapshot, note
+ * that it still will be used if it has been handed out to the
+ * reorderbuffer earlier.
+ */
+ if (builder->snapshot)
+ SnapBuildSnapDecRefcount(builder->snapshot);
+
+ builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
+
+ /* we might need to execute invalidations, add snapshot */
+ if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
+ {
+ SnapBuildSnapIncRefcount(builder->snapshot);
+ ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
+ builder->snapshot);
+ }
+
+ /* refcount of the snapshot builder for the new snapshot */
+ SnapBuildSnapIncRefcount(builder->snapshot);
+
+ /* add a new SnapshotNow to all currently running transactions */
+ SnapBuildDistributeNewCatalogSnapshot(builder, lsn);
+ }
+ else
+ {
+ /* record that we cannot export a general snapshot anymore */
+ builder->committed.includes_all_transactions = false;
+ }
+}
+
+
+/* -----------------------------------
+ * Snapshot building functions dealing with xlog records
+ * -----------------------------------
+ */
+
+/*
+ * Process a running xacts record, and use its information to first build a
+ * historic snapshot and later to release resources that aren't needed
+ * anymore.
+ */
+void
+SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
+{
+ ReorderBufferTXN *txn;
+
+ /*
+ * If we're not consistent yet, inspect the record to see whether it
+ * allows to get closer to being consistent. If we are consistent, dump
+ * our snapshot so others or we, after a restart, can use it.
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ {
+ /* returns false if there's no point in performing cleanup just yet */
+ if (!SnapBuildFindSnapshot(builder, lsn, running))
+ return;
+ }
+ else
+ SnapBuildSerialize(builder, lsn);
+
+ /*
+ * Update range of interesting xids based on the running xacts
+ * information. We don't increase ->xmax using it, because once we are in
+ * a consistent state we can do that ourselves and much more efficiently
+ * so, because we only need to do it for catalog transactions since we
+ * only ever look at those.
+ *
+ * NB: Because of that, xmax can be lower than xmin, since we only
+ * increase xmax when a catalog modifying transaction commits. While odd
+ * looking, it's correct and actually more efficient this way since we hit
+ * fast paths in tqual.c.
+ */
+ builder->xmin = running->oldestRunningXid;
+
+ /* Remove transactions we don't need to keep track of anymore */
+ SnapBuildPurgeCommittedTxn(builder);
+
+ elog(DEBUG3, "xmin: %u, xmax: %u, oldestrunning: %u",
+ builder->xmin, builder->xmax,
+ running->oldestRunningXid);
+
+ /*
+ * Increase shared memory limits, so vacuum can work on tuples we prevented
+ * from being pruned till now.
+ */
+ LogicalIncreaseXminForSlot(lsn, running->oldestRunningXid);
+
+ /*
+ * Also tell the slot where we can restart decoding from. We don't want to
+ * do that after every commit because changing that implies an fsync of
+ * the logical slot's state file, so we only do it every time we see a
+ * running xacts record.
+ *
+ * Do so by looking for the oldest in progress transaction (determined by
+ * the first LSN of any of its relevant records). Every transaction
+ * remembers the last location we stored the snapshot to disk before its
+ * beginning. That point is where we can restart from.
+ */
+
+ /*
+ * Can't know about a serialized snapshot's location if we're not
+ * consistent.
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ return;
+
+ txn = ReorderBufferGetOldestTXN(builder->reorder);
+
+ /*
+ * oldest ongoing txn might have started when we didn't yet serialize
+ * anything because we hadn't reached a consistent state yet.
+ */
+ if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
+ LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
+ /*
+ * No in-progress transaction, can reuse the last serialized snapshot if
+ * we have one.
+ */
+ else if (txn == NULL &&
+ builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
+ builder->last_serialized_snapshot != InvalidXLogRecPtr)
+ LogicalIncreaseRestartDecodingForSlot(lsn,
+ builder->last_serialized_snapshot);
+}
+
+
+/*
+ * Build the start of a snapshot that's capable of decoding the catalog.
+ *
+ * Helper function for SnapBuildProcessRunningXacts() while we're not yet
+ * consistent.
+ *
+ * Returns true if there is a point in performing internal maintenance/cleanup
+ * using the xl_running_xacts record.
+ */
+static bool
+SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
+{
+ /* ---
+ * Build catalog decoding snapshot incrementally using information about
+ * the currently running transactions. There are several ways to do that:
+ *
+ * a) There were no running transactions when the xl_running_xacts record
+ * was inserted, jump to CONSISTENT immediately. We might find such a
+ * state while waiting for b) or c).
+ *
+ * b) Wait for all toplevel transactions that were running to end. We
+ * simply track the number of in-progress toplevel transactions and
+ * lower it whenever one commits or aborts. When that number
+ * (builder->running.xcnt) reaches zero, we can go from FULL_SNAPSHOT
+ * to CONSISTENT.
+ * NB: We need to search running.xip when seeing a transaction's end to
+ * make sure it's a toplevel transaction and it's been one of the
+ * initially running ones.
+ * Interestingly, in contrast to HS, this allows us not to care about
+ * subtransactions - and by extension suboverflowed xl_running_xacts -
+ * at all.
+ *
+ * c) This (in a previous run) or another decoding slot serialized a
+ * snapshot to disk that we can use.
+ * ---
+ */
+
+ /*
+ * xl_running_xacts record is older than what we can use, we might not have
+ * all necessary catalog rows anymore.
+ */
+ if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
+ NormalTransactionIdPrecedes(running->oldestRunningXid,
+ builder->initial_xmin_horizon))
+ {
+ ereport(DEBUG1,
+ (errmsg("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
+ (uint32) (lsn >> 32), (uint32) lsn),
+ errdetail("initial xmin horizon of %u vs the snapshot's %u",
+ builder->initial_xmin_horizon, running->oldestRunningXid)));
+ return true;
+ }
+
+ /*
+ * a) No transactions were running, we can jump to consistent.
+ *
+ * NB: We might have already started to incrementally assemble a snapshot,
+ * so we need to be careful to deal with that.
+ */
+ if (running->xcnt == 0)
+ {
+ if (builder->transactions_after == InvalidXLogRecPtr ||
+ builder->transactions_after < lsn)
+ builder->transactions_after = lsn;
+
+ builder->xmin = running->oldestRunningXid;
+ builder->xmax = running->latestCompletedXid;
+ TransactionIdAdvance(builder->xmax);
+
+ Assert(TransactionIdIsNormal(builder->xmin));
+ Assert(TransactionIdIsNormal(builder->xmax));
+
+ /* no transactions running now */
+ builder->running.xcnt = 0;
+ builder->running.xmin = InvalidTransactionId;
+ builder->running.xmax = InvalidTransactionId;
+
+ builder->state = SNAPBUILD_CONSISTENT;
+
+ ereport(LOG,
+ (errmsg("logical decoding found consistent point at %X/%X",
+ (uint32)(lsn >> 32), (uint32)lsn),
+ errdetail("running xacts with xcnt == 0")));
+
+ return false;
+ }
+ /* c) valid on disk state */
+ else if (SnapBuildRestore(builder, lsn))
+ {
+ /* there won't be any state to cleanup */
+ return false;
+ }
+ /*
+ * b) first encounter of a useable xl_running_xacts record. If we had
+ * found one earlier we would either track running transactions
+ * (i.e. builder->running.xcnt != 0) or be consistent (this function
+ * wouldn't get called).
+ */
+ else if (!builder->running.xcnt)
+ {
+ int off;
+
+ /*
+ * We only care about toplevel xids as those are the ones we
+ * definitely see in the wal stream. As snapbuild.c tracks committed
+ * instead of running transactions we don't need to know anything
+ * about uncommitted subtransactions.
+ */
+ builder->xmin = running->oldestRunningXid;
+ builder->xmax = running->latestCompletedXid;
+ TransactionIdAdvance(builder->xmax);
+
+ /* so we can safely use the faster comparisons */
+ Assert(TransactionIdIsNormal(builder->xmin));
+ Assert(TransactionIdIsNormal(builder->xmax));
+
+ builder->running.xcnt = running->xcnt;
+ builder->running.xcnt_space = running->xcnt;
+ builder->running.xip =
+ MemoryContextAlloc(builder->context,
+ builder->running.xcnt * sizeof(TransactionId));
+ memcpy(builder->running.xip, running->xids,
+ builder->running.xcnt * sizeof(TransactionId));
+
+ /* sort so we can do a binary search */
+ qsort(builder->running.xip, builder->running.xcnt,
+ sizeof(TransactionId), xidComparator);
+
+ builder->running.xmin = builder->running.xip[0];
+ builder->running.xmax = builder->running.xip[running->xcnt - 1];
+
+ /* makes comparisons cheaper later */
+ TransactionIdRetreat(builder->running.xmin);
+ TransactionIdAdvance(builder->running.xmax);
+
+ builder->state = SNAPBUILD_FULL_SNAPSHOT;
+
+ ereport(LOG,
+ (errmsg("logical decoding found initial starting point at %X/%X",
+ (uint32)(lsn >> 32), (uint32)lsn),
+ errdetail("%u xacts need to finish", (uint32) builder->running.xcnt)));
+
+ /*
+ * Iterate through all xids, wait for them to finish.
+ *
+ * This isn't required for the correctness of decoding, but to allow
+ * isolationtester to notice that we're currently waiting for
+ * something.
+ */
+ for(off = 0; off < builder->running.xcnt; off++)
+ {
+ TransactionId xid = builder->running.xip[off];
+
+ /*
+ * Upper layers should prevent that we ever need to wait on
+ * ourselves. Check anyway, since failing to do so would either
+ * result in an endless wait or an Assert() failure.
+ */
+ if (TransactionIdIsCurrentTransactionId(xid))
+ elog(ERROR, "waiting for ourselves");
+
+ XactLockTableWait(xid);
+ }
+
+ /* nothing could have built up so far, so don't perform cleanup */
+ return false;
+ }
+
+ /*
+ * We already started to track running xacts and need to wait for all
+ * in-progress ones to finish. We fall through to the normal processing of
+ * records so incremental cleanup can be performed.
+ */
+ return true;
+}
+
+
+/* -----------------------------------
+ * Snapshot serialization support
+ * -----------------------------------
+ */
+
+/*
+ * We store current state of struct SnapBuild on disk in the following manner:
+ *
+ * struct SnapBuildOnDisk;
+ * TransactionId * running.xcnt_space;
+ * TransactionId * committed.xcnt; (*not xcnt_space*)
+ *
+ */
+typedef struct SnapBuildOnDisk
+{
+ /* first part of this struct needs to be version independent */
+
+ /* data not covered by checksum */
+ uint32 magic;
+ pg_crc32 checksum;
+
+ /* data covered by checksum */
+
+ /* version, in case we want to support pg_upgrade */
+ uint32 version;
+ /* how large is the on-disk data, excluding the constant-sized part */
+ uint32 length;
+
+ /* version dependent part */
+ SnapBuild builder;
+
+ /* variable amount of TransactionIds follows */
+} SnapBuildOnDisk;
+
+#define SnapBuildOnDiskConstantSize \
+ offsetof(SnapBuildOnDisk, builder)
+#define SnapBuildOnDiskNotChecksummedSize \
+ offsetof(SnapBuildOnDisk, version)
+
+#define SNAPBUILD_MAGIC 0x51A1E001
+#define SNAPBUILD_VERSION 1
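(Illustrative sketch, not part of the patch: the total size of a serialized snapshot follows directly from the layout comment above, i.e. the constant-sized header plus the two TransactionId arrays. The helper name is hypothetical; it simply mirrors the needed_length computation in SnapBuildSerialize() below.)

	/* sketch only: total on-disk size of a serialized snapshot */
	static Size
	snapbuild_ondisk_total_size(const SnapBuild *builder)
	{
		return sizeof(SnapBuildOnDisk) +
			sizeof(TransactionId) * builder->running.xcnt_space +
			sizeof(TransactionId) * builder->committed.xcnt;
	}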
+
+/*
+ * Store/Load a snapshot from disk, depending on the snapshot builder's state.
+ *
+ * Supposed to be used by external (i.e. not snapbuild.c) code that just read a
+ * record that's a potential location for a serialized snapshot.
+ */
+void
+SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
+{
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ SnapBuildRestore(builder, lsn);
+ else
+ SnapBuildSerialize(builder, lsn);
+}
+
+/*
+ * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
+ * been done by another decoding process.
+ */
+static void
+SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
+{
+ Size needed_length;
+ SnapBuildOnDisk *ondisk;
+ char *ondisk_c;
+ int fd;
+ char tmppath[MAXPGPATH];
+ char path[MAXPGPATH];
+ int ret;
+ struct stat stat_buf;
+ uint32 sz;
+
+ Assert(lsn != InvalidXLogRecPtr);
+ Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr ||
+ builder->last_serialized_snapshot <= lsn);
+
+ /*
+ * no point in serializing if we cannot continue to work immediately after
+ * restoring the snapshot
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ return;
+
+ /*
+ * We identify snapshots by the LSN they are valid for. We don't need to
+ * include timelines in the name as each LSN maps to exactly one timeline
+ * unless the user used pg_resetxlog or similar. If a user did so, there's
+ * no hope of continuing to decode anyway.
+ */
+ sprintf(path, "pg_llog/snapshots/%X-%X.snap",
+ (uint32) (lsn >> 32), (uint32) lsn);
+
+ /*
+ * first check whether some other backend already has written the snapshot
+ * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
+ * as a valid state. Everything else is an unexpected error.
+ */
+ ret = stat(path, &stat_buf);
+
+ if (ret != 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errmsg("could not stat file \"%s\": %m", path)));
+
+ else if (ret == 0)
+ {
+ /*
+ * somebody else has already serialized to this point, don't overwrite
+ * but remember location, so we don't need to read old data again.
+ *
+ * To be sure it has been synced to disk after the rename() from the
+ * tempfile filename to the real filename, we just repeat the
+ * fsync. That ought to be cheap because in most scenarios it should
+ * already be safely on disk.
+ */
+ fsync_fname(path, false);
+ fsync_fname("pg_llog/snapshots", true);
+
+ builder->last_serialized_snapshot = lsn;
+ goto out;
+ }
+
+ /*
+ * there is an obvious race condition here between the time we stat(2) the
+ * file and us writing the file. But we rename the file into place
+ * atomically and all files created need to contain the same data anyway,
+ * so this is perfectly fine, although a bit of a resource waste. Locking
+ * seems like pointless complication.
+ */
+ elog(DEBUG1, "serializing snapshot to %s", path);
+
+ /* to make sure only we will write to this tempfile, include pid */
+ sprintf(tmppath, "pg_llog/snapshots/%X-%X.snap.%u.tmp",
+ (uint32) (lsn >> 32), (uint32) lsn, MyProcPid);
+
+ /*
+ * Unlink temporary file if it already exists; it must be left over from a
+ * crash/error since we won't enter this function twice from within a
+ * single decoding slot/backend and the temporary file contains the pid of
+ * the current process.
+ */
+ if (unlink(tmppath) != 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m", path)));
+
+ needed_length = sizeof(SnapBuildOnDisk) +
+ sizeof(TransactionId) * builder->running.xcnt_space +
+ sizeof(TransactionId) * builder->committed.xcnt;
+
+ ondisk_c = MemoryContextAllocZero(builder->context, needed_length);
+ ondisk = (SnapBuildOnDisk *) ondisk_c;
+ ondisk->magic = SNAPBUILD_MAGIC;
+ ondisk->version = SNAPBUILD_VERSION;
+ ondisk->length = needed_length;
+ INIT_CRC32(ondisk->checksum);
+ COMP_CRC32(ondisk->checksum,
+ ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
+ SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
+ ondisk_c += sizeof(SnapBuildOnDisk);
+
+ memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
+ /* NULL-ify memory-only data */
+ ondisk->builder.context = NULL;
+ ondisk->builder.snapshot = NULL;
+ ondisk->builder.reorder = NULL;
+ ondisk->builder.running.xip = NULL;
+ ondisk->builder.committed.xip = NULL;
+
+ COMP_CRC32(ondisk->checksum,
+ &ondisk->builder,
+ sizeof(SnapBuild));
+
+ /* copy running xacts */
+ sz = sizeof(TransactionId) * builder->running.xcnt_space;
+ memcpy(ondisk_c, builder->running.xip, sz);
+ COMP_CRC32(ondisk->checksum, ondisk_c, sz);
+ ondisk_c += sz;
+
+ /* copy committed xacts */
+ sz = sizeof(TransactionId) * builder->committed.xcnt;
+ memcpy(ondisk_c, builder->committed.xip, sz);
+ COMP_CRC32(ondisk->checksum, ondisk_c, sz);
+ ondisk_c += sz;
+
+ /* we have valid data now, open tempfile and write it there */
+ fd = OpenTransientFile(tmppath,
+ O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ ereport(ERROR,
+ (errmsg("could not open file \"%s\": %m", path)));
+
+ if ((write(fd, ondisk, needed_length)) != needed_length)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmppath)));
+ }
+
+ /*
+ * fsync the file before renaming so that even if we crash after this we
+ * have either a fully valid file or nothing.
+ *
+ * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
+ * some noticeable overhead since it's performed synchronously during
+ * decoding?
+ */
+ if (pg_fsync(fd) != 0)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", tmppath)));
+ }
+ CloseTransientFile(fd);
+
+ fsync_fname("pg_llog/snapshots", true);
+
+ /*
+ * We may overwrite the work from some other backend, but that's ok, our
+ * snapshot is valid as well, we'll just have done some superfluous work.
+ */
+ if (rename(tmppath, path) != 0)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ tmppath, path)));
+ }
+
+ /* make sure we persist */
+ fsync_fname(path, false);
+ fsync_fname("pg_llog/snapshots", true);
+
+ /*
+ * Now there's no way we can lose the dumped state anymore, remember
+ * this as a serialization point.
+ */
+ builder->last_serialized_snapshot = lsn;
+
+out:
+ ReorderBufferSetRestartPoint(builder->reorder,
+ builder->last_serialized_snapshot);
+}
+
+/*
+ * Restore a snapshot into 'builder' if previously one has been stored at the
+ * location indicated by 'lsn'. Returns true if successful, false otherwise.
+ */
+static bool
+SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
+{
+ SnapBuildOnDisk ondisk;
+ int fd;
+ char path[MAXPGPATH];
+ Size sz;
+ int readBytes;
+ pg_crc32 checksum;
+
+ /* no point in loading a snapshot if we're already there */
+ if (builder->state == SNAPBUILD_CONSISTENT)
+ return false;
+
+ sprintf(path, "pg_llog/snapshots/%X-%X.snap",
+ (uint32) (lsn >> 32), (uint32) lsn);
+
+ fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+
+ if (fd < 0 && errno == ENOENT)
+ return false;
+ else if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+
+ /* ----
+ * Make sure the snapshot had been stored safely to disk, that's normally
+ * cheap.
+ * Note that we do not need PANIC here, nobody will be able to use the
+ * slot without fsyncing, and saving it won't succeed without an fsync()
+ * either...
+ * ----
+ */
+ fsync_fname(path, false);
+ fsync_fname("pg_llog/snapshots", true);
+
+
+ /* read statically sized portion of snapshot */
+ readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize);
+ if (readBytes != SnapBuildOnDiskConstantSize)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d of %d: %m",
+ path, readBytes, (int) SnapBuildOnDiskConstantSize)));
+ }
+
+ if (ondisk.magic != SNAPBUILD_MAGIC)
+ ereport(ERROR,
+ (errmsg("snapbuild state file \"%s\" has wrong magic %u instead of %u",
+ path, ondisk.magic, SNAPBUILD_MAGIC)));
+
+ if (ondisk.version != SNAPBUILD_VERSION)
+ ereport(ERROR,
+ (errmsg("snapbuild state file \"%s\" has unsupported version %u instead of %u",
+ path, ondisk.version, SNAPBUILD_VERSION)));
+
+ INIT_CRC32(checksum);
+ COMP_CRC32(checksum,
+ ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize,
+ SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
+
+ /* read SnapBuild */
+ readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild));
+ if (readBytes != sizeof(SnapBuild))
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d of %d: %m",
+ path, readBytes, (int) sizeof(SnapBuild))));
+ }
+ COMP_CRC32(checksum, &ondisk.builder, sizeof(SnapBuild));
+
+ /* restore running xacts information */
+ sz = sizeof(TransactionId) * ondisk.builder.running.xcnt_space;
+ ondisk.builder.running.xip = MemoryContextAlloc(builder->context, sz);
+ readBytes = read(fd, ondisk.builder.running.xip, sz);
+ if (readBytes != sz)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d of %d: %m",
+ path, readBytes, (int) sz)));
+ }
+ COMP_CRC32(checksum, ondisk.builder.running.xip, sz);
+
+ /* restore committed xacts information */
+ sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
+ ondisk.builder.committed.xip = MemoryContextAlloc(builder->context, sz);
+ readBytes = read(fd, ondisk.builder.committed.xip, sz);
+ if (readBytes != sz)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d of %d: %m",
+ path, readBytes, (int) sz)));
+ }
+ COMP_CRC32(checksum, ondisk.builder.committed.xip, sz);
+
+ CloseTransientFile(fd);
+
+ /* verify checksum of what we've read */
+ if (!EQ_CRC32(checksum, ondisk.checksum))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("snapbuild state file %s: checksum mismatch, is %u, should be %u",
+ path, checksum, ondisk.checksum)));
+
+ /*
+ * ok, we now have a sensible snapshot here, figure out if it has more
+ * information than we have.
+ */
+
+ /*
+ * We are only interested in consistent snapshots for now, comparing
+ * whether one incomplete snapshot is more "advanced" seems to be
+ * unnecessarily complex.
+ */
+ if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
+ goto snapshot_not_interesting;
+
+ /*
+ * Don't use a snapshot that requires an xmin that we cannot guarantee to
+ * be available.
+ */
+ if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon))
+ goto snapshot_not_interesting;
+
+
+ /* ok, we think the snapshot is sensible, copy over everything important */
+ builder->xmin = ondisk.builder.xmin;
+ builder->xmax = ondisk.builder.xmax;
+ builder->state = ondisk.builder.state;
+
+ builder->committed.xcnt = ondisk.builder.committed.xcnt;
+ /* We only allocated/stored xcnt, not xcnt_space xids ! */
+ /* don't overwrite preallocated xip, if we don't have anything here */
+ if (builder->committed.xcnt > 0)
+ {
+ pfree(builder->committed.xip);
+ builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
+ builder->committed.xip = ondisk.builder.committed.xip;
+ }
+ ondisk.builder.committed.xip = NULL;
+
+ builder->running.xcnt = ondisk.builder.running.xcnt;
+ if (builder->running.xip)
+ pfree(builder->running.xip);
+ builder->running.xcnt_space = ondisk.builder.running.xcnt_space;
+ builder->running.xip = ondisk.builder.running.xip;
+
+ /* our snapshot is not interesting anymore, build a new one */
+ if (builder->snapshot != NULL)
+ {
+ SnapBuildSnapDecRefcount(builder->snapshot);
+ }
+ builder->snapshot = SnapBuildBuildSnapshot(builder, InvalidTransactionId);
+ SnapBuildSnapIncRefcount(builder->snapshot);
+
+ ReorderBufferSetRestartPoint(builder->reorder, lsn);
+
+ Assert(builder->state == SNAPBUILD_CONSISTENT);
+
+ ereport(LOG,
+ (errmsg("logical decoding found consistent point at %X/%X",
+ (uint32)(lsn >> 32), (uint32)lsn),
+ errdetail("found initial snapshot in snapbuild file")));
+ return true;
+
+snapshot_not_interesting:
+ if (ondisk.builder.running.xip != NULL)
+ pfree(ondisk.builder.running.xip);
+ if (ondisk.builder.committed.xip != NULL)
+ pfree(ondisk.builder.committed.xip);
+ return false;
+}
+
+/*
+ * Remove all serialized snapshots that are not required anymore because no
+ * slot can need them. This doesn't actually have to run during a checkpoint,
+ * but it's a convenient point to schedule this.
+ *
+ * NB: We run this during checkpoints even if logical decoding is disabled so
+ * we clean up old serialized snapshots at some point after it got disabled.
+ */
+void
+CheckPointSnapBuild(void)
+{
+ XLogRecPtr cutoff;
+ XLogRecPtr redo;
+ DIR *snap_dir;
+ struct dirent *snap_de;
+ char path[MAXPGPATH];
+
+ /*
+ * We start off with a minimum of the last redo pointer. No new replication
+ * slot will start before that, so that's a safe upper bound for removal.
+ */
+ redo = GetRedoRecPtr();
+
+ /* now check for the restart ptrs from existing slots */
+ cutoff = ReplicationSlotsComputeLogicalRestartLSN();
+
+ /* don't remove anything newer than the redo pointer */
+ if (redo < cutoff)
+ cutoff = redo;
+
+ snap_dir = AllocateDir("pg_llog/snapshots");
+ while ((snap_de = ReadDir(snap_dir, "pg_llog/snapshots")) != NULL)
+ {
+ uint32 hi;
+ uint32 lo;
+ XLogRecPtr lsn;
+ struct stat statbuf;
+
+ if (strcmp(snap_de->d_name, ".") == 0 ||
+ strcmp(snap_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(path, MAXPGPATH, "pg_llog/snapshots/%s", snap_de->d_name);
+
+ if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+ {
+ elog(DEBUG1, "only regular files expected: %s", path);
+ continue;
+ }
+
+ /*
+ * temporary filenames from SnapBuildSerialize() include the LSN and
+ * everything but are postfixed by .$pid.tmp. We can just remove them
+ * the same as other files because there can be none that are currently
+ * being written that are older than cutoff.
+ *
+ * We just log a message if a file doesn't fit the pattern, it's
+ * probably some editor's lock/state file or similar...
+ */
+ if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
+ {
+ ereport(LOG,
+ (errmsg("could not parse filename \"%s\"", path)));
+ continue;
+ }
+
+ lsn = ((uint64) hi) << 32 | lo;
+
+ /* check whether we still need it */
+ if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
+ {
+ elog(DEBUG1, "removing snapbuild snapshot %s", path);
+
+ /*
+ * It's not particularly harmful, though strange, if we can't
+ * remove the file here. Don't prevent the checkpoint from
+ * completing, that'd be a cure worse than the disease.
+ */
+ if (unlink(path) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m",
+ path)));
+ continue;
+ }
+ }
+ }
+ FreeDir(snap_dir);
+}
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 826c7f027e5..45ed7e40e89 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -43,6 +43,7 @@
#include "miscadmin.h"
#include "replication/slot.h"
#include "storage/fd.h"
+#include "storage/proc.h"
#include "storage/procarray.h"
/*
@@ -82,6 +83,8 @@ ReplicationSlot *MyReplicationSlot = NULL;
/* GUCs */
int max_replication_slots = 0; /* the maximum number of replication slots */
+static void ReplicationSlotDropAcquired(void);
+
/* internal persistency functions */
static void RestoreSlotFromDisk(const char *name);
static void CreateSlotOnDisk(ReplicationSlot *slot);
@@ -190,11 +193,12 @@ ReplicationSlotValidateName(const char *name, int elevel)
* Create a new replication slot and mark it as used by this backend.
*
* name: Name of the slot
- * db_specific: changeset extraction is db specific, if the slot is going to
+ * db_specific: logical decoding is db specific; if the slot is going to
* be used for that pass true, otherwise false.
*/
void
-ReplicationSlotCreate(const char *name, bool db_specific)
+ReplicationSlotCreate(const char *name, bool db_specific,
+ ReplicationSlotPersistency persistency)
{
ReplicationSlot *slot = NULL;
int i;
@@ -246,6 +250,7 @@ ReplicationSlotCreate(const char *name, bool db_specific)
*/
Assert(!slot->in_use);
Assert(!slot->active);
+ slot->data.persistency = persistency;
slot->data.xmin = InvalidTransactionId;
slot->effective_xmin = InvalidTransactionId;
strncpy(NameStr(slot->data.name), name, NAMEDATALEN);
@@ -348,14 +353,30 @@ ReplicationSlotRelease(void)
Assert(slot != NULL && slot->active);
- /* Mark slot inactive. We're not freeing it, just disconnecting. */
+ if (slot->data.persistency == RS_EPHEMERAL)
+ {
+ /*
+ * Delete the slot. There is no !PANIC case where this is allowed to
+ * fail, all that may happen is an incomplete cleanup of the on-disk
+ * data.
+ */
+ ReplicationSlotDropAcquired();
+ }
+ else
{
+ /* Mark slot inactive. We're not freeing it, just disconnecting. */
volatile ReplicationSlot *vslot = slot;
SpinLockAcquire(&slot->mutex);
vslot->active = false;
SpinLockRelease(&slot->mutex);
- MyReplicationSlot = NULL;
}
+
+ MyReplicationSlot = NULL;
+
+ /* might not have been set if this was a physical slot */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ MyPgXact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING;
+ LWLockRelease(ProcArrayLock);
}
/*
@@ -364,52 +385,36 @@ ReplicationSlotRelease(void)
void
ReplicationSlotDrop(const char *name)
{
- ReplicationSlot *slot = NULL;
- int i;
- bool active;
+ Assert(MyReplicationSlot == NULL);
+
+ ReplicationSlotAcquire(name);
+
+ ReplicationSlotDropAcquired();
+}
+
+/*
+ * Permanently drop the currently acquired replication slot which will be
+ * released by the point this function returns.
+ */
+static void
+ReplicationSlotDropAcquired(void)
+{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
+ ReplicationSlot *slot = MyReplicationSlot;
- ReplicationSlotValidateName(name, ERROR);
+ Assert(MyReplicationSlot != NULL);
+
+ /* slot isn't acquired anymore */
+ MyReplicationSlot = NULL;
/*
- * If some other backend ran this code currently with us, we might both
- * try to free the same slot at the same time. Or we might try to delete
- * a slot with a certain name while someone else was trying to create a
- * slot with the same name.
+ * If some other backend ran this code concurrently with us, we might try
+ * to delete a slot with a certain name while someone else was trying to
+ * create a slot with the same name.
*/
LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
- /* Search for the named slot and mark it active if we find it. */
- LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
- for (i = 0; i < max_replication_slots; i++)
- {
- ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
-
- if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
- {
- volatile ReplicationSlot *vslot = s;
-
- SpinLockAcquire(&s->mutex);
- active = vslot->active;
- vslot->active = true;
- SpinLockRelease(&s->mutex);
- slot = s;
- break;
- }
- }
- LWLockRelease(ReplicationSlotControlLock);
-
- /* If we did not find the slot or it was already active, error out. */
- if (slot == NULL)
- ereport(ERROR,
- (errcode(ERRCODE_UNDEFINED_OBJECT),
- errmsg("replication slot \"%s\" does not exist", name)));
- if (active)
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_IN_USE),
- errmsg("replication slot \"%s\" is already active", name)));
-
/* Generate pathnames. */
sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
@@ -417,35 +422,41 @@ ReplicationSlotDrop(const char *name)
/*
* Rename the slot directory on disk, so that we'll no longer recognize
* this as a valid slot. Note that if this fails, we've got to mark the
- * slot inactive again before bailing out.
+ * slot inactive before bailing out. If we're dropping an ephemeral slot,
+ * we better never fail hard as the caller won't expect the slot to
+ * survive and this might get called during error handling.
*/
- if (rename(path, tmppath) != 0)
+ if (rename(path, tmppath) == 0)
+ {
+ /*
+ * We need to fsync() the directory we just renamed and its parent to
+ * make sure that our changes are on disk in a crash-safe fashion. If
+ * fsync() fails, we can't be sure whether the changes are on disk or
+ * not. For now, we handle that by panicking;
+ * StartupReplicationSlots() will try to straighten it out after
+ * restart.
+ */
+ START_CRIT_SECTION();
+ fsync_fname(tmppath, true);
+ fsync_fname("pg_replslot", true);
+ END_CRIT_SECTION();
+ }
+ else
{
volatile ReplicationSlot *vslot = slot;
+ bool fail_softly = slot->data.persistency == RS_EPHEMERAL;
SpinLockAcquire(&slot->mutex);
vslot->active = false;
SpinLockRelease(&slot->mutex);
- ereport(ERROR,
+ ereport(fail_softly ? WARNING : ERROR,
(errcode_for_file_access(),
errmsg("could not rename \"%s\" to \"%s\": %m",
path, tmppath)));
}
/*
- * We need to fsync() the directory we just renamed and its parent to make
- * sure that our changes are on disk in a crash-safe fashion. If fsync()
- * fails, we can't be sure whether the changes are on disk or not. For
- * now, we handle that by panicking; StartupReplicationSlots() will
- * try to straighten it out after restart.
- */
- START_CRIT_SECTION();
- fsync_fname(tmppath, true);
- fsync_fname("pg_replslot", true);
- END_CRIT_SECTION();
-
- /*
* The slot is definitely gone. Lock out concurrent scans of the array
* long enough to kill it. It's OK to clear the active flag here without
* grabbing the mutex because nobody else can be scanning the array here,
@@ -461,7 +472,7 @@ ReplicationSlotDrop(const char *name)
* Slot is dead and doesn't prevent resource removal anymore, recompute
* limits.
*/
- ReplicationSlotsComputeRequiredXmin();
+ ReplicationSlotsComputeRequiredXmin(false);
ReplicationSlotsComputeRequiredLSN();
/*
@@ -519,21 +530,49 @@ ReplicationSlotMarkDirty(void)
}
/*
+ * Convert a slot that's marked as RS_EPHEMERAL to a RS_PERSISTENT slot,
+ * guaranteeing it will be there after an eventual crash.
+ */
+void
+ReplicationSlotPersist(void)
+{
+ ReplicationSlot *slot = MyReplicationSlot;
+
+ Assert(slot != NULL);
+ Assert(slot->data.persistency != RS_PERSISTENT);
+
+ {
+ volatile ReplicationSlot *vslot = slot;
+
+ SpinLockAcquire(&slot->mutex);
+ vslot->data.persistency = RS_PERSISTENT;
+ SpinLockRelease(&slot->mutex);
+ }
+
+ ReplicationSlotMarkDirty();
+ ReplicationSlotSave();
+}
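(Sketch of a hypothetical caller, mirroring the flow of pg_create_logical_replication_slot() in slotfuncs.c below: a slot is created RS_EPHEMERAL so that an error during setup drops it again, and is only flipped to persistent once setup has succeeded. The slot name and the omitted error handling are illustrative only.)

	ReplicationSlotCreate("myslot", true, RS_EPHEMERAL);
	/* ... failure-prone setup; an ERROR here drops the slot on release ... */
	ReplicationSlotPersist();	/* from here on the slot survives crashes */
	ReplicationSlotRelease();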
+
+/*
* Compute the oldest xmin across all slots and store it in the ProcArray.
*/
void
-ReplicationSlotsComputeRequiredXmin(void)
+ReplicationSlotsComputeRequiredXmin(bool already_locked)
{
int i;
TransactionId agg_xmin = InvalidTransactionId;
+ TransactionId agg_catalog_xmin = InvalidTransactionId;
Assert(ReplicationSlotCtl != NULL);
- LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+ if (!already_locked)
+ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+
for (i = 0; i < max_replication_slots; i++)
{
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
TransactionId effective_xmin;
+ TransactionId effective_catalog_xmin;
if (!s->in_use)
continue;
@@ -543,6 +582,7 @@ ReplicationSlotsComputeRequiredXmin(void)
SpinLockAcquire(&s->mutex);
effective_xmin = vslot->effective_xmin;
+ effective_catalog_xmin = vslot->effective_catalog_xmin;
SpinLockRelease(&s->mutex);
}
@@ -551,10 +591,18 @@ ReplicationSlotsComputeRequiredXmin(void)
(!TransactionIdIsValid(agg_xmin) ||
TransactionIdPrecedes(effective_xmin, agg_xmin)))
agg_xmin = effective_xmin;
+
+ /* check the catalog xmin */
+ if (TransactionIdIsValid(effective_catalog_xmin) &&
+ (!TransactionIdIsValid(agg_catalog_xmin) ||
+ TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin)))
+ agg_catalog_xmin = effective_catalog_xmin;
}
- LWLockRelease(ReplicationSlotControlLock);
- ProcArraySetReplicationSlotXmin(agg_xmin);
+ if (!already_locked)
+ LWLockRelease(ReplicationSlotControlLock);
+
+ ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked);
}
/*
@@ -596,6 +644,110 @@ ReplicationSlotsComputeRequiredLSN(void)
}
/*
+ * Compute the oldest WAL LSN required by *logical* decoding slots.
+ *
+ * Returns InvalidXLogRecPtr if logical decoding is disabled or no logical
+ * slots exist.
+ *
+ * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it
+ * ignores physical replication slots.
+ *
+ * The results aren't required frequently, so we don't maintain a precomputed
+ * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin().
+ */
+XLogRecPtr
+ReplicationSlotsComputeLogicalRestartLSN(void)
+{
+ XLogRecPtr result = InvalidXLogRecPtr;
+ int i;
+
+ if (max_replication_slots <= 0)
+ return InvalidXLogRecPtr;
+
+ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+
+ for (i = 0; i < max_replication_slots; i++)
+ {
+ volatile ReplicationSlot *s;
+ XLogRecPtr restart_lsn;
+
+ s = &ReplicationSlotCtl->replication_slots[i];
+
+ /* cannot change while ReplicationSlotControlLock is held */
+ if (!s->in_use)
+ continue;
+
+ /* we're only interested in logical slots */
+ if (s->data.database == InvalidOid)
+ continue;
+
+ /* read once, it's ok if it increases while we're checking */
+ SpinLockAcquire(&s->mutex);
+ restart_lsn = s->data.restart_lsn;
+ SpinLockRelease(&s->mutex);
+
+ if (result == InvalidXLogRecPtr ||
+ restart_lsn < result)
+ result = restart_lsn;
+ }
+
+ LWLockRelease(ReplicationSlotControlLock);
+
+ return result;
+}
+
+/*
+ * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the
+ * passed database oid.
+ *
+ * Returns true if there are any slots referencing the database. *nslots will
+ * be set to the absolute number of slots in the database, *nactive to the
+ * number currently active.
+ */
+bool
+ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
+{
+ int i;
+
+ *nslots = *nactive = 0;
+
+ if (max_replication_slots <= 0)
+ return false;
+
+ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+ for (i = 0; i < max_replication_slots; i++)
+ {
+ volatile ReplicationSlot *s;
+
+ s = &ReplicationSlotCtl->replication_slots[i];
+
+ /* cannot change while ReplicationSlotControlLock is held */
+ if (!s->in_use)
+ continue;
+
+ /* not database specific, skip */
+ if (s->data.database == InvalidOid)
+ continue;
+
+ /* not our database, skip */
+ if (s->data.database != dboid)
+ continue;
+
+ /* count slots with spinlock held */
+ SpinLockAcquire(&s->mutex);
+ (*nslots)++;
+ if (s->active)
+ (*nactive)++;
+ SpinLockRelease(&s->mutex);
+ }
+ LWLockRelease(ReplicationSlotControlLock);
+
+ if (*nslots > 0)
+ return true;
+ return false;
+}
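(Sketch of a hypothetical caller, e.g. code refusing to drop a database that still has slots; db_oid, the error code, and the message wording are illustrative assumptions, not taken from the patch.)

	int		nslots,
			nactive;

	if (ReplicationSlotsCountDBSlots(db_oid, &nslots, &nactive))
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_IN_USE),
				 errmsg("database is used by %d replication slot(s), %d of them active",
						nslots, nactive)));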
+
+
+/*
* Check whether the server's configuration supports using replication
* slots.
*/
@@ -723,7 +875,7 @@ StartupReplicationSlots(XLogRecPtr checkPointRedo)
return;
/* Now that we have recovered all the data, compute replication xmin */
- ReplicationSlotsComputeRequiredXmin();
+ ReplicationSlotsComputeRequiredXmin(false);
ReplicationSlotsComputeRequiredLSN();
}
@@ -1050,8 +1202,19 @@ RestoreSlotFromDisk(const char *name)
memcpy(&slot->data, &cp.slotdata,
sizeof(ReplicationSlotPersistentData));
+ /* Don't restore the slot if it's not marked as persistent. */
+ if (slot->data.persistency != RS_PERSISTENT)
+ return;
+
/* initialize in memory state */
slot->effective_xmin = cp.slotdata.xmin;
+ slot->effective_catalog_xmin = cp.slotdata.catalog_xmin;
+
+ slot->candidate_catalog_xmin = InvalidTransactionId;
+ slot->candidate_xmin_lsn = InvalidXLogRecPtr;
+ slot->candidate_restart_lsn = InvalidXLogRecPtr;
+ slot->candidate_restart_valid = InvalidXLogRecPtr;
+
slot->in_use = true;
slot->active = false;
diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
index 5acd2bae19c..c9416b03eee 100644
--- a/src/backend/replication/slotfuncs.c
+++ b/src/backend/replication/slotfuncs.c
@@ -15,13 +15,13 @@
#include "funcapi.h"
#include "miscadmin.h"
+
#include "access/htup_details.h"
+#include "replication/slot.h"
+#include "replication/logical.h"
+#include "replication/logicalfuncs.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
-#include "replication/slot.h"
-
-Datum pg_create_physical_replication_slot(PG_FUNCTION_ARGS);
-Datum pg_drop_replication_slot(PG_FUNCTION_ARGS);
static void
check_permissions(void)
@@ -54,7 +54,7 @@ pg_create_physical_replication_slot(PG_FUNCTION_ARGS)
elog(ERROR, "return type must be a row type");
/* acquire replication slot, this will check for conflicting names*/
- ReplicationSlotCreate(NameStr(*name), false);
+ ReplicationSlotCreate(NameStr(*name), false, RS_PERSISTENT);
values[0] = NameGetDatum(&MyReplicationSlot->data.name);
@@ -69,6 +69,68 @@ pg_create_physical_replication_slot(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(result);
}
+
+/*
+ * SQL function for creating a new logical replication slot.
+ */
+Datum
+pg_create_logical_replication_slot(PG_FUNCTION_ARGS)
+{
+ Name name = PG_GETARG_NAME(0);
+ Name plugin = PG_GETARG_NAME(1);
+
+ LogicalDecodingContext *ctx = NULL;
+
+ TupleDesc tupdesc;
+ HeapTuple tuple;
+ Datum result;
+ Datum values[2];
+ bool nulls[2];
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ check_permissions();
+
+ CheckLogicalDecodingRequirements();
+
+ Assert(!MyReplicationSlot);
+
+ /*
+ * Acquire a logical decoding slot, this will check for conflicting
+ * names.
+ */
+ ReplicationSlotCreate(NameStr(*name), true, RS_EPHEMERAL);
+
+ /*
+ * Create logical decoding context, to build the initial snapshot.
+ */
+ ctx = CreateInitDecodingContext(
+ NameStr(*plugin), NIL,
+ logical_read_local_xlog_page, NULL, NULL);
+
+ /* build initial snapshot, might take a while */
+ DecodingContextFindStartpoint(ctx);
+
+ values[0] = CStringGetTextDatum(NameStr(MyReplicationSlot->data.name));
+ values[1] = LSNGetDatum(MyReplicationSlot->data.confirmed_flush);
+
+ /* don't need the decoding context anymore */
+ FreeDecodingContext(ctx);
+
+ memset(nulls, 0, sizeof(nulls));
+
+ tuple = heap_form_tuple(tupdesc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+
+ /* ok, slot is now fully created, mark it as persistent */
+ ReplicationSlotPersist();
+ ReplicationSlotRelease();
+
+ PG_RETURN_DATUM(result);
+}
+
+
/*
* SQL function for dropping a replication slot.
*/
@@ -92,7 +154,7 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS)
Datum
pg_get_replication_slots(PG_FUNCTION_ARGS)
{
-#define PG_STAT_GET_REPLICATION_SLOTS_COLS 6
+#define PG_GET_REPLICATION_SLOTS_COLS 8
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
TupleDesc tupdesc;
Tuplestorestate *tupstore;
@@ -134,15 +196,16 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
for (slotno = 0; slotno < max_replication_slots; slotno++)
{
ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[slotno];
- Datum values[PG_STAT_GET_REPLICATION_SLOTS_COLS];
- bool nulls[PG_STAT_GET_REPLICATION_SLOTS_COLS];
+ Datum values[PG_GET_REPLICATION_SLOTS_COLS];
+ bool nulls[PG_GET_REPLICATION_SLOTS_COLS];
TransactionId xmin;
+ TransactionId catalog_xmin;
XLogRecPtr restart_lsn;
bool active;
Oid database;
NameData slot_name;
-
+ NameData plugin;
int i;
SpinLockAcquire(&slot->mutex);
@@ -154,9 +217,11 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
else
{
xmin = slot->data.xmin;
+ catalog_xmin = slot->data.catalog_xmin;
database = slot->data.database;
restart_lsn = slot->data.restart_lsn;
namecpy(&slot_name, &slot->data.name);
+ namecpy(&plugin, &slot->data.plugin);
active = slot->active;
}
@@ -166,19 +231,34 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
i = 0;
values[i++] = NameGetDatum(&slot_name);
+
+ if (database == InvalidOid)
+ nulls[i++] = true;
+ else
+ values[i++] = NameGetDatum(&plugin);
+
if (database == InvalidOid)
values[i++] = CStringGetTextDatum("physical");
else
values[i++] = CStringGetTextDatum("logical");
+
if (database == InvalidOid)
nulls[i++] = true;
else
values[i++] = database;
+
values[i++] = BoolGetDatum(active);
+
if (xmin != InvalidTransactionId)
values[i++] = TransactionIdGetDatum(xmin);
else
nulls[i++] = true;
+
+ if (catalog_xmin != InvalidTransactionId)
+ values[i++] = TransactionIdGetDatum(catalog_xmin);
+ else
+ nulls[i++] = true;
+
if (restart_lsn != InvalidTransactionId)
values[i++] = LSNGetDatum(restart_lsn);
else
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index e31977eee02..43db10851c3 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -1147,7 +1147,7 @@ XLogWalRcvSendHSFeedback(bool immed)
* everything else has been checked.
*/
if (hot_standby_feedback)
- xmin = GetOldestXmin(true, false);
+ xmin = GetOldestXmin(NULL, false);
else
xmin = InvalidTransactionId;
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 048367af299..5227eab414f 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -55,6 +55,7 @@
#include "replication/basebackup.h"
#include "replication/slot.h"
#include "replication/syncrep.h"
+#include "replication/slot.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "replication/walsender_private.h"
@@ -434,7 +435,7 @@ StartReplication(StartReplicationCmd *cmd)
if (MyReplicationSlot->data.database != InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- (errmsg("cannot use a replication slot created for changeset extraction for streaming replication"))));
+ (errmsg("cannot use a logical replication slot for physical replication"))));
}
/*
@@ -656,7 +657,9 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
sendTimeLineIsHistoric = false;
sendTimeLine = ThisTimeLineID;
- ReplicationSlotCreate(cmd->slotname, cmd->kind == REPLICATION_KIND_LOGICAL);
+ ReplicationSlotCreate(cmd->slotname,
+ cmd->kind == REPLICATION_KIND_LOGICAL,
+ RS_PERSISTENT);
initStringInfo(&output_message);
@@ -766,7 +769,7 @@ exec_replication_command(const char *cmd_string)
if (cmd->kind == REPLICATION_KIND_PHYSICAL)
StartReplication(cmd);
else
- elog(ERROR, "cannot handle changeset extraction yet");
+ elog(ERROR, "cannot handle logical decoding yet");
break;
}
@@ -1017,7 +1020,7 @@ ProcessStandbyReplyMessage(void)
if (MyReplicationSlot && flushPtr != InvalidXLogRecPtr)
{
if (MyReplicationSlot->data.database != InvalidOid)
- elog(ERROR, "cannot handle changeset extraction yet");
+ elog(ERROR, "cannot handle logical decoding yet");
else
PhysicalConfirmReceivedLocation(flushPtr);
}
@@ -1050,7 +1053,7 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin)
if (changed)
{
ReplicationSlotMarkDirty();
- ReplicationSlotsComputeRequiredXmin();
+ ReplicationSlotsComputeRequiredXmin(false);
}
}
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index eac418442d3..3376a353a40 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -50,11 +50,13 @@
#include "access/transam.h"
#include "access/xact.h"
#include "access/twophase.h"
+#include "catalog/catalog.h"
#include "miscadmin.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/spin.h"
#include "utils/builtins.h"
+#include "utils/rel.h"
#include "utils/snapmgr.h"
@@ -84,6 +86,8 @@ typedef struct ProcArrayStruct
/* oldest xmin of any replication slot */
TransactionId replication_slot_xmin;
+ /* oldest catalog xmin of any replication slot */
+ TransactionId replication_slot_catalog_xmin;
/*
* We declare pgprocnos[] as 1 entry because C wants a fixed-size array,
@@ -1108,21 +1112,22 @@ TransactionIdIsActive(TransactionId xid)
* GetOldestXmin -- returns oldest transaction that was running
* when any current transaction was started.
*
- * If allDbs is TRUE then all backends are considered; if allDbs is FALSE
- * then only backends running in my own database are considered.
+ * If rel is NULL or a shared relation, all backends are considered, otherwise
+ * only backends running in this database are considered.
*
* If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are
* ignored.
*
- * This is used by VACUUM to decide which deleted tuples must be preserved
- * in a table. allDbs = TRUE is needed for shared relations, but allDbs =
- * FALSE is sufficient for non-shared relations, since only backends in my
- * own database could ever see the tuples in them. Also, we can ignore
- * concurrently running lazy VACUUMs because (a) they must be working on other
- * tables, and (b) they don't need to do snapshot-based lookups.
+ * This is used by VACUUM to decide which deleted tuples must be preserved in
+ * the passed in table. For shared relations backends in all databases must be
+ * considered, but for non-shared relations that's not required, since only
+ * backends in my own database could ever see the tuples in them. Also, we can
+ * ignore concurrently running lazy VACUUMs because (a) they must be working
+ * on other tables, and (b) they don't need to do snapshot-based lookups.
*
- * This is also used to determine where to truncate pg_subtrans. allDbs
- * must be TRUE for that case, and ignoreVacuum FALSE.
+ * This is also used to determine where to truncate pg_subtrans. For that
+ * backends in all databases have to be considered, so rel = NULL has to be
+ * passed in.
*
* Note: we include all currently running xids in the set of considered xids.
* This ensures that if a just-started xact has not yet set its snapshot,
@@ -1133,7 +1138,7 @@ TransactionIdIsActive(TransactionId xid)
* backwards on repeated calls. The calculated value is conservative, so that
* anything older is definitely not considered as running by anyone anymore,
* but the exact value calculated depends on a number of things. For example,
- * if allDbs is FALSE and there are no transactions running in the current
+ * if rel = NULL and there are no transactions running in the current
* database, GetOldestXmin() returns latestCompletedXid. If a transaction
* begins after that, its xmin will include in-progress transactions in other
* databases that started earlier, so another call will return a lower value.
@@ -1152,12 +1157,22 @@ TransactionIdIsActive(TransactionId xid)
* GetOldestXmin() move backwards, with no consequences for data integrity.
*/
TransactionId
-GetOldestXmin(bool allDbs, bool ignoreVacuum)
+GetOldestXmin(Relation rel, bool ignoreVacuum)
{
ProcArrayStruct *arrayP = procArray;
TransactionId result;
int index;
+ bool allDbs;
+
volatile TransactionId replication_slot_xmin = InvalidTransactionId;
+ volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+
+ /*
+ * If we're not computing a relation specific limit, or if a shared
+ * relation has been passed in, backends in all databases have to be
+ * considered.
+ */
+ allDbs = rel == NULL || rel->rd_rel->relisshared;
/* Cannot look for individual databases during recovery */
Assert(allDbs || !RecoveryInProgress());
@@ -1180,6 +1195,13 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
volatile PGPROC *proc = &allProcs[pgprocno];
volatile PGXACT *pgxact = &allPgXact[pgprocno];
+ /*
+ * Backend is doing logical decoding which manages xmin separately,
+ * check below.
+ */
+ if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
+ continue;
+
if (ignoreVacuum && (pgxact->vacuumFlags & PROC_IN_VACUUM))
continue;
@@ -1211,6 +1233,7 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
/* fetch into volatile var while ProcArrayLock is held */
replication_slot_xmin = procArray->replication_slot_xmin;
+ replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
if (RecoveryInProgress())
{
@@ -1259,6 +1282,18 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
NormalTransactionIdPrecedes(replication_slot_xmin, result))
result = replication_slot_xmin;
+ /*
+ * After locks have been released and defer_cleanup_age has been applied,
+ * check whether we need to back up further to make logical decoding
+ * possible. We need to do so if we're computing the global limit (rel =
+ * NULL) or if the passed relation is a catalog relation of some kind.
+ */
+ if ((rel == NULL ||
+ RelationIsAccessibleInLogicalDecoding(rel)) &&
+ TransactionIdIsValid(replication_slot_catalog_xmin) &&
+ NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result))
+ result = replication_slot_catalog_xmin;
+
return result;
}
@@ -1313,6 +1348,8 @@ GetMaxSnapshotSubxidCount(void)
* RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
* running transactions, except those running LAZY VACUUM). This is
* the same computation done by GetOldestXmin(true, true).
+ * RecentGlobalDataXmin: the global xmin for non-catalog tables
+ * >= RecentGlobalXmin
*
* Note: this function should probably not be called with an argument that's
* not statically allocated (see xip allocation below).
@@ -1329,6 +1366,7 @@ GetSnapshotData(Snapshot snapshot)
int subcount = 0;
bool suboverflowed = false;
volatile TransactionId replication_slot_xmin = InvalidTransactionId;
+ volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
Assert(snapshot != NULL);
@@ -1397,6 +1435,13 @@ GetSnapshotData(Snapshot snapshot)
volatile PGXACT *pgxact = &allPgXact[pgprocno];
TransactionId xid;
+ /*
+ * Backend is doing logical decoding which manages xmin
+ * separately, check below.
+ */
+ if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
+ continue;
+
/* Ignore procs running LAZY VACUUM */
if (pgxact->vacuumFlags & PROC_IN_VACUUM)
continue;
@@ -1509,6 +1554,7 @@ GetSnapshotData(Snapshot snapshot)
/* fetch into volatile var while ProcArrayLock is held */
replication_slot_xmin = procArray->replication_slot_xmin;
+ replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
if (!TransactionIdIsValid(MyPgXact->xmin))
MyPgXact->xmin = TransactionXmin = xmin;
@@ -1533,6 +1579,17 @@ GetSnapshotData(Snapshot snapshot)
NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin))
RecentGlobalXmin = replication_slot_xmin;
+ /* Non-catalog tables can be vacuumed if older than this xid */
+ RecentGlobalDataXmin = RecentGlobalXmin;
+
+ /*
+ * Check whether there's a replication slot requiring an older catalog
+ * xmin.
+ */
+ if (TransactionIdIsNormal(replication_slot_catalog_xmin) &&
+ NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin))
+ RecentGlobalXmin = replication_slot_catalog_xmin;
+
RecentXmin = xmin;
snapshot->xmin = xmin;
@@ -1633,9 +1690,11 @@ ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid)
* Similar to GetSnapshotData but returns more information. We include
* all PGXACTs with an assigned TransactionId, even VACUUM processes.
*
- * We acquire XidGenLock, but the caller is responsible for releasing it.
- * This ensures that no new XIDs enter the proc array until the caller has
- * WAL-logged this snapshot, and releases the lock.
+ * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
+ * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc
+ * array until the caller has WAL-logged this snapshot, and releases the
+ * lock. Acquiring ProcArrayLock ensures that no transactions commit until the
+ * lock is released.
*
* The returned data structure is statically allocated; caller should not
* modify it, and must not assume it is valid past the next call.
@@ -1770,6 +1829,15 @@ GetRunningTransactionData(void)
}
}
+ /*
+ * It's important *not* to include the limits set by slots here because
+ * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those
+ * were to be included here the initial value could never increase because
+ * of a circular dependency where slots only increase their limits when
+ * running xacts increases oldestRunningXid and running xacts only
+ * increases if slots do.
+ */
+
CurrentRunningXacts->xcnt = count - subcount;
CurrentRunningXacts->subxcnt = subcount;
CurrentRunningXacts->subxid_overflow = suboverflowed;
@@ -1777,13 +1845,12 @@ GetRunningTransactionData(void)
CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
- /* We don't release XidGenLock here, the caller is responsible for that */
- LWLockRelease(ProcArrayLock);
-
Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
+ /* We don't release the locks here, the caller is responsible for that */
+
return CurrentRunningXacts;
}
@@ -1853,6 +1920,92 @@ GetOldestActiveTransactionId(void)
}
/*
+ * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum
+ *
+ * Returns the oldest xid that we can guarantee not to have been affected by
+ * vacuum, i.e. no rows >= that xid have been vacuumed away unless the
+ * transaction aborted. Note that the value can (and most of the time will) be
+ * much more conservative than what really has been affected by vacuum, but we
+ * currently don't have better data available.
+ *
+ * This is useful to initialize the cutoff xid after which a new changeset
+ * extraction replication slot can start decoding changes.
+ *
+ * Must be called with ProcArrayLock held either shared or exclusively,
+ * although most callers will want to use exclusive mode since it is expected
+ * that the caller will immediately use the xid to peg the xmin horizon.
+ */
+TransactionId
+GetOldestSafeDecodingTransactionId(void)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId oldestSafeXid;
+ int index;
+ bool recovery_in_progress = RecoveryInProgress();
+
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ /*
+ * Acquire XidGenLock, so no transactions can acquire an xid while we're
+ * running. Otherwise, if no transaction with an assigned xid were running
+ * concurrently, a newly assigned xid could influence RecentXmin et al.
+ *
+ * We initialize the computation to nextXid since that's guaranteed to be
+ * a safe, albeit pessimal, value.
+ */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ oldestSafeXid = ShmemVariableCache->nextXid;
+
+ /*
+ * If there's already a slot pegging the xmin horizon, we can start with
+ * that value; it's guaranteed to be safe since it was computed by this
+ * routine initially and has been enforced ever since.
+ */
+ if (TransactionIdIsValid(procArray->replication_slot_catalog_xmin) &&
+ TransactionIdPrecedes(procArray->replication_slot_catalog_xmin,
+ oldestSafeXid))
+ oldestSafeXid = procArray->replication_slot_catalog_xmin;
+
+ /*
+ * If we're not in recovery, we walk over the procarray and collect the
+ * lowest xid. Since we're called with ProcArrayLock held and have
+ * acquired XidGenLock, no entries can vanish concurrently, since
+ * PGXACT->xid is only set with XidGenLock held and only cleared with
+ * ProcArrayLock held.
+ *
+ * In recovery we can't lower the safe value besides what we've computed
+ * above, so we'll have to wait a bit longer there. We unfortunately can
+ * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids
+ * machinery can miss values and return an older value than is safe.
+ */
+ if (!recovery_in_progress)
+ {
+ /*
+ * Spin over procArray collecting min(PGXACT->xid)
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ volatile PGXACT *pgxact = &allPgXact[pgprocno];
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = pgxact->xid;
+
+ if (!TransactionIdIsNormal(xid))
+ continue;
+
+ if (TransactionIdPrecedes(xid, oldestSafeXid))
+ oldestSafeXid = xid;
+ }
+ }
+
+ LWLockRelease(XidGenLock);
+
+ return oldestSafeXid;
+}
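The locking contract described above is easiest to see from a caller's point of view. Below is a minimal, hypothetical sketch of how a slot-creation path might peg its catalog xmin; GetOldestSafeDecodingTransactionId(), ReplicationSlotsComputeRequiredXmin() and the slot fields come from this patch, but the surrounding code is illustrative only and not part of the diff:

    /* Illustrative sketch only: peg a new slot's catalog xmin. */
    TransactionId xmin_horizon;

    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

    xmin_horizon = GetOldestSafeDecodingTransactionId();

    /* MyReplicationSlot is assumed to be acquired already */
    MyReplicationSlot->effective_catalog_xmin = xmin_horizon;
    MyReplicationSlot->data.catalog_xmin = xmin_horizon;

    /* already_locked = true: we still hold ProcArrayLock */
    ReplicationSlotsComputeRequiredXmin(true);

    LWLockRelease(ProcArrayLock);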
+
+/*
* GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are
* delaying checkpoint because they have critical actions in progress.
*
@@ -2523,10 +2676,39 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
* replication slots.
*/
void
-ProcArraySetReplicationSlotXmin(TransactionId xmin)
+ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
+ bool already_locked)
{
- LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));
+
+ if (!already_locked)
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
procArray->replication_slot_xmin = xmin;
+ procArray->replication_slot_catalog_xmin = catalog_xmin;
+
+ if (!already_locked)
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ProcArrayGetReplicationSlotXmin
+ *
+ * Return the current slot xmin limits. That's useful to be able to remove
+ * data that's older than those limits.
+ */
+void
+ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
+ TransactionId *catalog_xmin)
+{
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ if (xmin != NULL)
+ *xmin = procArray->replication_slot_xmin;
+
+ if (catalog_xmin != NULL)
+ *catalog_xmin = procArray->replication_slot_catalog_xmin;
+
LWLockRelease(ProcArrayLock);
}
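Taken together, RecentGlobalXmin and RecentGlobalDataXmin give callers two pruning horizons: the former is held back by any slot's catalog_xmin, the latter is not. A hypothetical horizon-selection helper is sketched below; the name choose_prune_horizon is made up, and the real decision is made in the heap AM changes elsewhere in this patch:

    /* Hypothetical helper, for illustration only. */
    static TransactionId
    choose_prune_horizon(Relation rel)
    {
        /*
         * Catalog relations (and user relations marked as additional catalog
         * tables for decoding) must retain rows up to RecentGlobalXmin, which
         * includes the slots' catalog_xmin; everything else may use the more
         * aggressive RecentGlobalDataXmin.
         */
        if (RelationIsAccessibleInLogicalDecoding(rel))
            return RecentGlobalXmin;

        return RecentGlobalDataXmin;
    }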
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index fb5f18edfc7..aa8bea5538b 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -800,7 +800,9 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record)
/*
* Log details of the current snapshot to WAL. This allows the snapshot state
- * to be reconstructed on the standby.
+ * to be reconstructed on the standby and for logical decoding.
+ *
+ * This is used for Hot Standby as follows:
*
* We can move directly to STANDBY_SNAPSHOT_READY at startup if we
* start from a shutdown checkpoint because we know nothing was running
@@ -854,6 +856,12 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record)
* Zero xids should no longer be possible, but we may be replaying WAL
* from a time when they were possible.
*
+ * For logical decoding only the running xacts information is needed;
+ * there's no need to look at the locking information, but it's logged anyway,
+ * as there's no independent knob to just enable logical decoding. For
+ * details of how this is used, check snapbuild.c's introductory comment.
+ *
+ *
* Returns the RecPtr of the last inserted record.
*/
XLogRecPtr
@@ -879,8 +887,28 @@ LogStandbySnapshot(void)
* record we write, because standby will open up when it sees this.
*/
running = GetRunningTransactionData();
+
+ /*
+ * GetRunningTransactionData() acquired ProcArrayLock, we must release
+ * it. For Hot Standby this can be done before inserting the WAL record
+ * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
+ * the clog. For logical decoding, though, the lock can't be released
+ * early because the clog might be "in the future" from the POV of the
+ * historic snapshot. This would allow for situations where we're waiting
+ * for the end of a transaction listed in the xl_running_xacts record
+ * which, according to the WAL, has already committed before the
+ * xl_running_xacts record. Fortunately this routine isn't executed
+ * frequently, and it's only a shared lock.
+ */
+ if (wal_level < WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
recptr = LogCurrentRunningXacts(running);
+ /* Release lock if we kept it longer ... */
+ if (wal_level >= WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
/* GetRunningTransactionData() acquired XidGenLock, we must release it */
LWLockRelease(XidGenLock);
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index fa460ca82eb..f595a0747c1 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -781,10 +781,6 @@ ProcKill(int code, Datum arg)
/* Make sure we're out of the sync rep lists */
SyncRepCleanupAtProcExit();
- /* Make sure active replication slots are released */
- if (MyReplicationSlot != NULL)
- ReplicationSlotRelease();
-
#ifdef USE_ASSERT_CHECKING
if (assert_enabled)
{
@@ -803,6 +799,10 @@ ProcKill(int code, Datum arg)
*/
LWLockReleaseAll();
+ /* Make sure active replication slots are released */
+ if (MyReplicationSlot != NULL)
+ ReplicationSlotRelease();
+
/*
* Clear MyProc first; then disown the process latch. This is so that
* signal handlers won't try to clear the process latch after it's no
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index a230d7eda69..be961017d66 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -55,6 +55,7 @@
#include "pg_getopt.h"
#include "postmaster/autovacuum.h"
#include "postmaster/postmaster.h"
+#include "replication/slot.h"
#include "replication/walsender.h"
#include "rewrite/rewriteHandler.h"
#include "storage/bufmgr.h"
@@ -3854,6 +3855,16 @@ PostgresMain(int argc, char *argv[],
WalSndErrorCleanup();
/*
+ * We can't release replication slots inside AbortTransaction() as we
+ * need to be able to start and abort transactions while having a slot
+ * acquired. But we never need to hold them across top level errors,
+ * so releasing here is fine. There's another cleanup in ProcKill()
+ * ensuring we'll correctly cleanup on FATAL errors as well.
+ */
+ if (MyReplicationSlot != NULL)
+ ReplicationSlotRelease();
+
+ /*
* Now return to normal top-level context and clear ErrorContext for
* next time.
*/
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 4423fe01bdd..115bcac5d23 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -512,7 +512,7 @@ RegisterSnapshotInvalidation(Oid dbId, Oid relId)
* Only the local caches are flushed; this does not transmit the message
* to other backends.
*/
-static void
+void
LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
{
if (msg->id >= 0)
@@ -596,7 +596,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
* since that tells us we've lost some shared-inval messages and hence
* don't know what needs to be invalidated.
*/
-static void
+void
InvalidateSystemCaches(void)
{
int i;
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 2810b35eea1..32313244adb 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -73,6 +73,7 @@
#include "utils/memutils.h"
#include "utils/relmapper.h"
#include "utils/resowner_private.h"
+#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
@@ -235,7 +236,7 @@ static void formrdesc(const char *relationName, Oid relationReltype,
bool isshared, bool hasoids,
int natts, const FormData_pg_attribute *attrs);
-static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK);
+static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic);
static Relation AllocateRelationDesc(Form_pg_class relp);
static void RelationParseRelOptions(Relation relation, HeapTuple tuple);
static void RelationBuildTupleDesc(Relation relation);
@@ -274,12 +275,13 @@ static void unlink_initfile(const char *initfilename);
* and must eventually be freed with heap_freetuple.
*/
static HeapTuple
-ScanPgRelation(Oid targetRelId, bool indexOK)
+ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic)
{
HeapTuple pg_class_tuple;
Relation pg_class_desc;
SysScanDesc pg_class_scan;
ScanKeyData key[1];
+ Snapshot snapshot;
/*
* If something goes wrong during backend startup, we might find ourselves
@@ -305,9 +307,20 @@ ScanPgRelation(Oid targetRelId, bool indexOK)
* scan by setting indexOK == false.
*/
pg_class_desc = heap_open(RelationRelationId, AccessShareLock);
+
+ /*
+ * The caller might need a tuple that's newer than the one visible to the
+ * historic snapshot; currently the only case requiring this is looking up
+ * the relfilenode of non-mapped system relations during decoding.
+ */
+ if (force_non_historic)
+ snapshot = GetNonHistoricCatalogSnapshot(RelationRelationId);
+ else
+ snapshot = GetCatalogSnapshot(RelationRelationId);
+
pg_class_scan = systable_beginscan(pg_class_desc, ClassOidIndexId,
indexOK && criticalRelcachesBuilt,
- NULL,
+ snapshot,
1, key);
pg_class_tuple = systable_getnext(pg_class_scan);
@@ -836,7 +849,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
/*
* find the tuple in pg_class corresponding to the given relation id
*/
- pg_class_tuple = ScanPgRelation(targetRelId, true);
+ pg_class_tuple = ScanPgRelation(targetRelId, true, false);
/*
* if no such tuple exists, return NULL
@@ -989,8 +1002,42 @@ RelationInitPhysicalAddr(Relation relation)
relation->rd_node.dbNode = InvalidOid;
else
relation->rd_node.dbNode = MyDatabaseId;
+
if (relation->rd_rel->relfilenode)
+ {
+ /*
+ * Even if we are using a decoding snapshot that doesn't represent
+ * the current state of the catalog we need to make sure the
+ * filenode points to the current file since the older file will
+ * be gone (or truncated). The new file will still contain older
+ * rows so lookups in it will work correctly. This wouldn't work
+ * correctly if rewrites were allowed to change the schema in an
+ * incompatible way, but those are prevented both on catalog
+ * tables and on user tables declared as additional catalog
+ * tables.
+ */
+ if (HistoricSnapshotActive()
+ && RelationIsAccessibleInLogicalDecoding(relation)
+ && IsTransactionState())
+ {
+ HeapTuple phys_tuple;
+ Form_pg_class physrel;
+
+ phys_tuple = ScanPgRelation(RelationGetRelid(relation),
+ RelationGetRelid(relation) != ClassOidIndexId,
+ true);
+ if (!HeapTupleIsValid(phys_tuple))
+ elog(ERROR, "could not find pg_class entry for %u",
+ RelationGetRelid(relation));
+ physrel = (Form_pg_class) GETSTRUCT(phys_tuple);
+
+ relation->rd_rel->reltablespace = physrel->reltablespace;
+ relation->rd_rel->relfilenode = physrel->relfilenode;
+ heap_freetuple(phys_tuple);
+ }
+
relation->rd_node.relNode = relation->rd_rel->relfilenode;
+ }
else
{
/* Consult the relation mapper */
@@ -1742,7 +1789,7 @@ RelationReloadIndexInfo(Relation relation)
* for pg_class_oid_index ...
*/
indexOK = (RelationGetRelid(relation) != ClassOidIndexId);
- pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK);
+ pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK, false);
if (!HeapTupleIsValid(pg_class_tuple))
elog(ERROR, "could not find pg_class tuple for index %u",
RelationGetRelid(relation));
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 4c0e0accc1c..4146527d2fd 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -19,6 +19,10 @@
* have regd_count = 1 and are counted in RegisteredSnapshots, but are not
* tracked by any resource owner.
*
+ * The same is true for historic snapshots used during logical decoding:
+ * their lifetime is managed separately (as they live longer than one xact.c
+ * transaction).
+ *
* These arrangements let us reset MyPgXact->xmin when there are no snapshots
* referenced by this transaction. (One possible improvement would be to be
* able to advance Xmin when the snapshot with the earliest Xmin is no longer
@@ -69,12 +73,13 @@
*/
static SnapshotData CurrentSnapshotData = {HeapTupleSatisfiesMVCC};
static SnapshotData SecondarySnapshotData = {HeapTupleSatisfiesMVCC};
-static SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC};
+SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC};
/* Pointers to valid snapshots */
static Snapshot CurrentSnapshot = NULL;
static Snapshot SecondarySnapshot = NULL;
static Snapshot CatalogSnapshot = NULL;
+static Snapshot HistoricSnapshot = NULL;
/*
* Staleness detection for CatalogSnapshot.
@@ -86,13 +91,18 @@ static bool CatalogSnapshotStale = true;
* for the convenience of TransactionIdIsInProgress: even in bootstrap
* mode, we don't want it to say that BootstrapTransactionId is in progress.
*
- * RecentGlobalXmin is initialized to InvalidTransactionId, to ensure that no
- * one tries to use a stale value. Readers should ensure that it has been set
- * to something else before using it.
+ * RecentGlobalXmin and RecentGlobalDataXmin are initialized to
+ * InvalidTransactionId, to ensure that no one tries to use a stale
+ * value. Readers should ensure that it has been set to something else
+ * before using it.
*/
TransactionId TransactionXmin = FirstNormalTransactionId;
TransactionId RecentXmin = FirstNormalTransactionId;
TransactionId RecentGlobalXmin = InvalidTransactionId;
+TransactionId RecentGlobalDataXmin = InvalidTransactionId;
+
+/* (table, ctid) => (cmin, cmax) mapping during timetravel */
+static HTAB *tuplecid_data = NULL;
/*
* Elements of the active snapshot stack.
@@ -158,6 +168,18 @@ static void SnapshotResetXmin(void);
Snapshot
GetTransactionSnapshot(void)
{
+ /*
+ * Return historic snapshot if doing logical decoding. We'll never
+ * need a non-historic transaction snapshot in this (sub-)transaction, so
+ * there's no need to be careful to set one up for later calls to
+ * GetTransactionSnapshot().
+ */
+ if (HistoricSnapshotActive())
+ {
+ Assert(!FirstSnapshotSet);
+ return HistoricSnapshot;
+ }
+
/* First call in transaction? */
if (!FirstSnapshotSet)
{
@@ -214,6 +236,13 @@ GetTransactionSnapshot(void)
Snapshot
GetLatestSnapshot(void)
{
+ /*
+ * So far there are no cases requiring support for GetLatestSnapshot()
+ * during logical decoding, but it wouldn't be hard to add if
+ * required.
+ */
+ Assert(!HistoricSnapshotActive());
+
/* If first call in transaction, go ahead and set the xact snapshot */
if (!FirstSnapshotSet)
return GetTransactionSnapshot();
@@ -232,6 +261,26 @@ Snapshot
GetCatalogSnapshot(Oid relid)
{
/*
+ * Return the historic snapshot if we're doing logical decoding, but
+ * return a non-historic snapshot if we are temporarily doing up-to-date
+ * lookups.
+ */
+ if (HistoricSnapshotActive())
+ return HistoricSnapshot;
+
+ return GetNonHistoricCatalogSnapshot(relid);
+}
+
+/*
+ * GetNonHistoricCatalogSnapshot
+ * Get a snapshot that is sufficiently up-to-date for scan of the system
+ * catalog with the specified OID, even while historic snapshots are set
+ * up.
+ */
+Snapshot
+GetNonHistoricCatalogSnapshot(Oid relid)
+{
+ /*
* If the caller is trying to scan a relation that has no syscache,
* no catcache invalidations will be sent when it is updated. For a
* few key relations, snapshot invalidations are sent instead. If
@@ -303,6 +352,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
Assert(RegisteredSnapshots == 0);
Assert(FirstXactSnapshot == NULL);
+ Assert(!HistoricSnapshotActive());
/*
* Even though we are not going to use the snapshot it computes, we must
@@ -796,7 +846,7 @@ AtEOXact_Snapshot(bool isCommit)
* Returns the token (the file name) that can be used to import this
* snapshot.
*/
-static char *
+char *
ExportSnapshot(Snapshot snapshot)
{
TransactionId topXid;
@@ -1258,3 +1308,45 @@ ThereAreNoPriorRegisteredSnapshots(void)
return false;
}
+
+/*
+ * Setup a snapshot that replaces normal catalog snapshots that allows catalog
+ * access to behave just like it did at a certain point in the past.
+ *
+ * Needed for logical decoding.
+ */
+void
+SetupHistoricSnapshot(Snapshot historic_snapshot, HTAB *tuplecids)
+{
+ Assert(historic_snapshot != NULL);
+
+ /* setup the timetravel snapshot */
+ HistoricSnapshot = historic_snapshot;
+
+ /* setup (cmin, cmax) lookup hash */
+ tuplecid_data = tuplecids;
+}
+
+
+/*
+ * Make catalog snapshots behave normally again.
+ */
+void
+TeardownHistoricSnapshot(bool is_error)
+{
+ HistoricSnapshot = NULL;
+ tuplecid_data = NULL;
+}
+
+bool
+HistoricSnapshotActive(void)
+{
+ return HistoricSnapshot != NULL;
+}
+
+HTAB *
+HistoricSnapshotGetTupleCids(void)
+{
+ Assert(HistoricSnapshotActive());
+ return tuplecid_data;
+}
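The pair SetupHistoricSnapshot()/TeardownHistoricSnapshot() is meant to bracket catalog access while a decoded transaction is replayed. The sketch below shows that usage; snapshot_now and txn are placeholders, and the real call sites live in reorderbuffer.c rather than in this hunk:

    /* Sketch only: bracket catalog access with a historic snapshot. */
    SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);

    PG_TRY();
    {
        /*
         * From here on GetCatalogSnapshot()/GetTransactionSnapshot() hand back
         * the historic snapshot, so syscache and relcache lookups see the
         * catalog as it looked when the decoded transaction ran.
         */
        Assert(HistoricSnapshotActive());
        /* ... invoke output plugin callbacks ... */
    }
    PG_CATCH();
    {
        TeardownHistoricSnapshot(true);     /* is_error */
        PG_RE_THROW();
    }
    PG_END_TRY();

    TeardownHistoricSnapshot(false);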
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index f6267552573..c4732ed3110 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -62,6 +62,9 @@
#include "access/xact.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
+#include "utils/builtins.h"
+#include "utils/combocid.h"
+#include "utils/snapmgr.h"
#include "utils/tqual.h"
@@ -73,7 +76,6 @@ SnapshotData SnapshotToastData = {HeapTupleSatisfiesToast};
/* local functions */
static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot);
-
/*
* SetHintBits()
*
@@ -1545,3 +1547,163 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
*/
return true;
}
+
+/*
+ * Check whether the transaction id 'xid' is in the pre-sorted array 'xip'.
+ */
+static bool
+TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
+{
+ return bsearch(&xid, xip, num,
+ sizeof(TransactionId), xidComparator) != NULL;
+}
+
+/*
+ * See the comments for HeapTupleSatisfiesMVCC for the semantics this function
+ * obeys.
+ *
+ * Only usable on tuples from catalog tables!
+ *
+ * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support
+ * reading catalog pages which couldn't have been created in an older version.
+ *
+ * We don't set any hint bits in here, as it seems unlikely to be beneficial:
+ * those should already be set by normal access, and it seems too dangerous
+ * to do so anyway, since the semantics of doing so during timetravel are
+ * more complicated than when dealing "only" with the present.
+ */
+bool
+HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
+ Buffer buffer)
+{
+ HeapTupleHeader tuple = htup->t_data;
+ TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+ TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple);
+
+ Assert(ItemPointerIsValid(&htup->t_self));
+ Assert(htup->t_tableOid != InvalidOid);
+
+ /* inserting transaction aborted */
+ if (HeapTupleHeaderXminInvalid(tuple))
+ {
+ Assert(!TransactionIdDidCommit(xmin));
+ return false;
+ }
+ /* check if its one of our txids, toplevel is also in there */
+ else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt))
+ {
+ bool resolved;
+ CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple);
+ CommandId cmax = InvalidCommandId;
+
+ /*
+ * another transaction might have (tried to) delete this tuple or
+ * cmin/cmax was stored in a combocid. So we need to look up the
+ * actual values externally.
+ */
+ resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot,
+ htup, buffer,
+ &cmin, &cmax);
+
+ if (!resolved)
+ elog(ERROR, "could not resolve cmin/cmax of catalog tuple");
+
+ Assert(cmin != InvalidCommandId);
+
+ if (cmin >= snapshot->curcid)
+ return false; /* inserted after scan started */
+ /* fall through */
+ }
+ /* committed before our xmin horizon. Do a normal visibility check. */
+ else if (TransactionIdPrecedes(xmin, snapshot->xmin))
+ {
+ Assert(!(HeapTupleHeaderXminCommitted(tuple) &&
+ !TransactionIdDidCommit(xmin)));
+
+ /* check for hint bit first, consult clog afterwards */
+ if (!HeapTupleHeaderXminCommitted(tuple) &&
+ !TransactionIdDidCommit(xmin))
+ return false;
+ /* fall through */
+ }
+ /* beyond our xmax horizon, i.e. invisible */
+ else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax))
+ {
+ return false;
+ }
+ /* check if it's a committed transaction in [xmin, xmax) */
+ else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt))
+ {
+ /* fall through */
+ }
+ /*
+ * none of the above, i.e. between [xmin, xmax) but hasn't
+ * committed. I.e. invisible.
+ */
+ else
+ {
+ return false;
+ }
+
+ /* at this point we know xmin is visible, go on to check xmax */
+
+ /* xid invalid or aborted */
+ if (tuple->t_infomask & HEAP_XMAX_INVALID)
+ return true;
+ /* locked tuples are always visible */
+ else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+ return true;
+ /*
+ * We can see multis here if we're looking at user tables or if
+ * somebody did SELECT ... FOR SHARE/UPDATE on a system table.
+ */
+ else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+ {
+ xmax = HeapTupleGetUpdateXid(tuple);
+ }
+
+ /* check if its one of our txids, toplevel is also in there */
+ if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt))
+ {
+ bool resolved;
+ CommandId cmin;
+ CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple);
+
+ /* Lookup actual cmin/cmax values */
+ resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot,
+ htup, buffer,
+ &cmin, &cmax);
+
+ if (!resolved)
+ elog(ERROR, "could not resolve combocid to cmax");
+
+ Assert(cmax != InvalidCommandId);
+
+ if (cmax >= snapshot->curcid)
+ return true; /* deleted after scan started */
+ else
+ return false; /* deleted before scan started */
+ }
+ /* below xmin horizon, normal transaction state is valid */
+ else if (TransactionIdPrecedes(xmax, snapshot->xmin))
+ {
+ Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED &&
+ !TransactionIdDidCommit(xmax)));
+
+ /* check hint bit first */
+ if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
+ return false;
+
+ /* check clog */
+ return !TransactionIdDidCommit(xmax);
+ }
+ /* above xmax horizon, we cannot possibly see the deleting transaction */
+ else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax))
+ return true;
+ /* xmax is between [xmin, xmax), check known committed array */
+ else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt))
+ return false;
+ /* xmax is between [xmin, xmax), but known not to have committed yet */
+ else
+ return true;
+}
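Because TransactionIdInArray() uses bsearch(), a snapshot handed to HeapTupleSatisfiesHistoricMVCC() must have its xip and subxip arrays pre-sorted (xidComparator order) and its satisfies pointer set to this routine. A sketch of that setup, assuming snap is a snapshot being assembled by the snapshot builder (illustrative, not the actual snapbuild.c code):

    /* Sketch: prepare a snapshot for use with HeapTupleSatisfiesHistoricMVCC. */
    snap->satisfies = HeapTupleSatisfiesHistoricMVCC;

    /* xids known to have committed in [xmin, xmax) -- must be sorted for bsearch() */
    qsort(snap->xip, snap->xcnt, sizeof(TransactionId), xidComparator);

    /* the decoded transaction's own (sub)xids -- likewise sorted */
    qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);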
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 71248ee1bcf..a7d5f7a153b 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -198,7 +198,10 @@ static const char *subdirs[] = {
"pg_replslot",
"pg_tblspc",
"pg_stat",
- "pg_stat_tmp"
+ "pg_stat_tmp",
+ "pg_llog",
+ "pg_llog/snapshots",
+ "pg_llog/mappings"
};
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index bfdadc3d5bb..0f802577c70 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -164,8 +164,7 @@ extern void heap_restrpos(HeapScanDesc scan);
extern void heap_sync(Relation relation);
/* in heap/pruneheap.c */
-extern void heap_page_prune_opt(Relation relation, Buffer buffer,
- TransactionId OldestXmin);
+extern void heap_page_prune_opt(Relation relation, Buffer buffer);
extern int heap_page_prune(Relation relation, Buffer buffer,
TransactionId OldestXmin,
bool report_stats, TransactionId *latestRemovedXid);
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index d4383ab2cbe..194635952cb 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -48,7 +48,7 @@
* the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to
* these, too.
*/
-/* 0x00 is free, was XLOG_HEAP2_FREEZE */
+#define XLOG_HEAP2_REWRITE 0x00
#define XLOG_HEAP2_CLEAN 0x10
#define XLOG_HEAP2_FREEZE_PAGE 0x20
#define XLOG_HEAP2_CLEANUP_INFO 0x30
@@ -332,6 +332,17 @@ typedef struct xl_heap_new_cid
xl_heaptid target;
} xl_heap_new_cid;
+/* logical rewrite xlog record header */
+typedef struct xl_heap_rewrite_mapping
+{
+ TransactionId mapped_xid; /* xid that might need to see the row */
+ Oid mapped_db; /* DbOid or InvalidOid for shared rels */
+ Oid mapped_rel; /* Oid of the mapped relation */
+ off_t offset; /* How far have we written so far */
+ uint32 num_mappings; /* Number of in-memory mappings */
+ XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */
+} xl_heap_rewrite_mapping;
+
#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target) + SizeOfHeapTid)
extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
@@ -341,6 +352,7 @@ extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
+extern void heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r);
extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode,
TransactionId latestRemovedXid);
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index d098a0b1711..07df3b4f2b0 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -14,12 +14,14 @@
#define REWRITE_HEAP_H
#include "access/htup.h"
+#include "storage/itemptr.h"
+#include "storage/relfilenode.h"
#include "utils/relcache.h"
/* struct definition is private to rewriteheap.c */
typedef struct RewriteStateData *RewriteState;
-extern RewriteState begin_heap_rewrite(Relation NewHeap,
+extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
TransactionId OldestXmin, TransactionId FreezeXid,
MultiXactId MultiXactCutoff, bool use_wal);
extern void end_heap_rewrite(RewriteState state);
@@ -27,4 +29,29 @@ extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
HeapTuple newTuple);
extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple oldTuple);
+/*
+ * On-Disk data format for an individual logical rewrite mapping.
+ */
+typedef struct LogicalRewriteMappingData
+{
+ RelFileNode old_node;
+ RelFileNode new_node;
+ ItemPointerData old_tid;
+ ItemPointerData new_tid;
+} LogicalRewriteMappingData;
+
+/* ---
+ * The filename consists of the following dash-separated
+ * components:
+ * 1) database oid or InvalidOid for shared relations
+ * 2) the oid of the relation
+ * 3) xid we are mapping for
+ * 4) upper 32 bits of the LSN at which a rewrite started
+ * 5) lower 32 bits of the LSN at which a rewrite started
+ * 6) xid of the xact performing the mapping
+ * ---
+ */
+#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x"
+extern void CheckPointLogicalRewriteHeap(void);
+
#endif /* REWRITE_HEAP_H */
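The comment above lists the components of a mapping file name; the sketch below shows how such a path might be assembled. All variables (dboid, relid, mapped_xid, start_lsn, my_xid) are placeholders, and the actual code lives in rewriteheap.c; the pg_llog/mappings directory is the one created by initdb later in this patch:

    /* Sketch of assembling a logical rewrite mapping file path. */
    char        path[MAXPGPATH];

    snprintf(path, sizeof(path), "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT,
             dboid,                          /* 1) database oid, or InvalidOid */
             relid,                          /* 2) oid of the mapped relation */
             mapped_xid,                     /* 3) xid we are mapping for */
             (uint32) (start_lsn >> 32),     /* 4) upper 32 bits of the LSN */
             (uint32) start_lsn,             /* 5) lower 32 bits of the LSN */
             my_xid);                        /* 6) xid performing the mapping */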
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 8376dfd669b..a9774e9f593 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -63,6 +63,11 @@
(AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
(int32) ((id1) - (id2)) < 0)
+/* compare two XIDs already known to be normal; this is a macro for speed */
+#define NormalTransactionIdFollows(id1, id2) \
+ (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
+ (int32) ((id1) - (id2)) > 0)
+
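The (int32) cast in the comparison macros above is what makes them wraparound-safe: the difference is taken modulo 2^32 and then interpreted as signed, so an xid assigned shortly after the counter wrapped still "follows" one assigned shortly before it. A worked example with illustrative values:

    TransactionId id1 = 3;              /* assigned just after the xid counter wrapped */
    TransactionId id2 = 4294967290U;    /* assigned just before the wraparound */

    /* (int32) (3 - 4294967290) == 9, which is > 0, so id1 follows id2 */
    Assert(NormalTransactionIdFollows(id1, id2));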
/* ----------
* Object ID (OID) zero is InvalidOid.
*
diff --git a/src/include/access/tuptoaster.h b/src/include/access/tuptoaster.h
index 5adf4f28169..296d016c9fc 100644
--- a/src/include/access/tuptoaster.h
+++ b/src/include/access/tuptoaster.h
@@ -98,9 +98,34 @@
/* Size of an EXTERNAL datum that contains a standard TOAST pointer */
#define TOAST_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(struct varatt_external))
-/* Size of an indirect datum that contains an indirect TOAST pointer */
+/* Size of an indirect datum that contains a standard TOAST pointer */
#define INDIRECT_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(struct varatt_indirect))
+/*
+ * Testing whether an externally-stored value is compressed now requires
+ * comparing extsize (the actual length of the external data) to rawsize
+ * (the original uncompressed datum's size). The latter includes VARHDRSZ
+ * overhead, the former doesn't. We never use compression unless it actually
+ * saves space, so we expect either equality or less-than.
+ */
+#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \
+ ((toast_pointer).va_extsize < (toast_pointer).va_rawsize - VARHDRSZ)
+
+/*
+ * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum
+ * into a local "struct varatt_external" toast pointer. This should be
+ * just a memcpy, but some versions of gcc seem to produce broken code
+ * that assumes the datum contents are aligned. Introducing an explicit
+ * intermediate "varattrib_1b_e *" variable seems to fix it.
+ */
+#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \
+do { \
+ varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \
+ Assert(VARATT_IS_EXTERNAL(attre)); \
+ Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \
+ memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \
+} while (0)
+
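Together, the two macros above let a caller safely read a possibly-unaligned TOAST pointer and then test it for compression. A minimal usage sketch, where attr is assumed to be an EXTERNAL datum (illustrative only; it mirrors what the detoasting code does):

    /* Sketch: read an external TOAST pointer and check whether it's compressed. */
    struct varatt_external toast_pointer;
    int32       rawsize;

    VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);

    if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
        rawsize = toast_pointer.va_rawsize - VARHDRSZ;  /* size once decompressed */
    else
        rawsize = toast_pointer.va_extsize;             /* stored uncompressed */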
/* ----------
* toast_insert_or_update -
*
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 11ab2771990..a238292b76e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -288,6 +288,7 @@ extern int XLogFileOpen(XLogSegNo segno);
extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
+extern XLogSegNo XLogGetLastRemovedSegno(void);
extern void XLogSetAsyncXactLSN(XLogRecPtr record);
extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn);
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 80560574bf5..22dd0fc58e8 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 201403031
+#define CATALOG_VERSION_NO 201403032
#endif
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 7a11721ba44..c5706518722 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4804,8 +4804,18 @@ DATA(insert OID = 3779 ( pg_create_physical_replication_slot PGNSP PGUID 12 1 0
DESCR("create a physical replication slot");
DATA(insert OID = 3780 ( pg_drop_replication_slot PGNSP PGUID 12 1 0 0 0 f f f f f f v 1 0 2278 "19" _null_ _null_ _null_ _null_ pg_drop_replication_slot _null_ _null_ _null_ ));
DESCR("drop a replication slot");
-DATA(insert OID = 3781 ( pg_get_replication_slots PGNSP PGUID 12 1 10 0 0 f f f f f t s 0 0 2249 "" "{19,25,26,16,28,3220}" "{o,o,o,o,o,o}" "{slot_name,slot_type,datoid,active,xmin,restart_lsn}" _null_ pg_get_replication_slots _null_ _null_ _null_ ));
+DATA(insert OID = 3781 ( pg_get_replication_slots PGNSP PGUID 12 1 10 0 0 f f f f f t s 0 0 2249 "" "{19,19,25,26,16,28,28,3220}" "{o,o,o,o,o,o,o,o}" "{slot_name,plugin,slot_type,datoid,active,xmin,catalog_xmin,restart_lsn}" _null_ pg_get_replication_slots _null_ _null_ _null_ ));
DESCR("information about replication slots currently in use");
+DATA(insert OID = 3786 ( pg_create_logical_replication_slot PGNSP PGUID 12 1 0 0 0 f f f f f f v 2 0 2249 "19 19" "{19,19,25,3220}" "{i,i,o,o}" "{slotname,plugin,slotname,xlog_position}" _null_ pg_create_logical_replication_slot _null_ _null_ _null_ ));
+DESCR("set up a logical replication slot");
+DATA(insert OID = 3782 ( pg_logical_slot_get_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,25}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_get_changes _null_ _null_ _null_ ));
+DESCR("get changes from replication slot");
+DATA(insert OID = 3783 ( pg_logical_slot_get_binary_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,17}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_get_binary_changes _null_ _null_ _null_ ));
+DESCR("get binary changes from replication slot");
+DATA(insert OID = 3784 ( pg_logical_slot_peek_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,25}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_peek_changes _null_ _null_ _null_ ));
+DESCR("peek at changes from replication slot");
+DATA(insert OID = 3785 ( pg_logical_slot_peek_binary_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,17}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_peek_binary_changes _null_ _null_ _null_ ));
+DESCR("peek at binary changes from replication slot");
/* event triggers */
DATA(insert OID = 3566 ( pg_event_trigger_dropped_objects PGNSP PGUID 12 10 100 0 0 f f f f t t s 0 0 2249 "" "{26,26,23,25,25,25,25}" "{o,o,o,o,o,o,o}" "{classid, objid, objsubid, object_type, schema_name, object_name, object_identity}" _null_ pg_event_trigger_dropped_objects _null_ _null_ _null_ ));
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 70350e02cb2..058dc5f6675 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -157,10 +157,10 @@ extern void vac_update_relstats(Relation relation,
bool hasindex,
TransactionId frozenxid,
MultiXactId minmulti);
-extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age,
+extern void vacuum_set_xid_limits(Relation rel,
+ int freeze_min_age, int freeze_table_age,
int multixact_freeze_min_age,
int multixact_freeze_table_age,
- bool sharedRel,
TransactionId *oldestXmin,
TransactionId *freezeLimit,
TransactionId *xidFullScanLimit,
diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h
new file mode 100644
index 00000000000..7f55d789a23
--- /dev/null
+++ b/src/include/replication/decode.h
@@ -0,0 +1,19 @@
+/*-------------------------------------------------------------------------
+ * decode.h
+ * PostgreSQL WAL to logical transformation
+ *
+ * Portions Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DECODE_H
+#define DECODE_H
+
+#include "access/xlogreader.h"
+#include "replication/reorderbuffer.h"
+#include "replication/logical.h"
+
+void LogicalDecodingProcessRecord(LogicalDecodingContext *ctx,
+ XLogRecord *record);
+
+#endif
diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h
new file mode 100644
index 00000000000..e65c8b8075f
--- /dev/null
+++ b/src/include/replication/logical.h
@@ -0,0 +1,100 @@
+/*-------------------------------------------------------------------------
+ * logical.h
+ * PostgreSQL logical decoding coordination
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOGICAL_H
+#define LOGICAL_H
+
+#include "replication/slot.h"
+
+#include "access/xlog.h"
+#include "access/xlogreader.h"
+#include "replication/output_plugin.h"
+
+struct LogicalDecodingContext;
+
+typedef void (*LogicalOutputPluginWriterWrite) (
+ struct LogicalDecodingContext *lr,
+ XLogRecPtr Ptr,
+ TransactionId xid,
+ bool last_write
+);
+
+typedef LogicalOutputPluginWriterWrite LogicalOutputPluginWriterPrepareWrite;
+
+typedef struct LogicalDecodingContext
+{
+ /* memory context this is all allocated in */
+ MemoryContext context;
+
+ /* infrastructure pieces */
+ XLogReaderState *reader;
+ ReplicationSlot *slot;
+ struct ReorderBuffer *reorder;
+ struct SnapBuild *snapshot_builder;
+
+ OutputPluginCallbacks callbacks;
+ OutputPluginOptions options;
+
+ /*
+ * User specified options
+ */
+ List *output_plugin_options;
+
+ /*
+ * User-Provided callback for writing/streaming out data.
+ */
+ LogicalOutputPluginWriterPrepareWrite prepare_write;
+ LogicalOutputPluginWriterWrite write;
+
+ /*
+ * Output buffer.
+ */
+ StringInfo out;
+
+ /*
+ * Private data pointer of the output plugin.
+ */
+ void *output_plugin_private;
+
+ /*
+ * Private data pointer for the data writer.
+ */
+ void *output_writer_private;
+
+ /*
+ * State for writing output.
+ */
+ bool accept_writes;
+ bool prepared_write;
+ XLogRecPtr write_location;
+ TransactionId write_xid;
+} LogicalDecodingContext;
+
+extern void CheckLogicalDecodingRequirements(void);
+
+extern LogicalDecodingContext *CreateInitDecodingContext(char *plugin,
+ List *output_plugin_options,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write);
+extern LogicalDecodingContext *CreateDecodingContext(
+ XLogRecPtr start_lsn,
+ List *output_plugin_options,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write);
+extern void DecodingContextFindStartpoint(LogicalDecodingContext *ctx);
+extern bool DecodingContextReady(LogicalDecodingContext *ctx);
+extern void FreeDecodingContext(LogicalDecodingContext *ctx);
+
+extern void LogicalIncreaseXminForSlot(XLogRecPtr lsn, TransactionId xmin);
+extern void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn,
+ XLogRecPtr restart_lsn);
+extern void LogicalConfirmReceivedLocation(XLogRecPtr lsn);
+
+#endif
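The writer typedefs above are the hook by which a consumer receives decoded data. A hypothetical pair of callbacks is sketched below; the sketch_* names are made up, a SQL-level consumer would accumulate ctx->out into a tuplestore while a walsender would ship it over the wire:

    static void
    sketch_prepare_write(LogicalDecodingContext *ctx, XLogRecPtr lsn,
                         TransactionId xid, bool last_write)
    {
        resetStringInfo(ctx->out);
    }

    static void
    sketch_write(LogicalDecodingContext *ctx, XLogRecPtr lsn,
                 TransactionId xid, bool last_write)
    {
        /* hand ctx->out->data (ctx->out->len bytes) to the transport in use */
    }

These two would then be passed as the prepare_write and do_write arguments of CreateDecodingContext() or CreateInitDecodingContext().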
diff --git a/src/include/replication/logicalfuncs.h b/src/include/replication/logicalfuncs.h
new file mode 100644
index 00000000000..21bf44ec4b7
--- /dev/null
+++ b/src/include/replication/logicalfuncs.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ * logicalfuncs.h
+ * PostgreSQL WAL to logical transformation support functions
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOGICALFUNCS_H
+#define LOGICALFUNCS_H
+
+#include "replication/logical.h"
+
+extern int logical_read_local_xlog_page(XLogReaderState *state,
+ XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr,
+ char *cur_page, TimeLineID *pageTLI);
+
+extern Datum pg_logical_slot_get_changes(PG_FUNCTION_ARGS);
+extern Datum pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS);
+extern Datum pg_logical_slot_peek_changes(PG_FUNCTION_ARGS);
+extern Datum pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS);
+
+#endif
diff --git a/src/include/replication/output_plugin.h b/src/include/replication/output_plugin.h
new file mode 100644
index 00000000000..c47c24c8dbe
--- /dev/null
+++ b/src/include/replication/output_plugin.h
@@ -0,0 +1,98 @@
+/*-------------------------------------------------------------------------
+ * output_plugin.h
+ * PostgreSQL Logical Decode Plugin Interface
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef OUTPUT_PLUGIN_H
+#define OUTPUT_PLUGIN_H
+
+#include "replication/reorderbuffer.h"
+
+struct LogicalDecodingContext;
+struct OutputPluginCallbacks;
+
+typedef enum OutputPluginOutputType
+{
+ OUTPUT_PLUGIN_BINARY_OUTPUT,
+ OUTPUT_PLUGIN_TEXTUAL_OUTPUT
+} OutputPluginOutputType;
+
+/*
+ * Options set by the output plugin, in the startup callback.
+ */
+typedef struct OutputPluginOptions
+{
+ OutputPluginOutputType output_type;
+} OutputPluginOptions;
+
+/*
+ * Type of the shared library symbol _PG_output_plugin_init that is looked up
+ * when loading an output plugin shared library.
+ */
+typedef void (*LogicalOutputPluginInit)(struct OutputPluginCallbacks *cb);
+
+/*
+ * Callback that gets called in a user-defined plugin. ctx->private_data can
+ * be set to some private data.
+ *
+ * "is_init" will be set to "true" if the decoding slot just got defined. When
+ * the same slot is used from then on, it will be "false".
+ */
+typedef void (*LogicalDecodeStartupCB) (
+ struct LogicalDecodingContext *ctx,
+ OutputPluginOptions *options,
+ bool is_init
+);
+
+/*
+ * Callback called for every (explicit or implicit) BEGIN of a successful
+ * transaction.
+ */
+typedef void (*LogicalDecodeBeginCB) (
+ struct LogicalDecodingContext *,
+ ReorderBufferTXN *txn);
+
+/*
+ * Callback for every individual change in a successful transaction.
+ */
+typedef void (*LogicalDecodeChangeCB) (
+ struct LogicalDecodingContext *,
+ ReorderBufferTXN *txn,
+ Relation relation,
+ ReorderBufferChange *change
+);
+
+/*
+ * Called for every (explicit or implicit) COMMIT of a successful transaction.
+ */
+typedef void (*LogicalDecodeCommitCB) (
+ struct LogicalDecodingContext *,
+ ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn);
+
+/*
+ * Called to shutdown an output plugin.
+ */
+typedef void (*LogicalDecodeShutdownCB) (
+ struct LogicalDecodingContext *
+);
+
+/*
+ * Output plugin callbacks
+ */
+typedef struct OutputPluginCallbacks
+{
+ LogicalDecodeStartupCB startup_cb;
+ LogicalDecodeBeginCB begin_cb;
+ LogicalDecodeChangeCB change_cb;
+ LogicalDecodeCommitCB commit_cb;
+ LogicalDecodeShutdownCB shutdown_cb;
+} OutputPluginCallbacks;
+
+void OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write);
+void OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write);
+
+#endif /* OUTPUT_PLUGIN_H */
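For orientation, here is a minimal skeleton of an output plugin wired up through the callbacks declared above. It is a sketch under the API shown in this header, not a shipped module; the sketch_* names are placeholders and the change callback emits a fixed string only to show the PrepareWrite/Write pairing:

    #include "postgres.h"
    #include "fmgr.h"
    #include "replication/logical.h"
    #include "replication/output_plugin.h"

    PG_MODULE_MAGIC;

    extern void _PG_output_plugin_init(OutputPluginCallbacks *cb);

    static void
    sketch_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
                   bool is_init)
    {
        opt->output_type = OUTPUT_PLUGIN_TEXTUAL_OUTPUT;
    }

    static void
    sketch_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
    {
    }

    static void
    sketch_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
                  Relation relation, ReorderBufferChange *change)
    {
        OutputPluginPrepareWrite(ctx, true);
        appendStringInfoString(ctx->out, "change");
        OutputPluginWrite(ctx, true);
    }

    static void
    sketch_commit(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
                  XLogRecPtr commit_lsn)
    {
    }

    void
    _PG_output_plugin_init(OutputPluginCallbacks *cb)
    {
        cb->startup_cb = sketch_startup;
        cb->begin_cb = sketch_begin;
        cb->change_cb = sketch_change;
        cb->commit_cb = sketch_commit;
        cb->shutdown_cb = NULL;
    }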
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
new file mode 100644
index 00000000000..01eabfb7be7
--- /dev/null
+++ b/src/include/replication/reorderbuffer.h
@@ -0,0 +1,351 @@
+/*
+ * reorderbuffer.h
+ * PostgreSQL logical replay/reorder buffer management.
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * src/include/replication/reorderbuffer.h
+ */
+#ifndef REORDERBUFFER_H
+#define REORDERBUFFER_H
+
+#include "access/htup_details.h"
+
+#include "lib/ilist.h"
+
+#include "storage/sinval.h"
+
+#include "utils/hsearch.h"
+#include "utils/rel.h"
+#include "utils/snapshot.h"
+#include "utils/timestamp.h"
+
+/* an individual tuple, stored in one chunk of memory */
+typedef struct ReorderBufferTupleBuf
+{
+ /* position in preallocated list */
+ slist_node node;
+
+ /* tuple, stored sequentially */
+ HeapTupleData tuple;
+ HeapTupleHeaderData header;
+ char data[MaxHeapTupleSize];
+} ReorderBufferTupleBuf;
+
+/* types of the change passed to a 'change' callback */
+enum ReorderBufferChangeType
+{
+ REORDER_BUFFER_CHANGE_INSERT,
+ REORDER_BUFFER_CHANGE_UPDATE,
+ REORDER_BUFFER_CHANGE_DELETE
+};
+
+/*
+ * A single 'change'; it can be an insert (with one tuple), an update (old, new),
+ * or a delete (old).
+ *
+ * The same struct is also used internally for other purposes but that should
+ * never be visible outside reorderbuffer.c.
+ */
+typedef struct ReorderBufferChange
+{
+ XLogRecPtr lsn;
+
+ /* type of change */
+ union
+ {
+ enum ReorderBufferChangeType action;
+ /* do not leak internal enum values to the outside */
+ int action_internal;
+ };
+
+ /*
+ * Context data for the change, which part of the union is valid depends
+ * on action/action_internal.
+ */
+ union
+ {
+ /* old, new tuples when action == *_INSERT|UPDATE|DELETE */
+ struct
+ {
+ /* relation that has been changed */
+ RelFileNode relnode;
+ /* valid for DELETE || UPDATE */
+ ReorderBufferTupleBuf *oldtuple;
+ /* valid for INSERT || UPDATE */
+ ReorderBufferTupleBuf *newtuple;
+ } tp;
+
+ /* new snapshot */
+ Snapshot snapshot;
+
+ /* new command id for existing snapshot in a catalog changing tx */
+ CommandId command_id;
+
+ /* new cid mapping for catalog changing transaction */
+ struct
+ {
+ RelFileNode node;
+ ItemPointerData tid;
+ CommandId cmin;
+ CommandId cmax;
+ CommandId combocid;
+ } tuplecid;
+ };
+
+ /*
+ * While in use this is how a change is linked into a transaction's list
+ * of changes; otherwise it links the change into the preallocated list.
+ */
+ dlist_node node;
+} ReorderBufferChange;
+
+typedef struct ReorderBufferTXN
+{
+ /*
+ * The transaction's transaction id; can be a toplevel or sub xid.
+ */
+ TransactionId xid;
+
+ /* did the TX have catalog changes */
+ bool has_catalog_changes;
+
+ /*
+ * Do we know this is a subxact?
+ */
+ bool is_known_as_subxact;
+
+ /*
+ * LSN of the first data-carrying WAL record with knowledge about this
+ * xid. This is allowed to *not* be the first record adorned with this
+ * xid, if the previous records aren't relevant for logical decoding.
+ */
+ XLogRecPtr first_lsn;
+
+ /* ----
+ * LSN of the record that caused this xact to be committed or
+ * aborted. This can be a
+ * * plain commit record
+ * * plain commit record, of a parent transaction
+ * * prepared transaction commit
+ * * plain abort record
+ * * prepared transaction abort
+ * * error during decoding
+ * ----
+ */
+ XLogRecPtr final_lsn;
+
+ /*
+ * LSN pointing to the end of the commit record + 1.
+ */
+ XLogRecPtr end_lsn;
+
+ /*
+ * Last LSN at which snapshot information resides, so we can
+ * restart decoding from there and fully recover this transaction from
+ * WAL.
+ */
+ XLogRecPtr restart_decoding_lsn;
+
+ /*
+ * Commit time, only known when we read the actual commit record.
+ */
+ TimestampTz commit_time;
+
+ /*
+ * Base snapshot or NULL.
+ */
+ Snapshot base_snapshot;
+ XLogRecPtr base_snapshot_lsn;
+
+ /*
+ * How many ReorderBufferChange's do we have in this txn.
+ *
+ * Changes in subtransactions are *not* included but tracked separately.
+ */
+ uint64 nentries;
+
+ /*
+ * How many of the above entries are stored in memory in contrast to being
+ * spilled to disk.
+ */
+ uint64 nentries_mem;
+
+ /*
+ * List of ReorderBufferChange structs, including new Snapshots and new
+ * CommandIds
+ */
+ dlist_head changes;
+
+ /*
+ * List of (relation, ctid) => (cmin, cmax) mappings for catalog tuples.
+ * Those are always assigned to the toplevel transaction. (Keep track of
+ * #entries to create a hash of the right size)
+ */
+ dlist_head tuplecids;
+ uint64 ntuplecids;
+
+ /*
+ * On-demand built hash for looking up the above values.
+ */
+ HTAB *tuplecid_hash;
+
+ /*
+ * Hash containing (potentially partial) toast entries. NULL if no toast
+ * tuples have been found for the current change.
+ */
+ HTAB *toast_hash;
+
+ /*
+ * non-hierarchical list of subtransactions that are *not* aborted. Only
+ * used in toplevel transactions.
+ */
+ dlist_head subtxns;
+ uint32 nsubtxns;
+
+ /*
+ * Stored cache invalidations. This is not a linked list because we get
+ * all the invalidations at once.
+ */
+ uint32 ninvalidations;
+ SharedInvalidationMessage *invalidations;
+
+ /* ---
+ * Position in one of three lists:
+ * * list of subtransactions if we are *known* to be subxact
+ * * list of toplevel xacts (can be an as-yet unknown subxact)
+ * * list of preallocated ReorderBufferTXNs
+ * ---
+ */
+ dlist_node node;
+
+} ReorderBufferTXN;
+
+/* so we can define the callbacks used inside struct ReorderBuffer itself */
+typedef struct ReorderBuffer ReorderBuffer;
+
+/* change callback signature */
+typedef void (*ReorderBufferApplyChangeCB) (
+ ReorderBuffer *rb,
+ ReorderBufferTXN *txn,
+ Relation relation,
+ ReorderBufferChange *change);
+
+/* begin callback signature */
+typedef void (*ReorderBufferBeginCB) (
+ ReorderBuffer *rb,
+ ReorderBufferTXN *txn);
+
+/* commit callback signature */
+typedef void (*ReorderBufferCommitCB) (
+ ReorderBuffer *rb,
+ ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn);
+
+struct ReorderBuffer
+{
+ /*
+ * xid => ReorderBufferTXN lookup table
+ */
+ HTAB *by_txn;
+
+ /*
+ * Transactions that could be a toplevel xact, ordered by LSN of the first
+ * record bearing that xid.
+ */
+ dlist_head toplevel_by_lsn;
+
+ /*
+ * one-entry sized cache for by_txn. Very frequently the same txn gets
+ * looked up over and over again.
+ */
+ TransactionId by_txn_last_xid;
+ ReorderBufferTXN *by_txn_last_txn;
+
+ /*
+ * Callbacks to be called when a transaction commits.
+ */
+ ReorderBufferBeginCB begin;
+ ReorderBufferApplyChangeCB apply_change;
+ ReorderBufferCommitCB commit;
+
+ /*
+ * Pointer that will be passed untouched to the callbacks.
+ */
+ void *private_data;
+
+ /*
+ * Private memory context.
+ */
+ MemoryContext context;
+
+ /*
+ * Data structure slab cache.
+ *
+ * We allocate/deallocate some structures very frequently; to avoid bigger
+ * overhead we cache some unused ones here.
+ *
+ * The maximum number of cached entries is controlled by const variables
+ * at the top of reorderbuffer.c.
+ */
+
+ /* cached ReorderBufferTXNs */
+ dlist_head cached_transactions;
+ Size nr_cached_transactions;
+
+ /* cached ReorderBufferChanges */
+ dlist_head cached_changes;
+ Size nr_cached_changes;
+
+ /* cached ReorderBufferTupleBufs */
+ slist_head cached_tuplebufs;
+ Size nr_cached_tuplebufs;
+
+ XLogRecPtr current_restart_decoding_lsn;
+
+ /* buffer for disk<->memory conversions */
+ char *outbuf;
+ Size outbufsize;
+};
+
+
+ReorderBuffer *ReorderBufferAllocate(void);
+void ReorderBufferFree(ReorderBuffer *);
+
+ReorderBufferTupleBuf *ReorderBufferGetTupleBuf(ReorderBuffer *);
+void ReorderBufferReturnTupleBuf(ReorderBuffer *, ReorderBufferTupleBuf *tuple);
+ReorderBufferChange *ReorderBufferGetChange(ReorderBuffer *);
+void ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *);
+
+void ReorderBufferQueueChange(ReorderBuffer *, TransactionId, XLogRecPtr lsn, ReorderBufferChange *);
+void ReorderBufferCommit(ReorderBuffer *, TransactionId,
+ XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
+ TimestampTz commit_time);
+void ReorderBufferAssignChild(ReorderBuffer *, TransactionId, TransactionId, XLogRecPtr commit_lsn);
+void ReorderBufferCommitChild(ReorderBuffer *, TransactionId, TransactionId,
+ XLogRecPtr commit_lsn, XLogRecPtr end_lsn);
+void ReorderBufferAbort(ReorderBuffer *, TransactionId, XLogRecPtr lsn);
+void ReorderBufferAbortOld(ReorderBuffer *, TransactionId xid);
+void ReorderBufferForget(ReorderBuffer *, TransactionId, XLogRecPtr lsn);
+
+void ReorderBufferSetBaseSnapshot(ReorderBuffer *, TransactionId, XLogRecPtr lsn, struct SnapshotData *snap);
+void ReorderBufferAddSnapshot(ReorderBuffer *, TransactionId, XLogRecPtr lsn, struct SnapshotData *snap);
+void ReorderBufferAddNewCommandId(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+ CommandId cid);
+void ReorderBufferAddNewTupleCids(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+ RelFileNode node, ItemPointerData pt,
+ CommandId cmin, CommandId cmax, CommandId combocid);
+void ReorderBufferAddInvalidations(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+ Size nmsgs, SharedInvalidationMessage *msgs);
+bool ReorderBufferIsXidKnown(ReorderBuffer *, TransactionId xid);
+void ReorderBufferXidSetCatalogChanges(ReorderBuffer *, TransactionId xid, XLogRecPtr lsn);
+bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *, TransactionId xid);
+bool ReorderBufferXidHasBaseSnapshot(ReorderBuffer *, TransactionId xid);
+
+ReorderBufferTXN *ReorderBufferGetOldestTXN(ReorderBuffer *);
+
+void ReorderBufferSetRestartPoint(ReorderBuffer *, XLogRecPtr ptr);
+
+void StartupReorderBuffer(void);
+
+#endif
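The begin/apply_change/commit fields of struct ReorderBuffer are how decoded changes reach a consumer. A rough sketch of the wiring; the my_* callbacks and my_state are placeholders, and in this patch the equivalent wrappers are installed by logical.c and simply forward to the output plugin callbacks:

    ReorderBuffer *rb = ReorderBufferAllocate();

    rb->begin = my_begin_cb;            /* ReorderBufferBeginCB */
    rb->apply_change = my_change_cb;    /* ReorderBufferApplyChangeCB */
    rb->commit = my_commit_cb;          /* ReorderBufferCommitCB */
    rb->private_data = my_state;        /* handed back to the callbacks untouched */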
diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h
index 089b0f4b70c..c354c9133bf 100644
--- a/src/include/replication/slot.h
+++ b/src/include/replication/slot.h
@@ -16,6 +16,24 @@
#include "storage/shmem.h"
#include "storage/spin.h"
+/*
+ * Behaviour of replication slots, upon release or crash.
+ *
+ * Slots marked as PERSISTENT are crashsafe and will not be dropped when
+ * released. Slots marked as EPHEMERAL will be dropped when released or after
+ * restarts.
+ *
+ * EPHEMERAL slots can be made PERSISTENT by calling ReplicationSlotPersist().
+ */
+typedef enum ReplicationSlotPersistency
+{
+ RS_PERSISTENT,
+ RS_EPHEMERAL
+} ReplicationSlotPersistency;
+
+/*
+ * On-Disk data of a replication slot, preserved across restarts.
+ */
typedef struct ReplicationSlotPersistentData
{
/* The slot's identifier */
@@ -25,6 +43,11 @@ typedef struct ReplicationSlotPersistentData
Oid database;
/*
+ * The slot's behaviour when being dropped (or restored after a crash).
+ */
+ ReplicationSlotPersistency persistency;
+
+ /*
* xmin horizon for data
*
* NB: This may represent a value that hasn't been written to disk yet;
@@ -32,9 +55,22 @@ typedef struct ReplicationSlotPersistentData
*/
TransactionId xmin;
+ /*
+ * xmin horizon for catalog tuples
+ *
+ * NB: This may represent a value that hasn't been written to disk yet;
+ * see notes for effective_xmin, below.
+ */
+ TransactionId catalog_xmin;
+
/* oldest LSN that might be required by this replication slot */
XLogRecPtr restart_lsn;
+ /* oldest LSN that the client has acked receipt for */
+ XLogRecPtr confirmed_flush;
+
+ /* plugin name */
+ NameData plugin;
} ReplicationSlotPersistentData;
/*
@@ -67,12 +103,26 @@ typedef struct ReplicationSlot
* same as the persistent value (data.xmin).
*/
TransactionId effective_xmin;
+ TransactionId effective_catalog_xmin;
/* data surviving shutdowns and crashes */
ReplicationSlotPersistentData data;
/* is somebody performing io on this slot? */
LWLock *io_in_progress_lock;
+
+ /* all the remaining data is only used for logical slots */
+
+ /* ----
+ * When the client has confirmed flushes >= candidate_xmin_lsn we can
+ * advance the catalog xmin, when restart_valid has been passed,
+ * restart_lsn can be increased.
+ * ----
+ */
+ TransactionId candidate_catalog_xmin;
+ XLogRecPtr candidate_xmin_lsn;
+ XLogRecPtr candidate_restart_valid;
+ XLogRecPtr candidate_restart_lsn;
} ReplicationSlot;
/*
@@ -97,8 +147,11 @@ extern Size ReplicationSlotsShmemSize(void);
extern void ReplicationSlotsShmemInit(void);
/* management of individual slots */
-extern void ReplicationSlotCreate(const char *name, bool db_specific);
+extern void ReplicationSlotCreate(const char *name, bool db_specific,
+ ReplicationSlotPersistency p);
+extern void ReplicationSlotPersist(void);
extern void ReplicationSlotDrop(const char *name);
+
extern void ReplicationSlotAcquire(const char *name);
extern void ReplicationSlotRelease(void);
extern void ReplicationSlotSave(void);
@@ -106,15 +159,20 @@ extern void ReplicationSlotMarkDirty(void);
/* misc stuff */
extern bool ReplicationSlotValidateName(const char *name, int elevel);
-extern void ReplicationSlotsComputeRequiredXmin(void);
+extern void ReplicationSlotsComputeRequiredXmin(bool already_locked);
extern void ReplicationSlotsComputeRequiredLSN(void);
+extern XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void);
+extern bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive);
+
extern void StartupReplicationSlots(XLogRecPtr checkPointRedo);
extern void CheckPointReplicationSlots(void);
extern void CheckSlotRequirements(void);
-extern void ReplicationSlotAtProcExit(void);
/* SQL callable functions */
+extern Datum pg_create_physical_replication_slot(PG_FUNCTION_ARGS);
+extern Datum pg_create_logical_replication_slot(PG_FUNCTION_ARGS);
+extern Datum pg_drop_replication_slot(PG_FUNCTION_ARGS);
extern Datum pg_get_replication_slots(PG_FUNCTION_ARGS);
#endif /* SLOT_H */
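
In use, the persistency states above form a simple create-then-promote life cycle: a slot is created EPHEMERAL while it is being set up and is promoted only once setup has fully succeeded, so an error or restart in between just drops it again. A minimal sketch, assuming only the declarations in this header; the function name and the elided setup step are illustrative, not part of the patch:

/* Sketch only: not part of the patch. */
static void
create_slot_sketch(const char *name)
{
	/* EPHEMERAL: dropped automatically on error or after a restart */
	ReplicationSlotCreate(name, true, RS_EPHEMERAL);

	/* ... initialize the slot's decoding state here ... */

	/* setup succeeded: keep the slot across restarts from now on */
	ReplicationSlotPersist();

	ReplicationSlotRelease();
}
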
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h
new file mode 100644
index 00000000000..087c0e510d5
--- /dev/null
+++ b/src/include/replication/snapbuild.h
@@ -0,0 +1,83 @@
+/*-------------------------------------------------------------------------
+ *
+ * snapbuild.h
+ * Exports from replication/logical/snapbuild.c.
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * src/include/replication/snapbuild.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SNAPBUILD_H
+#define SNAPBUILD_H
+
+#include "access/xlogdefs.h"
+#include "utils/snapmgr.h"
+
+typedef enum
+{
+ /*
+ * Initial state, we can't do much yet.
+ */
+ SNAPBUILD_START,
+
+ /*
+ * We have collected enough information to decode tuples in transactions
+ * that started after this.
+ *
+	 * Once we have reached this state we start to collect changes. We cannot
+	 * apply them yet, because they might be based on transactions that were
+	 * still running when this state was reached.
+ */
+ SNAPBUILD_FULL_SNAPSHOT,
+
+ /*
+	 * Found a point after reaching SNAPBUILD_FULL_SNAPSHOT where all
+	 * transactions that were running at that point have finished. Until we
+	 * reach this state we hold off calling any commit callbacks.
+ */
+ SNAPBUILD_CONSISTENT
+} SnapBuildState;
+
+/* forward declare so we don't have to expose the struct to the public */
+struct SnapBuild;
+typedef struct SnapBuild SnapBuild;
+
+/* forward declare so we don't have to include reorderbuffer.h */
+struct ReorderBuffer;
+
+/* forward declare so we don't have to include heapam_xlog.h */
+struct xl_heap_new_cid;
+struct xl_running_xacts;
+
+extern void CheckPointSnapBuild(void);
+
+extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *cache,
+ TransactionId xmin_horizon, XLogRecPtr start_lsn);
+extern void FreeSnapshotBuilder(SnapBuild *cache);
+
+extern void SnapBuildSnapDecRefcount(Snapshot snap);
+
+extern const char *SnapBuildExportSnapshot(SnapBuild *snapstate);
+extern void SnapBuildClearExportedSnapshot(void);
+
+extern SnapBuildState SnapBuildCurrentState(SnapBuild *snapstate);
+
+extern bool SnapBuildXactNeedsSkip(SnapBuild *snapstate, XLogRecPtr ptr);
+
+extern void SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn,
+ TransactionId xid, int nsubxacts,
+ TransactionId *subxacts);
+extern void SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn,
+ TransactionId xid, int nsubxacts,
+ TransactionId *subxacts);
+extern bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid,
+ XLogRecPtr lsn);
+extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
+ XLogRecPtr lsn, struct xl_heap_new_cid *cid);
+extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn,
+ struct xl_running_xacts *running);
+extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn);
+
+#endif /* SNAPBUILD_H */
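
The state machine above is what gates when decoded transactions may be handed to an output plugin. A hedged sketch of how a decoding loop might consult it, assuming only the declarations in this header (the helper name and the way records are dispatched are illustrative):

/* Sketch only: not part of the patch. */
static bool
commit_is_replayable_sketch(SnapBuild *builder, XLogRecPtr lsn,
							struct xl_running_xacts *running)
{
	/*
	 * xl_running_xacts records drive the builder from SNAPBUILD_START
	 * through SNAPBUILD_FULL_SNAPSHOT to SNAPBUILD_CONSISTENT.
	 */
	if (running != NULL)
		SnapBuildProcessRunningXacts(builder, lsn, running);

	/* before consistency is reached, commits must not reach the plugin */
	if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
		return false;

	/* records from before the requested start point are skipped as well */
	if (SnapBuildXactNeedsSkip(builder, lsn))
		return false;

	return true;
}
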
diff --git a/src/include/storage/itemptr.h b/src/include/storage/itemptr.h
index 67bbdbb988a..0b81d53f5f8 100644
--- a/src/include/storage/itemptr.h
+++ b/src/include/storage/itemptr.h
@@ -116,6 +116,9 @@ typedef ItemPointerData *ItemPointer;
/*
* ItemPointerCopy
* Copies the contents of one disk item pointer to another.
+ *
+ * Should there ever be padding in an ItemPointer, this would need to be
+ * handled differently, as it is used as a hash key.
*/
#define ItemPointerCopy(fromPointer, toPointer) \
( \
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index a3cadd9a017..5218b448cd6 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -41,10 +41,12 @@ struct XidCache
#define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */
#define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */
#define PROC_IN_ANALYZE 0x04 /* currently running analyze */
-#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */
+#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */
+#define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical decoding */
/* flags reset at EOXact */
-#define PROC_VACUUM_STATE_MASK (0x0E)
+#define PROC_VACUUM_STATE_MASK \
+ (PROC_IN_VACUUM | PROC_IN_ANALYZE | PROC_VACUUM_FOR_WRAPAROUND)
/*
 * We allow a small number of "weak" relation locks (AccessShareLock,
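
Spelling the mask out as an explicit OR makes the intent visible: the new PROC_IN_LOGICAL_DECODING bit is deliberately excluded, so it is not cleared along with the vacuum-related flags at end of transaction. A hedged illustration, assuming the flags live in a vacuumFlags-style field (the field and variable names here are illustrative, not taken from the patch):

	/* end of transaction: only the vacuum-related bits are reset */
	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;

	/*
	 * PROC_IN_LOGICAL_DECODING survives EOXact and is presumably cleared
	 * explicitly by the decoding code once the backend's xmin no longer
	 * needs to be treated specially.
	 */
	pgxact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING;
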
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index d1a58a3661b..d0b4103a09e 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -15,6 +15,7 @@
#define PROCARRAY_H
#include "storage/standby.h"
+#include "utils/relcache.h"
#include "utils/snapshot.h"
@@ -50,8 +51,9 @@ extern RunningTransactions GetRunningTransactionData(void);
extern bool TransactionIdIsInProgress(TransactionId xid);
extern bool TransactionIdIsActive(TransactionId xid);
-extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum);
+extern TransactionId GetOldestXmin(Relation rel, bool ignoreVacuum);
extern TransactionId GetOldestActiveTransactionId(void);
+extern TransactionId GetOldestSafeDecodingTransactionId(void);
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
@@ -77,6 +79,10 @@ extern void XidCacheRemoveRunningXids(TransactionId xid,
int nxids, const TransactionId *xids,
TransactionId latestXid);
-extern void ProcArraySetReplicationSlotXmin(TransactionId xmin);
+extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
+ TransactionId catalog_xmin, bool already_locked);
+
+extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
+ TransactionId *catalog_xmin);
#endif /* PROCARRAY_H */
diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h
index 0cae810f49e..d5bb850337d 100644
--- a/src/include/storage/sinval.h
+++ b/src/include/storage/sinval.h
@@ -147,4 +147,6 @@ extern void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs
int nmsgs, bool RelcacheInitFileInval,
Oid dbid, Oid tsid);
+extern void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg);
+
#endif /* SINVAL_H */
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index c1409c863db..6156e0219d0 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -64,4 +64,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
+extern void InvalidateSystemCaches(void);
#endif /* INVAL_H */
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index c601770ec99..abe7016d040 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -23,12 +23,14 @@ extern bool FirstSnapshotSet;
extern TransactionId TransactionXmin;
extern TransactionId RecentXmin;
extern TransactionId RecentGlobalXmin;
+extern TransactionId RecentGlobalDataXmin;
extern Snapshot GetTransactionSnapshot(void);
extern Snapshot GetLatestSnapshot(void);
extern void SnapshotSetCommandId(CommandId curcid);
extern Snapshot GetCatalogSnapshot(Oid relid);
+extern Snapshot GetNonHistoricCatalogSnapshot(Oid relid);
extern void InvalidateCatalogSnapshot(void);
extern void PushActiveSnapshot(Snapshot snapshot);
@@ -53,4 +55,13 @@ extern bool XactHasExportedSnapshots(void);
extern void DeleteAllExportedSnapshotFiles(void);
extern bool ThereAreNoPriorRegisteredSnapshots(void);
+extern char *ExportSnapshot(Snapshot snapshot);
+
+/* Support for catalog timetravel for logical decoding */
+struct HTAB;
+extern struct HTAB *HistoricSnapshotGetTupleCids(void);
+extern void SetupHistoricSnapshot(Snapshot snapshot_now, struct HTAB *tuplecids);
+extern void TeardownHistoricSnapshot(bool is_error);
+extern bool HistoricSnapshotActive(void);
+
#endif /* SNAPMGR_H */
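
The historic-snapshot hooks above bracket catalog access while changes are replayed to an output plugin: install the snapshot plus the tuplecid mapping, run the plugin, and tear the snapshot down on both the normal and the error path. A hedged sketch, assuming only these declarations (the PG_TRY structure and the variable names are illustrative):

	/* Sketch only: not part of the patch. */
	SetupHistoricSnapshot(snapshot_now, tuplecid_hash);

	PG_TRY();
	{
		/* ... invoke the output plugin; catalog scans now time travel ... */
	}
	PG_CATCH();
	{
		TeardownHistoricSnapshot(true);		/* error path */
		PG_RE_THROW();
	}
	PG_END_TRY();

	TeardownHistoricSnapshot(false);		/* normal path */
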
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index c542238825a..4b256074b0b 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -30,6 +30,22 @@ typedef struct SnapshotData *Snapshot;
typedef bool (*SnapshotSatisfiesFunc) (HeapTuple htup,
Snapshot snapshot, Buffer buffer);
+/*
+ * Struct representing all kinds of possible snapshots.
+ *
+ * There are several different kinds of snapshots:
+ * * Normal MVCC snapshots
+ * * MVCC snapshots taken during recovery (in Hot-Standby mode)
+ * * Historic MVCC snapshots used during logical decoding
+ * * snapshots passed to HeapTupleSatisfiesDirty()
+ * * snapshots used for SatisfiesAny, Toast, Self where no members are
+ * accessed.
+ *
+ * TODO: It's probably a good idea to split this struct using a NodeTag
+ * similar to how parser and executor nodes are handled, with one type for
+ * each different kind of snapshot to avoid overloading the meaning of
+ * individual fields.
+ */
typedef struct SnapshotData
{
SnapshotSatisfiesFunc satisfies; /* tuple test function */
@@ -46,11 +62,23 @@ typedef struct SnapshotData
*/
TransactionId xmin; /* all XID < xmin are visible to me */
TransactionId xmax; /* all XID >= xmax are invisible to me */
- TransactionId *xip; /* array of xact IDs in progress */
+ /*
+	 * For normal MVCC snapshots this contains all the xact IDs that are in
+ * progress, unless the snapshot was taken during recovery in which case
+ * it's empty. For historic MVCC snapshots, the meaning is inverted,
+ * i.e. it contains *committed* transactions between xmin and xmax.
+ */
+ TransactionId *xip;
uint32 xcnt; /* # of xact ids in xip[] */
/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
int32 subxcnt; /* # of xact ids in subxip[] */
- TransactionId *subxip; /* array of subxact IDs in progress */
+ /*
+ * For non-historic MVCC snapshots, this contains subxact IDs that are in
+ * progress (and other transactions that are in progress if taken during
+	 * recovery). For historic snapshots it contains *all* xids assigned to the
+ * replayed transaction, including the toplevel xid.
+ */
+ TransactionId *subxip;
bool suboverflowed; /* has the subxip array overflowed? */
bool takenDuringRecovery; /* recovery-shaped snapshot? */
bool copied; /* false if it's a static snapshot */
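
The inverted meaning of xip[] is the crux of these comments: a regular MVCC snapshot treats an xid found in xip[] as still in progress (and therefore invisible), whereas a historic snapshot stores the xids that *committed* between xmin and xmax (and therefore visible). A heavily simplified sketch of the two checks, ignoring subtransactions, hint bits and clog lookups; xid_in_array() and the function itself are purely illustrative:

/* Sketch only: not part of the patch. */
static bool
xid_visible_sketch(Snapshot snap, TransactionId xid, bool historic)
{
	if (!historic)
	{
		/* regular MVCC snapshot: xip[] holds in-progress xids */
		if (TransactionIdFollowsOrEquals(xid, snap->xmax))
			return false;		/* too new for this snapshot */
		return !xid_in_array(xid, snap->xip, snap->xcnt);
	}

	/* historic MVCC snapshot: xip[] holds *committed* xids */
	if (TransactionIdPrecedes(xid, snap->xmin))
		return true;			/* committed before the snapshot's xmin */
	return xid_in_array(xid, snap->xip, snap->xcnt);
}
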
diff --git a/src/include/utils/tqual.h b/src/include/utils/tqual.h
index e34c28a4f78..48abe62983d 100644
--- a/src/include/utils/tqual.h
+++ b/src/include/utils/tqual.h
@@ -22,6 +22,7 @@
extern PGDLLIMPORT SnapshotData SnapshotSelfData;
extern PGDLLIMPORT SnapshotData SnapshotAnyData;
extern PGDLLIMPORT SnapshotData SnapshotToastData;
+extern PGDLLIMPORT SnapshotData CatalogSnapshotData;
#define SnapshotSelf (&SnapshotSelfData)
#define SnapshotAny (&SnapshotAnyData)
@@ -37,7 +38,8 @@ extern PGDLLIMPORT SnapshotData SnapshotToastData;
/* This macro encodes the knowledge of which snapshots are MVCC-safe */
#define IsMVCCSnapshot(snapshot) \
- ((snapshot)->satisfies == HeapTupleSatisfiesMVCC)
+ ((snapshot)->satisfies == HeapTupleSatisfiesMVCC || \
+ (snapshot)->satisfies == HeapTupleSatisfiesHistoricMVCC)
/*
* HeapTupleSatisfiesVisibility
@@ -73,6 +75,8 @@ extern bool HeapTupleSatisfiesToast(HeapTuple htup,
Snapshot snapshot, Buffer buffer);
extern bool HeapTupleSatisfiesDirty(HeapTuple htup,
Snapshot snapshot, Buffer buffer);
+extern bool HeapTupleSatisfiesHistoricMVCC(HeapTuple htup,
+ Snapshot snapshot, Buffer buffer);
/* Special "satisfies" routines with different APIs */
extern HTSU_Result HeapTupleSatisfiesUpdate(HeapTuple htup,
@@ -86,4 +90,13 @@ extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer,
uint16 infomask, TransactionId xid);
extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple);
+/*
+ * To avoid leaking too much knowledge about reorderbuffer implementation
+ * details, this is implemented in reorderbuffer.c, not tqual.c.
+ */
+extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data,
+ Snapshot snapshot,
+ HeapTuple htup,
+ Buffer buffer,
+ CommandId *cmin, CommandId *cmax);
#endif /* TQUAL_H */
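
ResolveCminCmaxDuringDecoding() exists because the cmin/cmax stored in catalog tuples changed by the transaction currently being replayed cannot be trusted during decoding (combo command ids, for instance); instead they are resolved from the tuplecid mapping the reorder buffer built from xl_heap_new_cid records. A hedged sketch of a caller, with the function name and the treatment of unresolved tuples being illustrative:

/* Sketch only: not part of the patch. */
static bool
catalog_insert_visible_sketch(Snapshot snapshot, HeapTuple htup, Buffer buffer)
{
	CommandId	cmin,
				cmax;

	if (!ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(),
									   snapshot, htup, buffer,
									   &cmin, &cmax))
		return false;			/* no mapping found: treat as invisible */

	/* only rows inserted before the command being replayed are visible */
	return cmin < snapshot->curcid;
}
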
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index ef50f4da217..b0b6e27d8a9 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1368,13 +1368,15 @@ pg_prepared_xacts| SELECT p.transaction,
LEFT JOIN pg_authid u ON ((p.ownerid = u.oid)))
LEFT JOIN pg_database d ON ((p.dbid = d.oid)));
pg_replication_slots| SELECT l.slot_name,
+ l.plugin,
l.slot_type,
l.datoid,
d.datname AS database,
l.active,
l.xmin,
+ l.catalog_xmin,
l.restart_lsn
- FROM (pg_get_replication_slots() l(slot_name, slot_type, datoid, active, xmin, restart_lsn)
+ FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, active, xmin, catalog_xmin, restart_lsn)
LEFT JOIN pg_database d ON ((l.datoid = d.oid)));
pg_roles| SELECT pg_authid.rolname,
pg_authid.rolsuper,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 3b7f61ef208..f9604541c7a 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -940,6 +940,17 @@ LockTupleMode
LockingClause
LogOpts
LogStmtLevel
+LogicalDecodeBeginCB
+LogicalDecodeChangeCB
+LogicalDecodeCleanupCB
+LogicalDecodeCommitCB
+LogicalDecodeInitCB
+LogicalDecodingCheckpointData
+LogicalDecodingContext
+LogicalDecodingCtlData
+LogicalDecodingSlot
+LogicalOutputPluginWriterPrepareWrite
+LogicalOutputPluginWriterWrite
LogicalTape
LogicalTapeSet
MAGIC
@@ -1053,6 +1064,7 @@ OprInfo
OprProofCacheEntry
OprProofCacheKey
OutputContext
+OutputPluginCallbacks
OverrideSearchPath
OverrideStackEntry
PACE_HEADER
@@ -1468,6 +1480,21 @@ Relids
RelocationBufferInfo
RenameStmt
ReopenPtr
+ReorderBuffer
+ReorderBufferApplyChangeCB
+ReorderBufferBeginCB
+ReorderBufferChange
+ReorderBufferChangeTypeInternal
+ReorderBufferCommitCB
+ReorderBufferDiskChange
+ReorderBufferIterTXNEntry
+ReorderBufferIterTXNState
+ReorderBufferToastEnt
+ReorderBufferTupleBuf
+ReorderBufferTupleCidEnt
+ReorderBufferTupleCidKey
+ReorderBufferTXN
+ReorderBufferTXNByIdEnt
ReplaceVarsFromTargetList_context
ReplaceVarsNoMatchOption
ResTarget
@@ -1522,6 +1549,8 @@ SID_NAME_USE
SISeg
SMgrRelation
SMgrRelationData
+SnapBuildAction
+SnapBuildState
SOCKADDR
SOCKET
SPELL
@@ -1613,6 +1642,8 @@ SlruSharedData
Snapshot
SnapshotData
SnapshotSatisfiesFunc
+Snapstate
+SnapstateOnDisk
SockAddr
Sort
SortBy
@@ -1929,6 +1960,7 @@ XLogReaderState
XLogRecData
XLogRecPtr
XLogRecord
+XLogRecordBuffer
XLogSegNo
XLogSource
XLogwrtResult
@@ -2351,6 +2383,7 @@ symbol
tablespaceinfo
teReqs
teSection
+TestDecodingData
temp_tablespaces_extra
text
timeKEY