Diffstat (limited to 'src')
-rw-r--r--  src/Makefile.global.in  2
-rw-r--r--  src/backend/access/heap/heapam.c  18
-rw-r--r--  src/backend/access/heap/pruneheap.c  37
-rw-r--r--  src/backend/access/heap/rewriteheap.c  606
-rw-r--r--  src/backend/access/heap/tuptoaster.c  26
-rw-r--r--  src/backend/access/index/indexam.c  6
-rw-r--r--  src/backend/access/rmgrdesc/heapdesc.c  4
-rw-r--r--  src/backend/access/transam/xact.c  10
-rw-r--r--  src/backend/access/transam/xlog.c  36
-rw-r--r--  src/backend/catalog/index.c  2
-rw-r--r--  src/backend/catalog/system_views.sql  34
-rw-r--r--  src/backend/commands/analyze.c  3
-rw-r--r--  src/backend/commands/cluster.c  4
-rw-r--r--  src/backend/commands/dbcommands.c  15
-rw-r--r--  src/backend/commands/vacuum.c  8
-rw-r--r--  src/backend/commands/vacuumlazy.c  5
-rw-r--r--  src/backend/executor/nodeBitmapHeapscan.c  3
-rw-r--r--  src/backend/replication/Makefile  2
-rw-r--r--  src/backend/replication/logical/Makefile  19
-rw-r--r--  src/backend/replication/logical/decode.c  826
-rw-r--r--  src/backend/replication/logical/logical.c  920
-rw-r--r--  src/backend/replication/logical/logicalfuncs.c  509
-rw-r--r--  src/backend/replication/logical/reorderbuffer.c  3059
-rw-r--r--  src/backend/replication/logical/snapbuild.c  1885
-rw-r--r--  src/backend/replication/slot.c  289
-rw-r--r--  src/backend/replication/slotfuncs.c  98
-rw-r--r--  src/backend/replication/walreceiver.c  2
-rw-r--r--  src/backend/replication/walsender.c  13
-rw-r--r--  src/backend/storage/ipc/procarray.c  222
-rw-r--r--  src/backend/storage/ipc/standby.c  30
-rw-r--r--  src/backend/storage/lmgr/proc.c  8
-rw-r--r--  src/backend/tcop/postgres.c  11
-rw-r--r--  src/backend/utils/cache/inval.c  4
-rw-r--r--  src/backend/utils/cache/relcache.c  57
-rw-r--r--  src/backend/utils/time/snapmgr.c  102
-rw-r--r--  src/backend/utils/time/tqual.c  164
-rw-r--r--  src/bin/initdb/initdb.c  5
-rw-r--r--  src/include/access/heapam.h  3
-rw-r--r--  src/include/access/heapam_xlog.h  14
-rw-r--r--  src/include/access/rewriteheap.h  29
-rw-r--r--  src/include/access/transam.h  5
-rw-r--r--  src/include/access/tuptoaster.h  27
-rw-r--r--  src/include/access/xlog.h  1
-rw-r--r--  src/include/catalog/catversion.h  2
-rw-r--r--  src/include/catalog/pg_proc.h  12
-rw-r--r--  src/include/commands/vacuum.h  4
-rw-r--r--  src/include/replication/decode.h  19
-rw-r--r--  src/include/replication/logical.h  100
-rw-r--r--  src/include/replication/logicalfuncs.h  24
-rw-r--r--  src/include/replication/output_plugin.h  98
-rw-r--r--  src/include/replication/reorderbuffer.h  351
-rw-r--r--  src/include/replication/slot.h  64
-rw-r--r--  src/include/replication/snapbuild.h  83
-rw-r--r--  src/include/storage/itemptr.h  3
-rw-r--r--  src/include/storage/proc.h  6
-rw-r--r--  src/include/storage/procarray.h  10
-rw-r--r--  src/include/storage/sinval.h  2
-rw-r--r--  src/include/utils/inval.h  1
-rw-r--r--  src/include/utils/snapmgr.h  11
-rw-r--r--  src/include/utils/snapshot.h  32
-rw-r--r--  src/include/utils/tqual.h  15
-rw-r--r--  src/test/regress/expected/rules.out  4
-rw-r--r--  src/tools/pgindent/typedefs.list  33
63 files changed, 9804 insertions, 193 deletions
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 209d1bdf4dc..cdddf492f45 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -468,6 +468,8 @@ pg_regress_installcheck = $(top_builddir)/src/test/regress/pg_regress --inputdir
pg_regress_clean_files = results/ regression.diffs regression.out tmp_check/ log/
+pg_isolation_regress_check = $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) --temp-install=./tmp_check --top-builddir=$(top_builddir) $(pg_regress_locale_flags)
+pg_isolation_regress_installcheck = $(top_builddir)/src/test/isolation/pg_isolation_regress --inputdir=$(srcdir) --top-builddir=$(top_builddir) $(pg_regress_locale_flags)
##########################################################################
#
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index de4befa93f4..71ec74015cd 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -347,8 +347,7 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
/*
* Prune and repair fragmentation for the whole page, if possible.
*/
- Assert(TransactionIdIsValid(RecentGlobalXmin));
- heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+ heap_page_prune_opt(scan->rs_rd, buffer);
/*
* We must hold share lock on the buffer content while examining tuple
@@ -1750,10 +1749,22 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
*/
if (!skip)
{
+ /*
+ * For the benefit of logical decoding, have t_self point at the
+ * element of the HOT chain we're currently investigating instead
+ * of the root tuple of the HOT chain. This is important because
+ * the *Satisfies routine for historical MVCC snapshots needs the
+ * correct tid to decide about visibility in some cases.
+ */
+ ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
+
/* If it's visible per the snapshot, we must return it */
valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
CheckForSerializableConflictOut(valid, relation, heapTuple,
buffer, snapshot);
+ /* reset to original, non-redirected, tid */
+ heapTuple->t_self = *tid;
+
if (valid)
{
ItemPointerSetOffsetNumber(tid, offnum);
@@ -8207,6 +8218,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
* decoding.
*/
break;
+ case XLOG_HEAP2_REWRITE:
+ heap_xlog_logical_rewrite(lsn, record);
+ break;
default:
elog(PANIC, "heap2_redo: unknown op code %u", info);
}
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 27cbac85256..3c69e1badac 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -18,13 +18,14 @@
#include "access/heapam_xlog.h"
#include "access/transam.h"
#include "access/htup_details.h"
+#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
+#include "utils/snapmgr.h"
#include "utils/rel.h"
#include "utils/tqual.h"
-
/* Working data for heap_page_prune and subroutines */
typedef struct
{
@@ -70,10 +71,34 @@ static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
* or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
*/
void
-heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
+heap_page_prune_opt(Relation relation, Buffer buffer)
{
Page page = BufferGetPage(buffer);
Size minfree;
+ TransactionId OldestXmin;
+
+ /*
+ * We can't write WAL in recovery mode, so there's no point trying to
+ * clean the page. The master will likely issue a cleaning WAL record soon
+ * anyway, so this is no particular loss.
+ */
+ if (RecoveryInProgress())
+ return;
+
+ /*
+ * Use the appropriate xmin horizon for this relation. If it's a proper
+ * catalog relation, or a user-defined additional catalog relation, we
+ * need to use the horizon that includes slots; otherwise the data-only
+ * horizon can be used. Note that the toast relations of user-defined
+ * relations are *not* considered catalog relations.
+ */
+ if (IsCatalogRelation(relation) ||
+ RelationIsAccessibleInLogicalDecoding(relation))
+ OldestXmin = RecentGlobalXmin;
+ else
+ OldestXmin = RecentGlobalDataXmin;
+
+ Assert(TransactionIdIsValid(OldestXmin));
/*
* Let's see if we really need pruning.
@@ -85,14 +110,6 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
return;
/*
- * We can't write WAL in recovery mode, so there's no point trying to
- * clean the page. The master will likely issue a cleaning WAL record soon
- * anyway, so this is no particular loss.
- */
- if (RecoveryInProgress())
- return;
-
- /*
* We prune when a previous UPDATE failed to find enough space on the page
* for a new tuple version, or when free space falls below the relation's
* fill-factor target (but not less than 10%).
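The hunk above is the heart of the pruning change: catalog relations (and user-defined relations treated as catalog tables for logical decoding) must be pruned with the slot-aware horizon, while ordinary data can use the potentially more aggressive data-only horizon. A minimal sketch of that rule in isolation, using only symbols already referenced in the hunk; the helper name is illustrative and not part of the patch:

    /* Illustrative helper, not part of the patch: which prune horizon applies? */
    static TransactionId
    prune_horizon_for(Relation relation)
    {
        if (IsCatalogRelation(relation) ||
            RelationIsAccessibleInLogicalDecoding(relation))
            return RecentGlobalXmin;        /* slot-aware horizon, keeps catalog rows */
        else
            return RecentGlobalDataXmin;    /* data-only horizon */
    }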
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index c34ab9865f8..239c7dad0c9 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -102,17 +102,34 @@
*/
#include "postgres.h"
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "miscadmin.h"
+
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/rewriteheap.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
+#include "access/xact.h"
+
+#include "catalog/catalog.h"
+
+#include "lib/ilist.h"
+
+#include "replication/logical.h"
+#include "replication/slot.h"
+
#include "storage/bufmgr.h"
+#include "storage/fd.h"
#include "storage/smgr.h"
+
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tqual.h"
+#include "storage/procarray.h"
/*
* State associated with a rewrite operation. This is opaque to the user
@@ -120,21 +137,28 @@
*/
typedef struct RewriteStateData
{
+ Relation rs_old_rel; /* source heap */
Relation rs_new_rel; /* destination heap */
Page rs_buffer; /* page currently being built */
BlockNumber rs_blockno; /* block where page will go */
bool rs_buffer_valid; /* T if any tuples in buffer */
bool rs_use_wal; /* must we WAL-log inserts? */
+ bool rs_logical_rewrite; /* do we need to do logical rewriting */
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to
* determine tuple visibility */
TransactionId rs_freeze_xid;/* Xid that will be used as freeze cutoff
* point */
+ TransactionId rs_logical_xmin; /* Xid that will be used as cutoff
+ * point for logical rewrites */
MultiXactId rs_cutoff_multi;/* MultiXactId that will be used as cutoff
* point for multixacts */
MemoryContext rs_cxt; /* for hash tables and entries and tuples in
* them */
+ XLogRecPtr rs_begin_lsn; /* XLogInsertLsn when starting the rewrite */
HTAB *rs_unresolved_tups; /* unmatched A tuples */
HTAB *rs_old_new_tid_map; /* unmatched B tuples */
+ HTAB *rs_logical_mappings; /* logical remapping files */
+ uint32 rs_num_rewrite_mappings; /* # in memory mappings */
} RewriteStateData;
/*
@@ -169,14 +193,45 @@ typedef struct
typedef OldToNewMappingData *OldToNewMapping;
+/*
+ * In-Memory data for a xid that might need logical remapping entries
+ * to be logged.
+ */
+typedef struct RewriteMappingFile
+{
+ TransactionId xid; /* xid that might need to see the row */
+ int vfd; /* fd of mappings file */
+ off_t off; /* how far have we written yet */
+ uint32 num_mappings; /* number of in-memory mappings */
+ dlist_head mappings; /* list of in-memory mappings */
+ char path[MAXPGPATH]; /* path, for error messages */
+} RewriteMappingFile;
+
+/*
+ * A single in-memory logical rewrite mapping, hanging off
+ * RewriteMappingFile->mappings.
+ */
+typedef struct RewriteMappingDataEntry
+{
+ LogicalRewriteMappingData map; /* map between old and new location of
+ * the tuple */
+ dlist_node node;
+} RewriteMappingDataEntry;
+
/* prototypes for internal functions */
static void raw_heap_insert(RewriteState state, HeapTuple tup);
+/* internal logical remapping prototypes */
+static void logical_begin_heap_rewrite(RewriteState state);
+static void logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, HeapTuple new_tuple);
+static void logical_end_heap_rewrite(RewriteState state);
+
/*
* Begin a rewrite of a table
*
+ * old_heap old, locked heap relation tuples will be read from
* new_heap new, locked heap relation to insert tuples to
* oldest_xmin xid used by the caller to determine which tuples are dead
* freeze_xid xid before which tuples will be frozen
@@ -187,7 +242,7 @@ static void raw_heap_insert(RewriteState state, HeapTuple tup);
* to be used in subsequent calls to the other functions.
*/
RewriteState
-begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin,
+begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
TransactionId freeze_xid, MultiXactId cutoff_multi,
bool use_wal)
{
@@ -210,6 +265,7 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin,
/* Create and fill in the state struct */
state = palloc0(sizeof(RewriteStateData));
+ state->rs_old_rel = old_heap;
state->rs_new_rel = new_heap;
state->rs_buffer = (Page) palloc(BLCKSZ);
/* new_heap needn't be empty, just locked */
@@ -244,6 +300,8 @@ begin_heap_rewrite(Relation new_heap, TransactionId oldest_xmin,
MemoryContextSwitchTo(old_cxt);
+ logical_begin_heap_rewrite(state);
+
return state;
}
@@ -301,6 +359,8 @@ end_heap_rewrite(RewriteState state)
if (RelationNeedsWAL(state->rs_new_rel))
heap_sync(state->rs_new_rel);
+ logical_end_heap_rewrite(state);
+
/* Deleting the context frees everything */
MemoryContextDelete(state->rs_cxt);
}
@@ -429,6 +489,8 @@ rewrite_heap_tuple(RewriteState state,
raw_heap_insert(state, new_tuple);
new_tid = new_tuple->t_self;
+ logical_rewrite_heap_tuple(state, old_tid, new_tuple);
+
/*
* If the tuple is the updated version of a row, and the prior version
* wouldn't be DEAD yet, then we need to either resolve the prior
@@ -678,3 +740,545 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
if (heaptup != tup)
heap_freetuple(heaptup);
}
+
+/* ------------------------------------------------------------------------
+ * Logical rewrite support
+ *
+ * When doing logical decoding - which relies on using cmin/cmax of catalog
+ * tuples, via xl_heap_new_cid records - heap rewrites have to log enough
+ * information to allow the decoding backend to updates its internal mapping
+ * of (relfilenode,ctid) => (cmin, cmax) to be correct for the rewritten heap.
+ *
+ * For that, every time we find a tuple that's been modified in a catalog
+ * relation within the xmin horizon of any decoding slot, we log a mapping
+ * from the old to the new location.
+ *
+ * To deal with rewrites that abort, the filename of a mapping file contains
+ * the xid of the transaction performing the rewrite, which then can be
+ * checked before being read in.
+ *
+ * For efficiency we don't immediately spill every single mapping for a
+ * row to disk but only do so in batches when we've collected several of them
+ * in memory or when end_heap_rewrite() has been called.
+ *
+ * Crash-Safety: This module diverts from the usual patterns of doing WAL
+ * since it cannot rely on checkpoint flushing out all buffers and thus
+ * waiting for exclusive locks on buffers. Usually the XLogInsert() covering
+ * buffer modifications is performed while the buffer(s) that are being
+ * modified are exclusively locked, guaranteeing that both the WAL record and
+ * the modified heap are on either side of the checkpoint. But since the
+ * mapping files we log aren't in shared_buffers that interlock doesn't work.
+ *
+ * Instead we simply write the mapping files out to disk, *before* the
+ * XLogInsert() is performed. That guarantees that either the XLogInsert() is
+ * inserted after the checkpoint's redo pointer or that the checkpoint (via
+ * LogicalRewriteHeapCheckpoint()) has flushed the (partial) mapping file to
+ * disk. That leaves the tail end that has not yet been flushed open to
+ * corruption, which is solved by including the current offset in the
+ * xl_heap_rewrite_mapping records and truncating the mapping file to it
+ * during replay. Every time a rewrite is finished all generated mapping files
+ * are synced to disk.
+ *
+ * Note that if we were only concerned about crash safety we wouldn't have to
+ * deal with WAL logging at all - an fsync() at the end of a rewrite would be
+ * sufficient for crash safety. Any mapping that hasn't been safely flushed to
+ * disk must belong to an aborted (explicitly or via a crash) transaction and is
+ * ignored by virtue of the xid in its name being subject to a
+ * TransactionDidCommit() check. But we want to support having standbys via
+ * physical replication, both for availability and to do logical decoding
+ * there.
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * Do preparations for logging logical mappings during a rewrite if
+ * necessary. If we detect that we don't need to log anything we'll prevent
+ * any further action by the various logical rewrite functions.
+ */
+static void
+logical_begin_heap_rewrite(RewriteState state)
+{
+ HASHCTL hash_ctl;
+ TransactionId logical_xmin;
+
+ /*
+ * We only need to persist these mappings if the rewritten table can be
+ * accessed during logical decoding, if not, we can skip doing any
+ * additional work.
+ */
+ state->rs_logical_rewrite =
+ RelationIsAccessibleInLogicalDecoding(state->rs_old_rel);
+
+ if (!state->rs_logical_rewrite)
+ return;
+
+ Assert(ReplicationSlotCtl != NULL);
+
+ ProcArrayGetReplicationSlotXmin(NULL, &logical_xmin);
+
+ /*
+ * If there are no logical slots in progress we don't need to do anything,
+ * there cannot be any remappings for relevant rows yet. The relation's
+ * lock protects us against races.
+ */
+ if (logical_xmin == InvalidTransactionId)
+ {
+ state->rs_logical_rewrite = false;
+ return;
+ }
+
+ state->rs_logical_xmin = logical_xmin;
+ state->rs_begin_lsn = GetXLogInsertRecPtr();
+ state->rs_num_rewrite_mappings = 0;
+
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(TransactionId);
+ hash_ctl.entrysize = sizeof(RewriteMappingFile);
+ hash_ctl.hcxt = state->rs_cxt;
+ hash_ctl.hash = tag_hash;
+
+ state->rs_logical_mappings =
+ hash_create("Logical rewrite mapping",
+ 128, /* arbitrary initial size */
+ &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+}
+
+/*
+ * Flush all logical in-memory mappings to disk, but don't fsync them yet.
+ */
+static void
+logical_heap_rewrite_flush_mappings(RewriteState state)
+{
+ HASH_SEQ_STATUS seq_status;
+ RewriteMappingFile *src;
+ dlist_mutable_iter iter;
+
+ Assert(state->rs_logical_rewrite);
+
+ /* no logical rewrite in progress, no need to iterate over mappings */
+ if (state->rs_num_rewrite_mappings == 0)
+ return;
+
+ elog(DEBUG1, "flushing %u logical rewrite mapping entries",
+ state->rs_num_rewrite_mappings);
+
+ hash_seq_init(&seq_status, state->rs_logical_mappings);
+ while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
+ {
+ XLogRecData rdata[2];
+ char *waldata;
+ char *waldata_start;
+ xl_heap_rewrite_mapping xlrec;
+ Oid dboid;
+ uint32 len;
+ int written;
+
+ /* this file hasn't got any new mappings */
+ if (src->num_mappings == 0)
+ continue;
+
+ if (state->rs_old_rel->rd_rel->relisshared)
+ dboid = InvalidOid;
+ else
+ dboid = MyDatabaseId;
+
+ xlrec.num_mappings = src->num_mappings;
+ xlrec.mapped_rel = RelationGetRelid(state->rs_old_rel);
+ xlrec.mapped_xid = src->xid;
+ xlrec.mapped_db = dboid;
+ xlrec.offset = src->off;
+ xlrec.start_lsn = state->rs_begin_lsn;
+
+ rdata[0].data = (char *) (&xlrec);
+ rdata[0].len = sizeof(xlrec);
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].next = &(rdata[1]);
+
+ /* write all mappings consecutively */
+ len = src->num_mappings * sizeof(LogicalRewriteMappingData);
+ waldata = palloc(len);
+ waldata_start = waldata;
+
+ /*
+ * collect data we need to write out, but don't modify ondisk data yet
+ */
+ dlist_foreach_modify(iter, &src->mappings)
+ {
+ RewriteMappingDataEntry *pmap;
+
+ pmap = dlist_container(RewriteMappingDataEntry, node, iter.cur);
+
+ memcpy(waldata, &pmap->map, sizeof(pmap->map));
+ waldata += sizeof(pmap->map);
+
+ /* remove from the list and free */
+ dlist_delete(&pmap->node);
+ pfree(pmap);
+
+ /* update bookkeeping */
+ state->rs_num_rewrite_mappings--;
+ src->num_mappings--;
+ }
+
+ /*
+ * Note that we deviate from the usual WAL coding practices here,
+ * check the above "Logical rewrite support" comment for reasoning.
+ */
+ written = FileWrite(src->vfd, waldata_start, len);
+ if (written != len)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path,
+ written, len)));
+ src->off += len;
+
+ Assert(src->num_mappings == 0);
+
+ rdata[1].data = waldata_start;
+ rdata[1].len = len;
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].next = NULL;
+
+ /* write xlog record */
+ XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_REWRITE, rdata);
+
+ }
+ Assert(state->rs_num_rewrite_mappings == 0);
+}
+
+/*
+ * Logical remapping part of end_heap_rewrite().
+ */
+static void
+logical_end_heap_rewrite(RewriteState state)
+{
+ HASH_SEQ_STATUS seq_status;
+ RewriteMappingFile *src;
+
+ /* done, no logical rewrite in progress */
+ if (!state->rs_logical_rewrite)
+ return;
+
+ /* writeout remaining in-memory entries */
+ if (state->rs_num_rewrite_mappings > 0)
+ logical_heap_rewrite_flush_mappings(state);
+
+ /* Iterate over all mappings we have written and fsync the files. */
+ hash_seq_init(&seq_status, state->rs_logical_mappings);
+ while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
+ {
+ if (FileSync(src->vfd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", src->path)));
+ FileClose(src->vfd);
+ }
+ /* memory context cleanup will deal with the rest */
+}
+
+/*
+ * Log a single (old->new) mapping for 'xid'.
+ */
+static void
+logical_rewrite_log_mapping(RewriteState state, TransactionId xid,
+ LogicalRewriteMappingData *map)
+{
+ RewriteMappingFile *src;
+ RewriteMappingDataEntry *pmap;
+ Oid relid;
+ bool found;
+
+ relid = RelationGetRelid(state->rs_old_rel);
+
+ /* look for existing mappings for this 'mapped' xid */
+ src = hash_search(state->rs_logical_mappings, &xid,
+ HASH_ENTER, &found);
+
+ /*
+ * We haven't yet had the need to map anything for this xid, create
+ * per-xid data structures.
+ */
+ if (!found)
+ {
+ char path[MAXPGPATH];
+ Oid dboid;
+
+ if (state->rs_old_rel->rd_rel->relisshared)
+ dboid = InvalidOid;
+ else
+ dboid = MyDatabaseId;
+
+ snprintf(path, MAXPGPATH,
+ "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT,
+ dboid, relid,
+ (uint32) (state->rs_begin_lsn >> 32),
+ (uint32) state->rs_begin_lsn,
+ xid, GetCurrentTransactionId());
+
+ dlist_init(&src->mappings);
+ src->num_mappings = 0;
+ src->off = 0;
+ memcpy(src->path, path, sizeof(path));
+ src->vfd = PathNameOpenFile(path,
+ O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (src->vfd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", path)));
+ }
+
+ pmap = MemoryContextAlloc(state->rs_cxt,
+ sizeof(RewriteMappingDataEntry));
+ memcpy(&pmap->map, map, sizeof(LogicalRewriteMappingData));
+ dlist_push_tail(&src->mappings, &pmap->node);
+ src->num_mappings++;
+ state->rs_num_rewrite_mappings++;
+
+ /*
+ * Write out the buffer every time we have too many in-memory entries across all
+ * mapping files.
+ */
+ if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */)
+ logical_heap_rewrite_flush_mappings(state);
+}
+
+/*
+ * Perform logical remapping for a tuple that's mapped from old_tid to
+ * new_tuple->t_self by rewrite_heap_tuple() iff necessary for the tuple.
+ */
+static void
+logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid,
+ HeapTuple new_tuple)
+{
+ ItemPointerData new_tid = new_tuple->t_self;
+ TransactionId cutoff = state->rs_logical_xmin;
+ TransactionId xmin;
+ TransactionId xmax;
+ bool do_log_xmin = false;
+ bool do_log_xmax = false;
+ LogicalRewriteMappingData map;
+
+ /* no logical rewrite in progress, we don't need to log anything */
+ if (!state->rs_logical_rewrite)
+ return;
+
+ xmin = HeapTupleHeaderGetXmin(new_tuple->t_data);
+ /* use *GetUpdateXid to correctly deal with multixacts */
+ xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data);
+
+ /*
+ * Log the mapping iff the tuple has been created recently.
+ */
+ if (TransactionIdIsNormal(xmin) && !TransactionIdPrecedes(xmin, cutoff))
+ do_log_xmin = true;
+
+ if (!TransactionIdIsNormal(xmax))
+ {
+ /*
+ * no xmax is set, can't have any permanent ones, so this check is
+ * sufficient
+ */
+ }
+ else if (HEAP_XMAX_IS_LOCKED_ONLY(new_tuple->t_data->t_infomask))
+ {
+ /* only locked, we don't care */
+ }
+ else if (!TransactionIdPrecedes(xmax, cutoff))
+ {
+ /* tuple has been deleted recently, log */
+ do_log_xmax = true;
+ }
+
+ /* if neither needs to be logged, we're done */
+ if (!do_log_xmin && !do_log_xmax)
+ return;
+
+ /* fill out mapping information */
+ map.old_node = state->rs_old_rel->rd_node;
+ map.old_tid = old_tid;
+ map.new_node = state->rs_new_rel->rd_node;
+ map.new_tid = new_tid;
+
+ /* ---
+ * Now persist the mapping for the individual xids that are affected. We
+ * need to log for both xmin and xmax if they aren't the same transaction
+ * since the mapping files are per "affected" xid.
+ * We don't muster all that much effort detecting whether xmin and xmax
+ * are actually the same transaction, we just check whether the xid is the
+ * same disregarding subtransactions. Logging too much is relatively
+ * harmless and we could never do the check fully since subtransaction
+ * data is thrown away during restarts.
+ * ---
+ */
+ if (do_log_xmin)
+ logical_rewrite_log_mapping(state, xmin, &map);
+ /* separately log mapping for xmax unless it'd be redundant */
+ if (do_log_xmax && !TransactionIdEquals(xmin, xmax))
+ logical_rewrite_log_mapping(state, xmax, &map);
+}
+
+/*
+ * Replay XLOG_HEAP2_REWRITE records
+ */
+void
+heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r)
+{
+ char path[MAXPGPATH];
+ int fd;
+ xl_heap_rewrite_mapping *xlrec;
+ uint32 len;
+ char *data;
+
+ xlrec = (xl_heap_rewrite_mapping *) XLogRecGetData(r);
+
+ snprintf(path, MAXPGPATH,
+ "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT,
+ xlrec->mapped_db, xlrec->mapped_rel,
+ (uint32) (xlrec->start_lsn >> 32),
+ (uint32) xlrec->start_lsn,
+ xlrec->mapped_xid, r->xl_xid);
+
+ fd = OpenTransientFile(path,
+ O_CREAT | O_WRONLY | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", path)));
+ /*
+ * Truncate all data that's not guaranteed to have been safely fsynced (by
+ * previous record or by the last checkpoint).
+ */
+ if (ftruncate(fd, xlrec->offset) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\" to %u: %m",
+ path, (uint32) xlrec->offset)));
+
+ /* now seek to the position we want to write our data to */
+ if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to the end of file \"%s\": %m",
+ path)));
+
+ data = XLogRecGetData(r) + sizeof(*xlrec);
+
+ len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData);
+
+ /* write out tail end of mapping file (again) */
+ if (write(fd, data, len) != len)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", path)));
+ /*
+ * Now fsync all previously written data. We could improve things and only
+ * do this for the last write to a file, but the required bookkeeping
+ * doesn't seem worth the trouble.
+ */
+ if (pg_fsync(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", path)));
+
+ CloseTransientFile(fd);
+}
+
+/* ---
+ * Perform a checkpoint for logical rewrite mappings
+ *
+ * This serves two tasks:
+ * 1) Remove all mappings not needed anymore based on the logical restart LSN
+ * 2) Flush all remaining mappings to disk, so that replay after a checkpoint
+ * only has to deal with the parts of a mapping that have been written out
+ * after the checkpoint started.
+ * ---
+ */
+void
+CheckPointLogicalRewriteHeap(void)
+{
+ XLogRecPtr cutoff;
+ XLogRecPtr redo;
+ DIR *mappings_dir;
+ struct dirent *mapping_de;
+ char path[MAXPGPATH];
+
+ /*
+ * We start out with a minimum of the last redo pointer. No new decoding
+ * slot will start before that, so that's a safe upper bound for removal.
+ */
+ redo = GetRedoRecPtr();
+
+ /* now check for the restart ptrs from existing slots */
+ cutoff = ReplicationSlotsComputeLogicalRestartLSN();
+
+ /* don't start earlier than the restart lsn */
+ if (cutoff != InvalidXLogRecPtr && redo < cutoff)
+ cutoff = redo;
+
+ mappings_dir = AllocateDir("pg_llog/mappings");
+ while ((mapping_de = ReadDir(mappings_dir, "pg_llog/mappings")) != NULL)
+ {
+ struct stat statbuf;
+ Oid dboid;
+ Oid relid;
+ XLogRecPtr lsn;
+ TransactionId rewrite_xid;
+ TransactionId create_xid;
+ uint32 hi, lo;
+
+ if (strcmp(mapping_de->d_name, ".") == 0 ||
+ strcmp(mapping_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(path, MAXPGPATH, "pg_llog/mappings/%s", mapping_de->d_name);
+ if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+ continue;
+
+ /* Skip over files that cannot be ours. */
+ if (strncmp(mapping_de->d_name, "map-", 4) != 0)
+ continue;
+
+ if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
+ &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6)
+ elog(ERROR,"could not parse filename \"%s\"", mapping_de->d_name);
+
+ lsn = ((uint64) hi) << 32 | lo;
+
+ if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
+ {
+ elog(DEBUG1, "removing logical rewrite file \"%s\"", path);
+ if (unlink(path) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m", path)));
+ }
+ else
+ {
+ int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+
+ /*
+ * The file cannot vanish due to concurrency since this function
+ * is the only one removing logical mappings and it's run while
+ * CheckpointLock is held exclusively.
+ */
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ /*
+ * We could try to avoid fsyncing files that either haven't
+ * changed or have only been created since the checkpoint's start,
+ * but it's currently not deemed worth the effort.
+ */
+ else if (pg_fsync(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", path)));
+ CloseTransientFile(fd);
+ }
+ }
+ FreeDir(mappings_dir);
+}
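The functions above only ever append fixed-size LogicalRewriteMappingData entries to a mapping file, so the decoding-side consumer (which lives in the reorderbuffer code, not in this file) can treat each file as a flat array of mappings, with the responsible xid taken from the filename. A simplified, hedged sketch of such a reader; the function name is invented and error handling is trimmed to the essentials:

    /* Illustrative only: dump the (old tid -> new tid) pairs in one mapping file. */
    static void
    show_logical_mapping_file(char *path)
    {
        LogicalRewriteMappingData map;
        int         fd;
        ssize_t     nread;

        fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
        if (fd < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\": %m", path)));

        /* the file is a plain sequence of fixed-size mapping structs */
        while ((nread = read(fd, &map, sizeof(map))) == (ssize_t) sizeof(map))
            elog(DEBUG1, "tuple (%u,%u) of relfilenode %u rewritten to (%u,%u)",
                 ItemPointerGetBlockNumber(&map.old_tid),
                 ItemPointerGetOffsetNumber(&map.old_tid),
                 map.old_node.relNode,
                 ItemPointerGetBlockNumber(&map.new_tid),
                 ItemPointerGetOffsetNumber(&map.new_tid));

        if (nread != 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read file \"%s\": %m", path)));

        CloseTransientFile(fd);
    }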
diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c
index 97c9f238a7b..9a821d3e1cf 100644
--- a/src/backend/access/heap/tuptoaster.c
+++ b/src/backend/access/heap/tuptoaster.c
@@ -44,32 +44,6 @@
#undef TOAST_DEBUG
-/*
- * Testing whether an externally-stored value is compressed now requires
- * comparing extsize (the actual length of the external data) to rawsize
- * (the original uncompressed datum's size). The latter includes VARHDRSZ
- * overhead, the former doesn't. We never use compression unless it actually
- * saves space, so we expect either equality or less-than.
- */
-#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \
- ((toast_pointer).va_extsize < (toast_pointer).va_rawsize - VARHDRSZ)
-
-/*
- * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum
- * into a local "struct varatt_external" toast pointer. This should be
- * just a memcpy, but some versions of gcc seem to produce broken code
- * that assumes the datum contents are aligned. Introducing an explicit
- * intermediate "varattrib_1b_e *" variable seems to fix it.
- */
-#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \
-do { \
- varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \
- Assert(VARATT_IS_EXTERNAL(attre)); \
- Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \
- memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \
-} while (0)
-
-
static void toast_delete_datum(Relation rel, Datum value);
static Datum toast_save_datum(Relation rel, Datum value,
struct varlena * oldexternal, int options);
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 1aba2f04cc4..a4b5f3d698e 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -67,7 +67,10 @@
#include "access/relscan.h"
#include "access/transam.h"
+#include "access/xlog.h"
+
#include "catalog/index.h"
+#include "catalog/catalog.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
@@ -520,8 +523,7 @@ index_fetch_heap(IndexScanDesc scan)
* Prune page, but only if we weren't already on this page
*/
if (prev_buf != scan->xs_cbuf)
- heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
- RecentGlobalXmin);
+ heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf);
}
/* Obtain share-lock on the buffer so we can examine visibility */
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index 89ba09a206f..c8a61669dd2 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -149,6 +149,10 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
xlrec->node.relNode, xlrec->block,
xlrec->cutoff_xid, xlrec->ntuples);
}
+ else if (info == XLOG_HEAP2_REWRITE)
+ {
+ appendStringInfoString(buf, "heap rewrite:");
+ }
else if (info == XLOG_HEAP2_CLEANUP_INFO)
{
xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 0487be17df7..b20d9732e78 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1074,8 +1074,16 @@ RecordTransactionCommit(void)
/*
* Do we need the long commit record? If not, use the compact format.
+ *
+ * For now always use the non-compact version if wal_level=logical, so
+ * we can hide commits from other databases. TODO: In the future we
+ * should merge compact and non-compact commits and use a flags
+ * variable to determine if it contains subxacts, relations or
+ * invalidation messages, that's more extensible and degrades more
+ * gracefully. Till then, it's just 20 bytes of overhead.
*/
- if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit)
+ if (nrels > 0 || nmsgs > 0 || RelcacheInitFileInval || forceSyncCommit ||
+ XLogLogicalInfoActive())
{
XLogRecData rdata[4];
int lastrdata = 0;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ad46eb0cebf..53a20b1e606 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -23,6 +23,7 @@
#include "access/clog.h"
#include "access/multixact.h"
+#include "access/rewriteheap.h"
#include "access/subtrans.h"
#include "access/timeline.h"
#include "access/transam.h"
@@ -39,7 +40,9 @@
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
+#include "replication/logical.h"
#include "replication/slot.h"
+#include "replication/snapbuild.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/barrier.h"
@@ -4016,6 +4019,27 @@ CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
}
/*
+ * Return the last WAL segment removed, or 0 if no segment has been removed
+ * since startup.
+ *
+ * NB: the result can be out of date arbitrarily fast, the caller has to deal
+ * with that.
+ */
+XLogSegNo
+XLogGetLastRemovedSegno(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+ XLogSegNo lastRemovedSegNo;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ lastRemovedSegNo = xlogctl->lastRemovedSegNo;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ return lastRemovedSegNo;
+}
+
+/*
* Update the last removed segno pointer in shared memory, to reflect
* that the given XLOG file has been removed.
*/
@@ -6559,6 +6583,12 @@ StartupXLOG(void)
StartupReplicationSlots(checkPoint.redo);
/*
+ * Startup logical state; needs to be set up now so we have proper data
+ * during crash recovery.
+ */
+ StartupReorderBuffer();
+
+ /*
* Startup MultiXact. We need to do this early for two reasons: one
* is that we might try to access multixacts when we do tuple freezing,
* and the other is we need its state initialized because we attempt
@@ -8589,7 +8619,7 @@ CreateCheckPoint(int flags)
* StartupSUBTRANS hasn't been called yet.
*/
if (!RecoveryInProgress())
- TruncateSUBTRANS(GetOldestXmin(true, false));
+ TruncateSUBTRANS(GetOldestXmin(NULL, false));
/* Real work is done, but log and update stats before releasing lock. */
LogCheckpointEnd(false);
@@ -8674,6 +8704,8 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
CheckPointPredicate();
CheckPointRelationMap();
CheckPointReplicationSlots();
+ CheckPointSnapBuild();
+ CheckPointLogicalRewriteHeap();
CheckPointBuffers(flags); /* performs all required fsyncs */
/* We deliberately delay 2PC checkpointing as long as possible */
CheckPointTwoPhase(checkPointRedo);
@@ -8965,7 +8997,7 @@ CreateRestartPoint(int flags)
* this because StartupSUBTRANS hasn't been called yet.
*/
if (EnableHotStandby)
- TruncateSUBTRANS(GetOldestXmin(true, false));
+ TruncateSUBTRANS(GetOldestXmin(NULL, false));
/* Real work is done, but log and update before releasing lock. */
LogCheckpointEnd(true);
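The new XLogGetLastRemovedSegno() gives callers such as the replication slot code a cheap way to notice that WAL they still depend on may already have been recycled; as its comment warns, the value can advance at any moment, so it is only good for a conservative check. A hedged caller sketch, assuming xlog_internal.h's XLByteToSeg() macro; the helper name is illustrative:

    /*
     * Illustrative only: true if the segment containing restart_lsn may have
     * been removed already.  The answer can become stale immediately, so the
     * caller must still cope with failures when actually opening the WAL.
     */
    static bool
    wal_possibly_removed(XLogRecPtr restart_lsn)
    {
        XLogSegNo   needed_segno;

        XLByteToSeg(restart_lsn, needed_segno);

        return XLogGetLastRemovedSegno() >= needed_segno;
    }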
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index cebca95ac8d..877d7678f7a 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -2156,7 +2156,7 @@ IndexBuildHeapScan(Relation heapRelation,
{
snapshot = SnapshotAny;
/* okay to ignore lazy VACUUMs here */
- OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared, true);
+ OldestXmin = GetOldestXmin(heapRelation, true);
}
scan = heap_beginscan_strat(heapRelation, /* relation */
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 04dfbb0ee54..0500a73e1ba 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -619,11 +619,13 @@ CREATE VIEW pg_stat_replication AS
CREATE VIEW pg_replication_slots AS
SELECT
L.slot_name,
+ L.plugin,
L.slot_type,
L.datoid,
D.datname AS database,
L.active,
L.xmin,
+ L.catalog_xmin,
L.restart_lsn
FROM pg_get_replication_slots() AS L
LEFT JOIN pg_database D ON (L.datoid = D.oid);
@@ -822,3 +824,35 @@ CREATE OR REPLACE FUNCTION
CREATE OR REPLACE FUNCTION
json_populate_recordset(base anyelement, from_json json, use_json_as_text boolean DEFAULT false)
RETURNS SETOF anyelement LANGUAGE internal STABLE ROWS 100 AS 'json_populate_recordset';
+
+CREATE OR REPLACE FUNCTION pg_logical_slot_get_changes(
+ IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+ OUT location pg_lsn, OUT xid xid, OUT data text)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_get_changes';
+
+CREATE OR REPLACE FUNCTION pg_logical_slot_peek_changes(
+ IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+ OUT location pg_lsn, OUT xid xid, OUT data text)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_peek_changes';
+
+CREATE OR REPLACE FUNCTION pg_logical_slot_get_binary_changes(
+ IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+ OUT location pg_lsn, OUT xid xid, OUT data bytea)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_get_binary_changes';
+
+CREATE OR REPLACE FUNCTION pg_logical_slot_peek_binary_changes(
+ IN slotname name, IN upto_lsn pg_lsn, IN upto_nchanges int, VARIADIC options text[] DEFAULT '{}',
+ OUT location pg_lsn, OUT xid xid, OUT data bytea)
+RETURNS SETOF RECORD
+LANGUAGE INTERNAL
+VOLATILE ROWS 1000 COST 1000
+AS 'pg_logical_slot_peek_binary_changes';
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index e7fcb558684..a04adeaac75 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -22,6 +22,7 @@
#include "access/tuptoaster.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
+#include "catalog/catalog.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/pg_collation.h"
@@ -1081,7 +1082,7 @@ acquire_sample_rows(Relation onerel, int elevel,
totalblocks = RelationGetNumberOfBlocks(onerel);
/* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
- OldestXmin = GetOldestXmin(onerel->rd_rel->relisshared, true);
+ OldestXmin = GetOldestXmin(onerel, true);
/* Prepare for sampling block numbers */
BlockSampler_Init(&bs, totalblocks, targrows);
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 8b18e4acb72..b6b40e724e7 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -850,7 +850,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
* Since we're going to rewrite the whole table anyway, there's no reason
* not to be aggressive about this.
*/
- vacuum_set_xid_limits(0, 0, 0, 0, OldHeap->rd_rel->relisshared,
+ vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
&OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
NULL);
@@ -869,7 +869,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
is_system_catalog = IsSystemRelation(OldHeap);
/* Initialize the rewrite operation */
- rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid,
+ rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
MultiXactCutoff, use_wal);
/*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 5d540aa3a01..4996a2e7cd2 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -45,6 +45,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+#include "replication/slot.h"
#include "storage/copydir.h"
#include "storage/fd.h"
#include "storage/lmgr.h"
@@ -750,6 +751,7 @@ dropdb(const char *dbname, bool missing_ok)
HeapTuple tup;
int notherbackends;
int npreparedxacts;
+ int nslots, nslots_active;
/*
* Look up the target database's OID, and get exclusive lock on it. We
@@ -807,6 +809,19 @@ dropdb(const char *dbname, bool missing_ok)
errmsg("cannot drop the currently open database")));
/*
+ * Check whether there are, possibly unconnected, logical slots that refer
+ * to the to-be-dropped database. The database lock we are holding
+ * prevents the creation of new slots using the database.
+ */
+ if (ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active))
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_IN_USE),
+ errmsg("database \"%s\" is used by a logical decoding slot",
+ dbname),
+ errdetail("There are %d slot(s), %d of them active",
+ nslots, nslots_active)));
+
+ /*
* Check for other backends in the target database. (Because we hold the
* database lock, no new ones can start after this.)
*
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 5ae7763534b..ded1841dc65 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -398,11 +398,11 @@ get_rel_oids(Oid relid, const RangeVar *vacrel)
* not interested.
*/
void
-vacuum_set_xid_limits(int freeze_min_age,
+vacuum_set_xid_limits(Relation rel,
+ int freeze_min_age,
int freeze_table_age,
int multixact_freeze_min_age,
int multixact_freeze_table_age,
- bool sharedRel,
TransactionId *oldestXmin,
TransactionId *freezeLimit,
TransactionId *xidFullScanLimit,
@@ -425,7 +425,7 @@ vacuum_set_xid_limits(int freeze_min_age,
* working on a particular table at any time, and that each vacuum is
* always an independent transaction.
*/
- *oldestXmin = GetOldestXmin(sharedRel, true);
+ *oldestXmin = GetOldestXmin(rel, true);
Assert(TransactionIdIsNormal(*oldestXmin));
@@ -795,7 +795,7 @@ vac_update_datfrozenxid(void)
* committed pg_class entries for new tables; see AddNewRelationTuple().
* So we cannot produce a wrong minimum by starting with this.
*/
- newFrozenXid = GetOldestXmin(true, true);
+ newFrozenXid = GetOldestXmin(NULL, true);
/*
* Similarly, initialize the MultiXact "min" with the value that would be
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index d77892ee7f8..d5db917d97f 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -44,6 +44,7 @@
#include "access/multixact.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
+#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
@@ -204,10 +205,10 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
vac_strategy = bstrategy;
- vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
+ vacuum_set_xid_limits(onerel,
+ vacstmt->freeze_min_age, vacstmt->freeze_table_age,
vacstmt->multixact_freeze_min_age,
vacstmt->multixact_freeze_table_age,
- onerel->rd_rel->relisshared,
&OldestXmin, &FreezeLimit, &xidFullScanLimit,
&MultiXactCutoff, &mxactFullScanLimit);
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 1a8d4e51430..7d8a3f2c248 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -336,8 +336,7 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
/*
* Prune and repair fragmentation for the whole page, if possible.
*/
- Assert(TransactionIdIsValid(RecentGlobalXmin));
- heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+ heap_page_prune_opt(scan->rs_rd, buffer);
/*
* We must hold share lock on the buffer content while examining tuple
diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile
index 7941cb8d5e7..6f17b08a6a5 100644
--- a/src/backend/replication/Makefile
+++ b/src/backend/replication/Makefile
@@ -17,6 +17,8 @@ override CPPFLAGS := -I$(srcdir) $(CPPFLAGS)
OBJS = walsender.o walreceiverfuncs.o walreceiver.o basebackup.o \
repl_gram.o slot.o slotfuncs.o syncrep.o
+SUBDIRS = logical
+
include $(top_srcdir)/src/backend/common.mk
# repl_scanner is compiled as part of repl_gram
diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile
new file mode 100644
index 00000000000..310a45c5c05
--- /dev/null
+++ b/src/backend/replication/logical/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for src/backend/replication/logical
+#
+# IDENTIFICATION
+# src/backend/replication/logical/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/replication/logical
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -I$(srcdir) $(CPPFLAGS)
+
+OBJS = decode.o logical.o logicalfuncs.o reorderbuffer.o snapbuild.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
new file mode 100644
index 00000000000..e8949aab324
--- /dev/null
+++ b/src/backend/replication/logical/decode.c
@@ -0,0 +1,826 @@
+/* -------------------------------------------------------------------------
+ *
+ * decode.c
+ * This module decodes WAL records read using xlogreader.h's APIs for the
+ * purpose of logical decoding by passing information to the
+ * reorderbuffer module (containing the actual changes) and to the
+ * snapbuild module to build a fitting catalog snapshot (to be able to
+ * properly decode the changes in the reorderbuffer).
+ *
+ * NOTE:
+ * This basically tries to handle all low level xlog stuff for
+ * reorderbuffer.c and snapbuild.c. There's some minor leakage where a
+ * specific record's struct is used to pass data along, but those just
+ * happen to contain the right amount of data in a convenient
+ * format. There isn't and shouldn't be much intelligence about the
+ * contents of records in here except turning them into a more usable
+ * format.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/decode.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "access/xlogreader.h"
+
+#include "catalog/pg_control.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "storage/standby.h"
+
+typedef struct XLogRecordBuffer
+{
+ XLogRecPtr origptr;
+ XLogRecPtr endptr;
+ XLogRecord record;
+ char *record_data;
+} XLogRecordBuffer;
+
+/* RMGR Handlers */
+static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+
+/* individual record(group)'s handlers */
+static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
+ TransactionId xid, Oid dboid,
+ TimestampTz commit_time,
+ int nsubxacts, TransactionId *sub_xids,
+ int ninval_msgs, SharedInvalidationMessage *msg);
+static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn,
+ TransactionId xid, TransactionId *sub_xids, int nsubxacts);
+
+/* common function to decode tuples */
+static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tup);
+
+/*
+ * Take every XLogReadRecord()ed record and perform the actions required to
+ * decode it using the output plugin already set up in the logical decoding
+ * context.
+ */
+void
+LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogRecord *record)
+{
+ XLogRecordBuffer buf;
+
+ buf.origptr = ctx->reader->ReadRecPtr;
+ buf.endptr = ctx->reader->EndRecPtr;
+ buf.record = *record;
+ buf.record_data = XLogRecGetData(record);
+
+ /* cast so we get a warning when new rmgrs are added */
+ switch ((RmgrIds) buf.record.xl_rmid)
+ {
+ /*
+ * Rmgrs we care about for logical decoding. Add new rmgrs in
+ * rmgrlist.h's order.
+ */
+ case RM_XLOG_ID:
+ DecodeXLogOp(ctx, &buf);
+ break;
+
+ case RM_XACT_ID:
+ DecodeXactOp(ctx, &buf);
+ break;
+
+ case RM_STANDBY_ID:
+ DecodeStandbyOp(ctx, &buf);
+ break;
+
+ case RM_HEAP2_ID:
+ DecodeHeap2Op(ctx, &buf);
+ break;
+
+ case RM_HEAP_ID:
+ DecodeHeapOp(ctx, &buf);
+ break;
+
+ /*
+ * Rmgrs irrelevant for logical decoding; they describe stuff not
+ * represented in logical decoding. Add new rmgrs in rmgrlist.h's
+ * order.
+ */
+ case RM_SMGR_ID:
+ case RM_CLOG_ID:
+ case RM_DBASE_ID:
+ case RM_TBLSPC_ID:
+ case RM_MULTIXACT_ID:
+ case RM_RELMAP_ID:
+ case RM_BTREE_ID:
+ case RM_HASH_ID:
+ case RM_GIN_ID:
+ case RM_GIST_ID:
+ case RM_SEQ_ID:
+ case RM_SPGIST_ID:
+ break;
+ case RM_NEXT_ID:
+ elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) buf.record.xl_rmid);
+ }
+}
+
+/*
+ * Handle rmgr XLOG_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ SnapBuild *builder = ctx->snapshot_builder;
+ uint8 info = buf->record.xl_info & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ /* this is also used in END_OF_RECOVERY checkpoints */
+ case XLOG_CHECKPOINT_SHUTDOWN:
+ case XLOG_END_OF_RECOVERY:
+ SnapBuildSerializationPoint(builder, buf->origptr);
+
+ break;
+ case XLOG_CHECKPOINT_ONLINE:
+ /*
+ * a RUNNING_XACTS record will have been logged near this, so we
+ * can restart from there.
+ */
+ break;
+ case XLOG_NOOP:
+ case XLOG_NEXTOID:
+ case XLOG_SWITCH:
+ case XLOG_BACKUP_END:
+ case XLOG_PARAMETER_CHANGE:
+ case XLOG_RESTORE_POINT:
+ case XLOG_FPW_CHANGE:
+ case XLOG_FPI:
+ break;
+ default:
+ elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
+ }
+}
+
+/*
+ * Handle rmgr XACT_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ SnapBuild *builder = ctx->snapshot_builder;
+ ReorderBuffer *reorder = ctx->reorder;
+ XLogRecord *r = &buf->record;
+ uint8 info = r->xl_info & ~XLR_INFO_MASK;
+
+ /* no point in doing anything yet, data could not be decoded anyway */
+ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+ return;
+
+ switch (info)
+ {
+ case XLOG_XACT_COMMIT:
+ {
+ xl_xact_commit *xlrec;
+ TransactionId *subxacts = NULL;
+ SharedInvalidationMessage *invals = NULL;
+
+ xlrec = (xl_xact_commit *) buf->record_data;
+
+ subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+ invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]);
+
+ DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId,
+ xlrec->xact_time,
+ xlrec->nsubxacts, subxacts,
+ xlrec->nmsgs, invals);
+
+ break;
+ }
+ case XLOG_XACT_COMMIT_PREPARED:
+ {
+ xl_xact_commit_prepared *prec;
+ xl_xact_commit *xlrec;
+ TransactionId *subxacts;
+ SharedInvalidationMessage *invals = NULL;
+
+ /* Prepared commits contain a normal commit record... */
+ prec = (xl_xact_commit_prepared *) buf->record_data;
+ xlrec = &prec->crec;
+
+ subxacts = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+ invals = (SharedInvalidationMessage *) &(subxacts[xlrec->nsubxacts]);
+
+ DecodeCommit(ctx, buf, r->xl_xid, xlrec->dbId,
+ xlrec->xact_time,
+ xlrec->nsubxacts, subxacts,
+ xlrec->nmsgs, invals);
+
+ break;
+ }
+ case XLOG_XACT_COMMIT_COMPACT:
+ {
+ xl_xact_commit_compact *xlrec;
+
+ xlrec = (xl_xact_commit_compact *) buf->record_data;
+
+ DecodeCommit(ctx, buf, r->xl_xid, InvalidOid,
+ xlrec->xact_time,
+ xlrec->nsubxacts, xlrec->subxacts,
+ 0, NULL);
+ break;
+ }
+ case XLOG_XACT_ABORT:
+ {
+ xl_xact_abort *xlrec;
+ TransactionId *sub_xids;
+
+ xlrec = (xl_xact_abort *) buf->record_data;
+
+ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+ DecodeAbort(ctx, buf->origptr, r->xl_xid,
+ sub_xids, xlrec->nsubxacts);
+ break;
+ }
+ case XLOG_XACT_ABORT_PREPARED:
+ {
+ xl_xact_abort_prepared *prec;
+ xl_xact_abort *xlrec;
+ TransactionId *sub_xids;
+
+ /* Prepared aborts contain a normal abort record... */
+ prec = (xl_xact_abort_prepared *) buf->record_data;
+ xlrec = &prec->arec;
+
+ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+ /* r->xl_xid is committed in a separate record */
+ DecodeAbort(ctx, buf->origptr, prec->xid,
+ sub_xids, xlrec->nsubxacts);
+ break;
+ }
+
+ case XLOG_XACT_ASSIGNMENT:
+ {
+ xl_xact_assignment *xlrec;
+ int i;
+ TransactionId *sub_xid;
+
+ xlrec = (xl_xact_assignment *) buf->record_data;
+
+ sub_xid = &xlrec->xsub[0];
+
+ for (i = 0; i < xlrec->nsubxacts; i++)
+ {
+ ReorderBufferAssignChild(reorder, xlrec->xtop,
+ *(sub_xid++), buf->origptr);
+ }
+ break;
+ }
+ case XLOG_XACT_PREPARE:
+ /*
+ * Currently decoding ignores PREPARE TRANSACTION and will just
+ * decode the transaction when the COMMIT PREPARED is sent or
+ * throw away the transaction's contents when a ROLLBACK PREPARED
+ * is received. In the future we could add code to expose prepared
+ * transactions in the changestream allowing for a kind of
+ * distributed 2PC.
+ */
+ break;
+ default:
+ elog(ERROR, "unexpected RM_XACT_ID record type: %u", info);
+ }
+}
+
+/*
+ * Handle rmgr STANDBY_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ SnapBuild *builder = ctx->snapshot_builder;
+ XLogRecord *r = &buf->record;
+ uint8 info = r->xl_info & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ case XLOG_RUNNING_XACTS:
+ {
+ xl_running_xacts *running = (xl_running_xacts *) buf->record_data;
+ SnapBuildProcessRunningXacts(builder, buf->origptr, running);
+ /*
+ * Abort all transactions that we keep track of, that are
+ * older than the record's oldestRunningXid. This is the most
+ * convenient spot for doing so since, in contrast to shutdown
+ * or end-of-recovery checkpoints, we have information about
+ * all running transactions which includes prepared ones,
+ * while shutdown checkpoints just know that no non-prepared
+ * transactions are in progress.
+ */
+ ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid);
+ }
+ break;
+ case XLOG_STANDBY_LOCK:
+ break;
+ default:
+ elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info);
+ }
+}
+
+/*
+ * Handle rmgr HEAP2_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK;
+ TransactionId xid = buf->record.xl_xid;
+ SnapBuild *builder = ctx->snapshot_builder;
+
+ /* no point in doing anything yet */
+ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+ return;
+
+ switch (info)
+ {
+ case XLOG_HEAP2_MULTI_INSERT:
+ if (SnapBuildProcessChange(builder, xid, buf->origptr))
+ DecodeMultiInsert(ctx, buf);
+ break;
+ case XLOG_HEAP2_NEW_CID:
+ {
+ xl_heap_new_cid *xlrec;
+ xlrec = (xl_heap_new_cid *) buf->record_data;
+ SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec);
+
+ break;
+ }
+ case XLOG_HEAP2_REWRITE:
+ /*
+ * Although these records only exist to serve the needs of logical
+ * decoding, all the work happens as part of crash or archive
+ * recovery, so we don't need to do anything here.
+ */
+ break;
+ /*
+ * Everything else here is just low level physical stuff we're
+ * not interested in.
+ */
+ case XLOG_HEAP2_FREEZE_PAGE:
+ case XLOG_HEAP2_CLEAN:
+ case XLOG_HEAP2_CLEANUP_INFO:
+ case XLOG_HEAP2_VISIBLE:
+ case XLOG_HEAP2_LOCK_UPDATED:
+ break;
+ default:
+ elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info);
+ }
+}
+
+/*
+ * Handle rmgr HEAP_ID records for DecodeRecordIntoReorderBuffer().
+ */
+static void
+DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ uint8 info = buf->record.xl_info & XLOG_HEAP_OPMASK;
+ TransactionId xid = buf->record.xl_xid;
+ SnapBuild *builder = ctx->snapshot_builder;
+
+ /* no point in doing anything yet */
+ if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT)
+ return;
+
+ switch (info)
+ {
+ case XLOG_HEAP_INSERT:
+ if (SnapBuildProcessChange(builder, xid, buf->origptr))
+ DecodeInsert(ctx, buf);
+ break;
+
+ /*
+ * Treat HOT updates as normal updates. There is no useful
+ * information in the fact that we could make it a HOT update
+ * locally and the WAL layout is compatible.
+ */
+ case XLOG_HEAP_HOT_UPDATE:
+ case XLOG_HEAP_UPDATE:
+ if (SnapBuildProcessChange(builder, xid, buf->origptr))
+ DecodeUpdate(ctx, buf);
+ break;
+
+ case XLOG_HEAP_DELETE:
+ if (SnapBuildProcessChange(builder, xid, buf->origptr))
+ DecodeDelete(ctx, buf);
+ break;
+
+ case XLOG_HEAP_NEWPAGE:
+ /*
+ * This is only used in places like indexams and CLUSTER which
+ * don't contain changes relevant for logical replication.
+ */
+ break;
+
+ case XLOG_HEAP_INPLACE:
+ /*
+ * Inplace updates are only ever performed on catalog tuples and
+ * can, per definition, not change tuple visibility. Since we
+ * don't decode catalog tuples, we're not interested in the
+ * record's contents.
+ *
+ * In-place updates can be used either by XID-bearing transactions
+ * (e.g. in CREATE INDEX CONCURRENTLY) or by XID-less
+ * transactions (e.g. VACUUM). In the former case, the commit
+ * record will include cache invalidations, so we mark the
+ * transaction as catalog modifying here. Currently that's
+ * redundant because the commit will do that as well, but once we
+ * support decoding in-progress relations, this will be important.
+ */
+ if (!TransactionIdIsValid(xid))
+ break;
+
+ SnapBuildProcessChange(builder, xid, buf->origptr);
+ ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr);
+ break;
+
+ case XLOG_HEAP_LOCK:
+ /* we don't care about row level locks for now */
+ break;
+
+ default:
+ elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info);
+ break;
+ }
+}
+
+/*
+ * Consolidated commit record handling for the different forms of commit
+ * records.
+ */
+static void
+DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
+ TransactionId xid, Oid dboid,
+ TimestampTz commit_time,
+ int nsubxacts, TransactionId *sub_xids,
+ int ninval_msgs, SharedInvalidationMessage *msgs)
+{
+ int i;
+
+ /*
+ * Process invalidation messages, even if we're not interested in the
+ * transaction's contents, since the various caches need to always be
+ * consistent.
+ */
+ if (ninval_msgs > 0)
+ {
+ ReorderBufferAddInvalidations(ctx->reorder, xid, buf->origptr,
+ ninval_msgs, msgs);
+ ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr);
+ }
+
+ SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid,
+ nsubxacts, sub_xids);
+
+ /* ----
+ * Check whether we are interested in this specific transaction, and tell
+ * the reorderbuffer to forget the content of the (sub-)transactions
+ * if not.
+ *
+ * There are basically two reasons we might not be interested in this
+ * transaction:
+ * 1) We might not be interested in decoding transactions up to this
+ *    LSN. This can happen because we previously decoded it and are now
+ *    just restarting, or because we haven't assembled a consistent
+ *    snapshot yet.
+ * 2) The transaction happened in another database.
+ *
+ * We can't just use ReorderBufferAbort() here, because we need to execute
+ * the transaction's invalidations. This currently won't be needed if
+ * we're just skipping over the transaction because currently we only do
+ * so during startup, to get to the first transaction the client needs. As
+ * we have reset the catalog caches before starting to read WAL, and we
+ * haven't yet touched any catalogs, there can't be anything to invalidate.
+ * But if we're "forgetting" this commit because it happened in another
+ * database, the invalidations might be important, because they could be
+ * for shared catalogs and we might have loaded data into the relevant
+ * syscaches.
+ * ----
+ */
+ if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr) ||
+ (dboid != InvalidOid && dboid != ctx->slot->data.database))
+ {
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferForget(ctx->reorder, *sub_xids, buf->origptr);
+ sub_xids++;
+ }
+ ReorderBufferForget(ctx->reorder, xid, buf->origptr);
+
+ return;
+ }
+
+ /* tell the reorderbuffer about the surviving subtransactions */
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferCommitChild(ctx->reorder, xid, *sub_xids,
+ buf->origptr, buf->endptr);
+ sub_xids++;
+ }
+
+ /* replay actions of all transaction + subtransactions in order */
+ ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr,
+ commit_time);
+}
+
+/*
+ * Get the data from the various forms of abort records and pass it on to
+ * snapbuild.c and reorderbuffer.c
+ */
+static void
+DecodeAbort(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ TransactionId *sub_xids, int nsubxacts)
+{
+ int i;
+
+ SnapBuildAbortTxn(ctx->snapshot_builder, lsn, xid, nsubxacts, sub_xids);
+
+ for (i = 0; i < nsubxacts; i++)
+ {
+ ReorderBufferAbort(ctx->reorder, *sub_xids, lsn);
+ sub_xids++;
+ }
+
+ ReorderBufferAbort(ctx->reorder, xid, lsn);
+}
+
+/*
+ * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
+ *
+ * Inserts can contain the new tuple.
+ */
+static void
+DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_insert *xlrec;
+ ReorderBufferChange *change;
+
+ xlrec = (xl_heap_insert *) buf->record_data;
+
+ /* only interested in our database */
+ if (xlrec->target.node.dbNode != ctx->slot->data.database)
+ return;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_INSERT;
+ memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+ {
+ Assert(r->xl_len > (SizeOfHeapInsert + SizeOfHeapHeader));
+
+ change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ DecodeXLogTuple((char *) xlrec + SizeOfHeapInsert,
+ r->xl_len - SizeOfHeapInsert,
+ change->tp.newtuple);
+ }
+
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
+/*
+ * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
+ * in the record, from WAL into proper tuplebufs.
+ *
+ * Updates can possibly contain a new tuple and the old primary key.
+ */
+static void
+DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_update *xlrec;
+ xl_heap_header_len *xlhdr;
+ ReorderBufferChange *change;
+ char *data;
+
+ xlrec = (xl_heap_update *) buf->record_data;
+ xlhdr = (xl_heap_header_len *) (buf->record_data + SizeOfHeapUpdate);
+
+ /* only interested in our database */
+ if (xlrec->target.node.dbNode != ctx->slot->data.database)
+ return;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_UPDATE;
+ memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+ data = (char *) &xlhdr->header;
+
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+ {
+ Assert(r->xl_len > (SizeOfHeapUpdate + SizeOfHeapHeaderLen));
+
+ change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ DecodeXLogTuple(data,
+ xlhdr->t_len + SizeOfHeapHeader,
+ change->tp.newtuple);
+ /* skip over the rest of the tuple header */
+ data += SizeOfHeapHeader;
+ /* skip over the tuple data */
+ data += xlhdr->t_len;
+ }
+
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD)
+ {
+ xlhdr = (xl_heap_header_len *) data;
+ change->tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+ DecodeXLogTuple((char *) &xlhdr->header,
+ xlhdr->t_len + SizeOfHeapHeader,
+ change->tp.oldtuple);
+ data = (char *) &xlhdr->header;
+ data += SizeOfHeapHeader;
+ data += xlhdr->t_len;
+ }
+
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
+/*
+ * Parse XLOG_HEAP_DELETE from WAL into proper tuplebufs.
+ *
+ * Deletes can possibly contain the old primary key.
+ */
+static void
+DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_delete *xlrec;
+ ReorderBufferChange *change;
+
+ xlrec = (xl_heap_delete *) buf->record_data;
+
+ /* only interested in our database */
+ if (xlrec->target.node.dbNode != ctx->slot->data.database)
+ return;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_DELETE;
+
+ memcpy(&change->tp.relnode, &xlrec->target.node, sizeof(RelFileNode));
+
+ /* old primary key stored */
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_OLD)
+ {
+ Assert(r->xl_len > (SizeOfHeapDelete + SizeOfHeapHeader));
+
+ change->tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete,
+ r->xl_len - SizeOfHeapDelete,
+ change->tp.oldtuple);
+ }
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid, buf->origptr, change);
+}
+
+/*
+ * Decode XLOG_HEAP2_MULTI_INSERT records into multiple tuplebufs.
+ *
+ * Currently MULTI_INSERT will always contain the full tuples.
+ */
+static void
+DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+ XLogRecord *r = &buf->record;
+ xl_heap_multi_insert *xlrec;
+ int i;
+ char *data;
+ bool isinit = (r->xl_info & XLOG_HEAP_INIT_PAGE) != 0;
+
+ xlrec = (xl_heap_multi_insert *) buf->record_data;
+
+ /* only interested in our database */
+ if (xlrec->node.dbNode != ctx->slot->data.database)
+ return;
+
+ data = buf->record_data + SizeOfHeapMultiInsert;
+
+ /*
+ * OffsetNumbers (which are not of interest to us) are stored when
+ * XLOG_HEAP_INIT_PAGE is not set -- skip over them.
+ */
+ if (!isinit)
+ data += sizeof(OffsetNumber) * xlrec->ntuples;
+
+ for (i = 0; i < xlrec->ntuples; i++)
+ {
+ ReorderBufferChange *change;
+ xl_multi_insert_tuple *xlhdr;
+ int datalen;
+ ReorderBufferTupleBuf *tuple;
+
+ change = ReorderBufferGetChange(ctx->reorder);
+ change->action = REORDER_BUFFER_CHANGE_INSERT;
+ memcpy(&change->tp.relnode, &xlrec->node, sizeof(RelFileNode));
+
+ /*
+ * CONTAINS_NEW_TUPLE will always be set currently as multi_insert
+ * isn't used for catalogs, but better be future proof.
+ *
+ * We decode the tuple in pretty much the same way as DecodeXLogTuple,
+ * but since the layout is slightly different, we can't use it here.
+ */
+ if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE)
+ {
+ change->tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder);
+
+ tuple = change->tp.newtuple;
+
+ /* not a disk based tuple */
+ ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+ xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data);
+ data = ((char *) xlhdr) + SizeOfMultiInsertTuple;
+ datalen = xlhdr->datalen;
+
+ /*
+ * We can only figure this out after reassembling the
+ * transactions.
+ */
+ tuple->tuple.t_tableOid = InvalidOid;
+ tuple->tuple.t_data = &tuple->header;
+ tuple->tuple.t_len = datalen
+ + offsetof(HeapTupleHeaderData, t_bits);
+
+ memset(&tuple->header, 0, sizeof(HeapTupleHeaderData));
+
+ memcpy((char *) &tuple->header
+ + offsetof(HeapTupleHeaderData, t_bits),
+ (char *) data,
+ datalen);
+ data += datalen;
+
+ tuple->header.t_infomask = xlhdr->t_infomask;
+ tuple->header.t_infomask2 = xlhdr->t_infomask2;
+ tuple->header.t_hoff = xlhdr->t_hoff;
+ }
+
+ ReorderBufferQueueChange(ctx->reorder, r->xl_xid,
+ buf->origptr, change);
+ }
+}
+
+/*
+ * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
+ * (but not by heap_multi_insert) into a tuplebuf.
+ *
+ * The size 'len' and the pointer 'data' in the record need to be
+ * computed outside as they are record specific.
+ */
+static void
+DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
+{
+ xl_heap_header xlhdr;
+ int datalen = len - SizeOfHeapHeader;
+
+ Assert(datalen >= 0);
+ Assert(datalen <= MaxHeapTupleSize);
+
+ tuple->tuple.t_len = datalen + offsetof(HeapTupleHeaderData, t_bits);
+
+ /* not a disk based tuple */
+ ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+ /* we can only figure this out after reassembling the transactions */
+ tuple->tuple.t_tableOid = InvalidOid;
+ tuple->tuple.t_data = &tuple->header;
+
+ /* data is not stored aligned, copy to aligned storage */
+ memcpy((char *) &xlhdr,
+ data,
+ SizeOfHeapHeader);
+
+ memset(&tuple->header, 0, sizeof(HeapTupleHeaderData));
+
+ memcpy((char *) &tuple->header + offsetof(HeapTupleHeaderData, t_bits),
+ data + SizeOfHeapHeader,
+ datalen);
+
+ tuple->header.t_infomask = xlhdr.t_infomask;
+ tuple->header.t_infomask2 = xlhdr.t_infomask2;
+ tuple->header.t_hoff = xlhdr.t_hoff;
+}
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
new file mode 100644
index 00000000000..4fb0974f297
--- /dev/null
+++ b/src/backend/replication/logical/logical.c
@@ -0,0 +1,920 @@
+/*-------------------------------------------------------------------------
+ * logical.c
+ * PostgreSQL logical decoding coordination
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/logical.c
+ *
+ * NOTES
+ * This file coordinates interaction between the various modules that
+ * together provide logical decoding, primarily by providing so
+ * called LogicalDecodingContexts. The goal is to encapsulate most of the
+ * internal complexity for consumers of logical decoding, so they can
+ * create and consume a changestream with a low amount of code.
+ *
+ * The idea is that a consumer provides three callbacks, one to read WAL,
+ * one to prepare a data write, and a final one for actually writing, since
+ * their implementation depends on the type of consumer. Check
+ * logicalfuncs.c for an example implementation of a fairly simple consumer
+ * and an implementation of a WAL reading callback that's suitable for
+ * simpler consumers.
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+
+#include "access/xact.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "storage/proc.h"
+#include "storage/procarray.h"
+
+#include "utils/memutils.h"
+
+/* data for errcontext callback */
+typedef struct LogicalErrorCallbackState
+{
+ LogicalDecodingContext *ctx;
+ const char *callback_name;
+ XLogRecPtr report_location;
+} LogicalErrorCallbackState;
+
+/* wrappers around output plugin callbacks */
+static void output_plugin_error_callback(void *arg);
+static void startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
+ bool is_init);
+static void shutdown_cb_wrapper(LogicalDecodingContext *ctx);
+static void begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn);
+static void commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn);
+static void change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change);
+
+static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin);
+
+/*
+ * Make sure the current settings & environment are capable of doing logical
+ * decoding.
+ */
+void
+CheckLogicalDecodingRequirements(void)
+{
+ CheckSlotRequirements();
+
+ if (wal_level < WAL_LEVEL_LOGICAL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("logical decoding requires wal_level >= logical")));
+
+ if (MyDatabaseId == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("logical decoding requires a database connection")));
+
+ /* ----
+ * TODO: We got to change that someday soon...
+ *
+ * There's basically three things missing to allow this:
+ * 1) We need to be able to correctly and quickly identify the timeline a
+ * LSN belongs to
+ * 2) We need to force hot_standby_feedback to be enabled at all times so
+ * the primary cannot remove rows we need.
+ * 3) support dropping replication slots referring to a database, in
+ * dbase_redo. There can't be any active ones due to HS recovery
+ * conflicts, so that should be relatively easy.
+ * ----
+ */
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("logical decoding cannot be used while in recovery")));
+}
+
+/*
+ * Helper function for CreateInitialDecodingContext() and
+ * CreateDecodingContext() performing common tasks.
+ */
+static LogicalDecodingContext *
+StartupDecodingContext(List *output_plugin_options,
+ XLogRecPtr start_lsn,
+ TransactionId xmin_horizon,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write)
+{
+ ReplicationSlot *slot;
+ MemoryContext context, old_context;
+ LogicalDecodingContext *ctx;
+
+ /* shorter lines... */
+ slot = MyReplicationSlot;
+
+ context = AllocSetContextCreate(CurrentMemoryContext,
+ "Changeset Extraction Context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ old_context = MemoryContextSwitchTo(context);
+ ctx = palloc0(sizeof(LogicalDecodingContext));
+
+ ctx->context = context;
+
+ /* (re-)load the output plugin, so we detect a bad (removed) plugin now */
+ LoadOutputPlugin(&ctx->callbacks, NameStr(slot->data.plugin));
+
+ /*
+ * Now that the slot's xmin has been set, we can announce ourselves as a
+ * logical decoding backend which doesn't need to be checked individually
+ * when computing the xmin horizon because the xmin is enforced via
+ * replication slots.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ MyPgXact->vacuumFlags |= PROC_IN_LOGICAL_DECODING;
+ LWLockRelease(ProcArrayLock);
+
+ ctx->slot = slot;
+
+ ctx->reader = XLogReaderAllocate(read_page, ctx);
+ ctx->reader->private_data = ctx;
+
+ ctx->reorder = ReorderBufferAllocate();
+ ctx->snapshot_builder =
+ AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn);
+
+ ctx->reorder->private_data = ctx;
+
+ /* wrap output plugin callbacks, so we can add error context information */
+ ctx->reorder->begin = begin_cb_wrapper;
+ ctx->reorder->apply_change = change_cb_wrapper;
+ ctx->reorder->commit = commit_cb_wrapper;
+
+ ctx->out = makeStringInfo();
+ ctx->prepare_write = prepare_write;
+ ctx->write = do_write;
+
+ ctx->output_plugin_options = output_plugin_options;
+
+ MemoryContextSwitchTo(old_context);
+
+ return ctx;
+}
+
+/*
+ * Create a new decoding context, for a new logical slot.
+ *
+ * plugin contains the name of the output plugin
+ * output_plugin_options contains options passed to the output plugin
+ * read_page, prepare_write, do_write are callbacks that have to be filled to
+ * perform the use-case dependent, actual, work.
+ *
+ * Needs to be called while in a memory context that's at least as long lived
+ * as the decoding context because further memory contexts will be created
+ * inside it.
+ *
+ * Returns an initialized decoding context after calling the output plugin's
+ * startup function.
+ */
+LogicalDecodingContext *
+CreateInitDecodingContext(char *plugin,
+ List *output_plugin_options,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write)
+{
+ TransactionId xmin_horizon = InvalidTransactionId;
+ ReplicationSlot *slot;
+ LogicalDecodingContext *ctx;
+ MemoryContext old_context;
+
+ /* shorter lines... */
+ slot = MyReplicationSlot;
+
+ /* first some sanity checks that are unlikely to be violated */
+ if (slot == NULL)
+ elog(ERROR, "cannot perform logical decoding without a acquired slot");
+
+ if (plugin == NULL)
+ elog(ERROR, "cannot initialize logical decoding without a specified plugin");
+
+ /* Make sure the passed slot is suitable. These are user facing errors. */
+ if (slot->data.database == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot use physical replication slot created for logical decoding")));
+
+ if (slot->data.database != MyDatabaseId)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("replication slot \"%s\" was not created in this database",
+ NameStr(slot->data.name))));
+
+ if (IsTransactionState() &&
+ GetTopTransactionIdIfAny() != InvalidTransactionId)
+ ereport(ERROR,
+ (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+ errmsg("cannot create logical replication slot in transaction that has performed writes")));
+
+ /* register output plugin name with slot */
+ SpinLockAcquire(&slot->mutex);
+ strncpy(NameStr(slot->data.plugin), plugin,
+ NAMEDATALEN);
+ NameStr(slot->data.plugin)[NAMEDATALEN - 1] = '\0';
+ SpinLockRelease(&slot->mutex);
+
+ /*
+ * The replication slot mechanism is used to prevent removal of required
+ * WAL. As there is no interlock between this and checkpoints required WAL
+ * could be removed before ReplicationSlotsComputeRequiredLSN() has been
+ * called to prevent that. In the very unlikely case that this happens
+ * we'll just retry.
+ */
+ while (true)
+ {
+ XLogSegNo segno;
+
+ /*
+ * Let's start with enough information if we can, so log a standby
+ * snapshot and start decoding at exactly that position.
+ */
+ if (!RecoveryInProgress())
+ {
+ XLogRecPtr flushptr;
+
+ /* start at current insert position */
+ slot->data.restart_lsn = GetXLogInsertRecPtr();
+
+ /* make sure we have enough information to start */
+ flushptr = LogStandbySnapshot();
+
+ /* and make sure it's fsynced to disk */
+ XLogFlush(flushptr);
+ }
+ else
+ slot->data.restart_lsn = GetRedoRecPtr();
+
+ /* prevent WAL removal as fast as possible */
+ ReplicationSlotsComputeRequiredLSN();
+
+ /*
+ * If all required WAL is still there, great, otherwise retry. The
+ * slot should prevent further removal of WAL, unless there's a
+ * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
+ * the new restart_lsn above, so normally we should never need to loop
+ * more than twice.
+ */
+ XLByteToSeg(slot->data.restart_lsn, segno);
+ if (XLogGetLastRemovedSegno() < segno)
+ break;
+ }
+
+
+ /* ----
+ * This is a bit tricky: We need to determine a safe xmin horizon to start
+ * decoding from, to avoid starting from a running xacts record referring
+ * to xids whose rows have been vacuumed or pruned
+ * already. GetOldestSafeDecodingTransactionId() returns such a value, but
+ * without further interlock its return value might immediately be out of
+ * date.
+ *
+ * So we have to acquire the ProcArrayLock to prevent computation of new
+ * xmin horizons by other backends, get the safe decoding xid, and inform
+ * the slot machinery about the new limit. Once that's done the
+ * ProcArrayLock can be released as the slot machinery now is
+ * protecting against vacuum.
+ * ----
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ slot->effective_catalog_xmin = GetOldestSafeDecodingTransactionId();
+ slot->data.catalog_xmin = slot->effective_catalog_xmin;
+
+ ReplicationSlotsComputeRequiredXmin(true);
+
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * tell the snapshot builder to only assemble a snapshot once it reaches
+ * a running_xacts record with the respective xmin.
+ */
+ xmin_horizon = slot->data.catalog_xmin;
+
+ ReplicationSlotMarkDirty();
+ ReplicationSlotSave();
+
+ ctx = StartupDecodingContext(NIL, InvalidXLogRecPtr, xmin_horizon,
+ read_page, prepare_write, do_write);
+
+ /* call output plugin initialization callback */
+ old_context = MemoryContextSwitchTo(ctx->context);
+ if (ctx->callbacks.startup_cb != NULL)
+ startup_cb_wrapper(ctx, &ctx->options, true);
+ MemoryContextSwitchTo(old_context);
+
+ return ctx;
+}
+
+/*
+ * Create a new decoding context, for a logical slot that has previously been
+ * used already.
+ *
+ * start_lsn contains the LSN of the last received data or InvalidXLogRecPtr
+ * output_plugin_options contains options passed to the output plugin
+ * read_page, prepare_write, do_write are callbacks that have to be filled to
+ * perform the use-case dependent, actual, work.
+ *
+ * Needs to be called while in a memory context that's at least as long lived
+ * as the decoding context because further memory contexts will be created
+ * inside it.
+ *
+ * Returns an initialized decoding context after calling the output plugin's
+ * startup function.
+ */
+LogicalDecodingContext *
+CreateDecodingContext(XLogRecPtr start_lsn,
+ List *output_plugin_options,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write)
+{
+ LogicalDecodingContext *ctx;
+ ReplicationSlot *slot;
+ MemoryContext old_context;
+
+ /* shorter lines... */
+ slot = MyReplicationSlot;
+
+ /* first some sanity checks that are unlikely to be violated */
+ if (slot == NULL)
+ elog(ERROR, "cannot perform logical decoding without a acquired slot");
+
+ /* make sure the passed slot is suitable, these are user facing errors */
+ if (slot->data.database == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ (errmsg("cannot use physical replication slot for logical decoding"))));
+
+ if (slot->data.database != MyDatabaseId)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ (errmsg("replication slot \"%s\" was not created in this database",
+ NameStr(slot->data.name)))));
+
+ if (start_lsn == InvalidXLogRecPtr)
+ {
+ /* continue from last position */
+ start_lsn = slot->data.confirmed_flush;
+ }
+ else if (start_lsn < slot->data.confirmed_flush)
+ {
+ /*
+ * It might seem like we should error out in this case, but it's
+ * pretty common for a client to acknowledge an LSN it doesn't have to
+ * do anything for, and thus didn't store persistently, because the
+ * xlog records didn't result in anything relevant for logical
+ * decoding. Clients have to be able to do that to support
+ * synchronous replication.
+ */
+ start_lsn = slot->data.confirmed_flush;
+ elog(DEBUG1, "cannot stream from %X/%X, minimum is %X/%X, forwarding",
+ (uint32)(start_lsn >> 32), (uint32)start_lsn,
+ (uint32)(slot->data.confirmed_flush >> 32),
+ (uint32)slot->data.confirmed_flush);
+ }
+
+ ctx = StartupDecodingContext(output_plugin_options,
+ start_lsn, InvalidTransactionId,
+ read_page, prepare_write, do_write);
+
+ /* call output plugin initialization callback */
+ old_context = MemoryContextSwitchTo(ctx->context);
+ if (ctx->callbacks.startup_cb != NULL)
+ startup_cb_wrapper(ctx, &ctx->options, false);
+ MemoryContextSwitchTo(old_context);
+
+ ereport(LOG,
+ (errmsg("starting logical decoding for slot %s",
+ NameStr(slot->data.name)),
+ errdetail("streaming transactions committing after %X/%X, reading WAL from %X/%X",
+ (uint32)(slot->data.confirmed_flush >> 32),
+ (uint32)slot->data.confirmed_flush,
+ (uint32)(slot->data.restart_lsn >> 32),
+ (uint32)slot->data.restart_lsn)));
+
+ return ctx;
+}
+
+/*
+ * Returns true if a consistent initial decoding snapshot has been built.
+ */
+bool
+DecodingContextReady(LogicalDecodingContext *ctx)
+{
+ return SnapBuildCurrentState(ctx->snapshot_builder) == SNAPBUILD_CONSISTENT;
+}
+
+/*
+ * Read from the decoding slot, until it is ready to start extracting changes.
+ */
+void
+DecodingContextFindStartpoint(LogicalDecodingContext *ctx)
+{
+ XLogRecPtr startptr;
+
+ /* Initialize from where to start reading WAL. */
+ startptr = ctx->slot->data.restart_lsn;
+
+ elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X",
+ (uint32)(ctx->slot->data.restart_lsn >> 32),
+ (uint32)ctx->slot->data.restart_lsn);
+
+ /* Wait for a consistent starting point */
+ for (;;)
+ {
+ XLogRecord *record;
+ char *err = NULL;
+
+ /*
+ * If the caller requires that interrupts be checked, the read_page
+ * callback should do so, as those will often wait.
+ */
+
+ /* the read_page callback waits for new WAL */
+ record = XLogReadRecord(ctx->reader, startptr, &err);
+ if (err)
+ elog(ERROR, "%s", err);
+
+ Assert(record);
+
+ startptr = InvalidXLogRecPtr;
+
+ LogicalDecodingProcessRecord(ctx, record);
+
+ /* only continue till we found a consistent spot */
+ if (DecodingContextReady(ctx))
+ break;
+ }
+
+ ctx->slot->data.confirmed_flush = ctx->reader->EndRecPtr;
+}
+
+/*
+ * Free a previously allocated decoding context, invoking the shutdown
+ * callback if necessary.
+ */
+void
+FreeDecodingContext(LogicalDecodingContext *ctx)
+{
+ if (ctx->callbacks.shutdown_cb != NULL)
+ shutdown_cb_wrapper(ctx);
+
+ ReorderBufferFree(ctx->reorder);
+ FreeSnapshotBuilder(ctx->snapshot_builder);
+ XLogReaderFree(ctx->reader);
+ MemoryContextDelete(ctx->context);
+}
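Taken together, the functions above yield the typical lifecycle for a caller that initializes a brand-new logical slot. The following is only a sketch: the plugin name is hypothetical, the demo_* callbacks are the illustrative ones sketched near the top of this file, and the slot is assumed to have been created and acquired already.

    LogicalDecodingContext *ctx;

    /* slot must already be created and acquired by this backend */
    ctx = CreateInitDecodingContext("my_output_plugin",   /* hypothetical name */
                                    NIL,
                                    logical_read_local_xlog_page,
                                    demo_prepare_write, demo_write);

    /* consume WAL until the snapshot builder reports a consistent state */
    DecodingContextFindStartpoint(ctx);

    /* persist the slot's starting point, then release everything */
    ReplicationSlotMarkDirty();
    ReplicationSlotSave();
    FreeDecodingContext(ctx);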
+
+/*
+ * Prepare a write using the context's output routine.
+ */
+void
+OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write)
+{
+ if (!ctx->accept_writes)
+ elog(ERROR, "writes are only accepted in commit, begin and change callbacks");
+
+ ctx->prepare_write(ctx, ctx->write_location, ctx->write_xid, last_write);
+ ctx->prepared_write = true;
+}
+
+/*
+ * Perform a write using the context's output routine.
+ */
+void
+OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write)
+{
+ if (!ctx->prepared_write)
+ elog(ERROR, "OutputPluginPrepareWrite needs to be called before OutputPluginWrite");
+
+ ctx->write(ctx, ctx->write_location, ctx->write_xid, last_write);
+ ctx->prepared_write = false;
+}
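For context, an output plugin is expected to bracket everything it appends to ctx->out with these two calls; a hypothetical change callback body might therefore do something like the following (the payload text is made up):

    OutputPluginPrepareWrite(ctx, true);
    appendStringInfoString(ctx->out, "table changed");   /* plugin-specific payload */
    OutputPluginWrite(ctx, true);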
+
+/*
+ * Load the output plugin, lookup its output plugin init function, and check
+ * that it provides the required callbacks.
+ */
+static void
+LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin)
+{
+ LogicalOutputPluginInit plugin_init;
+
+ plugin_init = (LogicalOutputPluginInit)
+ load_external_function(plugin, "_PG_output_plugin_init", false, NULL);
+
+ if (plugin_init == NULL)
+ elog(ERROR, "output plugins have to declare the _PG_output_plugin_init symbol");
+
+ /* ask the output plugin to fill the callback struct */
+ plugin_init(callbacks);
+
+ if (callbacks->begin_cb == NULL)
+ elog(ERROR, "output plugins have to register a begin callback");
+ if (callbacks->change_cb == NULL)
+ elog(ERROR, "output plugins have to register a change callback");
+ if (callbacks->commit_cb == NULL)
+ elog(ERROR, "output plugins have to register a commit callback");
+}
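Put differently, the smallest plugin that satisfies these checks looks roughly like the sketch below. Only the three mandatory callbacks are registered, their bodies are placeholders, and the demo_* names are invented for illustration.

static void
demo_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
}

static void
demo_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
            Relation relation, ReorderBufferChange *change)
{
}

static void
demo_commit(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
            XLogRecPtr commit_lsn)
{
}

void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
    cb->begin_cb = demo_begin;
    cb->change_cb = demo_change;
    cb->commit_cb = demo_commit;
    /* startup_cb and shutdown_cb may be left NULL, see the checks above */
}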
+
+static void
+output_plugin_error_callback(void *arg)
+{
+ LogicalErrorCallbackState *state = (LogicalErrorCallbackState *) arg;
+ /* not all callbacks have an associated LSN */
+ if (state->report_location != InvalidXLogRecPtr)
+ errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X",
+ NameStr(state->ctx->slot->data.name),
+ NameStr(state->ctx->slot->data.plugin),
+ state->callback_name,
+ (uint32)(state->report_location >> 32),
+ (uint32)state->report_location);
+ else
+ errcontext("slot \"%s\", output plugin \"%s\", in the %s callback",
+ NameStr(state->ctx->slot->data.name),
+ NameStr(state->ctx->slot->data.plugin),
+ state->callback_name);
+}
+
+static void
+startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init)
+{
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "startup";
+ state.report_location = InvalidXLogRecPtr;
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = false;
+
+ /* do the actual work: call callback */
+ ctx->callbacks.startup_cb(ctx, opt, is_init);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+static void
+shutdown_cb_wrapper(LogicalDecodingContext *ctx)
+{
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "shutdown";
+ state.report_location = InvalidXLogRecPtr;
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = false;
+
+ /* do the actual work: call callback */
+ ctx->callbacks.shutdown_cb(ctx);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+
+/*
+ * Callbacks for ReorderBuffer which add in some more information and then call
+ * the output plugin callbacks (see output_plugin.h).
+ */
+static void
+begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn)
+{
+ LogicalDecodingContext *ctx = cache->private_data;
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "begin";
+ state.report_location = txn->first_lsn;
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = true;
+ ctx->write_xid = txn->xid;
+ ctx->write_location = txn->first_lsn;
+
+ /* do the actual work: call callback */
+ ctx->callbacks.begin_cb(ctx, txn);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+static void
+commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn)
+{
+ LogicalDecodingContext *ctx = cache->private_data;
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "commit";
+ state.report_location = txn->final_lsn; /* beginning of commit record */
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = true;
+ ctx->write_xid = txn->xid;
+ ctx->write_location = txn->end_lsn; /* points to the end of the record */
+
+ /* do the actual work: call callback */
+ ctx->callbacks.commit_cb(ctx, txn, commit_lsn);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+static void
+change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change)
+{
+ LogicalDecodingContext *ctx = cache->private_data;
+ LogicalErrorCallbackState state;
+ ErrorContextCallback errcallback;
+
+ /* Push callback + info on the error context stack */
+ state.ctx = ctx;
+ state.callback_name = "change";
+ state.report_location = change->lsn;
+ errcallback.callback = output_plugin_error_callback;
+ errcallback.arg = (void *) &state;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* set output state */
+ ctx->accept_writes = true;
+ ctx->write_xid = txn->xid;
+ /*
+ * report this change's lsn so replies from clients can give an up-to-date
+ * answer. This won't ever be enough (and shouldn't be!) to confirm
+ * receipt of this transaction, but it might allow another transaction's
+ * commit to be confirmed with one message.
+ */
+ ctx->write_location = change->lsn;
+
+ ctx->callbacks.change_cb(ctx, txn, relation, change);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+/*
+ * Set the required catalog xmin horizon for historic snapshots in the current
+ * replication slot.
+ *
+ * Note that in most cases, we won't be able to immediately use the xmin
+ * to increase the xmin horizon; we need to wait until the client has confirmed
+ * receiving current_lsn with LogicalConfirmReceivedLocation().
+ */
+void
+LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin)
+{
+ bool updated_xmin = false;
+ ReplicationSlot *slot;
+
+ slot = MyReplicationSlot;
+
+ Assert(slot != NULL);
+
+ SpinLockAcquire(&slot->mutex);
+
+ /*
+ * don't overwrite if we already have a newer xmin. This can
+ * happen if we restart decoding in a slot.
+ */
+ if (TransactionIdPrecedesOrEquals(xmin, slot->data.catalog_xmin))
+ {
+ }
+ /*
+ * If the client has already confirmed up to this lsn, we directly
+ * can mark this as accepted. This can happen if we restart
+ * decoding in a slot.
+ */
+ else if (current_lsn <= slot->data.confirmed_flush)
+ {
+ slot->candidate_catalog_xmin = xmin;
+ slot->candidate_xmin_lsn = current_lsn;
+
+ /* our candidate can directly be used */
+ updated_xmin = true;
+ }
+ /*
+ * Only increase if the previous values have been applied, otherwise we
+ * might never end up updating if the receiver acks too slowly.
+ */
+ else if (slot->candidate_xmin_lsn == InvalidXLogRecPtr)
+ {
+ slot->candidate_catalog_xmin = xmin;
+ slot->candidate_xmin_lsn = current_lsn;
+ }
+ SpinLockRelease(&slot->mutex);
+
+ /* candidate already valid with the current flush position, apply */
+ if (updated_xmin)
+ LogicalConfirmReceivedLocation(slot->data.confirmed_flush);
+}
+
+/*
+ * Mark the minimal LSN (restart_lsn) we need to read to replay all
+ * transactions that have not yet committed at current_lsn.
+ *
+ * Just like LogicalIncreaseXminForSlot(), this only takes effect when the
+ * client has confirmed having received current_lsn.
+ */
+void
+LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn)
+{
+ bool updated_lsn = false;
+ ReplicationSlot *slot;
+
+ slot = MyReplicationSlot;
+
+ Assert(slot != NULL);
+ Assert(restart_lsn != InvalidXLogRecPtr);
+ Assert(current_lsn != InvalidXLogRecPtr);
+
+ SpinLockAcquire(&slot->mutex);
+
+ /* don't overwrite if we already have a newer restart_lsn */
+ if (restart_lsn <= slot->data.restart_lsn)
+ {
+ }
+ /*
+ * We might have already flushed far enough to directly accept this lsn, in
+ * this case there is no need to check for existing candidate LSNs
+ */
+ else if (current_lsn <= slot->data.confirmed_flush)
+ {
+ slot->candidate_restart_valid = current_lsn;
+ slot->candidate_restart_lsn = restart_lsn;
+
+ /* our candidate can directly be used */
+ updated_lsn = true;
+ }
+ /*
+ * Only increase if the previous values have been applied, otherwise we
+ * might never end up updating if the receiver acks too slowly. A missed
+ * value here will just cause some extra effort after reconnecting.
+ */
+ else if (slot->candidate_restart_valid == InvalidXLogRecPtr)
+ {
+ slot->candidate_restart_valid = current_lsn;
+ slot->candidate_restart_lsn = restart_lsn;
+
+ elog(DEBUG1, "got new restart lsn %X/%X at %X/%X",
+ (uint32) (restart_lsn >> 32), (uint32) restart_lsn,
+ (uint32) (current_lsn >> 32), (uint32) current_lsn);
+ }
+ else
+ {
+ elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X",
+ (uint32) (restart_lsn >> 32), (uint32) restart_lsn,
+ (uint32) (current_lsn >> 32), (uint32) current_lsn,
+ (uint32) (slot->candidate_restart_lsn >> 32),
+ (uint32) slot->candidate_restart_lsn,
+ (uint32) (slot->candidate_restart_valid >> 32),
+ (uint32) slot->candidate_restart_valid,
+ (uint32) (slot->data.confirmed_flush >> 32),
+ (uint32) slot->data.confirmed_flush
+ );
+ }
+ SpinLockRelease(&slot->mutex);
+
+ /* candidates are already valid with the current flush position, apply */
+ if (updated_lsn)
+ LogicalConfirmReceivedLocation(slot->data.confirmed_flush);
+}
+
+/*
+ * Handle a consumer's confirmation of having received all changes up to lsn.
+ */
+void
+LogicalConfirmReceivedLocation(XLogRecPtr lsn)
+{
+ Assert(lsn != InvalidXLogRecPtr);
+
+ /* Do an unlocked check for candidate_lsn first. */
+ if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr ||
+ MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr)
+ {
+ bool updated_xmin = false;
+ bool updated_restart = false;
+
+ /* use volatile pointer to prevent code rearrangement */
+ volatile ReplicationSlot *slot = MyReplicationSlot;
+
+ SpinLockAcquire(&slot->mutex);
+
+ slot->data.confirmed_flush = lsn;
+
+ /* if we're past the location required for bumping xmin, do so */
+ if (slot->candidate_xmin_lsn != InvalidXLogRecPtr &&
+ slot->candidate_xmin_lsn <= lsn)
+ {
+ /*
+ * We have to write the changed xmin to disk *before* we change
+ * the in-memory value, otherwise after a crash we wouldn't know
+ * that some catalog tuples might have been removed already.
+ *
+ * Ensure that by first writing to ->xmin and only updating
+ * ->effective_xmin once the new state is synced to disk. After a
+ * crash ->effective_xmin is set to ->xmin.
+ */
+ if (TransactionIdIsValid(slot->candidate_catalog_xmin) &&
+ slot->data.catalog_xmin != slot->candidate_catalog_xmin)
+ {
+ slot->data.catalog_xmin = slot->candidate_catalog_xmin;
+ slot->candidate_catalog_xmin = InvalidTransactionId;
+ slot->candidate_xmin_lsn = InvalidXLogRecPtr;
+ updated_xmin = true;
+ }
+ }
+
+ if (slot->candidate_restart_valid != InvalidXLogRecPtr &&
+ slot->candidate_restart_valid <= lsn)
+ {
+ Assert(slot->candidate_restart_lsn != InvalidXLogRecPtr);
+
+ slot->data.restart_lsn = slot->candidate_restart_lsn;
+ slot->candidate_restart_lsn = InvalidXLogRecPtr;
+ slot->candidate_restart_valid = InvalidXLogRecPtr;
+ updated_restart = true;
+ }
+
+ SpinLockRelease(&slot->mutex);
+
+ /* first write new xmin to disk, so we know what's up after a crash */
+ if (updated_xmin || updated_restart)
+ {
+ ReplicationSlotMarkDirty();
+ ReplicationSlotSave();
+ elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart);
+ }
+ /*
+ * Now that the new xmin is safely on disk, we can let the global value
+ * advance. We do not take ProcArrayLock or similar since we only
+ * advance xmin here and there's not much harm done by a concurrent
+ * computation missing that.
+ */
+ if (updated_xmin)
+ {
+ SpinLockAcquire(&slot->mutex);
+ slot->effective_catalog_xmin = slot->data.catalog_xmin;
+ SpinLockRelease(&slot->mutex);
+
+ ReplicationSlotsComputeRequiredXmin(false);
+ ReplicationSlotsComputeRequiredLSN();
+ }
+ }
+ else
+ {
+ volatile ReplicationSlot *slot = MyReplicationSlot;
+
+ SpinLockAcquire(&slot->mutex);
+ slot->data.confirmed_flush = lsn;
+ SpinLockRelease(&slot->mutex);
+ }
+}
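To summarize the protocol implemented by the three functions above: while decoding, the snapshot builder and reorder buffer machinery propose new horizons keyed by the LSN they were derived from, and those proposals only take effect once the client acknowledges a flush position past that LSN. A compressed, purely illustrative sequence:

    /* decoding proposes new values, keyed by the LSN they were derived from */
    LogicalIncreaseXminForSlot(current_lsn, xmin);
    LogicalIncreaseRestartDecodingForSlot(current_lsn, restart_lsn);

    /*
     * Once the client reports having flushed everything up to (at least)
     * current_lsn, the pending candidates are written to disk and only then
     * made effective.
     */
    LogicalConfirmReceivedLocation(flush_lsn);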
diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c
new file mode 100644
index 00000000000..3b8ae3853ba
--- /dev/null
+++ b/src/backend/replication/logical/logicalfuncs.c
@@ -0,0 +1,509 @@
+/*-------------------------------------------------------------------------
+ *
+ * logicalfuncs.c
+ *
+ * Support functions for using logical decoding and management of
+ * logical replication slots via SQL.
+ *
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/logicalfuncs.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+
+#include "catalog/pg_type.h"
+
+#include "nodes/makefuncs.h"
+
+#include "mb/pg_wchar.h"
+
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+#include "utils/pg_lsn.h"
+#include "utils/resowner.h"
+#include "utils/lsyscache.h"
+
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/logicalfuncs.h"
+
+#include "storage/fd.h"
+
+/* private data for writing out data */
+typedef struct DecodingOutputState
+{
+ Tuplestorestate *tupstore;
+ TupleDesc tupdesc;
+ bool binary_output;
+ int64 returned_rows;
+} DecodingOutputState;
+
+/*
+ * Prepare for an output plugin write.
+ */
+static void
+LogicalOutputPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ bool last_write)
+{
+ resetStringInfo(ctx->out);
+}
+
+/*
+ * Perform output plugin write into tuplestore.
+ */
+static void
+LogicalOutputWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
+ bool last_write)
+{
+ Datum values[3];
+ bool nulls[3];
+ DecodingOutputState *p;
+
+ /* SQL Datums can only be of a limited length... */
+ if (ctx->out->len > MaxAllocSize - VARHDRSZ)
+ elog(ERROR, "too much output for sql interface");
+
+ p = (DecodingOutputState *) ctx->output_writer_private;
+
+ memset(nulls, 0, sizeof(nulls));
+ values[0] = LSNGetDatum(lsn);
+ values[1] = TransactionIdGetDatum(xid);
+
+ /*
+ * Assert ctx->out is in database encoding when we're writing textual
+ * output.
+ */
+ if (!p->binary_output)
+ Assert(pg_verify_mbstr(GetDatabaseEncoding(),
+ ctx->out->data, ctx->out->len,
+ false));
+
+ /* ick, but cstring_to_text_with_len works for bytea perfectly fine */
+ values[2] = PointerGetDatum(
+ cstring_to_text_with_len(ctx->out->data, ctx->out->len));
+
+ tuplestore_putvalues(p->tupstore, p->tupdesc, values, nulls);
+ p->returned_rows++;
+}
+
+/*
+ * TODO: This is duplicate code with pg_xlogdump, similar to walsender.c, but
+ * we currently don't have the infrastructure (elog!) to share it.
+ */
+static void
+XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
+{
+ char *p;
+ XLogRecPtr recptr;
+ Size nbytes;
+
+ static int sendFile = -1;
+ static XLogSegNo sendSegNo = 0;
+ static uint32 sendOff = 0;
+
+ p = buf;
+ recptr = startptr;
+ nbytes = count;
+
+ while (nbytes > 0)
+ {
+ uint32 startoff;
+ int segbytes;
+ int readbytes;
+
+ startoff = recptr % XLogSegSize;
+
+ if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
+ {
+ char path[MAXPGPATH];
+
+ /* Switch to another logfile segment */
+ if (sendFile >= 0)
+ close(sendFile);
+
+ XLByteToSeg(recptr, sendSegNo);
+
+ XLogFilePath(path, tli, sendSegNo);
+
+ sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
+
+ if (sendFile < 0)
+ {
+ if (errno == ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("requested WAL segment %s has already been removed",
+ path)));
+ else
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ path)));
+ }
+ sendOff = 0;
+ }
+
+ /* Need to seek in the file? */
+ if (sendOff != startoff)
+ {
+ if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
+ {
+ char path[MAXPGPATH];
+
+ XLogFilePath(path, tli, sendSegNo);
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek in log segment %s to offset %u: %m",
+ path, startoff)));
+ }
+ sendOff = startoff;
+ }
+
+ /* How many bytes are within this segment? */
+ if (nbytes > (XLogSegSize - startoff))
+ segbytes = XLogSegSize - startoff;
+ else
+ segbytes = nbytes;
+
+ readbytes = read(sendFile, p, segbytes);
+ if (readbytes <= 0)
+ {
+ char path[MAXPGPATH];
+
+ XLogFilePath(path, tli, sendSegNo);
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from log segment %s, offset %u, length %lu: %m",
+ path, sendOff, (unsigned long) segbytes)));
+ }
+
+ /* Update state for read */
+ recptr += readbytes;
+
+ sendOff += readbytes;
+ nbytes -= readbytes;
+ p += readbytes;
+ }
+}
+
+static void
+check_permissions(void)
+{
+ if (!superuser() && !has_rolreplication(GetUserId()))
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ (errmsg("must be superuser or replication role to use replication slots"))));
+}
+
+/*
+ * read_page callback for logical decoding contexts.
+ *
+ * Public because it would likely be very helpful for someone writing another
+ * output method outside walsender, e.g. in a bgworker.
+ *
+ * TODO: The walsender has its own version of this, but it relies on the
+ * walsender's latch being set whenever WAL is flushed. No such infrastructure
+ * exists for normal backends, so we have to do a check/sleep/repeat style of
+ * loop for now.
+ */
+int
+logical_read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr, char *cur_page, TimeLineID *pageTLI)
+{
+ XLogRecPtr flushptr,
+ loc;
+ int count;
+
+ loc = targetPagePtr + reqLen;
+ while (1)
+ {
+ /*
+ * TODO: we're going to have to do something more intelligent about
+ * timelines on standbys. Use readTimeLineHistory() and
+ * tliOfPointInHistory() to get the proper LSN? For now we'll catch
+ * that case earlier, but the code and TODO is left in here for when
+ * that changes.
+ */
+ if (!RecoveryInProgress())
+ {
+ *pageTLI = ThisTimeLineID;
+ flushptr = GetFlushRecPtr();
+ }
+ else
+ flushptr = GetXLogReplayRecPtr(pageTLI);
+
+ if (loc <= flushptr)
+ break;
+
+ CHECK_FOR_INTERRUPTS();
+ pg_usleep(1000L);
+ }
+
+ /* more than one block available */
+ if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
+ count = XLOG_BLCKSZ;
+ /* not enough data there */
+ else if (targetPagePtr + reqLen > flushptr)
+ return -1;
+ /* part of the page available */
+ else
+ count = flushptr - targetPagePtr;
+
+ XLogRead(cur_page, *pageTLI, targetPagePtr, XLOG_BLCKSZ);
+
+ return count;
+}
+
+/*
+ * Helper function for the various SQL callable logical decoding functions.
+ */
+static Datum
+pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary)
+{
+ Name name = PG_GETARG_NAME(0);
+ XLogRecPtr upto_lsn;
+ int32 upto_nchanges;
+
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+
+ XLogRecPtr end_of_wal;
+ XLogRecPtr startptr;
+
+ LogicalDecodingContext *ctx;
+
+ ResourceOwner old_resowner = CurrentResourceOwner;
+ ArrayType *arr;
+ Size ndim;
+ List *options = NIL;
+ DecodingOutputState *p;
+
+ if (PG_ARGISNULL(1))
+ upto_lsn = InvalidXLogRecPtr;
+ else
+ upto_lsn = PG_GETARG_LSN(1);
+
+ if (PG_ARGISNULL(2))
+ upto_nchanges = 0;
+ else
+ upto_nchanges = PG_GETARG_INT32(2);
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+
+ /* state to write output to */
+ p = palloc0(sizeof(DecodingOutputState));
+
+ p->binary_output = binary;
+
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &p->tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ check_permissions();
+
+ CheckLogicalDecodingRequirements();
+
+ arr = PG_GETARG_ARRAYTYPE_P(3);
+ ndim = ARR_NDIM(arr);
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ if (ndim > 1)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("array must be one-dimensional")));
+ }
+ else if (array_contains_nulls(arr))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("array must not contain nulls")));
+ }
+ else if (ndim == 1)
+ {
+ int nelems;
+ Datum *datum_opts;
+ int i;
+
+ Assert(ARR_ELEMTYPE(arr) == TEXTOID);
+
+ deconstruct_array(arr, TEXTOID, -1, false, 'i',
+ &datum_opts, NULL, &nelems);
+
+ if (nelems % 2 != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("array must have even number of elements")));
+
+ for (i = 0; i < nelems; i += 2)
+ {
+ char *name = TextDatumGetCString(datum_opts[i]);
+ char *opt = TextDatumGetCString(datum_opts[i + 1]);
+
+ options = lappend(options, makeDefElem(name, (Node *) makeString(opt)));
+ }
+ }
+
+ p->tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = p->tupstore;
+ rsinfo->setDesc = p->tupdesc;
+
+ /* compute the current end-of-wal */
+ if (!RecoveryInProgress())
+ end_of_wal = GetFlushRecPtr();
+ else
+ end_of_wal = GetXLogReplayRecPtr(NULL);
+
+ CheckLogicalDecodingRequirements();
+ ReplicationSlotAcquire(NameStr(*name));
+
+ PG_TRY();
+ {
+ ctx = CreateDecodingContext(InvalidXLogRecPtr,
+ options,
+ logical_read_local_xlog_page,
+ LogicalOutputPrepareWrite,
+ LogicalOutputWrite);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * Check whether the output plugin writes textual output if that's
+ * what we need.
+ */
+ if (!binary &&
+ ctx->options.output_type != OUTPUT_PLUGIN_TEXTUAL_OUTPUT)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("output plugin cannot produce text output")));
+
+ ctx->output_writer_private = p;
+
+ startptr = MyReplicationSlot->data.restart_lsn;
+
+ CurrentResourceOwner = ResourceOwnerCreate(CurrentResourceOwner, "logical decoding");
+
+ /* invalidate non-timetravel entries */
+ InvalidateSystemCaches();
+
+ while ((startptr != InvalidXLogRecPtr && startptr < end_of_wal) ||
+ (ctx->reader->EndRecPtr && ctx->reader->EndRecPtr < end_of_wal))
+ {
+ XLogRecord *record;
+ char *errm = NULL;
+
+ record = XLogReadRecord(ctx->reader, startptr, &errm);
+ if (errm)
+ elog(ERROR, "%s", errm);
+
+ startptr = InvalidXLogRecPtr;
+
+ /*
+ * The output plugin callbacks (invoked via the *_cb_wrapper functions
+ * in logical.c) will store the description into our tuplestore.
+ */
+ if (record != NULL)
+ LogicalDecodingProcessRecord(ctx, record);
+
+ /* check limits */
+ if (upto_lsn != InvalidXLogRecPtr &&
+ upto_lsn <= ctx->reader->EndRecPtr)
+ break;
+ if (upto_nchanges != 0 &&
+ upto_nchanges <= p->returned_rows)
+ break;
+ }
+ }
+ PG_CATCH();
+ {
+ /* clear all timetravel entries */
+ InvalidateSystemCaches();
+
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ tuplestore_donestoring(p->tupstore);
+
+ CurrentResourceOwner = old_resowner;
+
+ /*
+ * Next time, start where we left off. (Hunting things, the family
+ * business..)
+ */
+ if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm)
+ LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr);
+
+ /* free context, call shutdown callback */
+ FreeDecodingContext(ctx);
+
+ ReplicationSlotRelease();
+ InvalidateSystemCaches();
+
+ return (Datum) 0;
+}
+
+/*
+ * SQL function returning the changestream as text, consuming the data.
+ */
+Datum
+pg_logical_slot_get_changes(PG_FUNCTION_ARGS)
+{
+ Datum ret = pg_logical_slot_get_changes_guts(fcinfo, true, false);
+ return ret;
+}
+
+/*
+ * SQL function returning the changestream as text, only peeking ahead.
+ */
+Datum
+pg_logical_slot_peek_changes(PG_FUNCTION_ARGS)
+{
+ Datum ret = pg_logical_slot_get_changes_guts(fcinfo, false, false);
+ return ret;
+}
+
+/*
+ * SQL function returning the changestream in binary, consuming the data.
+ */
+Datum
+pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS)
+{
+ Datum ret = pg_logical_slot_get_changes_guts(fcinfo, true, true);
+ return ret;
+}
+
+/*
+ * SQL function returning the changestream in binary, only peeking ahead.
+ */
+Datum
+pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS)
+{
+ Datum ret = pg_logical_slot_get_changes_guts(fcinfo, false, true);
+ return ret;
+}
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
new file mode 100644
index 00000000000..e7182338b89
--- /dev/null
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -0,0 +1,3059 @@
+/*-------------------------------------------------------------------------
+ *
+ * reorderbuffer.c
+ * PostgreSQL logical replay/reorder buffer management
+ *
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/replication/logical/reorderbuffer.c
+ *
+ * NOTES
+ * This module gets handed individual pieces of transactions in the order
+ * they are written to the WAL and is responsible for reassembling them into
+ * toplevel transaction sized pieces. When a transaction is completely
+ * reassembled - signalled by reading the transaction commit record - it
+ * will then call the output plugin (c.f. ReorderBufferCommit()) with the
+ * individual changes. The output plugins rely on snapshots built by
+ * snapbuild.c which hands them to us.
+ *
+ * Transactions and subtransactions/savepoints in postgres are not
+ * immediately linked to each other from outside the performing
+ * backend. Only at commit/abort (or via special xact_assignment records) are
+ * they linked together, which means that we will have to splice together a
+ * toplevel transaction from its subtransactions. To do that efficiently we
+ * build a binary heap indexed by the smallest current lsn of the individual
+ * subtransactions' changestreams. As the individual streams are inherently
+ * ordered by LSN - since that is where we build them from - the transaction
+ * can easily be reassembled by always using the subtransaction with the
+ * smallest current LSN from the heap.
+ *
+ * In order to cope with large transactions - which can be several times as
+ * big as the available memory - this module supports spooling the contents
+ * of large transactions to disk. When the transaction is replayed the
+ * contents of individual (sub-)transactions will be read from disk in
+ * chunks.
+ *
+ * This module also has to deal with reassembling toast records from the
+ * individual chunks stored in WAL. When a new (or initial) version of a
+ * tuple is stored in WAL it will always be preceded by the toast chunks
+ * emitted for the columns stored out of line. Within a single toplevel
+ * transaction there will be no other data carrying records between a row's
+ * toast chunks and the row data itself. See ReorderBufferToast* for
+ * details.
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+
+#include "access/rewriteheap.h"
+#include "access/transam.h"
+#include "access/tuptoaster.h"
+#include "access/xact.h"
+
+#include "catalog/catalog.h"
+
+#include "common/relpath.h"
+
+#include "lib/binaryheap.h"
+
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/slot.h"
+#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
+
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/sinval.h"
+
+#include "utils/builtins.h"
+#include "utils/combocid.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+#include "utils/relcache.h"
+#include "utils/relfilenodemap.h"
+#include "utils/tqual.h"
+
+/*
+ * For efficiency and simplicity reasons we want to keep Snapshots, CommandIds
+ * and ComboCids in the same list with the user visible INSERT/UPDATE/DELETE
+ * changes. We don't want to leak those internal values to external users
+ * though (they would just use switch()...default:) because that would make it
+ * harder to add new user-visible values.
+ *
+ * This needs to be synchronized with ReorderBufferChangeType! Adjust the
+ * StaticAssertExpr's in ReorderBufferAllocate if you add anything!
+ */
+typedef enum
+{
+ REORDER_BUFFER_CHANGE_INTERNAL_INSERT,
+ REORDER_BUFFER_CHANGE_INTERNAL_UPDATE,
+ REORDER_BUFFER_CHANGE_INTERNAL_DELETE,
+ REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT,
+ REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID,
+ REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID
+} ReorderBufferChangeTypeInternal;
+
+/* entry for a hash table we use to map from xid to our transaction state */
+typedef struct ReorderBufferTXNByIdEnt
+{
+ TransactionId xid;
+ ReorderBufferTXN *txn;
+} ReorderBufferTXNByIdEnt;
+
+/* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */
+typedef struct ReorderBufferTupleCidKey
+{
+ RelFileNode relnode;
+ ItemPointerData tid;
+} ReorderBufferTupleCidKey;
+
+typedef struct ReorderBufferTupleCidEnt
+{
+ ReorderBufferTupleCidKey key;
+ CommandId cmin;
+ CommandId cmax;
+ CommandId combocid; /* just for debugging */
+} ReorderBufferTupleCidEnt;
+
+/* k-way in-order change iteration support structures */
+typedef struct ReorderBufferIterTXNEntry
+{
+ XLogRecPtr lsn;
+ ReorderBufferChange *change;
+ ReorderBufferTXN *txn;
+ int fd;
+ XLogSegNo segno;
+} ReorderBufferIterTXNEntry;
+
+typedef struct ReorderBufferIterTXNState
+{
+ binaryheap *heap;
+ Size nr_txns;
+ dlist_head old_change;
+ ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
+} ReorderBufferIterTXNState;
+
+/* toast datastructures */
+typedef struct ReorderBufferToastEnt
+{
+ Oid chunk_id; /* toast_table.chunk_id */
+ int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
+ * have seen */
+ Size num_chunks; /* number of chunks we've already seen */
+ Size size; /* combined size of chunks seen */
+ dlist_head chunks; /* linked list of chunks */
+ struct varlena *reconstructed; /* reconstructed varlena now pointed
+ * to in main tup */
+} ReorderBufferToastEnt;
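For orientation: toast chunks of one column arrive strictly in sequence order within a transaction, and the entry above tracks how many chunks and bytes have been seen so the value can be reconstructed before the row change is handed to the output plugin. Below is an illustrative, self-contained sketch of that bookkeeping with hypothetical names; it is not code from this patch (the real code keeps a dlist of chunk changes and concatenates them later, in ReorderBufferToastReplace).

    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct ToastEntSketch
    {
        int     last_chunk_seq;     /* chunk_seq of the last chunk appended */
        size_t  num_chunks;         /* number of chunks seen so far */
        size_t  size;               /* combined size of all chunks */
        char   *reconstructed;      /* concatenated chunk data */
    } ToastEntSketch;

    static void
    toast_append_chunk(ToastEntSketch *ent, int chunk_seq,
                       const char *data, size_t len)
    {
        /* chunks arrive in order and without gaps */
        assert(ent->num_chunks == 0 || chunk_seq == ent->last_chunk_seq + 1);

        ent->reconstructed = realloc(ent->reconstructed, ent->size + len);
        memcpy(ent->reconstructed + ent->size, data, len);

        ent->size += len;
        ent->num_chunks++;
        ent->last_chunk_seq = chunk_seq;
    }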
+
+/* Disk serialization support datastructures */
+typedef struct ReorderBufferDiskChange
+{
+ Size size;
+ ReorderBufferChange change;
+ /* data follows */
+} ReorderBufferDiskChange;
+
+/*
+ * Maximum number of changes kept in memory, per transaction. After that,
+ * changes are spooled to disk.
+ *
+ * The current value should be sufficient to decode the entire transaction
+ * without hitting disk in OLTP workloads, while starting to spool to disk in
+ * other workloads reasonably fast.
+ *
+ * At some point in the future it probably makes sense to have a more elaborate
+ * resource management here, but it's not entirely clear what that would look
+ * like.
+ */
+static const Size max_changes_in_memory = 4096;
+
+/*
+ * We use a very simple form of a slab allocator for frequently allocated
+ * objects, simply keeping a fixed number in a linked list when unused,
+ * instead of pfree()ing them. Without that, in many workloads aset.c becomes a
+ * major bottleneck, especially when spilling to disk while decoding batch
+ * workloads.
+ */
+static const Size max_cached_changes = 4096 * 2;
+static const Size max_cached_tuplebufs = 4096 * 2; /* ~8MB */
+static const Size max_cached_transactions = 512;
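As a point of reference, the caching these constants bound follows the usual freelist pattern: returned objects are pushed onto a list up to a fixed cap and handed out again before any fresh allocation is made. A minimal, self-contained sketch with hypothetical names (the module itself uses dlist/slist nodes and palloc within rb->context rather than malloc):

    #include <stdlib.h>

    typedef struct FreeNodeSketch
    {
        struct FreeNodeSketch *next;
    } FreeNodeSketch;

    typedef struct SlabCacheSketch
    {
        FreeNodeSketch *head;       /* singly linked list of unused objects */
        size_t          nr_cached;
        size_t          max_cached; /* cf. max_cached_changes above */
        size_t          obj_size;
    } SlabCacheSketch;

    static void *
    slab_get(SlabCacheSketch *c)
    {
        if (c->head != NULL)
        {
            FreeNodeSketch *n = c->head;

            c->head = n->next;
            c->nr_cached--;
            return n;               /* reuse a cached object */
        }
        return malloc(c->obj_size); /* cache empty, allocate a fresh one */
    }

    static void
    slab_return(SlabCacheSketch *c, void *p)
    {
        if (c->nr_cached < c->max_cached)
        {
            FreeNodeSketch *n = (FreeNodeSketch *) p;

            n->next = c->head;
            c->head = n;
            c->nr_cached++;         /* keep it around for reuse */
        }
        else
            free(p);                /* cache full, really release it */
    }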
+
+
+/* ---------------------------------------
+ * primary reorderbuffer support routines
+ * ---------------------------------------
+ */
+static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
+static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
+ TransactionId xid, bool create, bool *is_new,
+ XLogRecPtr lsn, bool create_as_top);
+
+static void AssertTXNLsnOrder(ReorderBuffer *rb);
+
+/* ---------------------------------------
+ * support functions for lsn-order iterating over the ->changes of a
+ * transaction and its subtransactions
+ *
+ * used for iteration over the k-way heap merge of a transaction and its
+ * subtransactions
+ * ---------------------------------------
+ */
+static ReorderBufferIterTXNState *ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static ReorderBufferChange *
+ ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
+static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
+ ReorderBufferIterTXNState *state);
+static void ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn);
+
+/*
+ * ---------------------------------------
+ * Disk serialization support functions
+ * ---------------------------------------
+ */
+static void ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ int fd, ReorderBufferChange *change);
+static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ int *fd, XLogSegNo *segno);
+static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ char *change);
+static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
+
+static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
+static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
+ ReorderBufferTXN *txn, CommandId cid);
+
+/* ---------------------------------------
+ * toast reassembly support
+ * ---------------------------------------
+ */
+static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
+static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change);
+static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change);
+
+
+/*
+ * Allocate a new ReorderBuffer
+ */
+ReorderBuffer *
+ReorderBufferAllocate(void)
+{
+ ReorderBuffer *buffer;
+ HASHCTL hash_ctl;
+ MemoryContext new_ctx;
+
+ StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_INSERT == (int) REORDER_BUFFER_CHANGE_INSERT, "out of sync enums");
+ StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_UPDATE == (int) REORDER_BUFFER_CHANGE_UPDATE, "out of sync enums");
+ StaticAssertExpr((int) REORDER_BUFFER_CHANGE_INTERNAL_DELETE == (int) REORDER_BUFFER_CHANGE_DELETE, "out of sync enums");
+
+ /* allocate memory in own context, to have better accountability */
+ new_ctx = AllocSetContextCreate(CurrentMemoryContext,
+ "ReorderBuffer",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
+ buffer =
+ (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
+
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+
+ buffer->context = new_ctx;
+
+ hash_ctl.keysize = sizeof(TransactionId);
+ hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = buffer->context;
+
+ buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+ buffer->by_txn_last_xid = InvalidTransactionId;
+ buffer->by_txn_last_txn = NULL;
+
+ buffer->nr_cached_transactions = 0;
+ buffer->nr_cached_changes = 0;
+ buffer->nr_cached_tuplebufs = 0;
+
+ buffer->outbuf = NULL;
+ buffer->outbufsize = 0;
+
+ buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
+
+ dlist_init(&buffer->toplevel_by_lsn);
+ dlist_init(&buffer->cached_transactions);
+ dlist_init(&buffer->cached_changes);
+ slist_init(&buffer->cached_tuplebufs);
+
+ return buffer;
+}
+
+/*
+ * Free a ReorderBuffer
+ */
+void
+ReorderBufferFree(ReorderBuffer *rb)
+{
+ MemoryContext context = rb->context;
+
+ /*
+ * We free separately allocated data by entirely scrapping reorderbuffer's
+ * memory context.
+ */
+ MemoryContextDelete(context);
+}
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferTXN.
+ */
+static ReorderBufferTXN *
+ReorderBufferGetTXN(ReorderBuffer *rb)
+{
+ ReorderBufferTXN *txn;
+
+ /* check the slab cache */
+ if (rb->nr_cached_transactions > 0)
+ {
+ rb->nr_cached_transactions--;
+ txn = (ReorderBufferTXN *)
+ dlist_container(ReorderBufferTXN, node,
+ dlist_pop_head_node(&rb->cached_transactions));
+ }
+ else
+ {
+ txn = (ReorderBufferTXN *)
+ MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
+ }
+
+ memset(txn, 0, sizeof(ReorderBufferTXN));
+
+ dlist_init(&txn->changes);
+ dlist_init(&txn->tuplecids);
+ dlist_init(&txn->subtxns);
+
+ return txn;
+}
+
+/*
+ * Free a ReorderBufferTXN.
+ *
+ * Deallocation might be delayed for efficiency purposes, for details check
+ * the comments above max_cached_changes's definition.
+ */
+void
+ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ /* clean the lookup cache if we were cached (quite likely) */
+ if (rb->by_txn_last_xid == txn->xid)
+ {
+ rb->by_txn_last_xid = InvalidTransactionId;
+ rb->by_txn_last_txn = NULL;
+ }
+
+ /* free data that's contained */
+
+ if (txn->tuplecid_hash != NULL)
+ {
+ hash_destroy(txn->tuplecid_hash);
+ txn->tuplecid_hash = NULL;
+ }
+
+ if (txn->invalidations)
+ {
+ pfree(txn->invalidations);
+ txn->invalidations = NULL;
+ }
+
+ /* check whether to put into the slab cache */
+ if (rb->nr_cached_transactions < max_cached_transactions)
+ {
+ rb->nr_cached_transactions++;
+ dlist_push_head(&rb->cached_transactions, &txn->node);
+ VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
+ VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
+ }
+ else
+ {
+ pfree(txn);
+ }
+}
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferChange.
+ */
+ReorderBufferChange *
+ReorderBufferGetChange(ReorderBuffer *rb)
+{
+ ReorderBufferChange *change;
+
+ /* check the slab cache */
+ if (rb->nr_cached_changes)
+ {
+ rb->nr_cached_changes--;
+ change = (ReorderBufferChange *)
+ dlist_container(ReorderBufferChange, node,
+ dlist_pop_head_node(&rb->cached_changes));
+ }
+ else
+ {
+ change = (ReorderBufferChange *)
+ MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
+ }
+
+ memset(change, 0, sizeof(ReorderBufferChange));
+ return change;
+}
+
+/*
+ * Free a ReorderBufferChange.
+ *
+ * Deallocation might be delayed for efficiency purposes, for details check
+ * the comments above max_cached_changes's definition.
+ */
+void
+ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
+{
+ /* free contained data */
+ switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+ {
+ case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+ case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+ case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+ if (change->tp.newtuple)
+ {
+ ReorderBufferReturnTupleBuf(rb, change->tp.newtuple);
+ change->tp.newtuple = NULL;
+ }
+
+ if (change->tp.oldtuple)
+ {
+ ReorderBufferReturnTupleBuf(rb, change->tp.oldtuple);
+ change->tp.oldtuple = NULL;
+ }
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+ if (change->snapshot)
+ {
+ ReorderBufferFreeSnap(rb, change->snapshot);
+ change->snapshot = NULL;
+ }
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+ break;
+ }
+
+ /* check whether to put into the slab cache */
+ if (rb->nr_cached_changes < max_cached_changes)
+ {
+ rb->nr_cached_changes++;
+ dlist_push_head(&rb->cached_changes, &change->node);
+ VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
+ VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
+ }
+ else
+ {
+ pfree(change);
+ }
+}
+
+
+/*
+ * Get an unused, possibly preallocated, ReorderBufferTupleBuf
+ */
+ReorderBufferTupleBuf *
+ReorderBufferGetTupleBuf(ReorderBuffer *rb)
+{
+ ReorderBufferTupleBuf *tuple;
+
+ /* check the slab cache */
+ if (rb->nr_cached_tuplebufs)
+ {
+ rb->nr_cached_tuplebufs--;
+ tuple = slist_container(ReorderBufferTupleBuf, node,
+ slist_pop_head_node(&rb->cached_tuplebufs));
+#ifdef USE_ASSERT_CHECKING
+ memset(tuple, 0xdeadbeef, sizeof(ReorderBufferTupleBuf));
+#endif
+ }
+ else
+ {
+ tuple = (ReorderBufferTupleBuf *)
+ MemoryContextAlloc(rb->context, sizeof(ReorderBufferTupleBuf));
+ }
+
+ return tuple;
+}
+
+/*
+ * Free a ReorderBufferTupleBuf.
+ *
+ * Deallocation might be delayed for efficiency purposes, for details check
+ * the comments above max_cached_changes's definition.
+ */
+void
+ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
+{
+ /* check whether to put into the slab cache */
+ if (rb->nr_cached_tuplebufs < max_cached_tuplebufs)
+ {
+ rb->nr_cached_tuplebufs++;
+ slist_push_head(&rb->cached_tuplebufs, &tuple->node);
+ VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
+ VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
+ }
+ else
+ {
+ pfree(tuple);
+ }
+}
+
+/*
+ * Return the ReorderBufferTXN from the given buffer, specified by Xid.
+ * If create is true, and a transaction doesn't already exist, create it
+ * (with the given LSN, and as top transaction if that's specified);
+ * when this happens, is_new is set to true.
+ */
+static ReorderBufferTXN *
+ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
+ bool *is_new, XLogRecPtr lsn, bool create_as_top)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferTXNByIdEnt *ent;
+ bool found;
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(!create || lsn != InvalidXLogRecPtr);
+
+ /*
+ * Check the one-entry lookup cache first
+ */
+ if (TransactionIdIsValid(rb->by_txn_last_xid) &&
+ rb->by_txn_last_xid == xid)
+ {
+ txn = rb->by_txn_last_txn;
+
+ if (txn != NULL)
+ {
+ /* found it, and it's valid */
+ if (is_new)
+ *is_new = false;
+ return txn;
+ }
+
+ /*
+ * Cached as non-existent, and asked not to create? Then there's nothing
+ * else to do.
+ */
+ if (!create)
+ return NULL;
+ /* otherwise fall through to create it */
+ }
+
+ /*
+ * The cache wasn't hit, or it yielded a "does not exist" result and we
+ * want to create an entry, so do a full lookup.
+ */
+
+ /* search the lookup table */
+ ent = (ReorderBufferTXNByIdEnt *)
+ hash_search(rb->by_txn,
+ (void *) &xid,
+ create ? HASH_ENTER : HASH_FIND,
+ &found);
+ if (found)
+ txn = ent->txn;
+ else if (create)
+ {
+ /* initialize the new entry, if creation was requested */
+ Assert(ent != NULL);
+
+ ent->txn = ReorderBufferGetTXN(rb);
+ ent->txn->xid = xid;
+ txn = ent->txn;
+ txn->first_lsn = lsn;
+ txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
+
+ if (create_as_top)
+ {
+ dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
+ AssertTXNLsnOrder(rb);
+ }
+ }
+ else
+ txn = NULL; /* not found and not asked to create */
+
+ /* update cache */
+ rb->by_txn_last_xid = xid;
+ rb->by_txn_last_txn = txn;
+
+ if (is_new)
+ *is_new = !found;
+
+ Assert(!create || !!txn);
+ return txn;
+}
+
+/*
+ * Queue a change into a transaction so it can be replayed upon commit.
+ */
+void
+ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
+ ReorderBufferChange *change)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+ change->lsn = lsn;
+ Assert(InvalidXLogRecPtr != lsn);
+ dlist_push_tail(&txn->changes, &change->node);
+ txn->nentries++;
+ txn->nentries_mem++;
+
+ ReorderBufferCheckSerializeTXN(rb, txn);
+}
+
+static void
+AssertTXNLsnOrder(ReorderBuffer *rb)
+{
+#ifdef USE_ASSERT_CHECKING
+ dlist_iter iter;
+ XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
+
+ dlist_foreach(iter, &rb->toplevel_by_lsn)
+ {
+ ReorderBufferTXN *cur_txn;
+
+ cur_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
+ Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
+
+ if (cur_txn->end_lsn != InvalidXLogRecPtr)
+ Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
+
+ if (prev_first_lsn != InvalidXLogRecPtr)
+ Assert(prev_first_lsn < cur_txn->first_lsn);
+
+ Assert(!cur_txn->is_known_as_subxact);
+ prev_first_lsn = cur_txn->first_lsn;
+ }
+#endif
+}
+
+ReorderBufferTXN *
+ReorderBufferGetOldestTXN(ReorderBuffer *rb)
+{
+ ReorderBufferTXN *txn;
+
+ if (dlist_is_empty(&rb->toplevel_by_lsn))
+ return NULL;
+
+ AssertTXNLsnOrder(rb);
+
+ txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
+
+ Assert(!txn->is_known_as_subxact);
+ Assert(txn->first_lsn != InvalidXLogRecPtr);
+ return txn;
+}
+
+void
+ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
+{
+ rb->current_restart_decoding_lsn = ptr;
+}
+
+void
+ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
+ TransactionId subxid, XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferTXN *subtxn;
+ bool new_top;
+ bool new_sub;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
+ subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
+
+ if (new_sub)
+ {
+ /*
+ * We assign subtransactions to the toplevel transaction even if we don't
+ * have data for them yet; assignment records frequently reference xids
+ * that have not yet produced any records. Knowing those aren't toplevel
+ * xids allows us to make processing cheaper in some places.
+ */
+ dlist_push_tail(&txn->subtxns, &subtxn->node);
+ txn->nsubtxns++;
+ }
+ else if (!subtxn->is_known_as_subxact)
+ {
+ subtxn->is_known_as_subxact = true;
+ Assert(subtxn->nsubtxns == 0);
+
+ /* remove from lsn order list of top-level transactions */
+ dlist_delete(&subtxn->node);
+
+ /* add to toplevel transaction */
+ dlist_push_tail(&txn->subtxns, &subtxn->node);
+ txn->nsubtxns++;
+ }
+ else if (new_top)
+ {
+ elog(ERROR, "existing subxact assigned to unknown toplevel xact");
+ }
+}
+
+/*
+ * Associate a subtransaction with its toplevel transaction at commit
+ * time. There may be no further changes added after this.
+ */
+void
+ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
+ TransactionId subxid, XLogRecPtr commit_lsn,
+ XLogRecPtr end_lsn)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferTXN *subtxn;
+
+ subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
+ InvalidXLogRecPtr, false);
+
+ /*
+ * No need to do anything if that subtxn didn't contain any changes
+ */
+ if (!subtxn)
+ return;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, true);
+
+ if (txn == NULL)
+ elog(ERROR, "subxact logged without previous toplevel record");
+
+ /*
+ * Pass our base snapshot to the parent transaction if it doesn't have
+ * one, or if ours is older. That can happen if there are no changes in the
+ * toplevel transaction but there are in one of the child transactions. This
+ * allows the parent to simply use its base snapshot initially.
+ */
+ if (txn->base_snapshot == NULL ||
+ txn->base_snapshot_lsn > subtxn->base_snapshot_lsn)
+ {
+ txn->base_snapshot = subtxn->base_snapshot;
+ txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
+ subtxn->base_snapshot = NULL;
+ subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
+ }
+
+ subtxn->final_lsn = commit_lsn;
+ subtxn->end_lsn = end_lsn;
+
+ if (!subtxn->is_known_as_subxact)
+ {
+ subtxn->is_known_as_subxact = true;
+ Assert(subtxn->nsubtxns == 0);
+
+ /* remove from lsn order list of top-level transactions */
+ dlist_delete(&subtxn->node);
+
+ /* add to subtransaction list */
+ dlist_push_tail(&txn->subtxns, &subtxn->node);
+ txn->nsubtxns++;
+ }
+}
+
+
+/*
+ * Support for efficiently iterating over a transaction's and its
+ * subtransactions' changes.
+ *
+ * We do this by doing a k-way merge between transactions/subtransactions. For that
+ * we model the current heads of the different transactions as a binary heap
+ * so we easily know which (sub-)transaction has the change with the smallest
+ * lsn next.
+ *
+ * We assume the changes in individual transactions are already sorted by LSN.
+ */
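To make the scheme concrete, here is an illustrative, self-contained k-way merge over already-sorted change streams, with hypothetical names; it is not part of the patch. The real code below keys a binary heap on each stream's head LSN so the minimum can be found in O(log n), whereas this sketch just uses a linear scan for brevity:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct StreamSketch
    {
        const uint64_t *lsns;       /* ascending LSNs of one (sub-)transaction */
        int             nleft;      /* changes remaining in this stream */
    } StreamSketch;

    static void
    kway_merge(StreamSketch *streams, int nstreams)
    {
        for (;;)
        {
            int     best = -1;
            int     i;

            /* pick the stream whose next change has the smallest LSN */
            for (i = 0; i < nstreams; i++)
                if (streams[i].nleft > 0 &&
                    (best == -1 || streams[i].lsns[0] < streams[best].lsns[0]))
                    best = i;

            if (best == -1)
                break;              /* every stream is exhausted */

            printf("replay change at LSN %llu\n",
                   (unsigned long long) streams[best].lsns[0]);
            streams[best].lsns++;
            streams[best].nleft--;
        }
    }

    int
    main(void)
    {
        uint64_t    a[] = {10, 40, 70};
        uint64_t    b[] = {20, 30};
        StreamSketch streams[] = {{a, 3}, {b, 2}};

        kway_merge(streams, 2);     /* prints 10, 20, 30, 40, 70 */
        return 0;
    }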
+
+/*
+ * Binary heap comparison function.
+ */
+static int
+ReorderBufferIterCompare(Datum a, Datum b, void *arg)
+{
+ ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
+ XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
+ XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
+
+ if (pos_a < pos_b)
+ return 1;
+ else if (pos_a == pos_b)
+ return 0;
+ return -1;
+}
+
+/*
+ * Allocate & initialize an iterator which iterates in lsn order over a
+ * transaction and all its subtransactions.
+ */
+static ReorderBufferIterTXNState *
+ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ Size nr_txns = 0;
+ ReorderBufferIterTXNState *state;
+ dlist_iter cur_txn_i;
+ int32 off;
+
+ /*
+ * Calculate the size of our heap: one element for every transaction that
+ * contains changes. (Besides the transactions already in the reorder
+ * buffer, we count the one we were directly passed.)
+ */
+ if (txn->nentries > 0)
+ nr_txns++;
+
+ dlist_foreach(cur_txn_i, &txn->subtxns)
+ {
+ ReorderBufferTXN *cur_txn;
+
+ cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
+
+ if (cur_txn->nentries > 0)
+ nr_txns++;
+ }
+
+ /*
+ * TODO: Consider adding fastpath for the rather common nr_txns=1 case, no
+ * need to allocate/build a heap then.
+ */
+
+ /* allocate iteration state */
+ state = (ReorderBufferIterTXNState *)
+ MemoryContextAllocZero(rb->context,
+ sizeof(ReorderBufferIterTXNState) +
+ sizeof(ReorderBufferIterTXNEntry) * nr_txns);
+
+ state->nr_txns = nr_txns;
+ dlist_init(&state->old_change);
+
+ for (off = 0; off < state->nr_txns; off++)
+ {
+ state->entries[off].fd = -1;
+ state->entries[off].segno = 0;
+ }
+
+ /* allocate heap */
+ state->heap = binaryheap_allocate(state->nr_txns,
+ ReorderBufferIterCompare,
+ state);
+
+ /*
+ * Now insert items into the binary heap, in an unordered fashion. (We
+ * will run a heap assembly step at the end; this is more efficient.)
+ */
+
+ off = 0;
+
+ /* add toplevel transaction if it contains changes */
+ if (txn->nentries > 0)
+ {
+ ReorderBufferChange *cur_change;
+
+ if (txn->nentries != txn->nentries_mem)
+ ReorderBufferRestoreChanges(rb, txn, &state->entries[off].fd,
+ &state->entries[off].segno);
+
+ cur_change = dlist_head_element(ReorderBufferChange, node,
+ &txn->changes);
+
+ state->entries[off].lsn = cur_change->lsn;
+ state->entries[off].change = cur_change;
+ state->entries[off].txn = txn;
+
+ binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
+ }
+
+ /* add subtransactions if they contain changes */
+ dlist_foreach(cur_txn_i, &txn->subtxns)
+ {
+ ReorderBufferTXN *cur_txn;
+
+ cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
+
+ if (cur_txn->nentries > 0)
+ {
+ ReorderBufferChange *cur_change;
+
+ if (cur_txn->nentries != cur_txn->nentries_mem)
+ ReorderBufferRestoreChanges(rb, cur_txn,
+ &state->entries[off].fd,
+ &state->entries[off].segno);
+
+ cur_change = dlist_head_element(ReorderBufferChange, node,
+ &cur_txn->changes);
+
+ state->entries[off].lsn = cur_change->lsn;
+ state->entries[off].change = cur_change;
+ state->entries[off].txn = cur_txn;
+
+ binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
+ }
+ }
+
+ /* assemble a valid binary heap */
+ binaryheap_build(state->heap);
+
+ return state;
+}
+
+/*
+ * Return the next change when iterating over a transaction and its
+ * subtransactions.
+ *
+ * Returns NULL when no further changes exist.
+ */
+static ReorderBufferChange *
+ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
+{
+ ReorderBufferChange *change;
+ ReorderBufferIterTXNEntry *entry;
+ int32 off;
+
+ /* nothing there anymore */
+ if (state->heap->bh_size == 0)
+ return NULL;
+
+ off = DatumGetInt32(binaryheap_first(state->heap));
+ entry = &state->entries[off];
+
+ /* free memory we might have "leaked" in the previous *Next call */
+ if (!dlist_is_empty(&state->old_change))
+ {
+ change = dlist_container(ReorderBufferChange, node,
+ dlist_pop_head_node(&state->old_change));
+ ReorderBufferReturnChange(rb, change);
+ Assert(dlist_is_empty(&state->old_change));
+ }
+
+ change = entry->change;
+
+ /*
+ * update heap with information about which transaction has the next
+ * relevant change in LSN order
+ */
+
+ /* there are in-memory changes */
+ if (dlist_has_next(&entry->txn->changes, &entry->change->node))
+ {
+ dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
+ ReorderBufferChange *next_change =
+ dlist_container(ReorderBufferChange, node, next);
+
+ /* txn stays the same */
+ state->entries[off].lsn = next_change->lsn;
+ state->entries[off].change = next_change;
+
+ binaryheap_replace_first(state->heap, Int32GetDatum(off));
+ return change;
+ }
+
+ /* try to load changes from disk */
+ if (entry->txn->nentries != entry->txn->nentries_mem)
+ {
+ /*
+ * Ugly: restoring changes will reuse *Change records, so delete the
+ * current one from the per-tx list and only free it in the next call.
+ */
+ dlist_delete(&change->node);
+ dlist_push_tail(&state->old_change, &change->node);
+
+ if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->fd,
+ &state->entries[off].segno))
+ {
+ /* successfully restored changes from disk */
+ ReorderBufferChange *next_change =
+ dlist_head_element(ReorderBufferChange, node,
+ &entry->txn->changes);
+
+ elog(DEBUG2, "restored %u/%u changes from disk",
+ (uint32) entry->txn->nentries_mem,
+ (uint32) entry->txn->nentries);
+
+ Assert(entry->txn->nentries_mem);
+ /* txn stays the same */
+ state->entries[off].lsn = next_change->lsn;
+ state->entries[off].change = next_change;
+ binaryheap_replace_first(state->heap, Int32GetDatum(off));
+
+ return change;
+ }
+ }
+
+ /* ok, no changes there anymore, remove */
+ binaryheap_remove_first(state->heap);
+
+ return change;
+}
+
+/*
+ * Deallocate the iterator
+ */
+static void
+ReorderBufferIterTXNFinish(ReorderBuffer *rb,
+ ReorderBufferIterTXNState *state)
+{
+ int32 off;
+
+ for (off = 0; off < state->nr_txns; off++)
+ {
+ if (state->entries[off].fd != -1)
+ CloseTransientFile(state->entries[off].fd);
+ }
+
+ /* free memory we might have "leaked" in the last *Next call */
+ if (!dlist_is_empty(&state->old_change))
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node,
+ dlist_pop_head_node(&state->old_change));
+ ReorderBufferReturnChange(rb, change);
+ Assert(dlist_is_empty(&state->old_change));
+ }
+
+ binaryheap_free(state->heap);
+ pfree(state);
+}
+
+/*
+ * Cleanup the contents of a transaction, usually after the transaction
+ * committed or aborted.
+ */
+static void
+ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ bool found;
+ dlist_mutable_iter iter;
+
+ /* cleanup subtransactions & their changes */
+ dlist_foreach_modify(iter, &txn->subtxns)
+ {
+ ReorderBufferTXN *subtxn;
+
+ subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
+
+ /*
+ * Subtransactions are always associated with the toplevel TXN, even if
+ * they originally were happening inside another subtxn, so we won't
+ * ever recurse more than one level deep here.
+ */
+ Assert(subtxn->is_known_as_subxact);
+ Assert(subtxn->nsubtxns == 0);
+
+ ReorderBufferCleanupTXN(rb, subtxn);
+ }
+
+ /* cleanup changes in the toplevel txn */
+ dlist_foreach_modify(iter, &txn->changes)
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, iter.cur);
+
+ ReorderBufferReturnChange(rb, change);
+ }
+
+ /*
+ * Cleanup the tuplecids we stored for decoding catalog snapshot
+ * access. They are always stored in the toplevel transaction.
+ */
+ dlist_foreach_modify(iter, &txn->tuplecids)
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, iter.cur);
+ Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
+ ReorderBufferReturnChange(rb, change);
+ }
+
+ if (txn->base_snapshot != NULL)
+ {
+ SnapBuildSnapDecRefcount(txn->base_snapshot);
+ txn->base_snapshot = NULL;
+ txn->base_snapshot_lsn = InvalidXLogRecPtr;
+ }
+
+ /* delete from list of known subxacts */
+ if (txn->is_known_as_subxact)
+ {
+ /* NB: nsubxacts count of parent will be too high now */
+ dlist_delete(&txn->node);
+ }
+ /* delete from LSN ordered list of toplevel TXNs */
+ else
+ {
+ dlist_delete(&txn->node);
+ }
+
+ /* now remove reference from buffer */
+ hash_search(rb->by_txn,
+ (void *) &txn->xid,
+ HASH_REMOVE,
+ &found);
+ Assert(found);
+
+ /* remove entries spilled to disk */
+ if (txn->nentries != txn->nentries_mem)
+ ReorderBufferRestoreCleanup(rb, txn);
+
+ /* deallocate */
+ ReorderBufferReturnTXN(rb, txn);
+}
+
+/*
+ * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by
+ * tqual.c's HeapTupleSatisfiesHistoricMVCC.
+ */
+static void
+ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ dlist_iter iter;
+ HASHCTL hash_ctl;
+
+ if (!txn->has_catalog_changes || dlist_is_empty(&txn->tuplecids))
+ return;
+
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+
+ hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
+ hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = rb->context;
+
+ /*
+ * create the hash with the exact number of to-be-stored tuplecids from
+ * the start
+ */
+ txn->tuplecid_hash =
+ hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+ dlist_foreach(iter, &txn->tuplecids)
+ {
+ ReorderBufferTupleCidKey key;
+ ReorderBufferTupleCidEnt *ent;
+ bool found;
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, iter.cur);
+
+ Assert(change->action_internal == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
+
+ /* be careful about padding */
+ memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
+
+ key.relnode = change->tuplecid.node;
+
+ ItemPointerCopy(&change->tuplecid.tid,
+ &key.tid);
+
+ ent = (ReorderBufferTupleCidEnt *)
+ hash_search(txn->tuplecid_hash,
+ (void *) &key,
+ HASH_ENTER | HASH_FIND,
+ &found);
+ if (!found)
+ {
+ ent->cmin = change->tuplecid.cmin;
+ ent->cmax = change->tuplecid.cmax;
+ ent->combocid = change->tuplecid.combocid;
+ }
+ else
+ {
+ Assert(ent->cmin == change->tuplecid.cmin);
+ Assert(ent->cmax == InvalidCommandId ||
+ ent->cmax == change->tuplecid.cmax);
+
+ /*
+ * If the tuple became valid in this transaction and now got deleted,
+ * we already have a valid cmin stored. The cmax will be
+ * InvalidCommandId though.
+ */
+ ent->cmax = change->tuplecid.cmax;
+ }
+ }
+}
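The "be careful about padding" step above matters because ReorderBufferTupleCidKey is hashed and compared as raw bytes by dynahash: any compiler-inserted padding must be zeroed before the fields are assigned, or two logically equal keys can hash differently. A minimal, self-contained illustration of the pattern, with hypothetical names and a stand-in hash function:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    typedef struct KeySketch
    {
        uint16_t    a;              /* typically followed by 2 padding bytes */
        uint32_t    b;
    } KeySketch;

    /* stand-in for tag_hash: FNV-1a over the raw key bytes */
    static uint32_t
    hash_bytes_sketch(const void *p, size_t len)
    {
        const unsigned char *c = (const unsigned char *) p;
        uint32_t    h = 2166136261u;
        size_t      i;

        for (i = 0; i < len; i++)
            h = (h ^ c[i]) * 16777619u;
        return h;
    }

    static uint32_t
    make_key_hash(uint16_t a, uint32_t b)
    {
        KeySketch   key;

        memset(&key, 0, sizeof(key));   /* zero any padding bytes first */
        key.a = a;
        key.b = b;
        return hash_bytes_sketch(&key, sizeof(key));
    }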
+
+/*
+ * Copy a provided snapshot so we can modify it privately. This is needed so
+ * that catalog modifying transactions can look into intermediate catalog
+ * states.
+ */
+static Snapshot
+ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
+ ReorderBufferTXN *txn, CommandId cid)
+{
+ Snapshot snap;
+ dlist_iter iter;
+ int i = 0;
+ Size size;
+
+ size = sizeof(SnapshotData) +
+ sizeof(TransactionId) * orig_snap->xcnt +
+ sizeof(TransactionId) * (txn->nsubtxns + 1);
+
+ snap = MemoryContextAllocZero(rb->context, size);
+ memcpy(snap, orig_snap, sizeof(SnapshotData));
+
+ snap->copied = true;
+ snap->active_count = 0;
+ snap->regd_count = 1;
+ snap->xip = (TransactionId *) (snap + 1);
+
+ memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
+
+ /*
+ * snap->subxip contains all txids that belong to our transaction which we
+ * need to check via cmin/cmax. That's why we store the toplevel
+ * transaction in there as well.
+ */
+ snap->subxip = snap->xip + snap->xcnt;
+ snap->subxip[i++] = txn->xid;
+
+ /*
+ * nsubxcnt isn't decreased when subtransactions abort, so count
+ * manually. Since it's an upper boundary it is safe to use it for the
+ * allocation above.
+ */
+ snap->subxcnt = 1;
+
+ dlist_foreach(iter, &txn->subtxns)
+ {
+ ReorderBufferTXN *sub_txn;
+
+ sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
+ snap->subxip[i++] = sub_txn->xid;
+ snap->subxcnt++;
+ }
+
+ /* sort so we can bsearch() later */
+ qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
+
+ /* store the specified current CommandId */
+ snap->curcid = cid;
+
+ return snap;
+}
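One detail worth noting in the copy above: the snapshot and its xid arrays live in a single allocation, with snap->xip pointed just past the SnapshotData struct, so a plain pfree() of the copied snapshot releases everything at once. A minimal, self-contained sketch of that layout with hypothetical names:

    #include <stdlib.h>
    #include <string.h>

    typedef unsigned int XidSketch;

    typedef struct SnapSketch
    {
        int         xcnt;
        XidSketch  *xip;            /* points just past the struct itself */
    } SnapSketch;

    static SnapSketch *
    snap_copy(const XidSketch *xids, int xcnt)
    {
        /* one allocation holds the struct plus its trailing xid array
         * (error handling omitted for brevity) */
        SnapSketch *snap = calloc(1, sizeof(SnapSketch) +
                                  sizeof(XidSketch) * xcnt);

        snap->xcnt = xcnt;
        snap->xip = (XidSketch *) (snap + 1);
        memcpy(snap->xip, xids, sizeof(XidSketch) * xcnt);
        return snap;                /* a single free(snap) releases everything */
    }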
+
+/*
+ * Free a previously ReorderBufferCopySnap'ed snapshot
+ */
+static void
+ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
+{
+ if (snap->copied)
+ pfree(snap);
+ else
+ SnapBuildSnapDecRefcount(snap);
+}
+
+/*
+ * Perform the replay of a transaction and its non-aborted subtransactions.
+ *
+ * Subtransactions previously have to be processed by
+ * ReorderBufferCommitChild(), even if previously assigned to the toplevel
+ * transaction with ReorderBufferAssignChild.
+ *
+ * We currently can only decode a transaction's contents when its commit
+ * record is read, because that's currently the only place where we know about
+ * cache invalidations. Thus, once a toplevel commit is read, we iterate over
+ * the top and subtransactions (using a k-way merge) and replay the changes in
+ * lsn order.
+ */
+void
+ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
+ TimestampTz commit_time)
+{
+ ReorderBufferTXN *txn;
+ ReorderBufferIterTXNState *iterstate = NULL;
+ ReorderBufferChange *change;
+
+ volatile CommandId command_id = FirstCommandId;
+ volatile Snapshot snapshot_now = NULL;
+ volatile bool txn_started = false;
+ volatile bool subtxn_started = false;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* unknown transaction, nothing to replay */
+ if (txn == NULL)
+ return;
+
+ txn->final_lsn = commit_lsn;
+ txn->end_lsn = end_lsn;
+ txn->commit_time = commit_time;
+
+ /* serialize the remaining changes if we already had to spill some earlier */
+ if (txn->nentries_mem != txn->nentries)
+ ReorderBufferSerializeTXN(rb, txn);
+
+ /*
+ * If this transaction didn't have any real changes in our database, it's
+ * OK not to have a snapshot. Note that ReorderBufferCommitChild will have
+ * transferred its snapshot to this transaction if it had one and the
+ * toplevel tx didn't.
+ */
+ if (txn->base_snapshot == NULL)
+ {
+ Assert(txn->ninvalidations == 0);
+ ReorderBufferCleanupTXN(rb, txn);
+ return;
+ }
+
+ snapshot_now = txn->base_snapshot;
+
+ /* build data to be able to lookup the CommandIds of catalog tuples */
+ ReorderBufferBuildTupleCidHash(rb, txn);
+
+ /* setup the initial snapshot */
+ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
+
+ PG_TRY();
+ {
+ txn_started = false;
+
+ /*
+ * Decoding needs access to syscaches et al., which in turn use
+ * heavyweight locks and such. Thus we need to have enough state around
+ * to keep track of those. The easiest way is to simply use a
+ * transaction internally. That also allows us to easily enforce that
+ * nothing writes to the database by checking for xid assignments.
+ *
+ * When we're called via the SQL SRF there's already a transaction
+ * started, so start an explicit subtransaction there.
+ */
+ if (IsTransactionOrTransactionBlock())
+ {
+ BeginInternalSubTransaction("replay");
+ subtxn_started = true;
+ }
+ else
+ {
+ StartTransactionCommand();
+ txn_started = true;
+ }
+
+ rb->begin(rb, txn);
+
+ iterstate = ReorderBufferIterTXNInit(rb, txn);
+ while ((change = ReorderBufferIterTXNNext(rb, iterstate)))
+ {
+ Relation relation = NULL;
+ Oid reloid;
+
+ switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+ {
+ case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+ case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+ case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+ Assert(snapshot_now);
+
+ reloid = RelidByRelfilenode(change->tp.relnode.spcNode,
+ change->tp.relnode.relNode);
+
+ /*
+ * Catalog tuple without data, emitted while catalog was
+ * in the process of being rewritten.
+ */
+ if (reloid == InvalidOid &&
+ change->tp.newtuple == NULL &&
+ change->tp.oldtuple == NULL)
+ continue;
+ else if (reloid == InvalidOid)
+ elog(ERROR, "could not lookup relation %s",
+ relpathperm(change->tp.relnode, MAIN_FORKNUM));
+
+ relation = RelationIdGetRelation(reloid);
+
+ if (relation == NULL)
+ elog(ERROR, "could open relation descriptor %s",
+ relpathperm(change->tp.relnode, MAIN_FORKNUM));
+
+ if (RelationIsLogicallyLogged(relation))
+ {
+ /*
+ * For now ignore sequence changes entirely. Most of
+ * the time they don't log changes using records we
+ * understand, so it doesn't make sense to handle the
+ * few cases we do.
+ */
+ if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
+ {
+ }
+ /* user-triggered change */
+ else if (!IsToastRelation(relation))
+ {
+ ReorderBufferToastReplace(rb, txn, relation, change);
+ rb->apply_change(rb, txn, relation, change);
+ ReorderBufferToastReset(rb, txn);
+ }
+ /* we're not interested in toast deletions */
+ else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
+ {
+ /*
+ * Need to reassemble the full toasted Datum in
+ * memory, to ensure the chunks don't get reused
+ * till we're done; remove it from the list of
+ * this transaction's changes. Otherwise it will
+ * get freed/reused while restoring spooled data
+ * from disk.
+ */
+ dlist_delete(&change->node);
+ ReorderBufferToastAppendChunk(rb, txn, relation,
+ change);
+ }
+
+ }
+ RelationClose(relation);
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+ /* get rid of the old */
+ TeardownHistoricSnapshot(false);
+
+ if (snapshot_now->copied)
+ {
+ ReorderBufferFreeSnap(rb, snapshot_now);
+ snapshot_now =
+ ReorderBufferCopySnap(rb, change->snapshot,
+ txn, command_id);
+ }
+ /*
+ * Restored from disk, need to be careful not to double
+ * free. We could introduce refcounting for that, but for
+ * now this seems infrequent enough not to care.
+ */
+ else if (change->snapshot->copied)
+ {
+ snapshot_now =
+ ReorderBufferCopySnap(rb, change->snapshot,
+ txn, command_id);
+ }
+ else
+ {
+ snapshot_now = change->snapshot;
+ }
+
+
+ /* and continue with the new one */
+ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
+ break;
+
+ case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+ Assert(change->command_id != InvalidCommandId);
+
+ if (command_id < change->command_id)
+ {
+ command_id = change->command_id;
+
+ if (!snapshot_now->copied)
+ {
+ /* we don't use the global one anymore */
+ snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
+ txn, command_id);
+ }
+
+ snapshot_now->curcid = command_id;
+
+ TeardownHistoricSnapshot(false);
+ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
+
+ /*
+ * Every time the CommandId is incremented, we could
+ * see new catalog contents, so execute all
+ * invalidations.
+ */
+ ReorderBufferExecuteInvalidations(rb, txn);
+ }
+
+ break;
+
+ case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+ elog(ERROR, "tuplecid value in changequeue");
+ break;
+ }
+ }
+
+ ReorderBufferIterTXNFinish(rb, iterstate);
+
+ /* call commit callback */
+ rb->commit(rb, txn, commit_lsn);
+
+ /* this is just a sanity check against bad output plugin behaviour */
+ if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
+ elog(ERROR, "output plugin used xid %u",
+ GetCurrentTransactionId());
+
+ /* make sure there's no cache pollution */
+ ReorderBufferExecuteInvalidations(rb, txn);
+
+ /* cleanup */
+ TeardownHistoricSnapshot(false);
+
+ /*
+ * Aborting the current (sub-)transaction as a whole has the right
+ * semantics. We want all locks acquired in here to be released, not
+ * reassigned to the parent, and we do not want any database access to
+ * have persistent effects.
+ */
+ if (subtxn_started)
+ RollbackAndReleaseCurrentSubTransaction();
+ else if (txn_started)
+ AbortCurrentTransaction();
+
+ if (snapshot_now->copied)
+ ReorderBufferFreeSnap(rb, snapshot_now);
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+ }
+ PG_CATCH();
+ {
+ /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
+ if (iterstate)
+ ReorderBufferIterTXNFinish(rb, iterstate);
+
+ TeardownHistoricSnapshot(true);
+
+ if (snapshot_now->copied)
+ ReorderBufferFreeSnap(rb, snapshot_now);
+
+ if (subtxn_started)
+ RollbackAndReleaseCurrentSubTransaction();
+ else if (txn_started)
+ AbortCurrentTransaction();
+
+ /*
+ * Invalidations in aborted transactions aren't allowed to do catalog
+ * access, so we don't need to still have the snapshot set up.
+ */
+ ReorderBufferExecuteInvalidations(rb, txn);
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+}
+
+/*
+ * Abort a transaction that possibly has previous changes. Needs to be first
+ * called for subtransactions and then for the toplevel xid.
+ *
+ * NB: Transactions handled here have to have actively aborted (i.e. have
+ * produced an abort record). Implicitly aborted transactions are handled via
+ * ReorderBufferAbortOld(); transactions we're just not interested in, but
+ * which have committed are handled in ReorderBufferForget().
+ *
+ * This function purges this transaction and its contents from memory and
+ * disk.
+ */
+void
+ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* unknown, nothing to remove */
+ if (txn == NULL)
+ return;
+
+ /* cosmetic... */
+ txn->final_lsn = lsn;
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+}
+
+/*
+ * Abort all transactions that aren't actually running anymore because the
+ * server restarted.
+ *
+ * NB: These really have to be transactions that have aborted due to a server
+ * crash/immediate restart, as we don't deal with invalidations here.
+ */
+void
+ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
+{
+ dlist_mutable_iter it;
+
+ /*
+ * Iterate through all (potential) toplevel TXNs and abort all that are
+ * older than what possibly can be running. Once we've found the first one
+ * that is still alive we stop; there might be some that acquired an xid
+ * earlier but started writing later, but that's unlikely and they will be
+ * cleaned up in a later call to ReorderBufferAbortOld().
+ */
+ dlist_foreach_modify(it, &rb->toplevel_by_lsn)
+ {
+ ReorderBufferTXN * txn;
+
+ txn = dlist_container(ReorderBufferTXN, node, it.cur);
+
+ if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
+ {
+ elog(DEBUG1, "aborting old transaction %u", txn->xid);
+
+ /* remove potential on-disk data, and deallocate this tx */
+ ReorderBufferCleanupTXN(rb, txn);
+ }
+ else
+ return;
+ }
+}
+
+/*
+ * Forget the contents of a transaction if we aren't interested in its
+ * contents. Needs to be first called for subtransactions and then for the
+ * toplevel xid.
+ *
+ * This is significantly different from ReorderBufferAbort() because
+ * transactions that have committed need to be treated differently from aborted
+ * ones since they may have modified the catalog.
+ *
+ * Note that this is only allowed to be called in the moment a transaction
+ * commit has just been read, not earlier; otherwise later records referring
+ * to this xid might re-create the transaction incompletely.
+ */
+void
+ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* unknown, nothing to forget */
+ if (txn == NULL)
+ return;
+
+ /* cosmetic... */
+ txn->final_lsn = lsn;
+
+ /*
+ * Process cache invalidation messages if there are any. Even if we're
+ * not interested in the transaction's contents, it could have manipulated
+ * the catalog and we need to update the caches according to that.
+ */
+ if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
+ {
+ /* setup snapshot to perform the invalidations in */
+ SetupHistoricSnapshot(txn->base_snapshot, txn->tuplecid_hash);
+ PG_TRY();
+ {
+ ReorderBufferExecuteInvalidations(rb, txn);
+ TeardownHistoricSnapshot(false);
+ }
+ PG_CATCH();
+ {
+ /* cleanup */
+ TeardownHistoricSnapshot(true);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+ }
+ else
+ Assert(txn->ninvalidations == 0);
+
+ /* remove potential on-disk data, and deallocate */
+ ReorderBufferCleanupTXN(rb, txn);
+}
+
+
+/*
+ * Check whether a transaction is already known to this module.
+ */
+bool
+ReorderBufferIsXidKnown(ReorderBuffer *rb, TransactionId xid)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+ return txn != NULL;
+}
+
+/*
+ * Add a new snapshot to this transaction that may only be used after lsn 'lsn'
+ * because the previous snapshot doesn't describe the catalog correctly for
+ * following rows.
+ */
+void
+ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, Snapshot snap)
+{
+ ReorderBufferChange *change = ReorderBufferGetChange(rb);
+
+ change->snapshot = snap;
+ change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
+
+ ReorderBufferQueueChange(rb, xid, lsn, change);
+}
+
+/*
+ * Setup the base snapshot of a transaction. The base snapshot is the snapshot
+ * that is used to decode all changes until either this transaction modifies
+ * the catalog or another catalog modifying transaction commits.
+ *
+ * Needs to be called before any changes are added with
+ * ReorderBufferQueueChange().
+ */
+void
+ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, Snapshot snap)
+{
+ ReorderBufferTXN *txn;
+ bool is_new;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
+ Assert(txn->base_snapshot == NULL);
+ Assert(snap != NULL);
+
+ txn->base_snapshot = snap;
+ txn->base_snapshot_lsn = lsn;
+}
+
+/*
+ * Access the catalog with this CommandId at this point in the changestream.
+ *
+ * May only be called for command ids > 1
+ */
+void
+ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, CommandId cid)
+{
+ ReorderBufferChange *change = ReorderBufferGetChange(rb);
+
+ change->command_id = cid;
+ change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
+
+ ReorderBufferQueueChange(rb, xid, lsn, change);
+}
+
+
+/*
+ * Add new (relfilenode, tid) -> (cmin, cmax) mappings.
+ */
+void
+ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, RelFileNode node,
+ ItemPointerData tid, CommandId cmin,
+ CommandId cmax, CommandId combocid)
+{
+ ReorderBufferChange *change = ReorderBufferGetChange(rb);
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+ change->tuplecid.node = node;
+ change->tuplecid.tid = tid;
+ change->tuplecid.cmin = cmin;
+ change->tuplecid.cmax = cmax;
+ change->tuplecid.combocid = combocid;
+ change->lsn = lsn;
+ change->action_internal = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
+
+ dlist_push_tail(&txn->tuplecids, &change->node);
+ txn->ntuplecids++;
+}
+
+/*
+ * Setup the invalidation of the toplevel transaction.
+ *
+ * This needs to be done before ReorderBufferCommit is called!
+ */
+void
+ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn, Size nmsgs,
+ SharedInvalidationMessage *msgs)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+ if (txn->ninvalidations != 0)
+ elog(ERROR, "only ever add one set of invalidations");
+
+ Assert(nmsgs > 0);
+
+ txn->ninvalidations = nmsgs;
+ txn->invalidations = (SharedInvalidationMessage *)
+ MemoryContextAlloc(rb->context,
+ sizeof(SharedInvalidationMessage) * nmsgs);
+ memcpy(txn->invalidations, msgs,
+ sizeof(SharedInvalidationMessage) * nmsgs);
+}
+
+/*
+ * Apply all invalidations we know. Possibly we only need parts at this point
+ * in the changestream but we don't know which those are.
+ */
+static void
+ReorderBufferExecuteInvalidations(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ int i;
+
+ for (i = 0; i < txn->ninvalidations; i++)
+ LocalExecuteInvalidationMessage(&txn->invalidations[i]);
+}
+
+/*
+ * Mark a transaction as containing catalog changes
+ */
+void
+ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
+ XLogRecPtr lsn)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
+
+ txn->has_catalog_changes = true;
+}
+
+/*
+ * Query whether a transaction is already *known* to contain catalog
+ * changes. This can be wrong until directly before the commit!
+ */
+bool
+ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+ if (txn == NULL)
+ return false;
+
+ return txn->has_catalog_changes;
+}
+
+/*
+ * Have we already added the first snapshot?
+ */
+bool
+ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
+{
+ ReorderBufferTXN *txn;
+
+ txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
+ false);
+
+ /* transaction isn't known yet, ergo no snapshot */
+ if (txn == NULL)
+ return false;
+
+ /*
+ * TODO: It would be a nice improvement if we would check the toplevel
+ * transaction in subtransactions, but we'd need to keep track of a bit
+ * more state.
+ */
+ return txn->base_snapshot != NULL;
+}
+
+
+/*
+ * ---------------------------------------
+ * Disk serialization support
+ * ---------------------------------------
+ */
+
+/*
+ * Ensure the IO buffer is >= sz.
+ */
+static void
+ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
+{
+ if (!rb->outbufsize)
+ {
+ rb->outbuf = MemoryContextAlloc(rb->context, sz);
+ rb->outbufsize = sz;
+ }
+ else if (rb->outbufsize < sz)
+ {
+ rb->outbuf = repalloc(rb->outbuf, sz);
+ rb->outbufsize = sz;
+ }
+}
+
+/*
+ * Check whether the transaction tx should spill its data to disk.
+ */
+static void
+ReorderBufferCheckSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ /*
+ * TODO: improve accounting so we cheaply can take subtransactions into
+ * account here.
+ */
+ if (txn->nentries_mem >= max_changes_in_memory)
+ {
+ ReorderBufferSerializeTXN(rb, txn);
+ Assert(txn->nentries_mem == 0);
+ }
+}
+
+/*
+ * Spill data of a large transaction (and its subtransactions) to disk.
+ */
+static void
+ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ dlist_iter subtxn_i;
+ dlist_mutable_iter change_i;
+ int fd = -1;
+ XLogSegNo curOpenSegNo = 0;
+ Size spilled = 0;
+ char path[MAXPGPATH];
+
+ elog(DEBUG2, "spill %u changes in tx %u to disk",
+ (uint32) txn->nentries_mem, txn->xid);
+
+ /* do the same to all child TXs */
+ dlist_foreach(subtxn_i, &txn->subtxns)
+ {
+ ReorderBufferTXN *subtxn;
+
+ subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
+ ReorderBufferSerializeTXN(rb, subtxn);
+ }
+
+ /* serialize changestream */
+ dlist_foreach_modify(change_i, &txn->changes)
+ {
+ ReorderBufferChange *change;
+
+ change = dlist_container(ReorderBufferChange, node, change_i.cur);
+
+ /*
+ * Store the change in the segment to which it belongs by start LSN;
+ * don't split a change over multiple segments though.
+ */
+ if (fd == -1 || !XLByteInSeg(change->lsn, curOpenSegNo))
+ {
+ XLogRecPtr recptr;
+
+ if (fd != -1)
+ CloseTransientFile(fd);
+
+ XLByteToSeg(change->lsn, curOpenSegNo);
+ XLogSegNoOffsetToRecPtr(curOpenSegNo, 0, recptr);
+
+ /*
+ * No need to care about TLIs here, only used during a single run,
+ * so each LSN only maps to a specific WAL record.
+ */
+ sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap",
+ NameStr(MyReplicationSlot->data.name), txn->xid,
+ (uint32) (recptr >> 32), (uint32) recptr);
+
+ /* open segment, create it if necessary */
+ fd = OpenTransientFile(path,
+ O_CREAT | O_WRONLY | O_APPEND | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ path)));
+ }
+
+ ReorderBufferSerializeChange(rb, txn, fd, change);
+ dlist_delete(&change->node);
+ ReorderBufferReturnChange(rb, change);
+
+ spilled++;
+ }
+
+ Assert(spilled == txn->nentries_mem);
+ Assert(dlist_is_empty(&txn->changes));
+ txn->nentries_mem = 0;
+
+ if (fd != -1)
+ CloseTransientFile(fd);
+}
+
+/*
+ * Serialize individual change to disk.
+ */
+static void
+ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ int fd, ReorderBufferChange *change)
+{
+ ReorderBufferDiskChange *ondisk;
+ Size sz = sizeof(ReorderBufferDiskChange);
+
+ ReorderBufferSerializeReserve(rb, sz);
+
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+ memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
+
+ switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+ {
+ case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+ /* fall through */
+ case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+ /* fall through */
+ case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+ {
+ char *data;
+ Size oldlen = 0;
+ Size newlen = 0;
+
+ if (change->tp.oldtuple)
+ oldlen = offsetof(ReorderBufferTupleBuf, data)
+ + change->tp.oldtuple->tuple.t_len
+ - offsetof(HeapTupleHeaderData, t_bits);
+
+ if (change->tp.newtuple)
+ newlen = offsetof(ReorderBufferTupleBuf, data)
+ + change->tp.newtuple->tuple.t_len
+ - offsetof(HeapTupleHeaderData, t_bits);
+
+ sz += oldlen;
+ sz += newlen;
+
+ /* make sure we have enough space */
+ ReorderBufferSerializeReserve(rb, sz);
+
+ data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
+ /* might have been reallocated above */
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+ if (oldlen)
+ {
+ memcpy(data, change->tp.oldtuple, oldlen);
+ data += oldlen;
+ Assert(&change->tp.oldtuple->header == change->tp.oldtuple->tuple.t_data);
+ }
+
+ if (newlen)
+ {
+ memcpy(data, change->tp.newtuple, newlen);
+ data += newlen;
+ Assert(&change->tp.newtuple->header == change->tp.newtuple->tuple.t_data);
+ }
+ break;
+ }
+ case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+ {
+ char *data;
+
+ sz += sizeof(SnapshotData) +
+ sizeof(TransactionId) * change->snapshot->xcnt +
+ sizeof(TransactionId) * change->snapshot->subxcnt
+ ;
+
+ /* make sure we have enough space */
+ ReorderBufferSerializeReserve(rb, sz);
+ data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
+ /* might have been reallocated above */
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+ memcpy(data, change->snapshot, sizeof(SnapshotData));
+ data += sizeof(SnapshotData);
+
+ if (change->snapshot->xcnt)
+ {
+ memcpy(data, change->snapshot->xip,
+ sizeof(TransactionId) * change->snapshot->xcnt);
+ data += sizeof(TransactionId) * change->snapshot->xcnt;
+ }
+
+ if (change->snapshot->subxcnt)
+ {
+ memcpy(data, change->snapshot->subxip,
+ sizeof(TransactionId) * change->snapshot->subxcnt);
+ data += sizeof(TransactionId) * change->snapshot->subxcnt;
+ }
+ break;
+ }
+ case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+ /* ReorderBufferChange contains everything important */
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+ /* ReorderBufferChange contains everything important */
+ break;
+ }
+
+ ondisk->size = sz;
+
+ if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to xid %u's data file: %m",
+ txn->xid)));
+ }
+
+ Assert(ondisk->change.action_internal == change->action_internal);
+}
+
+/*
+ * Restore a number of changes spilled to disk back into memory.
+ */
+static Size
+ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ int *fd, XLogSegNo *segno)
+{
+ Size restored = 0;
+ XLogSegNo last_segno;
+ dlist_mutable_iter cleanup_iter;
+
+ Assert(txn->first_lsn != InvalidXLogRecPtr);
+ Assert(txn->final_lsn != InvalidXLogRecPtr);
+
+ /* free current entries, so we have memory for more */
+ dlist_foreach_modify(cleanup_iter, &txn->changes)
+ {
+ ReorderBufferChange *cleanup =
+ dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
+
+ dlist_delete(&cleanup->node);
+ ReorderBufferReturnChange(rb, cleanup);
+ }
+ txn->nentries_mem = 0;
+ Assert(dlist_is_empty(&txn->changes));
+
+ XLByteToSeg(txn->final_lsn, last_segno);
+
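+	/*
+	 * Changes were spilled to one file per WAL segment the transaction
+	 * spans; read them back file by file until we either have restored
+	 * max_changes_in_memory changes or run out of files.
+	 */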
+ while (restored < max_changes_in_memory && *segno <= last_segno)
+ {
+ int readBytes;
+ ReorderBufferDiskChange *ondisk;
+
+ if (*fd == -1)
+ {
+ XLogRecPtr recptr;
+ char path[MAXPGPATH];
+
+ /* first time in */
+ if (*segno == 0)
+ {
+ XLByteToSeg(txn->first_lsn, *segno);
+ }
+
+ Assert(*segno != 0 || dlist_is_empty(&txn->changes));
+ XLogSegNoOffsetToRecPtr(*segno, 0, recptr);
+
+ /*
+ * No need to care about TLIs here, only used during a single run,
+ * so each LSN only maps to a specific WAL record.
+ */
+ sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap",
+ NameStr(MyReplicationSlot->data.name), txn->xid,
+ (uint32) (recptr >> 32), (uint32) recptr);
+
+ *fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+ if (*fd < 0 && errno == ENOENT)
+ {
+ *fd = -1;
+ (*segno)++;
+ continue;
+ }
+ else if (*fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ path)));
+
+ }
+
+ ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
+
+
+ /*
+ * Read the statically sized part of a change which has information
+ * about the total size. If we couldn't read a record, we're at the
+ * end of this file.
+ */
+
+ readBytes = read(*fd, rb->outbuf, sizeof(ReorderBufferDiskChange));
+
+ /* eof */
+ if (readBytes == 0)
+ {
+ CloseTransientFile(*fd);
+ *fd = -1;
+ (*segno)++;
+ continue;
+ }
+ else if (readBytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from reorderbuffer spill file: %m")));
+ else if (readBytes != sizeof(ReorderBufferDiskChange))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("incomplete read from reorderbuffer spill file: read %d instead of %u",
+ readBytes,
+ (uint32) sizeof(ReorderBufferDiskChange))));
+
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+ ReorderBufferSerializeReserve(rb,
+ sizeof(ReorderBufferDiskChange) + ondisk->size);
+ ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+
+ readBytes = read(*fd, rb->outbuf + sizeof(ReorderBufferDiskChange),
+ ondisk->size - sizeof(ReorderBufferDiskChange));
+
+ if (readBytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from reorderbuffer spill file: %m")));
+ else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from reorderbuffer spill file: read %d instead of %u",
+ readBytes,
+ (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
+
+ /*
+ * ok, read a full change from disk, now restore it into proper
+ * in-memory format
+ */
+ ReorderBufferRestoreChange(rb, txn, rb->outbuf);
+ restored++;
+ }
+
+ return restored;
+}
+
+/*
+ * Convert change from its on-disk format to in-memory format and queue it onto
+ * the TXN's ->changes list.
+ */
+static void
+ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ char *data)
+{
+ ReorderBufferDiskChange *ondisk;
+ ReorderBufferChange *change;
+
+ ondisk = (ReorderBufferDiskChange *) data;
+
+ change = ReorderBufferGetChange(rb);
+
+ /* copy static part */
+ memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
+
+ data += sizeof(ReorderBufferDiskChange);
+
+ /* restore individual stuff */
+ switch ((ReorderBufferChangeTypeInternal) change->action_internal)
+ {
+ case REORDER_BUFFER_CHANGE_INTERNAL_INSERT:
+ /* fall through */
+ case REORDER_BUFFER_CHANGE_INTERNAL_UPDATE:
+ /* fall through */
+ case REORDER_BUFFER_CHANGE_INTERNAL_DELETE:
+ if (change->tp.newtuple)
+ {
+ Size len = offsetof(ReorderBufferTupleBuf, data)
+ +((ReorderBufferTupleBuf *) data)->tuple.t_len
+ - offsetof(HeapTupleHeaderData, t_bits);
+
+ change->tp.newtuple = ReorderBufferGetTupleBuf(rb);
+ memcpy(change->tp.newtuple, data, len);
+ change->tp.newtuple->tuple.t_data = &change->tp.newtuple->header;
+
+ data += len;
+ }
+
+ if (change->tp.oldtuple)
+ {
+ Size len = offsetof(ReorderBufferTupleBuf, data)
+ +((ReorderBufferTupleBuf *) data)->tuple.t_len
+ - offsetof(HeapTupleHeaderData, t_bits);
+
+ change->tp.oldtuple = ReorderBufferGetTupleBuf(rb);
+ memcpy(change->tp.oldtuple, data, len);
+ change->tp.oldtuple->tuple.t_data = &change->tp.oldtuple->header;
+ data += len;
+ }
+ break;
+ case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
+ {
+ Snapshot oldsnap = (Snapshot) data;
+				Size		size = sizeof(SnapshotData) +
+					sizeof(TransactionId) * oldsnap->xcnt +
+					sizeof(TransactionId) * oldsnap->subxcnt;
+
+ Assert(change->snapshot != NULL);
+
+ change->snapshot = MemoryContextAllocZero(rb->context, size);
+
+ memcpy(change->snapshot, data, size);
+ change->snapshot->xip = (TransactionId *)
+ (((char *) change->snapshot) + sizeof(SnapshotData));
+ change->snapshot->subxip =
+					change->snapshot->xip + change->snapshot->xcnt;
+ change->snapshot->copied = true;
+ break;
+ }
+ /* the base struct contains all the data, easy peasy */
+ case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
+ case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+ break;
+ }
+
+ dlist_push_tail(&txn->changes, &change->node);
+ txn->nentries_mem++;
+}
+
+/*
+ * Remove all on-disk data stored for the passed-in transaction.
+ */
+static void
+ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ XLogSegNo first;
+ XLogSegNo cur;
+ XLogSegNo last;
+
+ Assert(txn->first_lsn != InvalidXLogRecPtr);
+ Assert(txn->final_lsn != InvalidXLogRecPtr);
+
+ XLByteToSeg(txn->first_lsn, first);
+ XLByteToSeg(txn->final_lsn, last);
+
+ /* iterate over all possible filenames, and delete them */
+ for (cur = first; cur <= last; cur++)
+ {
+ char path[MAXPGPATH];
+ XLogRecPtr recptr;
+
+ XLogSegNoOffsetToRecPtr(cur, 0, recptr);
+
+ sprintf(path, "pg_replslot/%s/xid-%u-lsn-%X-%X.snap",
+ NameStr(MyReplicationSlot->data.name), txn->xid,
+ (uint32) (recptr >> 32), (uint32) recptr);
+ if (unlink(path) != 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m", path)));
+ }
+}
+
+/*
+ * Delete all data spilled to disk after we've restarted/crashed. It will be
+ * recreated when the respective slots are reused.
+ */
+void
+StartupReorderBuffer(void)
+{
+ DIR *logical_dir;
+ struct dirent *logical_de;
+
+ DIR *spill_dir;
+ struct dirent *spill_de;
+
+ logical_dir = AllocateDir("pg_replslot");
+ while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
+ {
+ struct stat statbuf;
+ char path[MAXPGPATH];
+
+ if (strcmp(logical_de->d_name, ".") == 0 ||
+ strcmp(logical_de->d_name, "..") == 0)
+ continue;
+
+ /* if it cannot be a slot, skip the directory */
+ if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
+ continue;
+
+ /*
+ * ok, has to be a surviving logical slot, iterate and delete
+		 * everything starting with xid-*
+ */
+ sprintf(path, "pg_replslot/%s", logical_de->d_name);
+
+		/* we only create directories here, skip if it's not one of ours */
+ if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
+ continue;
+
+ spill_dir = AllocateDir(path);
+ while ((spill_de = ReadDir(spill_dir, path)) != NULL)
+ {
+ if (strcmp(spill_de->d_name, ".") == 0 ||
+ strcmp(spill_de->d_name, "..") == 0)
+ continue;
+
+ /* only look at names that can be ours */
+ if (strncmp(spill_de->d_name, "xid", 3) == 0)
+ {
+ sprintf(path, "pg_replslot/%s/%s", logical_de->d_name,
+ spill_de->d_name);
+
+ if (unlink(path) != 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m",
+ path)));
+ }
+ }
+ FreeDir(spill_dir);
+ }
+ FreeDir(logical_dir);
+}
+
+/* ---------------------------------------
+ * toast reassembly support
+ * ---------------------------------------
+ */
+
+/*
+ * Initialize per tuple toast reconstruction support.
+ */
+static void
+ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ HASHCTL hash_ctl;
+
+ Assert(txn->toast_hash == NULL);
+
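+	/* one hash entry per toast value, keyed by its chunk_id */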
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(Oid);
+ hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = rb->context;
+ txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+}
+
+/*
+ * Per toast-chunk handling for toast reconstruction
+ *
+ * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
+ * toasted Datum comes along.
+ */
+static void
+ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change)
+{
+ ReorderBufferToastEnt *ent;
+ bool found;
+ int32 chunksize;
+ bool isnull;
+ Pointer chunk;
+ TupleDesc desc = RelationGetDescr(relation);
+ Oid chunk_id;
+ Oid chunk_seq;
+
+ if (txn->toast_hash == NULL)
+ ReorderBufferToastInitHash(rb, txn);
+
+ Assert(IsToastRelation(relation));
+
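+	/* toast relations have the columns chunk_id, chunk_seq and chunk_data */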
+ chunk_id = DatumGetObjectId(fastgetattr(&change->tp.newtuple->tuple, 1, desc, &isnull));
+ Assert(!isnull);
+ chunk_seq = DatumGetInt32(fastgetattr(&change->tp.newtuple->tuple, 2, desc, &isnull));
+ Assert(!isnull);
+
+ ent = (ReorderBufferToastEnt *)
+ hash_search(txn->toast_hash,
+ (void *) &chunk_id,
+ HASH_ENTER,
+ &found);
+
+ if (!found)
+ {
+ Assert(ent->chunk_id == chunk_id);
+ ent->num_chunks = 0;
+ ent->last_chunk_seq = 0;
+ ent->size = 0;
+ ent->reconstructed = NULL;
+ dlist_init(&ent->chunks);
+
+ if (chunk_seq != 0)
+ elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
+ chunk_seq, chunk_id);
+ }
+ else if (found && chunk_seq != ent->last_chunk_seq + 1)
+ elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
+ chunk_seq, chunk_id, ent->last_chunk_seq + 1);
+
+ chunk = DatumGetPointer(fastgetattr(&change->tp.newtuple->tuple, 3, desc, &isnull));
+ Assert(!isnull);
+
+ /* calculate size so we can allocate the right size at once later */
+ if (!VARATT_IS_EXTENDED(chunk))
+ chunksize = VARSIZE(chunk) - VARHDRSZ;
+ else if (VARATT_IS_SHORT(chunk))
+ /* could happen due to heap_form_tuple doing its thing */
+ chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
+ else
+ elog(ERROR, "unexpected type of toast chunk");
+
+ ent->size += chunksize;
+ ent->last_chunk_seq = chunk_seq;
+ ent->num_chunks++;
+ dlist_push_tail(&ent->chunks, &change->node);
+}
+
+/*
+ * Rejigger change->newtuple to point to in-memory toast tuples instead of
+ * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
+ *
+ * We cannot replace unchanged toast tuples though, so those will still point
+ * to on-disk toast data.
+ */
+static void
+ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
+ Relation relation, ReorderBufferChange *change)
+{
+ TupleDesc desc;
+ int natt;
+ Datum *attrs;
+ bool *isnull;
+ bool *free;
+ HeapTuple newtup;
+ Relation toast_rel;
+ TupleDesc toast_desc;
+ MemoryContext oldcontext;
+
+ /* no toast tuples changed */
+ if (txn->toast_hash == NULL)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(rb->context);
+
+ /* we should only have toast tuples in an INSERT or UPDATE */
+ Assert(change->tp.newtuple);
+
+ desc = RelationGetDescr(relation);
+
+ toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
+ toast_desc = RelationGetDescr(toast_rel);
+
+ /* should we allocate from stack instead? */
+ attrs = palloc0(sizeof(Datum) * desc->natts);
+ isnull = palloc0(sizeof(bool) * desc->natts);
+ free = palloc0(sizeof(bool) * desc->natts);
+
+ heap_deform_tuple(&change->tp.newtuple->tuple, desc,
+ attrs, isnull);
+
+ for (natt = 0; natt < desc->natts; natt++)
+ {
+ Form_pg_attribute attr = desc->attrs[natt];
+ ReorderBufferToastEnt *ent;
+ struct varlena *varlena;
+
+ /* va_rawsize is the size of the original datum -- including header */
+ struct varatt_external toast_pointer;
+ struct varatt_indirect redirect_pointer;
+ struct varlena *new_datum = NULL;
+ struct varlena *reconstructed;
+ dlist_iter it;
+ Size data_done = 0;
+
+ /* system columns aren't toasted */
+ if (attr->attnum < 0)
+ continue;
+
+ if (attr->attisdropped)
+ continue;
+
+ /* not a varlena datatype */
+ if (attr->attlen != -1)
+ continue;
+
+ /* no data */
+ if (isnull[natt])
+ continue;
+
+ /* ok, we know we have a toast datum */
+ varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
+
+ /* no need to do anything if the tuple isn't external */
+ if (!VARATT_IS_EXTERNAL(varlena))
+ continue;
+
+ VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
+
+ /*
+ * Check whether the toast tuple changed, replace if so.
+ */
+ ent = (ReorderBufferToastEnt *)
+ hash_search(txn->toast_hash,
+ (void *) &toast_pointer.va_valueid,
+ HASH_FIND,
+ NULL);
+ if (ent == NULL)
+ continue;
+
+ new_datum =
+ (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
+
+ free[natt] = true;
+
+ reconstructed = palloc0(toast_pointer.va_rawsize);
+
+ ent->reconstructed = reconstructed;
+
+ /* stitch toast tuple back together from its parts */
+ dlist_foreach(it, &ent->chunks)
+ {
+ bool isnull;
+ ReorderBufferTupleBuf *tup =
+ dlist_container(ReorderBufferChange, node, it.cur)->tp.newtuple;
+ Pointer chunk =
+ DatumGetPointer(fastgetattr(&tup->tuple, 3, toast_desc, &isnull));
+
+ Assert(!isnull);
+ Assert(!VARATT_IS_EXTERNAL(chunk));
+ Assert(!VARATT_IS_SHORT(chunk));
+
+ memcpy(VARDATA(reconstructed) + data_done,
+ VARDATA(chunk),
+ VARSIZE(chunk) - VARHDRSZ);
+ data_done += VARSIZE(chunk) - VARHDRSZ;
+ }
+ Assert(data_done == toast_pointer.va_extsize);
+
+		/* make sure it's marked as compressed or not */
+ if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
+ SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
+ else
+ SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
+
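+		/*
+		 * Point an indirect toast pointer at the reconstructed datum, so the
+		 * output plugin can detoast it even though it only exists in memory.
+		 */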
+ memset(&redirect_pointer, 0, sizeof(redirect_pointer));
+ redirect_pointer.pointer = reconstructed;
+
+ SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
+ memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
+ sizeof(redirect_pointer));
+
+ attrs[natt] = PointerGetDatum(new_datum);
+ }
+
+ /*
+ * Build tuple in separate memory & copy tuple back into the tuplebuf
+ * passed to the output plugin. We can't directly heap_fill_tuple() into
+ * the tuplebuf because attrs[] will point back into the current content.
+ */
+ newtup = heap_form_tuple(desc, attrs, isnull);
+ Assert(change->tp.newtuple->tuple.t_len <= MaxHeapTupleSize);
+ Assert(&change->tp.newtuple->header == change->tp.newtuple->tuple.t_data);
+
+ memcpy(change->tp.newtuple->tuple.t_data,
+ newtup->t_data,
+ newtup->t_len);
+ change->tp.newtuple->tuple.t_len = newtup->t_len;
+
+ /*
+	 * free resources we no longer need; more persistent allocations will be
+	 * freed in ReorderBufferToastReset().
+ */
+ RelationClose(toast_rel);
+ pfree(newtup);
+ for (natt = 0; natt < desc->natts; natt++)
+ {
+ if (free[natt])
+ pfree(DatumGetPointer(attrs[natt]));
+ }
+ pfree(attrs);
+ pfree(free);
+ pfree(isnull);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Free all resources allocated for toast reconstruction.
+ */
+static void
+ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
+{
+ HASH_SEQ_STATUS hstat;
+ ReorderBufferToastEnt *ent;
+
+ if (txn->toast_hash == NULL)
+ return;
+
+ /* sequentially walk over the hash and free everything */
+ hash_seq_init(&hstat, txn->toast_hash);
+ while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ dlist_mutable_iter it;
+
+ if (ent->reconstructed != NULL)
+ pfree(ent->reconstructed);
+
+ dlist_foreach_modify(it, &ent->chunks)
+ {
+ ReorderBufferChange *change =
+ dlist_container(ReorderBufferChange, node, it.cur);
+
+ dlist_delete(&change->node);
+ ReorderBufferReturnChange(rb, change);
+ }
+ }
+
+ hash_destroy(txn->toast_hash);
+ txn->toast_hash = NULL;
+}
+
+
+/* ---------------------------------------
+ * Visibility support for logical decoding
+ *
+ *
+ * Lookup actual cmin/cmax values when using decoding snapshot. We can't
+ * always rely on stored cmin/cmax values because of two scenarios:
+ *
+ * * A tuple got changed multiple times during a single transaction and thus
+ * has got a combocid. Combocids are only valid for the duration of a
+ * single transaction.
+ * * A tuple with a cmin but no cmax (and thus no combocid) got
+ * deleted/updated in a different transaction than the one which created it,
+ * which is the one we are looking at right now. As only one of cmin, cmax
+ * or combocid is actually stored in the heap we don't have access to the
+ * value we need anymore.
+ *
+ * To resolve those problems we have a per-transaction hash of (cmin,
+ * cmax) tuples keyed by (relfilenode, ctid) which contains the actual
+ * (cmin, cmax) values. That also takes care of combocids by simply
+ * not caring about them at all. As we have the real cmin/cmax values
+ * combocids aren't interesting.
+ *
+ * As we only care about catalog tuples here the overhead of this
+ * hashtable should be acceptable.
+ *
+ * Heap rewrites complicate this a bit, check rewriteheap.c for
+ * details.
+ * -------------------------------------------------------------------------
+ */
+
+/* struct for qsort()ing mapping files by lsn somewhat efficiently */
+typedef struct RewriteMappingFile
+{
+ XLogRecPtr lsn;
+ char fname[MAXPGPATH];
+} RewriteMappingFile;
+
+#if NOT_USED
+static void
+DisplayMapping(HTAB *tuplecid_data)
+{
+ HASH_SEQ_STATUS hstat;
+ ReorderBufferTupleCidEnt *ent;
+
+ hash_seq_init(&hstat, tuplecid_data);
+ while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
+ ent->key.relnode.dbNode,
+ ent->key.relnode.spcNode,
+ ent->key.relnode.relNode,
+ BlockIdGetBlockNumber(&ent->key.tid.ip_blkid),
+ ent->key.tid.ip_posid,
+ ent->cmin,
+ ent->cmax
+ );
+ }
+}
+#endif
+
+/*
+ * Apply a single mapping file to tuplecid_data.
+ *
+ * The mapping file has to have been verified to be a) committed and b) for
+ * our transaction; mapping files must be applied in LSN order.
+ */
+static void
+ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
+{
+ char path[MAXPGPATH];
+ int fd;
+ int readBytes;
+ LogicalRewriteMappingData map;
+
+ sprintf(path, "pg_llog/mappings/%s", fname);
+ fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+ if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", path)));
+
+ while (true)
+ {
+ ReorderBufferTupleCidKey key;
+ ReorderBufferTupleCidEnt *ent;
+ ReorderBufferTupleCidEnt *new_ent;
+ bool found;
+
+ /* be careful about padding */
+ memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
+
+ /* read all mappings till the end of the file */
+ readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
+
+ if (readBytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ path)));
+ else if (readBytes == 0) /* EOF */
+ break;
+ else if (readBytes != sizeof(LogicalRewriteMappingData))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d instead of %d",
+ path, readBytes,
+ (int32) sizeof(LogicalRewriteMappingData))));
+
+ key.relnode = map.old_node;
+ ItemPointerCopy(&map.old_tid,
+ &key.tid);
+
+
+ ent = (ReorderBufferTupleCidEnt *)
+ hash_search(tuplecid_data,
+ (void *) &key,
+ HASH_FIND,
+ NULL);
+
+ /* no existing mapping, no need to update */
+ if (!ent)
+ continue;
+
+ key.relnode = map.new_node;
+ ItemPointerCopy(&map.new_tid,
+ &key.tid);
+
+ new_ent = (ReorderBufferTupleCidEnt *)
+ hash_search(tuplecid_data,
+ (void *) &key,
+ HASH_ENTER,
+ &found);
+
+ if (found)
+ {
+ /*
+			 * Make sure the existing mapping makes sense. We sometimes update
+ * old records that did not yet have a cmax (e.g. pg_class' own
+ * entry while rewriting it) during rewrites, so allow that.
+ */
+ Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
+ Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
+ }
+ else
+ {
+ /* update mapping */
+ new_ent->cmin = ent->cmin;
+ new_ent->cmax = ent->cmax;
+ new_ent->combocid = ent->combocid;
+ }
+ }
+}
+
+
+/*
+ * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
+ */
+static bool
+TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
+{
+ return bsearch(&xid, xip, num,
+ sizeof(TransactionId), xidComparator) != NULL;
+}
+
+/*
+ * qsort() comparator for sorting RewriteMappingFiles in LSN order.
+ */
+static int
+file_sort_by_lsn(const void *a_p, const void *b_p)
+{
+ RewriteMappingFile *a = *(RewriteMappingFile **)a_p;
+ RewriteMappingFile *b = *(RewriteMappingFile **)b_p;
+
+ if (a->lsn < b->lsn)
+ return -1;
+ else if (a->lsn > b->lsn)
+ return 1;
+ return 0;
+}
+
+/*
+ * Apply any existing logical remapping files that are targeted at our
+ * transaction for relid.
+ */
+static void
+UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
+{
+ DIR *mapping_dir;
+ struct dirent *mapping_de;
+ List *files = NIL;
+ ListCell *file;
+ RewriteMappingFile **files_a;
+ size_t off;
+ Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
+
+ mapping_dir = AllocateDir("pg_llog/mappings");
+ while ((mapping_de = ReadDir(mapping_dir, "pg_llog/mappings")) != NULL)
+ {
+ Oid f_dboid;
+ Oid f_relid;
+ TransactionId f_mapped_xid;
+ TransactionId f_create_xid;
+ XLogRecPtr f_lsn;
+ uint32 f_hi, f_lo;
+ RewriteMappingFile *f;
+
+ if (strcmp(mapping_de->d_name, ".") == 0 ||
+ strcmp(mapping_de->d_name, "..") == 0)
+ continue;
+
+		/* Ignore files that aren't ours */
+ if (strncmp(mapping_de->d_name, "map-", 4) != 0)
+ continue;
+
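+		/*
+		 * The file name encodes the database, relation, an LSN (used below
+		 * to apply files in order) and the mapped and creating xids; see
+		 * LOGICAL_REWRITE_FORMAT.
+		 */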
+ if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
+ &f_dboid, &f_relid, &f_hi, &f_lo,
+ &f_mapped_xid, &f_create_xid) != 6)
+ elog(ERROR, "could not parse fname %s", mapping_de->d_name);
+
+ f_lsn = ((uint64) f_hi) << 32 | f_lo;
+
+ /* mapping for another database */
+ if (f_dboid != dboid)
+ continue;
+
+ /* mapping for another relation */
+ if (f_relid != relid)
+ continue;
+
+ /* did the creating transaction abort? */
+ if (!TransactionIdDidCommit(f_create_xid))
+ continue;
+
+ /* not for our transaction */
+ if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
+ continue;
+
+ /* ok, relevant, queue for apply */
+ f = palloc(sizeof(RewriteMappingFile));
+ f->lsn = f_lsn;
+ strcpy(f->fname, mapping_de->d_name);
+ files = lappend(files, f);
+ }
+ FreeDir(mapping_dir);
+
+ /* build array we can easily sort */
+ files_a = palloc(list_length(files) * sizeof(RewriteMappingFile *));
+ off = 0;
+ foreach(file, files)
+ {
+ files_a[off++] = lfirst(file);
+ }
+
+ /* sort files so we apply them in LSN order */
+ qsort(files_a, list_length(files), sizeof(RewriteMappingFile *),
+ file_sort_by_lsn);
+
+	for (off = 0; off < list_length(files); off++)
+	{
+		RewriteMappingFile *f = files_a[off];
+
+		elog(DEBUG1, "applying mapping: %s in %u", f->fname,
+ snapshot->subxip[0]);
+ ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
+ pfree(f);
+ }
+}
+
+/*
+ * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
+ * combocids.
+ */
+bool
+ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
+ Snapshot snapshot,
+ HeapTuple htup, Buffer buffer,
+ CommandId *cmin, CommandId *cmax)
+{
+ ReorderBufferTupleCidKey key;
+ ReorderBufferTupleCidEnt *ent;
+ ForkNumber forkno;
+ BlockNumber blockno;
+ bool updated_mapping = false;
+
+ /* be careful about padding */
+ memset(&key, 0, sizeof(key));
+
+ Assert(!BufferIsLocal(buffer));
+
+ /*
+ * get relfilenode from the buffer, no convenient way to access it other
+ * than that.
+ */
+ BufferGetTag(buffer, &key.relnode, &forkno, &blockno);
+
+ /* tuples can only be in the main fork */
+ Assert(forkno == MAIN_FORKNUM);
+ Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
+
+ ItemPointerCopy(&htup->t_self,
+ &key.tid);
+
+restart:
+ ent = (ReorderBufferTupleCidEnt *)
+ hash_search(tuplecid_data,
+ (void *) &key,
+ HASH_FIND,
+ NULL);
+
+ /*
+ * failed to find a mapping, check whether the table was rewritten and
+ * apply mapping if so, but only do that once - there can be no new
+ * mappings while we are in here since we have to hold a lock on the
+ * relation.
+ */
+ if (ent == NULL && !updated_mapping)
+ {
+ UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
+ /* now check but don't update for a mapping again */
+ updated_mapping = true;
+ goto restart;
+ }
+ else if (ent == NULL)
+ return false;
+
+ if (cmin)
+ *cmin = ent->cmin;
+ if (cmax)
+ *cmax = ent->cmax;
+ return true;
+}
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
new file mode 100644
index 00000000000..28f9a8a1a6f
--- /dev/null
+++ b/src/backend/replication/logical/snapbuild.c
@@ -0,0 +1,1885 @@
+/*-------------------------------------------------------------------------
+ *
+ * snapbuild.c
+ *
+ * Infrastructure for building historic catalog snapshots based on contents
+ * of the WAL, for the purpose of decoding heapam.c style values in the
+ * WAL.
+ *
+ * NOTES:
+ *
+ * We build snapshots which can *only* be used to read catalog contents and we
+ * do so by reading and interpreting the WAL stream. The aim is to build a
+ * snapshot that behaves the same as a freshly taken MVCC snapshot would have
+ * at the time the XLogRecord was generated.
+ *
+ * To build the snapshots we reuse the infrastructure built for Hot
+ * Standby. The in-memory snapshots we build look different than HS' because
+ * we have different needs. To successfully decode data from the WAL we only
+ * need to access catalog tables and (sys|rel|cat)cache, not the actual user
+ * tables since the data we decode is wholly contained in the WAL
+ * records. Also, our snapshots need to be different in comparison to normal
+ * MVCC ones because in contrast to those we cannot fully rely on the clog and
+ * pg_subtrans for information about committed transactions because they might
+ * commit in the future from the POV of the WAL entry we're currently
+ * decoding. This definition has the advantage that we only need to prevent
+ * removal of catalog rows, while normal table's rows can still be
+ * removed. This is achieved by using the replication slot mechanism.
+ *
+ * As the percentage of transactions modifying the catalog normally is fairly
+ * small in comparisons to ones only manipulating user data, we keep track of
+ * the committed catalog modifying ones inside (xmin, xmax) instead of keeping
+ * track of all running transactions like it's done in a normal snapshot. Note
+ * that we're generally only looking at transactions that have acquired an
+ * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
+ * that we consider committed, everything else is considered aborted/in
+ * progress. That also allows us not to care about subtransactions before they
+ * have committed, which means this module, in contrast to HS, doesn't have to
+ * care about suboverflowed subtransactions and similar.
+ *
+ * One complexity of doing this is that to e.g. handle mixed DDL/DML
+ * transactions we need Snapshots that see intermediate versions of the
+ * catalog in a transaction. During normal operation this is achieved by using
+ * CommandIds/cmin/cmax. The problem with that however is that for space
+ * efficiency reasons only one of those values is stored on disk
+ * (cf. combocid.c). Since ComboCids are only available in memory we log
+ * additional information which allows us to get the original (cmin, cmax)
+ * pair during visibility checks. Check the reorderbuffer.c's comment above
+ * ResolveCminCmaxDuringDecoding() for details.
+ *
+ * To facilitate all this we need our own visibility routine, as the normal
+ * ones are optimized for different use cases.
+ *
+ * To replace the normal catalog snapshots with decoding ones use the
+ * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
+ *
+ *
+ *
+ * The snapbuild machinery starts up in several stages, as illustrated
+ * by the following graph:
+ * +-------------------------+
+ * +----|SNAPBUILD_START |-------------+
+ * | +-------------------------+ |
+ * | | |
+ * | | |
+ * | running_xacts with running xacts |
+ * | | |
+ * | | |
+ * | v |
+ * | +-------------------------+ v
+ * | |SNAPBUILD_FULL_SNAPSHOT |------------>|
+ * | +-------------------------+ |
+ * running_xacts | saved snapshot
+ * with zero xacts | at running_xacts's lsn
+ * | | |
+ * | all running toplevel TXNs finished |
+ * | | |
+ * | v |
+ * | +-------------------------+ |
+ * +--->|SNAPBUILD_CONSISTENT |<------------+
+ * +-------------------------+
+ *
+ * Initially the machinery is in the START stage. When an xl_running_xacts
+ * record is read that is sufficiently new (above the safe xmin horizon),
+ * there's a state transition. If there were no running xacts when the
+ * running_xacts record was generated, we'll directly go into CONSISTENT
+ * state, otherwise we'll switch to the FULL_SNAPSHOT state. Having a full
+ * snapshot means that all transactions that start henceforth can be decoded
+ * in their entirety, but transactions that started previously can't. In
+ * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
+ * running transactions have committed or aborted.
+ *
+ * Only transactions that commit after CONSISTENT state has been reached will
+ * be replayed, even though they might have started while still in
+ * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
+ * changes have been exported, but all the following ones will be. That point
+ * is a convenient point to initialize replication from, which is why we
+ * export a snapshot at that point, which *can* be used to read normal data.
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/snapbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "miscadmin.h"
+
+#include "access/heapam_xlog.h"
+#include "access/transam.h"
+#include "access/xact.h"
+
+#include "replication/logical.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/snapshot.h"
+#include "utils/snapmgr.h"
+#include "utils/tqual.h"
+
+#include "storage/block.h" /* debugging output */
+#include "storage/fd.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/standby.h"
+
+/*
+ * This struct contains the current state of the snapshot building
+ * machinery. Besides a forward declaration in the header, it is not exposed
+ * to the public, so we can easily change its contents.
+ */
+struct SnapBuild
+{
+ /* how far are we along building our first full snapshot */
+ SnapBuildState state;
+
+ /* private memory context used to allocate memory for this module. */
+ MemoryContext context;
+
+	/* all transactions < this have committed/aborted */
+ TransactionId xmin;
+
+	/* all transactions >= this are uncommitted */
+ TransactionId xmax;
+
+ /*
+ * Don't replay commits from an LSN <= this LSN. This can be set
+ * externally but it will also be advanced (never retreat) from within
+ * snapbuild.c.
+ */
+ XLogRecPtr transactions_after;
+
+ /*
+ * Don't start decoding WAL until the "xl_running_xacts" information
+	 * indicates there is no running transaction with an xid smaller than this.
+ */
+ TransactionId initial_xmin_horizon;
+
+ /*
+ * Snapshot that's valid to see the catalog state seen at this moment.
+ */
+ Snapshot snapshot;
+
+ /*
+ * LSN of the last location we are sure a snapshot has been serialized to.
+ */
+ XLogRecPtr last_serialized_snapshot;
+
+ /*
+ * The reorderbuffer we need to update with usable snapshots et al.
+ */
+ ReorderBuffer *reorder;
+
+ /*
+ * Information about initially running transactions
+ *
+ * When we start building a snapshot there already may be transactions in
+ * progress. Those are stored in running.xip. We don't have enough
+ * information about those to decode their contents, so until they are
+ * finished (xcnt=0) we cannot switch to a CONSISTENT state.
+ */
+ struct
+ {
+ /*
+		 * As long as running.xcnt is non-zero, all XIDs > running.xmin and
+		 * < running.xmax have to be checked whether they still are running.
+ */
+ TransactionId xmin;
+ TransactionId xmax;
+
+ size_t xcnt; /* number of used xip entries */
+ size_t xcnt_space; /* allocated size of xip */
+ TransactionId *xip; /* running xacts array, xidComparator-sorted */
+ } running;
+
+ /*
+ * Array of transactions which could have catalog changes that committed
+ * between xmin and xmax.
+ */
+ struct
+ {
+ /* number of committed transactions */
+ size_t xcnt;
+
+ /* available space for committed transactions */
+ size_t xcnt_space;
+
+ /*
+ * Until we reach a CONSISTENT state, we record commits of all
+ * transactions, not just the catalog changing ones. Record when that
+ * changes so we know we cannot export a snapshot safely anymore.
+ */
+ bool includes_all_transactions;
+
+ /*
+ * Array of committed transactions that have modified the catalog.
+ *
+ * As this array is frequently modified we do *not* keep it in
+ * xidComparator order. Instead we sort the array when building &
+ * distributing a snapshot.
+ *
+ * TODO: It's unclear whether that reasoning has much merit. Every
+ * time we add something here after becoming consistent will also
+ * require distributing a snapshot. Storing them sorted would
+ * potentially also make it easier to purge (but more complicated wrt
+ * wraparound?). Should be improved if sorting while building the
+ * snapshot shows up in profiles.
+ */
+ TransactionId *xip;
+ } committed;
+};
+
+/*
+ * Starting a transaction -- which we need to do while exporting a snapshot --
+ * removes knowledge about the previously used resowner, so we save it here.
+ */
+ResourceOwner SavedResourceOwnerDuringExport = NULL;
+bool ExportInProgress = false;
+
+/* transaction state manipulation functions */
+static void SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid);
+
+/* ->running manipulation */
+static bool SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid);
+
+/* ->committed manipulation */
+static void SnapBuildPurgeCommittedTxn(SnapBuild *builder);
+
+/* snapshot building/manipulation/distribution functions */
+static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid);
+
+static void SnapBuildFreeSnapshot(Snapshot snap);
+
+static void SnapBuildSnapIncRefcount(Snapshot snap);
+
+static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);
+
+/* xlog reading helper functions for SnapBuildProcessRecord */
+static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
+
+/* serialization functions */
+static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
+static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
+
+
+/*
+ * Allocate a new snapshot builder.
+ *
+ * xmin_horizon is the xid >= which we can be sure no catalog rows have been
+ * removed; start_lsn is the LSN >= which we want to replay commits.
+ */
+SnapBuild *
+AllocateSnapshotBuilder(ReorderBuffer *reorder,
+ TransactionId xmin_horizon,
+ XLogRecPtr start_lsn)
+{
+ MemoryContext context;
+ MemoryContext oldcontext;
+ SnapBuild *builder;
+
+ /* allocate memory in own context, to have better accountability */
+ context = AllocSetContextCreate(CurrentMemoryContext,
+ "snapshot builder context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldcontext = MemoryContextSwitchTo(context);
+
+ builder = palloc0(sizeof(SnapBuild));
+
+ builder->state = SNAPBUILD_START;
+ builder->context = context;
+ builder->reorder = reorder;
+ /* Other struct members initialized by zeroing via palloc0 above */
+
+	builder->committed.xcnt = 0;
+	builder->committed.xcnt_space = 128;	/* arbitrary number */
+	builder->committed.xip =
+		palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
+	builder->committed.includes_all_transactions = true;
+ builder->initial_xmin_horizon = xmin_horizon;
+ builder->transactions_after = start_lsn;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return builder;
+}
+
+/*
+ * Free a snapshot builder.
+ */
+void
+FreeSnapshotBuilder(SnapBuild *builder)
+{
+ MemoryContext context = builder->context;
+
+	/* free the snapshot explicitly; that path contains some error checking */
+ if (builder->snapshot != NULL)
+ {
+ SnapBuildSnapDecRefcount(builder->snapshot);
+ builder->snapshot = NULL;
+ }
+
+ /* other resources are deallocated via memory context reset */
+ MemoryContextDelete(context);
+}
+
+/*
+ * Free an unreferenced snapshot that has previously been built by us.
+ */
+static void
+SnapBuildFreeSnapshot(Snapshot snap)
+{
+ /* make sure we don't get passed an external snapshot */
+ Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC);
+
+ /* make sure nobody modified our snapshot */
+ Assert(snap->curcid == FirstCommandId);
+ Assert(!snap->suboverflowed);
+ Assert(!snap->takenDuringRecovery);
+ Assert(snap->regd_count == 1);
+
+ /* slightly more likely, so it's checked even without c-asserts */
+ if (snap->copied)
+ elog(ERROR, "cannot free a copied snapshot");
+
+ if (snap->active_count)
+ elog(ERROR, "cannot free an active snapshot");
+
+ pfree(snap);
+}
+
+/*
+ * In which state of snapshot building are we?
+ */
+SnapBuildState
+SnapBuildCurrentState(SnapBuild *builder)
+{
+ return builder->state;
+}
+
+/*
+ * Return true if the contents of the transaction ending at 'ptr' should be
+ * skipped rather than decoded.
+ */
+bool
+SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
+{
+ return ptr <= builder->transactions_after;
+}
+
+/*
+ * Increase refcount of a snapshot.
+ *
+ * This is used when handing out a snapshot to some external resource or when
+ * adding a Snapshot as builder->snapshot.
+ */
+static void
+SnapBuildSnapIncRefcount(Snapshot snap)
+{
+ snap->active_count++;
+}
+
+/*
+ * Decrease refcount of a snapshot and free if the refcount reaches zero.
+ *
+ * Externally visible, so that external resources that have been handed an
+ * IncRef'ed Snapshot can adjust its refcount easily.
+ */
+void
+SnapBuildSnapDecRefcount(Snapshot snap)
+{
+ /* make sure we don't get passed an external snapshot */
+ Assert(snap->satisfies == HeapTupleSatisfiesHistoricMVCC);
+
+ /* make sure nobody modified our snapshot */
+ Assert(snap->curcid == FirstCommandId);
+ Assert(!snap->suboverflowed);
+ Assert(!snap->takenDuringRecovery);
+
+ Assert(snap->regd_count == 1);
+
+ Assert(snap->active_count);
+
+	/* slightly more likely, so it's checked even without c-asserts */
+ if (snap->copied)
+ elog(ERROR, "cannot free a copied snapshot");
+
+ snap->active_count--;
+ if (!snap->active_count)
+ SnapBuildFreeSnapshot(snap);
+}
+
+/*
+ * Build a new snapshot, based on currently committed catalog-modifying
+ * transactions.
+ *
+ * In-progress transactions with catalog access are *not* allowed to modify
+ * these snapshots; they have to copy them and fill in appropriate ->curcid
+ * and ->subxip/subxcnt values.
+ */
+static Snapshot
+SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid)
+{
+ Snapshot snapshot;
+ Size ssize;
+
+ Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
+
+ ssize = sizeof(SnapshotData)
+ + sizeof(TransactionId) * builder->committed.xcnt
+ + sizeof(TransactionId) * 1 /* toplevel xid */ ;
+
+ snapshot = MemoryContextAllocZero(builder->context, ssize);
+
+ snapshot->satisfies = HeapTupleSatisfiesHistoricMVCC;
+
+ /*
+ * We misuse the original meaning of SnapshotData's xip and subxip fields
+	 * to make them more fitting for our needs.
+ *
+ * In the 'xip' array we store transactions that have to be treated as
+ * committed. Since we will only ever look at tuples from transactions
+	 * that have modified the catalog it's more efficient to store those few
+ * that exist between xmin and xmax (frequently there are none).
+ *
+ * Snapshots that are used in transactions that have modified the catalog
+ * also use the 'subxip' array to store their toplevel xid and all the
+ * subtransaction xids so we can recognize when we need to treat rows as
+ * visible that are not in xip but still need to be visible. Subxip only
+ * gets filled when the transaction is copied into the context of a
+ * catalog modifying transaction since we otherwise share a snapshot
+ * between transactions. As long as a txn hasn't modified the catalog it
+ * doesn't need to treat any uncommitted rows as visible, so there is no
+ * need for those xids.
+ *
+ * Both arrays are qsort'ed so that we can use bsearch() on them.
+ */
+ Assert(TransactionIdIsNormal(builder->xmin));
+ Assert(TransactionIdIsNormal(builder->xmax));
+
+ snapshot->xmin = builder->xmin;
+ snapshot->xmax = builder->xmax;
+
+ /* store all transactions to be treated as committed by this snapshot */
+ snapshot->xip =
+ (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
+ snapshot->xcnt = builder->committed.xcnt;
+ memcpy(snapshot->xip,
+ builder->committed.xip,
+ builder->committed.xcnt * sizeof(TransactionId));
+
+ /* sort so we can bsearch() */
+ qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
+
+ /*
+ * Initially, subxip is empty, i.e. it's a snapshot to be used by
+ * transactions that don't modify the catalog. Will be filled by
+ * ReorderBufferCopySnap() if necessary.
+ */
+ snapshot->subxcnt = 0;
+ snapshot->subxip = NULL;
+
+ snapshot->suboverflowed = false;
+ snapshot->takenDuringRecovery = false;
+ snapshot->copied = false;
+ snapshot->curcid = FirstCommandId;
+ snapshot->active_count = 0;
+ snapshot->regd_count = 1; /* mark as registered so nobody frees it */
+
+ return snapshot;
+}
+
+/*
+ * Export a snapshot so it can be set in another session with SET TRANSACTION
+ * SNAPSHOT.
+ *
+ * For that we need to start a transaction in the current backend as the
+ * importing side checks whether the source transaction is still open to make
+ * sure the xmin horizon hasn't advanced since then.
+ *
+ * After that we convert a locally built snapshot into the normal variant
+ * understood by HeapTupleSatisfiesMVCC et al.
+ */
+const char *
+SnapBuildExportSnapshot(SnapBuild *builder)
+{
+ Snapshot snap;
+ char *snapname;
+ TransactionId xid;
+ TransactionId *newxip;
+ int newxcnt = 0;
+
+ if (builder->state != SNAPBUILD_CONSISTENT)
+ elog(ERROR, "cannot export a snapshot before reaching a consistent state");
+
+ if (!builder->committed.includes_all_transactions)
+ elog(ERROR, "cannot export a snapshot, not all transactions are monitored anymore");
+
+ /* so we don't overwrite the existing value */
+ if (TransactionIdIsValid(MyPgXact->xmin))
+ elog(ERROR, "cannot export a snapshot when MyPgXact->xmin already is valid");
+
+ if (IsTransactionOrTransactionBlock())
+ elog(ERROR, "cannot export a snapshot from within a transaction");
+
+ if (SavedResourceOwnerDuringExport)
+ elog(ERROR, "can only export one snapshot at a time");
+
+ SavedResourceOwnerDuringExport = CurrentResourceOwner;
+ ExportInProgress = true;
+
+ StartTransactionCommand();
+
+ Assert(!FirstSnapshotSet);
+
+	/* There doesn't seem to be a nice API to set these */
+ XactIsoLevel = XACT_REPEATABLE_READ;
+ XactReadOnly = true;
+
+ snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId());
+
+ /*
+ * We know that snap->xmin is alive, enforced by the logical xmin
+ * mechanism. Due to that we can do this without locks, we're only
+ * changing our own value.
+ */
+ MyPgXact->xmin = snap->xmin;
+
+ /* allocate in transaction context */
+ newxip = (TransactionId *)
+ palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
+
+ /*
+	 * snapbuild.c builds snapshots in an "inverted" manner, which means it
+ * stores committed transactions in ->xip, not ones in progress. Build a
+ * classical snapshot by marking all non-committed transactions as
+ * in-progress. This can be expensive.
+ */
+ for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
+ {
+ void *test;
+
+ /*
+ * Check whether transaction committed using the decoding snapshot
+ * meaning of ->xip.
+ */
+ test = bsearch(&xid, snap->xip, snap->xcnt,
+ sizeof(TransactionId), xidComparator);
+
+ if (test == NULL)
+ {
+ if (newxcnt >= GetMaxSnapshotXidCount())
+ elog(ERROR, "snapshot too large");
+
+ newxip[newxcnt++] = xid;
+ }
+
+ TransactionIdAdvance(xid);
+ }
+
+ snap->xcnt = newxcnt;
+ snap->xip = newxip;
+
+ /*
+ * now that we've built a plain snapshot, use the normal mechanisms for
+ * exporting it
+ */
+ snapname = ExportSnapshot(snap);
+
+ ereport(LOG,
+ (errmsg("exported logical decoding snapshot: \"%s\" with %u xids",
+ snapname, snap->xcnt)));
+ return snapname;
+}
+
+/*
+ * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
+ * any. Aborts the previously started transaction and resets the resource
+ * owner back to its original value.
+ */
+void
+SnapBuildClearExportedSnapshot(void)
+{
+	/* nothing exported, that's the usual case */
+ if (!ExportInProgress)
+ return;
+
+ if (!IsTransactionState())
+ elog(ERROR, "clearing exported snapshot in wrong transaction state");
+
+ /* make sure nothing could have ever happened */
+ AbortCurrentTransaction();
+
+ CurrentResourceOwner = SavedResourceOwnerDuringExport;
+ SavedResourceOwnerDuringExport = NULL;
+ ExportInProgress = false;
+}
+
+/*
+ * Handle the effects of a single heap change, appropriate to the current state
+ * of the snapshot builder, and return whether changes made at (xid, lsn) can
+ * be decoded.
+ */
+bool
+SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
+{
+ bool is_old_tx;
+
+ /*
+ * We can't handle data in transactions if we haven't built a snapshot
+ * yet, so don't store them.
+ */
+ if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
+ return false;
+
+ /*
+ * No point in keeping track of changes in transactions that we don't have
+ * enough information about to decode. This means that they started before
+ * we got into the SNAPBUILD_FULL_SNAPSHOT state.
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT &&
+ SnapBuildTxnIsRunning(builder, xid))
+ return false;
+
+ /*
+ * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
+ * be needed to decode the change we're currently processing.
+ */
+ is_old_tx = ReorderBufferIsXidKnown(builder->reorder, xid);
+
+ if (!is_old_tx || !ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
+ {
+ /* only build a new snapshot if we don't have a prebuilt one */
+ if (builder->snapshot == NULL)
+ {
+ builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
+			/* increase refcount for the snapshot builder */
+ SnapBuildSnapIncRefcount(builder->snapshot);
+ }
+
+ /*
+ * Increase refcount for the transaction we're handing the snapshot
+ * out to.
+ */
+ SnapBuildSnapIncRefcount(builder->snapshot);
+ ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
+ builder->snapshot);
+ }
+
+ return true;
+}
+
+/*
+ * Do CommandId/ComboCid handling after reading an xl_heap_new_cid record. This
+ * implies that a transaction has done some form of write to system catalogs.
+ */
+void
+SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
+ XLogRecPtr lsn, xl_heap_new_cid *xlrec)
+{
+ CommandId cid;
+
+ /*
+ * we only log new_cid's if a catalog tuple was modified, so mark
+ * the transaction as containing catalog modifications
+ */
+	ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
+
+ ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
+ xlrec->target.node, xlrec->target.tid,
+ xlrec->cmin, xlrec->cmax,
+ xlrec->combocid);
+
+ /* figure out new command id */
+ if (xlrec->cmin != InvalidCommandId &&
+ xlrec->cmax != InvalidCommandId)
+ cid = Max(xlrec->cmin, xlrec->cmax);
+ else if (xlrec->cmax != InvalidCommandId)
+ cid = xlrec->cmax;
+ else if (xlrec->cmin != InvalidCommandId)
+ cid = xlrec->cmin;
+ else
+ {
+ cid = InvalidCommandId; /* silence compiler */
+ elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
+ }
+
+ ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
+}
+
+/*
+ * Check whether `xid` is currently 'running'.
+ *
+ * Running transactions in our parlance are transactions which we didn't
+ * observe from the start so we can't properly decode their contents. They
+ * only exist after we freshly started from an < CONSISTENT snapshot.
+ */
+static bool
+SnapBuildTxnIsRunning(SnapBuild *builder, TransactionId xid)
+{
+ Assert(builder->state < SNAPBUILD_CONSISTENT);
+ Assert(TransactionIdIsNormal(builder->running.xmin));
+ Assert(TransactionIdIsNormal(builder->running.xmax));
+
+ if (builder->running.xcnt &&
+ NormalTransactionIdFollows(xid, builder->running.xmin) &&
+ NormalTransactionIdPrecedes(xid, builder->running.xmax))
+ {
+ TransactionId *search =
+ bsearch(&xid, builder->running.xip, builder->running.xcnt_space,
+ sizeof(TransactionId), xidComparator);
+
+ if (search != NULL)
+ {
+ Assert(*search == xid);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * Add a new Snapshot to all transactions we're decoding that currently are
+ * in-progress so they can see new catalog contents made by the transaction
+ * that just committed. This is necessary because those in-progress
+ * transactions will use the new catalog's contents from here on (at the very
+ * least everything they do needs to be compatible with newer catalog
+ * contents).
+ */
+static void
+SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
+{
+ dlist_iter txn_i;
+ ReorderBufferTXN *txn;
+
+ /*
+ * Iterate through all toplevel transactions. This can include
+ * subtransactions which we just don't yet know to be that, but that's
+	 * fine, they will just get an unnecessary snapshot queued.
+ */
+ dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
+ {
+ txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
+
+ Assert(TransactionIdIsValid(txn->xid));
+
+ /*
+ * If we don't have a base snapshot yet, there are no changes in this
+ * transaction which in turn implies we don't yet need a snapshot at
+		 * all. We'll add a snapshot when the first change gets queued.
+ *
+ * NB: This works correctly even for subtransactions because
+ * ReorderBufferCommitChild() takes care to pass the parent the base
+ * snapshot, and while iterating the changequeue we'll get the change
+ * from the subtxn.
+ */
+ if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
+ continue;
+
+ elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
+ txn->xid, (uint32) (lsn >> 32), (uint32) lsn);
+
+ /*
+ * increase the snapshot's refcount for the transaction we are handing
+ * it out to
+ */
+ SnapBuildSnapIncRefcount(builder->snapshot);
+ ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
+ builder->snapshot);
+ }
+}
+
+/*
+ * Keep track of a new catalog changing transaction that has committed.
+ */
+static void
+SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
+{
+ Assert(TransactionIdIsValid(xid));
+
+ if (builder->committed.xcnt == builder->committed.xcnt_space)
+ {
+ builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
+
+ elog(DEBUG1, "increasing space for committed transactions to %u",
+ (uint32) builder->committed.xcnt_space);
+
+ builder->committed.xip = repalloc(builder->committed.xip,
+ builder->committed.xcnt_space * sizeof(TransactionId));
+ }
+
+ /*
+ * TODO: It might make sense to keep the array sorted here instead of
+ * doing it every time we build a new snapshot. On the other hand this
+ * gets called repeatedly when a transaction with subtransactions commits.
+ */
+ builder->committed.xip[builder->committed.xcnt++] = xid;
+}
+
+/*
+ * Remove knowledge about transactions we treat as committed that are smaller
+ * than ->xmin. Those won't ever get checked via the ->committed array but via
+ * the clog machinery, so we don't need to waste memory on them.
+ */
+static void
+SnapBuildPurgeCommittedTxn(SnapBuild *builder)
+{
+ int off;
+ TransactionId *workspace;
+ int surviving_xids = 0;
+
+ /* not ready yet */
+ if (!TransactionIdIsNormal(builder->xmin))
+ return;
+
+ /* TODO: Neater algorithm than just copying and iterating? */
+ workspace =
+ MemoryContextAlloc(builder->context,
+ builder->committed.xcnt * sizeof(TransactionId));
+
+ /* copy xids that still are interesting to workspace */
+ for (off = 0; off < builder->committed.xcnt; off++)
+ {
+ if (NormalTransactionIdPrecedes(builder->committed.xip[off],
+ builder->xmin))
+ ; /* remove */
+ else
+ workspace[surviving_xids++] = builder->committed.xip[off];
+ }
+
+ /* copy workspace back to persistent state */
+ memcpy(builder->committed.xip, workspace,
+ surviving_xids * sizeof(TransactionId));
+
+ elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
+ (uint32) builder->committed.xcnt, (uint32) surviving_xids,
+ builder->xmin, builder->xmax);
+ builder->committed.xcnt = surviving_xids;
+
+ pfree(workspace);
+}
+
+/*
+ * Common logic for SnapBuildAbortTxn and SnapBuildCommitTxn dealing with
+ * keeping track of the number of running transactions.
+ */
+static void
+SnapBuildEndTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid)
+{
+ if (builder->state == SNAPBUILD_CONSISTENT)
+ return;
+
+ /*
+ * NB: This handles subtransactions correctly even if we started from
+ * suboverflowed xl_running_xacts because we only keep track of toplevel
+	 * transactions. Since the latter are always allocated before their
+ * subxids and since they end at the same time it's sufficient to deal
+ * with them here.
+ */
+ if (SnapBuildTxnIsRunning(builder, xid))
+ {
+ Assert(builder->running.xcnt > 0);
+
+ if (!--builder->running.xcnt)
+ {
+ /*
+			 * None of the originally running transactions is running anymore,
+			 * so our incrementally built snapshot now is consistent.
+ */
+ ereport(LOG,
+ (errmsg("logical decoding found consistent point at %X/%X",
+ (uint32)(lsn >> 32), (uint32)lsn),
+ errdetail("xid %u finished, no running transactions anymore",
+ xid)));
+ builder->state = SNAPBUILD_CONSISTENT;
+ }
+ }
+}
+
+/*
+ * Abort a transaction, throw away all state we kept.
+ */
+void
+SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn,
+ TransactionId xid,
+ int nsubxacts, TransactionId *subxacts)
+{
+ int i;
+
+ for (i = 0; i < nsubxacts; i++)
+ {
+ TransactionId subxid = subxacts[i];
+
+ SnapBuildEndTxn(builder, lsn, subxid);
+ }
+
+ SnapBuildEndTxn(builder, lsn, xid);
+}
+
+/*
+ * Handle everything that needs to be done when a transaction commits
+ */
+void
+SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
+ int nsubxacts, TransactionId *subxacts)
+{
+ int nxact;
+
+ bool forced_timetravel = false;
+ bool sub_needs_timetravel = false;
+ bool top_needs_timetravel = false;
+
+ TransactionId xmax = xid;
+
+ /*
+ * If we couldn't observe every change of a transaction because it was
+	 * already running at the point we started to observe, we have to assume it
+ * made catalog changes.
+ *
+	 * This has the benefit that we afterwards have enough
+ * information to build an exportable snapshot that's usable by pg_dump et
+ * al.
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ {
+ /* ensure that only commits after this are getting replayed */
+ if (builder->transactions_after < lsn)
+ builder->transactions_after = lsn;
+
+ /*
+ * We could avoid treating !SnapBuildTxnIsRunning transactions as
+ * timetravel ones, but we want to be able to export a snapshot when
+ * we reached consistency.
+ */
+ forced_timetravel = true;
+		elog(DEBUG1, "forced to assume catalog changes for xid %u because it was running too early", xid);
+ }
+
+ for (nxact = 0; nxact < nsubxacts; nxact++)
+ {
+ TransactionId subxid = subxacts[nxact];
+
+ /*
+		 * make sure txn is not tracked in running txns anymore, switch state
+ */
+ SnapBuildEndTxn(builder, lsn, subxid);
+
+ /*
+ * If we're forcing timetravel we also need visibility information
+		 * about subtransactions, so keep track of the subtransactions' state.
+ */
+ if (forced_timetravel)
+ {
+ SnapBuildAddCommittedTxn(builder, subxid);
+ if (NormalTransactionIdFollows(subxid, xmax))
+ xmax = subxid;
+ }
+
+ /*
+ * Add subtransaction to base snapshot if it has done DDL; we don't
+ * distinguish it from toplevel transactions there.
+ */
+ else if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid))
+ {
+ sub_needs_timetravel = true;
+
+ elog(DEBUG1, "found subtransaction %u:%u with catalog changes.",
+ xid, subxid);
+
+ SnapBuildAddCommittedTxn(builder, subxid);
+
+ if (NormalTransactionIdFollows(subxid, xmax))
+ xmax = subxid;
+ }
+ }
+
+ /*
+ * Make sure toplevel txn is not tracked in running txns anymore, switch
+ * state to consistent if possible.
+ */
+ SnapBuildEndTxn(builder, lsn, xid);
+
+ if (forced_timetravel)
+ {
+ elog(DEBUG2, "forced transaction %u to do timetravel.", xid);
+
+ SnapBuildAddCommittedTxn(builder, xid);
+ }
+ /* add toplevel transaction to base snapshot */
+ else if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
+ {
+ elog(DEBUG2, "found top level transaction %u, with catalog changes!",
+ xid);
+
+ top_needs_timetravel = true;
+ SnapBuildAddCommittedTxn(builder, xid);
+ }
+ else if (sub_needs_timetravel)
+ {
+ /* mark toplevel txn as timetravel as well */
+ SnapBuildAddCommittedTxn(builder, xid);
+ }
+
+ /* if there's any reason to build a historic snapshot, do so now */
+ if (forced_timetravel || top_needs_timetravel || sub_needs_timetravel)
+ {
+ /*
+ * Adjust xmax of the snapshot builder; we only do that for committed,
+ * catalog-modifying transactions. Everything else isn't interesting
+ * for us since we'll never look at the respective rows.
+ */
+ if (!TransactionIdIsValid(builder->xmax) ||
+ TransactionIdFollowsOrEquals(xmax, builder->xmax))
+ {
+ builder->xmax = xmax;
+ TransactionIdAdvance(builder->xmax);
+ }
+
+ /*
+ * If we haven't built a complete snapshot yet there's no need to hand
+ * it out, it wouldn't (and couldn't) be used anyway.
+ */
+ if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
+ return;
+
+ /*
+ * Decrease the snapshot builder's refcount of the old snapshot, note
+ * that it still will be used if it has been handed out to the
+ * reorderbuffer earlier.
+ */
+ if (builder->snapshot)
+ SnapBuildSnapDecRefcount(builder->snapshot);
+
+ builder->snapshot = SnapBuildBuildSnapshot(builder, xid);
+
+ /* we might need to execute invalidations, add snapshot */
+ if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
+ {
+ SnapBuildSnapIncRefcount(builder->snapshot);
+ ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
+ builder->snapshot);
+ }
+
+ /* refcount of the snapshot builder for the new snapshot */
+ SnapBuildSnapIncRefcount(builder->snapshot);
+
+ /* add a new SnapshotNow to all currently running transactions */
+ SnapBuildDistributeNewCatalogSnapshot(builder, lsn);
+ }
+ else
+ {
+ /* record that we cannot export a general snapshot anymore */
+ builder->committed.includes_all_transactions = false;
+ }
+}
+
+
+/* -----------------------------------
+ * Snapshot building functions dealing with xlog records
+ * -----------------------------------
+ */
+
+/*
+ * Process a running xacts record, and use its information to first build a
+ * historic snapshot and later to release resources that aren't needed
+ * anymore.
+ */
+void
+SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
+{
+ ReorderBufferTXN *txn;
+
+ /*
+ * If we're not consistent yet, inspect the record to see whether it
+ * allows to get closer to being consistent. If we are consistent, dump
+ * our snapshot so others or we, after a restart, can use it.
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ {
+ /* returns false if there's no point in performing cleanup just yet */
+ if (!SnapBuildFindSnapshot(builder, lsn, running))
+ return;
+ }
+ else
+ SnapBuildSerialize(builder, lsn);
+
+ /*
+ * Update range of interesting xids based on the running xacts
+ * information. We don't increase ->xmax using it, because once we are in
+ * a consistent state we can do that ourselves and much more efficiently
+ * so, because we only need to do it for catalog transactions since we
+ * only ever look at those.
+ *
+ * NB: Because of that, xmax can be lower than xmin, since we only
+ * increase xmax when a catalog modifying transaction commits. While odd
+ * looking, it's correct and actually more efficient this way since we hit
+ * fast paths in tqual.c.
+ */
+ builder->xmin = running->oldestRunningXid;
+
+ /* Remove transactions we don't need to keep track of anymore */
+ SnapBuildPurgeCommittedTxn(builder);
+
+ elog(DEBUG3, "xmin: %u, xmax: %u, oldestrunning: %u",
+ builder->xmin, builder->xmax,
+ running->oldestRunningXid);
+
+ /*
+ * Increase shared memory limits, so vacuum can work on tuples we prevented
+ * from being pruned till now.
+ */
+ LogicalIncreaseXminForSlot(lsn, running->oldestRunningXid);
+
+ /*
+ * Also tell the slot where we can restart decoding from. We don't want to
+ * do that after every commit because changing that implies an fsync of
+ * the logical slot's state file, so we only do it every time we see a
+ * running xacts record.
+ *
+ * Do so by looking for the oldest in progress transaction (determined by
+ * the first LSN of any of its relevant records). Every transaction
+ * remembers the last location we stored the snapshot to disk before its
+ * beginning. That point is where we can restart from.
+ */
+
+ /*
+ * Can't know about a serialized snapshot's location if we're not
+ * consistent.
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ return;
+
+ txn = ReorderBufferGetOldestTXN(builder->reorder);
+
+ /*
+ * oldest ongoing txn might have started when we didn't yet serialize
+ * anything because we hadn't reached a consistent state yet.
+ */
+ if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
+ LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
+ /*
+ * No in-progress transaction, can reuse the last serialized snapshot if
+ * we have one.
+ */
+ else if (txn == NULL &&
+ builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
+ builder->last_serialized_snapshot != InvalidXLogRecPtr)
+ LogicalIncreaseRestartDecodingForSlot(lsn,
+ builder->last_serialized_snapshot);
+}
+
+
+/*
+ * Build the start of a snapshot that's capable of decoding the catalog.
+ *
+ * Helper function for SnapBuildProcessRunningXacts() while we're not yet
+ * consistent.
+ *
+ * Returns true if there is a point in performing internal maintenance/cleanup
+ * using the xl_running_xacts record.
+ */
+static bool
+SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
+{
+ /* ---
+ * Build catalog decoding snapshot incrementally using information about
+ * the currently running transactions. There are several ways to do that:
+ *
+ * a) There were no running transactions when the xl_running_xacts record
+ * was inserted, jump to CONSISTENT immediately. We might find such a
+ * state while waiting for b) or c).
+ *
+ * b) Wait for all toplevel transactions that were running to end. We
+ * simply track the number of in-progress toplevel transactions and
+ * lower it whenever one commits or aborts. When that number
+ * (builder->running.xcnt) reaches zero, we can go from FULL_SNAPSHOT
+ * to CONSISTENT.
+ * NB: We need to search running.xip when seeing a transaction's end to
+ * make sure it's a toplevel transaction and it's been one of the
+ * initially running ones.
+ * Interestingly, in contrast to HS, this allows us not to care about
+ * subtransactions - and by extension suboverflowed xl_running_xacts -
+ * at all.
+ *
+ * c) This (in a previous run) or another decoding slot serialized a
+ * snapshot to disk that we can use.
+ * ---
+ */
+
+ /*
+ * xl_running_xacts record is older than what we can use, we might not have
+ * all necessary catalog rows anymore.
+ */
+ if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
+ NormalTransactionIdPrecedes(running->oldestRunningXid,
+ builder->initial_xmin_horizon))
+ {
+ ereport(DEBUG1,
+ (errmsg("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
+ (uint32) (lsn >> 32), (uint32) lsn),
+ errdetail("initial xmin horizon of %u vs the snapshot's %u",
+ builder->initial_xmin_horizon, running->oldestRunningXid)));
+ return true;
+ }
+
+ /*
+ * a) No transactions were running, we can jump to consistent.
+ *
+ * NB: We might have already started to incrementally assemble a snapshot,
+ * so we need to be careful to deal with that.
+ */
+ if (running->xcnt == 0)
+ {
+ if (builder->transactions_after == InvalidXLogRecPtr ||
+ builder->transactions_after < lsn)
+ builder->transactions_after = lsn;
+
+ builder->xmin = running->oldestRunningXid;
+ builder->xmax = running->latestCompletedXid;
+ TransactionIdAdvance(builder->xmax);
+
+ Assert(TransactionIdIsNormal(builder->xmin));
+ Assert(TransactionIdIsNormal(builder->xmax));
+
+ /* no transactions running now */
+ builder->running.xcnt = 0;
+ builder->running.xmin = InvalidTransactionId;
+ builder->running.xmax = InvalidTransactionId;
+
+ builder->state = SNAPBUILD_CONSISTENT;
+
+ ereport(LOG,
+ (errmsg("logical decoding found consistent point at %X/%X",
+ (uint32)(lsn >> 32), (uint32)lsn),
+ errdetail("running xacts with xcnt == 0")));
+
+ return false;
+ }
+ /* c) valid on disk state */
+ else if (SnapBuildRestore(builder, lsn))
+ {
+ /* there won't be any state to cleanup */
+ return false;
+ }
+ /*
+ * b) first encounter of a useable xl_running_xacts record. If we had
+ * found one earlier we would either track running transactions
+ * (i.e. builder->running.xcnt != 0) or be consistent (this function
+ * wouldn't get called).
+ */
+ else if (!builder->running.xcnt)
+ {
+ int off;
+
+ /*
+ * We only care about toplevel xids as those are the ones we
+ * definitely see in the wal stream. As snapbuild.c tracks committed
+ * instead of running transactions we don't need to know anything
+ * about uncommitted subtransactions.
+ */
+ builder->xmin = running->oldestRunningXid;
+ builder->xmax = running->latestCompletedXid;
+ TransactionIdAdvance(builder->xmax);
+
+ /* so we can safely use the faster comparisons */
+ Assert(TransactionIdIsNormal(builder->xmin));
+ Assert(TransactionIdIsNormal(builder->xmax));
+
+ builder->running.xcnt = running->xcnt;
+ builder->running.xcnt_space = running->xcnt;
+ builder->running.xip =
+ MemoryContextAlloc(builder->context,
+ builder->running.xcnt * sizeof(TransactionId));
+ memcpy(builder->running.xip, running->xids,
+ builder->running.xcnt * sizeof(TransactionId));
+
+ /* sort so we can do a binary search */
+ qsort(builder->running.xip, builder->running.xcnt,
+ sizeof(TransactionId), xidComparator);
+
+ builder->running.xmin = builder->running.xip[0];
+ builder->running.xmax = builder->running.xip[running->xcnt - 1];
+
+ /* makes comparisons cheaper later */
+ TransactionIdRetreat(builder->running.xmin);
+ TransactionIdAdvance(builder->running.xmax);
+
+ builder->state = SNAPBUILD_FULL_SNAPSHOT;
+
+ ereport(LOG,
+ (errmsg("logical decoding found initial starting point at %X/%X",
+ (uint32)(lsn >> 32), (uint32)lsn),
+ errdetail("%u xacts need to finish", (uint32) builder->running.xcnt)));
+
+ /*
+ * Iterate through all xids, wait for them to finish.
+ *
+ * This isn't required for the correctness of decoding, but to allow
+ * isolationtester to notice that we're currently waiting for
+ * something.
+ */
+ for(off = 0; off < builder->running.xcnt; off++)
+ {
+ TransactionId xid = builder->running.xip[off];
+
+ /*
+ * Upper layers should prevent that we ever need to wait on
+ * ourselves. Check anyway, since failing to do so would either
+ * result in an endless wait or an Assert() failure.
+ */
+ if (TransactionIdIsCurrentTransactionId(xid))
+ elog(ERROR, "waiting for ourselves");
+
+ XactLockTableWait(xid);
+ }
+
+ /* nothing could have built up so far, so don't perform cleanup */
+ return false;
+ }
+
+ /*
+ * We already started to track running xacts and need to wait for all
+ * in-progress ones to finish. We fall through to the normal processing of
+ * records so incremental cleanup can be performed.
+ */
+ return true;
+}
+
+
+/* -----------------------------------
+ * Snapshot serialization support
+ * -----------------------------------
+ */
+
+/*
+ * We store current state of struct SnapBuild on disk in the following manner:
+ *
+ * struct SnapBuildOnDisk;
+ * TransactionId * running.xcnt_space;
+ * TransactionId * committed.xcnt; (*not xcnt_space*)
+ *
+ */
+typedef struct SnapBuildOnDisk
+{
+ /* first part of this struct needs to be version independent */
+
+ /* data not covered by checksum */
+ uint32 magic;
+ pg_crc32 checksum;
+
+ /* data covered by checksum */
+
+ /* version, in case we want to support pg_upgrade */
+ uint32 version;
+ /* how large is the on-disk data, excluding the constant-sized part */
+ uint32 length;
+
+ /* version dependent part */
+ SnapBuild builder;
+
+ /* variable amount of TransactionIds follows */
+} SnapBuildOnDisk;
+
+#define SnapBuildOnDiskConstantSize \
+ offsetof(SnapBuildOnDisk, builder)
+#define SnapBuildOnDiskNotChecksummedSize \
+ offsetof(SnapBuildOnDisk, version)
+
+#define SNAPBUILD_MAGIC 0x51A1E001
+#define SNAPBUILD_VERSION 1
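(Illustrative sketch, not part of the patch: the total size of a serialized snapshot follows directly from the layout comment above, i.e. the constant-sized header plus the two TransactionId arrays. The helper name is hypothetical; it simply mirrors the needed_length computation in SnapBuildSerialize() below.)

	/* sketch only: total on-disk size of a serialized snapshot */
	static Size
	snapbuild_ondisk_total_size(const SnapBuild *builder)
	{
		return sizeof(SnapBuildOnDisk) +
			sizeof(TransactionId) * builder->running.xcnt_space +
			sizeof(TransactionId) * builder->committed.xcnt;
	}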
+
+/*
+ * Store/Load a snapshot from disk, depending on the snapshot builder's state.
+ *
+ * Supposed to be used by external (i.e. not snapbuild.c) code that just read a
+ * record that's a potential location for a serialized snapshot.
+ */
+void
+SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
+{
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ SnapBuildRestore(builder, lsn);
+ else
+ SnapBuildSerialize(builder, lsn);
+}
+
+/*
+ * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
+ * been done by another decoding process.
+ */
+static void
+SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
+{
+ Size needed_length;
+ SnapBuildOnDisk *ondisk;
+ char *ondisk_c;
+ int fd;
+ char tmppath[MAXPGPATH];
+ char path[MAXPGPATH];
+ int ret;
+ struct stat stat_buf;
+ uint32 sz;
+
+ Assert(lsn != InvalidXLogRecPtr);
+ Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr ||
+ builder->last_serialized_snapshot <= lsn);
+
+ /*
+ * no point in serializing if we cannot continue to work immediately after
+ * restoring the snapshot
+ */
+ if (builder->state < SNAPBUILD_CONSISTENT)
+ return;
+
+ /*
+ * We identify snapshots by the LSN they are valid for. We don't need to
+ * include timelines in the name as each LSN maps to exactly one timeline
+ * unless the user used pg_resetxlog or similar. If a user did so, there's
+ * no hope of continuing to decode anyway.
+ */
+ sprintf(path, "pg_llog/snapshots/%X-%X.snap",
+ (uint32) (lsn >> 32), (uint32) lsn);
+
+ /*
+ * first check whether some other backend already has written the snapshot
+ * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
+ * as a valid state. Everything else is an unexpected error.
+ */
+ ret = stat(path, &stat_buf);
+
+ if (ret != 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errmsg("could not stat file \"%s\": %m", path)));
+
+ else if (ret == 0)
+ {
+ /*
+ * somebody else has already serialized to this point, don't overwrite
+ * but remember location, so we don't need to read old data again.
+ *
+ * To be sure it has been synced to disk after the rename() from the
+ * tempfile filename to the real filename, we just repeat the
+ * fsync. That ought to be cheap because in most scenarios it should
+ * already be safely on disk.
+ */
+ fsync_fname(path, false);
+ fsync_fname("pg_llog/snapshots", true);
+
+ builder->last_serialized_snapshot = lsn;
+ goto out;
+ }
+
+ /*
+ * there is an obvious race condition here between the time we stat(2) the
+ * file and us writing the file. But we rename the file into place
+ * atomically and all files created need to contain the same data anyway,
+ * so this is perfectly fine, although a bit of a resource waste. Locking
+ * seems like pointless complication.
+ */
+ elog(DEBUG1, "serializing snapshot to %s", path);
+
+ /* to make sure only we will write to this tempfile, include pid */
+ sprintf(tmppath, "pg_llog/snapshots/%X-%X.snap.%u.tmp",
+ (uint32) (lsn >> 32), (uint32) lsn, MyProcPid);
+
+ /*
+ * Unlink temporary file if it already exists; it must be left over from a
+ * crash/error since we won't enter this function twice from within a
+ * single decoding slot/backend and the temporary file contains the pid of
+ * the current process.
+ */
+ if (unlink(tmppath) != 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m", path)));
+
+ needed_length = sizeof(SnapBuildOnDisk) +
+ sizeof(TransactionId) * builder->running.xcnt_space +
+ sizeof(TransactionId) * builder->committed.xcnt;
+
+ ondisk_c = MemoryContextAllocZero(builder->context, needed_length);
+ ondisk = (SnapBuildOnDisk *) ondisk_c;
+ ondisk->magic = SNAPBUILD_MAGIC;
+ ondisk->version = SNAPBUILD_VERSION;
+ ondisk->length = needed_length;
+ INIT_CRC32(ondisk->checksum);
+ COMP_CRC32(ondisk->checksum,
+ ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
+ SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
+ ondisk_c += sizeof(SnapBuildOnDisk);
+
+ memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
+ /* NULL-ify memory-only data */
+ ondisk->builder.context = NULL;
+ ondisk->builder.snapshot = NULL;
+ ondisk->builder.reorder = NULL;
+ ondisk->builder.running.xip = NULL;
+ ondisk->builder.committed.xip = NULL;
+
+ COMP_CRC32(ondisk->checksum,
+ &ondisk->builder,
+ sizeof(SnapBuild));
+
+ /* copy running xacts */
+ sz = sizeof(TransactionId) * builder->running.xcnt_space;
+ memcpy(ondisk_c, builder->running.xip, sz);
+ COMP_CRC32(ondisk->checksum, ondisk_c, sz);
+ ondisk_c += sz;
+
+ /* copy committed xacts */
+ sz = sizeof(TransactionId) * builder->committed.xcnt;
+ memcpy(ondisk_c, builder->committed.xip, sz);
+ COMP_CRC32(ondisk->checksum, ondisk_c, sz);
+ ondisk_c += sz;
+
+ /* we have valid data now, open tempfile and write it there */
+ fd = OpenTransientFile(tmppath,
+ O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ ereport(ERROR,
+ (errmsg("could not open file \"%s\": %m", path)));
+
+ if ((write(fd, ondisk, needed_length)) != needed_length)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmppath)));
+ }
+
+ /*
+ * fsync the file before renaming so that even if we crash after this we
+ * have either a fully valid file or nothing.
+ *
+ * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
+ * some noticeable overhead since it's performed synchronously during
+ * decoding?
+ */
+ if (pg_fsync(fd) != 0)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", tmppath)));
+ }
+ CloseTransientFile(fd);
+
+ fsync_fname("pg_llog/snapshots", true);
+
+ /*
+ * We may overwrite the work from some other backend, but that's ok, our
+ * snapshot is valid as well, we'll just have done some superfluous work.
+ */
+ if (rename(tmppath, path) != 0)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ tmppath, path)));
+ }
+
+ /* make sure we persist */
+ fsync_fname(path, false);
+ fsync_fname("pg_llog/snapshots", true);
+
+ /*
+ * Now there's no way we can lose the dumped state anymore, remember
+ * this as a serialization point.
+ */
+ builder->last_serialized_snapshot = lsn;
+
+out:
+ ReorderBufferSetRestartPoint(builder->reorder,
+ builder->last_serialized_snapshot);
+}
+
+/*
+ * Restore a snapshot into 'builder' if previously one has been stored at the
+ * location indicated by 'lsn'. Returns true if successful, false otherwise.
+ */
+static bool
+SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
+{
+ SnapBuildOnDisk ondisk;
+ int fd;
+ char path[MAXPGPATH];
+ Size sz;
+ int readBytes;
+ pg_crc32 checksum;
+
+ /* no point in loading a snapshot if we're already there */
+ if (builder->state == SNAPBUILD_CONSISTENT)
+ return false;
+
+ sprintf(path, "pg_llog/snapshots/%X-%X.snap",
+ (uint32) (lsn >> 32), (uint32) lsn);
+
+ fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+
+ if (fd < 0 && errno == ENOENT)
+ return false;
+ else if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+
+ /* ----
+ * Make sure the snapshot had been stored safely to disk, that's normally
+ * cheap.
+ * Note that we do not need PANIC here, nobody will be able to use the
+ * slot without fsyncing, and saving it won't succeed without an fsync()
+ * either...
+ * ----
+ */
+ fsync_fname(path, false);
+ fsync_fname("pg_llog/snapshots", true);
+
+
+ /* read statically sized portion of snapshot */
+ readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize);
+ if (readBytes != SnapBuildOnDiskConstantSize)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d of %d: %m",
+ path, readBytes, (int) SnapBuildOnDiskConstantSize)));
+ }
+
+ if (ondisk.magic != SNAPBUILD_MAGIC)
+ ereport(ERROR,
+ (errmsg("snapbuild state file \"%s\" has wrong magic %u instead of %u",
+ path, ondisk.magic, SNAPBUILD_MAGIC)));
+
+ if (ondisk.version != SNAPBUILD_VERSION)
+ ereport(ERROR,
+ (errmsg("snapbuild state file \"%s\" has unsupported version %u instead of %u",
+ path, ondisk.version, SNAPBUILD_VERSION)));
+
+ INIT_CRC32(checksum);
+ COMP_CRC32(checksum,
+ ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize,
+ SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
+
+ /* read SnapBuild */
+ readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild));
+ if (readBytes != sizeof(SnapBuild))
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d of %d: %m",
+ path, readBytes, (int) sizeof(SnapBuild))));
+ }
+ COMP_CRC32(checksum, &ondisk.builder, sizeof(SnapBuild));
+
+ /* restore running xacts information */
+ sz = sizeof(TransactionId) * ondisk.builder.running.xcnt_space;
+ ondisk.builder.running.xip = MemoryContextAlloc(builder->context, sz);
+ readBytes = read(fd, ondisk.builder.running.xip, sz);
+ if (readBytes != sz)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d of %d: %m",
+ path, readBytes, (int) sz)));
+ }
+ COMP_CRC32(checksum, ondisk.builder.running.xip, sz);
+
+ /* restore committed xacts information */
+ sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
+ ondisk.builder.committed.xip = MemoryContextAlloc(builder->context, sz);
+ readBytes = read(fd, ondisk.builder.committed.xip, sz);
+ if (readBytes != sz)
+ {
+ CloseTransientFile(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\", read %d of %d: %m",
+ path, readBytes, (int) sz)));
+ }
+ COMP_CRC32(checksum, ondisk.builder.committed.xip, sz);
+
+ CloseTransientFile(fd);
+
+ /* verify checksum of what we've read */
+ if (!EQ_CRC32(checksum, ondisk.checksum))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("snapbuild state file %s: checksum mismatch, is %u, should be %u",
+ path, checksum, ondisk.checksum)));
+
+ /*
+ * ok, we now have a sensible snapshot here, figure out if it has more
+ * information than we have.
+ */
+
+ /*
+ * We are only interested in consistent snapshots for now, comparing
+ * whether one incomplete snapshot is more "advanced" seems to be
+ * unnecessarily complex.
+ */
+ if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
+ goto snapshot_not_interesting;
+
+ /*
+ * Don't use a snapshot that requires an xmin that we cannot guarantee to
+ * be available.
+ */
+ if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon))
+ goto snapshot_not_interesting;
+
+
+ /* ok, we think the snapshot is sensible, copy over everything important */
+ builder->xmin = ondisk.builder.xmin;
+ builder->xmax = ondisk.builder.xmax;
+ builder->state = ondisk.builder.state;
+
+ builder->committed.xcnt = ondisk.builder.committed.xcnt;
+ /* We only allocated/stored xcnt, not xcnt_space xids ! */
+ /* don't overwrite preallocated xip, if we don't have anything here */
+ if (builder->committed.xcnt > 0)
+ {
+ pfree(builder->committed.xip);
+ builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
+ builder->committed.xip = ondisk.builder.committed.xip;
+ }
+ ondisk.builder.committed.xip = NULL;
+
+ builder->running.xcnt = ondisk.builder.running.xcnt;
+ if (builder->running.xip)
+ pfree(builder->running.xip);
+ builder->running.xcnt_space = ondisk.builder.running.xcnt_space;
+ builder->running.xip = ondisk.builder.running.xip;
+
+ /* our snapshot is not interesting anymore, build a new one */
+ if (builder->snapshot != NULL)
+ {
+ SnapBuildSnapDecRefcount(builder->snapshot);
+ }
+ builder->snapshot = SnapBuildBuildSnapshot(builder, InvalidTransactionId);
+ SnapBuildSnapIncRefcount(builder->snapshot);
+
+ ReorderBufferSetRestartPoint(builder->reorder, lsn);
+
+ Assert(builder->state == SNAPBUILD_CONSISTENT);
+
+ ereport(LOG,
+ (errmsg("logical decoding found consistent point at %X/%X",
+ (uint32)(lsn >> 32), (uint32)lsn),
+ errdetail("found initial snapshot in snapbuild file")));
+ return true;
+
+snapshot_not_interesting:
+ if (ondisk.builder.running.xip != NULL)
+ pfree(ondisk.builder.running.xip);
+ if (ondisk.builder.committed.xip != NULL)
+ pfree(ondisk.builder.committed.xip);
+ return false;
+}
+
+/*
+ * Remove all serialized snapshots that are not required anymore because no
+ * slot can need them. This doesn't actually have to run during a checkpoint,
+ * but it's a convenient point to schedule this.
+ *
+ * NB: We run this during checkpoints even if logical decoding is disabled so
+ * we clean up old serialized snapshots at some point after it got disabled.
+ */
+void
+CheckPointSnapBuild(void)
+{
+ XLogRecPtr cutoff;
+ XLogRecPtr redo;
+ DIR *snap_dir;
+ struct dirent *snap_de;
+ char path[MAXPGPATH];
+
+ /*
+ * We start off with a minimum of the last redo pointer. No new replication
+ * slot will start before that, so that's a safe upper bound for removal.
+ */
+ redo = GetRedoRecPtr();
+
+ /* now check for the restart ptrs from existing slots */
+ cutoff = ReplicationSlotsComputeLogicalRestartLSN();
+
+ /* don't remove anything newer than the redo pointer */
+ if (redo < cutoff)
+ cutoff = redo;
+
+ snap_dir = AllocateDir("pg_llog/snapshots");
+ while ((snap_de = ReadDir(snap_dir, "pg_llog/snapshots")) != NULL)
+ {
+ uint32 hi;
+ uint32 lo;
+ XLogRecPtr lsn;
+ struct stat statbuf;
+
+ if (strcmp(snap_de->d_name, ".") == 0 ||
+ strcmp(snap_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(path, MAXPGPATH, "pg_llog/snapshots/%s", snap_de->d_name);
+
+ if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+ {
+ elog(DEBUG1, "only regular files expected: %s", path);
+ continue;
+ }
+
+ /*
+ * temporary filenames from SnapBuildSerialize() include the LSN and
+ * everything but are postfixed by .$pid.tmp. We can just remove them
+ * the same as other files because there can be none that are currently
+ * being written that are older than cutoff.
+ *
+ * We just log a message if a file doesn't fit the pattern, it's
+ * probably some editor's lock/state file or similar...
+ */
+ if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
+ {
+ ereport(LOG,
+ (errmsg("could not parse filename \"%s\"", path)));
+ continue;
+ }
+
+ lsn = ((uint64) hi) << 32 | lo;
+
+ /* check whether we still need it */
+ if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
+ {
+ elog(DEBUG1, "removing snapbuild snapshot %s", path);
+
+ /*
+ * It's not particularly harmful, though strange, if we can't
+ * remove the file here. Don't prevent the checkpoint from
+ * completing, that'd be a cure worse than the disease.
+ */
+ if (unlink(path) < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not unlink file \"%s\": %m",
+ path)));
+ continue;
+ }
+ }
+ }
+ FreeDir(snap_dir);
+}
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 826c7f027e5..45ed7e40e89 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -43,6 +43,7 @@
#include "miscadmin.h"
#include "replication/slot.h"
#include "storage/fd.h"
+#include "storage/proc.h"
#include "storage/procarray.h"
/*
@@ -82,6 +83,8 @@ ReplicationSlot *MyReplicationSlot = NULL;
/* GUCs */
int max_replication_slots = 0; /* the maximum number of replication slots */
+static void ReplicationSlotDropAcquired(void);
+
/* internal persistency functions */
static void RestoreSlotFromDisk(const char *name);
static void CreateSlotOnDisk(ReplicationSlot *slot);
@@ -190,11 +193,12 @@ ReplicationSlotValidateName(const char *name, int elevel)
* Create a new replication slot and mark it as used by this backend.
*
* name: Name of the slot
- * db_specific: changeset extraction is db specific, if the slot is going to
+ * db_specific: logical decoding is db specific; if the slot is going to
* be used for that pass true, otherwise false.
*/
void
-ReplicationSlotCreate(const char *name, bool db_specific)
+ReplicationSlotCreate(const char *name, bool db_specific,
+ ReplicationSlotPersistency persistency)
{
ReplicationSlot *slot = NULL;
int i;
@@ -246,6 +250,7 @@ ReplicationSlotCreate(const char *name, bool db_specific)
*/
Assert(!slot->in_use);
Assert(!slot->active);
+ slot->data.persistency = persistency;
slot->data.xmin = InvalidTransactionId;
slot->effective_xmin = InvalidTransactionId;
strncpy(NameStr(slot->data.name), name, NAMEDATALEN);
@@ -348,14 +353,30 @@ ReplicationSlotRelease(void)
Assert(slot != NULL && slot->active);
- /* Mark slot inactive. We're not freeing it, just disconnecting. */
+ if (slot->data.persistency == RS_EPHEMERAL)
+ {
+ /*
+ * Delete the slot. There is no !PANIC case where this is allowed to
+ * fail, all that may happen is an incomplete cleanup of the on-disk
+ * data.
+ */
+ ReplicationSlotDropAcquired();
+ }
+ else
{
+ /* Mark slot inactive. We're not freeing it, just disconnecting. */
volatile ReplicationSlot *vslot = slot;
SpinLockAcquire(&slot->mutex);
vslot->active = false;
SpinLockRelease(&slot->mutex);
- MyReplicationSlot = NULL;
}
+
+ MyReplicationSlot = NULL;
+
+ /* might not have been set if this was a physical slot */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ MyPgXact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING;
+ LWLockRelease(ProcArrayLock);
}
/*
@@ -364,52 +385,36 @@ ReplicationSlotRelease(void)
void
ReplicationSlotDrop(const char *name)
{
- ReplicationSlot *slot = NULL;
- int i;
- bool active;
+ Assert(MyReplicationSlot == NULL);
+
+ ReplicationSlotAcquire(name);
+
+ ReplicationSlotDropAcquired();
+}
+
+/*
+ * Permanently drop the currently acquired replication slot which will be
+ * released by the point this function returns.
+ */
+static void
+ReplicationSlotDropAcquired(void)
+{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
+ ReplicationSlot *slot = MyReplicationSlot;
- ReplicationSlotValidateName(name, ERROR);
+ Assert(MyReplicationSlot != NULL);
+
+ /* slot isn't acquired anymore */
+ MyReplicationSlot = NULL;
/*
- * If some other backend ran this code currently with us, we might both
- * try to free the same slot at the same time. Or we might try to delete
- * a slot with a certain name while someone else was trying to create a
- * slot with the same name.
+ * If some other backend ran this code concurrently with us, we might try
+ * to delete a slot with a certain name while someone else was trying to
+ * create a slot with the same name.
*/
LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
- /* Search for the named slot and mark it active if we find it. */
- LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
- for (i = 0; i < max_replication_slots; i++)
- {
- ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
-
- if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0)
- {
- volatile ReplicationSlot *vslot = s;
-
- SpinLockAcquire(&s->mutex);
- active = vslot->active;
- vslot->active = true;
- SpinLockRelease(&s->mutex);
- slot = s;
- break;
- }
- }
- LWLockRelease(ReplicationSlotControlLock);
-
- /* If we did not find the slot or it was already active, error out. */
- if (slot == NULL)
- ereport(ERROR,
- (errcode(ERRCODE_UNDEFINED_OBJECT),
- errmsg("replication slot \"%s\" does not exist", name)));
- if (active)
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_IN_USE),
- errmsg("replication slot \"%s\" is already active", name)));
-
/* Generate pathnames. */
sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));
@@ -417,35 +422,41 @@ ReplicationSlotDrop(const char *name)
/*
* Rename the slot directory on disk, so that we'll no longer recognize
* this as a valid slot. Note that if this fails, we've got to mark the
- * slot inactive again before bailing out.
+ * slot inactive before bailing out. If we're dropping an ephemeral slot,
+ * we better never fail hard as the caller won't expect the slot to
+ * survive and this might get called during error handling.
*/
- if (rename(path, tmppath) != 0)
+ if (rename(path, tmppath) == 0)
+ {
+ /*
+ * We need to fsync() the directory we just renamed and its parent to
+ * make sure that our changes are on disk in a crash-safe fashion. If
+ * fsync() fails, we can't be sure whether the changes are on disk or
+ * not. For now, we handle that by panicking;
+ * StartupReplicationSlots() will try to straighten it out after
+ * restart.
+ */
+ START_CRIT_SECTION();
+ fsync_fname(tmppath, true);
+ fsync_fname("pg_replslot", true);
+ END_CRIT_SECTION();
+ }
+ else
{
volatile ReplicationSlot *vslot = slot;
+ bool fail_softly = slot->data.persistency == RS_EPHEMERAL;
SpinLockAcquire(&slot->mutex);
vslot->active = false;
SpinLockRelease(&slot->mutex);
- ereport(ERROR,
+ ereport(fail_softly ? WARNING : ERROR,
(errcode_for_file_access(),
errmsg("could not rename \"%s\" to \"%s\": %m",
path, tmppath)));
}
/*
- * We need to fsync() the directory we just renamed and its parent to make
- * sure that our changes are on disk in a crash-safe fashion. If fsync()
- * fails, we can't be sure whether the changes are on disk or not. For
- * now, we handle that by panicking; StartupReplicationSlots() will
- * try to straighten it out after restart.
- */
- START_CRIT_SECTION();
- fsync_fname(tmppath, true);
- fsync_fname("pg_replslot", true);
- END_CRIT_SECTION();
-
- /*
* The slot is definitely gone. Lock out concurrent scans of the array
* long enough to kill it. It's OK to clear the active flag here without
* grabbing the mutex because nobody else can be scanning the array here,
@@ -461,7 +472,7 @@ ReplicationSlotDrop(const char *name)
* Slot is dead and doesn't prevent resource removal anymore, recompute
* limits.
*/
- ReplicationSlotsComputeRequiredXmin();
+ ReplicationSlotsComputeRequiredXmin(false);
ReplicationSlotsComputeRequiredLSN();
/*
@@ -519,21 +530,49 @@ ReplicationSlotMarkDirty(void)
}
/*
+ * Convert a slot that's marked as RS_EPHEMERAL to a RS_PERSISTENT slot,
+ * guaranteeing it will be there after an eventual crash.
+ */
+void
+ReplicationSlotPersist(void)
+{
+ ReplicationSlot *slot = MyReplicationSlot;
+
+ Assert(slot != NULL);
+ Assert(slot->data.persistency != RS_PERSISTENT);
+
+ {
+ volatile ReplicationSlot *vslot = slot;
+
+ SpinLockAcquire(&slot->mutex);
+ vslot->data.persistency = RS_PERSISTENT;
+ SpinLockRelease(&slot->mutex);
+ }
+
+ ReplicationSlotMarkDirty();
+ ReplicationSlotSave();
+}
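(Sketch of a hypothetical caller, mirroring the flow of pg_create_logical_replication_slot() in slotfuncs.c below: a slot is created RS_EPHEMERAL so that an error during setup drops it again, and is only flipped to persistent once setup has succeeded. The slot name and the omitted error handling are illustrative only.)

	ReplicationSlotCreate("myslot", true, RS_EPHEMERAL);
	/* ... failure-prone setup; an ERROR here drops the slot on release ... */
	ReplicationSlotPersist();	/* from here on the slot survives crashes */
	ReplicationSlotRelease();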
+
+/*
* Compute the oldest xmin across all slots and store it in the ProcArray.
*/
void
-ReplicationSlotsComputeRequiredXmin(void)
+ReplicationSlotsComputeRequiredXmin(bool already_locked)
{
int i;
TransactionId agg_xmin = InvalidTransactionId;
+ TransactionId agg_catalog_xmin = InvalidTransactionId;
Assert(ReplicationSlotCtl != NULL);
- LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+ if (!already_locked)
+ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+
for (i = 0; i < max_replication_slots; i++)
{
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
TransactionId effective_xmin;
+ TransactionId effective_catalog_xmin;
if (!s->in_use)
continue;
@@ -543,6 +582,7 @@ ReplicationSlotsComputeRequiredXmin(void)
SpinLockAcquire(&s->mutex);
effective_xmin = vslot->effective_xmin;
+ effective_catalog_xmin = vslot->effective_catalog_xmin;
SpinLockRelease(&s->mutex);
}
@@ -551,10 +591,18 @@ ReplicationSlotsComputeRequiredXmin(void)
(!TransactionIdIsValid(agg_xmin) ||
TransactionIdPrecedes(effective_xmin, agg_xmin)))
agg_xmin = effective_xmin;
+
+ /* check the catalog xmin */
+ if (TransactionIdIsValid(effective_catalog_xmin) &&
+ (!TransactionIdIsValid(agg_catalog_xmin) ||
+ TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin)))
+ agg_catalog_xmin = effective_catalog_xmin;
}
- LWLockRelease(ReplicationSlotControlLock);
- ProcArraySetReplicationSlotXmin(agg_xmin);
+ if (!already_locked)
+ LWLockRelease(ReplicationSlotControlLock);
+
+ ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked);
}
/*
@@ -596,6 +644,110 @@ ReplicationSlotsComputeRequiredLSN(void)
}
/*
+ * Compute the oldest WAL LSN required by *logical* decoding slots.
+ *
+ * Returns InvalidXLogRecPtr if logical decoding is disabled or no logical
+ * slots exist.
+ *
+ * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it
+ * ignores physical replication slots.
+ *
+ * The results aren't required frequently, so we don't maintain a precomputed
+ * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin().
+ */
+XLogRecPtr
+ReplicationSlotsComputeLogicalRestartLSN(void)
+{
+ XLogRecPtr result = InvalidXLogRecPtr;
+ int i;
+
+ if (max_replication_slots <= 0)
+ return InvalidXLogRecPtr;
+
+ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+
+ for (i = 0; i < max_replication_slots; i++)
+ {
+ volatile ReplicationSlot *s;
+ XLogRecPtr restart_lsn;
+
+ s = &ReplicationSlotCtl->replication_slots[i];
+
+ /* cannot change while ReplicationSlotControlLock is held */
+ if (!s->in_use)
+ continue;
+
+ /* we're only interested in logical slots */
+ if (s->data.database == InvalidOid)
+ continue;
+
+ /* read once, it's ok if it increases while we're checking */
+ SpinLockAcquire(&s->mutex);
+ restart_lsn = s->data.restart_lsn;
+ SpinLockRelease(&s->mutex);
+
+ if (result == InvalidXLogRecPtr ||
+ restart_lsn < result)
+ result = restart_lsn;
+ }
+
+ LWLockRelease(ReplicationSlotControlLock);
+
+ return result;
+}
+
+/*
+ * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the
+ * passed database oid.
+ *
+ * Returns true if there are any slots referencing the database. *nslots will
+ * be set to the absolute number of slots in the database, *nactive to the
+ * number currently active.
+ */
+bool
+ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
+{
+ int i;
+
+ *nslots = *nactive = 0;
+
+ if (max_replication_slots <= 0)
+ return false;
+
+ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+ for (i = 0; i < max_replication_slots; i++)
+ {
+ volatile ReplicationSlot *s;
+
+ s = &ReplicationSlotCtl->replication_slots[i];
+
+ /* cannot change while ReplicationSlotControlLock is held */
+ if (!s->in_use)
+ continue;
+
+ /* not database specific, skip */
+ if (s->data.database == InvalidOid)
+ continue;
+
+ /* not our database, skip */
+ if (s->data.database != dboid)
+ continue;
+
+ /* count slots with spinlock held */
+ SpinLockAcquire(&s->mutex);
+ (*nslots)++;
+ if (s->active)
+ (*nactive)++;
+ SpinLockRelease(&s->mutex);
+ }
+ LWLockRelease(ReplicationSlotControlLock);
+
+ if (*nslots > 0)
+ return true;
+ return false;
+}
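(Sketch of a hypothetical caller, e.g. code refusing to drop a database that still has slots; db_oid, the error code, and the message wording are illustrative assumptions, not taken from the patch.)

	int		nslots,
			nactive;

	if (ReplicationSlotsCountDBSlots(db_oid, &nslots, &nactive))
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_IN_USE),
				 errmsg("database is used by %d replication slot(s), %d of them active",
						nslots, nactive)));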
+
+
+/*
* Check whether the server's configuration supports using replication
* slots.
*/
@@ -723,7 +875,7 @@ StartupReplicationSlots(XLogRecPtr checkPointRedo)
return;
/* Now that we have recovered all the data, compute replication xmin */
- ReplicationSlotsComputeRequiredXmin();
+ ReplicationSlotsComputeRequiredXmin(false);
ReplicationSlotsComputeRequiredLSN();
}
@@ -1050,8 +1202,19 @@ RestoreSlotFromDisk(const char *name)
memcpy(&slot->data, &cp.slotdata,
sizeof(ReplicationSlotPersistentData));
+ /* Don't restore the slot if it's not marked as persistent. */
+ if (slot->data.persistency != RS_PERSISTENT)
+ return;
+
/* initialize in memory state */
slot->effective_xmin = cp.slotdata.xmin;
+ slot->effective_catalog_xmin = cp.slotdata.catalog_xmin;
+
+ slot->candidate_catalog_xmin = InvalidTransactionId;
+ slot->candidate_xmin_lsn = InvalidXLogRecPtr;
+ slot->candidate_restart_lsn = InvalidXLogRecPtr;
+ slot->candidate_restart_valid = InvalidXLogRecPtr;
+
slot->in_use = true;
slot->active = false;
diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
index 5acd2bae19c..c9416b03eee 100644
--- a/src/backend/replication/slotfuncs.c
+++ b/src/backend/replication/slotfuncs.c
@@ -15,13 +15,13 @@
#include "funcapi.h"
#include "miscadmin.h"
+
#include "access/htup_details.h"
+#include "replication/slot.h"
+#include "replication/logical.h"
+#include "replication/logicalfuncs.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
-#include "replication/slot.h"
-
-Datum pg_create_physical_replication_slot(PG_FUNCTION_ARGS);
-Datum pg_drop_replication_slot(PG_FUNCTION_ARGS);
static void
check_permissions(void)
@@ -54,7 +54,7 @@ pg_create_physical_replication_slot(PG_FUNCTION_ARGS)
elog(ERROR, "return type must be a row type");
/* acquire replication slot, this will check for conflicting names*/
- ReplicationSlotCreate(NameStr(*name), false);
+ ReplicationSlotCreate(NameStr(*name), false, RS_PERSISTENT);
values[0] = NameGetDatum(&MyReplicationSlot->data.name);
@@ -69,6 +69,68 @@ pg_create_physical_replication_slot(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(result);
}
+
+/*
+ * SQL function for creating a new logical replication slot.
+ */
+Datum
+pg_create_logical_replication_slot(PG_FUNCTION_ARGS)
+{
+ Name name = PG_GETARG_NAME(0);
+ Name plugin = PG_GETARG_NAME(1);
+
+ LogicalDecodingContext *ctx = NULL;
+
+ TupleDesc tupdesc;
+ HeapTuple tuple;
+ Datum result;
+ Datum values[2];
+ bool nulls[2];
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ check_permissions();
+
+ CheckLogicalDecodingRequirements();
+
+ Assert(!MyReplicationSlot);
+
+ /*
+ * Acquire a logical decoding slot, this will check for conflicting
+ * names.
+ */
+ ReplicationSlotCreate(NameStr(*name), true, RS_EPHEMERAL);
+
+ /*
+ * Create logical decoding context, to build the initial snapshot.
+ */
+ ctx = CreateInitDecodingContext(
+ NameStr(*plugin), NIL,
+ logical_read_local_xlog_page, NULL, NULL);
+
+ /* build initial snapshot, might take a while */
+ DecodingContextFindStartpoint(ctx);
+
+ values[0] = CStringGetTextDatum(NameStr(MyReplicationSlot->data.name));
+ values[1] = LSNGetDatum(MyReplicationSlot->data.confirmed_flush);
+
+ /* don't need the decoding context anymore */
+ FreeDecodingContext(ctx);
+
+ memset(nulls, 0, sizeof(nulls));
+
+ tuple = heap_form_tuple(tupdesc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+
+ /* ok, slot is now fully created, mark it as persistent */
+ ReplicationSlotPersist();
+ ReplicationSlotRelease();
+
+ PG_RETURN_DATUM(result);
+}
+
+
/*
* SQL function for dropping a replication slot.
*/
@@ -92,7 +154,7 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS)
Datum
pg_get_replication_slots(PG_FUNCTION_ARGS)
{
-#define PG_STAT_GET_REPLICATION_SLOTS_COLS 6
+#define PG_GET_REPLICATION_SLOTS_COLS 8
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
TupleDesc tupdesc;
Tuplestorestate *tupstore;
@@ -134,15 +196,16 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
for (slotno = 0; slotno < max_replication_slots; slotno++)
{
ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[slotno];
- Datum values[PG_STAT_GET_REPLICATION_SLOTS_COLS];
- bool nulls[PG_STAT_GET_REPLICATION_SLOTS_COLS];
+ Datum values[PG_GET_REPLICATION_SLOTS_COLS];
+ bool nulls[PG_GET_REPLICATION_SLOTS_COLS];
TransactionId xmin;
+ TransactionId catalog_xmin;
XLogRecPtr restart_lsn;
bool active;
Oid database;
NameData slot_name;
-
+ NameData plugin;
int i;
SpinLockAcquire(&slot->mutex);
@@ -154,9 +217,11 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
else
{
xmin = slot->data.xmin;
+ catalog_xmin = slot->data.catalog_xmin;
database = slot->data.database;
restart_lsn = slot->data.restart_lsn;
namecpy(&slot_name, &slot->data.name);
+ namecpy(&plugin, &slot->data.plugin);
active = slot->active;
}
@@ -166,19 +231,34 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
i = 0;
values[i++] = NameGetDatum(&slot_name);
+
+ if (database == InvalidOid)
+ nulls[i++] = true;
+ else
+ values[i++] = NameGetDatum(&plugin);
+
if (database == InvalidOid)
values[i++] = CStringGetTextDatum("physical");
else
values[i++] = CStringGetTextDatum("logical");
+
if (database == InvalidOid)
nulls[i++] = true;
else
values[i++] = database;
+
values[i++] = BoolGetDatum(active);
+
if (xmin != InvalidTransactionId)
values[i++] = TransactionIdGetDatum(xmin);
else
nulls[i++] = true;
+
+ if (catalog_xmin != InvalidTransactionId)
+ values[i++] = TransactionIdGetDatum(catalog_xmin);
+ else
+ nulls[i++] = true;
+
if (restart_lsn != InvalidTransactionId)
values[i++] = LSNGetDatum(restart_lsn);
else
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index e31977eee02..43db10851c3 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -1147,7 +1147,7 @@ XLogWalRcvSendHSFeedback(bool immed)
* everything else has been checked.
*/
if (hot_standby_feedback)
- xmin = GetOldestXmin(true, false);
+ xmin = GetOldestXmin(NULL, false);
else
xmin = InvalidTransactionId;
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 048367af299..5227eab414f 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -55,6 +55,7 @@
#include "replication/basebackup.h"
#include "replication/slot.h"
#include "replication/syncrep.h"
+#include "replication/slot.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "replication/walsender_private.h"
@@ -434,7 +435,7 @@ StartReplication(StartReplicationCmd *cmd)
if (MyReplicationSlot->data.database != InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- (errmsg("cannot use a replication slot created for changeset extraction for streaming replication"))));
+ (errmsg("cannot use a logical replication slot for physical replication"))));
}
/*
@@ -656,7 +657,9 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
sendTimeLineIsHistoric = false;
sendTimeLine = ThisTimeLineID;
- ReplicationSlotCreate(cmd->slotname, cmd->kind == REPLICATION_KIND_LOGICAL);
+ ReplicationSlotCreate(cmd->slotname,
+ cmd->kind == REPLICATION_KIND_LOGICAL,
+ RS_PERSISTENT);
initStringInfo(&output_message);
@@ -766,7 +769,7 @@ exec_replication_command(const char *cmd_string)
if (cmd->kind == REPLICATION_KIND_PHYSICAL)
StartReplication(cmd);
else
- elog(ERROR, "cannot handle changeset extraction yet");
+ elog(ERROR, "cannot handle logical decoding yet");
break;
}
@@ -1017,7 +1020,7 @@ ProcessStandbyReplyMessage(void)
if (MyReplicationSlot && flushPtr != InvalidXLogRecPtr)
{
if (MyReplicationSlot->data.database != InvalidOid)
- elog(ERROR, "cannot handle changeset extraction yet");
+ elog(ERROR, "cannot handle logical decoding yet");
else
PhysicalConfirmReceivedLocation(flushPtr);
}
@@ -1050,7 +1053,7 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin)
if (changed)
{
ReplicationSlotMarkDirty();
- ReplicationSlotsComputeRequiredXmin();
+ ReplicationSlotsComputeRequiredXmin(false);
}
}
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index eac418442d3..3376a353a40 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -50,11 +50,13 @@
#include "access/transam.h"
#include "access/xact.h"
#include "access/twophase.h"
+#include "catalog/catalog.h"
#include "miscadmin.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/spin.h"
#include "utils/builtins.h"
+#include "utils/rel.h"
#include "utils/snapmgr.h"
@@ -84,6 +86,8 @@ typedef struct ProcArrayStruct
/* oldest xmin of any replication slot */
TransactionId replication_slot_xmin;
+ /* oldest catalog xmin of any replication slot */
+ TransactionId replication_slot_catalog_xmin;
/*
* We declare pgprocnos[] as 1 entry because C wants a fixed-size array,
@@ -1108,21 +1112,22 @@ TransactionIdIsActive(TransactionId xid)
* GetOldestXmin -- returns oldest transaction that was running
* when any current transaction was started.
*
- * If allDbs is TRUE then all backends are considered; if allDbs is FALSE
- * then only backends running in my own database are considered.
+ * If rel is NULL or a shared relation, all backends are considered, otherwise
+ * only backends running in this database are considered.
*
* If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are
* ignored.
*
- * This is used by VACUUM to decide which deleted tuples must be preserved
- * in a table. allDbs = TRUE is needed for shared relations, but allDbs =
- * FALSE is sufficient for non-shared relations, since only backends in my
- * own database could ever see the tuples in them. Also, we can ignore
- * concurrently running lazy VACUUMs because (a) they must be working on other
- * tables, and (b) they don't need to do snapshot-based lookups.
+ * This is used by VACUUM to decide which deleted tuples must be preserved in
+ * the passed in table. For shared relations backends in all databases must be
+ * considered, but for non-shared relations that's not required, since only
+ * backends in my own database could ever see the tuples in them. Also, we can
+ * ignore concurrently running lazy VACUUMs because (a) they must be working
+ * on other tables, and (b) they don't need to do snapshot-based lookups.
*
- * This is also used to determine where to truncate pg_subtrans. allDbs
- * must be TRUE for that case, and ignoreVacuum FALSE.
+ * This is also used to determine where to truncate pg_subtrans. For that
+ * backends in all databases have to be considered, so rel = NULL has to be
+ * passed in.
*
* Note: we include all currently running xids in the set of considered xids.
* This ensures that if a just-started xact has not yet set its snapshot,
@@ -1133,7 +1138,7 @@ TransactionIdIsActive(TransactionId xid)
* backwards on repeated calls. The calculated value is conservative, so that
* anything older is definitely not considered as running by anyone anymore,
* but the exact value calculated depends on a number of things. For example,
- * if allDbs is FALSE and there are no transactions running in the current
+ * if rel = NULL and there are no transactions running in the current
* database, GetOldestXmin() returns latestCompletedXid. If a transaction
* begins after that, its xmin will include in-progress transactions in other
* databases that started earlier, so another call will return a lower value.
@@ -1152,12 +1157,22 @@ TransactionIdIsActive(TransactionId xid)
* GetOldestXmin() move backwards, with no consequences for data integrity.
*/
TransactionId
-GetOldestXmin(bool allDbs, bool ignoreVacuum)
+GetOldestXmin(Relation rel, bool ignoreVacuum)
{
ProcArrayStruct *arrayP = procArray;
TransactionId result;
int index;
+ bool allDbs;
+
volatile TransactionId replication_slot_xmin = InvalidTransactionId;
+ volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+
+ /*
+ * If we're not computing a relation specific limit, or if a shared
+ * relation has been passed in, backends in all databases have to be
+ * considered.
+ */
+ allDbs = rel == NULL || rel->rd_rel->relisshared;
/* Cannot look for individual databases during recovery */
Assert(allDbs || !RecoveryInProgress());
@@ -1180,6 +1195,13 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
volatile PGPROC *proc = &allProcs[pgprocno];
volatile PGXACT *pgxact = &allPgXact[pgprocno];
+ /*
+ * Backend is doing logical decoding which manages xmin separately,
+ * check below.
+ */
+ if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
+ continue;
+
if (ignoreVacuum && (pgxact->vacuumFlags & PROC_IN_VACUUM))
continue;
@@ -1211,6 +1233,7 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
/* fetch into volatile var while ProcArrayLock is held */
replication_slot_xmin = procArray->replication_slot_xmin;
+ replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
if (RecoveryInProgress())
{
@@ -1259,6 +1282,18 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
NormalTransactionIdPrecedes(replication_slot_xmin, result))
result = replication_slot_xmin;
+ /*
+ * After locks have been released and defer_cleanup_age has been applied,
+ * check whether we need to back up further to make logical decoding
+ * possible. We need to do so if we're computing the global limit (rel =
+ * NULL) or if the passed relation is a catalog relation of some kind.
+ */
+ if ((rel == NULL ||
+ RelationIsAccessibleInLogicalDecoding(rel)) &&
+ TransactionIdIsValid(replication_slot_catalog_xmin) &&
+ NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result))
+ result = replication_slot_catalog_xmin;
+
return result;
}
@@ -1313,6 +1348,8 @@ GetMaxSnapshotSubxidCount(void)
* RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
* running transactions, except those running LAZY VACUUM). This is
* the same computation done by GetOldestXmin(true, true).
+ * RecentGlobalDataXmin: the global xmin for non-catalog tables
+ * >= RecentGlobalXmin
*
* Note: this function should probably not be called with an argument that's
* not statically allocated (see xip allocation below).
@@ -1329,6 +1366,7 @@ GetSnapshotData(Snapshot snapshot)
int subcount = 0;
bool suboverflowed = false;
volatile TransactionId replication_slot_xmin = InvalidTransactionId;
+ volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
Assert(snapshot != NULL);
@@ -1397,6 +1435,13 @@ GetSnapshotData(Snapshot snapshot)
volatile PGXACT *pgxact = &allPgXact[pgprocno];
TransactionId xid;
+ /*
+ * Backend is doing logical decoding which manages xmin
+ * separately, check below.
+ */
+ if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
+ continue;
+
/* Ignore procs running LAZY VACUUM */
if (pgxact->vacuumFlags & PROC_IN_VACUUM)
continue;
@@ -1509,6 +1554,7 @@ GetSnapshotData(Snapshot snapshot)
/* fetch into volatile var while ProcArrayLock is held */
replication_slot_xmin = procArray->replication_slot_xmin;
+ replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
if (!TransactionIdIsValid(MyPgXact->xmin))
MyPgXact->xmin = TransactionXmin = xmin;
@@ -1533,6 +1579,17 @@ GetSnapshotData(Snapshot snapshot)
NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin))
RecentGlobalXmin = replication_slot_xmin;
+ /* Non-catalog tables can be vacuumed if older than this xid */
+ RecentGlobalDataXmin = RecentGlobalXmin;
+
+ /*
+ * Check whether there's a replication slot requiring an older catalog
+ * xmin.
+ */
+ if (TransactionIdIsNormal(replication_slot_catalog_xmin) &&
+ NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin))
+ RecentGlobalXmin = replication_slot_catalog_xmin;
+
RecentXmin = xmin;
snapshot->xmin = xmin;
@@ -1633,9 +1690,11 @@ ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid)
* Similar to GetSnapshotData but returns more information. We include
* all PGXACTs with an assigned TransactionId, even VACUUM processes.
*
- * We acquire XidGenLock, but the caller is responsible for releasing it.
- * This ensures that no new XIDs enter the proc array until the caller has
- * WAL-logged this snapshot, and releases the lock.
+ * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
+ * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc
+ * array until the caller has WAL-logged this snapshot, and releases the
+ * lock. Acquiring ProcArrayLock ensures that no transactions commit until the
+ * lock is released.
*
* The returned data structure is statically allocated; caller should not
* modify it, and must not assume it is valid past the next call.
@@ -1770,6 +1829,15 @@ GetRunningTransactionData(void)
}
}
+ /*
+ * It's important *not* to include the limits set by slots here because
+ * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those
+ * were to be included here the initial value could never increase because
+ * of a circular dependency where slots only increase their limits when
+ * running xacts increases oldestRunningXid and running xacts only
+ * increases if slots do.
+ */
+
CurrentRunningXacts->xcnt = count - subcount;
CurrentRunningXacts->subxcnt = subcount;
CurrentRunningXacts->subxid_overflow = suboverflowed;
@@ -1777,13 +1845,12 @@ GetRunningTransactionData(void)
CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
- /* We don't release XidGenLock here, the caller is responsible for that */
- LWLockRelease(ProcArrayLock);
-
Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
+ /* We don't release the locks here, the caller is responsible for that */
+
return CurrentRunningXacts;
}
@@ -1853,6 +1920,92 @@ GetOldestActiveTransactionId(void)
}
/*
+ * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum
+ *
+ * Returns the oldest xid that we can guarantee not to have been affected by
+ * vacuum, i.e. no rows >= that xid have been vacuumed away unless the
+ * transaction aborted. Note that the value can (and most of the time will) be
+ * much more conservative than what really has been affected by vacuum, but we
+ * currently don't have better data available.
+ *
+ * This is useful to initialize the cutoff xid after which a new changeset
+ * extraction replication slot can start decoding changes.
+ *
+ * Must be called with ProcArrayLock held either shared or exclusively,
+ * although most callers will want to use exclusive mode since it is expected
+ * that the caller will immediately use the xid to peg the xmin horizon.
+ */
+TransactionId
+GetOldestSafeDecodingTransactionId(void)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId oldestSafeXid;
+ int index;
+ bool recovery_in_progress = RecoveryInProgress();
+
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ /*
+ * Acquire XidGenLock, so no transactions can acquire an xid while we're
+ * running. Otherwise, if no transaction with an assigned xid were running
+ * concurrently, a newly assigned xid could influence RecentXmin et al.
+ *
+ * We initialize the computation to nextXid since that's guaranteed to be
+ * a safe, albeit pessimal, value.
+ */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ oldestSafeXid = ShmemVariableCache->nextXid;
+
+ /*
+ * If there's already a slot pegging the xmin horizon, we can start with
+ * that value; it's guaranteed to be safe since it was computed by this
+ * routine initially and has been enforced ever since.
+ */
+ if (TransactionIdIsValid(procArray->replication_slot_catalog_xmin) &&
+ TransactionIdPrecedes(procArray->replication_slot_catalog_xmin,
+ oldestSafeXid))
+ oldestSafeXid = procArray->replication_slot_catalog_xmin;
+
+ /*
+ * If we're not in recovery, we walk over the procarray and collect the
+ * lowest xid. Since we're called with ProcArrayLock held and have
+ * acquired XidGenLock, no entries can vanish concurrently, since
+ * PGXACT->xid is only set with XidGenLock held and only cleared with
+ * ProcArrayLock held.
+ *
+ * In recovery we can't lower the safe value besides what we've computed
+ * above, so we'll have to wait a bit longer there. We unfortunately can
+ * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids
+ * machinery can miss values and return an older value than is safe.
+ */
+ if (!recovery_in_progress)
+ {
+ /*
+ * Spin over procArray collecting min(PGXACT->xid)
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ volatile PGXACT *pgxact = &allPgXact[pgprocno];
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = pgxact->xid;
+
+ if (!TransactionIdIsNormal(xid))
+ continue;
+
+ if (TransactionIdPrecedes(xid, oldestSafeXid))
+ oldestSafeXid = xid;
+ }
+ }
+
+ LWLockRelease(XidGenLock);
+
+ return oldestSafeXid;
+}
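The locking contract described above is easiest to see from a caller's point of view. Below is a minimal, hypothetical sketch of how a slot-creation path might peg its catalog xmin; GetOldestSafeDecodingTransactionId(), ReplicationSlotsComputeRequiredXmin() and the slot fields come from this patch, but the surrounding code is illustrative only and not part of the diff:

    /* Illustrative sketch only: peg a new slot's catalog xmin. */
    TransactionId xmin_horizon;

    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

    xmin_horizon = GetOldestSafeDecodingTransactionId();

    /* MyReplicationSlot is assumed to be acquired already */
    MyReplicationSlot->effective_catalog_xmin = xmin_horizon;
    MyReplicationSlot->data.catalog_xmin = xmin_horizon;

    /* already_locked = true: we still hold ProcArrayLock */
    ReplicationSlotsComputeRequiredXmin(true);

    LWLockRelease(ProcArrayLock);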
+
+/*
* GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are
* delaying checkpoint because they have critical actions in progress.
*
@@ -2523,10 +2676,39 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
* replication slots.
*/
void
-ProcArraySetReplicationSlotXmin(TransactionId xmin)
+ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
+ bool already_locked)
{
- LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));
+
+ if (!already_locked)
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
procArray->replication_slot_xmin = xmin;
+ procArray->replication_slot_catalog_xmin = catalog_xmin;
+
+ if (!already_locked)
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ProcArrayGetReplicationSlotXmin
+ *
+ * Return the current slot xmin limits. That's useful to be able to remove
+ * data that's older than those limits.
+ */
+void
+ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
+ TransactionId *catalog_xmin)
+{
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ if (xmin != NULL)
+ *xmin = procArray->replication_slot_xmin;
+
+ if (catalog_xmin != NULL)
+ *catalog_xmin = procArray->replication_slot_catalog_xmin;
+
LWLockRelease(ProcArrayLock);
}
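Taken together, RecentGlobalXmin and RecentGlobalDataXmin give callers two pruning horizons: the former is held back by any slot's catalog_xmin, the latter is not. A hypothetical horizon-selection helper is sketched below; the name choose_prune_horizon is made up, and the real decision is made in the heap AM changes elsewhere in this patch:

    /* Hypothetical helper, for illustration only. */
    static TransactionId
    choose_prune_horizon(Relation rel)
    {
        /*
         * Catalog relations (and user relations marked as additional catalog
         * tables for decoding) must retain rows up to RecentGlobalXmin, which
         * includes the slots' catalog_xmin; everything else may use the more
         * aggressive RecentGlobalDataXmin.
         */
        if (RelationIsAccessibleInLogicalDecoding(rel))
            return RecentGlobalXmin;

        return RecentGlobalDataXmin;
    }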
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index fb5f18edfc7..aa8bea5538b 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -800,7 +800,9 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record)
/*
* Log details of the current snapshot to WAL. This allows the snapshot state
- * to be reconstructed on the standby.
+ * to be reconstructed on the standby and for logical decoding.
+ *
+ * This is used for Hot Standby as follows:
*
* We can move directly to STANDBY_SNAPSHOT_READY at startup if we
* start from a shutdown checkpoint because we know nothing was running
@@ -854,6 +856,12 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record)
* Zero xids should no longer be possible, but we may be replaying WAL
* from a time when they were possible.
*
+ * For logical decoding only the running xacts information is needed;
+ * there's no need to look at the locking information, but it's logged anyway,
+ * as there's no independent knob to just enable logical decoding. For
+ * details of how this is used, check snapbuild.c's introductory comment.
+ *
+ *
* Returns the RecPtr of the last inserted record.
*/
XLogRecPtr
@@ -879,8 +887,28 @@ LogStandbySnapshot(void)
* record we write, because standby will open up when it sees this.
*/
running = GetRunningTransactionData();
+
+ /*
+ * GetRunningTransactionData() acquired ProcArrayLock, we must release
+ * it. For Hot Standby this can be done before inserting the WAL record
+ * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
+ * the clog. For logical decoding, though, the lock can't be released
+ * early because the clog might be "in the future" from the POV of the
+ * historic snapshot. This would allow for situations where we're waiting
+ * for the end of a transaction listed in the xl_running_xacts record
+ * which, according to the WAL, has already committed before the
+ * xl_running_xacts record. Fortunately this routine isn't executed
+ * frequently, and it's only a shared lock.
+ */
+ if (wal_level < WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
recptr = LogCurrentRunningXacts(running);
+ /* Release lock if we kept it longer ... */
+ if (wal_level >= WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
/* GetRunningTransactionData() acquired XidGenLock, we must release it */
LWLockRelease(XidGenLock);
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index fa460ca82eb..f595a0747c1 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -781,10 +781,6 @@ ProcKill(int code, Datum arg)
/* Make sure we're out of the sync rep lists */
SyncRepCleanupAtProcExit();
- /* Make sure active replication slots are released */
- if (MyReplicationSlot != NULL)
- ReplicationSlotRelease();
-
#ifdef USE_ASSERT_CHECKING
if (assert_enabled)
{
@@ -803,6 +799,10 @@ ProcKill(int code, Datum arg)
*/
LWLockReleaseAll();
+ /* Make sure active replication slots are released */
+ if (MyReplicationSlot != NULL)
+ ReplicationSlotRelease();
+
/*
* Clear MyProc first; then disown the process latch. This is so that
* signal handlers won't try to clear the process latch after it's no
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index a230d7eda69..be961017d66 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -55,6 +55,7 @@
#include "pg_getopt.h"
#include "postmaster/autovacuum.h"
#include "postmaster/postmaster.h"
+#include "replication/slot.h"
#include "replication/walsender.h"
#include "rewrite/rewriteHandler.h"
#include "storage/bufmgr.h"
@@ -3854,6 +3855,16 @@ PostgresMain(int argc, char *argv[],
WalSndErrorCleanup();
/*
+ * We can't release replication slots inside AbortTransaction() as we
+ * need to be able to start and abort transactions while having a slot
+ * acquired. But we never need to hold them across top level errors,
+ * so releasing here is fine. There's another cleanup in ProcKill()
+ * ensuring we'll correctly cleanup on FATAL errors as well.
+ */
+ if (MyReplicationSlot != NULL)
+ ReplicationSlotRelease();
+
+ /*
* Now return to normal top-level context and clear ErrorContext for
* next time.
*/
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 4423fe01bdd..115bcac5d23 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -512,7 +512,7 @@ RegisterSnapshotInvalidation(Oid dbId, Oid relId)
* Only the local caches are flushed; this does not transmit the message
* to other backends.
*/
-static void
+void
LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
{
if (msg->id >= 0)
@@ -596,7 +596,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
* since that tells us we've lost some shared-inval messages and hence
* don't know what needs to be invalidated.
*/
-static void
+void
InvalidateSystemCaches(void)
{
int i;
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 2810b35eea1..32313244adb 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -73,6 +73,7 @@
#include "utils/memutils.h"
#include "utils/relmapper.h"
#include "utils/resowner_private.h"
+#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
@@ -235,7 +236,7 @@ static void formrdesc(const char *relationName, Oid relationReltype,
bool isshared, bool hasoids,
int natts, const FormData_pg_attribute *attrs);
-static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK);
+static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic);
static Relation AllocateRelationDesc(Form_pg_class relp);
static void RelationParseRelOptions(Relation relation, HeapTuple tuple);
static void RelationBuildTupleDesc(Relation relation);
@@ -274,12 +275,13 @@ static void unlink_initfile(const char *initfilename);
* and must eventually be freed with heap_freetuple.
*/
static HeapTuple
-ScanPgRelation(Oid targetRelId, bool indexOK)
+ScanPgRelation(Oid targetRelId, bool indexOK, bool force_non_historic)
{
HeapTuple pg_class_tuple;
Relation pg_class_desc;
SysScanDesc pg_class_scan;
ScanKeyData key[1];
+ Snapshot snapshot;
/*
* If something goes wrong during backend startup, we might find ourselves
@@ -305,9 +307,20 @@ ScanPgRelation(Oid targetRelId, bool indexOK)
* scan by setting indexOK == false.
*/
pg_class_desc = heap_open(RelationRelationId, AccessShareLock);
+
+ /*
+ * The caller might need a tuple that's newer than the one visible to the
+ * historic snapshot; currently the only case requiring this is looking up
+ * the relfilenode of non-mapped system relations during decoding.
+ */
+ if (force_non_historic)
+ snapshot = GetNonHistoricCatalogSnapshot(RelationRelationId);
+ else
+ snapshot = GetCatalogSnapshot(RelationRelationId);
+
pg_class_scan = systable_beginscan(pg_class_desc, ClassOidIndexId,
indexOK && criticalRelcachesBuilt,
- NULL,
+ snapshot,
1, key);
pg_class_tuple = systable_getnext(pg_class_scan);
@@ -836,7 +849,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
/*
* find the tuple in pg_class corresponding to the given relation id
*/
- pg_class_tuple = ScanPgRelation(targetRelId, true);
+ pg_class_tuple = ScanPgRelation(targetRelId, true, false);
/*
* if no such tuple exists, return NULL
@@ -989,8 +1002,42 @@ RelationInitPhysicalAddr(Relation relation)
relation->rd_node.dbNode = InvalidOid;
else
relation->rd_node.dbNode = MyDatabaseId;
+
if (relation->rd_rel->relfilenode)
+ {
+ /*
+ * Even if we are using a decoding snapshot that doesn't represent
+ * the current state of the catalog we need to make sure the
+ * filenode points to the current file since the older file will
+ * be gone (or truncated). The new file will still contain older
+ * rows so lookups in it will work correctly. This wouldn't work
+ * correctly if rewrites were allowed to change the schema in an
+ * incompatible way, but those are prevented both on catalog
+ * tables and on user tables declared as additional catalog
+ * tables.
+ */
+ if (HistoricSnapshotActive()
+ && RelationIsAccessibleInLogicalDecoding(relation)
+ && IsTransactionState())
+ {
+ HeapTuple phys_tuple;
+ Form_pg_class physrel;
+
+ phys_tuple = ScanPgRelation(RelationGetRelid(relation),
+ RelationGetRelid(relation) != ClassOidIndexId,
+ true);
+ if (!HeapTupleIsValid(phys_tuple))
+ elog(ERROR, "could not find pg_class entry for %u",
+ RelationGetRelid(relation));
+ physrel = (Form_pg_class) GETSTRUCT(phys_tuple);
+
+ relation->rd_rel->reltablespace = physrel->reltablespace;
+ relation->rd_rel->relfilenode = physrel->relfilenode;
+ heap_freetuple(phys_tuple);
+ }
+
relation->rd_node.relNode = relation->rd_rel->relfilenode;
+ }
else
{
/* Consult the relation mapper */
@@ -1742,7 +1789,7 @@ RelationReloadIndexInfo(Relation relation)
* for pg_class_oid_index ...
*/
indexOK = (RelationGetRelid(relation) != ClassOidIndexId);
- pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK);
+ pg_class_tuple = ScanPgRelation(RelationGetRelid(relation), indexOK, false);
if (!HeapTupleIsValid(pg_class_tuple))
elog(ERROR, "could not find pg_class tuple for index %u",
RelationGetRelid(relation));
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 4c0e0accc1c..4146527d2fd 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -19,6 +19,10 @@
* have regd_count = 1 and are counted in RegisteredSnapshots, but are not
* tracked by any resource owner.
*
+ * The same is true for historic snapshots used during logical decoding:
+ * their lifetime is managed separately (as they live longer than one xact.c
+ * transaction).
+ *
* These arrangements let us reset MyPgXact->xmin when there are no snapshots
* referenced by this transaction. (One possible improvement would be to be
* able to advance Xmin when the snapshot with the earliest Xmin is no longer
@@ -69,12 +73,13 @@
*/
static SnapshotData CurrentSnapshotData = {HeapTupleSatisfiesMVCC};
static SnapshotData SecondarySnapshotData = {HeapTupleSatisfiesMVCC};
-static SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC};
+SnapshotData CatalogSnapshotData = {HeapTupleSatisfiesMVCC};
/* Pointers to valid snapshots */
static Snapshot CurrentSnapshot = NULL;
static Snapshot SecondarySnapshot = NULL;
static Snapshot CatalogSnapshot = NULL;
+static Snapshot HistoricSnapshot = NULL;
/*
* Staleness detection for CatalogSnapshot.
@@ -86,13 +91,18 @@ static bool CatalogSnapshotStale = true;
* for the convenience of TransactionIdIsInProgress: even in bootstrap
* mode, we don't want it to say that BootstrapTransactionId is in progress.
*
- * RecentGlobalXmin is initialized to InvalidTransactionId, to ensure that no
- * one tries to use a stale value. Readers should ensure that it has been set
- * to something else before using it.
+ * RecentGlobalXmin and RecentGlobalDataXmin are initialized to
+ * InvalidTransactionId, to ensure that no one tries to use a stale
+ * value. Readers should ensure that it has been set to something else
+ * before using it.
*/
TransactionId TransactionXmin = FirstNormalTransactionId;
TransactionId RecentXmin = FirstNormalTransactionId;
TransactionId RecentGlobalXmin = InvalidTransactionId;
+TransactionId RecentGlobalDataXmin = InvalidTransactionId;
+
+/* (table, ctid) => (cmin, cmax) mapping during timetravel */
+static HTAB *tuplecid_data = NULL;
/*
* Elements of the active snapshot stack.
@@ -158,6 +168,18 @@ static void SnapshotResetXmin(void);
Snapshot
GetTransactionSnapshot(void)
{
+ /*
+ * Return historic snapshot if doing logical decoding. We'll never
+ * need a non-historic transaction snapshot in this (sub-)transaction, so
+ * there's no need to be careful to set one up for later calls to
+ * GetTransactionSnapshot().
+ */
+ if (HistoricSnapshotActive())
+ {
+ Assert(!FirstSnapshotSet);
+ return HistoricSnapshot;
+ }
+
/* First call in transaction? */
if (!FirstSnapshotSet)
{
@@ -214,6 +236,13 @@ GetTransactionSnapshot(void)
Snapshot
GetLatestSnapshot(void)
{
+ /*
+ * So far there are no cases requiring support for GetLatestSnapshot()
+ * during logical decoding, but it wouldn't be hard to add if
+ * required.
+ */
+ Assert(!HistoricSnapshotActive());
+
/* If first call in transaction, go ahead and set the xact snapshot */
if (!FirstSnapshotSet)
return GetTransactionSnapshot();
@@ -232,6 +261,26 @@ Snapshot
GetCatalogSnapshot(Oid relid)
{
/*
+ * Return the historic snapshot if we're doing logical decoding, but
+ * return a non-historic snapshot if we are temporarily doing up-to-date
+ * lookups.
+ */
+ if (HistoricSnapshotActive())
+ return HistoricSnapshot;
+
+ return GetNonHistoricCatalogSnapshot(relid);
+}
+
+/*
+ * GetNonHistoricCatalogSnapshot
+ * Get a snapshot that is sufficiently up-to-date for scan of the system
+ * catalog with the specified OID, even while historic snapshots are set
+ * up.
+ */
+Snapshot
+GetNonHistoricCatalogSnapshot(Oid relid)
+{
+ /*
* If the caller is trying to scan a relation that has no syscache,
* no catcache invalidations will be sent when it is updated. For a
* few key relations, snapshot invalidations are sent instead. If
@@ -303,6 +352,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
Assert(RegisteredSnapshots == 0);
Assert(FirstXactSnapshot == NULL);
+ Assert(!HistoricSnapshotActive());
/*
* Even though we are not going to use the snapshot it computes, we must
@@ -796,7 +846,7 @@ AtEOXact_Snapshot(bool isCommit)
* Returns the token (the file name) that can be used to import this
* snapshot.
*/
-static char *
+char *
ExportSnapshot(Snapshot snapshot)
{
TransactionId topXid;
@@ -1258,3 +1308,45 @@ ThereAreNoPriorRegisteredSnapshots(void)
return false;
}
+
+/*
+ * Setup a snapshot that replaces normal catalog snapshots that allows catalog
+ * access to behave just like it did at a certain point in the past.
+ *
+ * Needed for logical decoding.
+ */
+void
+SetupHistoricSnapshot(Snapshot historic_snapshot, HTAB *tuplecids)
+{
+ Assert(historic_snapshot != NULL);
+
+ /* setup the timetravel snapshot */
+ HistoricSnapshot = historic_snapshot;
+
+ /* setup (cmin, cmax) lookup hash */
+ tuplecid_data = tuplecids;
+}
+
+
+/*
+ * Make catalog snapshots behave normally again.
+ */
+void
+TeardownHistoricSnapshot(bool is_error)
+{
+ HistoricSnapshot = NULL;
+ tuplecid_data = NULL;
+}
+
+bool
+HistoricSnapshotActive(void)
+{
+ return HistoricSnapshot != NULL;
+}
+
+HTAB *
+HistoricSnapshotGetTupleCids(void)
+{
+ Assert(HistoricSnapshotActive());
+ return tuplecid_data;
+}
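The pair SetupHistoricSnapshot()/TeardownHistoricSnapshot() is meant to bracket catalog access while a decoded transaction is replayed. The sketch below shows that usage; snapshot_now and txn are placeholders, and the real call sites live in reorderbuffer.c rather than in this hunk:

    /* Sketch only: bracket catalog access with a historic snapshot. */
    SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);

    PG_TRY();
    {
        /*
         * From here on GetCatalogSnapshot()/GetTransactionSnapshot() hand back
         * the historic snapshot, so syscache and relcache lookups see the
         * catalog as it looked when the decoded transaction ran.
         */
        Assert(HistoricSnapshotActive());
        /* ... invoke output plugin callbacks ... */
    }
    PG_CATCH();
    {
        TeardownHistoricSnapshot(true);     /* is_error */
        PG_RE_THROW();
    }
    PG_END_TRY();

    TeardownHistoricSnapshot(false);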
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index f6267552573..c4732ed3110 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -62,6 +62,9 @@
#include "access/xact.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
+#include "utils/builtins.h"
+#include "utils/combocid.h"
+#include "utils/snapmgr.h"
#include "utils/tqual.h"
@@ -73,7 +76,6 @@ SnapshotData SnapshotToastData = {HeapTupleSatisfiesToast};
/* local functions */
static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot);
-
/*
* SetHintBits()
*
@@ -1545,3 +1547,163 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
*/
return true;
}
+
+/*
+ * Check whether the transaction id 'xid' is in the pre-sorted array 'xip'.
+ */
+static bool
+TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
+{
+ return bsearch(&xid, xip, num,
+ sizeof(TransactionId), xidComparator) != NULL;
+}
+
+/*
+ * See the comments for HeapTupleSatisfiesMVCC for the semantics this function
+ * obeys.
+ *
+ * Only usable on tuples from catalog tables!
+ *
+ * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support
+ * reading catalog pages which couldn't have been created in an older version.
+ *
+ * We don't set any hint bits in here, as it seems unlikely to be beneficial:
+ * those should already be set by normal access, and it seems too dangerous
+ * to do so anyway, since the semantics of doing so during timetravel are
+ * more complicated than when dealing "only" with the present.
+ */
+bool
+HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
+ Buffer buffer)
+{
+ HeapTupleHeader tuple = htup->t_data;
+ TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+ TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple);
+
+ Assert(ItemPointerIsValid(&htup->t_self));
+ Assert(htup->t_tableOid != InvalidOid);
+
+ /* inserting transaction aborted */
+ if (HeapTupleHeaderXminInvalid(tuple))
+ {
+ Assert(!TransactionIdDidCommit(xmin));
+ return false;
+ }
+ /* check if its one of our txids, toplevel is also in there */
+ else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt))
+ {
+ bool resolved;
+ CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple);
+ CommandId cmax = InvalidCommandId;
+
+ /*
+ * another transaction might have (tried to) delete this tuple or
+ * cmin/cmax was stored in a combocid. So we need to look up the
+ * actual values externally.
+ */
+ resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot,
+ htup, buffer,
+ &cmin, &cmax);
+
+ if (!resolved)
+ elog(ERROR, "could not resolve cmin/cmax of catalog tuple");
+
+ Assert(cmin != InvalidCommandId);
+
+ if (cmin >= snapshot->curcid)
+ return false; /* inserted after scan started */
+ /* fall through */
+ }
+ /* committed before our xmin horizon. Do a normal visibility check. */
+ else if (TransactionIdPrecedes(xmin, snapshot->xmin))
+ {
+ Assert(!(HeapTupleHeaderXminCommitted(tuple) &&
+ !TransactionIdDidCommit(xmin)));
+
+ /* check for hint bit first, consult clog afterwards */
+ if (!HeapTupleHeaderXminCommitted(tuple) &&
+ !TransactionIdDidCommit(xmin))
+ return false;
+ /* fall through */
+ }
+ /* beyond our xmax horizon, i.e. invisible */
+ else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax))
+ {
+ return false;
+ }
+ /* check if it's a committed transaction in [xmin, xmax) */
+ else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt))
+ {
+ /* fall through */
+ }
+ /*
+ * none of the above, i.e. between [xmin, xmax) but hasn't
+ * committed. I.e. invisible.
+ */
+ else
+ {
+ return false;
+ }
+
+ /* at this point we know xmin is visible, go on to check xmax */
+
+ /* xid invalid or aborted */
+ if (tuple->t_infomask & HEAP_XMAX_INVALID)
+ return true;
+ /* locked tuples are always visible */
+ else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+ return true;
+ /*
+ * We can see multis here if we're looking at user tables or if
+ * somebody did SELECT ... FOR SHARE/UPDATE on a system table.
+ */
+ else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
+ {
+ xmax = HeapTupleGetUpdateXid(tuple);
+ }
+
+ /* check if its one of our txids, toplevel is also in there */
+ if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt))
+ {
+ bool resolved;
+ CommandId cmin;
+ CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple);
+
+ /* Lookup actual cmin/cmax values */
+ resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot,
+ htup, buffer,
+ &cmin, &cmax);
+
+ if (!resolved)
+ elog(ERROR, "could not resolve combocid to cmax");
+
+ Assert(cmax != InvalidCommandId);
+
+ if (cmax >= snapshot->curcid)
+ return true; /* deleted after scan started */
+ else
+ return false; /* deleted before scan started */
+ }
+ /* below xmin horizon, normal transaction state is valid */
+ else if (TransactionIdPrecedes(xmax, snapshot->xmin))
+ {
+ Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED &&
+ !TransactionIdDidCommit(xmax)));
+
+ /* check hint bit first */
+ if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
+ return false;
+
+ /* check clog */
+ return !TransactionIdDidCommit(xmax);
+ }
+ /* above xmax horizon, we cannot possibly see the deleting transaction */
+ else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax))
+ return true;
+ /* xmax is between [xmin, xmax), check known committed array */
+ else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt))
+ return false;
+ /* xmax is between [xmin, xmax), but known not to have committed yet */
+ else
+ return true;
+}
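Because TransactionIdInArray() uses bsearch(), a snapshot handed to HeapTupleSatisfiesHistoricMVCC() must have its xip and subxip arrays pre-sorted (xidComparator order) and its satisfies pointer set to this routine. A sketch of that setup, assuming snap is a snapshot being assembled by the snapshot builder (illustrative, not the actual snapbuild.c code):

    /* Sketch: prepare a snapshot for use with HeapTupleSatisfiesHistoricMVCC. */
    snap->satisfies = HeapTupleSatisfiesHistoricMVCC;

    /* xids known to have committed in [xmin, xmax) -- must be sorted for bsearch() */
    qsort(snap->xip, snap->xcnt, sizeof(TransactionId), xidComparator);

    /* the decoded transaction's own (sub)xids -- likewise sorted */
    qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);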
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 71248ee1bcf..a7d5f7a153b 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -198,7 +198,10 @@ static const char *subdirs[] = {
"pg_replslot",
"pg_tblspc",
"pg_stat",
- "pg_stat_tmp"
+ "pg_stat_tmp",
+ "pg_llog",
+ "pg_llog/snapshots",
+ "pg_llog/mappings"
};
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index bfdadc3d5bb..0f802577c70 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -164,8 +164,7 @@ extern void heap_restrpos(HeapScanDesc scan);
extern void heap_sync(Relation relation);
/* in heap/pruneheap.c */
-extern void heap_page_prune_opt(Relation relation, Buffer buffer,
- TransactionId OldestXmin);
+extern void heap_page_prune_opt(Relation relation, Buffer buffer);
extern int heap_page_prune(Relation relation, Buffer buffer,
TransactionId OldestXmin,
bool report_stats, TransactionId *latestRemovedXid);
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index d4383ab2cbe..194635952cb 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -48,7 +48,7 @@
* the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to
* these, too.
*/
-/* 0x00 is free, was XLOG_HEAP2_FREEZE */
+#define XLOG_HEAP2_REWRITE 0x00
#define XLOG_HEAP2_CLEAN 0x10
#define XLOG_HEAP2_FREEZE_PAGE 0x20
#define XLOG_HEAP2_CLEANUP_INFO 0x30
@@ -332,6 +332,17 @@ typedef struct xl_heap_new_cid
xl_heaptid target;
} xl_heap_new_cid;
+/* logical rewrite xlog record header */
+typedef struct xl_heap_rewrite_mapping
+{
+ TransactionId mapped_xid; /* xid that might need to see the row */
+ Oid mapped_db; /* DbOid or InvalidOid for shared rels */
+ Oid mapped_rel; /* Oid of the mapped relation */
+ off_t offset; /* How far have we written so far */
+ uint32 num_mappings; /* Number of in-memory mappings */
+ XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */
+} xl_heap_rewrite_mapping;
+
#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target) + SizeOfHeapTid)
extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
@@ -341,6 +352,7 @@ extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
+extern void heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r);
extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode,
TransactionId latestRemovedXid);
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index d098a0b1711..07df3b4f2b0 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -14,12 +14,14 @@
#define REWRITE_HEAP_H
#include "access/htup.h"
+#include "storage/itemptr.h"
+#include "storage/relfilenode.h"
#include "utils/relcache.h"
/* struct definition is private to rewriteheap.c */
typedef struct RewriteStateData *RewriteState;
-extern RewriteState begin_heap_rewrite(Relation NewHeap,
+extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
TransactionId OldestXmin, TransactionId FreezeXid,
MultiXactId MultiXactCutoff, bool use_wal);
extern void end_heap_rewrite(RewriteState state);
@@ -27,4 +29,29 @@ extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
HeapTuple newTuple);
extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple oldTuple);
+/*
+ * On-Disk data format for an individual logical rewrite mapping.
+ */
+typedef struct LogicalRewriteMappingData
+{
+ RelFileNode old_node;
+ RelFileNode new_node;
+ ItemPointerData old_tid;
+ ItemPointerData new_tid;
+} LogicalRewriteMappingData;
+
+/* ---
+ * The filename consists of the following dash-separated
+ * components:
+ * 1) database oid or InvalidOid for shared relations
+ * 2) the oid of the relation
+ * 3) xid we are mapping for
+ * 4) upper 32 bits of the LSN at which a rewrite started
+ * 5) lower 32 bits of the LSN at which a rewrite started
+ * 6) xid of the xact performing the mapping
+ * ---
+ */
+#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x"
+extern void CheckPointLogicalRewriteHeap(void);
+
#endif /* REWRITE_HEAP_H */
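The comment above lists the components of a mapping file name; the sketch below shows how such a path might be assembled. All variables (dboid, relid, mapped_xid, start_lsn, my_xid) are placeholders, and the actual code lives in rewriteheap.c; the pg_llog/mappings directory is the one created by initdb later in this patch:

    /* Sketch of assembling a logical rewrite mapping file path. */
    char        path[MAXPGPATH];

    snprintf(path, sizeof(path), "pg_llog/mappings/" LOGICAL_REWRITE_FORMAT,
             dboid,                          /* 1) database oid, or InvalidOid */
             relid,                          /* 2) oid of the mapped relation */
             mapped_xid,                     /* 3) xid we are mapping for */
             (uint32) (start_lsn >> 32),     /* 4) upper 32 bits of the LSN */
             (uint32) start_lsn,             /* 5) lower 32 bits of the LSN */
             my_xid);                        /* 6) xid performing the mapping */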
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 8376dfd669b..a9774e9f593 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -63,6 +63,11 @@
(AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
(int32) ((id1) - (id2)) < 0)
+/* compare two XIDs already known to be normal; this is a macro for speed */
+#define NormalTransactionIdFollows(id1, id2) \
+ (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \
+ (int32) ((id1) - (id2)) > 0)
+
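The (int32) cast in the comparison macros above is what makes them wraparound-safe: the difference is taken modulo 2^32 and then interpreted as signed, so an xid assigned shortly after the counter wrapped still "follows" one assigned shortly before it. A worked example with illustrative values:

    TransactionId id1 = 3;              /* assigned just after the xid counter wrapped */
    TransactionId id2 = 4294967290U;    /* assigned just before the wraparound */

    /* (int32) (3 - 4294967290) == 9, which is > 0, so id1 follows id2 */
    Assert(NormalTransactionIdFollows(id1, id2));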
/* ----------
* Object ID (OID) zero is InvalidOid.
*
diff --git a/src/include/access/tuptoaster.h b/src/include/access/tuptoaster.h
index 5adf4f28169..296d016c9fc 100644
--- a/src/include/access/tuptoaster.h
+++ b/src/include/access/tuptoaster.h
@@ -98,9 +98,34 @@
/* Size of an EXTERNAL datum that contains a standard TOAST pointer */
#define TOAST_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(struct varatt_external))
-/* Size of an indirect datum that contains an indirect TOAST pointer */
+/* Size of an indirect datum that contains a standard TOAST pointer */
#define INDIRECT_POINTER_SIZE (VARHDRSZ_EXTERNAL + sizeof(struct varatt_indirect))
+/*
+ * Testing whether an externally-stored value is compressed now requires
+ * comparing extsize (the actual length of the external data) to rawsize
+ * (the original uncompressed datum's size). The latter includes VARHDRSZ
+ * overhead, the former doesn't. We never use compression unless it actually
+ * saves space, so we expect either equality or less-than.
+ */
+#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \
+ ((toast_pointer).va_extsize < (toast_pointer).va_rawsize - VARHDRSZ)
+
+/*
+ * Macro to fetch the possibly-unaligned contents of an EXTERNAL datum
+ * into a local "struct varatt_external" toast pointer. This should be
+ * just a memcpy, but some versions of gcc seem to produce broken code
+ * that assumes the datum contents are aligned. Introducing an explicit
+ * intermediate "varattrib_1b_e *" variable seems to fix it.
+ */
+#define VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr) \
+do { \
+ varattrib_1b_e *attre = (varattrib_1b_e *) (attr); \
+ Assert(VARATT_IS_EXTERNAL(attre)); \
+ Assert(VARSIZE_EXTERNAL(attre) == sizeof(toast_pointer) + VARHDRSZ_EXTERNAL); \
+ memcpy(&(toast_pointer), VARDATA_EXTERNAL(attre), sizeof(toast_pointer)); \
+} while (0)
+
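Together, the two macros above let a caller safely read a possibly-unaligned TOAST pointer and then test it for compression. A minimal usage sketch, where attr is assumed to be an EXTERNAL datum (illustrative only; it mirrors what the detoasting code does):

    /* Sketch: read an external TOAST pointer and check whether it's compressed. */
    struct varatt_external toast_pointer;
    int32       rawsize;

    VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);

    if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
        rawsize = toast_pointer.va_rawsize - VARHDRSZ;  /* size once decompressed */
    else
        rawsize = toast_pointer.va_extsize;             /* stored uncompressed */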
/* ----------
* toast_insert_or_update -
*
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 11ab2771990..a238292b76e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -288,6 +288,7 @@ extern int XLogFileOpen(XLogSegNo segno);
extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
+extern XLogSegNo XLogGetLastRemovedSegno(void);
extern void XLogSetAsyncXactLSN(XLogRecPtr record);
extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn);
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 80560574bf5..22dd0fc58e8 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 201403031
+#define CATALOG_VERSION_NO 201403032
#endif
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 7a11721ba44..c5706518722 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4804,8 +4804,18 @@ DATA(insert OID = 3779 ( pg_create_physical_replication_slot PGNSP PGUID 12 1 0
DESCR("create a physical replication slot");
DATA(insert OID = 3780 ( pg_drop_replication_slot PGNSP PGUID 12 1 0 0 0 f f f f f f v 1 0 2278 "19" _null_ _null_ _null_ _null_ pg_drop_replication_slot _null_ _null_ _null_ ));
DESCR("drop a replication slot");
-DATA(insert OID = 3781 ( pg_get_replication_slots PGNSP PGUID 12 1 10 0 0 f f f f f t s 0 0 2249 "" "{19,25,26,16,28,3220}" "{o,o,o,o,o,o}" "{slot_name,slot_type,datoid,active,xmin,restart_lsn}" _null_ pg_get_replication_slots _null_ _null_ _null_ ));
+DATA(insert OID = 3781 ( pg_get_replication_slots PGNSP PGUID 12 1 10 0 0 f f f f f t s 0 0 2249 "" "{19,19,25,26,16,28,28,3220}" "{o,o,o,o,o,o,o,o}" "{slot_name,plugin,slot_type,datoid,active,xmin,catalog_xmin,restart_lsn}" _null_ pg_get_replication_slots _null_ _null_ _null_ ));
DESCR("information about replication slots currently in use");
+DATA(insert OID = 3786 ( pg_create_logical_replication_slot PGNSP PGUID 12 1 0 0 0 f f f f f f v 2 0 2249 "19 19" "{19,19,25,3220}" "{i,i,o,o}" "{slotname,plugin,slotname,xlog_position}" _null_ pg_create_logical_replication_slot _null_ _null_ _null_ ));
+DESCR("set up a logical replication slot");
+DATA(insert OID = 3782 ( pg_logical_slot_get_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,25}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_get_changes _null_ _null_ _null_ ));
+DESCR("get changes from replication slot");
+DATA(insert OID = 3783 ( pg_logical_slot_get_binary_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,17}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_get_binary_changes _null_ _null_ _null_ ));
+DESCR("get binary changes from replication slot");
+DATA(insert OID = 3784 ( pg_logical_slot_peek_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,25}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_peek_changes _null_ _null_ _null_ ));
+DESCR("peek at changes from replication slot");
+DATA(insert OID = 3785 ( pg_logical_slot_peek_binary_changes PGNSP PGUID 12 1000 1000 25 0 f f f f f t v 4 0 2249 "19 3220 23 1009" "{19,3220,23,1009,3220,28,17}" "{i,i,i,v,o,o,o}" "{slotname,upto_lsn,upto_nchanges,options,location,xid,data}" _null_ pg_logical_slot_peek_binary_changes _null_ _null_ _null_ ));
+DESCR("peek at binary changes from replication slot");
/* event triggers */
DATA(insert OID = 3566 ( pg_event_trigger_dropped_objects PGNSP PGUID 12 10 100 0 0 f f f f t t s 0 0 2249 "" "{26,26,23,25,25,25,25}" "{o,o,o,o,o,o,o}" "{classid, objid, objsubid, object_type, schema_name, object_name, object_identity}" _null_ pg_event_trigger_dropped_objects _null_ _null_ _null_ ));
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 70350e02cb2..058dc5f6675 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -157,10 +157,10 @@ extern void vac_update_relstats(Relation relation,
bool hasindex,
TransactionId frozenxid,
MultiXactId minmulti);
-extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age,
+extern void vacuum_set_xid_limits(Relation rel,
+ int freeze_min_age, int freeze_table_age,
int multixact_freeze_min_age,
int multixact_freeze_table_age,
- bool sharedRel,
TransactionId *oldestXmin,
TransactionId *freezeLimit,
TransactionId *xidFullScanLimit,
diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h
new file mode 100644
index 00000000000..7f55d789a23
--- /dev/null
+++ b/src/include/replication/decode.h
@@ -0,0 +1,19 @@
+/*-------------------------------------------------------------------------
+ * decode.h
+ * PostgreSQL WAL to logical transformation
+ *
+ * Portions Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DECODE_H
+#define DECODE_H
+
+#include "access/xlogreader.h"
+#include "replication/reorderbuffer.h"
+#include "replication/logical.h"
+
+void LogicalDecodingProcessRecord(LogicalDecodingContext *ctx,
+ XLogRecord *record);
+
+#endif
diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h
new file mode 100644
index 00000000000..e65c8b8075f
--- /dev/null
+++ b/src/include/replication/logical.h
@@ -0,0 +1,100 @@
+/*-------------------------------------------------------------------------
+ * logical.h
+ * PostgreSQL logical decoding coordination
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOGICAL_H
+#define LOGICAL_H
+
+#include "replication/slot.h"
+
+#include "access/xlog.h"
+#include "access/xlogreader.h"
+#include "replication/output_plugin.h"
+
+struct LogicalDecodingContext;
+
+typedef void (*LogicalOutputPluginWriterWrite) (
+ struct LogicalDecodingContext *lr,
+ XLogRecPtr Ptr,
+ TransactionId xid,
+ bool last_write
+);
+
+typedef LogicalOutputPluginWriterWrite LogicalOutputPluginWriterPrepareWrite;
+
+typedef struct LogicalDecodingContext
+{
+ /* memory context this is all allocated in */
+ MemoryContext context;
+
+ /* infrastructure pieces */
+ XLogReaderState *reader;
+ ReplicationSlot *slot;
+ struct ReorderBuffer *reorder;
+ struct SnapBuild *snapshot_builder;
+
+ OutputPluginCallbacks callbacks;
+ OutputPluginOptions options;
+
+ /*
+ * User specified options
+ */
+ List *output_plugin_options;
+
+ /*
+ * User-Provided callback for writing/streaming out data.
+ */
+ LogicalOutputPluginWriterPrepareWrite prepare_write;
+ LogicalOutputPluginWriterWrite write;
+
+ /*
+ * Output buffer.
+ */
+ StringInfo out;
+
+ /*
+ * Private data pointer of the output plugin.
+ */
+ void *output_plugin_private;
+
+ /*
+ * Private data pointer for the data writer.
+ */
+ void *output_writer_private;
+
+ /*
+ * State for writing output.
+ */
+ bool accept_writes;
+ bool prepared_write;
+ XLogRecPtr write_location;
+ TransactionId write_xid;
+} LogicalDecodingContext;
+
+extern void CheckLogicalDecodingRequirements(void);
+
+extern LogicalDecodingContext *CreateInitDecodingContext(char *plugin,
+ List *output_plugin_options,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write);
+extern LogicalDecodingContext *CreateDecodingContext(
+ XLogRecPtr start_lsn,
+ List *output_plugin_options,
+ XLogPageReadCB read_page,
+ LogicalOutputPluginWriterPrepareWrite prepare_write,
+ LogicalOutputPluginWriterWrite do_write);
+extern void DecodingContextFindStartpoint(LogicalDecodingContext *ctx);
+extern bool DecodingContextReady(LogicalDecodingContext *ctx);
+extern void FreeDecodingContext(LogicalDecodingContext *ctx);
+
+extern void LogicalIncreaseXminForSlot(XLogRecPtr lsn, TransactionId xmin);
+extern void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn,
+ XLogRecPtr restart_lsn);
+extern void LogicalConfirmReceivedLocation(XLogRecPtr lsn);
+
+#endif
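The writer typedefs above are the hook by which a consumer receives decoded data. A hypothetical pair of callbacks is sketched below; the sketch_* names are made up, a SQL-level consumer would accumulate ctx->out into a tuplestore while a walsender would ship it over the wire:

    static void
    sketch_prepare_write(LogicalDecodingContext *ctx, XLogRecPtr lsn,
                         TransactionId xid, bool last_write)
    {
        resetStringInfo(ctx->out);
    }

    static void
    sketch_write(LogicalDecodingContext *ctx, XLogRecPtr lsn,
                 TransactionId xid, bool last_write)
    {
        /* hand ctx->out->data (ctx->out->len bytes) to the transport in use */
    }

These two would then be passed as the prepare_write and do_write arguments of CreateDecodingContext() or CreateInitDecodingContext().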
diff --git a/src/include/replication/logicalfuncs.h b/src/include/replication/logicalfuncs.h
new file mode 100644
index 00000000000..21bf44ec4b7
--- /dev/null
+++ b/src/include/replication/logicalfuncs.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ * logicalfuncs.h
+ * PostgreSQL WAL to logical transformation support functions
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOGICALFUNCS_H
+#define LOGICALFUNCS_H
+
+#include "replication/logical.h"
+
+extern int logical_read_local_xlog_page(XLogReaderState *state,
+ XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr,
+ char *cur_page, TimeLineID *pageTLI);
+
+extern Datum pg_logical_slot_get_changes(PG_FUNCTION_ARGS);
+extern Datum pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS);
+extern Datum pg_logical_slot_peek_changes(PG_FUNCTION_ARGS);
+extern Datum pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS);
+
+#endif
diff --git a/src/include/replication/output_plugin.h b/src/include/replication/output_plugin.h
new file mode 100644
index 00000000000..c47c24c8dbe
--- /dev/null
+++ b/src/include/replication/output_plugin.h
@@ -0,0 +1,98 @@
+/*-------------------------------------------------------------------------
+ * output_plugin.h
+ * PostgreSQL Logical Decode Plugin Interface
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef OUTPUT_PLUGIN_H
+#define OUTPUT_PLUGIN_H
+
+#include "replication/reorderbuffer.h"
+
+struct LogicalDecodingContext;
+struct OutputPluginCallbacks;
+
+typedef enum OutputPluginOutputType
+{
+ OUTPUT_PLUGIN_BINARY_OUTPUT,
+ OUTPUT_PLUGIN_TEXTUAL_OUTPUT
+} OutputPluginOutputType;
+
+/*
+ * Options set by the output plugin, in the startup callback.
+ */
+typedef struct OutputPluginOptions
+{
+ OutputPluginOutputType output_type;
+} OutputPluginOptions;
+
+/*
+ * Type of the shared library symbol _PG_output_plugin_init that is looked up
+ * when loading an output plugin shared library.
+ */
+typedef void (*LogicalOutputPluginInit)(struct OutputPluginCallbacks *cb);
+
+/*
+ * Callback that gets called in a user-defined plugin. ctx->private_data can
+ * be set to some private data.
+ *
+ * "is_init" will be set to "true" if the decoding slot just got defined. When
+ * the same slot is used from then on, it will be "false".
+ */
+typedef void (*LogicalDecodeStartupCB) (
+ struct LogicalDecodingContext *ctx,
+ OutputPluginOptions *options,
+ bool is_init
+);
+
+/*
+ * Callback called for every (explicit or implicit) BEGIN of a successful
+ * transaction.
+ */
+typedef void (*LogicalDecodeBeginCB) (
+ struct LogicalDecodingContext *,
+ ReorderBufferTXN *txn);
+
+/*
+ * Callback for every individual change in a successful transaction.
+ */
+typedef void (*LogicalDecodeChangeCB) (
+ struct LogicalDecodingContext *,
+ ReorderBufferTXN *txn,
+ Relation relation,
+ ReorderBufferChange *change
+);
+
+/*
+ * Called for every (explicit or implicit) COMMIT of a successful transaction.
+ */
+typedef void (*LogicalDecodeCommitCB) (
+ struct LogicalDecodingContext *,
+ ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn);
+
+/*
+ * Called to shutdown an output plugin.
+ */
+typedef void (*LogicalDecodeShutdownCB) (
+ struct LogicalDecodingContext *
+);
+
+/*
+ * Output plugin callbacks
+ */
+typedef struct OutputPluginCallbacks
+{
+ LogicalDecodeStartupCB startup_cb;
+ LogicalDecodeBeginCB begin_cb;
+ LogicalDecodeChangeCB change_cb;
+ LogicalDecodeCommitCB commit_cb;
+ LogicalDecodeShutdownCB shutdown_cb;
+} OutputPluginCallbacks;
+
+void OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write);
+void OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write);
+
+#endif /* OUTPUT_PLUGIN_H */
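For orientation, here is a minimal skeleton of an output plugin wired up through the callbacks declared above. It is a sketch under the API shown in this header, not a shipped module; the sketch_* names are placeholders and the change callback emits a fixed string only to show the PrepareWrite/Write pairing:

    #include "postgres.h"
    #include "fmgr.h"
    #include "replication/logical.h"
    #include "replication/output_plugin.h"

    PG_MODULE_MAGIC;

    extern void _PG_output_plugin_init(OutputPluginCallbacks *cb);

    static void
    sketch_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
                   bool is_init)
    {
        opt->output_type = OUTPUT_PLUGIN_TEXTUAL_OUTPUT;
    }

    static void
    sketch_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
    {
    }

    static void
    sketch_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
                  Relation relation, ReorderBufferChange *change)
    {
        OutputPluginPrepareWrite(ctx, true);
        appendStringInfoString(ctx->out, "change");
        OutputPluginWrite(ctx, true);
    }

    static void
    sketch_commit(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
                  XLogRecPtr commit_lsn)
    {
    }

    void
    _PG_output_plugin_init(OutputPluginCallbacks *cb)
    {
        cb->startup_cb = sketch_startup;
        cb->begin_cb = sketch_begin;
        cb->change_cb = sketch_change;
        cb->commit_cb = sketch_commit;
        cb->shutdown_cb = NULL;
    }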
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
new file mode 100644
index 00000000000..01eabfb7be7
--- /dev/null
+++ b/src/include/replication/reorderbuffer.h
@@ -0,0 +1,351 @@
+/*
+ * reorderbuffer.h
+ * PostgreSQL logical replay/reorder buffer management.
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * src/include/replication/reorderbuffer.h
+ */
+#ifndef REORDERBUFFER_H
+#define REORDERBUFFER_H
+
+#include "access/htup_details.h"
+
+#include "lib/ilist.h"
+
+#include "storage/sinval.h"
+
+#include "utils/hsearch.h"
+#include "utils/rel.h"
+#include "utils/snapshot.h"
+#include "utils/timestamp.h"
+
+/* an individual tuple, stored in one chunk of memory */
+typedef struct ReorderBufferTupleBuf
+{
+ /* position in preallocated list */
+ slist_node node;
+
+ /* tuple, stored sequentially */
+ HeapTupleData tuple;
+ HeapTupleHeaderData header;
+ char data[MaxHeapTupleSize];
+} ReorderBufferTupleBuf;
+
+/* types of the change passed to a 'change' callback */
+enum ReorderBufferChangeType
+{
+ REORDER_BUFFER_CHANGE_INSERT,
+ REORDER_BUFFER_CHANGE_UPDATE,
+ REORDER_BUFFER_CHANGE_DELETE
+};
+
+/*
+ * A single 'change'; it can be an insert (with one tuple), an update (old, new),
+ * or a delete (old).
+ *
+ * The same struct is also used internally for other purposes but that should
+ * never be visible outside reorderbuffer.c.
+ */
+typedef struct ReorderBufferChange
+{
+ XLogRecPtr lsn;
+
+ /* type of change */
+ union
+ {
+ enum ReorderBufferChangeType action;
+ /* do not leak internal enum values to the outside */
+ int action_internal;
+ };
+
+ /*
+ * Context data for the change, which part of the union is valid depends
+ * on action/action_internal.
+ */
+ union
+ {
+ /* old, new tuples when action == *_INSERT|UPDATE|DELETE */
+ struct
+ {
+ /* relation that has been changed */
+ RelFileNode relnode;
+ /* valid for DELETE || UPDATE */
+ ReorderBufferTupleBuf *oldtuple;
+ /* valid for INSERT || UPDATE */
+ ReorderBufferTupleBuf *newtuple;
+ } tp;
+
+ /* new snapshot */
+ Snapshot snapshot;
+
+ /* new command id for existing snapshot in a catalog changing tx */
+ CommandId command_id;
+
+ /* new cid mapping for catalog changing transaction */
+ struct
+ {
+ RelFileNode node;
+ ItemPointerData tid;
+ CommandId cmin;
+ CommandId cmax;
+ CommandId combocid;
+ } tuplecid;
+ };
+
+ /*
+ * While in use this is how a change is linked into a transaction's list
+ * of changes; otherwise it links the change into the preallocated list.
+ */
+ dlist_node node;
+} ReorderBufferChange;
+
+typedef struct ReorderBufferTXN
+{
+ /*
+ * The transaction's transaction id; can be a toplevel or sub xid.
+ */
+ TransactionId xid;
+
+ /* did the TX have catalog changes */
+ bool has_catalog_changes;
+
+ /*
+ * Do we know this is a subxact?
+ */
+ bool is_known_as_subxact;
+
+ /*
+ * LSN of the first data-carrying WAL record with knowledge about this
+ * xid. This is allowed to *not* be the first record adorned with this
+ * xid, if the previous records aren't relevant for logical decoding.
+ */
+ XLogRecPtr first_lsn;
+
+ /* ----
+ * LSN of the record that caused this xact to be committed or
+ * aborted. This can be a
+ * * plain commit record
+ * * plain commit record, of a parent transaction
+ * * prepared transaction commit
+ * * plain abort record
+ * * prepared transaction abort
+ * * error during decoding
+ * ----
+ */
+ XLogRecPtr final_lsn;
+
+ /*
+ * LSN pointing to the end of the commit record + 1.
+ */
+ XLogRecPtr end_lsn;
+
+ /*
+ * Last LSN at which snapshot information resides, so we can
+ * restart decoding from there and fully recover this transaction from
+ * WAL.
+ */
+ XLogRecPtr restart_decoding_lsn;
+
+ /*
+ * Commit time, only known when we read the actual commit record.
+ */
+ TimestampTz commit_time;
+
+ /*
+ * Base snapshot or NULL.
+ */
+ Snapshot base_snapshot;
+ XLogRecPtr base_snapshot_lsn;
+
+ /*
+ * How many ReorderBufferChange's do we have in this txn.
+ *
+ * Changes in subtransactions are *not* included but tracked separately.
+ */
+ uint64 nentries;
+
+ /*
+ * How many of the above entries are stored in memory in contrast to being
+ * spilled to disk.
+ */
+ uint64 nentries_mem;
+
+ /*
+ * List of ReorderBufferChange structs, including new Snapshots and new
+ * CommandIds
+ */
+ dlist_head changes;
+
+ /*
+ * List of (relation, ctid) => (cmin, cmax) mappings for catalog tuples.
+ * Those are always assigned to the toplevel transaction. (Keep track of
+ * #entries to create a hash of the right size)
+ */
+ dlist_head tuplecids;
+ uint64 ntuplecids;
+
+ /*
+ * On-demand built hash for looking up the above values.
+ */
+ HTAB *tuplecid_hash;
+
+ /*
+ * Hash containing (potentially partial) toast entries. NULL if no toast
+ * tuples have been found for the current change.
+ */
+ HTAB *toast_hash;
+
+ /*
+ * non-hierarchical list of subtransactions that are *not* aborted. Only
+ * used in toplevel transactions.
+ */
+ dlist_head subtxns;
+ uint32 nsubtxns;
+
+ /*
+ * Stored cache invalidations. This is not a linked list because we get
+ * all the invalidations at once.
+ */
+ uint32 ninvalidations;
+ SharedInvalidationMessage *invalidations;
+
+ /* ---
+ * Position in one of three lists:
+ * * list of subtransactions if we are *known* to be subxact
+ * * list of toplevel xacts (can be an as-yet unknown subxact)
+ * * list of preallocated ReorderBufferTXNs
+ * ---
+ */
+ dlist_node node;
+
+} ReorderBufferTXN;
+
+/* so we can define the callbacks used inside struct ReorderBuffer itself */
+typedef struct ReorderBuffer ReorderBuffer;
+
+/* change callback signature */
+typedef void (*ReorderBufferApplyChangeCB) (
+ ReorderBuffer *rb,
+ ReorderBufferTXN *txn,
+ Relation relation,
+ ReorderBufferChange *change);
+
+/* begin callback signature */
+typedef void (*ReorderBufferBeginCB) (
+ ReorderBuffer *rb,
+ ReorderBufferTXN *txn);
+
+/* commit callback signature */
+typedef void (*ReorderBufferCommitCB) (
+ ReorderBuffer *rb,
+ ReorderBufferTXN *txn,
+ XLogRecPtr commit_lsn);
+
+struct ReorderBuffer
+{
+ /*
+ * xid => ReorderBufferTXN lookup table
+ */
+ HTAB *by_txn;
+
+ /*
+ * Transactions that could be a toplevel xact, ordered by LSN of the first
+ * record bearing that xid.
+ */
+ dlist_head toplevel_by_lsn;
+
+ /*
+ * one-entry sized cache for by_txn. Very frequently the same txn gets
+ * looked up over and over again.
+ */
+ TransactionId by_txn_last_xid;
+ ReorderBufferTXN *by_txn_last_txn;
+
+ /*
+ * Callbacks to be called when a transaction commits.
+ */
+ ReorderBufferBeginCB begin;
+ ReorderBufferApplyChangeCB apply_change;
+ ReorderBufferCommitCB commit;
+
+ /*
+ * Pointer that will be passed untouched to the callbacks.
+ */
+ void *private_data;
+
+ /*
+ * Private memory context.
+ */
+ MemoryContext context;
+
+ /*
+ * Data structure slab cache.
+ *
+ * We allocate/deallocate some structures very frequently; to avoid bigger
+ * overhead we cache some unused ones here.
+ *
+ * The maximum number of cached entries is controlled by const variables
+ * at the top of reorderbuffer.c.
+ */
+
+ /* cached ReorderBufferTXNs */
+ dlist_head cached_transactions;
+ Size nr_cached_transactions;
+
+ /* cached ReorderBufferChanges */
+ dlist_head cached_changes;
+ Size nr_cached_changes;
+
+ /* cached ReorderBufferTupleBufs */
+ slist_head cached_tuplebufs;
+ Size nr_cached_tuplebufs;
+
+ XLogRecPtr current_restart_decoding_lsn;
+
+ /* buffer for disk<->memory conversions */
+ char *outbuf;
+ Size outbufsize;
+};
+
+
+ReorderBuffer *ReorderBufferAllocate(void);
+void ReorderBufferFree(ReorderBuffer *);
+
+ReorderBufferTupleBuf *ReorderBufferGetTupleBuf(ReorderBuffer *);
+void ReorderBufferReturnTupleBuf(ReorderBuffer *, ReorderBufferTupleBuf *tuple);
+ReorderBufferChange *ReorderBufferGetChange(ReorderBuffer *);
+void ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *);
+
+void ReorderBufferQueueChange(ReorderBuffer *, TransactionId, XLogRecPtr lsn, ReorderBufferChange *);
+void ReorderBufferCommit(ReorderBuffer *, TransactionId,
+ XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
+ TimestampTz commit_time);
+void ReorderBufferAssignChild(ReorderBuffer *, TransactionId, TransactionId, XLogRecPtr commit_lsn);
+void ReorderBufferCommitChild(ReorderBuffer *, TransactionId, TransactionId,
+ XLogRecPtr commit_lsn, XLogRecPtr end_lsn);
+void ReorderBufferAbort(ReorderBuffer *, TransactionId, XLogRecPtr lsn);
+void ReorderBufferAbortOld(ReorderBuffer *, TransactionId xid);
+void ReorderBufferForget(ReorderBuffer *, TransactionId, XLogRecPtr lsn);
+
+void ReorderBufferSetBaseSnapshot(ReorderBuffer *, TransactionId, XLogRecPtr lsn, struct SnapshotData *snap);
+void ReorderBufferAddSnapshot(ReorderBuffer *, TransactionId, XLogRecPtr lsn, struct SnapshotData *snap);
+void ReorderBufferAddNewCommandId(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+ CommandId cid);
+void ReorderBufferAddNewTupleCids(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+ RelFileNode node, ItemPointerData pt,
+ CommandId cmin, CommandId cmax, CommandId combocid);
+void ReorderBufferAddInvalidations(ReorderBuffer *, TransactionId, XLogRecPtr lsn,
+ Size nmsgs, SharedInvalidationMessage *msgs);
+bool ReorderBufferIsXidKnown(ReorderBuffer *, TransactionId xid);
+void ReorderBufferXidSetCatalogChanges(ReorderBuffer *, TransactionId xid, XLogRecPtr lsn);
+bool ReorderBufferXidHasCatalogChanges(ReorderBuffer *, TransactionId xid);
+bool ReorderBufferXidHasBaseSnapshot(ReorderBuffer *, TransactionId xid);
+
+ReorderBufferTXN *ReorderBufferGetOldestTXN(ReorderBuffer *);
+
+void ReorderBufferSetRestartPoint(ReorderBuffer *, XLogRecPtr ptr);
+
+void StartupReorderBuffer(void);
+
+#endif
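The begin/apply_change/commit fields of struct ReorderBuffer are how decoded changes reach a consumer. A rough sketch of the wiring; the my_* callbacks and my_state are placeholders, and in this patch the equivalent wrappers are installed by logical.c and simply forward to the output plugin callbacks:

    ReorderBuffer *rb = ReorderBufferAllocate();

    rb->begin = my_begin_cb;            /* ReorderBufferBeginCB */
    rb->apply_change = my_change_cb;    /* ReorderBufferApplyChangeCB */
    rb->commit = my_commit_cb;          /* ReorderBufferCommitCB */
    rb->private_data = my_state;        /* handed back to the callbacks untouched */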
diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h
index 089b0f4b70c..c354c9133bf 100644
--- a/src/include/replication/slot.h
+++ b/src/include/replication/slot.h
@@ -16,6 +16,24 @@
#include "storage/shmem.h"
#include "storage/spin.h"
+/*
+ * Behaviour of replication slots, upon release or crash.
+ *
+ * Slots marked as PERSISTENT are crashsafe and will not be dropped when
+ * released. Slots marked as EPHEMERAL will be dropped when released or after
+ * restarts.
+ *
+ * EPHEMERAL slots can be made PERSISTENT by calling ReplicationSlotPersist().
+ */
+typedef enum ReplicationSlotPersistency
+{
+ RS_PERSISTENT,
+ RS_EPHEMERAL
+} ReplicationSlotPersistency;
+
+/*
+ * On-Disk data of a replication slot, preserved across restarts.
+ */
typedef struct ReplicationSlotPersistentData
{
/* The slot's identifier */
@@ -25,6 +43,11 @@ typedef struct ReplicationSlotPersistentData
Oid database;
/*
+ * The slot's behaviour when being dropped (or restored after a crash).
+ */
+ ReplicationSlotPersistency persistency;
+
+ /*
* xmin horizon for data
*
* NB: This may represent a value that hasn't been written to disk yet;
@@ -32,9 +55,22 @@ typedef struct ReplicationSlotPersistentData
*/
TransactionId xmin;
+ /*
+ * xmin horizon for catalog tuples
+ *
+ * NB: This may represent a value that hasn't been written to disk yet;
+ * see notes for effective_xmin, below.
+ */
+ TransactionId catalog_xmin;
+
/* oldest LSN that might be required by this replication slot */
XLogRecPtr restart_lsn;
+ /* oldest LSN that the client has acked receipt for */
+ XLogRecPtr confirmed_flush;
+
+ /* plugin name */
+ NameData plugin;
} ReplicationSlotPersistentData;
/*
@@ -67,12 +103,26 @@ typedef struct ReplicationSlot
* same as the persistent value (data.xmin).
*/
TransactionId effective_xmin;
+ TransactionId effective_catalog_xmin;
/* data surviving shutdowns and crashes */
ReplicationSlotPersistentData data;
/* is somebody performing io on this slot? */
LWLock *io_in_progress_lock;
+
+ /* all the remaining data is only used for logical slots */
+
+ /* ----
+ * When the client has confirmed flushes >= candidate_xmin_lsn we can
+ * advance the catalog xmin, when restart_valid has been passed,
+ * restart_lsn can be increased.
+ * ----
+ */
+ TransactionId candidate_catalog_xmin;
+ XLogRecPtr candidate_xmin_lsn;
+ XLogRecPtr candidate_restart_valid;
+ XLogRecPtr candidate_restart_lsn;
} ReplicationSlot;
/*
@@ -97,8 +147,11 @@ extern Size ReplicationSlotsShmemSize(void);
extern void ReplicationSlotsShmemInit(void);
/* management of individual slots */
-extern void ReplicationSlotCreate(const char *name, bool db_specific);
+extern void ReplicationSlotCreate(const char *name, bool db_specific,
+ ReplicationSlotPersistency p);
+extern void ReplicationSlotPersist(void);
extern void ReplicationSlotDrop(const char *name);
+
extern void ReplicationSlotAcquire(const char *name);
extern void ReplicationSlotRelease(void);
extern void ReplicationSlotSave(void);
@@ -106,15 +159,20 @@ extern void ReplicationSlotMarkDirty(void);
/* misc stuff */
extern bool ReplicationSlotValidateName(const char *name, int elevel);
-extern void ReplicationSlotsComputeRequiredXmin(void);
+extern void ReplicationSlotsComputeRequiredXmin(bool already_locked);
extern void ReplicationSlotsComputeRequiredLSN(void);
+extern XLogRecPtr ReplicationSlotsComputeLogicalRestartLSN(void);
+extern bool ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive);
+
extern void StartupReplicationSlots(XLogRecPtr checkPointRedo);
extern void CheckPointReplicationSlots(void);
extern void CheckSlotRequirements(void);
-extern void ReplicationSlotAtProcExit(void);
/* SQL callable functions */
+extern Datum pg_create_physical_replication_slot(PG_FUNCTION_ARGS);
+extern Datum pg_create_logical_replication_slot(PG_FUNCTION_ARGS);
+extern Datum pg_drop_replication_slot(PG_FUNCTION_ARGS);
extern Datum pg_get_replication_slots(PG_FUNCTION_ARGS);
#endif /* SLOT_H */
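
In use, the persistency states above form a simple create-then-promote life cycle: a slot is created EPHEMERAL while it is being set up and is promoted only once setup has fully succeeded, so an error or restart in between just drops it again. A minimal sketch, assuming only the declarations in this header; the function name and the elided setup step are illustrative, not part of the patch:

/* Sketch only: not part of the patch. */
static void
create_slot_sketch(const char *name)
{
	/* EPHEMERAL: dropped automatically on error or after a restart */
	ReplicationSlotCreate(name, true, RS_EPHEMERAL);

	/* ... initialize the slot's decoding state here ... */

	/* setup succeeded: keep the slot across restarts from now on */
	ReplicationSlotPersist();

	ReplicationSlotRelease();
}
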
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h
new file mode 100644
index 00000000000..087c0e510d5
--- /dev/null
+++ b/src/include/replication/snapbuild.h
@@ -0,0 +1,83 @@
+/*-------------------------------------------------------------------------
+ *
+ * snapbuild.h
+ * Exports from replication/logical/snapbuild.c.
+ *
+ * Copyright (c) 2012-2014, PostgreSQL Global Development Group
+ *
+ * src/include/replication/snapbuild.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SNAPBUILD_H
+#define SNAPBUILD_H
+
+#include "access/xlogdefs.h"
+#include "utils/snapmgr.h"
+
+typedef enum
+{
+ /*
+ * Initial state, we can't do much yet.
+ */
+ SNAPBUILD_START,
+
+ /*
+ * We have collected enough information to decode tuples in transactions
+ * that started after this.
+ *
+	 * Once we have reached this state we start to collect changes. We cannot
+	 * apply them yet, because they might be based on transactions that were
+	 * still running when this state was reached.
+ */
+ SNAPBUILD_FULL_SNAPSHOT,
+
+ /*
+	 * Found a point after reaching SNAPBUILD_FULL_SNAPSHOT where all
+	 * transactions that were running at that point have finished. Until we
+	 * reach this state we hold off calling any commit callbacks.
+ */
+ SNAPBUILD_CONSISTENT
+} SnapBuildState;
+
+/* forward declare so we don't have to expose the struct to the public */
+struct SnapBuild;
+typedef struct SnapBuild SnapBuild;
+
+/* forward declare so we don't have to include reorderbuffer.h */
+struct ReorderBuffer;
+
+/* forward declare so we don't have to include heapam_xlog.h */
+struct xl_heap_new_cid;
+struct xl_running_xacts;
+
+extern void CheckPointSnapBuild(void);
+
+extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *cache,
+ TransactionId xmin_horizon, XLogRecPtr start_lsn);
+extern void FreeSnapshotBuilder(SnapBuild *cache);
+
+extern void SnapBuildSnapDecRefcount(Snapshot snap);
+
+extern const char *SnapBuildExportSnapshot(SnapBuild *snapstate);
+extern void SnapBuildClearExportedSnapshot(void);
+
+extern SnapBuildState SnapBuildCurrentState(SnapBuild *snapstate);
+
+extern bool SnapBuildXactNeedsSkip(SnapBuild *snapstate, XLogRecPtr ptr);
+
+extern void SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn,
+ TransactionId xid, int nsubxacts,
+ TransactionId *subxacts);
+extern void SnapBuildAbortTxn(SnapBuild *builder, XLogRecPtr lsn,
+ TransactionId xid, int nsubxacts,
+ TransactionId *subxacts);
+extern bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid,
+ XLogRecPtr lsn);
+extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
+ XLogRecPtr lsn, struct xl_heap_new_cid *cid);
+extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn,
+ struct xl_running_xacts *running);
+extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn);
+
+#endif /* SNAPBUILD_H */
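
The state machine above is what gates when decoded transactions may be handed to an output plugin. A hedged sketch of how a decoding loop might consult it, assuming only the declarations in this header (the helper name and the way records are dispatched are illustrative):

/* Sketch only: not part of the patch. */
static bool
commit_is_replayable_sketch(SnapBuild *builder, XLogRecPtr lsn,
							struct xl_running_xacts *running)
{
	/*
	 * xl_running_xacts records drive the builder from SNAPBUILD_START
	 * through SNAPBUILD_FULL_SNAPSHOT to SNAPBUILD_CONSISTENT.
	 */
	if (running != NULL)
		SnapBuildProcessRunningXacts(builder, lsn, running);

	/* before consistency is reached, commits must not reach the plugin */
	if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
		return false;

	/* records from before the requested start point are skipped as well */
	if (SnapBuildXactNeedsSkip(builder, lsn))
		return false;

	return true;
}
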
diff --git a/src/include/storage/itemptr.h b/src/include/storage/itemptr.h
index 67bbdbb988a..0b81d53f5f8 100644
--- a/src/include/storage/itemptr.h
+++ b/src/include/storage/itemptr.h
@@ -116,6 +116,9 @@ typedef ItemPointerData *ItemPointer;
/*
* ItemPointerCopy
* Copies the contents of one disk item pointer to another.
+ *
+ * Should there ever be padding in an ItemPointer, this would need to be
+ * handled differently, as it is used as a hash key.
*/
#define ItemPointerCopy(fromPointer, toPointer) \
( \
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index a3cadd9a017..5218b448cd6 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -41,10 +41,12 @@ struct XidCache
#define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */
#define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */
#define PROC_IN_ANALYZE 0x04 /* currently running analyze */
-#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */
+#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */
+#define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical decoding */
/* flags reset at EOXact */
-#define PROC_VACUUM_STATE_MASK (0x0E)
+#define PROC_VACUUM_STATE_MASK \
+ (PROC_IN_VACUUM | PROC_IN_ANALYZE | PROC_VACUUM_FOR_WRAPAROUND)
/*
 * We allow a small number of "weak" relation locks (AccessShareLock,
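
Spelling the mask out as an explicit OR makes the intent visible: the new PROC_IN_LOGICAL_DECODING bit is deliberately excluded, so it is not cleared along with the vacuum-related flags at end of transaction. A hedged illustration, assuming the flags live in a vacuumFlags-style field (the field and variable names here are illustrative, not taken from the patch):

	/* end of transaction: only the vacuum-related bits are reset */
	pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;

	/*
	 * PROC_IN_LOGICAL_DECODING survives EOXact and is presumably cleared
	 * explicitly by the decoding code once the backend's xmin no longer
	 * needs to be treated specially.
	 */
	pgxact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING;
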
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index d1a58a3661b..d0b4103a09e 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -15,6 +15,7 @@
#define PROCARRAY_H
#include "storage/standby.h"
+#include "utils/relcache.h"
#include "utils/snapshot.h"
@@ -50,8 +51,9 @@ extern RunningTransactions GetRunningTransactionData(void);
extern bool TransactionIdIsInProgress(TransactionId xid);
extern bool TransactionIdIsActive(TransactionId xid);
-extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum);
+extern TransactionId GetOldestXmin(Relation rel, bool ignoreVacuum);
extern TransactionId GetOldestActiveTransactionId(void);
+extern TransactionId GetOldestSafeDecodingTransactionId(void);
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
@@ -77,6 +79,10 @@ extern void XidCacheRemoveRunningXids(TransactionId xid,
int nxids, const TransactionId *xids,
TransactionId latestXid);
-extern void ProcArraySetReplicationSlotXmin(TransactionId xmin);
+extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
+ TransactionId catalog_xmin, bool already_locked);
+
+extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
+ TransactionId *catalog_xmin);
#endif /* PROCARRAY_H */
diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h
index 0cae810f49e..d5bb850337d 100644
--- a/src/include/storage/sinval.h
+++ b/src/include/storage/sinval.h
@@ -147,4 +147,6 @@ extern void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs
int nmsgs, bool RelcacheInitFileInval,
Oid dbid, Oid tsid);
+extern void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg);
+
#endif /* SINVAL_H */
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index c1409c863db..6156e0219d0 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -64,4 +64,5 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue);
+extern void InvalidateSystemCaches(void);
#endif /* INVAL_H */
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index c601770ec99..abe7016d040 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -23,12 +23,14 @@ extern bool FirstSnapshotSet;
extern TransactionId TransactionXmin;
extern TransactionId RecentXmin;
extern TransactionId RecentGlobalXmin;
+extern TransactionId RecentGlobalDataXmin;
extern Snapshot GetTransactionSnapshot(void);
extern Snapshot GetLatestSnapshot(void);
extern void SnapshotSetCommandId(CommandId curcid);
extern Snapshot GetCatalogSnapshot(Oid relid);
+extern Snapshot GetNonHistoricCatalogSnapshot(Oid relid);
extern void InvalidateCatalogSnapshot(void);
extern void PushActiveSnapshot(Snapshot snapshot);
@@ -53,4 +55,13 @@ extern bool XactHasExportedSnapshots(void);
extern void DeleteAllExportedSnapshotFiles(void);
extern bool ThereAreNoPriorRegisteredSnapshots(void);
+extern char *ExportSnapshot(Snapshot snapshot);
+
+/* Support for catalog timetravel for logical decoding */
+struct HTAB;
+extern struct HTAB *HistoricSnapshotGetTupleCids(void);
+extern void SetupHistoricSnapshot(Snapshot snapshot_now, struct HTAB *tuplecids);
+extern void TeardownHistoricSnapshot(bool is_error);
+extern bool HistoricSnapshotActive(void);
+
#endif /* SNAPMGR_H */
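
The historic-snapshot hooks above bracket catalog access while changes are replayed to an output plugin: install the snapshot plus the tuplecid mapping, run the plugin, and tear the snapshot down on both the normal and the error path. A hedged sketch, assuming only these declarations (the PG_TRY structure and the variable names are illustrative):

	/* Sketch only: not part of the patch. */
	SetupHistoricSnapshot(snapshot_now, tuplecid_hash);

	PG_TRY();
	{
		/* ... invoke the output plugin; catalog scans now time travel ... */
	}
	PG_CATCH();
	{
		TeardownHistoricSnapshot(true);		/* error path */
		PG_RE_THROW();
	}
	PG_END_TRY();

	TeardownHistoricSnapshot(false);		/* normal path */
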
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index c542238825a..4b256074b0b 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -30,6 +30,22 @@ typedef struct SnapshotData *Snapshot;
typedef bool (*SnapshotSatisfiesFunc) (HeapTuple htup,
Snapshot snapshot, Buffer buffer);
+/*
+ * Struct representing all kinds of possible snapshots.
+ *
+ * There are several different kinds of snapshots:
+ * * Normal MVCC snapshots
+ * * MVCC snapshots taken during recovery (in Hot-Standby mode)
+ * * Historic MVCC snapshots used during logical decoding
+ * * snapshots passed to HeapTupleSatisfiesDirty()
+ * * snapshots used for SatisfiesAny, Toast, Self where no members are
+ * accessed.
+ *
+ * TODO: It's probably a good idea to split this struct using a NodeTag
+ * similar to how parser and executor nodes are handled, with one type for
+ * each different kind of snapshot to avoid overloading the meaning of
+ * individual fields.
+ */
typedef struct SnapshotData
{
SnapshotSatisfiesFunc satisfies; /* tuple test function */
@@ -46,11 +62,23 @@ typedef struct SnapshotData
*/
TransactionId xmin; /* all XID < xmin are visible to me */
TransactionId xmax; /* all XID >= xmax are invisible to me */
- TransactionId *xip; /* array of xact IDs in progress */
+ /*
+	 * For normal MVCC snapshots this contains all the xact IDs that are in
+ * progress, unless the snapshot was taken during recovery in which case
+ * it's empty. For historic MVCC snapshots, the meaning is inverted,
+ * i.e. it contains *committed* transactions between xmin and xmax.
+ */
+ TransactionId *xip;
uint32 xcnt; /* # of xact ids in xip[] */
/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
int32 subxcnt; /* # of xact ids in subxip[] */
- TransactionId *subxip; /* array of subxact IDs in progress */
+ /*
+ * For non-historic MVCC snapshots, this contains subxact IDs that are in
+ * progress (and other transactions that are in progress if taken during
+	 * recovery). For historic snapshots it contains *all* xids assigned to the
+ * replayed transaction, including the toplevel xid.
+ */
+ TransactionId *subxip;
bool suboverflowed; /* has the subxip array overflowed? */
bool takenDuringRecovery; /* recovery-shaped snapshot? */
bool copied; /* false if it's a static snapshot */
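
The inverted meaning of xip[] is the crux of these comments: a regular MVCC snapshot treats an xid found in xip[] as still in progress (and therefore invisible), whereas a historic snapshot stores the xids that *committed* between xmin and xmax (and therefore visible). A heavily simplified sketch of the two checks, ignoring subtransactions, hint bits and clog lookups; xid_in_array() and the function itself are purely illustrative:

/* Sketch only: not part of the patch. */
static bool
xid_visible_sketch(Snapshot snap, TransactionId xid, bool historic)
{
	if (!historic)
	{
		/* regular MVCC snapshot: xip[] holds in-progress xids */
		if (TransactionIdFollowsOrEquals(xid, snap->xmax))
			return false;		/* too new for this snapshot */
		return !xid_in_array(xid, snap->xip, snap->xcnt);
	}

	/* historic MVCC snapshot: xip[] holds *committed* xids */
	if (TransactionIdPrecedes(xid, snap->xmin))
		return true;			/* committed before the snapshot's xmin */
	return xid_in_array(xid, snap->xip, snap->xcnt);
}
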
diff --git a/src/include/utils/tqual.h b/src/include/utils/tqual.h
index e34c28a4f78..48abe62983d 100644
--- a/src/include/utils/tqual.h
+++ b/src/include/utils/tqual.h
@@ -22,6 +22,7 @@
extern PGDLLIMPORT SnapshotData SnapshotSelfData;
extern PGDLLIMPORT SnapshotData SnapshotAnyData;
extern PGDLLIMPORT SnapshotData SnapshotToastData;
+extern PGDLLIMPORT SnapshotData CatalogSnapshotData;
#define SnapshotSelf (&SnapshotSelfData)
#define SnapshotAny (&SnapshotAnyData)
@@ -37,7 +38,8 @@ extern PGDLLIMPORT SnapshotData SnapshotToastData;
/* This macro encodes the knowledge of which snapshots are MVCC-safe */
#define IsMVCCSnapshot(snapshot) \
- ((snapshot)->satisfies == HeapTupleSatisfiesMVCC)
+ ((snapshot)->satisfies == HeapTupleSatisfiesMVCC || \
+ (snapshot)->satisfies == HeapTupleSatisfiesHistoricMVCC)
/*
* HeapTupleSatisfiesVisibility
@@ -73,6 +75,8 @@ extern bool HeapTupleSatisfiesToast(HeapTuple htup,
Snapshot snapshot, Buffer buffer);
extern bool HeapTupleSatisfiesDirty(HeapTuple htup,
Snapshot snapshot, Buffer buffer);
+extern bool HeapTupleSatisfiesHistoricMVCC(HeapTuple htup,
+ Snapshot snapshot, Buffer buffer);
/* Special "satisfies" routines with different APIs */
extern HTSU_Result HeapTupleSatisfiesUpdate(HeapTuple htup,
@@ -86,4 +90,13 @@ extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer,
uint16 infomask, TransactionId xid);
extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple);
+/*
+ * To avoid leaking too much knowledge about reorderbuffer implementation
+ * details, this is implemented in reorderbuffer.c, not tqual.c.
+ */
+extern bool ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data,
+ Snapshot snapshot,
+ HeapTuple htup,
+ Buffer buffer,
+ CommandId *cmin, CommandId *cmax);
#endif /* TQUAL_H */
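
ResolveCminCmaxDuringDecoding() exists because the cmin/cmax stored in catalog tuples changed by the transaction currently being replayed cannot be trusted during decoding (combo command ids, for instance); instead they are resolved from the tuplecid mapping the reorder buffer built from xl_heap_new_cid records. A hedged sketch of a caller, with the function name and the treatment of unresolved tuples being illustrative:

/* Sketch only: not part of the patch. */
static bool
catalog_insert_visible_sketch(Snapshot snapshot, HeapTuple htup, Buffer buffer)
{
	CommandId	cmin,
				cmax;

	if (!ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(),
									   snapshot, htup, buffer,
									   &cmin, &cmax))
		return false;			/* no mapping found: treat as invisible */

	/* only rows inserted before the command being replayed are visible */
	return cmin < snapshot->curcid;
}
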
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index ef50f4da217..b0b6e27d8a9 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1368,13 +1368,15 @@ pg_prepared_xacts| SELECT p.transaction,
LEFT JOIN pg_authid u ON ((p.ownerid = u.oid)))
LEFT JOIN pg_database d ON ((p.dbid = d.oid)));
pg_replication_slots| SELECT l.slot_name,
+ l.plugin,
l.slot_type,
l.datoid,
d.datname AS database,
l.active,
l.xmin,
+ l.catalog_xmin,
l.restart_lsn
- FROM (pg_get_replication_slots() l(slot_name, slot_type, datoid, active, xmin, restart_lsn)
+ FROM (pg_get_replication_slots() l(slot_name, plugin, slot_type, datoid, active, xmin, catalog_xmin, restart_lsn)
LEFT JOIN pg_database d ON ((l.datoid = d.oid)));
pg_roles| SELECT pg_authid.rolname,
pg_authid.rolsuper,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 3b7f61ef208..f9604541c7a 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -940,6 +940,17 @@ LockTupleMode
LockingClause
LogOpts
LogStmtLevel
+LogicalDecodeBeginCB
+LogicalDecodeChangeCB
+LogicalDecodeCleanupCB
+LogicalDecodeCommitCB
+LogicalDecodeInitCB
+LogicalDecodingCheckpointData
+LogicalDecodingContext
+LogicalDecodingCtlData
+LogicalDecodingSlot
+LogicalOutputPluginWriterPrepareWrite
+LogicalOutputPluginWriterWrite
LogicalTape
LogicalTapeSet
MAGIC
@@ -1053,6 +1064,7 @@ OprInfo
OprProofCacheEntry
OprProofCacheKey
OutputContext
+OutputPluginCallbacks
OverrideSearchPath
OverrideStackEntry
PACE_HEADER
@@ -1468,6 +1480,21 @@ Relids
RelocationBufferInfo
RenameStmt
ReopenPtr
+ReorderBuffer
+ReorderBufferApplyChangeCB
+ReorderBufferBeginCB
+ReorderBufferChange
+ReorderBufferChangeTypeInternal
+ReorderBufferCommitCB
+ReorderBufferDiskChange
+ReorderBufferIterTXNEntry
+ReorderBufferIterTXNState
+ReorderBufferToastEnt
+ReorderBufferTupleBuf
+ReorderBufferTupleCidEnt
+ReorderBufferTupleCidKey
+ReorderBufferTXN
+ReorderBufferTXNByIdEnt
ReplaceVarsFromTargetList_context
ReplaceVarsNoMatchOption
ResTarget
@@ -1522,6 +1549,8 @@ SID_NAME_USE
SISeg
SMgrRelation
SMgrRelationData
+SnapBuildAction
+SnapBuildState
SOCKADDR
SOCKET
SPELL
@@ -1613,6 +1642,8 @@ SlruSharedData
Snapshot
SnapshotData
SnapshotSatisfiesFunc
+Snapstate
+SnapstateOnDisk
SockAddr
Sort
SortBy
@@ -1929,6 +1960,7 @@ XLogReaderState
XLogRecData
XLogRecPtr
XLogRecord
+XLogRecordBuffer
XLogSegNo
XLogSource
XLogwrtResult
@@ -2351,6 +2383,7 @@ symbol
tablespaceinfo
teReqs
teSection
+TestDecodingData
temp_tablespaces_extra
text
timeKEY