1 files changed, 1003 insertions, 0 deletions
diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c
new file mode 100644
index 00000000000..1e5a5ac33ad
--- /dev/null
+++ b/src/backend/backup/basebackup_incremental.c
@@ -0,0 +1,1003 @@
+/*-------------------------------------------------------------------------
+ *
+ * basebackup_incremental.c
+ *	  code for incremental backup support
+ *
+ * This code isn't actually in charge of taking an incremental backup;
+ * the actual construction of the incremental backup happens in
+ * basebackup.c. Here, we're concerned with providing the necessary
+ * supports for that operation. In particular, we need to parse the
+ * backup manifest supplied by the user taking the incremental backup
+ * and extract the required information from it.
+ *
+ * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/backup/basebackup_incremental.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/timeline.h"
+#include "access/xlog.h"
+#include "access/xlogrecovery.h"
+#include "backup/basebackup_incremental.h"
+#include "backup/walsummary.h"
+#include "common/blkreftable.h"
+#include "common/parse_manifest.h"
+#include "common/hashfn.h"
+#include "postmaster/walsummarizer.h"
+
+#define	BLOCKS_PER_READ			512
+
+/*
+ * Details extracted from the WAL ranges present in the supplied backup manifest.
+ */
+typedef struct
+{
+	TimeLineID	tli;
+	XLogRecPtr	start_lsn;
+	XLogRecPtr	end_lsn;
+} backup_wal_range;
+
+/*
+ * Details extracted from the file list present in the supplied backup manifest.
+ */
+typedef struct
+{
+	uint32		status;
+	const char *path;
+	size_t		size;
+} backup_file_entry;
+
+static uint32 hash_string_pointer(const char *s);
+#define SH_PREFIX               backup_file
+#define SH_ELEMENT_TYPE			backup_file_entry
+#define SH_KEY_TYPE             const char *
+#define SH_KEY                  path
+#define SH_HASH_KEY(tb, key)    hash_string_pointer(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define SH_SCOPE                static inline
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+struct IncrementalBackupInfo
+{
+	/* Memory context for this object and its subsidiary objects. */
+	MemoryContext mcxt;
+
+	/* Temporary buffer for storing the manifest while parsing it. */
+	StringInfoData buf;
+
+	/* WAL ranges extracted from the backup manifest. */
+	List	   *manifest_wal_ranges;
+
+	/*
+	 * Files extracted from the backup manifest.
+	 *
+	 * We don't really need this information, because we use WAL summaries to
+	 * figure what's changed. It would be unsafe to just rely on the list of
+	 * files that existed before, because it's possible for a file to be
+	 * removed and a new one created with the same name and different
+	 * contents. In such cases, the whole file must still be sent. We can tell
+	 * from the WAL summaries whether that happened, but not from the file
+	 * list.
+	 *
+	 * Nonetheless, this data is useful for sanity checking. If a file that we
+	 * think we shouldn't need to send is not present in the manifest for the
+	 * prior backup, something has gone terribly wrong. We retain the file
+	 * names and sizes, but not the checksums or last modified times, for
+	 * which we have no use.
+	 *
+	 * One significant downside of storing this data is that it consumes
+	 * memory. If that turns out to be a problem, we might have to decide not
+	 * to retain this information, or to make it optional.
+	 */
+	backup_file_hash *manifest_files;
+
+	/*
+	 * Block-reference table for the incremental backup.
+	 *
+	 * It's possible that storing the entire block-reference table in memory
+	 * will be a problem for some users. The in-memory format that we're using
+	 * here is pretty efficient, converging to little more than 1 bit per
+	 * block for relation forks with large numbers of modified blocks. It's
+	 * possible, however, that if you try to perform an incremental backup of
+	 * a database with a sufficiently large number of relations on a
+	 * sufficiently small machine, you could run out of memory here. If that
+	 * turns out to be a problem in practice, we'll need to be more clever.
+	 */
+	BlockRefTable *brtab;
+};
+
+static void manifest_process_file(JsonManifestParseContext *context,
+								  char *pathname,
+								  size_t size,
+								  pg_checksum_type checksum_type,
+								  int checksum_length,
+								  uint8 *checksum_payload);
+static void manifest_process_wal_range(JsonManifestParseContext *context,
+									   TimeLineID tli,
+									   XLogRecPtr start_lsn,
+									   XLogRecPtr end_lsn);
+static void manifest_report_error(JsonManifestParseContext *ib,
+								  const char *fmt,...)
+			pg_attribute_printf(2, 3) pg_attribute_noreturn();
+static int	compare_block_numbers(const void *a, const void *b);
+
+/*
+ * Create a new object for storing information extracted from the manifest
+ * supplied when creating an incremental backup.
+ */
+IncrementalBackupInfo *
+CreateIncrementalBackupInfo(MemoryContext mcxt)
+{
+	IncrementalBackupInfo *ib;
+	MemoryContext oldcontext;
+
+	oldcontext = MemoryContextSwitchTo(mcxt);
+
+	ib = palloc0(sizeof(IncrementalBackupInfo));
+	ib->mcxt = mcxt;
+	initStringInfo(&ib->buf);
+
+	/*
+	 * It's hard to guess how many files a "typical" installation will have in
+	 * the data directory, but a fresh initdb creates almost 1000 files as of
+	 * this writing, so it seems to make sense for our estimate to
+	 * substantially higher.
+	 */
+	ib->manifest_files = backup_file_create(mcxt, 10000, NULL);
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return ib;
+}
+
+/*
+ * Before taking an incremental backup, the caller must supply the backup
+ * manifest from a prior backup. Each chunk of manifest data recieved
+ * from the client should be passed to this function.
+ */
+void
+AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data,
+							  int len)
+{
+	MemoryContext oldcontext;
+
+	/* Switch to our memory context. */
+	oldcontext = MemoryContextSwitchTo(ib->mcxt);
+
+	/*
+	 * XXX. Our json parser is at present incapable of parsing json blobs
+	 * incrementally, so we have to accumulate the entire backup manifest
+	 * before we can do anything with it. This should really be fixed, since
+	 * some users might have very large numbers of files in the data
+	 * directory.
+	 */
+	appendBinaryStringInfo(&ib->buf, data, len);
+
+	/* Switch back to previous memory context. */
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Finalize an IncrementalBackupInfo object after all manifest data has
+ * been supplied via calls to AppendIncrementalManifestData.
+ */
+void
+FinalizeIncrementalManifest(IncrementalBackupInfo *ib)
+{
+	JsonManifestParseContext context;
+	MemoryContext oldcontext;
+
+	/* Switch to our memory context. */
+	oldcontext = MemoryContextSwitchTo(ib->mcxt);
+
+	/* Parse the manifest. */
+	context.private_data = ib;
+	context.per_file_cb = manifest_process_file;
+	context.per_wal_range_cb = manifest_process_wal_range;
+	context.error_cb = manifest_report_error;
+	json_parse_manifest(&context, ib->buf.data, ib->buf.len);
+
+	/* Done with the buffer, so release memory. */
+	pfree(ib->buf.data);
+	ib->buf.data = NULL;
+
+	/* Switch back to previous memory context. */
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Prepare to take an incremental backup.
+ *
+ * Before this function is called, AppendIncrementalManifestData and
+ * FinalizeIncrementalManifest should have already been called to pass all
+ * the manifest data to this object.
+ *
+ * This function performs sanity checks on the data extracted from the
+ * manifest and figures out for which WAL ranges we need summaries, and
+ * whether those summaries are available. Then, it reads and combines the
+ * data from those summary files. It also updates the backup_state with the
+ * reference TLI and LSN for the prior backup.
+ */
+void
+PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
+							BackupState *backup_state)
+{
+	MemoryContext oldcontext;
+	List	   *expectedTLEs;
+	List	   *all_wslist,
+			   *required_wslist = NIL;
+	ListCell   *lc;
+	TimeLineHistoryEntry **tlep;
+	int			num_wal_ranges;
+	int			i;
+	bool		found_backup_start_tli = false;
+	TimeLineID	earliest_wal_range_tli = 0;
+	XLogRecPtr	earliest_wal_range_start_lsn = InvalidXLogRecPtr;
+	TimeLineID	latest_wal_range_tli = 0;
+	XLogRecPtr	summarized_lsn;
+	XLogRecPtr	pending_lsn;
+	XLogRecPtr	prior_pending_lsn = InvalidXLogRecPtr;
+	int			deadcycles = 0;
+	TimestampTz initial_time,
+				current_time;
+
+	Assert(ib->buf.data == NULL);
+
+	/* Switch to our memory context. */
+	oldcontext = MemoryContextSwitchTo(ib->mcxt);
+
+	/*
+	 * A valid backup manifest must always contain at least one WAL range
+	 * (usually exactly one, unless the backup spanned a timeline switch).
+	 */
+	num_wal_ranges = list_length(ib->manifest_wal_ranges);
+	if (num_wal_ranges == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("manifest contains no required WAL ranges")));
+
+	/*
+	 * Match up the TLIs that appear in the WAL ranges of the backup manifest
+	 * with those that appear in this server's timeline history. We expect
+	 * every backup_wal_range to match to a TimeLineHistoryEntry; if it does
+	 * not, that's an error.
+	 *
+	 * This loop also decides which of the WAL ranges is the manifest is most
+	 * ancient and which one is the newest, according to the timeline history
+	 * of this server, and stores TLIs of those WAL ranges into
+	 * earliest_wal_range_tli and latest_wal_range_tli. It also updates
+	 * earliest_wal_range_start_lsn to the start LSN of the WAL range for
+	 * earliest_wal_range_tli.
+	 *
+	 * Note that the return value of readTimeLineHistory puts the latest
+	 * timeline at the beginning of the list, not the end. Hence, the earliest
+	 * TLI is the one that occurs nearest the end of the list returned by
+	 * readTimeLineHistory, and the latest TLI is the one that occurs closest
+	 * to the beginning.
+	 */
+	expectedTLEs = readTimeLineHistory(backup_state->starttli);
+	tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *));
+	for (i = 0; i < num_wal_ranges; ++i)
+	{
+		backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
+		bool		saw_earliest_wal_range_tli = false;
+		bool		saw_latest_wal_range_tli = false;
+
+		/* Search this server's history for this WAL range's TLI. */
+		foreach(lc, expectedTLEs)
+		{
+			TimeLineHistoryEntry *tle = lfirst(lc);
+
+			if (tle->tli == range->tli)
+			{
+				tlep[i] = tle;
+				break;
+			}
+
+			if (tle->tli == earliest_wal_range_tli)
+				saw_earliest_wal_range_tli = true;
+			if (tle->tli == latest_wal_range_tli)
+				saw_latest_wal_range_tli = true;
+		}
+
+		/*
+		 * An incremental backup can only be taken relative to a backup that
+		 * represents a previous state of this server. If the backup requires
+		 * WAL from a timeline that's not in our history, that definitely
+		 * isn't the case.
+		 */
+		if (tlep[i] == NULL)
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("timeline %u found in manifest, but not in this server's history",
+							range->tli)));
+
+		/*
+		 * If we found this TLI in the server's history before encountering
+		 * the latest TLI seen so far in the server's history, then this TLI
+		 * is the latest one seen so far.
+		 *
+		 * If on the other hand we saw the earliest TLI seen so far before
+		 * finding this TLI, this TLI is earlier than the earliest one seen so
+		 * far. And if this is the first TLI for which we've searched, it's
+		 * also the earliest one seen so far.
+		 *
+		 * On the first loop iteration, both things should necessarily be
+		 * true.
+		 */
+		if (!saw_latest_wal_range_tli)
+			latest_wal_range_tli = range->tli;
+		if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli)
+		{
+			earliest_wal_range_tli = range->tli;
+			earliest_wal_range_start_lsn = range->start_lsn;
+		}
+	}
+
+	/*
+	 * Propagate information about the prior backup into the backup_label that
+	 * will be generated for this backup.
+	 */
+	backup_state->istartpoint = earliest_wal_range_start_lsn;
+	backup_state->istarttli = earliest_wal_range_tli;
+
+	/*
+	 * Sanity check start and end LSNs for the WAL ranges in the manifest.
+	 *
+	 * Commonly, there won't be any timeline switches during the prior backup
+	 * at all, but if there are, they should happen at the same LSNs that this
+	 * server switched timelines.
+	 *
+	 * Whether there are any timeline switches during the prior backup or not,
+	 * the prior backup shouldn't require any WAL from a timeline prior to the
+	 * start of that timeline. It also shouldn't require any WAL from later
+	 * than the start of this backup.
+	 *
+	 * If any of these sanity checks fail, one possible explanation is that
+	 * the user has generated WAL on the same timeline with the same LSNs more
+	 * than once. For instance, if two standbys running on timeline 1 were
+	 * both promoted and (due to a broken archiving setup) both selected new
+	 * timeline ID 2, then it's possible that one of these checks might trip.
+	 *
+	 * Note that there are lots of ways for the user to do something very bad
+	 * without tripping any of these checks, and they are not intended to be
+	 * comprehensive. It's pretty hard to see how we could be certain of
+	 * anything here. However, if there's a problem staring us right in the
+	 * face, it's best to report it, so we do.
+	 */
+	for (i = 0; i < num_wal_ranges; ++i)
+	{
+		backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
+
+		if (range->tli == earliest_wal_range_tli)
+		{
+			if (range->start_lsn < tlep[i]->begin)
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X",
+								range->tli,
+								LSN_FORMAT_ARGS(range->start_lsn),
+								LSN_FORMAT_ARGS(tlep[i]->begin))));
+		}
+		else
+		{
+			if (range->start_lsn != tlep[i]->begin)
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X",
+								range->tli,
+								LSN_FORMAT_ARGS(range->start_lsn),
+								LSN_FORMAT_ARGS(tlep[i]->begin))));
+		}
+
+		if (range->tli == latest_wal_range_tli)
+		{
+			if (range->end_lsn > backup_state->startpoint)
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X",
+								range->tli,
+								LSN_FORMAT_ARGS(range->end_lsn),
+								LSN_FORMAT_ARGS(backup_state->startpoint))));
+		}
+		else
+		{
+			if (range->end_lsn != tlep[i]->end)
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X",
+								range->tli,
+								LSN_FORMAT_ARGS(range->end_lsn),
+								LSN_FORMAT_ARGS(tlep[i]->end))));
+		}
+
+	}
+
+	/*
+	 * Wait for WAL summarization to catch up to the backup start LSN (but
+	 * time out if it doesn't do so quickly enough).
+	 */
+	initial_time = current_time = GetCurrentTimestamp();
+	while (1)
+	{
+		long		timeout_in_ms = 10000;
+		unsigned	elapsed_seconds;
+
+		/*
+		 * Align the wait time to prevent drift. This doesn't really matter,
+		 * but we'd like the warnings about how long we've been waiting to say
+		 * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
+		 * drifting to something that is not a multiple of ten.
+		 */
+		timeout_in_ms -=
+			TimestampDifferenceMilliseconds(current_time, initial_time) %
+			timeout_in_ms;
+
+		/* Wait for up to 10 seconds. */
+		summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
+												 10000, &pending_lsn);
+
+		/* If WAL summarization has progressed sufficiently, stop waiting. */
+		if (summarized_lsn >= backup_state->startpoint)
+			break;
+
+		/*
+		 * Keep track of the number of cycles during which there has been no
+		 * progression of pending_lsn. If pending_lsn is not advancing, that
+		 * means that not only are no new files appearing on disk, but we're
+		 * not even incorporating new records into the in-memory state.
+		 */
+		if (pending_lsn > prior_pending_lsn)
+		{
+			prior_pending_lsn = pending_lsn;
+			deadcycles = 0;
+		}
+		else
+			++deadcycles;
+
+		/*
+		 * If we've managed to wait for an entire minute withot the WAL
+		 * summarizer absorbing a single WAL record, error out; probably
+		 * something is wrong.
+		 *
+		 * We could consider also erroring out if the summarizer is taking too
+		 * long to catch up, but it's not clear what rate of progress would be
+		 * acceptable and what would be too slow. So instead, we just try to
+		 * error out in the case where there's no progress at all. That seems
+		 * likely to catch a reasonable number of the things that can go wrong
+		 * in practice (e.g. the summarizer process is completely hung, say
+		 * because somebody hooked up a debugger to it or something) without
+		 * giving up too quickly when the sytem is just slow.
+		 */
+		if (deadcycles >= 6)
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("WAL summarization is not progressing"),
+					 errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
+							   LSN_FORMAT_ARGS(backup_state->startpoint),
+							   LSN_FORMAT_ARGS(summarized_lsn),
+							   LSN_FORMAT_ARGS(pending_lsn))));
+
+		/*
+		 * Otherwise, just let the user know what's happening.
+		 */
+		current_time = GetCurrentTimestamp();
+		elapsed_seconds =
+			TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
+		ereport(WARNING,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("still waiting for WAL summarization through %X/%X after %d seconds",
+						LSN_FORMAT_ARGS(backup_state->startpoint),
+						elapsed_seconds),
+				 errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
+						   LSN_FORMAT_ARGS(summarized_lsn),
+						   LSN_FORMAT_ARGS(pending_lsn))));
+	}
+
+	/*
+	 * Retrieve a list of all WAL summaries on any timeline that overlap with
+	 * the LSN range of interest. We could instead call GetWalSummaries() once
+	 * per timeline in the loop that follows, but that would involve reading
+	 * the directory multiple times. It should be mildly faster - and perhaps
+	 * a bit safer - to do it just once.
+	 */
+	all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn,
+								 backup_state->startpoint);
+
+	/*
+	 * We need WAL summaries for everything that happened during the prior
+	 * backup and everything that happened afterward up until the point where
+	 * the current backup started.
+	 */
+	foreach(lc, expectedTLEs)
+	{
+		TimeLineHistoryEntry *tle = lfirst(lc);
+		XLogRecPtr	tli_start_lsn = tle->begin;
+		XLogRecPtr	tli_end_lsn = tle->end;
+		XLogRecPtr	tli_missing_lsn = InvalidXLogRecPtr;
+		List	   *tli_wslist;
+
+		/*
+		 * Working through the history of this server from the current
+		 * timeline backwards, we skip everything until we find the timeline
+		 * where this backup started. Most of the time, this means we won't
+		 * skip anything at all, as it's unlikely that the timeline has
+		 * changed since the beginning of the backup moments ago.
+		 */
+		if (tle->tli == backup_state->starttli)
+		{
+			found_backup_start_tli = true;
+			tli_end_lsn = backup_state->startpoint;
+		}
+		else if (!found_backup_start_tli)
+			continue;
+
+		/*
+		 * Find the summaries that overlap the LSN range of interest for this
+		 * timeline. If this is the earliest timeline involved, the range of
+		 * interest begins with the start LSN of the prior backup; otherwise,
+		 * it begins at the LSN at which this timeline came into existence. If
+		 * this is the latest TLI involved, the range of interest ends at the
+		 * start LSN of the current backup; otherwise, it ends at the point
+		 * where we switched from this timeline to the next one.
+		 */
+		if (tle->tli == earliest_wal_range_tli)
+			tli_start_lsn = earliest_wal_range_start_lsn;
+		tli_wslist = FilterWalSummaries(all_wslist, tle->tli,
+										tli_start_lsn, tli_end_lsn);
+
+		/*
+		 * There is no guarantee that the WAL summaries we found cover the
+		 * entire range of LSNs for which summaries are required, or indeed
+		 * that we found any WAL summaries at all. Check whether we have a
+		 * problem of that sort.
+		 */
+		if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn,
+									 &tli_missing_lsn))
+		{
+			if (XLogRecPtrIsInvalid(tli_missing_lsn))
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist",
+								tle->tli,
+								LSN_FORMAT_ARGS(tli_start_lsn),
+								LSN_FORMAT_ARGS(tli_end_lsn))));
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete",
+								tle->tli,
+								LSN_FORMAT_ARGS(tli_start_lsn),
+								LSN_FORMAT_ARGS(tli_end_lsn)),
+						 errdetail("The first unsummarized LSN is this range is %X/%X.",
+								   LSN_FORMAT_ARGS(tli_missing_lsn))));
+		}
+
+		/*
+		 * Remember that we need to read these summaries.
+		 *
+		 * Technically, it's possible that this could read more files than
+		 * required, since tli_wslist in theory could contain redundant
+		 * summaries. For instance, if we have a summary from 0/10000000 to
+		 * 0/20000000 and also one from 0/00000000 to 0/30000000, then the
+		 * latter subsumes the former and the former could be ignored.
+		 *
+		 * We ignore this possibility because the WAL summarizer only tries to
+		 * generate summaries that do not overlap. If somehow they exist,
+		 * we'll do a bit of extra work but the results should still be
+		 * correct.
+		 */
+		required_wslist = list_concat(required_wslist, tli_wslist);
+
+		/*
+		 * Timelines earlier than the one in which the prior backup began are
+		 * not relevant.
+		 */
+		if (tle->tli == earliest_wal_range_tli)
+			break;
+	}
+
+	/*
+	 * Read all of the required block reference table files and merge all of
+	 * the data into a single in-memory block reference table.
+	 *
+	 * See the comments for struct IncrementalBackupInfo for some thoughts on
+	 * memory usage.
+	 */
+	ib->brtab = CreateEmptyBlockRefTable();
+	foreach(lc, required_wslist)
+	{
+		WalSummaryFile *ws = lfirst(lc);
+		WalSummaryIO wsio;
+		BlockRefTableReader *reader;
+		RelFileLocator rlocator;
+		ForkNumber	forknum;
+		BlockNumber limit_block;
+		BlockNumber blocks[BLOCKS_PER_READ];
+
+		wsio.file = OpenWalSummaryFile(ws, false);
+		wsio.filepos = 0;
+		ereport(DEBUG1,
+				(errmsg_internal("reading WAL summary file \"%s\"",
+								 FilePathName(wsio.file))));
+		reader = CreateBlockRefTableReader(ReadWalSummary, &wsio,
+										   FilePathName(wsio.file),
+										   ReportWalSummaryError, NULL);
+		while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
+											   &limit_block))
+		{
+			BlockRefTableSetLimitBlock(ib->brtab, &rlocator,
+									   forknum, limit_block);
+
+			while (1)
+			{
+				unsigned	nblocks;
+				unsigned	i;
+
+				nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
+													   BLOCKS_PER_READ);
+				if (nblocks == 0)
+					break;
+
+				for (i = 0; i < nblocks; ++i)
+					BlockRefTableMarkBlockModified(ib->brtab, &rlocator,
+												   forknum, blocks[i]);
+			}
+		}
+		DestroyBlockRefTableReader(reader);
+		FileClose(wsio.file);
+	}
+
+	/* Switch back to previous memory context. */
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Get the pathname that should be used when a file is sent incrementally.
+ *
+ * The result is a palloc'd string.
+ */
+char *
+GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
+					   ForkNumber forknum, unsigned segno)
+{
+	char	   *path;
+	char	   *lastslash;
+	char	   *ipath;
+
+	path = GetRelationPath(dboid, spcoid, relfilenumber, InvalidBackendId,
+						   forknum);
+
+	lastslash = strrchr(path, '/');
+	Assert(lastslash != NULL);
+	*lastslash = '\0';
+
+	if (segno > 0)
+		ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno);
+	else
+		ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1);
+
+	pfree(path);
+
+	return ipath;
+}
+
+/*
+ * How should we back up a particular file as part of an incremental backup?
+ *
+ * If the return value is BACK_UP_FILE_FULLY, caller should back up the whole
+ * file just as if this were not an incremental backup.
+ *
+ * If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include
+ * an incremental file in the backup instead of the entire file. On return,
+ * *num_blocks_required will be set to the number of blocks that need to be
+ * sent, and the actual block numbers will have been stored in
+ * relative_block_numbers, which should be an array of at least RELSEG_SIZE.
+ * In addition, *truncation_block_length will be set to the value that should
+ * be included in the incremental file.
+ */
+FileBackupMethod
+GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path,
+					Oid dboid, Oid spcoid,
+					RelFileNumber relfilenumber, ForkNumber forknum,
+					unsigned segno, size_t size,
+					unsigned *num_blocks_required,
+					BlockNumber *relative_block_numbers,
+					unsigned *truncation_block_length)
+{
+	BlockNumber absolute_block_numbers[RELSEG_SIZE];
+	BlockNumber limit_block;
+	BlockNumber start_blkno;
+	BlockNumber stop_blkno;
+	RelFileLocator rlocator;
+	BlockRefTableEntry *brtentry;
+	unsigned	i;
+	unsigned	nblocks;
+
+	/* Should only be called after PrepareForIncrementalBackup. */
+	Assert(ib->buf.data == NULL);
+
+	/*
+	 * dboid could be InvalidOid if shared rel, but spcoid and relfilenumber
+	 * should have legal values.
+	 */
+	Assert(OidIsValid(spcoid));
+	Assert(RelFileNumberIsValid(relfilenumber));
+
+	/*
+	 * If the file size is too large or not a multiple of BLCKSZ, then
+	 * something weird is happening, so give up and send the whole file.
+	 */
+	if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE)
+		return BACK_UP_FILE_FULLY;
+
+	/*
+	 * The free-space map fork is not properly WAL-logged, so we need to
+	 * backup the entire file every time.
+	 */
+	if (forknum == FSM_FORKNUM)
+		return BACK_UP_FILE_FULLY;
+
+	/*
+	 * If this file was not part of the prior backup, back it up fully.
+	 *
+	 * If this file was created after the prior backup and before the start of
+	 * the current backup, then the WAL summary information will tell us to
+	 * back up the whole file. However, if this file was created after the
+	 * start of the current backup, then the WAL summary won't know anything
+	 * about it. Without this logic, we would erroneously conclude that it was
+	 * OK to send it incrementally.
+	 *
+	 * Note that the file could have existed at the time of the prior backup,
+	 * gotten deleted, and then a new file with the same name could have been
+	 * created.  In that case, this logic won't prevent the file from being
+	 * backed up incrementally. But, if the deletion happened before the start
+	 * of the current backup, the limit block will be 0, inducing a full
+	 * backup. If the deletion happened after the start of the current backup,
+	 * reconstruction will erroneously combine blocks from the current
+	 * lifespan of the file with blocks from the previous lifespan -- but in
+	 * this type of case, WAL replay to reach backup consistency should remove
+	 * and recreate the file anyway, so the initial bogus contents should not
+	 * matter.
+	 */
+	if (backup_file_lookup(ib->manifest_files, path) == NULL)
+	{
+		char	   *ipath;
+
+		ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber,
+									   forknum, segno);
+		if (backup_file_lookup(ib->manifest_files, ipath) == NULL)
+			return BACK_UP_FILE_FULLY;
+	}
+
+	/* Look up the block reference table entry. */
+	rlocator.spcOid = spcoid;
+	rlocator.dbOid = dboid;
+	rlocator.relNumber = relfilenumber;
+	brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum,
+									 &limit_block);
+
+	/*
+	 * If there is no entry, then there have been no WAL-logged changes to the
+	 * relation since the predecessor backup was taken, so we can back it up
+	 * incrementally and need not include any modified blocks.
+	 *
+	 * However, if the file is zero-length, we should do a full backup,
+	 * because an incremental file is always more than zero length, and it's
+	 * silly to take an incremental backup when a full backup would be
+	 * smaller.
+	 */
+	if (brtentry == NULL)
+	{
+		if (size == 0)
+			return BACK_UP_FILE_FULLY;
+		*num_blocks_required = 0;
+		*truncation_block_length = size / BLCKSZ;
+		return BACK_UP_FILE_INCREMENTALLY;
+	}
+
+	/*
+	 * If the limit_block is less than or equal to the point where this
+	 * segment starts, send the whole file.
+	 */
+	if (limit_block <= segno * RELSEG_SIZE)
+		return BACK_UP_FILE_FULLY;
+
+	/*
+	 * Get relevant entries from the block reference table entry.
+	 *
+	 * We shouldn't overflow computing the start or stop block numbers, but if
+	 * it manages to happen somehow, detect it and throw an error.
+	 */
+	start_blkno = segno * RELSEG_SIZE;
+	stop_blkno = start_blkno + (size / BLCKSZ);
+	if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno)
+		ereport(ERROR,
+				errcode(ERRCODE_INTERNAL_ERROR),
+				errmsg_internal("overflow computing block number bounds for segment %u with size %zu",
+								segno, size));
+	nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno,
+										  absolute_block_numbers, RELSEG_SIZE);
+	Assert(nblocks <= RELSEG_SIZE);
+
+	/*
+	 * If we're going to have to send nearly all of the blocks, then just send
+	 * the whole file, because that won't require much extra storage or
+	 * transfer and will speed up and simplify backup restoration. It's not
+	 * clear what threshold is most appropriate here and perhaps it ought to
+	 * be configurable, but for now we're just going to say that if we'd need
+	 * to send 90% of the blocks anyway, give up and send the whole file.
+	 *
+	 * NB: If you change the threshold here, at least make sure to back up the
+	 * file fully when every single block must be sent, because there's
+	 * nothing good about sending an incremental file in that case.
+	 */
+	if (nblocks * BLCKSZ > size * 0.9)
+		return BACK_UP_FILE_FULLY;
+
+	/*
+	 * Looks like we can send an incremental file, so sort the absolute the
+	 * block numbers and then transpose absolute block numbers to relative
+	 * block numbers.
+	 *
+	 * NB: If the block reference table was using the bitmap representation
+	 * for a given chunk, the block numbers in that chunk will already be
+	 * sorted, but when the array-of-offsets representation is used, we can
+	 * receive block numbers here out of order.
+	 */
+	qsort(absolute_block_numbers, nblocks, sizeof(BlockNumber),
+		  compare_block_numbers);
+	for (i = 0; i < nblocks; ++i)
+		relative_block_numbers[i] = absolute_block_numbers[i] - start_blkno;
+	*num_blocks_required = nblocks;
+
+	/*
+	 * The truncation block length is the minimum length of the reconstructed
+	 * file. Any block numbers below this threshold that are not present in
+	 * the backup need to be fetched from the prior backup. At or above this
+	 * threshold, blocks should only be included in the result if they are
+	 * present in the backup. (This may require inserting zero blocks if the
+	 * blocks included in the backup are non-consecutive.)
+	 */
+	*truncation_block_length = size / BLCKSZ;
+	if (BlockNumberIsValid(limit_block))
+	{
+		unsigned	relative_limit = limit_block - segno * RELSEG_SIZE;
+
+		if (*truncation_block_length < relative_limit)
+			*truncation_block_length = relative_limit;
+	}
+
+	/* Send it incrementally. */
+	return BACK_UP_FILE_INCREMENTALLY;
+}
+
+/*
+ * Compute the size for an incremental file containing a given number of blocks.
+ */
+extern size_t
+GetIncrementalFileSize(unsigned num_blocks_required)
+{
+	size_t		result;
+
+	/* Make sure we're not going to overflow. */
+	Assert(num_blocks_required <= RELSEG_SIZE);
+
+	/*
+	 * Three four byte quantities (magic number, truncation block length,
+	 * block count) followed by block numbers followed by block contents.
+	 */
+	result = 3 * sizeof(uint32);
+	result += (BLCKSZ + sizeof(BlockNumber)) * num_blocks_required;
+
+	return result;
+}
+
+/*
+ * Helper function for filemap hash table.
+ */
+static uint32
+hash_string_pointer(const char *s)
+{
+	unsigned char *ss = (unsigned char *) s;
+
+	return hash_bytes(ss, strlen(s));
+}
+
+/*
+ * This callback is invoked for each file mentioned in the backup manifest.
+ *
+ * We store the path to each file and the size of each file for sanity-checking
+ * purposes. For further details, see comments for IncrementalBackupInfo.
+ */
+static void
+manifest_process_file(JsonManifestParseContext *context,
+					  char *pathname, size_t size,
+					  pg_checksum_type checksum_type,
+					  int checksum_length,
+					  uint8 *checksum_payload)
+{
+	IncrementalBackupInfo *ib = context->private_data;
+	backup_file_entry *entry;
+	bool		found;
+
+	entry = backup_file_insert(ib->manifest_files, pathname, &found);
+	if (!found)
+	{
+		entry->path = MemoryContextStrdup(ib->manifest_files->ctx,
+										  pathname);
+		entry->size = size;
+	}
+}
+
+/*
+ * This callback is invoked for each WAL range mentioned in the backup
+ * manifest.
+ *
+ * We're just interested in learning the oldest LSN and the corresponding TLI
+ * that appear in any WAL range.
+ */
+static void
+manifest_process_wal_range(JsonManifestParseContext *context,
+						   TimeLineID tli, XLogRecPtr start_lsn,
+						   XLogRecPtr end_lsn)
+{
+	IncrementalBackupInfo *ib = context->private_data;
+	backup_wal_range *range = palloc(sizeof(backup_wal_range));
+
+	range->tli = tli;
+	range->start_lsn = start_lsn;
+	range->end_lsn = end_lsn;
+	ib->manifest_wal_ranges = lappend(ib->manifest_wal_ranges, range);
+}
+
+/*
+ * This callback is invoked if an error occurs while parsing the backup
+ * manifest.
+ */
+static void
+manifest_report_error(JsonManifestParseContext *context, const char *fmt,...)
+{
+	StringInfoData errbuf;
+
+	initStringInfo(&errbuf);
+
+	for (;;)
+	{
+		va_list		ap;
+		int			needed;
+
+		va_start(ap, fmt);
+		needed = appendStringInfoVA(&errbuf, fmt, ap);
+		va_end(ap);
+		if (needed == 0)
+			break;
+		enlargeStringInfo(&errbuf, needed);
+	}
+
+	ereport(ERROR,
+			errmsg_internal("%s", errbuf.data));
+}
+
+/*
+ * Quicksort comparator for block numbers.
+ */
+static int
+compare_block_numbers(const void *a, const void *b)
+{
+	BlockNumber aa = *(BlockNumber *) a;
+	BlockNumber bb = *(BlockNumber *) b;
+
+	if (aa > bb)
+		return 1;
+	else if (aa == bb)
+		return 0;
+	else
+		return -1;
+}