1 files changed, 125 insertions, 760 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 6ed68185edb..ffb3569698f 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -29,45 +29,17 @@
 #include "access/xlogutils.h"
 #include "access/xlog.h"
 #include "pgstat.h"
-#include "portability/instr_time.h"
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/bufmgr.h"
+#include "storage/md.h"
 #include "storage/relfilenode.h"
 #include "storage/smgr.h"
+#include "storage/sync.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "pg_trace.h"
 
-
-/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
-#define FSYNCS_PER_ABSORB		10
-#define UNLINKS_PER_ABSORB		10
-
-/*
- * Special values for the segno arg to RememberFsyncRequest.
- *
- * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
- * fsync request from the queue if an identical, subsequent request is found.
- * See comments there before making changes here.
- */
-#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
-#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
-#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
-
-/*
- * On Windows, we have to interpret EACCES as possibly meaning the same as
- * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
- * that's what you get.  Ugh.  This code is designed so that we don't
- * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting canceled ... see mdsync).
- */
-#ifndef WIN32
-#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
-#else
-#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
-#endif
-
 /*
  *	The magnetic disk storage manager keeps track of open file
  *	descriptors in its own descriptor pool.  This is done to make it
@@ -115,49 +87,15 @@ typedef struct _MdfdVec
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
 
 
-/*
- * In some contexts (currently, standalone backends and the checkpointer)
- * we keep track of pending fsync operations: we need to remember all relation
- * segments that have been written since the last checkpoint, so that we can
- * fsync them down to disk before completing the next checkpoint.  This hash
- * table remembers the pending operations.  We use a hash table mostly as
- * a convenient way of merging duplicate requests.
- *
- * We use a similar mechanism to remember no-longer-needed files that can
- * be deleted after the next checkpoint, but we use a linked list instead of
- * a hash table, because we don't expect there to be any duplicate requests.
- *
- * These mechanisms are only used for non-temp relations; we never fsync
- * temp rels, nor do we need to postpone their deletion (see comments in
- * mdunlink).
- *
- * (Regular backends do not track pending operations locally, but forward
- * them to the checkpointer.)
- */
-typedef uint16 CycleCtr;		/* can be any convenient integer size */
-
-typedef struct
-{
-	RelFileNode rnode;			/* hash table key (must be first!) */
-	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
-	/* requests[f] has bit n set if we need to fsync segment n of fork f */
-	Bitmapset  *requests[MAX_FORKNUM + 1];
-	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
-	bool		canceled[MAX_FORKNUM + 1];
-} PendingOperationEntry;
-
-typedef struct
-{
-	RelFileNode rnode;			/* the dead relation to delete */
-	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
-} PendingUnlinkEntry;
-
-static HTAB *pendingOpsTable = NULL;
-static List *pendingUnlinks = NIL;
-static MemoryContext pendingOpsCxt; /* context for the above  */
-
-static CycleCtr mdsync_cycle_ctr = 0;
-static CycleCtr mdckpt_cycle_ctr = 0;
+/* Zero a FileTag, then fill it in to identify one md.c segment file. */
+#define INIT_MD_FILETAG(tag,rel_node,fork_num,seg_no) \
+( \
+	memset(&(tag), 0, sizeof(FileTag)), \
+	(tag).handler = SYNC_HANDLER_MD, \
+	(tag).rnode = (rel_node), \
+	(tag).forknum = (fork_num), \
+	(tag).segno = (seg_no) \
+)
 
 
 /*** behavior for mdopen & _mdfd_getseg ***/
@@ -185,7 +123,10 @@ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
 					   MdfdVec *seg);
-static void register_unlink(RelFileNodeBackend rnode);
+static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
+						BlockNumber segno);
+static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
+						BlockNumber segno);
 static void _fdvec_resize(SMgrRelation reln,
 			  ForkNumber forknum,
 			  int nseg);
@@ -208,64 +149,6 @@ mdinit(void)
 	MdCxt = AllocSetContextCreate(TopMemoryContext,
 								  "MdSmgr",
 								  ALLOCSET_DEFAULT_SIZES);
-
-	/*
-	 * Create pending-operations hashtable if we need it.  Currently, we need
-	 * it if we are standalone (not under a postmaster) or if we are a startup
-	 * or checkpointer auxiliary process.
-	 */
-	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
-	{
-		HASHCTL		hash_ctl;
-
-		/*
-		 * XXX: The checkpointer needs to add entries to the pending ops table
-		 * when absorbing fsync requests.  That is done within a critical
-		 * section, which isn't usually allowed, but we make an exception. It
-		 * means that there's a theoretical possibility that you run out of
-		 * memory while absorbing fsync requests, which leads to a PANIC.
-		 * Fortunately the hash table is small so that's unlikely to happen in
-		 * practice.
-		 */
-		pendingOpsCxt = AllocSetContextCreate(MdCxt,
-											  "Pending ops context",
-											  ALLOCSET_DEFAULT_SIZES);
-		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
-
-		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(RelFileNode);
-		hash_ctl.entrysize = sizeof(PendingOperationEntry);
-		hash_ctl.hcxt = pendingOpsCxt;
-		pendingOpsTable = hash_create("Pending Ops Table",
-									  100L,
-									  &hash_ctl,
-									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-		pendingUnlinks = NIL;
-	}
-}
-
-/*
- * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
- * already created the pendingOpsTable during initialization of the startup
- * process.  Calling this function drops the local pendingOpsTable so that
- * subsequent requests will be forwarded to checkpointer.
- */
-void
-SetForwardFsyncRequests(void)
-{
-	/* Perform any pending fsyncs we may have queued up, then drop table */
-	if (pendingOpsTable)
-	{
-		mdsync();
-		hash_destroy(pendingOpsTable);
-	}
-	pendingOpsTable = NULL;
-
-	/*
-	 * We should not have any pending unlink requests, since mdunlink doesn't
-	 * queue unlink requests when isRedo.
-	 */
-	Assert(pendingUnlinks == NIL);
 }
 
 /*
@@ -380,16 +263,6 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 void
 mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 {
-	/*
-	 * We have to clean out any pending fsync requests for the doomed
-	 * relation, else the next mdsync() will fail.  There can't be any such
-	 * requests for a temp relation, though.  We can send just one request
-	 * even when deleting multiple forks, since the fsync queuing code accepts
-	 * the "InvalidForkNumber = all forks" convention.
-	 */
-	if (!RelFileNodeBackendIsTemp(rnode))
-		ForgetRelationFsyncRequests(rnode.node, forkNum);
-
 	/* Now do the per-fork work */
 	if (forkNum == InvalidForkNumber)
 	{
@@ -413,6 +286,11 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 	 */
 	if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
 	{
+		/* First, forget any pending sync requests for the first segment */
+		if (!RelFileNodeBackendIsTemp(rnode))
+			register_forget_request(rnode, forkNum, 0 /* first seg */ );
+
+		/* Next unlink the file */
 		ret = unlink(path);
 		if (ret < 0 && errno != ENOENT)
 			ereport(WARNING,
@@ -442,7 +320,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 					 errmsg("could not truncate file \"%s\": %m", path)));
 
 		/* Register request to unlink first segment later */
-		register_unlink(rnode);
+		register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
 	}
 
 	/*
@@ -459,6 +337,13 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 		 */
 		for (segno = 1;; segno++)
 		{
+			/*
+			 * Forget any pending sync requests for this segment before we try
+			 * to unlink.
+			 */
+			if (!RelFileNodeBackendIsTemp(rnode))
+				register_forget_request(rnode, forkNum, segno);
+
 			sprintf(segpath, "%s.%u", path, segno);
 			if (unlink(segpath) < 0)
 			{
@@ -1004,412 +889,26 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
- *	mdsync() -- Sync previous writes to stable storage.
- */
-void
-mdsync(void)
-{
-	static bool mdsync_in_progress = false;
-
-	HASH_SEQ_STATUS hstat;
-	PendingOperationEntry *entry;
-	int			absorb_counter;
-
-	/* Statistics on sync times */
-	int			processed = 0;
-	instr_time	sync_start,
-				sync_end,
-				sync_diff;
-	uint64		elapsed;
-	uint64		longest = 0;
-	uint64		total_elapsed = 0;
-
-	/*
-	 * This is only called during checkpoints, and checkpoints should only
-	 * occur in processes that have created a pendingOpsTable.
-	 */
-	if (!pendingOpsTable)
-		elog(ERROR, "cannot sync without a pendingOpsTable");
-
-	/*
-	 * If we are in the checkpointer, the sync had better include all fsync
-	 * requests that were queued by backends up to this point.  The tightest
-	 * race condition that could occur is that a buffer that must be written
-	 * and fsync'd for the checkpoint could have been dumped by a backend just
-	 * before it was visited by BufferSync().  We know the backend will have
-	 * queued an fsync request before clearing the buffer's dirtybit, so we
-	 * are safe as long as we do an Absorb after completing BufferSync().
-	 */
-	AbsorbFsyncRequests();
-
-	/*
-	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
-	 * checkpoint), we want to ignore fsync requests that are entered into the
-	 * hashtable after this point --- they should be processed next time,
-	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
-	 * ones: new ones will have cycle_ctr equal to the incremented value of
-	 * mdsync_cycle_ctr.
-	 *
-	 * In normal circumstances, all entries present in the table at this point
-	 * will have cycle_ctr exactly equal to the current (about to be old)
-	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
-	 * fsync'ing loop, then older values of cycle_ctr might remain when we
-	 * come back here to try again.  Repeated checkpoint failures would
-	 * eventually wrap the counter around to the point where an old entry
-	 * might appear new, causing us to skip it, possibly allowing a checkpoint
-	 * to succeed that should not have.  To forestall wraparound, any time the
-	 * previous mdsync() failed to complete, run through the table and
-	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
-	 *
-	 * Think not to merge this loop with the main loop, as the problem is
-	 * exactly that that loop may fail before having visited all the entries.
-	 * From a performance point of view it doesn't matter anyway, as this path
-	 * will never be taken in a system that's functioning normally.
-	 */
-	if (mdsync_in_progress)
-	{
-		/* prior try failed, so update any stale cycle_ctr values */
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
-		}
-	}
-
-	/* Advance counter so that new hashtable entries are distinguishable */
-	mdsync_cycle_ctr++;
-
-	/* Set flag to detect failure if we don't reach the end of the loop */
-	mdsync_in_progress = true;
-
-	/* Now scan the hashtable for fsync requests to process */
-	absorb_counter = FSYNCS_PER_ABSORB;
-	hash_seq_init(&hstat, pendingOpsTable);
-	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-	{
-		ForkNumber	forknum;
-
-		/*
-		 * If the entry is new then don't process it this time; it might
-		 * contain multiple fsync-request bits, but they are all new.  Note
-		 * "continue" bypasses the hash-remove call at the bottom of the loop.
-		 */
-		if (entry->cycle_ctr == mdsync_cycle_ctr)
-			continue;
-
-		/* Else assert we haven't missed it */
-		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
-
-		/*
-		 * Scan over the forks and segments represented by the entry.
-		 *
-		 * The bitmap manipulations are slightly tricky, because we can call
-		 * AbsorbFsyncRequests() inside the loop and that could result in
-		 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
-		 * So we detach it, but if we fail we'll merge it with any new
-		 * requests that have arrived in the meantime.
-		 */
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			Bitmapset  *requests = entry->requests[forknum];
-			int			segno;
-
-			entry->requests[forknum] = NULL;
-			entry->canceled[forknum] = false;
-
-			segno = -1;
-			while ((segno = bms_next_member(requests, segno)) >= 0)
-			{
-				int			failures;
-
-				/*
-				 * If fsync is off then we don't have to bother opening the
-				 * file at all.  (We delay checking until this point so that
-				 * changing fsync on the fly behaves sensibly.)
-				 */
-				if (!enableFsync)
-					continue;
-
-				/*
-				 * If in checkpointer, we want to absorb pending requests
-				 * every so often to prevent overflow of the fsync request
-				 * queue.  It is unspecified whether newly-added entries will
-				 * be visited by hash_seq_search, but we don't care since we
-				 * don't need to process them anyway.
-				 */
-				if (--absorb_counter <= 0)
-				{
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB;
-				}
-
-				/*
-				 * The fsync table could contain requests to fsync segments
-				 * that have been deleted (unlinked) by the time we get to
-				 * them. Rather than just hoping an ENOENT (or EACCES on
-				 * Windows) error can be ignored, what we do on error is
-				 * absorb pending requests and then retry.  Since mdunlink()
-				 * queues a "cancel" message before actually unlinking, the
-				 * fsync request is guaranteed to be marked canceled after the
-				 * absorb if it really was this case. DROP DATABASE likewise
-				 * has to tell us to forget fsync requests before it starts
-				 * deletions.
-				 */
-				for (failures = 0;; failures++) /* loop exits at "break" */
-				{
-					SMgrRelation reln;
-					MdfdVec    *seg;
-					char	   *path;
-					int			save_errno;
-
-					/*
-					 * Find or create an smgr hash entry for this relation.
-					 * This may seem a bit unclean -- md calling smgr?	But
-					 * it's really the best solution.  It ensures that the
-					 * open file reference isn't permanently leaked if we get
-					 * an error here. (You may say "but an unreferenced
-					 * SMgrRelation is still a leak!" Not really, because the
-					 * only case in which a checkpoint is done by a process
-					 * that isn't about to shut down is in the checkpointer,
-					 * and it will periodically do smgrcloseall(). This fact
-					 * justifies our not closing the reln in the success path
-					 * either, which is a good thing since in non-checkpointer
-					 * cases we couldn't safely do that.)
-					 */
-					reln = smgropen(entry->rnode, InvalidBackendId);
-
-					/* Attempt to open and fsync the target segment */
-					seg = _mdfd_getseg(reln, forknum,
-									   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
-									   false,
-									   EXTENSION_RETURN_NULL
-									   | EXTENSION_DONT_CHECK_SIZE);
-
-					INSTR_TIME_SET_CURRENT(sync_start);
-
-					if (seg != NULL &&
-						FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
-					{
-						/* Success; update statistics about sync timing */
-						INSTR_TIME_SET_CURRENT(sync_end);
-						sync_diff = sync_end;
-						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
-						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
-						if (elapsed > longest)
-							longest = elapsed;
-						total_elapsed += elapsed;
-						processed++;
-						requests = bms_del_member(requests, segno);
-						if (log_checkpoints)
-							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
-								 processed,
-								 FilePathName(seg->mdfd_vfd),
-								 (double) elapsed / 1000);
-
-						break;	/* out of retry loop */
-					}
-
-					/* Compute file name for use in message */
-					save_errno = errno;
-					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
-					errno = save_errno;
-
-					/*
-					 * It is possible that the relation has been dropped or
-					 * truncated since the fsync request was entered.
-					 * Therefore, allow ENOENT, but only if we didn't fail
-					 * already on this file.  This applies both for
-					 * _mdfd_getseg() and for FileSync, since fd.c might have
-					 * closed the file behind our back.
-					 *
-					 * XXX is there any point in allowing more than one retry?
-					 * Don't see one at the moment, but easy to change the
-					 * test here if so.
-					 */
-					if (!FILE_POSSIBLY_DELETED(errno) ||
-						failures > 0)
-					{
-						Bitmapset  *new_requests;
-
-						/*
-						 * We need to merge these unsatisfied requests with
-						 * any others that have arrived since we started.
-						 */
-						new_requests = entry->requests[forknum];
-						entry->requests[forknum] =
-							bms_join(new_requests, requests);
-
-						errno = save_errno;
-						ereport(data_sync_elevel(ERROR),
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\": %m",
-										path)));
-					}
-					else
-						ereport(DEBUG1,
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\" but retrying: %m",
-										path)));
-					pfree(path);
-
-					/*
-					 * Absorb incoming requests and check to see if a cancel
-					 * arrived for this relation fork.
-					 */
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
-
-					if (entry->canceled[forknum])
-						break;
-				}				/* end retry loop */
-			}
-			bms_free(requests);
-		}
-
-		/*
-		 * We've finished everything that was requested before we started to
-		 * scan the entry.  If no new requests have been inserted meanwhile,
-		 * remove the entry.  Otherwise, update its cycle counter, as all the
-		 * requests now in it must have arrived during this cycle.
-		 */
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			if (entry->requests[forknum] != NULL)
-				break;
-		}
-		if (forknum <= MAX_FORKNUM)
-			entry->cycle_ctr = mdsync_cycle_ctr;
-		else
-		{
-			/* Okay to remove it */
-			if (hash_search(pendingOpsTable, &entry->rnode,
-							HASH_REMOVE, NULL) == NULL)
-				elog(ERROR, "pendingOpsTable corrupted");
-		}
-	}							/* end loop over hashtable entries */
-
-	/* Return sync performance metrics for report at checkpoint end */
-	CheckpointStats.ckpt_sync_rels = processed;
-	CheckpointStats.ckpt_longest_sync = longest;
-	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
-
-	/* Flag successful completion of mdsync */
-	mdsync_in_progress = false;
-}
-
-/*
- * mdpreckpt() -- Do pre-checkpoint work
- *
- * To distinguish unlink requests that arrived before this checkpoint
- * started from those that arrived during the checkpoint, we use a cycle
- * counter similar to the one we use for fsync requests. That cycle
- * counter is incremented here.
- *
- * This must be called *before* the checkpoint REDO point is determined.
- * That ensures that we won't delete files too soon.
- *
- * Note that we can't do anything here that depends on the assumption
- * that the checkpoint will be completed.
- */
-void
-mdpreckpt(void)
-{
-	/*
-	 * Any unlink requests arriving after this point will be assigned the next
-	 * cycle counter, and won't be unlinked until next checkpoint.
-	 */
-	mdckpt_cycle_ctr++;
-}
-
-/*
- * mdpostckpt() -- Do post-checkpoint work
- *
- * Remove any lingering files that can now be safely removed.
- */
-void
-mdpostckpt(void)
-{
-	int			absorb_counter;
-
-	absorb_counter = UNLINKS_PER_ABSORB;
-	while (pendingUnlinks != NIL)
-	{
-		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
-		char	   *path;
-
-		/*
-		 * New entries are appended to the end, so if the entry is new we've
-		 * reached the end of old entries.
-		 *
-		 * Note: if just the right number of consecutive checkpoints fail, we
-		 * could be fooled here by cycle_ctr wraparound.  However, the only
-		 * consequence is that we'd delay unlinking for one more checkpoint,
-		 * which is perfectly tolerable.
-		 */
-		if (entry->cycle_ctr == mdckpt_cycle_ctr)
-			break;
-
-		/* Unlink the file */
-		path = relpathperm(entry->rnode, MAIN_FORKNUM);
-		if (unlink(path) < 0)
-		{
-			/*
-			 * There's a race condition, when the database is dropped at the
-			 * same time that we process the pending unlink requests. If the
-			 * DROP DATABASE deletes the file before we do, we will get ENOENT
-			 * here. rmtree() also has to ignore ENOENT errors, to deal with
-			 * the possibility that we delete the file first.
-			 */
-			if (errno != ENOENT)
-				ereport(WARNING,
-						(errcode_for_file_access(),
-						 errmsg("could not remove file \"%s\": %m", path)));
-		}
-		pfree(path);
-
-		/* And remove the list entry */
-		pendingUnlinks = list_delete_first(pendingUnlinks);
-		pfree(entry);
-
-		/*
-		 * As in mdsync, we don't want to stop absorbing fsync requests for a
-		 * long time when there are many deletions to be done.  We can safely
-		 * call AbsorbFsyncRequests() at this point in the loop (note it might
-		 * try to delete list entries).
-		 */
-		if (--absorb_counter <= 0)
-		{
-			AbsorbFsyncRequests();
-			absorb_counter = UNLINKS_PER_ABSORB;
-		}
-	}
-}
-
-/*
  * register_dirty_segment() -- Mark a relation segment as needing fsync
  *
  * If there is a local pending-ops table, just make an entry in it for
- * mdsync to process later.  Otherwise, try to pass off the fsync request
- * to the checkpointer process.  If that fails, just do the fsync
- * locally before returning (we hope this will not happen often enough
- * to be a performance problem).
+ * ProcessSyncRequests to process later.  Otherwise, try to pass off the
+ * fsync request to the checkpointer process.  If that fails, just do the
+ * fsync locally before returning (we hope this will not happen often
+ * enough to be a performance problem).
  */
 static void
 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 {
+	FileTag		tag;
+
+	INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
+
 	/* Temp relations should never be fsync'd */
 	Assert(!SmgrIsTemp(reln));
 
-	if (pendingOpsTable)
+	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
 	{
-		/* push it into local pending-ops table */
-		RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
-	}
-	else
-	{
-		if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
-			return;				/* passed it off successfully */
-
 		ereport(DEBUG1,
 				(errmsg("could not forward fsync request because request queue is full")));
 
@@ -1423,254 +922,51 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
 /*
-*	register_unlink() -- Schedule a file to be deleted after next checkpoint
+ * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
- *
- * We don't bother passing in the fork number, because this is only used
- * with main forks.
- *
- * As with register_dirty_segment, this could involve either a local or
- * a remote pending-ops table.
  */
 static void
-register_unlink(RelFileNodeBackend rnode)
+register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
+						BlockNumber segno)
 {
+	FileTag		tag;
+
+	INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
+
 	/* Should never be used with temp relations */
 	Assert(!RelFileNodeBackendIsTemp(rnode));
 
-	if (pendingOpsTable)
-	{
-		/* push it into local pending-ops table */
-		RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
-							 UNLINK_RELATION_REQUEST);
-	}
-	else
-	{
-		/*
-		 * Notify the checkpointer about it.  If we fail to queue the request
-		 * message, we have to sleep and try again, because we can't simply
-		 * delete the file now.  Ugly, but hopefully won't happen often.
-		 *
-		 * XXX should we just leave the file orphaned instead?
-		 */
-		Assert(IsUnderPostmaster);
-		while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
-									UNLINK_RELATION_REQUEST))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-	}
+	RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
 }
 
 /*
- * RememberFsyncRequest() -- callback from checkpointer side of fsync request
- *
- * We stuff fsync requests into the local hash table for execution
- * during the checkpointer's next checkpoint.  UNLINK requests go into a
- * separate linked list, however, because they get processed separately.
- *
- * The range of possible segment numbers is way less than the range of
- * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define three:
- * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
- *	 either for one fork, or all forks if forknum is InvalidForkNumber
- * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
- * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
- *	 checkpoint.
- * Note also that we're assuming real segment numbers don't exceed INT_MAX.
- *
- * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
- * table has to be searched linearly, but dropping a database is a pretty
- * heavyweight operation anyhow, so we'll live with it.)
+ * register_forget_request() -- forget any fsyncs for a relation fork's segment
  */
-void
-RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+static void
+register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
+						BlockNumber segno)
 {
-	Assert(pendingOpsTable);
-
-	if (segno == FORGET_RELATION_FSYNC)
-	{
-		/* Remove any pending requests for the relation (one or all forks) */
-		PendingOperationEntry *entry;
-
-		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &rnode,
-													  HASH_FIND,
-													  NULL);
-		if (entry)
-		{
-			/*
-			 * We can't just delete the entry since mdsync could have an
-			 * active hashtable scan.  Instead we delete the bitmapsets; this
-			 * is safe because of the way mdsync is coded.  We also set the
-			 * "canceled" flags so that mdsync can tell that a cancel arrived
-			 * for the fork(s).
-			 */
-			if (forknum == InvalidForkNumber)
-			{
-				/* remove requests for all forks */
-				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-				{
-					bms_free(entry->requests[forknum]);
-					entry->requests[forknum] = NULL;
-					entry->canceled[forknum] = true;
-				}
-			}
-			else
-			{
-				/* remove requests for single fork */
-				bms_free(entry->requests[forknum]);
-				entry->requests[forknum] = NULL;
-				entry->canceled[forknum] = true;
-			}
-		}
-	}
-	else if (segno == FORGET_DATABASE_FSYNC)
-	{
-		/* Remove any pending requests for the entire database */
-		HASH_SEQ_STATUS hstat;
-		PendingOperationEntry *entry;
-		ListCell   *cell,
-				   *prev,
-				   *next;
-
-		/* Remove fsync requests */
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			if (entry->rnode.dbNode == rnode.dbNode)
-			{
-				/* remove requests for all forks */
-				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-				{
-					bms_free(entry->requests[forknum]);
-					entry->requests[forknum] = NULL;
-					entry->canceled[forknum] = true;
-				}
-			}
-		}
-
-		/* Remove unlink requests */
-		prev = NULL;
-		for (cell = list_head(pendingUnlinks); cell; cell = next)
-		{
-			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
-			next = lnext(cell);
-			if (entry->rnode.dbNode == rnode.dbNode)
-			{
-				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
-				pfree(entry);
-			}
-			else
-				prev = cell;
-		}
-	}
-	else if (segno == UNLINK_RELATION_REQUEST)
-	{
-		/* Unlink request: put it in the linked list */
-		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
-		PendingUnlinkEntry *entry;
-
-		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
-		Assert(forknum == MAIN_FORKNUM);
-
-		entry = palloc(sizeof(PendingUnlinkEntry));
-		entry->rnode = rnode;
-		entry->cycle_ctr = mdckpt_cycle_ctr;
-
-		pendingUnlinks = lappend(pendingUnlinks, entry);
-
-		MemoryContextSwitchTo(oldcxt);
-	}
-	else
-	{
-		/* Normal case: enter a request to fsync this segment */
-		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
-		PendingOperationEntry *entry;
-		bool		found;
-
-		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &rnode,
-													  HASH_ENTER,
-													  &found);
-		/* if new entry, initialize it */
-		if (!found)
-		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
-			MemSet(entry->requests, 0, sizeof(entry->requests));
-			MemSet(entry->canceled, 0, sizeof(entry->canceled));
-		}
-
-		/*
-		 * NB: it's intentional that we don't change cycle_ctr if the entry
-		 * already exists.  The cycle_ctr must represent the oldest fsync
-		 * request that could be in the entry.
-		 */
-
-		entry->requests[forknum] = bms_add_member(entry->requests[forknum],
-												  (int) segno);
-
-		MemoryContextSwitchTo(oldcxt);
-	}
-}
+	FileTag		tag;
 
-/*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
- *
- * forknum == InvalidForkNumber means all forks, although this code doesn't
- * actually know that, since it's just forwarding the request elsewhere.
- */
-void
-ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
-{
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/*
-		 * Notify the checkpointer about it.  If we fail to queue the cancel
-		 * message, we have to sleep and try again ... ugly, but hopefully
-		 * won't happen often.
-		 *
-		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
-		 * error would leave the no-longer-used file still present on disk,
-		 * which would be bad, so I'm inclined to assume that the checkpointer
-		 * will always empty the queue soon.
-		 */
-		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
+	INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
 
-		/*
-		 * Note we don't wait for the checkpointer to actually absorb the
-		 * cancel message; see mdsync() for the implications.
-		 */
-	}
+	RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
 }
 
 /*
- * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
+ * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
  */
 void
-ForgetDatabaseFsyncRequests(Oid dbid)
+ForgetDatabaseSyncRequests(Oid dbid)
 {
+	FileTag		tag;
 	RelFileNode rnode;
 
 	rnode.dbNode = dbid;
 	rnode.spcNode = 0;
 	rnode.relNode = 0;
 
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/* see notes in ForgetRelationFsyncRequests */
-		while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
-									FORGET_DATABASE_FSYNC))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-	}
+	INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber);
+
+	RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
 }
 
 /*
@@ -1951,3 +1247,72 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 	/* note that this calculation will ignore any partial block at EOF */
 	return (BlockNumber) (len / BLCKSZ);
 }
+
+/*
+ * Sync a file to disk, given a file tag.  Write the path into an output
+ * buffer (of at least MAXPGPATH bytes) so the caller can use it in messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdsyncfiletag(const FileTag *ftag, char *path)
+{
+	SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
+	MdfdVec    *v;
+	char	   *p;
+
+	/* Provide the path for informational messages. */
+	p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
+	strlcpy(path, p, MAXPGPATH);
+	pfree(p);
+
+	/* Try to open the segment; _mdfd_getseg() takes a block number. */
+	v = _mdfd_getseg(reln, ftag->forknum, ftag->segno * (BlockNumber) RELSEG_SIZE,
+					 false, EXTENSION_RETURN_NULL | EXTENSION_DONT_CHECK_SIZE);
+	if (v == NULL)
+	{
+		errno = ENOENT;
+		return -1;
+	}
+
+	/* Try to fsync the file. */
+	return FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC);
+}
+
+/*
+ * Remove the file identified by a file tag.  The file's path is copied into
+ * the caller's buffer (up to MAXPGPATH bytes) for use in error messages.
+ *
+ * Returns the result of unlink(): 0 on success, -1 on failure with errno set.
+ */
+int
+mdunlinkfiletag(const FileTag *ftag, char *path)
+{
+	char	   *segpath;
+	SMgrRelation rel;
+
+	/* Work out which file the tag refers to and hand the name back. */
+	rel = smgropen(ftag->rnode, InvalidBackendId);
+	segpath = _mdfd_segpath(rel, ftag->forknum, ftag->segno);
+	strlcpy(path, segpath, MAXPGPATH);
+	pfree(segpath);
+
+	return unlink(path);
+}
+
+/*
+ * Decide whether a pending request ("candidate") is covered by the tag of a
+ * SYNC_FILTER_REQUEST ("ftag").  This is called for every pending request;
+ * a true result means the candidate should be forgotten.
+ */
+bool
+mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
+{
+	Oid			filter_db = ftag->rnode.dbNode;
+	Oid			candidate_db = candidate->rnode.dbNode;
+
+	/*
+	 * For now filters only serve DROP DATABASE, so compare database OIDs.
+	 */
+	return filter_db == candidate_db;
+}