diff options
Diffstat (limited to 'src/backend/storage/smgr/md.c')
-rw-r--r-- | src/backend/storage/smgr/md.c | 885 |
1 files changed, 125 insertions, 760 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 6ed68185edb..ffb3569698f 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -29,45 +29,17 @@ #include "access/xlogutils.h" #include "access/xlog.h" #include "pgstat.h" -#include "portability/instr_time.h" #include "postmaster/bgwriter.h" #include "storage/fd.h" #include "storage/bufmgr.h" +#include "storage/md.h" #include "storage/relfilenode.h" #include "storage/smgr.h" +#include "storage/sync.h" #include "utils/hsearch.h" #include "utils/memutils.h" #include "pg_trace.h" - -/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */ -#define FSYNCS_PER_ABSORB 10 -#define UNLINKS_PER_ABSORB 10 - -/* - * Special values for the segno arg to RememberFsyncRequest. - * - * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an - * fsync request from the queue if an identical, subsequent request is found. - * See comments there before making changes here. - */ -#define FORGET_RELATION_FSYNC (InvalidBlockNumber) -#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1) -#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2) - -/* - * On Windows, we have to interpret EACCES as possibly meaning the same as - * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform, - * that's what you get. Ugh. This code is designed so that we don't - * actually believe these cases are okay without further evidence (namely, - * a pending fsync request getting canceled ... see mdsync). - */ -#ifndef WIN32 -#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT) -#else -#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES) -#endif - /* * The magnetic disk storage manager keeps track of open file * descriptors in its own descriptor pool. This is done to make it @@ -115,49 +87,15 @@ typedef struct _MdfdVec static MemoryContext MdCxt; /* context for all MdfdVec objects */ -/* - * In some contexts (currently, standalone backends and the checkpointer) - * we keep track of pending fsync operations: we need to remember all relation - * segments that have been written since the last checkpoint, so that we can - * fsync them down to disk before completing the next checkpoint. This hash - * table remembers the pending operations. We use a hash table mostly as - * a convenient way of merging duplicate requests. - * - * We use a similar mechanism to remember no-longer-needed files that can - * be deleted after the next checkpoint, but we use a linked list instead of - * a hash table, because we don't expect there to be any duplicate requests. - * - * These mechanisms are only used for non-temp relations; we never fsync - * temp rels, nor do we need to postpone their deletion (see comments in - * mdunlink). - * - * (Regular backends do not track pending operations locally, but forward - * them to the checkpointer.) - */ -typedef uint16 CycleCtr; /* can be any convenient integer size */ - -typedef struct -{ - RelFileNode rnode; /* hash table key (must be first!) */ - CycleCtr cycle_ctr; /* mdsync_cycle_ctr of oldest request */ - /* requests[f] has bit n set if we need to fsync segment n of fork f */ - Bitmapset *requests[MAX_FORKNUM + 1]; - /* canceled[f] is true if we canceled fsyncs for fork "recently" */ - bool canceled[MAX_FORKNUM + 1]; -} PendingOperationEntry; - -typedef struct -{ - RelFileNode rnode; /* the dead relation to delete */ - CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */ -} PendingUnlinkEntry; - -static HTAB *pendingOpsTable = NULL; -static List *pendingUnlinks = NIL; -static MemoryContext pendingOpsCxt; /* context for the above */ - -static CycleCtr mdsync_cycle_ctr = 0; -static CycleCtr mdckpt_cycle_ctr = 0; +/* Populate a file tag describing an md.c segment file. */ +#define INIT_MD_FILETAG(a,xx_rnode,xx_forknum,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = SYNC_HANDLER_MD, \ + (a).rnode = (xx_rnode), \ + (a).forknum = (xx_forknum), \ + (a).segno = (xx_segno) \ +) /*** behavior for mdopen & _mdfd_getseg ***/ @@ -185,7 +123,10 @@ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior); static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); -static void register_unlink(RelFileNodeBackend rnode); +static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, + BlockNumber segno); +static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, + BlockNumber segno); static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg); @@ -208,64 +149,6 @@ mdinit(void) MdCxt = AllocSetContextCreate(TopMemoryContext, "MdSmgr", ALLOCSET_DEFAULT_SIZES); - - /* - * Create pending-operations hashtable if we need it. Currently, we need - * it if we are standalone (not under a postmaster) or if we are a startup - * or checkpointer auxiliary process. - */ - if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess()) - { - HASHCTL hash_ctl; - - /* - * XXX: The checkpointer needs to add entries to the pending ops table - * when absorbing fsync requests. That is done within a critical - * section, which isn't usually allowed, but we make an exception. It - * means that there's a theoretical possibility that you run out of - * memory while absorbing fsync requests, which leads to a PANIC. - * Fortunately the hash table is small so that's unlikely to happen in - * practice. - */ - pendingOpsCxt = AllocSetContextCreate(MdCxt, - "Pending ops context", - ALLOCSET_DEFAULT_SIZES); - MemoryContextAllowInCriticalSection(pendingOpsCxt, true); - - MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(RelFileNode); - hash_ctl.entrysize = sizeof(PendingOperationEntry); - hash_ctl.hcxt = pendingOpsCxt; - pendingOpsTable = hash_create("Pending Ops Table", - 100L, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - pendingUnlinks = NIL; - } -} - -/* - * In archive recovery, we rely on checkpointer to do fsyncs, but we will have - * already created the pendingOpsTable during initialization of the startup - * process. Calling this function drops the local pendingOpsTable so that - * subsequent requests will be forwarded to checkpointer. - */ -void -SetForwardFsyncRequests(void) -{ - /* Perform any pending fsyncs we may have queued up, then drop table */ - if (pendingOpsTable) - { - mdsync(); - hash_destroy(pendingOpsTable); - } - pendingOpsTable = NULL; - - /* - * We should not have any pending unlink requests, since mdunlink doesn't - * queue unlink requests when isRedo. - */ - Assert(pendingUnlinks == NIL); } /* @@ -380,16 +263,6 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) void mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { - /* - * We have to clean out any pending fsync requests for the doomed - * relation, else the next mdsync() will fail. There can't be any such - * requests for a temp relation, though. We can send just one request - * even when deleting multiple forks, since the fsync queuing code accepts - * the "InvalidForkNumber = all forks" convention. - */ - if (!RelFileNodeBackendIsTemp(rnode)) - ForgetRelationFsyncRequests(rnode.node, forkNum); - /* Now do the per-fork work */ if (forkNum == InvalidForkNumber) { @@ -413,6 +286,11 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) */ if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode)) { + /* First, forget any pending sync requests for the first segment */ + if (!RelFileNodeBackendIsTemp(rnode)) + register_forget_request(rnode, forkNum, 0 /* first seg */ ); + + /* Next unlink the file */ ret = unlink(path); if (ret < 0 && errno != ENOENT) ereport(WARNING, @@ -442,7 +320,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) errmsg("could not truncate file \"%s\": %m", path))); /* Register request to unlink first segment later */ - register_unlink(rnode); + register_unlink_segment(rnode, forkNum, 0 /* first seg */ ); } /* @@ -459,6 +337,13 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) */ for (segno = 1;; segno++) { + /* + * Forget any pending sync requests for this segment before we try + * to unlink. + */ + if (!RelFileNodeBackendIsTemp(rnode)) + register_forget_request(rnode, forkNum, segno); + sprintf(segpath, "%s.%u", path, segno); if (unlink(segpath) < 0) { @@ -1004,412 +889,26 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) } /* - * mdsync() -- Sync previous writes to stable storage. - */ -void -mdsync(void) -{ - static bool mdsync_in_progress = false; - - HASH_SEQ_STATUS hstat; - PendingOperationEntry *entry; - int absorb_counter; - - /* Statistics on sync times */ - int processed = 0; - instr_time sync_start, - sync_end, - sync_diff; - uint64 elapsed; - uint64 longest = 0; - uint64 total_elapsed = 0; - - /* - * This is only called during checkpoints, and checkpoints should only - * occur in processes that have created a pendingOpsTable. - */ - if (!pendingOpsTable) - elog(ERROR, "cannot sync without a pendingOpsTable"); - - /* - * If we are in the checkpointer, the sync had better include all fsync - * requests that were queued by backends up to this point. The tightest - * race condition that could occur is that a buffer that must be written - * and fsync'd for the checkpoint could have been dumped by a backend just - * before it was visited by BufferSync(). We know the backend will have - * queued an fsync request before clearing the buffer's dirtybit, so we - * are safe as long as we do an Absorb after completing BufferSync(). - */ - AbsorbFsyncRequests(); - - /* - * To avoid excess fsync'ing (in the worst case, maybe a never-terminating - * checkpoint), we want to ignore fsync requests that are entered into the - * hashtable after this point --- they should be processed next time, - * instead. We use mdsync_cycle_ctr to tell old entries apart from new - * ones: new ones will have cycle_ctr equal to the incremented value of - * mdsync_cycle_ctr. - * - * In normal circumstances, all entries present in the table at this point - * will have cycle_ctr exactly equal to the current (about to be old) - * value of mdsync_cycle_ctr. However, if we fail partway through the - * fsync'ing loop, then older values of cycle_ctr might remain when we - * come back here to try again. Repeated checkpoint failures would - * eventually wrap the counter around to the point where an old entry - * might appear new, causing us to skip it, possibly allowing a checkpoint - * to succeed that should not have. To forestall wraparound, any time the - * previous mdsync() failed to complete, run through the table and - * forcibly set cycle_ctr = mdsync_cycle_ctr. - * - * Think not to merge this loop with the main loop, as the problem is - * exactly that that loop may fail before having visited all the entries. - * From a performance point of view it doesn't matter anyway, as this path - * will never be taken in a system that's functioning normally. - */ - if (mdsync_in_progress) - { - /* prior try failed, so update any stale cycle_ctr values */ - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) - { - entry->cycle_ctr = mdsync_cycle_ctr; - } - } - - /* Advance counter so that new hashtable entries are distinguishable */ - mdsync_cycle_ctr++; - - /* Set flag to detect failure if we don't reach the end of the loop */ - mdsync_in_progress = true; - - /* Now scan the hashtable for fsync requests to process */ - absorb_counter = FSYNCS_PER_ABSORB; - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) - { - ForkNumber forknum; - - /* - * If the entry is new then don't process it this time; it might - * contain multiple fsync-request bits, but they are all new. Note - * "continue" bypasses the hash-remove call at the bottom of the loop. - */ - if (entry->cycle_ctr == mdsync_cycle_ctr) - continue; - - /* Else assert we haven't missed it */ - Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr); - - /* - * Scan over the forks and segments represented by the entry. - * - * The bitmap manipulations are slightly tricky, because we can call - * AbsorbFsyncRequests() inside the loop and that could result in - * bms_add_member() modifying and even re-palloc'ing the bitmapsets. - * So we detach it, but if we fail we'll merge it with any new - * requests that have arrived in the meantime. - */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - Bitmapset *requests = entry->requests[forknum]; - int segno; - - entry->requests[forknum] = NULL; - entry->canceled[forknum] = false; - - segno = -1; - while ((segno = bms_next_member(requests, segno)) >= 0) - { - int failures; - - /* - * If fsync is off then we don't have to bother opening the - * file at all. (We delay checking until this point so that - * changing fsync on the fly behaves sensibly.) - */ - if (!enableFsync) - continue; - - /* - * If in checkpointer, we want to absorb pending requests - * every so often to prevent overflow of the fsync request - * queue. It is unspecified whether newly-added entries will - * be visited by hash_seq_search, but we don't care since we - * don't need to process them anyway. - */ - if (--absorb_counter <= 0) - { - AbsorbFsyncRequests(); - absorb_counter = FSYNCS_PER_ABSORB; - } - - /* - * The fsync table could contain requests to fsync segments - * that have been deleted (unlinked) by the time we get to - * them. Rather than just hoping an ENOENT (or EACCES on - * Windows) error can be ignored, what we do on error is - * absorb pending requests and then retry. Since mdunlink() - * queues a "cancel" message before actually unlinking, the - * fsync request is guaranteed to be marked canceled after the - * absorb if it really was this case. DROP DATABASE likewise - * has to tell us to forget fsync requests before it starts - * deletions. - */ - for (failures = 0;; failures++) /* loop exits at "break" */ - { - SMgrRelation reln; - MdfdVec *seg; - char *path; - int save_errno; - - /* - * Find or create an smgr hash entry for this relation. - * This may seem a bit unclean -- md calling smgr? But - * it's really the best solution. It ensures that the - * open file reference isn't permanently leaked if we get - * an error here. (You may say "but an unreferenced - * SMgrRelation is still a leak!" Not really, because the - * only case in which a checkpoint is done by a process - * that isn't about to shut down is in the checkpointer, - * and it will periodically do smgrcloseall(). This fact - * justifies our not closing the reln in the success path - * either, which is a good thing since in non-checkpointer - * cases we couldn't safely do that.) - */ - reln = smgropen(entry->rnode, InvalidBackendId); - - /* Attempt to open and fsync the target segment */ - seg = _mdfd_getseg(reln, forknum, - (BlockNumber) segno * (BlockNumber) RELSEG_SIZE, - false, - EXTENSION_RETURN_NULL - | EXTENSION_DONT_CHECK_SIZE); - - INSTR_TIME_SET_CURRENT(sync_start); - - if (seg != NULL && - FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0) - { - /* Success; update statistics about sync timing */ - INSTR_TIME_SET_CURRENT(sync_end); - sync_diff = sync_end; - INSTR_TIME_SUBTRACT(sync_diff, sync_start); - elapsed = INSTR_TIME_GET_MICROSEC(sync_diff); - if (elapsed > longest) - longest = elapsed; - total_elapsed += elapsed; - processed++; - requests = bms_del_member(requests, segno); - if (log_checkpoints) - elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec", - processed, - FilePathName(seg->mdfd_vfd), - (double) elapsed / 1000); - - break; /* out of retry loop */ - } - - /* Compute file name for use in message */ - save_errno = errno; - path = _mdfd_segpath(reln, forknum, (BlockNumber) segno); - errno = save_errno; - - /* - * It is possible that the relation has been dropped or - * truncated since the fsync request was entered. - * Therefore, allow ENOENT, but only if we didn't fail - * already on this file. This applies both for - * _mdfd_getseg() and for FileSync, since fd.c might have - * closed the file behind our back. - * - * XXX is there any point in allowing more than one retry? - * Don't see one at the moment, but easy to change the - * test here if so. - */ - if (!FILE_POSSIBLY_DELETED(errno) || - failures > 0) - { - Bitmapset *new_requests; - - /* - * We need to merge these unsatisfied requests with - * any others that have arrived since we started. - */ - new_requests = entry->requests[forknum]; - entry->requests[forknum] = - bms_join(new_requests, requests); - - errno = save_errno; - ereport(data_sync_elevel(ERROR), - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", - path))); - } - else - ereport(DEBUG1, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\" but retrying: %m", - path))); - pfree(path); - - /* - * Absorb incoming requests and check to see if a cancel - * arrived for this relation fork. - */ - AbsorbFsyncRequests(); - absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ - - if (entry->canceled[forknum]) - break; - } /* end retry loop */ - } - bms_free(requests); - } - - /* - * We've finished everything that was requested before we started to - * scan the entry. If no new requests have been inserted meanwhile, - * remove the entry. Otherwise, update its cycle counter, as all the - * requests now in it must have arrived during this cycle. - */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - if (entry->requests[forknum] != NULL) - break; - } - if (forknum <= MAX_FORKNUM) - entry->cycle_ctr = mdsync_cycle_ctr; - else - { - /* Okay to remove it */ - if (hash_search(pendingOpsTable, &entry->rnode, - HASH_REMOVE, NULL) == NULL) - elog(ERROR, "pendingOpsTable corrupted"); - } - } /* end loop over hashtable entries */ - - /* Return sync performance metrics for report at checkpoint end */ - CheckpointStats.ckpt_sync_rels = processed; - CheckpointStats.ckpt_longest_sync = longest; - CheckpointStats.ckpt_agg_sync_time = total_elapsed; - - /* Flag successful completion of mdsync */ - mdsync_in_progress = false; -} - -/* - * mdpreckpt() -- Do pre-checkpoint work - * - * To distinguish unlink requests that arrived before this checkpoint - * started from those that arrived during the checkpoint, we use a cycle - * counter similar to the one we use for fsync requests. That cycle - * counter is incremented here. - * - * This must be called *before* the checkpoint REDO point is determined. - * That ensures that we won't delete files too soon. - * - * Note that we can't do anything here that depends on the assumption - * that the checkpoint will be completed. - */ -void -mdpreckpt(void) -{ - /* - * Any unlink requests arriving after this point will be assigned the next - * cycle counter, and won't be unlinked until next checkpoint. - */ - mdckpt_cycle_ctr++; -} - -/* - * mdpostckpt() -- Do post-checkpoint work - * - * Remove any lingering files that can now be safely removed. - */ -void -mdpostckpt(void) -{ - int absorb_counter; - - absorb_counter = UNLINKS_PER_ABSORB; - while (pendingUnlinks != NIL) - { - PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks); - char *path; - - /* - * New entries are appended to the end, so if the entry is new we've - * reached the end of old entries. - * - * Note: if just the right number of consecutive checkpoints fail, we - * could be fooled here by cycle_ctr wraparound. However, the only - * consequence is that we'd delay unlinking for one more checkpoint, - * which is perfectly tolerable. - */ - if (entry->cycle_ctr == mdckpt_cycle_ctr) - break; - - /* Unlink the file */ - path = relpathperm(entry->rnode, MAIN_FORKNUM); - if (unlink(path) < 0) - { - /* - * There's a race condition, when the database is dropped at the - * same time that we process the pending unlink requests. If the - * DROP DATABASE deletes the file before we do, we will get ENOENT - * here. rmtree() also has to ignore ENOENT errors, to deal with - * the possibility that we delete the file first. - */ - if (errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - } - pfree(path); - - /* And remove the list entry */ - pendingUnlinks = list_delete_first(pendingUnlinks); - pfree(entry); - - /* - * As in mdsync, we don't want to stop absorbing fsync requests for a - * long time when there are many deletions to be done. We can safely - * call AbsorbFsyncRequests() at this point in the loop (note it might - * try to delete list entries). - */ - if (--absorb_counter <= 0) - { - AbsorbFsyncRequests(); - absorb_counter = UNLINKS_PER_ABSORB; - } - } -} - -/* * register_dirty_segment() -- Mark a relation segment as needing fsync * * If there is a local pending-ops table, just make an entry in it for - * mdsync to process later. Otherwise, try to pass off the fsync request - * to the checkpointer process. If that fails, just do the fsync - * locally before returning (we hope this will not happen often enough - * to be a performance problem). + * ProcessSyncRequests to process later. Otherwise, try to pass off the + * fsync request to the checkpointer process. If that fails, just do the + * fsync locally before returning (we hope this will not happen often + * enough to be a performance problem). */ static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { + FileTag tag; + + INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno); + /* Temp relations should never be fsync'd */ Assert(!SmgrIsTemp(reln)); - if (pendingOpsTable) + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) { - /* push it into local pending-ops table */ - RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno); - } - else - { - if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno)) - return; /* passed it off successfully */ - ereport(DEBUG1, (errmsg("could not forward fsync request because request queue is full"))); @@ -1423,254 +922,51 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) /* * register_unlink() -- Schedule a file to be deleted after next checkpoint - * - * We don't bother passing in the fork number, because this is only used - * with main forks. - * - * As with register_dirty_segment, this could involve either a local or - * a remote pending-ops table. */ static void -register_unlink(RelFileNodeBackend rnode) +register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, + BlockNumber segno) { + FileTag tag; + + INIT_MD_FILETAG(tag, rnode.node, forknum, segno); + /* Should never be used with temp relations */ Assert(!RelFileNodeBackendIsTemp(rnode)); - if (pendingOpsTable) - { - /* push it into local pending-ops table */ - RememberFsyncRequest(rnode.node, MAIN_FORKNUM, - UNLINK_RELATION_REQUEST); - } - else - { - /* - * Notify the checkpointer about it. If we fail to queue the request - * message, we have to sleep and try again, because we can't simply - * delete the file now. Ugly, but hopefully won't happen often. - * - * XXX should we just leave the file orphaned instead? - */ - Assert(IsUnderPostmaster); - while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM, - UNLINK_RELATION_REQUEST)) - pg_usleep(10000L); /* 10 msec seems a good number */ - } + RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ ); } /* - * RememberFsyncRequest() -- callback from checkpointer side of fsync request - * - * We stuff fsync requests into the local hash table for execution - * during the checkpointer's next checkpoint. UNLINK requests go into a - * separate linked list, however, because they get processed separately. - * - * The range of possible segment numbers is way less than the range of - * BlockNumber, so we can reserve high values of segno for special purposes. - * We define three: - * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation, - * either for one fork, or all forks if forknum is InvalidForkNumber - * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database - * - UNLINK_RELATION_REQUEST is a request to delete the file after the next - * checkpoint. - * Note also that we're assuming real segment numbers don't exceed INT_MAX. - * - * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash - * table has to be searched linearly, but dropping a database is a pretty - * heavyweight operation anyhow, so we'll live with it.) + * register_forget_request() -- forget any fsyncs for a relation fork's segment */ -void -RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) +static void +register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, + BlockNumber segno) { - Assert(pendingOpsTable); - - if (segno == FORGET_RELATION_FSYNC) - { - /* Remove any pending requests for the relation (one or all forks) */ - PendingOperationEntry *entry; - - entry = (PendingOperationEntry *) hash_search(pendingOpsTable, - &rnode, - HASH_FIND, - NULL); - if (entry) - { - /* - * We can't just delete the entry since mdsync could have an - * active hashtable scan. Instead we delete the bitmapsets; this - * is safe because of the way mdsync is coded. We also set the - * "canceled" flags so that mdsync can tell that a cancel arrived - * for the fork(s). - */ - if (forknum == InvalidForkNumber) - { - /* remove requests for all forks */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - bms_free(entry->requests[forknum]); - entry->requests[forknum] = NULL; - entry->canceled[forknum] = true; - } - } - else - { - /* remove requests for single fork */ - bms_free(entry->requests[forknum]); - entry->requests[forknum] = NULL; - entry->canceled[forknum] = true; - } - } - } - else if (segno == FORGET_DATABASE_FSYNC) - { - /* Remove any pending requests for the entire database */ - HASH_SEQ_STATUS hstat; - PendingOperationEntry *entry; - ListCell *cell, - *prev, - *next; - - /* Remove fsync requests */ - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) - { - if (entry->rnode.dbNode == rnode.dbNode) - { - /* remove requests for all forks */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - bms_free(entry->requests[forknum]); - entry->requests[forknum] = NULL; - entry->canceled[forknum] = true; - } - } - } - - /* Remove unlink requests */ - prev = NULL; - for (cell = list_head(pendingUnlinks); cell; cell = next) - { - PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); - - next = lnext(cell); - if (entry->rnode.dbNode == rnode.dbNode) - { - pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev); - pfree(entry); - } - else - prev = cell; - } - } - else if (segno == UNLINK_RELATION_REQUEST) - { - /* Unlink request: put it in the linked list */ - MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); - PendingUnlinkEntry *entry; - - /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */ - Assert(forknum == MAIN_FORKNUM); - - entry = palloc(sizeof(PendingUnlinkEntry)); - entry->rnode = rnode; - entry->cycle_ctr = mdckpt_cycle_ctr; - - pendingUnlinks = lappend(pendingUnlinks, entry); - - MemoryContextSwitchTo(oldcxt); - } - else - { - /* Normal case: enter a request to fsync this segment */ - MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); - PendingOperationEntry *entry; - bool found; - - entry = (PendingOperationEntry *) hash_search(pendingOpsTable, - &rnode, - HASH_ENTER, - &found); - /* if new entry, initialize it */ - if (!found) - { - entry->cycle_ctr = mdsync_cycle_ctr; - MemSet(entry->requests, 0, sizeof(entry->requests)); - MemSet(entry->canceled, 0, sizeof(entry->canceled)); - } - - /* - * NB: it's intentional that we don't change cycle_ctr if the entry - * already exists. The cycle_ctr must represent the oldest fsync - * request that could be in the entry. - */ - - entry->requests[forknum] = bms_add_member(entry->requests[forknum], - (int) segno); - - MemoryContextSwitchTo(oldcxt); - } -} + FileTag tag; -/* - * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork - * - * forknum == InvalidForkNumber means all forks, although this code doesn't - * actually know that, since it's just forwarding the request elsewhere. - */ -void -ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum) -{ - if (pendingOpsTable) - { - /* standalone backend or startup process: fsync state is local */ - RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC); - } - else if (IsUnderPostmaster) - { - /* - * Notify the checkpointer about it. If we fail to queue the cancel - * message, we have to sleep and try again ... ugly, but hopefully - * won't happen often. - * - * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an - * error would leave the no-longer-used file still present on disk, - * which would be bad, so I'm inclined to assume that the checkpointer - * will always empty the queue soon. - */ - while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC)) - pg_usleep(10000L); /* 10 msec seems a good number */ + INIT_MD_FILETAG(tag, rnode.node, forknum, segno); - /* - * Note we don't wait for the checkpointer to actually absorb the - * cancel message; see mdsync() for the implications. - */ - } + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ ); } /* * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB */ void -ForgetDatabaseFsyncRequests(Oid dbid) +ForgetDatabaseSyncRequests(Oid dbid) { + FileTag tag; RelFileNode rnode; rnode.dbNode = dbid; rnode.spcNode = 0; rnode.relNode = 0; - if (pendingOpsTable) - { - /* standalone backend or startup process: fsync state is local */ - RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC); - } - else if (IsUnderPostmaster) - { - /* see notes in ForgetRelationFsyncRequests */ - while (!ForwardFsyncRequest(rnode, InvalidForkNumber, - FORGET_DATABASE_FSYNC)) - pg_usleep(10000L); /* 10 msec seems a good number */ - } + INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber); + + RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ ); } /* @@ -1951,3 +1247,72 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) /* note that this calculation will ignore any partial block at EOF */ return (BlockNumber) (len / BLCKSZ); } + +/* + * Sync a file to disk, given a file tag. Write the path into an output + * buffer so the caller can use it in error messages. + * + * Return 0 on success, -1 on failure, with errno set. + */ +int +mdsyncfiletag(const FileTag *ftag, char *path) +{ + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId); + MdfdVec *v; + char *p; + + /* Provide the path for informational messages. */ + p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); + strlcpy(path, p, MAXPGPATH); + pfree(p); + + /* Try to find open the requested segment. */ + v = _mdfd_getseg(reln, ftag->forknum, ftag->segno, false, + EXTENSION_RETURN_NULL); + if (v == NULL) + { + errno = ENOENT; + return -1; + } + + /* Try to fsync the file. */ + return FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC); +} + +/* + * Unlink a file, given a file tag. Write the path into an output + * buffer so the caller can use it in error messages. + * + * Return 0 on success, -1 on failure, with errno set. + */ +int +mdunlinkfiletag(const FileTag *ftag, char *path) +{ + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId); + char *p; + + /* Compute the path. */ + p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); + strlcpy(path, p, MAXPGPATH); + pfree(p); + + /* Try to unlink the file. */ + return unlink(path); +} + +/* + * Check if a given candidate request matches a given tag, when processing + * a SYNC_FILTER_REQUEST request. This will be called for all pending + * requests to find out whether to forget them. + */ +bool +mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) +{ + /* + * For now we only use filter requests as a way to drop all scheduled + * callbacks relating to a given database, when dropping the database. + * We'll return true for all candidates that have the same database OID as + * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten. + */ + return ftag->rnode.dbNode == candidate->rnode.dbNode; +} |