diff options
Diffstat (limited to 'src/backend/storage/smgr/md.c')
-rw-r--r-- | src/backend/storage/smgr/md.c | 308 |
1 files changed, 247 insertions, 61 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2122a243207..5ac5868f690 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -21,8 +21,10 @@ #include "catalog/catalog.h" #include "miscadmin.h" +#include "postmaster/bgwriter.h" #include "storage/fd.h" #include "storage/smgr.h" +#include "utils/hsearch.h" #include "utils/memutils.h" @@ -33,37 +35,68 @@ * system's file size limit (often 2GBytes). In order to do that, * we break relations up into chunks of < 2GBytes and store one chunk * in each of several files that represent the relation. See the - * BLCKSZ and RELSEG_SIZE configuration constants in - * include/pg_config.h. All chunks except the last MUST have size exactly - * equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate(). + * BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h. + * All chunks except the last MUST have size exactly equal to RELSEG_SIZE + * blocks --- see mdnblocks() and mdtruncate(). * * The file descriptor pointer (md_fd field) stored in the SMgrRelation * cache is, therefore, just the head of a list of MdfdVec objects. * But note the md_fd pointer can be NULL, indicating relation not open. * + * Note that mdfd_chain == NULL does not necessarily mean the relation + * doesn't have another segment after this one; we may just not have + * opened the next segment yet. (We could not have "all segments are + * in the chain" as an invariant anyway, since another backend could + * extend the relation when we weren't looking.) + * * All MdfdVec objects are palloc'd in the MdCxt memory context. */ typedef struct _MdfdVec { File mdfd_vfd; /* fd number in fd.c's pool */ - -#ifndef LET_OS_MANAGE_FILESIZE - struct _MdfdVec *mdfd_chain; /* for large relations */ + BlockNumber mdfd_segno; /* segment number, from 0 */ +#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */ + struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ #endif } MdfdVec; static MemoryContext MdCxt; /* context for all md.c allocations */ -/* routines declared here */ -static MdfdVec *mdopen(SMgrRelation reln); +/* + * In some contexts (currently, standalone backends and the bgwriter process) + * we keep track of pending fsync operations: we need to remember all relation + * segments that have been written since the last checkpoint, so that we can + * fsync them down to disk before completing the next checkpoint. This hash + * table remembers the pending operations. We use a hash table not because + * we want to look up individual operations, but simply as a convenient way + * of eliminating duplicate requests. + * + * (Regular backends do not track pending operations locally, but forward + * them to the bgwriter.) + * + * XXX for WIN32, may want to expand this to track pending deletes, too. + */ +typedef struct +{ + RelFileNode rnode; /* the targeted relation */ + BlockNumber segno; /* which segment */ +} PendingOperationEntry; + +static HTAB *pendingOpsTable = NULL; + + +/* local routines */ +static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound); +static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg); static MdfdVec *_fdvec_alloc(void); #ifndef LET_OS_MANAGE_FILESIZE static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags); #endif -static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno); +static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, + bool allowNotFound); static BlockNumber _mdnblocks(File file, Size blcksz); @@ -79,6 +112,31 @@ mdinit(void) ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + /* + * Create pending-operations hashtable if we need it. Currently, + * we need it if we are standalone (not under a postmaster) OR + * if we are a bootstrap-mode subprocess of a postmaster (that is, + * a startup or bgwriter process). + */ + if (!IsUnderPostmaster || IsBootstrapProcessingMode()) + { + HASHCTL hash_ctl; + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(PendingOperationEntry); + hash_ctl.entrysize = sizeof(PendingOperationEntry); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = MdCxt; + pendingOpsTable = hash_create("Pending Ops Table", + 100L, + &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + if (pendingOpsTable == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + return true; } @@ -130,6 +188,7 @@ mdcreate(SMgrRelation reln, bool isRedo) reln->md_fd = _fdvec_alloc(); reln->md_fd->mdfd_vfd = fd; + reln->md_fd->mdfd_segno = 0; #ifndef LET_OS_MANAGE_FILESIZE reln->md_fd->mdfd_chain = NULL; #endif @@ -217,7 +276,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer) int nbytes; MdfdVec *v; - v = _mdfd_getseg(reln, blocknum); + v = _mdfd_getseg(reln, blocknum, false); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); @@ -252,6 +311,9 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer) return false; } + if (!register_dirty_segment(reln, v)) + return false; + #ifndef LET_OS_MANAGE_FILESIZE Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE)); #endif @@ -261,12 +323,14 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer) /* * mdopen() -- Open the specified relation. ereport's on failure. + * (Optionally, can return NULL instead of ereport for ENOENT.) * * Note we only open the first segment, when there are multiple segments. */ static MdfdVec * -mdopen(SMgrRelation reln) +mdopen(SMgrRelation reln, bool allowNotFound) { + MdfdVec *mdfd; char *path; File fd; @@ -292,6 +356,8 @@ mdopen(SMgrRelation reln) if (fd < 0) { pfree(path); + if (allowNotFound && errno == ENOENT) + return NULL; ereport(ERROR, (errcode_for_file_access(), errmsg("could not open relation %u/%u: %m", @@ -302,15 +368,16 @@ mdopen(SMgrRelation reln) pfree(path); - reln->md_fd = _fdvec_alloc(); + reln->md_fd = mdfd = _fdvec_alloc(); - reln->md_fd->mdfd_vfd = fd; + mdfd->mdfd_vfd = fd; + mdfd->mdfd_segno = 0; #ifndef LET_OS_MANAGE_FILESIZE - reln->md_fd->mdfd_chain = NULL; + mdfd->mdfd_chain = NULL; Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE)); #endif - return reln->md_fd; + return mdfd; } /* @@ -361,7 +428,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer) int nbytes; MdfdVec *v; - v = _mdfd_getseg(reln, blocknum); + v = _mdfd_getseg(reln, blocknum, false); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); @@ -403,7 +470,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer) long seekpos; MdfdVec *v; - v = _mdfd_getseg(reln, blocknum); + v = _mdfd_getseg(reln, blocknum, false); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); @@ -418,6 +485,9 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer) if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) return false; + if (!register_dirty_segment(reln, v)) + return false; + return true; } @@ -434,7 +504,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer) BlockNumber mdnblocks(SMgrRelation reln) { - MdfdVec *v = mdopen(reln); + MdfdVec *v = mdopen(reln, false); #ifndef LET_OS_MANAGE_FILESIZE BlockNumber nblocks; @@ -516,7 +586,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks) if (nblocks == curnblk) return nblocks; /* no work */ - v = mdopen(reln); + v = mdopen(reln, false); #ifndef LET_OS_MANAGE_FILESIZE priorblocks = 0; @@ -576,40 +646,154 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks) } /* - * mdcommit() -- Commit a transaction. + * mdsync() -- Sync previous writes to stable storage. + * + * This is only called during checkpoints, and checkpoints should only + * occur in processes that have created a pendingOpsTable. */ bool -mdcommit(void) +mdsync(void) { + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; + + if (!pendingOpsTable) + return false; + /* - * We don't actually have to do anything here... + * If we are in the bgwriter, the sync had better include all fsync + * requests that were queued by backends before the checkpoint REDO + * point was determined. We go that a little better by accepting + * all requests queued up to the point where we start fsync'ing. */ + AbsorbFsyncRequests(); + + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + { + /* + * If fsync is off then we don't have to bother opening the file + * at all. (We delay checking until this point so that changing + * fsync on the fly behaves sensibly.) + */ + if (enableFsync) + { + SMgrRelation reln; + MdfdVec *seg; + + /* + * Find or create an smgr hash entry for this relation. + * This may seem a bit unclean -- md calling smgr? But it's + * really the best solution. It ensures that the open file + * reference isn't permanently leaked if we get an error here. + * (You may say "but an unreferenced SMgrRelation is still a + * leak!" Not really, because the only case in which a checkpoint + * is done by a process that isn't about to shut down is in the + * bgwriter, and it will periodically do smgrcloseall(). This + * fact justifies our not closing the reln in the success path + * either, which is a good thing since in non-bgwriter cases + * we couldn't safely do that.) Furthermore, in many cases + * the relation will have been dirtied through this same smgr + * relation, and so we can save a file open/close cycle. + */ + reln = smgropen(entry->rnode); + + /* + * It is possible that the relation has been dropped or truncated + * since the fsync request was entered. Therefore, we have to + * allow file-not-found errors. This applies both during + * _mdfd_getseg() and during FileSync, since fd.c might have + * closed the file behind our back. + */ + seg = _mdfd_getseg(reln, + entry->segno * ((BlockNumber) RELSEG_SIZE), + true); + if (seg) + { + if (FileSync(seg->mdfd_vfd) < 0 && + errno != ENOENT) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not fsync segment %u of relation %u/%u: %m", + entry->segno, + entry->rnode.tblNode, + entry->rnode.relNode))); + return false; + } + } + } + + /* Okay, delete this entry */ + if (hash_search(pendingOpsTable, entry, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOpsTable corrupted"); + } + return true; } /* - * mdabort() -- Abort a transaction. + * register_dirty_segment() -- Mark a relation segment as needing fsync + * + * If there is a local pending-ops table, just make an entry in it for + * mdsync to process later. Otherwise, try to pass off the fsync request + * to the background writer process. If that fails, just do the fsync + * locally before returning (we expect this will not happen often enough + * to be a performance problem). + * + * A false result implies I/O failure during local fsync. errno will be + * valid for error reporting. */ -bool -mdabort(void) +static bool +register_dirty_segment(SMgrRelation reln, MdfdVec *seg) { - /* - * We don't actually have to do anything here... - */ + if (pendingOpsTable) + { + PendingOperationEntry entry; + + /* ensure any pad bytes in the struct are zeroed */ + MemSet(&entry, 0, sizeof(entry)); + entry.rnode = reln->smgr_rnode; + entry.segno = seg->mdfd_segno; + + if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL) + return true; + /* out of memory: fall through to do it locally */ + } + else + { + if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno)) + return true; + } + + if (FileSync(seg->mdfd_vfd) < 0) + return false; return true; } /* - * mdsync() -- Sync previous writes to stable storage. + * RememberFsyncRequest() -- callback from bgwriter side of fsync request + * + * We stuff the fsync request into the local hash table for execution + * during the bgwriter's next checkpoint. */ -bool -mdsync(void) +void +RememberFsyncRequest(RelFileNode rnode, BlockNumber segno) { - sync(); - if (IsUnderPostmaster) - pg_usleep(2000000L); - sync(); - return true; + PendingOperationEntry entry; + + Assert(pendingOpsTable); + + /* ensure any pad bytes in the struct are zeroed */ + MemSet(&entry, 0, sizeof(entry)); + entry.rnode = rnode; + entry.segno = segno; + + if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); } /* @@ -618,18 +802,11 @@ mdsync(void) static MdfdVec * _fdvec_alloc(void) { - MdfdVec *v; - - v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec)); - v->mdfd_vfd = -1; -#ifndef LET_OS_MANAGE_FILESIZE - v->mdfd_chain = NULL; -#endif - - return v; + return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec)); } #ifndef LET_OS_MANAGE_FILESIZE + /* * Open the specified segment of the relation, * and make a MdfdVec object for it. Returns NULL on failure. @@ -642,11 +819,11 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags) char *path, *fullpath; - /* be sure we have enough space for the '.segno', if any */ path = relpath(reln->smgr_rnode); if (segno > 0) { + /* be sure we have enough space for the '.segno' */ fullpath = (char *) palloc(strlen(path) + 12); sprintf(fullpath, "%s.%u", path, segno); pfree(path); @@ -667,32 +844,36 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags) /* fill the entry */ v->mdfd_vfd = fd; + v->mdfd_segno = segno; v->mdfd_chain = NULL; Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE)); /* all done */ return v; } -#endif + +#endif /* LET_OS_MANAGE_FILESIZE */ /* * _mdfd_getseg() -- Find the segment of the relation holding the - * specified block. ereport's on failure. + * specified block. ereport's on failure. + * (Optionally, can return NULL instead of ereport for ENOENT.) */ static MdfdVec * -_mdfd_getseg(SMgrRelation reln, BlockNumber blkno) +_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound) { - MdfdVec *v = mdopen(reln); - + MdfdVec *v = mdopen(reln, allowNotFound); #ifndef LET_OS_MANAGE_FILESIZE - BlockNumber segno; - BlockNumber i; + BlockNumber segstogo; + BlockNumber nextsegno; - for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1; - segno > 0; - i++, segno--) - { + if (!v) + return NULL; /* only possible if allowNotFound */ + for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1; + segstogo > 0; + nextsegno++, segstogo--) + { if (v->mdfd_chain == NULL) { /* @@ -705,16 +886,21 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno) * one new segment per call, so this restriction seems * reasonable. */ - v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0); - + v->mdfd_chain = _mdfd_openseg(reln, + nextsegno, + (segstogo == 1) ? O_CREAT : 0); if (v->mdfd_chain == NULL) + { + if (allowNotFound && errno == ENOENT) + return NULL; ereport(ERROR, (errcode_for_file_access(), errmsg("could not open segment %u of relation %u/%u (target block %u): %m", - i, + nextsegno, reln->smgr_rnode.tblNode, reln->smgr_rnode.relNode, blkno))); + } } v = v->mdfd_chain; } |