about | summary | refs | log | tree | commit | diff
path: root/src/backend/storage/smgr/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage/smgr/md.c')
-rw-r--r-- src/backend/storage/smgr/md.c | 308
1 files changed, 247 insertions, 61 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2122a243207..5ac5868f690 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -21,8 +21,10 @@
#include "catalog/catalog.h"
#include "miscadmin.h"
+#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/smgr.h"
+#include "utils/hsearch.h"
#include "utils/memutils.h"
@@ -33,37 +35,68 @@
* system's file size limit (often 2GBytes). In order to do that,
* we break relations up into chunks of < 2GBytes and store one chunk
* in each of several files that represent the relation. See the
- * BLCKSZ and RELSEG_SIZE configuration constants in
- * include/pg_config.h. All chunks except the last MUST have size exactly
- * equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
+ * BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
+ * All chunks except the last MUST have size exactly equal to RELSEG_SIZE
+ * blocks --- see mdnblocks() and mdtruncate().
*
* The file descriptor pointer (md_fd field) stored in the SMgrRelation
* cache is, therefore, just the head of a list of MdfdVec objects.
* But note the md_fd pointer can be NULL, indicating relation not open.
*
+ * Note that mdfd_chain == NULL does not necessarily mean the relation
+ * doesn't have another segment after this one; we may just not have
+ * opened the next segment yet. (We could not have "all segments are
+ * in the chain" as an invariant anyway, since another backend could
+ * extend the relation when we weren't looking.)
+ *
* All MdfdVec objects are palloc'd in the MdCxt memory context.
*/
typedef struct _MdfdVec
{
File mdfd_vfd; /* fd number in fd.c's pool */
-
-#ifndef LET_OS_MANAGE_FILESIZE
- struct _MdfdVec *mdfd_chain; /* for large relations */
+ BlockNumber mdfd_segno; /* segment number, from 0 */
+#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */
+ struct _MdfdVec *mdfd_chain; /* next segment, or NULL */
#endif
} MdfdVec;
static MemoryContext MdCxt; /* context for all md.c allocations */
-/* routines declared here */
-static MdfdVec *mdopen(SMgrRelation reln);
+/*
+ * In some contexts (currently, standalone backends and the bgwriter process)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint. This hash
+ * table remembers the pending operations. We use a hash table not because
+ * we want to look up individual operations, but simply as a convenient way
+ * of eliminating duplicate requests.
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the bgwriter.)
+ *
+ * XXX for WIN32, may want to expand this to track pending deletes, too.
+ */
+typedef struct
+{
+ RelFileNode rnode; /* the targeted relation */
+ BlockNumber segno; /* which segment */
+} PendingOperationEntry;
+
+static HTAB *pendingOpsTable = NULL;
+
+
+/* local routines */
+static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
+static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
int oflags);
#endif
-static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
+ bool allowNotFound);
static BlockNumber _mdnblocks(File file, Size blcksz);
@@ -79,6 +112,31 @@ mdinit(void)
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
+ /*
+ * Create pending-operations hashtable if we need it. Currently,
+ * we need it if we are standalone (not under a postmaster) OR
+ * if we are a bootstrap-mode subprocess of a postmaster (that is,
+ * a startup or bgwriter process).
+ */
+ if (!IsUnderPostmaster || IsBootstrapProcessingMode())
+ {
+ HASHCTL hash_ctl;
+
+ MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(PendingOperationEntry);
+ hash_ctl.entrysize = sizeof(PendingOperationEntry);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = MdCxt;
+ pendingOpsTable = hash_create("Pending Ops Table",
+ 100L,
+ &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+ if (pendingOpsTable == NULL)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
return true;
}
@@ -130,6 +188,7 @@ mdcreate(SMgrRelation reln, bool isRedo)
reln->md_fd = _fdvec_alloc();
reln->md_fd->mdfd_vfd = fd;
+ reln->md_fd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
reln->md_fd->mdfd_chain = NULL;
#endif
@@ -217,7 +276,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum);
+ v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -252,6 +311,9 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
return false;
}
+ if (!register_dirty_segment(reln, v))
+ return false;
+
#ifndef LET_OS_MANAGE_FILESIZE
Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif
@@ -261,12 +323,14 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
/*
* mdopen() -- Open the specified relation. ereport's on failure.
+ * (If allowNotFound is true, returns NULL on ENOENT instead of ereport'ing.)
*
* Note we only open the first segment, when there are multiple segments.
*/
static MdfdVec *
-mdopen(SMgrRelation reln)
+mdopen(SMgrRelation reln, bool allowNotFound)
{
+ MdfdVec *mdfd;
char *path;
File fd;
@@ -292,6 +356,8 @@ mdopen(SMgrRelation reln)
if (fd < 0)
{
pfree(path);
+ if (allowNotFound && errno == ENOENT)
+ return NULL;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open relation %u/%u: %m",
@@ -302,15 +368,16 @@ mdopen(SMgrRelation reln)
pfree(path);
- reln->md_fd = _fdvec_alloc();
+ reln->md_fd = mdfd = _fdvec_alloc();
- reln->md_fd->mdfd_vfd = fd;
+ mdfd->mdfd_vfd = fd;
+ mdfd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
- reln->md_fd->mdfd_chain = NULL;
+ mdfd->mdfd_chain = NULL;
Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif
- return reln->md_fd;
+ return mdfd;
}
/*
@@ -361,7 +428,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum);
+ v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -403,7 +470,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
long seekpos;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum);
+ v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -418,6 +485,9 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
return false;
+ if (!register_dirty_segment(reln, v))
+ return false;
+
return true;
}
@@ -434,7 +504,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
BlockNumber
mdnblocks(SMgrRelation reln)
{
- MdfdVec *v = mdopen(reln);
+ MdfdVec *v = mdopen(reln, false);
#ifndef LET_OS_MANAGE_FILESIZE
BlockNumber nblocks;
@@ -516,7 +586,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
if (nblocks == curnblk)
return nblocks; /* no work */
- v = mdopen(reln);
+ v = mdopen(reln, false);
#ifndef LET_OS_MANAGE_FILESIZE
priorblocks = 0;
@@ -576,40 +646,154 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
}
/*
- * mdcommit() -- Commit a transaction.
+ * mdsync() -- Sync previous writes to stable storage.
+ *
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOpsTable.
*/
bool
-mdcommit(void)
+mdsync(void)
{
+ HASH_SEQ_STATUS hstat;
+ PendingOperationEntry *entry;
+
+ if (!pendingOpsTable)
+ return false;
+
/*
- * We don't actually have to do anything here...
+ * If we are in the bgwriter, the sync had better include all fsync
+ * requests that were queued by backends before the checkpoint REDO
+ * point was determined. We do slightly better than that by accepting
+ * all requests queued up to the point where we start fsync'ing.
*/
+ AbsorbFsyncRequests();
+
+ hash_seq_init(&hstat, pendingOpsTable);
+ while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ /*
+ * If fsync is off then we don't have to bother opening the file
+ * at all. (We delay checking until this point so that changing
+ * fsync on the fly behaves sensibly.)
+ */
+ if (enableFsync)
+ {
+ SMgrRelation reln;
+ MdfdVec *seg;
+
+ /*
+ * Find or create an smgr hash entry for this relation.
+ * This may seem a bit unclean -- md calling smgr? But it's
+ * really the best solution. It ensures that the open file
+ * reference isn't permanently leaked if we get an error here.
+ * (You may say "but an unreferenced SMgrRelation is still a
+ * leak!" Not really, because the only case in which a checkpoint
+ * is done by a process that isn't about to shut down is in the
+ * bgwriter, and it will periodically do smgrcloseall(). This
+ * fact justifies our not closing the reln in the success path
+ * either, which is a good thing since in non-bgwriter cases
+ * we couldn't safely do that.) Furthermore, in many cases
+ * the relation will have been dirtied through this same smgr
+ * relation, and so we can save a file open/close cycle.
+ */
+ reln = smgropen(entry->rnode);
+
+ /*
+ * It is possible that the relation has been dropped or truncated
+ * since the fsync request was entered. Therefore, we have to
+ * allow file-not-found errors. This applies both during
+ * _mdfd_getseg() and during FileSync, since fd.c might have
+ * closed the file behind our back.
+ */
+ seg = _mdfd_getseg(reln,
+ entry->segno * ((BlockNumber) RELSEG_SIZE),
+ true);
+ if (seg)
+ {
+ if (FileSync(seg->mdfd_vfd) < 0 &&
+ errno != ENOENT)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u: %m",
+ entry->segno,
+ entry->rnode.tblNode,
+ entry->rnode.relNode)));
+ return false;
+ }
+ }
+ }
+
+ /* Okay, delete this entry */
+ if (hash_search(pendingOpsTable, entry,
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOpsTable corrupted");
+ }
+
return true;
}
/*
- * mdabort() -- Abort a transaction.
+ * register_dirty_segment() -- Mark a relation segment as needing fsync
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * mdsync to process later. Otherwise, try to pass off the fsync request
+ * to the background writer process. If that fails, just do the fsync
+ * locally before returning (we expect this will not happen often enough
+ * to be a performance problem).
+ *
+ * A false result implies I/O failure during local fsync. errno will be
+ * valid for error reporting.
*/
-bool
-mdabort(void)
+static bool
+register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
- /*
- * We don't actually have to do anything here...
- */
+ if (pendingOpsTable)
+ {
+ PendingOperationEntry entry;
+
+ /* zero pad bytes too: the whole struct, padding included, is the hash key */
+ MemSet(&entry, 0, sizeof(entry));
+ entry.rnode = reln->smgr_rnode;
+ entry.segno = seg->mdfd_segno;
+
+ if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL)
+ return true;
+ /* out of memory: fall through to do it locally */
+ }
+ else
+ {
+ if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
+ return true;
+ }
+
+ if (FileSync(seg->mdfd_vfd) < 0)
+ return false;
return true;
}
/*
- * mdsync() -- Sync previous writes to stable storage.
+ * RememberFsyncRequest() -- callback from bgwriter side of fsync request
+ *
+ * We stuff the fsync request into the local hash table for execution
+ * during the bgwriter's next checkpoint.
*/
-bool
-mdsync(void)
+void
+RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
{
- sync();
- if (IsUnderPostmaster)
- pg_usleep(2000000L);
- sync();
- return true;
+ PendingOperationEntry entry;
+
+ Assert(pendingOpsTable);
+
+ /* zero pad bytes too: the whole struct, padding included, is the hash key */
+ MemSet(&entry, 0, sizeof(entry));
+ entry.rnode = rnode;
+ entry.segno = segno;
+
+ if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
}
/*
@@ -618,18 +802,11 @@ mdsync(void)
static MdfdVec *
_fdvec_alloc(void)
{
- MdfdVec *v;
-
- v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
- v->mdfd_vfd = -1;
-#ifndef LET_OS_MANAGE_FILESIZE
- v->mdfd_chain = NULL;
-#endif
-
- return v;
+ return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
}
#ifndef LET_OS_MANAGE_FILESIZE
+
/*
* Open the specified segment of the relation,
* and make a MdfdVec object for it. Returns NULL on failure.
@@ -642,11 +819,11 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
char *path,
*fullpath;
- /* be sure we have enough space for the '.segno', if any */
path = relpath(reln->smgr_rnode);
if (segno > 0)
{
+ /* be sure we have enough space for the '.segno' */
fullpath = (char *) palloc(strlen(path) + 12);
sprintf(fullpath, "%s.%u", path, segno);
pfree(path);
@@ -667,32 +844,36 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
/* fill the entry */
v->mdfd_vfd = fd;
+ v->mdfd_segno = segno;
v->mdfd_chain = NULL;
Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
/* all done */
return v;
}
-#endif
+
+#endif /* LET_OS_MANAGE_FILESIZE */
/*
* _mdfd_getseg() -- Find the segment of the relation holding the
- * specified block. ereport's on failure.
+ * specified block. ereport's on failure.
+ * (If allowNotFound is true, returns NULL on ENOENT instead of ereport'ing.)
*/
static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
+_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
{
- MdfdVec *v = mdopen(reln);
-
+ MdfdVec *v = mdopen(reln, allowNotFound);
#ifndef LET_OS_MANAGE_FILESIZE
- BlockNumber segno;
- BlockNumber i;
+ BlockNumber segstogo;
+ BlockNumber nextsegno;
- for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1;
- segno > 0;
- i++, segno--)
- {
+ if (!v)
+ return NULL; /* only possible if allowNotFound */
+ for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
+ segstogo > 0;
+ nextsegno++, segstogo--)
+ {
if (v->mdfd_chain == NULL)
{
/*
@@ -705,16 +886,21 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
* one new segment per call, so this restriction seems
* reasonable.
*/
- v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0);
-
+ v->mdfd_chain = _mdfd_openseg(reln,
+ nextsegno,
+ (segstogo == 1) ? O_CREAT : 0);
if (v->mdfd_chain == NULL)
+ {
+ if (allowNotFound && errno == ENOENT)
+ return NULL;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open segment %u of relation %u/%u (target block %u): %m",
- i,
+ nextsegno,
reln->smgr_rnode.tblNode,
reln->smgr_rnode.relNode,
blkno)));
+ }
}
v = v->mdfd_chain;
}