aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/backend/storage/file/fd.c 255
-rw-r--r-- src/backend/storage/file/reinit.c 2
-rw-r--r-- src/include/storage/fd.h 4
3 files changed, 211 insertions, 50 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index ccf4df15a07..d39b81ed2ae 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -272,7 +272,10 @@ static void walkdir(const char *path,
#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
-static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
+static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+
+static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
+static int fsync_parent_path(const char *fname, int elevel);
/*
@@ -377,54 +380,157 @@ pg_flush_data(int fd, off_t offset, off_t amount)
* indicate the OS just doesn't allow/require fsyncing directories.
*/
void
-fsync_fname(char *fname, bool isdir)
+fsync_fname(const char *fname, bool isdir)
+{
+ fsync_fname_ext(fname, isdir, false, ERROR);
+}
+
+/*
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
+ *
+ * This routine ensures that, after returning, the effect of renaming the
+ * file persists in case of a crash. A crash while this routine is running
+ * will leave you with either the pre-existing or the moved file in place of
+ * the new file; no mixed state or truncated files are possible.
+ *
+ * It does so by using fsync on the old filename and the possibly existing
+ * target filename before the rename, and the target file and directory after.
+ *
+ * Note that rename() cannot be used across arbitrary directories, as they
+ * might not be on the same filesystem. Therefore this routine does not
+ * support renaming across directories.
+ *
+ * Log errors with the caller specified severity.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_rename(const char *oldfile, const char *newfile, int elevel)
{
int fd;
- int returncode;
/*
- * Some OSs require directories to be opened read-only whereas other
- * systems don't allow us to fsync files opened read-only; so we need both
- * cases here
+ * First fsync the old and target path (if it exists), to ensure that they
+ * are properly persistent on disk. Syncing the target file is not
+ * strictly necessary, but it makes it easier to reason about crashes;
+ * because it's then guaranteed that either source or target file exists
+ * after a crash.
*/
- if (!isdir)
- fd = BasicOpenFile(fname,
- O_RDWR | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
+ return -1;
+
+ fd = BasicOpenFile((char *) newfile, PG_BINARY | O_RDWR, 0);
+ if (fd < 0)
+ {
+ if (errno != ENOENT)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", newfile)));
+ return -1;
+ }
+ }
else
- fd = BasicOpenFile(fname,
- O_RDONLY | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ {
+ if (pg_fsync(fd) != 0)
+ {
+ int save_errno;
+
+ save_errno = errno;
+ close(fd);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", newfile)));
+ return -1;
+ }
+ close(fd);
+ }
+
+ /* Time to do the real deal... */
+ if (rename(oldfile, newfile) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ oldfile, newfile)));
+ return -1;
+ }
/*
- * Some OSs don't allow us to open directories at all (Windows returns
- * EACCES)
+ * To guarantee renaming the file is persistent, fsync the file with its
+ * new name, and its containing directory.
*/
- if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
- return;
+ if (fsync_fname_ext(newfile, false, false, elevel) != 0)
+ return -1;
- else if (fd < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open file \"%s\": %m", fname)));
+ if (fsync_parent_path(newfile, elevel) != 0)
+ return -1;
- returncode = pg_fsync(fd);
+ return 0;
+}
+
+/*
+ * durable_link_or_rename -- rename a file in a durable manner.
+ *
+ * Similar to durable_rename(), except that this routine tries (but does not
+ * guarantee) not to overwrite the target file.
+ *
+ * Note that a crash at an unfortunate moment can leave you with two links to
+ * the target file.
+ *
+ * Log errors with the caller specified severity.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
+{
+ /*
+ * Ensure that, if we crash directly after the rename/link, a file with
+ * valid contents is moved into place.
+ */
+ if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
+ return -1;
- /* Some OSs don't allow us to fsync directories at all */
- if (returncode != 0 && isdir && errno == EBADF)
+#if HAVE_WORKING_LINK
+ if (link(oldfile, newfile) < 0)
{
- close(fd);
- return;
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not link file \"%s\" to \"%s\": %m",
+ oldfile, newfile)));
+ return -1;
}
-
- if (returncode != 0)
- ereport(ERROR,
+ unlink(oldfile);
+#else
+ /* XXX: Add racy file existence check? */
+ if (rename(oldfile, newfile) < 0)
+ {
+ ereport(elevel,
(errcode_for_file_access(),
- errmsg("could not fsync file \"%s\": %m", fname)));
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ oldfile, newfile)));
+ return -1;
+ }
+#endif
- close(fd);
-}
+ /*
+ * Make the change persistent in case of an OS crash; both the new entry and
+ * its parent directory need to be flushed.
+ */
+ if (fsync_fname_ext(newfile, false, false, elevel) != 0)
+ return -1;
+
+ /* Same for parent directory */
+ if (fsync_parent_path(newfile, elevel) != 0)
+ return -1;
+ return 0;
+}
/*
* InitFileAccess --- initialize this module during backend startup
@@ -2317,10 +2423,10 @@ SyncDataDirectory(void)
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
* so we don't worry about optimizing it.
*/
- walkdir(".", fsync_fname_ext, false, LOG);
+ walkdir(".", datadir_fsync_fname, false, LOG);
if (xlog_is_symlink)
- walkdir("pg_xlog", fsync_fname_ext, false, LOG);
- walkdir("pg_tblspc", fsync_fname_ext, true, LOG);
+ walkdir("pg_xlog", datadir_fsync_fname, false, LOG);
+ walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
}
/*
@@ -2434,15 +2540,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
#endif /* PG_FLUSH_DATA_WORKS */
+static void
+datadir_fsync_fname(const char *fname, bool isdir, int elevel)
+{
+ /*
+ * We want to silently ignore errors about unreadable files. Pass that
+ * desire on to fsync_fname_ext().
+ */
+ fsync_fname_ext(fname, isdir, true, elevel);
+}
+
/*
* fsync_fname_ext -- Try to fsync a file or directory
*
- * Ignores errors trying to open unreadable files, or trying to fsync
- * directories on systems where that isn't allowed/required, and logs other
- * errors at a caller-specified level.
+ * If ignore_perm is true, ignore errors upon trying to open unreadable
+ * files. Logs other errors at a caller-specified level.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise.
*/
-static void
-fsync_fname_ext(const char *fname, bool isdir, int elevel)
+static int
+fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
{
int fd;
int flags;
@@ -2460,20 +2577,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
else
flags |= O_RDONLY;
+ fd = BasicOpenFile((char *) fname, flags, 0);
+
/*
- * Open the file, silently ignoring errors about unreadable files (or
- * unsupported operations, e.g. opening a directory under Windows), and
- * logging others.
+ * Some OSs don't allow us to open directories at all (Windows returns
+ * EACCES); just ignore the error in that case. If desired, also silently
+ * ignore errors about unreadable files. Log others.
*/
- fd = BasicOpenFile((char *) fname, flags, 0);
- if (fd < 0)
+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+ return 0;
+ else if (fd < 0 && ignore_perm && errno == EACCES)
+ return 0;
+ else if (fd < 0)
{
- if (errno == EACCES || (isdir && errno == EISDIR))
- return;
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", fname)));
- return;
+ return -1;
}
returncode = pg_fsync(fd);
@@ -2483,9 +2603,48 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
* those errors. Anything else needs to be logged.
*/
if (returncode != 0 && !(isdir && errno == EBADF))
+ {
+ int save_errno;
+
+ save_errno = errno;
+ (void) close(fd);
+ errno = save_errno;
+
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", fname)));
+ return -1;
+ }
(void) close(fd);
+
+ return 0;
+}
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ */
+static int
+fsync_parent_path(const char *fname, int elevel)
+{
+ char parentpath[MAXPGPATH];
+
+ strlcpy(parentpath, fname, MAXPGPATH);
+ get_parent_directory(parentpath);
+
+ /*
+ * get_parent_directory() returns an empty string if the input argument is
+ * just a file name (see comments in path.c), so handle that as being the
+ * current directory.
+ */
+ if (strlen(parentpath) == 0)
+ strlcpy(parentpath, ".", MAXPGPATH);
+
+ if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
+ return -1;
+
+ return 0;
}
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
index 88b4364ea63..4eb179eb111 100644
--- a/src/backend/storage/file/reinit.c
+++ b/src/backend/storage/file/reinit.c
@@ -385,7 +385,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
FreeDir(dbspace_dir);
- fsync_fname((char *) dbspacedirname, true);
+ fsync_fname(dbspacedirname, true);
}
}
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index b9bffe3d10b..20cb7a2c254 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -99,7 +99,9 @@ extern int pg_fsync_no_writethrough(int fd);
extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount);
-extern void fsync_fname(char *fname, bool isdir);
+extern void fsync_fname(const char *fname, bool isdir);
+extern int durable_rename(const char *oldfile, const char *newfile, int loglevel);
+extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
extern void SyncDataDirectory(void);
/* Filename components for OpenTemporaryFile */