diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/storage/file/fd.c | 88 | ||||
-rw-r--r-- | src/backend/storage/smgr/md.c | 108 | ||||
-rw-r--r-- | src/backend/storage/smgr/smgr.c | 28 | ||||
-rw-r--r-- | src/include/storage/fd.h | 3 | ||||
-rw-r--r-- | src/include/storage/md.h | 2 | ||||
-rw-r--r-- | src/include/storage/smgr.h | 2 |
6 files changed, 231 insertions, 0 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 2ac365e97cc..a280a1e7be3 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -2206,6 +2206,94 @@ FileSync(File file, uint32 wait_event_info) return returnCode; } +/* + * Zero a region of the file. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. + */ +int +FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ + int returnCode; + ssize_t written; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset); + pgstat_report_wait_end(); + + if (written < 0) + return -1; + else if (written != amount) + { + /* if errno is unset, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + return -1; + } + + return 0; +} + +/* + * Try to reserve file space with posix_fallocate(). If posix_fallocate() is + * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP, + * use FileZero() instead. + * + * Note that at least glibc implements posix_fallocate() in userspace if not + * implemented by the filesystem. That's not the case for all environments + * though. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. 
+ */ +int +FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ +#ifdef HAVE_POSIX_FALLOCATE + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return -1; + + pgstat_report_wait_start(wait_event_info); + returnCode = posix_fallocate(VfdCache[file].fd, offset, amount); + pgstat_report_wait_end(); + + if (returnCode == 0) + return 0; + + /* for compatibility with %m printing etc */ + errno = returnCode; + + /* + * Return in cases of a "real" failure, if fallocate is not supported, + * fall through to the FileZero() backed implementation. + */ + if (returnCode != EINVAL && returnCode != EOPNOTSUPP) + return -1; +#endif + + return FileZero(file, offset, amount, wait_event_info); +} + off_t FileSize(File file) { diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 352958e1feb..1c2d1405f86 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -501,6 +501,114 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* + * mdzeroextend() -- Add new zeroed out blocks to the specified relation. + * + * Similar to mdextend(), except the relation can be extended by multiple + * blocks at once and the added blocks will be filled with zeroes. + */ +void +mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + Assert(nblocks > 0); + + /* This assert is too expensive to have on normally ... 
*/ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. + */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rlocator, forknum), + InvalidBlockNumber))); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + /* + * If available and useful, use posix_fallocate() (via FileFallocate()) + * to extend the relation. That's often more efficient than using + * write(), as it commonly won't cause the kernel to allocate page + * cache space for the extended pages. + * + * However, we don't use FileFallocate() for small extensions, as it + * defeats delayed allocation on some filesystems. Not clear where + * that decision should be made though? For now just use a cutoff of + * 8, anything between 4 and 8 worked OK in some local testing. 
+ */ + if (numblocks > 8) + { + int ret; + + ret = FileFallocate(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + if (ret != 0) + { + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\" with FileFallocate(): %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + } + else + { + int ret; + + /* + * Even if we don't want to use fallocate, we can still extend a + * bit more efficiently than writing each 8kB block individually. + * pg_pwrite_zeros() (via FileZero()) uses + * pg_pwritev_with_retry() to avoid multiple writes or needing a + * zeroed buffer for the whole length of the extension. + */ + ret = FileZero(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + + remblocks -= numblocks; + curblocknum += numblocks; + } +} + +/* * mdopenfork() -- Open one fork of the specified relation. * * Note we only open the first segment, when there are multiple segments. 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index dc466e54145..c37c246b77f 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -50,6 +50,8 @@ typedef struct f_smgr bool isRedo); void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, @@ -75,6 +77,7 @@ static const f_smgr smgrsw[] = { .smgr_exists = mdexists, .smgr_unlink = mdunlink, .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, .smgr_prefetch = mdprefetch, .smgr_read = mdread, .smgr_write = mdwrite, @@ -508,6 +511,31 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* + * smgrzeroextend() -- Add new zeroed out blocks to a file. + * + * Similar to smgrextend(), except the relation can be extended by + * multiple blocks at once and the added blocks will be filled with + * zeroes. + */ +void +smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, bool skipFsync) +{ + smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum, + nblocks, skipFsync); + + /* + * Normally we expect this to increase the fork size by nblocks, but if + * the cached value isn't as expected, just invalidate it so the next call + * asks the kernel. + */ + if (reln->smgr_cached_nblocks[forknum] == blocknum) + reln->smgr_cached_nblocks[forknum] = blocknum + nblocks; + else + reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; +} + +/* * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation. 
* * In recovery only, this can return false to indicate that a file diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index f85de97d083..daceafd4732 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -106,6 +106,9 @@ extern int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event extern int FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info); extern int FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); +extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info); +extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info); + extern off_t FileSize(File file); extern int FileTruncate(File file, off_t offset, uint32 wait_event_info); extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 8f32af9ef3d..941879ee6a8 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -28,6 +28,8 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum); extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo); extern void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 0935144f425..a9a179aabac 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -92,6 +92,8 @@ extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation 
*rels, int nrels, bool isRedo); extern void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void smgrread(SMgrRelation reln, ForkNumber forknum, |