diff options
Diffstat (limited to 'src/backend/storage/smgr/md.c')
-rw-r--r-- | src/backend/storage/smgr/md.c | 325 |
1 files changed, 231 insertions, 94 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index bbac3c87413..50e531d8277 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -28,6 +28,7 @@ #include "access/xlog.h" #include "access/xlogutils.h" #include "commands/tablespace.h" +#include "common/file_utils.h" #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" @@ -754,138 +755,274 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * mdread() -- Read the specified block from a relation. + * Convert an array of buffer addresses into an array of iovec objects, and + * return the number that were required. 'iov' must have enough space for up + * to 'nblocks' elements, but the number used may be less depending on + * merging. In the case of a run of fully contiguous buffers, a single iovec + * will be populated that can be handled as a plain non-vectored I/O. */ -void -mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void *buffer) +static int +buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks) { - off_t seekpos; - int nbytes; - MdfdVec *v; + struct iovec *iovp; + int iovcnt; - /* If this build supports direct I/O, the buffer must be I/O aligned. */ - if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) - Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + Assert(nblocks >= 1); - TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, - reln->smgr_rlocator.backend); - - v = _mdfd_getseg(reln, forknum, blocknum, false, - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + /* If this build supports direct I/O, buffers must be I/O aligned. 
*/ + for (int i = 0; i < nblocks; ++i) + { + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffers[i] == + TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i])); + } - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + /* Start the first iovec off with the first buffer. */ + iovp = &iov[0]; + iovp->iov_base = buffers[0]; + iovp->iov_len = BLCKSZ; + iovcnt = 1; - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + /* Try to merge the rest. */ + for (int i = 1; i < nblocks; ++i) + { + void *buffer = buffers[i]; - nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); + if (((char *) iovp->iov_base + iovp->iov_len) == buffer) + { + /* Contiguous with the last iovec. */ + iovp->iov_len += BLCKSZ; + } + else + { + /* Need a new iovec. */ + iovp++; + iovp->iov_base = buffer; + iovp->iov_len = BLCKSZ; + iovcnt++; + } + } - TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, - reln->smgr_rlocator.backend, - nbytes, - BLCKSZ); + return iovcnt; +} - if (nbytes != BLCKSZ) +/* + * mdreadv() -- Read the specified blocks from a relation. 
+ */ +void +mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks) +{ + while (nblocks > 0) { - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read block %u in file \"%s\": %m", - blocknum, FilePathName(v->mdfd_vfd)))); + struct iovec iov[PG_IOV_MAX]; + int iovcnt; + off_t seekpos; + int nbytes; + MdfdVec *v; + BlockNumber nblocks_this_segment; + size_t transferred_this_segment; + size_t size_this_segment; + + v = _mdfd_getseg(reln, forknum, blocknum, false, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + + nblocks_this_segment = + Min(nblocks, + RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE))); + nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov)); + + iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment); + size_this_segment = nblocks_this_segment * BLCKSZ; + transferred_this_segment = 0; /* - * Short read: we are at or past EOF, or we read a partial block at - * EOF. Normally this is an error; upper levels should never try to - * read a nonexistent block. However, if zero_damaged_pages is ON or - * we are InRecovery, we should instead return zeroes without - * complaining. This allows, for example, the case of trying to - * update a block that was later truncated away. + * Inner loop to continue after a short read. We'll keep going until + * we hit EOF rather than assuming that a short read means we hit the + * end. 
*/ - if (zero_damaged_pages || InRecovery) - MemSet(buffer, 0, BLCKSZ); - else - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", - blocknum, FilePathName(v->mdfd_vfd), - nbytes, BLCKSZ))); + for (;;) + { + TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber, + reln->smgr_rlocator.backend); + nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos, + WAIT_EVENT_DATA_FILE_READ); + TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber, + reln->smgr_rlocator.backend, + nbytes, + size_this_segment - transferred_this_segment); + +#ifdef SIMULATE_SHORT_READ + nbytes = Min(nbytes, 4096); +#endif + + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read blocks %u..%u in file \"%s\": %m", + blocknum, + blocknum + nblocks_this_segment - 1, + FilePathName(v->mdfd_vfd)))); + + if (nbytes == 0) + { + /* + * We are at or past EOF, or we read a partial block at EOF. + * Normally this is an error; upper levels should never try to + * read a nonexistent block. However, if zero_damaged_pages + * is ON or we are InRecovery, we should instead return zeroes + * without complaining. This allows, for example, the case of + * trying to update a block that was later truncated away. 
+ */ + if (zero_damaged_pages || InRecovery) + { + for (BlockNumber i = transferred_this_segment / BLCKSZ; + i < nblocks_this_segment; + ++i) + memset(buffers[i], 0, BLCKSZ); + break; + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes", + blocknum, + blocknum + nblocks_this_segment - 1, + FilePathName(v->mdfd_vfd), + transferred_this_segment, + size_this_segment))); + } + + /* One loop should usually be enough. */ + transferred_this_segment += nbytes; + Assert(transferred_this_segment <= size_this_segment); + if (transferred_this_segment == size_this_segment) + break; + + /* Adjust position and vectors after a short read. */ + seekpos += nbytes; + iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes); + } + + nblocks -= nblocks_this_segment; + buffers += nblocks_this_segment; + blocknum += nblocks_this_segment; } } /* - * mdwrite() -- Write the supplied block at the appropriate location. + * mdwritev() -- Write the supplied blocks at the appropriate location. * * This is to be used only for updating already-existing blocks of a * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). */ void -mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - const void *buffer, bool skipFsync) +mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync) { - off_t seekpos; - int nbytes; - MdfdVec *v; - - /* If this build supports direct I/O, the buffer must be I/O aligned. */ - if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) - Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); - /* This assert is too expensive to have on normally ... 
*/ #ifdef CHECK_WRITE_VS_EXTEND Assert(blocknum < mdnblocks(reln, forknum)); #endif - TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, - reln->smgr_rlocator.backend); + while (nblocks > 0) + { + struct iovec iov[PG_IOV_MAX]; + int iovcnt; + off_t seekpos; + int nbytes; + MdfdVec *v; + BlockNumber nblocks_this_segment; + size_t transferred_this_segment; + size_t size_this_segment; - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + nblocks_this_segment = + Min(nblocks, + RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE))); + nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov)); - TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, - reln->smgr_rlocator.backend, - nbytes, - BLCKSZ); + iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment); + size_this_segment = nblocks_this_segment * BLCKSZ; + transferred_this_segment = 0; - if (nbytes != BLCKSZ) - { - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write block %u in file \"%s\": %m", - blocknum, FilePathName(v->mdfd_vfd)))); - /* short write: complain appropriately */ - ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", - blocknum, - FilePathName(v->mdfd_vfd), - 
nbytes, BLCKSZ), - errhint("Check free disk space."))); - } + /* + * Inner loop to continue after a short write. If the reason is that + * we're out of disk space, a future attempt should get an ENOSPC + * error from the kernel. + */ + for (;;) + { + TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber, + reln->smgr_rlocator.backend); + nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos, + WAIT_EVENT_DATA_FILE_WRITE); + TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber, + reln->smgr_rlocator.backend, + nbytes, + size_this_segment - transferred_this_segment); + +#ifdef SIMULATE_SHORT_WRITE + nbytes = Min(nbytes, 4096); +#endif - if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + if (nbytes < 0) + { + bool enospc = errno == ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write blocks %u..%u in file \"%s\": %m", + blocknum, + blocknum + nblocks_this_segment - 1, + FilePathName(v->mdfd_vfd)), + enospc ? errhint("Check free disk space.") : 0)); + } + + /* One loop should usually be enough. */ + transferred_this_segment += nbytes; + Assert(transferred_this_segment <= size_this_segment); + if (transferred_this_segment == size_this_segment) + break; + + /* Adjust position and iovecs after a short write. */ + seekpos += nbytes; + iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + nblocks -= nblocks_this_segment; + buffers += nblocks_this_segment; + blocknum += nblocks_this_segment; + } } + /* * mdwriteback() -- Tell the kernel to write pages back to storage. * |