diff options
author | Jeff Davis <jdavis@postgresql.org> | 2024-02-12 10:36:18 -0800 |
---|---|---|
committer | Jeff Davis <jdavis@postgresql.org> | 2024-02-12 11:11:22 -0800 |
commit | 91f2cae7a4e664e9c0472b364c7db29d755ab151 (patch) | |
tree | a39e90a07d21b18bf85a6b83e954638631b4d9b5 /src/backend/access/transam/xlog.c | |
parent | 09eb633e1baa3b7cd7929f3cc77f9c46f63c20b1 (diff) | |
download | postgresql-91f2cae7a4e664e9c0472b364c7db29d755ab151.tar.gz postgresql-91f2cae7a4e664e9c0472b364c7db29d755ab151.zip |
Read WAL directly from WAL buffers.
If available, read directly from WAL buffers, avoiding the need to go
through the filesystem. Only for physical replication for now, but can
be expanded to other callers.
In preparation for replicating unflushed WAL data.
Author: Bharath Rupireddy
Discussion: https://postgr.es/m/CALj2ACXKKK%3DwbiG5_t6dGao5GoecMwRkhr7GjVBM_jg54%2BNa%3DQ%40mail.gmail.com
Reviewed-by: Andres Freund, Alvaro Herrera, Nathan Bossart, Dilip Kumar, Nitin Jadhav, Melih Mutlu, Kyotaro Horiguchi
Diffstat (limited to 'src/backend/access/transam/xlog.c')
-rw-r--r-- | src/backend/access/transam/xlog.c | 120 |
1 files changed, 120 insertions, 0 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 478377c4a23..4e14c242b15 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1706,6 +1706,126 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 }
 
 /*
+ * Read WAL data directly from WAL buffers, if available: copy up to 'count'
+ * bytes starting at 'startptr' into 'dstbuf'. Returns the number of bytes
+ * read successfully. No locks are taken.
+ *
+ * Fewer than 'count' bytes may be read if some of the requested WAL data has
+ * already been evicted from the WAL buffers, or if the caller requests data
+ * that is not yet available (reads stop at the local LogwrtResult.Write).
+ *
+ * Returns 0 if RecoveryInProgress() is true or if 'tli' differs from
+ * GetWALInsertionTimeLine(); 'tli' is only a convenient safety check so that
+ * callers do not read from WAL buffers on a historical timeline.
+ */
+Size
+WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
+				   TimeLineID tli)
+{
+	char	   *pdst = dstbuf;
+	XLogRecPtr	recptr = startptr;
+	XLogRecPtr	upto;
+	Size		nbytes;
+
+	if (RecoveryInProgress() || tli != GetWALInsertionTimeLine())
+		return 0;
+
+	Assert(!XLogRecPtrIsInvalid(startptr));
+
+	/*
+	 * Don't read past the available WAL data: clamp the end of the request to
+	 * the local copy of LogwrtResult.Write. Ordinarily that copy has been
+	 * updated by the caller when determining how far to read.
+	 *
+	 * NOTE(review): a stale copy only means "read less" while startptr <=
+	 * LogwrtResult.Write. If startptr is beyond it, 'upto - startptr' would
+	 * wrap (assuming Size is unsigned) -- confirm callers rule that out.
+	 * XXX: could extend to the insert pointer via WaitXLogInsertionsToFinish().
+	 */
+	upto = Min(startptr + count, LogwrtResult.Write);
+	nbytes = upto - startptr;
+
+	/*
+	 * Loop through the buffers without a lock. For each buffer, atomically
+	 * read and verify the end pointer, then copy the data out, and finally
+	 * re-read and re-verify the end pointer.
+	 *
+	 * Once a page is evicted, it never returns to the WAL buffers, so if the
+	 * end pointer matches the expected end pointer before and after we copy
+	 * the data, then the right page must have been present during the data
+	 * copy. Read barriers are necessary to ensure that the data copy actually
+	 * happens between the two verification steps.
+	 *
+	 * If either verification fails, we simply terminate the loop and return
+	 * with the data that had been already copied out successfully.
+	 */
+	while (nbytes > 0)
+	{
+		uint32		offset = recptr % XLOG_BLCKSZ;
+		int			idx = XLogRecPtrToBufIdx(recptr);
+		XLogRecPtr	expectedEndPtr;
+		XLogRecPtr	endptr;
+		const char *page;
+		const char *psrc;
+		Size		npagebytes;
+
+		/*
+		 * Calculate the end pointer we expect in the xlblocks array if the
+		 * correct page is present.
+		 */
+		expectedEndPtr = recptr + (XLOG_BLCKSZ - offset);
+
+		/*
+		 * First verification step: check that the correct page is present in
+		 * the WAL buffers.
+		 */
+		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
+		if (expectedEndPtr != endptr)
+			break;
+
+		/*
+		 * The correct page is present (or was at the time the endptr was
+		 * read; must re-verify later). Calculate pointer to source data and
+		 * determine how much data to read from this page.
+		 */
+		page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
+		psrc = page + offset;
+		npagebytes = Min(nbytes, XLOG_BLCKSZ - offset);
+
+		/*
+		 * Ensure that the data copy and the first verification step are not
+		 * reordered.
+		 */
+		pg_read_barrier();
+
+		/* data copy */
+		memcpy(pdst, psrc, npagebytes);
+
+		/*
+		 * Ensure that the data copy and the second verification step are not
+		 * reordered.
+		 */
+		pg_read_barrier();
+
+		/*
+		 * Second verification step: check that the page we read from wasn't
+		 * evicted while we were copying the data.
+		 */
+		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
+		if (expectedEndPtr != endptr)
+			break;
+
+		pdst += npagebytes;
+		recptr += npagebytes;
+		nbytes -= npagebytes;
+	}
+
+	Assert(pdst - dstbuf <= count);
+
+	return pdst - dstbuf;
+}
+
+/*
  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
  * is the position starting from the beginning of WAL, excluding all WAL
  * page headers. |