aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2025-07-07 21:03:16 -0400
committerAndres Freund <andres@anarazel.de>2025-07-07 22:57:07 -0400
commitf54af9f2679d5987b4680e742ac9bd585260e620 (patch)
treef53671d9d66b881087541054594405fe3c211bb3
parent55a780e9476a753354a6db887e92125c7886ca6d (diff)
downloadpostgresql-f54af9f2679d5987b4680e742ac9bd585260e620.tar.gz
postgresql-f54af9f2679d5987b4680e742ac9bd585260e620.zip
aio: Combine io_uring memory mappings, if supportedHEADmaster
By default io_uring creates a shared memory mapping for each io_uring instance, leading to a large number of memory mappings. Unfortunately a large number of memory mappings slows things down, backend exit is particularly affected. To address that, newer kernels (6.5) support using user-provided memory for the memory. By putting the relevant memory into shared memory we don't need any additional mappings. On a system with a new enough kernel and liburing, there is no discernible overhead when doing a pgbench -S -C anymore. Reported-by: MARK CALLAGHAN <mdcallag@gmail.com> Reviewed-by: "Burd, Greg" <greg@burd.me> Reviewed-by: Jim Nasby <jnasby@upgrade.com> Discussion: https://postgr.es/m/CAFbpF8OA44_UG+RYJcWH9WjF7E3GA6gka3gvH6nsrSnEe9H0NA@mail.gmail.com Backpatch-through: 18
-rwxr-xr-xconfigure17
-rw-r--r--configure.ac7
-rw-r--r--meson.build6
-rw-r--r--src/backend/storage/aio/method_io_uring.c210
-rw-r--r--src/include/pg_config.h.in3
-rw-r--r--src/tools/pgindent/typedefs.list1
6 files changed, 238 insertions, 6 deletions
diff --git a/configure b/configure
index 16ef5b58d1a..cfaf3757dd7 100755
--- a/configure
+++ b/configure
@@ -13309,6 +13309,23 @@ fi
fi
+if test "$with_liburing" = yes; then
+ _LIBS="$LIBS"
+ LIBS="$LIBURING_LIBS $LIBS"
+ for ac_func in io_uring_queue_init_mem
+do :
+ ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem"
+if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_IO_URING_QUEUE_INIT_MEM 1
+_ACEOF
+
+fi
+done
+
+ LIBS="$_LIBS"
+fi
+
if test "$with_lz4" = yes ; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5
$as_echo_n "checking for LZ4_compress_default in -llz4... " >&6; }
diff --git a/configure.ac b/configure.ac
index b3efc49c97a..c2877e36935 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then
AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])])
fi
+if test "$with_liburing" = yes; then
+ _LIBS="$LIBS"
+ LIBS="$LIBURING_LIBS $LIBS"
+ AC_CHECK_FUNCS([io_uring_queue_init_mem])
+ LIBS="$_LIBS"
+fi
+
if test "$with_lz4" = yes ; then
AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])])
fi
diff --git a/meson.build b/meson.build
index f5b937de60c..5365aaf95e6 100644
--- a/meson.build
+++ b/meson.build
@@ -995,6 +995,12 @@ liburingopt = get_option('liburing')
liburing = dependency('liburing', required: liburingopt)
if liburing.found()
cdata.set('USE_LIBURING', 1)
+
+ if cc.has_function('io_uring_queue_init_mem',
+ dependencies: liburing, args: test_c_args)
+ cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM', 1)
+ endif
+
endif
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index b78048328e1..0a8c054162f 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -29,6 +29,9 @@
#ifdef IOMETHOD_IO_URING_ENABLED
+#include <sys/mman.h>
+#include <unistd.h>
+
#include <liburing.h>
#include "miscadmin.h"
@@ -94,12 +97,32 @@ PgAioUringContext
struct io_uring io_uring_ring;
} PgAioUringContext;
+/*
+ * Information about the capabilities that io_uring has.
+ *
+ * Depending on liburing and kernel version different features are
+ * supported. At least for the kernel a kernel version check does not suffice
+ * as various vendors do backport features to older kernels :(.
+ */
+typedef struct PgAioUringCaps
+{
+ bool checked;
+ /* -1 if io_uring_queue_init_mem() is unsupported */
+ int mem_init_size;
+} PgAioUringCaps;
+
+
/* PgAioUringContexts for all backends */
static PgAioUringContext *pgaio_uring_contexts;
/* the current backend's context */
static PgAioUringContext *pgaio_my_uring_context;
+static PgAioUringCaps pgaio_uring_caps =
+{
+ .checked = false,
+ .mem_init_size = -1,
+};
static uint32
pgaio_uring_procs(void)
@@ -111,16 +134,145 @@ pgaio_uring_procs(void)
return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
}
-static Size
+/*
+ * Initializes pgaio_uring_caps, unless that's already done.
+ */
+static void
+pgaio_uring_check_capabilities(void)
+{
+ if (pgaio_uring_caps.checked)
+ return;
+
+ /*
+ * By default io_uring creates a shared memory mapping for each io_uring
+ * instance, leading to a large number of memory mappings. Unfortunately a
+ * large number of memory mappings slows things down, backend exit is
+ * particularly affected. To address that, newer kernels (6.5) support
+ * using user-provided memory for the memory, by putting the relevant
+ * memory into shared memory we don't need any additional mappings.
+ *
+ * To know whether this is supported, we unfortunately need to probe the
+ * kernel by trying to create a ring with userspace-provided memory. This
+ * also has a secondary benefit: We can determine precisely how much
+ * memory we need for each io_uring instance.
+ */
+#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
+ {
+ struct io_uring test_ring;
+ size_t ring_size;
+ void *ring_ptr;
+ struct io_uring_params p = {0};
+ int ret;
+
+ /*
+ * Liburing does not yet provide an API to query how much memory a
+ * ring will need. So we over-estimate it here. As the memory is freed
+ * just below that's small temporary waste of memory.
+ *
+ * 1MB is more than enough for rings within io_max_concurrency's
+ * range.
+ */
+ ring_size = 1024 * 1024;
+
+ /*
+ * Hard to believe a system exists where 1MB would not be a multiple
+ * of the page size. But it's cheap to ensure...
+ */
+ ring_size -= ring_size % sysconf(_SC_PAGESIZE);
+
+ ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ if (ring_ptr == MAP_FAILED)
+ elog(ERROR,
+ "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
+ ring_size);
+
+ ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size);
+ if (ret > 0)
+ {
+ pgaio_uring_caps.mem_init_size = ret;
+
+ elog(DEBUG1,
+ "can use combined memory mapping for io_uring, each ring needs %d bytes",
+ ret);
+
+ /* clean up the created ring, it was just for a test */
+ io_uring_queue_exit(&test_ring);
+ }
+ else
+ {
+ /*
+ * There are different reasons for ring creation to fail, but it's
+ * ok to treat that just as io_uring_queue_init_mem() not being
+ * supported. We'll report a more detailed error in
+ * pgaio_uring_shmem_init().
+ */
+ errno = -ret;
+ elog(DEBUG1,
+ "cannot use combined memory mapping for io_uring, ring creation failed: %m");
+
+ }
+
+ if (munmap(ring_ptr, ring_size) != 0)
+ elog(ERROR, "munmap() failed: %m");
+ }
+#else
+ {
+ elog(DEBUG1,
+ "can't use combined memory mapping for io_uring, kernel or liburing too old");
+ }
+#endif
+
+ pgaio_uring_caps.checked = true;
+}
+
+/*
+ * Memory for all PgAioUringContext instances
+ */
+static size_t
pgaio_uring_context_shmem_size(void)
{
return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
}
+/*
+ * Memory for the combined memory used by io_uring instances. Returns 0 if
+ * that is not supported by kernel/liburing.
+ */
+static size_t
+pgaio_uring_ring_shmem_size(void)
+{
+ size_t sz = 0;
+
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ /*
+ * Memory for rings needs to be allocated to the page boundary,
+ * reserve space. Luckily it does not need to be aligned to hugepage
+ * boundaries, even if huge pages are used.
+ */
+ sz = add_size(sz, sysconf(_SC_PAGESIZE));
+ sz = add_size(sz, mul_size(pgaio_uring_procs(),
+ pgaio_uring_caps.mem_init_size));
+ }
+
+ return sz;
+}
+
static size_t
pgaio_uring_shmem_size(void)
{
- return pgaio_uring_context_shmem_size();
+ size_t sz;
+
+ /*
+ * Kernel and liburing support for various features influences how much
+ * shmem we need, perform the necessary checks.
+ */
+ pgaio_uring_check_capabilities();
+
+ sz = pgaio_uring_context_shmem_size();
+ sz = add_size(sz, pgaio_uring_ring_shmem_size());
+
+ return sz;
}
static void
@@ -128,13 +280,38 @@ pgaio_uring_shmem_init(bool first_time)
{
int TotalProcs = pgaio_uring_procs();
bool found;
+ char *shmem;
+ size_t ring_mem_remain = 0;
+ char *ring_mem_next = 0;
- pgaio_uring_contexts = (PgAioUringContext *)
- ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found);
-
+ /*
+ * We allocate memory for all PgAioUringContext instances and, if
+ * supported, the memory required for each of the io_uring instances, in
+ * one ShmemInitStruct().
+ */
+ shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found);
if (found)
return;
+ pgaio_uring_contexts = (PgAioUringContext *) shmem;
+ shmem += pgaio_uring_context_shmem_size();
+
+ /* if supported, handle memory alignment / sizing for io_uring memory */
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ ring_mem_remain = pgaio_uring_ring_shmem_size();
+ ring_mem_next = (char *) shmem;
+
+ /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
+ ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next);
+
+ /* account for alignment */
+ ring_mem_remain -= ring_mem_next - shmem;
+ shmem += ring_mem_next - shmem;
+
+ shmem += ring_mem_remain;
+ }
+
for (int contextno = 0; contextno < TotalProcs; contextno++)
{
PgAioUringContext *context = &pgaio_uring_contexts[contextno];
@@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
* be worth using that - also need to evaluate if that causes
* noticeable additional contention?
*/
- ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
+
+ /*
+ * If supported (c.f. pgaio_uring_check_capabilities()), create ring
+ * with its data in shared memory. Otherwise fall back io_uring
+ * creating a memory mapping for each ring.
+ */
+#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ struct io_uring_params p = {0};
+
+ ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);
+
+ ring_mem_remain -= ret;
+ ring_mem_next += ret;
+ }
+ else
+#endif
+ {
+ ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
+ }
+
if (ret < 0)
{
char *hint = NULL;
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 726a7c1be1f..c4dc5d72bdb 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -229,6 +229,9 @@
/* Define to 1 if you have the global variable 'int timezone'. */
#undef HAVE_INT_TIMEZONE
+/* Define to 1 if you have the `io_uring_queue_init_mem' function. */
+#undef HAVE_IO_URING_QUEUE_INIT_MEM
+
/* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
#undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 114bdafafdf..83192038571 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2181,6 +2181,7 @@ PgAioReturn
PgAioTargetData
PgAioTargetID
PgAioTargetInfo
+PgAioUringCaps
PgAioUringContext
PgAioWaitRef
PgArchData