Diffstat (limited to 'src/unix/linux.c')
-rw-r--r--  src/unix/linux.c  560
1 file changed, 551 insertions(+), 9 deletions(-)
diff --git a/src/unix/linux.c b/src/unix/linux.c
index a30f1bfb..875dfcbf 100644
--- a/src/unix/linux.c
+++ b/src/unix/linux.c
@@ -27,6 +27,8 @@
#include "internal.h"
#include <inttypes.h>
+#include <stdatomic.h>
+#include <stddef.h> /* offsetof */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@@ -38,15 +40,29 @@
#include <net/if.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
+#include <sys/mman.h>
#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
+#include <sys/sysmacros.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
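+
+/* The io_uring syscall numbers postdate the unification of the syscall
+ * tables, so 425-427 should be the same on every architecture and plain
+ * fallback defines suffice.
+ */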
+#ifndef __NR_io_uring_setup
+# define __NR_io_uring_setup 425
+#endif
+
+#ifndef __NR_io_uring_enter
+# define __NR_io_uring_enter 426
+#endif
+
+#ifndef __NR_io_uring_register
+# define __NR_io_uring_register 427
+#endif
+
#ifndef __NR_copy_file_range
# if defined(__x86_64__)
# define __NR_copy_file_range 326
@@ -116,6 +132,119 @@
# include <netpacket/packet.h>
#endif /* HAVE_IFADDRS_H */
+enum {
+ UV__IORING_SETUP_SQPOLL = 2u,
+};
+
+enum {
+ UV__IORING_FEAT_SINGLE_MMAP = 1u,
+ UV__IORING_FEAT_NODROP = 2u,
+ UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */
+};
+
+enum {
+ UV__IORING_OP_READV = 1,
+ UV__IORING_OP_WRITEV = 2,
+ UV__IORING_OP_FSYNC = 3,
+ UV__IORING_OP_STATX = 21,
+};
+
+enum {
+ UV__IORING_ENTER_SQ_WAKEUP = 2u,
+};
+
+enum {
+ UV__IORING_SQ_NEED_WAKEUP = 1u,
+};
+
+struct uv__io_cqring_offsets {
+ uint32_t head;
+ uint32_t tail;
+ uint32_t ring_mask;
+ uint32_t ring_entries;
+ uint32_t overflow;
+ uint32_t cqes;
+ uint64_t reserved0;
+ uint64_t reserved1;
+};
+
+STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));
+
+struct uv__io_sqring_offsets {
+ uint32_t head;
+ uint32_t tail;
+ uint32_t ring_mask;
+ uint32_t ring_entries;
+ uint32_t flags;
+ uint32_t dropped;
+ uint32_t array;
+ uint32_t reserved0;
+ uint64_t reserved1;
+};
+
+STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));
+
+struct uv__io_uring_cqe {
+ uint64_t user_data;
+ int32_t res;
+ uint32_t flags;
+};
+
+STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
+
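+/* Layout must match the kernel's struct io_uring_sqe byte for byte; the
+ * sizeof/offsetof asserts below verify that.
+ */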
+struct uv__io_uring_sqe {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t ioprio;
+ int32_t fd;
+ union {
+ uint64_t off;
+ uint64_t addr2;
+ };
+ union {
+ uint64_t addr;
+ };
+ uint32_t len;
+ union {
+ uint32_t rw_flags;
+ uint32_t fsync_flags;
+ uint32_t statx_flags;
+ };
+ uint64_t user_data;
+ union {
+ uint16_t buf_index;
+ uint64_t pad[3];
+ };
+};
+
+STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
+STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
+STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
+STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
+STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
+STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
+STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
+STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
+STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
+STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
+STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
+
+struct uv__io_uring_params {
+ uint32_t sq_entries;
+ uint32_t cq_entries;
+ uint32_t flags;
+ uint32_t sq_thread_cpu;
+ uint32_t sq_thread_idle;
+ uint32_t features;
+ uint32_t reserved[4];
+ struct uv__io_sqring_offsets sq_off; /* 40 bytes */
+ struct uv__io_cqring_offsets cq_off; /* 40 bytes */
+};
+
+STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
+STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
+STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));
+
struct watcher_list {
RB_ENTRY(watcher_list) entry;
QUEUE watchers;
@@ -206,7 +335,61 @@ ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
}
+int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
+ return syscall(__NR_io_uring_setup, entries, params);
+}
+
+
+int uv__io_uring_enter(int fd,
+ unsigned to_submit,
+ unsigned min_complete,
+ unsigned flags) {
+ /* io_uring_enter used to take a sigset_t but it's unused
+ * in newer kernels unless IORING_ENTER_EXT_ARG is set,
+ * in which case it takes a struct io_uring_getevents_arg.
+ */
+ return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, 0, 0);
+}
+
+
+int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
+ return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
+}
+
+
+static int uv__use_io_uring(void) {
+ /* Ternary: unknown=0, yes=1, no=-1 */
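+ /* UV_USE_IO_URING=0 disables io_uring; unset or non-zero enables it. */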
+ static _Atomic int use_io_uring;
+ char* val;
+ int use;
+
+ use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
+
+ if (use == 0) {
+ val = getenv("UV_USE_IO_URING");
+ use = val == NULL || atoi(val) ? 1 : -1;
+ atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
+ }
+
+ return use > 0;
+}
+
+
int uv__platform_loop_init(uv_loop_t* loop) {
+ struct uv__io_uring_params params;
+ struct epoll_event e;
+ struct uv__iou* iou;
+ size_t cqlen;
+ size_t sqlen;
+ size_t maxlen;
+ size_t sqelen;
+ char* sq;
+ char* sqe;
+ int ringfd;
+
+ iou = &uv__get_internal_fields(loop)->iou;
+ iou->ringfd = -1;
+
loop->inotify_watchers = NULL;
loop->inotify_fd = -1;
loop->backend_fd = epoll_create1(O_CLOEXEC);
@@ -214,7 +397,101 @@ int uv__platform_loop_init(uv_loop_t* loop) {
if (loop->backend_fd == -1)
return UV__ERR(errno);
+ if (!uv__use_io_uring())
+ return 0;
+
+ sq = MAP_FAILED;
+ sqe = MAP_FAILED;
+
+ /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
+ * Mostly academic because we check for a v5.13 kernel afterwards anyway.
+ */
+ memset(&params, 0, sizeof(params));
+ params.flags = UV__IORING_SETUP_SQPOLL;
+ params.sq_thread_idle = 10; /* milliseconds */
+
+ /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
+ ringfd = uv__io_uring_setup(64, &params);
+ if (ringfd == -1)
+ return 0; /* Not an error, falls back to thread pool. */
+
+ /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
+ * actually detecting is whether IORING_OP_STATX works with SQPOLL.
+ */
+ if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
+ goto fail;
+
+ /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
+ if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
+ goto fail;
+
+ /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
+ if (!(params.features & UV__IORING_FEAT_NODROP))
+ goto fail;
+
+ sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
+ cqlen =
+ params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
+ maxlen = sqlen < cqlen ? cqlen : sqlen;
+ sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);
+
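+ /* With IORING_FEAT_SINGLE_MMAP the submission and completion rings share
+ * one mapping, so a single mmap() of max(sqlen, cqlen) at IORING_OFF_SQ_RING
+ * covers both; the CQ pointers below are derived from the same base address.
+ */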
+ sq = mmap(0,
+ maxlen,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ ringfd,
+ 0); /* IORING_OFF_SQ_RING */
+
+ sqe = mmap(0,
+ sqelen,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ ringfd,
+ 0x10000000ull); /* IORING_OFF_SQES */
+
+ if (sq == MAP_FAILED || sqe == MAP_FAILED)
+ goto fail;
+
+ iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
+ iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
+ iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
+ iou->sqarray = (uint32_t*) (sq + params.sq_off.array);
+ iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
+ iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
+ iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
+ iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
+ iou->sq = sq;
+ iou->cqe = sq + params.cq_off.cqes;
+ iou->sqe = sqe;
+ iou->sqlen = sqlen;
+ iou->cqlen = cqlen;
+ iou->maxlen = maxlen;
+ iou->sqelen = sqelen;
+ iou->ringfd = ringfd;
+ iou->in_flight = 0;
+
+ /* Only interested in completion events. To get notified when
+ * the kernel pulls items from the submission ring, add POLLOUT.
+ */
+ memset(&e, 0, sizeof(e));
+ e.events = POLLIN;
+ e.data.fd = ringfd;
+
+ if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, ringfd, &e))
+ goto fail;
+
return 0;
+
+fail:
+ if (sq != MAP_FAILED)
+ munmap(sq, maxlen);
+
+ if (sqe != MAP_FAILED)
+ munmap(sqe, sqelen);
+
+ uv__close(ringfd);
+
+ return 0; /* Not an error, falls back to thread pool. */
}
@@ -226,6 +503,8 @@ int uv__io_fork(uv_loop_t* loop) {
uv__close(loop->backend_fd);
loop->backend_fd = -1;
+
+ /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
uv__platform_loop_delete(loop);
err = uv__platform_loop_init(loop);
@@ -237,10 +516,22 @@ int uv__io_fork(uv_loop_t* loop) {
void uv__platform_loop_delete(uv_loop_t* loop) {
- if (loop->inotify_fd == -1) return;
- uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
- uv__close(loop->inotify_fd);
- loop->inotify_fd = -1;
+ struct uv__iou* iou;
+
+ iou = &uv__get_internal_fields(loop)->iou;
+
+ if (loop->inotify_fd != -1) {
+ uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
+ uv__close(loop->inotify_fd);
+ loop->inotify_fd = -1;
+ }
+
+ if (iou->ringfd != -1) {
+ munmap(iou->sq, iou->maxlen);
+ munmap(iou->sqe, iou->sqelen);
+ uv__close(iou->ringfd);
+ iou->ringfd = -1;
+ }
}
@@ -298,6 +589,242 @@ int uv__io_check_fd(uv_loop_t* loop, int fd) {
}
+/* Caller must initialize SQE and call uv__iou_submit(). */
+static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
+ uv_loop_t* loop,
+ uv_fs_t* req) {
+ struct uv__io_uring_sqe* sqe;
+ uint32_t head;
+ uint32_t slot;
+
+ if (iou->ringfd == -1)
+ return NULL;
+
+ head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
+ memory_order_acquire);
+ if (head == *iou->sqtail + 1)
+ return NULL; /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */
+
+ slot = *iou->sqtail & iou->sqmask;
+ iou->sqarray[slot] = slot; /* Identity mapping of index -> sqe. */
+
+ sqe = iou->sqe;
+ sqe = &sqe[slot];
+ memset(sqe, 0, sizeof(*sqe));
+ sqe->user_data = (uintptr_t) req;
+
+ /* Pacify uv_cancel(). */
+ req->work_req.loop = loop;
+ req->work_req.work = NULL;
+ req->work_req.done = NULL;
+ QUEUE_INIT(&req->work_req.wq);
+
+ uv__req_register(loop, req);
+ iou->in_flight++;
+
+ return sqe;
+}
+
+
+static void uv__iou_submit(struct uv__iou* iou) {
+ uint32_t flags;
+
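+ /* Publish the new tail with release semantics so the kernel-side SQPOLL
+ * thread sees a fully initialized SQE, then wake that thread if it has
+ * gone idle.
+ */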
+ atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
+ *iou->sqtail + 1,
+ memory_order_release);
+
+ flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
+ memory_order_acquire);
+
+ if (flags & UV__IORING_SQ_NEED_WAKEUP)
+ if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
+ perror("libuv: io_uring_enter"); /* Can't happen. */
+}
+
+
+int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
+ uv_fs_t* req,
+ uint32_t fsync_flags) {
+ struct uv__io_uring_sqe* sqe;
+ struct uv__iou* iou;
+
+ iou = &uv__get_internal_fields(loop)->iou;
+
+ sqe = uv__iou_get_sqe(iou, loop, req);
+ if (sqe == NULL)
+ return 0;
+
+ /* Little known fact: setting sqe->off and sqe->len turns
+ * it into an asynchronous sync_file_range() operation.
+ */
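+ /* Neither is set here, so this stays a plain fsync/fdatasync. */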
+ sqe->fd = req->file;
+ sqe->fsync_flags = fsync_flags;
+ sqe->opcode = UV__IORING_OP_FSYNC;
+
+ uv__iou_submit(iou);
+
+ return 1;
+}
+
+
+int uv__iou_fs_read_or_write(uv_loop_t* loop,
+ uv_fs_t* req,
+ int is_read) {
+ struct uv__io_uring_sqe* sqe;
+ struct uv__iou* iou;
+
+ iou = &uv__get_internal_fields(loop)->iou;
+
+ sqe = uv__iou_get_sqe(iou, loop, req);
+ if (sqe == NULL)
+ return 0;
+
+ sqe->addr = (uintptr_t) req->bufs;
+ sqe->fd = req->file;
+ sqe->len = req->nbufs;
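+ /* An offset of -1 tells the kernel to use (and advance) the current file
+ * position, like plain read(2)/write(2).
+ */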
+ sqe->off = req->off < 0 ? -1 : req->off;
+ sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
+
+ uv__iou_submit(iou);
+
+ return 1;
+}
+
+
+int uv__iou_fs_statx(uv_loop_t* loop,
+ uv_fs_t* req,
+ int is_fstat,
+ int is_lstat) {
+ struct uv__io_uring_sqe* sqe;
+ struct uv__statx* statxbuf;
+ struct uv__iou* iou;
+
+ statxbuf = uv__malloc(sizeof(*statxbuf));
+ if (statxbuf == NULL)
+ return 0;
+
+ iou = &uv__get_internal_fields(loop)->iou;
+
+ sqe = uv__iou_get_sqe(iou, loop, req);
+ if (sqe == NULL) {
+ uv__free(statxbuf);
+ return 0;
+ }
+
+ req->ptr = statxbuf;
+
+ sqe->addr = (uintptr_t) req->path;
+ sqe->addr2 = (uintptr_t) statxbuf;
+ sqe->fd = AT_FDCWD;
+ sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
+ sqe->opcode = UV__IORING_OP_STATX;
+
+ if (is_fstat) {
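+ /* An empty path plus AT_EMPTY_PATH makes statx(2) operate on req->file
+ * itself rather than on a pathname.
+ */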
+ sqe->addr = (uintptr_t) "";
+ sqe->fd = req->file;
+ sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
+ }
+
+ if (is_lstat)
+ sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
+
+ uv__iou_submit(iou);
+
+ return 1;
+}
+
+
+void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
+ buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
+ buf->st_mode = statxbuf->stx_mode;
+ buf->st_nlink = statxbuf->stx_nlink;
+ buf->st_uid = statxbuf->stx_uid;
+ buf->st_gid = statxbuf->stx_gid;
+ buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
+ buf->st_ino = statxbuf->stx_ino;
+ buf->st_size = statxbuf->stx_size;
+ buf->st_blksize = statxbuf->stx_blksize;
+ buf->st_blocks = statxbuf->stx_blocks;
+ buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
+ buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
+ buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
+ buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
+ buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
+ buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
+ buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
+ buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
+ buf->st_flags = 0;
+ buf->st_gen = 0;
+}
+
+
+static void uv__iou_fs_statx_post(uv_fs_t* req) {
+ struct uv__statx* statxbuf;
+ uv_stat_t* buf;
+
+ buf = &req->statbuf;
+ statxbuf = req->ptr;
+ req->ptr = NULL;
+
+ if (req->result == 0) {
+ uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
+ uv__statx_to_stat(statxbuf, buf);
+ req->ptr = buf;
+ }
+
+ uv__free(statxbuf);
+}
+
+
+static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
+ struct uv__io_uring_cqe* cqe;
+ struct uv__io_uring_cqe* e;
+ uv_fs_t* req;
+ uint32_t head;
+ uint32_t tail;
+ uint32_t mask;
+ uint32_t i;
+
+ head = *iou->cqhead;
+ tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
+ memory_order_acquire);
+ mask = iou->cqmask;
+ cqe = iou->cqe;
+
+ for (i = head; i != tail; i++) {
+ e = &cqe[i & mask];
+
+ req = (uv_fs_t*) (uintptr_t) e->user_data;
+ assert(req->type == UV_FS);
+
+ uv__req_unregister(loop, req);
+ iou->in_flight--;
+
+ /* io_uring stores error codes as negative numbers, same as libuv. */
+ req->result = e->res;
+
+ switch (req->fs_type) {
+ case UV_FS_FSTAT:
+ case UV_FS_LSTAT:
+ case UV_FS_STAT:
+ uv__iou_fs_statx_post(req);
+ break;
+ default: /* Squelch -Wswitch warnings. */
+ break;
+ }
+
+ uv__metrics_update_idle_time(loop);
+ req->cb(req);
+ }
+
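+ /* Store the new head with release semantics so the kernel can reuse the
+ * consumed CQE slots.
+ */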
+ atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
+ tail,
+ memory_order_release);
+
+ uv__metrics_inc_events(loop, 1);
+}
+
+
void uv__io_poll(uv_loop_t* loop, int timeout) {
/* A bug in kernels < 2.6.37 makes timeouts larger than ~30 minutes
* effectively infinite on 32 bits architectures. To avoid blocking
@@ -308,15 +835,18 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
* that being the largest value I have seen in the wild (and only once.)
*/
static const int max_safe_timeout = 1789569;
+ uv__loop_internal_fields_t* lfields;
struct epoll_event events[1024];
struct epoll_event* pe;
struct epoll_event e;
+ struct uv__iou* iou;
int real_timeout;
QUEUE* q;
uv__io_t* w;
sigset_t* sigmask;
sigset_t sigset;
uint64_t base;
+ int have_iou_events;
int have_signals;
int nevents;
int count;
@@ -327,10 +857,8 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
int user_timeout;
int reset_timeout;
- if (loop->nfds == 0) {
- assert(QUEUE_EMPTY(&loop->watcher_queue));
- return;
- }
+ lfields = uv__get_internal_fields(loop);
+ iou = &lfields->iou;
memset(&e, 0, sizeof(e));
@@ -381,7 +909,7 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
count = 48; /* Benchmarks suggest this gives the best throughput. */
real_timeout = timeout;
- if (uv__get_internal_fields(loop)->flags & UV_METRICS_IDLE_TIME) {
+ if (lfields->flags & UV_METRICS_IDLE_TIME) {
reset_timeout = 1;
user_timeout = timeout;
timeout = 0;
@@ -391,6 +919,10 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
}
for (;;) {
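+ /* Keep polling while io_uring requests are in flight, even with no active
+ * watchers; the ring fd is registered with epoll directly and is not
+ * counted in loop->nfds.
+ */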
+ if (loop->nfds == 0)
+ if (iou->in_flight == 0)
+ return;
+
/* Only need to set the provider_entry_time if timeout != 0. The function
* will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
*/
@@ -454,6 +986,7 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
goto update_timeout;
}
+ have_iou_events = 0;
have_signals = 0;
nevents = 0;
@@ -478,6 +1011,12 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
if (fd == -1)
continue;
+ if (fd == iou->ringfd) {
+ uv__poll_io_uring(loop, iou);
+ have_iou_events = 1;
+ continue;
+ }
+
assert(fd >= 0);
assert((unsigned) fd < loop->nwatchers);
@@ -549,6 +1088,9 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
loop->watchers[loop->nwatchers] = NULL;
loop->watchers[loop->nwatchers + 1] = NULL;
+ if (have_iou_events != 0)
+ return; /* Event loop should cycle now so don't poll again. */
+
if (have_signals != 0)
return; /* Event loop should cycle now so don't poll again. */