[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH RFC 4/4] aio-posix: Use epoll in aio_poll
From: Fam Zheng
Subject: [Qemu-devel] [PATCH RFC 4/4] aio-posix: Use epoll in aio_poll
Date: Tue, 30 Jun 2015 21:19:45 +0800
This patch lets aio_poll use the epoll_wait(2) syscall instead of
qemu_poll_ns, if possible. It improves the scalability of iothreads
(for example, virtio-scsi-dataplane).
The epollfd is managed together with the GSource and ctx->aio_handlers,
by creating epoll_event instances for each watched aio fd and adding to
the epollfd with epoll_ctl.
The following table is a fio benchmark comparison on a single guest
block device, with different numbers of disks attached to the same
scsi bus (in MB/s):
=====================================================================
# of scsi-disks | master | epoll
| rd wr randrw | rd wr randrw
---------------------------------------------------------------------
1 | 103 96 49 | 105 99 49
4 | 92 96 48 | 103 98 49
8 | 96 94 46 | 101 97 50
16 | 91 91 45 | 101 95 48
32 | 84 83 40 | 95 95 48
64 | 75 73 35 | 91 90 44
128 | 54 53 26 | 79 80 39
256 | 41 39 19 | 63 62 30
=====================================================================
Signed-off-by: Fam Zheng <address@hidden>
---
aio-posix.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++--
include/block/aio.h | 3 ++
2 files changed, 117 insertions(+), 4 deletions(-)
diff --git a/aio-posix.c b/aio-posix.c
index 22406ce..111d7fb 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif
struct AioHandler
{
@@ -44,6 +47,12 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
void aio_context_setup(AioContext *ctx, Error **errp)
{
+#ifdef CONFIG_EPOLL
+ ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+ if (ctx->epollfd < 0) {
+ error_setg(errp, "Failed to create epoll fd: %s", strerror(errno));
+ }
+#endif
}
void aio_set_fd_handler_pri(AioContext *ctx,
@@ -54,6 +63,11 @@ void aio_set_fd_handler_pri(AioContext *ctx,
void *opaque)
{
AioHandler *node;
+#ifdef CONFIG_EPOLL
+ struct epoll_event event;
+ int r;
+ bool add = false;
+#endif
node = find_aio_handler(ctx, fd);
@@ -61,6 +75,10 @@ void aio_set_fd_handler_pri(AioContext *ctx,
if (!io_read && !io_write && !io_read_pri) {
if (node) {
g_source_remove_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, fd, &event);
+ assert(!r);
+#endif
/* If the lock is held, just mark the node as deleted */
if (ctx->walking_handlers) {
@@ -83,6 +101,9 @@ void aio_set_fd_handler_pri(AioContext *ctx,
QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
g_source_add_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+ add = true;
+#endif
}
/* Update handler with latest information */
node->io_read = io_read;
@@ -93,6 +114,13 @@ void aio_set_fd_handler_pri(AioContext *ctx,
node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
node->pfd.events |= (io_read_pri ? G_IO_PRI | G_IO_HUP | G_IO_ERR : 0);
+#ifdef CONFIG_EPOLL
+ event.data.ptr = node;
+ event.events = node->pfd.events;
+ r = epoll_ctl(ctx->epollfd, add ? EPOLL_CTL_ADD : EPOLL_CTL_MOD,
+ fd, &event);
+ assert(!r);
+#endif
}
aio_notify(ctx);
@@ -198,7 +226,80 @@ bool aio_dispatch(AioContext *ctx)
return progress;
}
-/* These thread-local variables are used only in a small part of aio_poll
+#ifdef CONFIG_EPOLL
+QEMU_BUILD_BUG_ON((int)G_IO_IN != EPOLLIN);
+QEMU_BUILD_BUG_ON((int)G_IO_OUT != EPOLLOUT);
+QEMU_BUILD_BUG_ON((int)G_IO_PRI != EPOLLPRI);
+QEMU_BUILD_BUG_ON((int)G_IO_ERR != EPOLLERR);
+QEMU_BUILD_BUG_ON((int)G_IO_HUP != EPOLLHUP);
+
+#define EPOLL_BATCH 128
+static bool aio_poll_epoll(AioContext *ctx, bool blocking)
+{
+ AioHandler *node;
+ bool was_dispatching;
+ int i, ret;
+ bool progress;
+ int64_t timeout;
+ struct epoll_event events[EPOLL_BATCH];
+
+ aio_context_acquire(ctx);
+ was_dispatching = ctx->dispatching;
+ progress = false;
+
+ /* aio_notify can avoid the expensive event_notifier_set if
+ * everything (file descriptors, bottom halves, timers) will
+ * be re-evaluated before the next blocking poll(). This is
+ * already true when aio_poll is called with blocking == false;
+ * if blocking == true, it is only true after poll() returns.
+ *
+ * If we're in a nested event loop, ctx->dispatching might be true.
+ * In that case we can restore it just before returning, but we
+ * have to clear it now.
+ */
+ aio_set_dispatching(ctx, !blocking);
+
+ ctx->walking_handlers++;
+
+ timeout = blocking ? aio_compute_timeout(ctx) : 0;
+
+ if (timeout > 0) {
+ timeout = DIV_ROUND_UP(timeout, 1000000);
+ }
+
+ /* wait until next event */
+ if (timeout) {
+ aio_context_release(ctx);
+ }
+ ret = epoll_wait(ctx->epollfd, events, EPOLL_BATCH, timeout);
+ if (timeout) {
+ aio_context_acquire(ctx);
+ }
+
+ /* if we have any readable fds, dispatch event */
+ if (ret > 0) {
+ for (i = 0; i < ret; i++) {
+ node = events[i].data.ptr;
+ node->pfd.revents = events[i].events;
+ }
+ }
+
+ ctx->walking_handlers--;
+
+ /* Run dispatch even if there were no readable fds to run timers */
+ aio_set_dispatching(ctx, true);
+ if (aio_dispatch(ctx)) {
+ progress = true;
+ }
+
+ aio_set_dispatching(ctx, was_dispatching);
+ aio_context_release(ctx);
+
+ return progress;
+}
+#else
+
+/* These thread-local variables are used only in a small part of aio_poll_posix
* around the call to the poll() system call. In particular they are not
* used while aio_poll is performing callbacks, which makes it much easier
* to think about reentrancy!
@@ -212,7 +313,6 @@ bool aio_dispatch(AioContext *ctx)
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
-static __thread Notifier pollfds_cleanup_notifier;
static void pollfds_cleanup(Notifier *n, void *unused)
{
@@ -221,7 +321,7 @@ static void pollfds_cleanup(Notifier *n, void *unused)
g_free(nodes);
nalloc = 0;
}
-
+static __thread Notifier pollfds_cleanup_notifier;
static void add_pollfd(AioHandler *node)
{
if (npfd == nalloc) {
@@ -244,7 +344,7 @@ static void add_pollfd(AioHandler *node)
npfd++;
}
-bool aio_poll(AioContext *ctx, bool blocking)
+bool aio_poll_posix(AioContext *ctx, bool blocking)
{
AioHandler *node;
bool was_dispatching;
@@ -311,3 +411,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
return progress;
}
+#endif
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+#ifdef CONFIG_EPOLL
+ return aio_poll_epoll(ctx, blocking);
+#else
+ return aio_poll_posix(ctx, blocking);
+#endif
+}
diff --git a/include/block/aio.h b/include/block/aio.h
index 5120583..9178ff2 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -87,6 +87,9 @@ struct AioContext {
/* TimerLists for calling timers - one per clock type */
QEMUTimerListGroup tlg;
+
+ /* epoll fd */
+ int epollfd;
};
/* Used internally to synchronize aio_poll against qemu_bh_schedule. */
--
2.4.3