qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v1 2/3] io: Add zerocopy and errqueue


From: Leonardo Bras
Subject: [PATCH v1 2/3] io: Add zerocopy and errqueue
Date: Tue, 31 Aug 2021 08:02:38 -0300

MSG_ZEROCOPY is a feature that enables copy avoidance in TCP/UDP socket
send calls. It does so by avoiding copying user data into kernel buffers.

To make it work, three steps are needed:
1 - A setsockopt() system call, enabling SO_ZEROCOPY
2 - Passing down the MSG_ZEROCOPY flag for each send*() syscall
3 - Process the socket's error queue, dealing with any error

Zerocopy has it's costs, so it will only get improved performance if
the sending buffer is big (10KB, according to Linux docs).

The step 2 makes it possible to use the same socket to send data
using both zerocopy and the default copying approach, so the application
cat choose what is best for each packet.

To implement step 1, an optional set_zerocopy() interface was created
in QIOChannel, allowing each using code to enable or disable it.

Step 2 will be enabled by the using code at each qio_channel_write*()
that would benefit of zerocopy;

Step 3 is done with qio_channel_socket_errq_proc(), that runs after
SOCKET_ERRQ_THRESH (16k) iovs sent, dealing with any error found.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
---
 include/io/channel-socket.h |  2 +
 include/io/channel.h        | 29 ++++++++++++++
 io/channel-socket.c         | 76 +++++++++++++++++++++++++++++++++++++
 io/channel-tls.c            | 11 ++++++
 io/channel-websock.c        |  9 +++++
 io/channel.c                | 11 ++++++
 6 files changed, 138 insertions(+)

diff --git a/include/io/channel-socket.h b/include/io/channel-socket.h
index e747e63514..09dffe059f 100644
--- a/include/io/channel-socket.h
+++ b/include/io/channel-socket.h
@@ -47,6 +47,8 @@ struct QIOChannelSocket {
     socklen_t localAddrLen;
     struct sockaddr_storage remoteAddr;
     socklen_t remoteAddrLen;
+    size_t errq_pending;
+    bool zerocopy_enabled;
 };
 
 
diff --git a/include/io/channel.h b/include/io/channel.h
index dada9ebaaf..de10a78b10 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -137,6 +137,8 @@ struct QIOChannelClass {
                                   IOHandler *io_read,
                                   IOHandler *io_write,
                                   void *opaque);
+    void (*io_set_zerocopy)(QIOChannel *ioc,
+                            bool enabled);
 };
 
 /* General I/O handling functions */
@@ -570,6 +572,33 @@ int qio_channel_shutdown(QIOChannel *ioc,
 void qio_channel_set_delay(QIOChannel *ioc,
                            bool enabled);
 
+/**
+ * qio_channel_set_zerocopy:
+ * @ioc: the channel object
+ * @enabled: the new flag state
+ *
+ * Controls whether the underlying transport is
+ * permitted to use zerocopy to avoid copying the
+ * sending buffer in kernel. If @enabled is true, then the
+ * writes may avoid buffer copy in kernel. If @enabled
+ * is false, writes will cause the kernel to always
+ * copy the buffer contents before sending.
+ *
+ * In order to use make a write with zerocopy feature,
+ * it's also necessary to sent each packet with
+ * MSG_ZEROCOPY flag. With this, it's possible to
+ * to select only writes that would benefit from the
+ * use of zerocopy feature, i.e. the ones with larger
+ * buffers.
+ *
+ * This feature was added in Linux 4.14, so older
+ * versions will fail on enabling. This is not an
+ * issue, since it will fall-back to default copying
+ * approach.
+ */
+void qio_channel_set_zerocopy(QIOChannel *ioc,
+                              bool enabled);
+
 /**
  * qio_channel_set_cork:
  * @ioc: the channel object
diff --git a/io/channel-socket.c b/io/channel-socket.c
index e377e7303d..a69fec7315 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -26,8 +26,10 @@
 #include "io/channel-watch.h"
 #include "trace.h"
 #include "qapi/clone-visitor.h"
+#include <linux/errqueue.h>
 
 #define SOCKET_MAX_FDS 16
+#define SOCKET_ERRQ_THRESH 16384
 
 SocketAddress *
 qio_channel_socket_get_local_address(QIOChannelSocket *ioc,
@@ -55,6 +57,8 @@ qio_channel_socket_new(void)
 
     sioc = QIO_CHANNEL_SOCKET(object_new(TYPE_QIO_CHANNEL_SOCKET));
     sioc->fd = -1;
+    sioc->zerocopy_enabled = false;
+    sioc->errq_pending = 0;
 
     ioc = QIO_CHANNEL(sioc);
     qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN);
@@ -520,6 +524,54 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
     return ret;
 }
 
+static void qio_channel_socket_errq_proc(QIOChannelSocket *sioc,
+                                         Error **errp)
+{
+    int fd = sioc->fd;
+    int ret;
+    struct msghdr msg = {};
+    struct sock_extended_err *serr;
+    struct cmsghdr *cm;
+
+    do {
+        ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+        if (ret <= 0) {
+            if (ret == 0 || errno == EAGAIN) {
+                /* Nothing on errqueue */
+                 sioc->errq_pending = 0;
+                 break;
+            }
+            if (errno == EINTR) {
+                continue;
+            }
+
+            error_setg_errno(errp, errno,
+                             "Unable to read errqueue");
+            break;
+        }
+
+        cm = CMSG_FIRSTHDR(&msg);
+        if (cm->cmsg_level != SOL_IP &&
+            cm->cmsg_type != IP_RECVERR) {
+            error_setg_errno(errp, EPROTOTYPE,
+                             "Wrong cmsg in errqueue");
+            break;
+        }
+
+        serr = (void *) CMSG_DATA(cm);
+        if (serr->ee_errno != 0) {
+            error_setg_errno(errp, serr->ee_errno,
+                             "Error on socket");
+            break;
+        }
+        if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+            error_setg_errno(errp, serr->ee_origin,
+                             "Error not from zerocopy");
+            break;
+        }
+    } while (true);
+}
+
 static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
                                          const struct iovec *iov,
                                          size_t niov,
@@ -571,6 +623,14 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
                          "Unable to write to socket");
         return -1;
     }
+
+    if ((flags & MSG_ZEROCOPY) && sioc->zerocopy_enabled) {
+        sioc->errq_pending += niov;
+        if (sioc->errq_pending > SOCKET_ERRQ_THRESH) {
+            qio_channel_socket_errq_proc(sioc, errp);
+        }
+    }
+
     return ret;
 }
 #else /* WIN32 */
@@ -689,6 +749,21 @@ qio_channel_socket_set_delay(QIOChannel *ioc,
 }
 
 
+static void
+qio_channel_socket_set_zerocopy(QIOChannel *ioc,
+                                bool enabled)
+{
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    int v = enabled ? 1 : 0;
+    int ret;
+
+    ret = qemu_setsockopt(sioc->fd, SOL_SOCKET, SO_ZEROCOPY, &v, sizeof(v));
+    if (ret >= 0) {
+        sioc->zerocopy_enabled = true;
+    }
+}
+
+
 static void
 qio_channel_socket_set_cork(QIOChannel *ioc,
                             bool enabled)
@@ -789,6 +864,7 @@ static void qio_channel_socket_class_init(ObjectClass 
*klass,
     ioc_klass->io_set_delay = qio_channel_socket_set_delay;
     ioc_klass->io_create_watch = qio_channel_socket_create_watch;
     ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
+    ioc_klass->io_set_zerocopy = qio_channel_socket_set_zerocopy;
 }
 
 static const TypeInfo qio_channel_socket_info = {
diff --git a/io/channel-tls.c b/io/channel-tls.c
index 4ce890a538..bf44b0f7b0 100644
--- a/io/channel-tls.c
+++ b/io/channel-tls.c
@@ -350,6 +350,16 @@ static void qio_channel_tls_set_delay(QIOChannel *ioc,
     qio_channel_set_delay(tioc->master, enabled);
 }
 
+
+static void qio_channel_tls_set_zerocopy(QIOChannel *ioc,
+                                         bool enabled)
+{
+    QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
+
+    qio_channel_set_zerocopy(tioc->master, enabled);
+}
+
+
 static void qio_channel_tls_set_cork(QIOChannel *ioc,
                                      bool enabled)
 {
@@ -416,6 +426,7 @@ static void qio_channel_tls_class_init(ObjectClass *klass,
     ioc_klass->io_shutdown = qio_channel_tls_shutdown;
     ioc_klass->io_create_watch = qio_channel_tls_create_watch;
     ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler;
+    ioc_klass->io_set_zerocopy = qio_channel_tls_set_zerocopy;
 }
 
 static const TypeInfo qio_channel_tls_info = {
diff --git a/io/channel-websock.c b/io/channel-websock.c
index 035dd6075b..4e9491966b 100644
--- a/io/channel-websock.c
+++ b/io/channel-websock.c
@@ -1194,6 +1194,14 @@ static void qio_channel_websock_set_delay(QIOChannel 
*ioc,
     qio_channel_set_delay(tioc->master, enabled);
 }
 
+static void qio_channel_websock_set_zerocopy(QIOChannel *ioc,
+                                             bool enabled)
+{
+    QIOChannelWebsock *tioc = QIO_CHANNEL_WEBSOCK(ioc);
+
+    qio_channel_set_zerocopy(tioc->master, enabled);
+}
+
 static void qio_channel_websock_set_cork(QIOChannel *ioc,
                                          bool enabled)
 {
@@ -1318,6 +1326,7 @@ static void qio_channel_websock_class_init(ObjectClass 
*klass,
     ioc_klass->io_close = qio_channel_websock_close;
     ioc_klass->io_shutdown = qio_channel_websock_shutdown;
     ioc_klass->io_create_watch = qio_channel_websock_create_watch;
+    ioc_klass->io_set_zerocopy = qio_channel_websock_set_zerocopy;
 }
 
 static const TypeInfo qio_channel_websock_info = {
diff --git a/io/channel.c b/io/channel.c
index ee3cb83d4d..476440e8b2 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -450,6 +450,17 @@ void qio_channel_set_delay(QIOChannel *ioc,
 }
 
 
+void qio_channel_set_zerocopy(QIOChannel *ioc,
+                              bool enabled)
+{
+    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+    if (klass->io_set_zerocopy) {
+        klass->io_set_zerocopy(ioc, enabled);
+    }
+}
+
+
 void qio_channel_set_cork(QIOChannel *ioc,
                           bool enabled)
 {
-- 
2.33.0




reply via email to

[Prev in Thread] Current Thread [Next in Thread]