[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 2/5] libvduse: Add VDUSE (vDPA Device in Userspace) library
From: Stefan Hajnoczi
Subject: Re: [PATCH 2/5] libvduse: Add VDUSE (vDPA Device in Userspace) library
Date: Mon, 7 Feb 2022 14:00:39 +0000
On Tue, Jan 25, 2022 at 09:17:57PM +0800, Xie Yongji wrote:
> VDUSE [1] is a linux framework that makes it possible to implement
> software-emulated vDPA devices in userspace. This adds a library
> as a subproject to help implementing VDUSE backends in QEMU.
>
> [1] https://www.kernel.org/doc/html/latest/userspace-api/vduse.html
This library assumes that the program is allowed to access the control
device (/dev/vduse/control). Is that always the case or should the
library also support access to /dev/vduse/<name> only (maybe even with
file descriptor passing) so a privileged process can create/destroy
VDUSE devices?
I didn't review the vring code in detail.
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
> meson.build | 15 +
> meson_options.txt | 2 +
> scripts/meson-buildoptions.sh | 3 +
> subprojects/libvduse/include/atomic.h | 1 +
> subprojects/libvduse/libvduse.c | 1025 +++++++++++++++++++
> subprojects/libvduse/libvduse.h | 193 ++++
> subprojects/libvduse/meson.build | 10 +
> subprojects/libvduse/standard-headers/linux | 1 +
> 8 files changed, 1250 insertions(+)
> create mode 120000 subprojects/libvduse/include/atomic.h
> create mode 100644 subprojects/libvduse/libvduse.c
> create mode 100644 subprojects/libvduse/libvduse.h
> create mode 100644 subprojects/libvduse/meson.build
> create mode 120000 subprojects/libvduse/standard-headers/linux
>
> diff --git a/meson.build b/meson.build
> index 333c61deba..864fb50ade 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -1305,6 +1305,21 @@ if not get_option('fuse_lseek').disabled()
> endif
> endif
>
> +have_libvduse = (targetos == 'linux')
> +if get_option('libvduse').enabled()
> + if targetos != 'linux'
> + error('libvduse requires linux')
> + endif
> +elif get_option('libvduse').disabled()
> + have_libvduse = false
> +endif
> +
> +libvduse = not_found
> +if have_libvduse
> + libvduse_proj = subproject('libvduse')
> + libvduse = libvduse_proj.get_variable('libvduse_dep')
> +endif
> +
> # libbpf
> libbpf = dependency('libbpf', required: get_option('bpf'), method:
> 'pkg-config')
> if libbpf.found() and not cc.links('''
> diff --git a/meson_options.txt b/meson_options.txt
> index 921967eddb..16790d1814 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -195,6 +195,8 @@ option('virtfs', type: 'feature', value: 'auto',
> description: 'virtio-9p support')
> option('virtiofsd', type: 'feature', value: 'auto',
> description: 'build virtiofs daemon (virtiofsd)')
> +option('libvduse', type: 'feature', value: 'auto',
> + description: 'build VDUSE Library')
>
> option('capstone', type: 'combo', value: 'auto',
> choices: ['disabled', 'enabled', 'auto', 'system', 'internal'],
> diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
> index a4af02c527..af5c75d758 100644
> --- a/scripts/meson-buildoptions.sh
> +++ b/scripts/meson-buildoptions.sh
> @@ -58,6 +58,7 @@ meson_options_help() {
> printf "%s\n" ' libssh ssh block device support'
> printf "%s\n" ' libudev Use libudev to enumerate host devices'
> printf "%s\n" ' libusb libusb support for USB passthrough'
> + printf "%s\n" ' libvduse build VDUSE Library'
> printf "%s\n" ' libxml2 libxml2 support for Parallels image
> format'
> printf "%s\n" ' linux-aio Linux AIO support'
> printf "%s\n" ' linux-io-uring Linux io_uring support'
> @@ -188,6 +189,8 @@ _meson_option_parse() {
> --disable-libudev) printf "%s" -Dlibudev=disabled ;;
> --enable-libusb) printf "%s" -Dlibusb=enabled ;;
> --disable-libusb) printf "%s" -Dlibusb=disabled ;;
> + --enable-libvduse) printf "%s" -Dlibvduse=enabled ;;
> + --disable-libvduse) printf "%s" -Dlibvduse=disabled ;;
> --enable-libxml2) printf "%s" -Dlibxml2=enabled ;;
> --disable-libxml2) printf "%s" -Dlibxml2=disabled ;;
> --enable-linux-aio) printf "%s" -Dlinux_aio=enabled ;;
> diff --git a/subprojects/libvduse/include/atomic.h
> b/subprojects/libvduse/include/atomic.h
> new file mode 120000
> index 0000000000..8c2be64f7b
> --- /dev/null
> +++ b/subprojects/libvduse/include/atomic.h
> @@ -0,0 +1 @@
> +../../../include/qemu/atomic.h
> \ No newline at end of file
> diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c
> new file mode 100644
> index 0000000000..7671864bca
> --- /dev/null
> +++ b/subprojects/libvduse/libvduse.c
> @@ -0,0 +1,1025 @@
> +/*
> + * VDUSE (vDPA Device in Userspace) library
> + *
> + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights
> reserved.
> + * Portions of codes and concepts borrowed from libvhost-user.c, so:
> + * Copyright IBM, Corp. 2007
> + * Copyright (c) 2016 Red Hat, Inc.
> + *
> + * Author:
> + * Xie Yongji <xieyongji@bytedance.com>
> + * Anthony Liguori <aliguori@us.ibm.com>
> + * Marc-André Lureau <mlureau@redhat.com>
> + * Victor Kaplansky <victork@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later. See the COPYING file in the top-level directory.
> + */
> +
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <stdbool.h>
> +#include <stddef.h>
> +#include <errno.h>
> +#include <string.h>
> +#include <assert.h>
> +#include <endian.h>
> +#include <unistd.h>
> +#include <limits.h>
> +#include <fcntl.h>
> +
> +#include <sys/ioctl.h>
> +#include <sys/eventfd.h>
> +#include <sys/mman.h>
> +
> +#include "include/atomic.h"
> +#include "standard-headers/linux/vhost_types.h"
> +#include "standard-headers/linux/vduse.h"
> +#include "libvduse.h"
> +
> +#define VIRTQUEUE_MAX_SIZE 1024
> +#define VDUSE_VQ_ALIGN 4096
> +#define MAX_IOVA_REGIONS 256
> +
> +/* Round number down to multiple */
> +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> +
> +/* Round number up to multiple */
> +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> +
> +#ifndef unlikely
> +#define unlikely(x) __builtin_expect(!!(x), 0)
> +#endif
> +
> +typedef struct VduseRing {
> + unsigned int num;
> + uint64_t desc_addr;
> + uint64_t avail_addr;
> + uint64_t used_addr;
> + struct vring_desc *desc;
> + struct vring_avail *avail;
> + struct vring_used *used;
> +} VduseRing;
> +
> +struct VduseVirtq {
> + VduseRing vring;
> + uint16_t last_avail_idx;
> + uint16_t shadow_avail_idx;
> + uint16_t used_idx;
> + uint16_t signalled_used;
> + bool signalled_used_valid;
> + int index;
> + int inuse;
> + bool ready;
> + int fd;
> + VduseDev *dev;
> +};
> +
> +typedef struct VduseIovaRegion {
> + uint64_t iova;
> + uint64_t size;
> + uint64_t mmap_offset;
> + uint64_t mmap_addr;
> +} VduseIovaRegion;
> +
> +struct VduseDev {
> + VduseVirtq *vqs;
> + VduseIovaRegion regions[MAX_IOVA_REGIONS];
> + int num_regions;
> + char *name;
> + uint32_t device_id;
> + uint32_t vendor_id;
> + uint16_t num_queues;
> + uint16_t queue_size;
> + uint64_t features;
> + const VduseOps *ops;
> + int fd;
> + int ctrl_fd;
> + void *priv;
> +};
> +
> +static inline bool has_feature(uint64_t features, unsigned int fbit)
> +{
> + assert(fbit < 64);
> + return !!(features & (1ULL << fbit));
> +}
> +
> +static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
> +{
> + return has_feature(dev->features, fbit);
> +}
> +
> +VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
> +{
> + return vq->dev;
> +}
> +
> +int vduse_queue_get_fd(VduseVirtq *vq)
> +{
> + return vq->fd;
> +}
> +
> +void *vduse_dev_get_priv(VduseDev *dev)
> +{
> + return dev->priv;
> +}
> +
> +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
> +{
> + return &dev->vqs[index];
> +}
> +
> +int vduse_dev_get_fd(VduseDev *dev)
> +{
> + return dev->fd;
> +}
> +
> +static int vduse_inject_irq(VduseDev *dev, int index)
> +{
> + return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
> +}
> +
> +static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
> + uint64_t last)
> +{
> + int i;
> +
> + if (last == start) {
> + return;
> + }
> +
> + for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> + if (!dev->regions[i].mmap_addr) {
> + continue;
> + }
> +
> + if (start <= dev->regions[i].iova &&
> + last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
> + munmap((void *)dev->regions[i].mmap_addr,
> + dev->regions[i].mmap_offset + dev->regions[i].size);
> + dev->regions[i].mmap_addr = 0;
> + dev->num_regions--;
> + }
> + }
> +}
> +
> +static int vduse_iova_add_region(VduseDev *dev, int fd,
> + uint64_t offset, uint64_t start,
> + uint64_t last, int prot)
> +{
> + int i;
> + uint64_t size = last - start + 1;
> + void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
> +
> + if (mmap_addr == MAP_FAILED) {
> + return -EINVAL;
Missing close(fd). This function takes ownership of fd.
> + }
> +
> + for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> + if (!dev->regions[i].mmap_addr) {
> + dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
> + dev->regions[i].mmap_offset = offset;
> + dev->regions[i].iova = start;
> + dev->regions[i].size = size;
> + dev->num_regions++;
> + break;
> + }
> + }
> + close(fd);
assert(i < MAX_IOVA_REGIONS)? If we can really reach the end of the for
loop then we must remember to call munmap(2).
> +
> + return 0;
> +}
> +
> +static int perm_to_prot(uint8_t perm)
> +{
> + int prot = 0;
> +
> + switch (perm) {
> + case VDUSE_ACCESS_WO:
> + prot |= PROT_WRITE;
> + break;
> + case VDUSE_ACCESS_RO:
> + prot |= PROT_READ;
> + break;
> + case VDUSE_ACCESS_RW:
> + prot |= PROT_READ | PROT_WRITE;
> + break;
> + default:
> + break;
> + }
> +
> + return prot;
> +}
> +
> +static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
> +{
> + int i, ret;
> + struct vduse_iotlb_entry entry;
> +
> + for (i = 0; i < MAX_IOVA_REGIONS; i++) {
> + VduseIovaRegion *r = &dev->regions[i];
> +
> + if (!r->mmap_addr) {
> + continue;
> + }
> +
> + if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
> + if ((iova + *plen) > (r->iova + r->size)) {
> + *plen = r->iova + r->size - iova;
> + }
> + return (void *)(uintptr_t)(iova - r->iova +
> + r->mmap_addr + r->mmap_offset);
> + }
> + }
> +
> + entry.start = iova;
> + entry.last = iova + 1;
> + ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
> + if (ret < 0) {
> + return NULL;
> + }
> +
> + if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
> + entry.last, perm_to_prot(entry.perm))) {
> + return iova_to_va(dev, plen, iova);
> + }
> +
> + return NULL;
> +}
> +
> +static inline uint16_t vring_avail_flags(VduseVirtq *vq)
> +{
> + return le16toh(vq->vring.avail->flags);
I remember we discussed whether VDUSE should support Transitional
devices. VIRTIO 1.0+ uses little-endian but legacy VIRTIO uses
guest-endian, so le16toh() will not work for legacy VIRTIO vrings in a
cross-endian configuration (e.g. big-endian guest on little-endian
host).
If cross-endian isn't supported please add an error during
initialization so users get a clear error message.
> +}
> +
> +static inline uint16_t vring_avail_idx(VduseVirtq *vq)
> +{
> + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
> +
> + return vq->shadow_avail_idx;
> +}
> +
> +static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
> +{
> + return le16toh(vq->vring.avail->ring[i]);
> +}
> +
> +static inline uint16_t vring_get_used_event(VduseVirtq *vq)
> +{
> + return vring_avail_ring(vq, vq->vring.num);
> +}
> +
> +static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
> + unsigned int *head)
> +{
> + /*
> + * Grab the next descriptor number they're advertising, and increment
> + * the index we've seen.
> + */
> + *head = vring_avail_ring(vq, idx % vq->vring.num);
> +
> + /* If their number is silly, that's a fatal mistake. */
> + if (*head >= vq->vring.num) {
> + fprintf(stderr, "Guest says index %u is available\n", *head);
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static int
> +vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
> + uint64_t addr, size_t len)
> +{
> + struct vring_desc *ori_desc;
> + uint64_t read_len;
> +
> + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
> + return -1;
> + }
> +
> + if (len == 0) {
> + return -1;
> + }
> +
> + while (len) {
> + read_len = len;
> + ori_desc = iova_to_va(dev, &read_len, addr);
> + if (!ori_desc) {
> + return -1;
> + }
> +
> + memcpy(desc, ori_desc, read_len);
> + len -= read_len;
> + addr += read_len;
> + desc += read_len;
> + }
> +
> + return 0;
> +}
> +
> +enum {
> + VIRTQUEUE_READ_DESC_ERROR = -1,
> + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */
> + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */
> +};
> +
> +static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
> + unsigned int max, unsigned int *next)
> +{
> + /* If this descriptor says it doesn't chain, we're done. */
> + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
> + return VIRTQUEUE_READ_DESC_DONE;
> + }
> +
> + /* Check they're not leading us off end of descriptors. */
> + *next = desc[i].next;
> + /* Make sure compiler knows to grab that: we don't want it changing! */
> + smp_wmb();
> +
> + if (*next >= max) {
> + fprintf(stderr, "Desc next is %u\n", *next);
> + return VIRTQUEUE_READ_DESC_ERROR;
> + }
> +
> + return VIRTQUEUE_READ_DESC_MORE;
> +}
> +
> +/*
> + * Fetch avail_idx from VQ memory only when we really need to know if
> + * guest has added some buffers.
> + */
> +static bool vduse_queue_empty(VduseVirtq *vq)
> +{
> + if (unlikely(!vq->vring.avail)) {
> + return true;
> + }
> +
> + if (vq->shadow_avail_idx != vq->last_avail_idx) {
> + return false;
> + }
> +
> + return vring_avail_idx(vq) == vq->last_avail_idx;
> +}
> +
> +static bool vduse_queue_should_notify(VduseVirtq *vq)
> +{
> + VduseDev *dev = vq->dev;
> + uint16_t old, new;
> + bool v;
> +
> + /* We need to expose used array entries before checking used event. */
> + smp_mb();
> +
> + /* Always notify when queue is empty (when feature acknowledge) */
> + if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
> + !vq->inuse && vduse_queue_empty(vq)) {
> + return true;
> + }
> +
> + if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
> + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
> + }
> +
> + v = vq->signalled_used_valid;
> + vq->signalled_used_valid = true;
> + old = vq->signalled_used;
> + new = vq->signalled_used = vq->used_idx;
> + return !v || vring_need_event(vring_get_used_event(vq), new, old);
> +}
> +
> +void vduse_queue_notify(VduseVirtq *vq)
> +{
> + VduseDev *dev = vq->dev;
> +
> + if (unlikely(!vq->vring.avail)) {
> + return;
> + }
> +
> + if (!vduse_queue_should_notify(vq)) {
> + return;
> + }
> +
> + if (vduse_inject_irq(dev, vq->index) < 0) {
> + fprintf(stderr, "Error inject irq for vq %d: %s\n",
> + vq->index, strerror(errno));
> + }
> +}
> +
> +static inline void vring_used_flags_set_bit(VduseVirtq *vq, int mask)
> +{
> + uint16_t *flags;
> +
> + flags = (uint16_t *)((char*)vq->vring.used +
> + offsetof(struct vring_used, flags));
> + *flags = htole16(le16toh(*flags) | mask);
> +}
> +
> +static inline void vring_used_flags_unset_bit(VduseVirtq *vq, int mask)
> +{
> + uint16_t *flags;
> +
> + flags = (uint16_t *)((char*)vq->vring.used +
> + offsetof(struct vring_used, flags));
> + *flags = htole16(le16toh(*flags) & ~mask);
> +}
> +
> +static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
> +{
> + *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
> +}
> +
> +static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int
> *p_num_sg,
> + struct iovec *iov, unsigned int
> max_num_sg,
> + bool is_write, uint64_t pa, size_t sz)
> +{
> + unsigned num_sg = *p_num_sg;
> + VduseDev *dev = vq->dev;
> +
> + assert(num_sg <= max_num_sg);
> +
> + if (!sz) {
> + fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
> + return false;
> + }
> +
> + while (sz) {
> + uint64_t len = sz;
> +
> + if (num_sg == max_num_sg) {
> + fprintf(stderr,
> + "virtio: too many descriptors in indirect table\n");
> + return false;
> + }
> +
> + iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
> + if (iov[num_sg].iov_base == NULL) {
> + fprintf(stderr, "virtio: invalid address for buffers\n");
> + return false;
> + }
> + iov[num_sg++].iov_len = len;
> + sz -= len;
> + pa += len;
> + }
> +
> + *p_num_sg = num_sg;
> + return true;
> +}
> +
> +static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
> + unsigned in_num)
> +{
> + VduseVirtqElement *elem;
> + size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
> + size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
> + size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
> +
> + assert(sz >= sizeof(VduseVirtqElement));
> + elem = malloc(out_sg_end);
Missing malloc() NULL return value check.
> + elem->out_num = out_num;
> + elem->in_num = in_num;
> + elem->in_sg = (void *)elem + in_sg_ofs;
> + elem->out_sg = (void *)elem + out_sg_ofs;
> + return elem;
> +}
> +
> +static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t
> sz)
> +{
> + struct vring_desc *desc = vq->vring.desc;
> + VduseDev *dev = vq->dev;
> + uint64_t desc_addr, read_len;
> + unsigned int desc_len;
> + unsigned int max = vq->vring.num;
> + unsigned int i = idx;
> + VduseVirtqElement *elem;
> + struct iovec iov[VIRTQUEUE_MAX_SIZE];
> + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
> + unsigned int out_num = 0, in_num = 0;
> + int rc;
> +
> + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
> + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
> + fprintf(stderr, "Invalid size for indirect buffer table\n");
> + return NULL;
> + }
> +
> + /* loop over the indirect descriptor table */
> + desc_addr = le64toh(desc[i].addr);
> + desc_len = le32toh(desc[i].len);
> + max = desc_len / sizeof(struct vring_desc);
> + read_len = desc_len;
> + desc = iova_to_va(dev, &read_len, desc_addr);
> + if (unlikely(desc && read_len != desc_len)) {
> + /* Failed to use zero copy */
> + desc = NULL;
> + if (!vduse_queue_read_indirect_desc(dev, desc_buf,
> + desc_addr,
> + desc_len)) {
> + desc = desc_buf;
> + }
> + }
> + if (!desc) {
> + fprintf(stderr, "Invalid indirect buffer table\n");
> + return NULL;
> + }
> + i = 0;
> + }
> +
> + /* Collect all the descriptors */
> + do {
> + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
> + if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
> + VIRTQUEUE_MAX_SIZE - out_num,
> + true, le64toh(desc[i].addr),
> + le32toh(desc[i].len))) {
> + return NULL;
> + }
> + } else {
> + if (in_num) {
> + fprintf(stderr, "Incorrect order for descriptors\n");
> + return NULL;
> + }
> + if (!vduse_queue_map_single_desc(vq, &out_num, iov,
> + VIRTQUEUE_MAX_SIZE, false,
> + le64toh(desc[i].addr),
> + le32toh(desc[i].len))) {
> + return NULL;
> + }
> + }
> +
> + /* If we've got too many, that implies a descriptor loop. */
> + if ((in_num + out_num) > max) {
> + fprintf(stderr, "Looped descriptor\n");
> + return NULL;
> + }
> + rc = vduse_queue_read_next_desc(desc, i, max, &i);
> + } while (rc == VIRTQUEUE_READ_DESC_MORE);
> +
> + if (rc == VIRTQUEUE_READ_DESC_ERROR) {
> + fprintf(stderr, "read descriptor error\n");
> + return NULL;
> + }
> +
> + /* Now copy what we have collected and mapped */
> + elem = vduse_queue_alloc_element(sz, out_num, in_num);
> + elem->index = idx;
> + for (i = 0; i < out_num; i++) {
> + elem->out_sg[i] = iov[i];
> + }
> + for (i = 0; i < in_num; i++) {
> + elem->in_sg[i] = iov[out_num + i];
> + }
> +
> + return elem;
> +}
> +
> +void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
> +{
> + unsigned int head;
> + VduseVirtqElement *elem;
> + VduseDev *dev = vq->dev;
> +
> + if (unlikely(!vq->vring.avail)) {
> + return NULL;
> + }
> +
> + if (vduse_queue_empty(vq)) {
> + return NULL;
> + }
> + /* Needed after virtio_queue_empty() */
> + smp_rmb();
> +
> + if (vq->inuse >= vq->vring.num) {
> + fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
> + return NULL;
> + }
> +
> + if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
> + return NULL;
> + }
> +
> + if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
> + vring_set_avail_event(vq, vq->last_avail_idx);
> + }
> +
> + elem = vduse_queue_map_desc(vq, head, sz);
> +
> + if (!elem) {
> + return NULL;
> + }
> +
> + vq->inuse++;
> +
> + return elem;
> +}
> +
> +static inline void vring_used_write(VduseVirtq *vq,
> + struct vring_used_elem *uelem, int i)
> +{
> + struct vring_used *used = vq->vring.used;
> +
> + used->ring[i] = *uelem;
> +}
> +
> +static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
> + unsigned int len, unsigned int idx)
> +{
> + struct vring_used_elem uelem;
> +
> + if (unlikely(!vq->vring.used)) {
> + return;
> + }
> +
> + idx = (idx + vq->used_idx) % vq->vring.num;
> +
> + uelem.id = htole32(elem->index);
> + uelem.len = htole32(len);
> + vring_used_write(vq, &uelem, idx);
> +}
> +
> +static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
> +{
> + vq->vring.used->idx = htole16(val);
> + vq->used_idx = val;
> +}
> +
> +static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
> +{
> + uint16_t old, new;
> +
> + if (unlikely(!vq->vring.used)) {
> + return;
> + }
> +
> + /* Make sure buffer is written before we update index. */
> + smp_wmb();
> +
> + old = vq->used_idx;
> + new = old + count;
> + vring_used_idx_set(vq, new);
> + vq->inuse -= count;
> + if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new -
> old))) {
> + vq->signalled_used_valid = false;
> + }
> +}
> +
> +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
> + unsigned int len)
> +{
> + vduse_queue_fill(vq, elem, len, 0);
> + vduse_queue_flush(vq, 1);
> +}
> +
> +static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
> + uint64_t avail_addr, uint64_t used_addr)
> +{
> + struct VduseDev *dev = vq->dev;
> + uint64_t len;
> +
> + len = sizeof(struct vring_desc);
> + vq->vring.desc = iova_to_va(dev, &len, desc_addr);
> + assert(len == sizeof(struct vring_desc));
> +
> + len = sizeof(struct vring_avail);
> + vq->vring.avail = iova_to_va(dev, &len, avail_addr);
> + assert(len == sizeof(struct vring_avail));
> +
> + len = sizeof(struct vring_used);
> + vq->vring.used = iova_to_va(dev, &len, used_addr);
> + assert(len == sizeof(struct vring_used));
> +
> + if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
> + fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static void vduse_queue_enable(VduseVirtq *vq)
> +{
> + struct VduseDev *dev = vq->dev;
> + struct vduse_vq_info vq_info;
> + struct vduse_vq_eventfd vq_eventfd;
> + int fd;
> +
> + vq_info.index = vq->index;
> + if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
> + fprintf(stderr, "Failed to get vq[%d] info: %s\n",
> + vq->index, strerror(errno));
> + return;
> + }
> +
> + if (!vq_info.ready) {
> + return;
> + }
> +
> + vq->vring.num = vq_info.num;
> + vq->vring.desc_addr = vq_info.desc_addr;
> + vq->vring.avail_addr = vq_info.driver_addr;
> + vq->vring.used_addr = vq_info.device_addr;
> +
> + if (vduse_queue_update_vring(vq, vq_info.desc_addr,
> + vq_info.driver_addr, vq_info.device_addr)) {
> + fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
> + return;
> + }
> +
> + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> + if (fd < 0) {
> + fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
> + return;
> + }
> +
> + vq_eventfd.index = vq->index;
> + vq_eventfd.fd = fd;
> + if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
> + fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
> + close(fd);
> + return;
> + }
> +
> + vq->fd = fd;
> + vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
> + vq->inuse = 0;
> + vq->used_idx = 0;
> + vq->signalled_used_valid = false;
> + vq->ready = true;
> +
> + dev->ops->enable_queue(dev, vq);
> +}
> +
> +static void vduse_queue_disable(VduseVirtq *vq)
> +{
> + struct VduseDev *dev = vq->dev;
> + struct vduse_vq_eventfd eventfd;
> +
> + if (!vq->ready) {
> + return;
> + }
> +
> + dev->ops->disable_queue(dev, vq);
> +
> + eventfd.index = vq->index;
> + eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
> + ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
> + close(vq->fd);
> +
> + assert(vq->inuse == 0);
> +
> + vq->vring.num = 0;
> + vq->vring.desc_addr = 0;
> + vq->vring.avail_addr = 0;
> + vq->vring.used_addr = 0;
> + vq->vring.desc = 0;
> + vq->vring.avail = 0;
> + vq->vring.used = 0;
> + vq->ready = false;
> + vq->fd = -1;
> +}
> +
> +static void vduse_dev_start_dataplane(VduseDev *dev)
> +{
> + int i;
> +
> + if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
> + fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
> + return;
> + }
> +
> + for (i = 0; i < dev->num_queues; i++) {
> + vduse_queue_enable(&dev->vqs[i]);
> + }
> +}
> +
> +static void vduse_dev_stop_dataplane(VduseDev *dev)
> +{
> + int i;
> +
> + for (i = 0; i < dev->num_queues; i++) {
> + vduse_queue_disable(&dev->vqs[i]);
> + }
> + dev->features = 0;
> + vduse_iova_remove_region(dev, 0, ULONG_MAX);
> +}
> +
> +int vduse_dev_handler(VduseDev *dev)
> +{
> + struct vduse_dev_request req;
> + struct vduse_dev_response resp = { 0 };
> + VduseVirtq *vq;
> + int i, ret;
> +
> + ret = read(dev->fd, &req, sizeof(req));
This file descriptor is blocking? I guess the assumption is that the
kernel VDUSE code always enqueues at least one struct vduse_dev_request,
so userspace will not block when the file descriptor becomes readable?
> + if (ret != sizeof(req)) {
> + fprintf(stderr, "Read request error [%d]: %s\n",
> + ret, strerror(errno));
> + return -errno;
> + }
> + resp.request_id = req.request_id;
> +
> + switch (req.type) {
> + case VDUSE_GET_VQ_STATE:
> + vq = &dev->vqs[req.vq_state.index];
> + resp.vq_state.split.avail_index = vq->last_avail_idx;
> + resp.result = VDUSE_REQ_RESULT_OK;
> + break;
> + case VDUSE_SET_STATUS:
> + if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
> + vduse_dev_start_dataplane(dev);
> + } else if (req.s.status == 0) {
> + vduse_dev_stop_dataplane(dev);
> + }
> + resp.result = VDUSE_REQ_RESULT_OK;
> + break;
> + case VDUSE_UPDATE_IOTLB:
> + /* The iova will be updated by iova_to_va() later, so just remove it
> */
> + vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
> + for (i = 0; i < dev->num_queues; i++) {
> + VduseVirtq *vq = &dev->vqs[i];
> + if (vq->ready) {
> + if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
> + vq->vring.avail_addr,
> + vq->vring.used_addr)) {
> + fprintf(stderr, "Failed to update vring for vq[%d]\n",
> + vq->index);
> + }
> + }
> + }
> + resp.result = VDUSE_REQ_RESULT_OK;
> + break;
> + default:
> + resp.result = VDUSE_REQ_RESULT_FAILED;
> + break;
> + }
> +
> + ret = write(dev->fd, &resp, sizeof(resp));
The kernel never blocks here?
> + if (ret != sizeof(resp)) {
> + fprintf(stderr, "Write request %d error [%d]: %s\n",
> + req.type, ret, strerror(errno));
> + return -errno;
> + }
> + return 0;
> +}
> +
> +int vduse_dev_update_config(VduseDev *dev, uint32_t size,
> + uint32_t offset, char *buffer)
> +{
> + int ret;
> + struct vduse_config_data *data;
> +
> + data = malloc(offsetof(struct vduse_config_data, buffer) + size);
> + if (!data) {
> + return -ENOMEM;
> + }
> +
> + data->offset = offset;
> + data->length = size;
> + memcpy(data->buffer, buffer, size);
> +
> + ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
> + free(data);
> +
> + if (ret) {
> + return -errno;
> + }
> +
> + if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
> + return -errno;
> + }
> +
> + return 0;
> +}
> +
> +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
> +{
> + VduseVirtq *vq = &dev->vqs[index];
> + struct vduse_vq_config vq_config = { 0 };
> +
> + vq_config.index = vq->index;
> + vq_config.max_size = max_size;
> +
> + if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
> + return -errno;
> + }
> +
> + return 0;
> +}
> +
> +VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
> + uint32_t vendor_id, uint64_t features,
> + uint16_t num_queues, uint32_t config_size,
> + char *config, const VduseOps *ops, void *priv)
> +{
> + VduseDev *dev;
> + int i, ret, ctrl_fd, fd = -1;
> + uint64_t version;
> + char dev_path[VDUSE_NAME_MAX + 16];
Why 16? It has to be at least strlen("/dev/vduse/"), but why more? I
suggest including strlen("/dev/vduse/") instead of hardcoding a magic
constant.
> + VduseVirtq *vqs = NULL;
> + struct vduse_dev_config *dev_config = NULL;
> + size_t size = offsetof(struct vduse_dev_config, config);
> +
> + if (!name || strlen(name) > VDUSE_NAME_MAX || !config ||
The NUL terminator needs to be taken into account:
strlen(name) + 1 > VDUSE_NAME_MAX
> + !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
> + fprintf(stderr, "Invalid parameter for vduse\n");
> + return NULL;
> + }
> +
> + dev = malloc(sizeof(VduseDev));
> + if (!dev) {
> + fprintf(stderr, "Failed to allocate vduse device\n");
> + return NULL;
> + }
> + memset(dev, 0, sizeof(VduseDev));
> +
> + ctrl_fd = open("/dev/vduse/control", O_RDWR);
> + if (ctrl_fd < 0) {
> + fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
> + strerror(errno));
> + goto err_ctrl;
> + }
> +
> + version = VDUSE_API_VERSION;
> + if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
> + fprintf(stderr, "Failed to set api version %lu: %s\n",
> + version, strerror(errno));
> + goto err_dev;
> + }
> +
> + dev_config = malloc(size + config_size);
> + if (!dev_config) {
> + fprintf(stderr, "Failed to allocate config space\n");
> + goto err_dev;
> + }
> + memset(dev_config, 0, size + config_size);
> +
> + strcpy(dev_config->name, name);
> + dev_config->device_id = device_id;
> + dev_config->vendor_id = vendor_id;
> + dev_config->features = features;
> + dev_config->vq_num = num_queues;
> + dev_config->vq_align = VDUSE_VQ_ALIGN;
> + dev_config->config_size = config_size;
> + memcpy(dev_config->config, config, config_size);
> +
> + ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
> + free(dev_config);
> + if (ret < 0) {
> + fprintf(stderr, "Failed to create vduse dev %s: %s\n",
> + name, strerror(errno));
> + goto err_dev;
> + }
> +
> + sprintf(dev_path, "/dev/vduse/%s", name);
> + fd = open(dev_path, O_RDWR);
Does the caller reject names with ".." path components? Maybe input
validation should be performed before we call open(2)?
> + if (fd < 0) {
> + fprintf(stderr, "Failed to open vduse dev %s: %s\n",
> + name, strerror(errno));
> + goto err;
> + }
> +
> + vqs = calloc(sizeof(VduseVirtq), num_queues);
calloc() could be used instead of malloc + memset above as well.
> + if (!vqs) {
> + fprintf(stderr, "Failed to allocate virtqueues\n");
> + goto err;
> + }
> +
> + for (i = 0; i < num_queues; i++) {
> + vqs[i].index = i;
> + vqs[i].dev = dev;
> + vqs[i].fd = -1;
> + }
> +
> + dev->vqs = vqs;
> + dev->name = strdup(name);
malloc(3) return values are checked elsewhere, strdup(3) should also be
checked.
> + dev->num_queues = num_queues;
> + dev->ops = ops;
> + dev->ctrl_fd = ctrl_fd;
> + dev->fd = fd;
> + dev->priv = priv;
> +
> + return dev;
> +err:
> + if (fd > 0) {
> + close(fd);
> + }
> + ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
> +err_dev:
> + close(ctrl_fd);
> +err_ctrl:
> + free(dev);
> +
> + return NULL;
> +}
> +
> +void vduse_dev_destroy(VduseDev *dev)
> +{
> + free(dev->vqs);
> + close(dev->fd);
> + dev->fd = -1;
> + ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name);
> + free(dev->name);
> + close(dev->ctrl_fd);
> + dev->ctrl_fd = -1;
> + free(dev);
> +}
> diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h
> new file mode 100644
> index 0000000000..f6bcb51b5a
> --- /dev/null
> +++ b/subprojects/libvduse/libvduse.h
> @@ -0,0 +1,193 @@
> +/*
> + * VDUSE (vDPA Device in Userspace) library
> + *
> + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
> + *
> + * Author:
> + * Xie Yongji <xieyongji@bytedance.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later. See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef LIBVDUSE_H
> +#define LIBVDUSE_H
> +
> +#include <stdint.h>
> +#include <sys/uio.h>
> +
> +/* VDUSE device structure */
> +typedef struct VduseDev VduseDev;
> +
> +/* Virtqueue structure */
> +typedef struct VduseVirtq VduseVirtq;
> +
> +/* Some operation of VDUSE backend */
> +typedef struct VduseOps {
> + /* Called when virtqueue can be processed */
> + void (*enable_queue)(VduseDev *dev, VduseVirtq *vq);
> + /* Called when virtqueue processing should be stopped */
> + void (*disable_queue)(VduseDev *dev, VduseVirtq *vq);
> +} VduseOps;
> +
> +/* Describing elements of the I/O buffer */
> +typedef struct VduseVirtqElement {
> + /* Virtqueue index */
> + unsigned int index;
Is this the descriptor table index or the virtqueue number?
> + /* Number of physically-contiguous device-readable descriptors */
> + unsigned int out_num;
> + /* Number of physically-contiguous device-writable descriptors */
> + unsigned int in_num;
> + /* Array to store physically-contiguous device-writable descriptors */
> + struct iovec *in_sg;
> + /* Array to store physically-contiguous device-readable descriptors */
> + struct iovec *out_sg;
> +} VduseVirtqElement;
> +
> +/**
> + * vduse_queue_get_dev:
> + * @vq: specified virtqueue
> + *
> + * Get corresponding VDUSE device from the virtqueue.
> + *
> + * Returns: a pointer to VDUSE device on success, NULL on failure.
> + */
> +VduseDev *vduse_queue_get_dev(VduseVirtq *vq);
> +
> +/**
> + * vduse_queue_get_fd:
> + * @vq: specified virtqueue
> + *
> + * Get the kick fd for the virtqueue.
> + *
> + * Returns: file descriptor on success, -1 on failure.
> + */
> +int vduse_queue_get_fd(VduseVirtq *vq);
> +
> +/**
> + * vduse_queue_pop:
> + * @vq: specified virtqueue
> + * @sz: the size of struct to return (must be >= VduseVirtqElement)
> + *
> + * Pop an element from virtqueue available ring.
> + *
> + * Returns: a pointer to a structure containing VduseVirtqElement on success,
> + * NULL on failure.
> + */
> +void *vduse_queue_pop(VduseVirtq *vq, size_t sz);
> +
> +/**
> + * vduse_queue_push:
> + * @vq: specified virtqueue
> + * @elem: pointer to VduseVirtqElement returned by vduse_queue_pop()
> + * @len: length in bytes to write
> + *
> + * Push an element to virtqueue used ring.
> + */
> +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
> + unsigned int len);
> +/**
> + * vduse_queue_notify:
> + * @vq: specified virtqueue
> + *
> + * Request to notify the queue.
> + */
> +void vduse_queue_notify(VduseVirtq *vq);
> +
> +/**
> + * vduse_dev_get_priv:
> + * @dev: VDUSE device
> + *
> + * Get the private pointer passed to vduse_dev_create().
> + *
> + * Returns: private pointer on success, NULL on failure.
> + */
> +void *vduse_dev_get_priv(VduseDev *dev);
> +
> +/**
> + * vduse_dev_get_queue:
> + * @dev: VDUSE device
> + * @index: virtqueue index
> + *
> + * Get the specified virtqueue.
> + *
> + * Returns: a pointer to the virtqueue on success, NULL on failure.
> + */
> +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index);
> +
> +/**
> + * vduse_dev_get_fd:
> + * @dev: VDUSE device
> + *
> + * Get the control message fd for the VDUSE device.
> + *
> + * Returns: file descriptor on success, -1 on failure.
> + */
> +int vduse_dev_get_fd(VduseDev *dev);
> +
> +/**
> + * vduse_dev_handler:
> + * @dev: VDUSE device
> + *
> + * Used to process the control message.
> + *
> + * Returns: file descriptor on success, -errno on failure.
> + */
> +int vduse_dev_handler(VduseDev *dev);
> +
> +/**
> + * vduse_dev_update_config:
> + * @dev: VDUSE device
> + * @size: the size to write to configuration space
> + * @offset: the offset from the beginning of configuration space
> + * @buffer: the buffer used to write from
> + *
> + * Update device configuration space and inject a config interrupt.
> + *
> + * Returns: 0 on success, -errno on failure.
> + */
> +int vduse_dev_update_config(VduseDev *dev, uint32_t size,
> + uint32_t offset, char *buffer);
> +
> +/**
> + * vduse_dev_setup_queue:
> + * @dev: VDUSE device
> + * @index: virtqueue index
> + * @max_size: the max size of virtqueue
> + *
> + * Setup the specified virtqueue.
> + *
> + * Returns: 0 on success, -errno on failure.
> + */
> +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size);
> +
> +/**
> + * vduse_dev_create:
> + * @name: VDUSE device name
> + * @device_id: virtio device id
> + * @vendor_id: virtio vendor id
> + * @features: virtio features
> + * @num_queues: the number of virtqueues
> + * @config_size: the size of the configuration space
> + * @config: the buffer of the configuration space
> + * @ops: the operation of VDUSE backend
> + * @priv: private pointer
> + *
> + * Create VDUSE device.
> + *
> + * Returns: pointer to VDUSE device on success, NULL on failure.
> + */
> +VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
> + uint32_t vendor_id, uint64_t features,
> + uint16_t num_queues, uint32_t config_size,
> + char *config, const VduseOps *ops, void *priv);
> +
> +/**
> + * vduse_dev_destroy:
> + * @dev: VDUSE device
> + *
> + * Destroy the VDUSE device.
> + */
> +void vduse_dev_destroy(VduseDev *dev);
> +
> +#endif
> diff --git a/subprojects/libvduse/meson.build b/subprojects/libvduse/meson.build
> new file mode 100644
> index 0000000000..ba08f5ee1a
> --- /dev/null
> +++ b/subprojects/libvduse/meson.build
> @@ -0,0 +1,10 @@
> +project('libvduse', 'c',
> + license: 'GPL-2.0-or-later',
> + default_options: ['c_std=gnu99'])
> +
> +libvduse = static_library('vduse',
> + files('libvduse.c'),
> + c_args: '-D_GNU_SOURCE')
> +
> +libvduse_dep = declare_dependency(link_with: libvduse,
> + include_directories: include_directories('.'))
> diff --git a/subprojects/libvduse/standard-headers/linux b/subprojects/libvduse/standard-headers/linux
> new file mode 120000
> index 0000000000..c416f068ac
> --- /dev/null
> +++ b/subprojects/libvduse/standard-headers/linux
> @@ -0,0 +1 @@
> +../../../include/standard-headers/linux/
> \ No newline at end of file
> --
> 2.20.1
>
signature.asc
Description: PGP signature
- Re: [PATCH 2/5] libvduse: Add VDUSE (vDPA Device in Userspace) library,
Stefan Hajnoczi <=