[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 3/7] dataplane: add virtqueue vring code
From: |
Blue Swirl |
Subject: |
Re: [Qemu-devel] [PATCH 3/7] dataplane: add virtqueue vring code |
Date: |
Sat, 17 Nov 2012 16:15:57 +0000 |
On Thu, Nov 15, 2012 at 3:19 PM, Stefan Hajnoczi <address@hidden> wrote:
> The virtio-blk-data-plane cannot access memory using the usual QEMU
> functions since it executes outside the global mutex and the memory APIs
> are at this time not thread-safe.
>
> This patch introduces a virtqueue module based on the kernel's vhost
> vring code. The trick is that we map guest memory ahead of time and
> access it cheaply outside the global mutex.
>
> Once the hardware emulation code can execute outside the global mutex it
> will be possible to drop this code.
>
> Signed-off-by: Stefan Hajnoczi <address@hidden>
> ---
> hw/Makefile.objs | 2 +-
> hw/dataplane/Makefile.objs | 3 +
> hw/dataplane/vring.c | 321
> +++++++++++++++++++++++++++++++++++++++++++++
> hw/dataplane/vring.h | 54 ++++++++
> trace-events | 3 +
> 5 files changed, 382 insertions(+), 1 deletion(-)
> create mode 100644 hw/dataplane/Makefile.objs
> create mode 100644 hw/dataplane/vring.c
> create mode 100644 hw/dataplane/vring.h
>
> diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> index af4ab0c..da8ef0c 100644
> --- a/hw/Makefile.objs
> +++ b/hw/Makefile.objs
> @@ -1,4 +1,4 @@
> -common-obj-y = usb/ ide/
> +common-obj-y = usb/ ide/ dataplane/
> common-obj-y += loader.o
> common-obj-$(CONFIG_VIRTIO) += virtio-console.o
> common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
> new file mode 100644
> index 0000000..b58544f
> --- /dev/null
> +++ b/hw/dataplane/Makefile.objs
> @@ -0,0 +1,3 @@
> +ifeq ($(CONFIG_VIRTIO), y)
> +common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += vring.o
> +endif
> diff --git a/hw/dataplane/vring.c b/hw/dataplane/vring.c
> new file mode 100644
> index 0000000..6aacce8
> --- /dev/null
> +++ b/hw/dataplane/vring.c
> @@ -0,0 +1,321 @@
> +/* Copyright 2012 Red Hat, Inc.
> + * Copyright IBM, Corp. 2012
> + *
> + * Based on Linux vhost code:
> + * Copyright (C) 2009 Red Hat, Inc.
> + * Copyright (C) 2006 Rusty Russell IBM Corporation
> + *
> + * Author: Michael S. Tsirkin <address@hidden>
> + * Stefan Hajnoczi <address@hidden>
> + *
> + * Inspiration, some code, and most witty comments come from
> + * Documentation/virtual/lguest/lguest.c, by Rusty Russell
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + */
> +
> +#include "trace.h"
> +#include "hw/dataplane/vring.h"
> +
> +/* Map target physical address to host address
> + */
> +static inline void *phys_to_host(Vring *vring, hwaddr phys)
> +{
> + /* Adjust for 3.6-4 GB PCI memory range */
> + if (phys >= 0x100000000) {
> + phys -= 0x100000000 - 0xe0000000;
> + } else if (phys >= 0xe0000000) {
> + fprintf(stderr, "phys_to_host bad physical address in "
> + "PCI range %#lx\n", phys);
> + exit(1);
Exiting is rather drastic. Is this the guest's error or QEMU's?
> + }
> + return vring->phys_mem_zero_host_ptr + phys;
> +}
> +
> +/* Setup for cheap target physical to host address conversion
> + *
> + * This is a hack for direct access to guest memory, we're not really allowed
> + * to do this.
> + */
> +static void setup_phys_to_host(Vring *vring)
> +{
> + hwaddr len = 4096; /* RAM is really much larger but we cheat */
> + vring->phys_mem_zero_host_ptr = cpu_physical_memory_map(0, &len, 0);
> + if (!vring->phys_mem_zero_host_ptr) {
> + fprintf(stderr, "setup_phys_to_host failed\n");
> + exit(1);
> + }
> +}
> +
> +/* Map the guest's vring to host memory
> + *
> + * This is not allowed but we know the ring won't move.
> + */
> +void vring_setup(Vring *vring, VirtIODevice *vdev, int n)
> +{
> + setup_phys_to_host(vring);
> +
> + vring_init(&vring->vr, virtio_queue_get_num(vdev, n),
> + phys_to_host(vring, virtio_queue_get_ring_addr(vdev, n)),
> 4096);
> +
> + vring->last_avail_idx = 0;
> + vring->last_used_idx = 0;
> + vring->signalled_used = 0;
> + vring->signalled_used_valid = false;
> +
> + trace_vring_setup(virtio_queue_get_ring_addr(vdev, n),
> + vring->vr.desc, vring->vr.avail, vring->vr.used);
> +}
> +
> +/* Toggle guest->host notifies */
> +void vring_set_notification(VirtIODevice *vdev, Vring *vring, bool enable)
> +{
> + if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
> + if (enable) {
> + vring_avail_event(&vring->vr) = vring->vr.avail->idx;
> + }
> + } else if (enable) {
> + vring->vr.used->flags &= ~VRING_USED_F_NO_NOTIFY;
> + } else {
> + vring->vr.used->flags |= VRING_USED_F_NO_NOTIFY;
> + }
> +}
> +
> +/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
> +bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
> +{
> + uint16_t old, new;
> + bool v;
> + /* Flush out used index updates. This is paired
> + * with the barrier that the Guest executes when enabling
> + * interrupts. */
> + smp_mb();
> +
> + if ((vdev->guest_features & VIRTIO_F_NOTIFY_ON_EMPTY) &&
> + unlikely(vring->vr.avail->idx == vring->last_avail_idx)) {
> + return true;
> + }
> +
> + if (!(vdev->guest_features & VIRTIO_RING_F_EVENT_IDX)) {
> + return !(vring->vr.avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
> + }
> + old = vring->signalled_used;
> + v = vring->signalled_used_valid;
> + new = vring->signalled_used = vring->last_used_idx;
> + vring->signalled_used_valid = true;
> +
> + if (unlikely(!v)) {
> + return true;
> + }
> +
> + return vring_need_event(vring_used_event(&vring->vr), new, old);
> +}
> +
> +/* This is stolen from linux-2.6/drivers/vhost/vhost.c. */
> +static bool get_indirect(Vring *vring,
> + struct iovec iov[], struct iovec *iov_end,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vring_desc *indirect)
> +{
> + struct vring_desc desc;
> + unsigned int i = 0, count, found = 0;
> +
> + /* Sanity check */
> + if (unlikely(indirect->len % sizeof desc)) {
> + fprintf(stderr, "Invalid length in indirect descriptor: "
> + "len 0x%llx not multiple of 0x%zx\n",
> + (unsigned long long)indirect->len,
> + sizeof desc);
> + exit(1);
> + }
> +
> + count = indirect->len / sizeof desc;
> + /* Buffers are chained via a 16 bit next field, so
> + * we can have at most 2^16 of these. */
> + if (unlikely(count > USHRT_MAX + 1)) {
> + fprintf(stderr, "Indirect buffer length too big: %d\n",
> + indirect->len);
> + exit(1);
> + }
> +
> + /* Point to translate indirect desc chain */
> + indirect = phys_to_host(vring, indirect->addr);
> +
> + /* We will use the result as an address to read from, so most
> + * architectures only need a compiler barrier here. */
> + barrier(); /* read_barrier_depends(); */
> +
> + do {
> + if (unlikely(++found > count)) {
> + fprintf(stderr, "Loop detected: last one at %u "
> + "indirect size %u\n",
> + i, count);
> + exit(1);
> + }
> +
> + desc = *indirect++;
> + if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
> + fprintf(stderr, "Nested indirect descriptor\n");
> + exit(1);
> + }
> +
> + /* Stop for now if there are not enough iovecs available. */
> + if (iov >= iov_end) {
> + return false;
> + }
> +
> + iov->iov_base = phys_to_host(vring, desc.addr);
> + iov->iov_len = desc.len;
> + iov++;
> +
> + /* If this is an input descriptor, increment that count. */
> + if (desc.flags & VRING_DESC_F_WRITE) {
> + *in_num += 1;
> + } else {
> + /* If it's an output descriptor, they're all supposed
> + * to come before any input descriptors. */
> + if (unlikely(*in_num)) {
> + fprintf(stderr, "Indirect descriptor "
> + "has out after in: idx %d\n", i);
> + exit(1);
> + }
> + *out_num += 1;
> + }
> + i = desc.next;
> + } while (desc.flags & VRING_DESC_F_NEXT);
> + return true;
> +}
> +
> +/* This looks in the virtqueue for the first available buffer, and converts
> + * it to an iovec for convenient access. Since descriptors consist of some
> + * number of output then some number of input descriptors, it's actually two
> + * iovecs, but we pack them into one and note how many of each there were.
> + *
> + * This function returns the descriptor number found, or vq->num (which is
> + * never a valid descriptor number) if none was found. A negative code is
> + * returned on error.
> + *
> + * Stolen from linux-2.6/drivers/vhost/vhost.c.
> + */
> +int vring_pop(VirtIODevice *vdev, Vring *vring,
> + struct iovec iov[], struct iovec *iov_end,
> + unsigned int *out_num, unsigned int *in_num)
> +{
> + struct vring_desc desc;
> + unsigned int i, head, found = 0, num = vring->vr.num;
> + __u16 avail_idx, last_avail_idx;
Please use uint16_t in QEMU code.
> +
> + /* Check it isn't doing very strange things with descriptor numbers. */
> + last_avail_idx = vring->last_avail_idx;
> + avail_idx = vring->vr.avail->idx;
> +
> + if (unlikely((__u16)(avail_idx - last_avail_idx) > num)) {
> + fprintf(stderr, "Guest moved used index from %u to %u\n",
> + last_avail_idx, avail_idx);
> + exit(1);
> + }
> +
> + /* If there's nothing new since last we looked. */
> + if (avail_idx == last_avail_idx) {
> + return -EAGAIN;
> + }
> +
> + /* Only get avail ring entries after they have been exposed by guest. */
> + smp_rmb();
> +
> + /* Grab the next descriptor number they're advertising, and increment
> + * the index we've seen. */
> + head = vring->vr.avail->ring[last_avail_idx % num];
> +
> + /* If their number is silly, that's an error. */
> + if (unlikely(head >= num)) {
> + fprintf(stderr, "Guest says index %u > %u is available\n",
> + head, num);
> + exit(1);
> + }
> +
> + if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
> + vring_avail_event(&vring->vr) = vring->vr.avail->idx;
> + }
> +
> + /* When we start there are none of either input nor output. */
> + *out_num = *in_num = 0;
> +
> + i = head;
> + do {
> + if (unlikely(i >= num)) {
> + fprintf(stderr, "Desc index is %u > %u, head = %u\n",
> + i, num, head);
> + exit(1);
> + }
> + if (unlikely(++found > num)) {
> + fprintf(stderr, "Loop detected: last one at %u "
> + "vq size %u head %u\n",
> + i, num, head);
> + exit(1);
> + }
> + desc = vring->vr.desc[i];
> + if (desc.flags & VRING_DESC_F_INDIRECT) {
> + if (!get_indirect(vring, iov, iov_end, out_num, in_num, &desc)) {
> + return -ENOBUFS; /* not enough iovecs, stop for now */
> + }
> + continue;
> + }
> +
> + /* If there are not enough iovecs left, stop for now. The caller
> + * should check if there are more descs available once they have
> dealt
> + * with the current set.
> + */
> + if (iov >= iov_end) {
> + return -ENOBUFS;
> + }
> +
> + iov->iov_base = phys_to_host(vring, desc.addr);
> + iov->iov_len = desc.len;
> + iov++;
> +
> + if (desc.flags & VRING_DESC_F_WRITE) {
> + /* If this is an input descriptor,
> + * increment that count. */
> + *in_num += 1;
> + } else {
> + /* If it's an output descriptor, they're all supposed
> + * to come before any input descriptors. */
> + if (unlikely(*in_num)) {
> + fprintf(stderr, "Descriptor has out after in: "
> + "idx %d\n", i);
> + exit(1);
> + }
> + *out_num += 1;
> + }
> + i = desc.next;
> + } while (desc.flags & VRING_DESC_F_NEXT);
> +
> + /* On success, increment avail index. */
> + vring->last_avail_idx++;
> + return head;
> +}
> +
> +/* After we've used one of their buffers, we tell them about it.
> + *
> + * Stolen from linux-2.6/drivers/vhost/vhost.c.
> + */
> +void vring_push(Vring *vring, unsigned int head, int len)
> +{
> + struct vring_used_elem *used;
> + uint16_t new;
> +
> + /* The virtqueue contains a ring of used buffers. Get a pointer to the
> + * next entry in that used ring. */
> + used = &vring->vr.used->ring[vring->last_used_idx % vring->vr.num];
> + used->id = head;
> + used->len = len;
> +
> + /* Make sure buffer is written before we update index. */
> + smp_wmb();
> +
> + new = vring->vr.used->idx = ++vring->last_used_idx;
> + if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) {
> + vring->signalled_used_valid = false;
> + }
> +}
> diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
> new file mode 100644
> index 0000000..42c2f0a
> --- /dev/null
> +++ b/hw/dataplane/vring.h
> @@ -0,0 +1,54 @@
> +/* Copyright 2012 Red Hat, Inc. and/or its affiliates
> + * Copyright IBM, Corp. 2012
> + *
> + * Based on Linux vhost code:
> + * Copyright (C) 2009 Red Hat, Inc.
> + * Copyright (C) 2006 Rusty Russell IBM Corporation
> + *
> + * Author: Michael S. Tsirkin <address@hidden>
> + * Stefan Hajnoczi <address@hidden>
> + *
> + * Inspiration, some code, and most witty comments come from
> + * Documentation/virtual/lguest/lguest.c, by Rusty Russell
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + */
> +
> +#ifndef VRING_H
> +#define VRING_H
> +
> +#include <linux/virtio_ring.h>
> +#include "qemu-common.h"
> +#include "qemu-barrier.h"
> +#include "memory.h"
> +#include "hw/virtio.h"
> +
> +typedef struct {
> + void *phys_mem_zero_host_ptr; /* host pointer to guest RAM */
> + struct vring vr; /* virtqueue vring mapped to host memory
> */
> + __u16 last_avail_idx; /* last processed avail ring index */
> + __u16 last_used_idx; /* last processed used ring index */
> + uint16_t signalled_used; /* EVENT_IDX state */
> + bool signalled_used_valid;
> +} Vring;
> +
> +static inline unsigned int vring_get_num(Vring *vring)
> +{
> + return vring->vr.num;
> +}
> +
> +/* Are there more descriptors available? */
> +static inline bool vring_more_avail(Vring *vring)
> +{
> + return vring->vr.avail->idx != vring->last_avail_idx;
> +}
> +
> +void vring_setup(Vring *vring, VirtIODevice *vdev, int n);
> +void vring_set_notification(VirtIODevice *vdev, Vring *vring, bool enable);
> +bool vring_should_notify(VirtIODevice *vdev, Vring *vring);
> +int vring_pop(VirtIODevice *vdev, Vring *vring,
> + struct iovec iov[], struct iovec *iov_end,
> + unsigned int *out_num, unsigned int *in_num);
> +void vring_push(Vring *vring, unsigned int head, int len);
> +
> +#endif /* VRING_H */
> diff --git a/trace-events b/trace-events
> index e1a37cc..8eeab34 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -98,6 +98,9 @@ virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
> virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p
> sector %"PRIu64" nsectors %zu"
> virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p
> sector %"PRIu64" nsectors %zu"
>
> +# hw/dataplane/vring.c
> +vring_setup(uint64_t physical, void *desc, void *avail, void *used) "vring
> physical %#"PRIx64" desc %p avail %p used %p"
> +
> # thread-pool.c
> thread_pool_submit(void *req, void *opaque) "req %p opaque %p"
> thread_pool_complete(void *req, void *opaque, int ret) "req %p opaque %p ret
> %d"
> --
> 1.8.0
>
>
- [Qemu-devel] [PATCH 1/7] raw-posix: add raw_get_aio_fd() for virtio-blk-data-plane, (continued)
- [Qemu-devel] [PATCH 3/7] dataplane: add virtqueue vring code, Stefan Hajnoczi, 2012/11/15
- [Qemu-devel] [PATCH 4/7] dataplane: add event loop, Stefan Hajnoczi, 2012/11/15
- [Qemu-devel] [PATCH 5/7] dataplane: add Linux AIO request queue, Stefan Hajnoczi, 2012/11/15
- [Qemu-devel] [PATCH 7/7] virtio-blk: add x-data-plane=on|off performance feature, Stefan Hajnoczi, 2012/11/15
- Re: [Qemu-devel] [PATCH 7/7] virtio-blk: add x-data-plane=on|off performance feature, Anthony Liguori, 2012/11/15