Initial version of the shadow virtqueue that actually forwards buffers.
It reuses the VirtQueue code for the device part. The driver part is
based on Linux's virtio_ring driver, but with some functionality and
optimizations stripped out so that it is easier to review.
These will be added back in later commits.
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
hw/virtio/vhost-shadow-virtqueue.c | 212 +++++++++++++++++++++++++++--
hw/virtio/vhost.c | 113 ++++++++++++++-
2 files changed, 312 insertions(+), 13 deletions(-)
diff --git a/hw/virtio/vhost-shadow-virtqueue.c
b/hw/virtio/vhost-shadow-virtqueue.c
index 1460d1d5d1..68ed0f2740 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -9,6 +9,7 @@
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "hw/virtio/vhost.h"
+#include "hw/virtio/virtio-access.h"
#include "standard-headers/linux/vhost_types.h"
@@ -55,11 +56,96 @@ typedef struct VhostShadowVirtqueue {
/* Virtio device */
VirtIODevice *vdev;
+ /* Map for returning guest's descriptors */
+ VirtQueueElement **ring_id_maps;
+
+ /* Next head to expose to device */
+ uint16_t avail_idx_shadow;
+
+ /* Next free descriptor */
+ uint16_t free_head;
+
+ /* Last seen used idx */
+ uint16_t shadow_used_idx;
+
+ /* Next head to consume from device */
+ uint16_t used_idx;
+
/* Descriptors copied from guest */
vring_desc_t descs[];
} VhostShadowVirtqueue;
-/* Forward guest notifications */
+/*
+ * Fill shadow vring descriptors for a run of buffers.
+ *
+ * Descriptors are taken from the free list, starting at svq->free_head and
+ * following each descriptor's next field. On return svq->free_head is the
+ * descriptor that followed the last one written.
+ *
+ * @svq: Shadow virtqueue
+ * @iovec: Buffers to describe
+ * @num: Number of entries in @iovec; nothing is done if it is 0
+ * @more_descs: Set VRING_DESC_F_NEXT on the last descriptor too, because the
+ *              caller will append more descriptors to the same chain
+ * @write: Mark every descriptor as device-writable (VRING_DESC_F_WRITE)
+ *
+ * This function does not check that @num free descriptors are available.
+ */
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
+                                    const struct iovec *iovec,
+                                    size_t num, bool more_descs, bool write)
+{
+    uint16_t i = svq->free_head, last = svq->free_head;
+    unsigned n;
+    uint16_t flags = write ? virtio_tswap16(svq->vdev, VRING_DESC_F_WRITE) : 0;
+    vring_desc_t *descs = svq->vring.desc;
+
+    if (num == 0) {
+        return;
+    }
+
+    for (n = 0; n < num; n++) {
+        if (more_descs || (n + 1 < num)) {
+            descs[i].flags = flags | virtio_tswap16(svq->vdev,
+                                                    VRING_DESC_F_NEXT);
+        } else {
+            descs[i].flags = flags;
+        }
+        /*
+         * Convert through uintptr_t: casting a pointer straight to a 64-bit
+         * integer warns on hosts where pointers are narrower.
+         */
+        descs[i].addr = virtio_tswap64(svq->vdev,
+                                       (hwaddr)(uintptr_t)iovec[n].iov_base);
+        descs[i].len = virtio_tswap32(svq->vdev, iovec[n].iov_len);
+
+        last = i;
+        i = virtio_tswap16(svq->vdev, descs[i].next);
+    }
+
+    svq->free_head = virtio_tswap16(svq->vdev, descs[last].next);
+}
+
+/*
+ * Make one element available to the device through the shadow split vring.
+ *
+ * The element's out buffers are written first as device-readable descriptors,
+ * followed by its in buffers as device-writable ones, all in a single chain.
+ *
+ * Returns the index of the head descriptor of that chain.
+ */
+static unsigned vhost_shadow_vq_add_split(VhostShadowVirtqueue *svq,
+                                          VirtQueueElement *elem)
+{
+    vring_avail_t *avail_ring = svq->vring.avail;
+    const int first_desc = svq->free_head;
+    unsigned slot;
+
+    /* An element with no buffers at all cannot be placed in the ring */
+    assert(elem->out_num || elem->in_num);
+
+    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
+                            elem->in_num > 0, false);
+    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
+
+    /* Store the head in the avail ring; avail->idx is only updated below */
+    slot = svq->avail_idx_shadow & (svq->vring.num - 1);
+    avail_ring->ring[slot] = virtio_tswap16(svq->vdev, first_desc);
+    svq->avail_idx_shadow++;
+
+    /* Descriptors and ring entry must be written before the new index */
+    smp_wmb();
+    avail_ring->idx = virtio_tswap16(svq->vdev, svq->avail_idx_shadow);
+
+    return first_desc;
+}
+
+/*
+ * Add an element to the shadow vring and record it in ring_id_maps under
+ * the index of its head descriptor.
+ */
+static void vhost_shadow_vq_add(VhostShadowVirtqueue *svq,
+                                VirtQueueElement *elem)
+{
+    svq->ring_id_maps[vhost_shadow_vq_add_split(svq, elem)] = elem;
+}
+
+/* Handle guest->device notifications */
static void vhost_handle_guest_kick(EventNotifier *n)
{
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
@@ -69,7 +155,72 @@ static void vhost_handle_guest_kick(EventNotifier *n)
return;
}
- event_notifier_set(&svq->kick_notifier);
+ /* Make available as many buffers as possible */
+ do {
+ if (virtio_queue_get_notification(svq->vq)) {
+ /* No more notifications until process all available */
+ virtio_queue_set_notification(svq->vq, false);
+ }
+
+ while (true) {
+ VirtQueueElement *elem;
+ if (virtio_queue_full(svq->vq)) {
+ break;
+ }
+
+ elem = virtqueue_pop(svq->vq, sizeof(*elem));
+ if (!elem) {
+ break;
+ }
+
+ vhost_shadow_vq_add(svq, elem);
+ event_notifier_set(&svq->kick_notifier);
+ }
+
+ virtio_queue_set_notification(svq->vq, true);
+ } while (!virtio_queue_empty(svq->vq));
+}
+
+/*
+ * Check whether there are used entries that have not been consumed yet.
+ *
+ * The cached copy of the used index is compared first; the index in the used
+ * ring is only read again when the cached one has been fully consumed.
+ */
+static bool vhost_shadow_vq_more_used(VhostShadowVirtqueue *svq)
+{
+    if (svq->used_idx == svq->shadow_used_idx) {
+        /* Get used idx must not be reordered */
+        smp_rmb();
+        svq->shadow_used_idx = virtio_tswap16(svq->vdev,
+                                              svq->vring.used->idx);
+    }
+
+    return svq->used_idx != svq->shadow_used_idx;
+}
+
+/*
+ * Fetch the next element the device has marked as used.
+ *
+ * On success the element's len field holds the length reported by the
+ * device, its descriptors are back in the free list, and ownership of the
+ * element moves to the caller.
+ *
+ * Returns NULL if there is nothing to consume, or if the device reported an
+ * id that is out of range or not currently outstanding. In the latter cases
+ * the bogus used entry is still consumed.
+ */
+static VirtQueueElement *vhost_shadow_vq_get_buf(VhostShadowVirtqueue *svq)
+{
+    vring_desc_t *descs = svq->vring.desc;
+    const vring_used_t *used = svq->vring.used;
+    vring_used_elem_t used_elem;
+    VirtQueueElement *elem;
+    uint16_t last_used, last_desc;
+    unsigned ndescs, n;
+
+    if (!vhost_shadow_vq_more_used(svq)) {
+        return NULL;
+    }
+
+    /* Only read used ring entries after the used idx that exposes them */
+    smp_rmb();
+    last_used = svq->used_idx & (svq->vring.num - 1);
+    used_elem.id = virtio_tswap32(svq->vdev, used->ring[last_used].id);
+    used_elem.len = virtio_tswap32(svq->vdev, used->ring[last_used].len);
+
+    /*
+     * Consume the entry before validating it. Otherwise a caller looping on
+     * vhost_shadow_vq_more_used() would see the same bad entry forever.
+     */
+    svq->used_idx++;
+
+    if (unlikely(used_elem.id >= svq->vring.num)) {
+        error_report("Device %s says index %u is used", svq->vdev->name,
+                     used_elem.id);
+        return NULL;
+    }
+
+    elem = svq->ring_id_maps[used_elem.id];
+    if (unlikely(!elem)) {
+        error_report(
+            "Device %s says index %u is used, but it was not available",
+            svq->vdev->name, used_elem.id);
+        return NULL;
+    }
+
+    /*
+     * Give the whole chain back to the free list, not just its head: walk
+     * to the last descriptor written for this element and link it to the
+     * current free head. The chain length comes from the element itself.
+     */
+    ndescs = elem->out_num + elem->in_num;
+    last_desc = used_elem.id;
+    for (n = 1; n < ndescs; n++) {
+        last_desc = virtio_tswap16(svq->vdev, descs[last_desc].next);
+    }
+    /* next is kept in vring byte order, as everywhere else it is accessed */
+    descs[last_desc].next = virtio_tswap16(svq->vdev, svq->free_head);
+    svq->free_head = used_elem.id;
+
+    elem->len = used_elem.len;
+    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
}
/* Forward vhost notifications */
@@ -78,6 +229,7 @@ static void
vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
call_notifier);
EventNotifier *masked_notifier;
+ VirtQueue *vq = svq->vq;
/* Signal start of using masked notifier */
qemu_event_reset(&svq->masked_notifier.is_free);
@@ -86,14 +238,29 @@ static void
vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
qemu_event_set(&svq->masked_notifier.is_free);
}
- if (!masked_notifier) {
- unsigned n = virtio_get_queue_index(svq->vq);
- virtio_queue_invalidate_signalled_used(svq->vdev, n);
- virtio_notify_irqfd(svq->vdev, svq->vq);
- } else if (!svq->masked_notifier.signaled) {
- svq->masked_notifier.signaled = true;
- event_notifier_set(svq->masked_notifier.n);
- }
+ /* Make as many buffers as possible used. */
+ do {
+ unsigned i = 0;
+
+ /* TODO: Use VRING_AVAIL_F_NO_INTERRUPT */
+ while (true) {
+ g_autofree VirtQueueElement *elem = vhost_shadow_vq_get_buf(svq);
+ if (!elem) {
+ break;
+ }
+
+ assert(i < svq->vring.num);
+ virtqueue_fill(vq, elem, elem->len, i++);
+ }
+
+ virtqueue_flush(vq, i);
+ if (!masked_notifier) {
+ virtio_notify_irqfd(svq->vdev, svq->vq);
+ } else if (!svq->masked_notifier.signaled) {
+ svq->masked_notifier.signaled = true;
+ event_notifier_set(svq->masked_notifier.n);
+ }
+ } while (vhost_shadow_vq_more_used(svq));
if (masked_notifier) {
/* Signal not using it anymore */
@@ -103,7 +270,6 @@ static void
vhost_shadow_vq_handle_call_no_test(EventNotifier *n)
static void vhost_shadow_vq_handle_call(EventNotifier *n)
{
-
if (likely(event_notifier_test_and_clear(n))) {
vhost_shadow_vq_handle_call_no_test(n);
}
@@ -254,7 +420,11 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev,
unsigned idx,
VhostShadowVirtqueue *svq)
{
+ int i;
int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq);
+
+ assert(!dev->shadow_vqs_enabled);
+
if (unlikely(r < 0)) {
error_report("Couldn't restore vq kick fd: %s", strerror(-r));
}
@@ -272,6 +442,18 @@ void vhost_shadow_vq_stop(struct vhost_dev *dev,
/* Restore vhost call */
vhost_virtqueue_mask(dev, dev->vdev, dev->vq_index + idx,
dev->vqs[idx].notifier_is_masked);
+
+
+ for (i = 0; i < svq->vring.num; ++i) {
+ g_autofree VirtQueueElement *elem = svq->ring_id_maps[i];
+ /*
+ * Although the doc says we must unpop in order, it's ok to unpop
+ * everything.
+ */
+ if (elem) {
+ virtqueue_unpop(svq->vq, elem, elem->len);
+ }
+ }
}
/*
@@ -284,7 +466,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev
*dev, int idx)
unsigned num = virtio_queue_get_num(dev->vdev, vq_idx);
size_t ring_size = vring_size(num, VRING_DESC_ALIGN_SIZE);
g_autofree VhostShadowVirtqueue *svq = g_malloc0(sizeof(*svq) +
ring_size);
- int r;
+ int r, i;
r = event_notifier_init(&svq->kick_notifier, 0);
if (r != 0) {
@@ -303,6 +485,11 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev
*dev, int idx)
vring_init(&svq->vring, num, svq->descs, VRING_DESC_ALIGN_SIZE);
svq->vq = virtio_get_queue(dev->vdev, vq_idx);
svq->vdev = dev->vdev;
+ for (i = 0; i < num - 1; i++) {
+ svq->descs[i].next = virtio_tswap16(dev->vdev, i + 1);
+ }
+
+ svq->ring_id_maps = g_new0(VirtQueueElement *, num);
event_notifier_set_handler(&svq->call_notifier,
vhost_shadow_vq_handle_call);
qemu_event_init(&svq->masked_notifier.is_free, true);
@@ -324,5 +511,6 @@ void vhost_shadow_vq_free(VhostShadowVirtqueue *vq)
event_notifier_cleanup(&vq->kick_notifier);
event_notifier_set_handler(&vq->call_notifier, NULL);
event_notifier_cleanup(&vq->call_notifier);
+ g_free(vq->ring_id_maps);
g_free(vq);
}
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index eab3e334f2..a373999bc4 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1021,6 +1021,19 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev,
uint64_t iova, int write)
trace_vhost_iotlb_miss(dev, 1);
+ if (qatomic_load_acquire(&dev->shadow_vqs_enabled)) {
+ uaddr = iova;
+ len = 4096;
+ ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len,
+ IOMMU_RW);
+ if (ret) {
+ trace_vhost_iotlb_miss(dev, 2);
+ error_report("Fail to update device iotlb");
+ }
+
+ return ret;
+ }
+
iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
iova, write,
MEMTXATTRS_UNSPECIFIED);
@@ -1227,8 +1240,28 @@ static int vhost_sw_live_migration_stop(struct vhost_dev
*dev)
/* Can be read by vhost_virtqueue_mask, from vm exit */
qatomic_store_release(&dev->shadow_vqs_enabled, false);
+ dev->vhost_ops->vhost_set_vring_enable(dev, false);
+ if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) {
+ error_report("Fail to invalidate device iotlb");
+ }
+
for (idx = 0; idx < dev->nvqs; ++idx) {
+ /*
+ * Update used ring information for IOTLB to work correctly,
+ * vhost-kernel code requires for this.
+ */
+ struct vhost_virtqueue *vq = dev->vqs + idx;
+ vhost_device_iotlb_miss(dev, vq->used_phys, true);
+
vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]);
+ vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx],
+ dev->vq_index + idx);
+ }
+
+ /* Enable guest's vq vring */
+ dev->vhost_ops->vhost_set_vring_enable(dev, true);
+
+ for (idx = 0; idx < dev->nvqs; ++idx) {
vhost_shadow_vq_free(dev->shadow_vqs[idx]);
}
@@ -1237,6 +1270,59 @@ static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
return 0;
}
+/*
+ * Start shadow virtqueue in a given queue.
+ * In failure case, this function leaves queue working as regular vhost mode.
+ *
+ * The guest's vring is stopped first, then the shadow virtqueue is started
+ * and the device is pointed at the shadow vring addresses, with the vring
+ * base set to 0 (s.num is left zero-initialized).
+ *
+ * @dev: vhost device
+ * @idx: Queue index, as used to index dev->vqs and dev->shadow_vqs
+ *
+ * Returns true if the queue is now running on the shadow virtqueue.
+ */
+static bool vhost_sw_live_migration_start_vq(struct vhost_dev *dev,
+                                             unsigned idx)
+{
+    struct vhost_vring_addr addr = {
+        .index = idx,
+    };
+    struct vhost_vring_state s = {
+        .index = idx,
+    };
+    int r;
+    bool ok;
+
+    vhost_virtqueue_stop(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx);
+    ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
+    if (unlikely(!ok)) {
+        /* The guest's vring was stopped above, so it must be restarted */
+        goto err;
+    }
+
+    /* From this point, vhost_virtqueue_start can reset these changes */
+    vhost_shadow_vq_get_vring_addr(dev->shadow_vqs[idx], &addr);
+    r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
+    if (unlikely(r != 0)) {
+        VHOST_OPS_DEBUG("vhost_set_vring_addr for shadow vq failed");
+        goto err;
+    }
+
+    r = dev->vhost_ops->vhost_set_vring_base(dev, &s);
+    if (unlikely(r != 0)) {
+        VHOST_OPS_DEBUG("vhost_set_vring_base for shadow vq failed");
+        goto err;
+    }
+
+    /*
+     * Update used ring information for IOTLB to work correctly,
+     * vhost-kernel code requires for this.
+     */
+    r = vhost_device_iotlb_miss(dev, addr.used_user_addr, true);
+    if (unlikely(r != 0)) {
+        /* Debug message already printed */
+        goto err;
+    }
+
+    return true;
+
+err:
+    /* Go back to regular vhost mode for this queue */
+    vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[idx], dev->vq_index + idx);
+    return false;
+}
+
static int vhost_sw_live_migration_start(struct vhost_dev *dev)
{
int idx, stop_idx;
@@ -1249,24 +1335,36 @@ static int vhost_sw_live_migration_start(struct
vhost_dev *dev)
}
}
+ dev->vhost_ops->vhost_set_vring_enable(dev, false);
+ if (vhost_backend_invalidate_device_iotlb(dev, 0, -1ULL)) {
+ error_report("Fail to invalidate device iotlb");
+ }
+
/* Can be read by vhost_virtqueue_mask, from vm exit */
qatomic_store_release(&dev->shadow_vqs_enabled, true);
for (idx = 0; idx < dev->nvqs; ++idx) {
- bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
+ bool ok = vhost_sw_live_migration_start_vq(dev, idx);
if (unlikely(!ok)) {
goto err_start;
}
}
+ /* Enable shadow vq vring */
+ dev->vhost_ops->vhost_set_vring_enable(dev, true);
return 0;
err_start:
qatomic_store_release(&dev->shadow_vqs_enabled, false);
for (stop_idx = 0; stop_idx < idx; stop_idx++) {
- vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]);
+ /* Roll back queue stop_idx; idx is the queue that failed to start */
+ vhost_shadow_vq_stop(dev, stop_idx, dev->shadow_vqs[stop_idx]);
+ vhost_virtqueue_start(dev, dev->vdev, &dev->vqs[stop_idx],
+ dev->vq_index + stop_idx);
}
err_new:
+ /* Enable guest's vring */
+ dev->vhost_ops->vhost_set_vring_enable(dev, true);
for (idx = 0; idx < dev->nvqs; ++idx) {
vhost_shadow_vq_free(dev->shadow_vqs[idx]);
}
@@ -1970,6 +2067,20 @@ void qmp_x_vhost_enable_shadow_vq(const char *name, bool
enable, Error **errp)
if (!hdev->started) {
err_cause = "Device is not started";
+ } else if (!vhost_dev_has_iommu(hdev)) {
+ err_cause = "Does not support iommu";
+ } else if (hdev->acked_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
+ err_cause = "Is packed";
+ } else if (hdev->acked_features & BIT_ULL(VIRTIO_RING_F_EVENT_IDX)) {
+ err_cause = "Have event idx";
+ } else if (hdev->acked_features &
+ BIT_ULL(VIRTIO_RING_F_INDIRECT_DESC)) {
+ err_cause = "Supports indirect descriptors";
+ } else if (!hdev->vhost_ops->vhost_set_vring_enable) {
+ err_cause = "Cannot pause device";
+ }
+
+ if (err_cause) {
goto err;
}