qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [RFC PATCH V2 2/2] vhost: device IOTLB API


From: Michael S. Tsirkin
Subject: Re: [Qemu-devel] [RFC PATCH V2 2/2] vhost: device IOTLB API
Date: Wed, 27 Apr 2016 14:45:56 +0300

On Fri, Mar 25, 2016 at 10:34:34AM +0800, Jason Wang wrote:
> This patch tries to implement an device IOTLB for vhost. This could be
> used with for co-operation with userspace(qemu) implementation of DMA
> remapping.
> 
> The idea is simple. When vhost meets an IOTLB miss, it will request
> the assistance of userspace to do the translation, this is done
> through:
> 
> - Fill the translation request in a preset userspace address (This
>   address is set through ioctl VHOST_SET_IOTLB_REQUEST_ENTRY).
> - Notify userspace through eventfd (This eventfd was set through ioctl
>   VHOST_SET_IOTLB_FD).

Why use an eventfd for this? We use them for interrupts because
that happens to be what kvm wants, but here - why don't we
just add generic support for reading out events
on the vhost fd itself?

> - device IOTLB were started and stopped through VHOST_RUN_IOTLB ioctl
> 
> When userspace finishes the translation, it will update the vhost
> IOTLB through VHOST_UPDATE_IOTLB ioctl. Userspace is also in charge of
> snooping the IOTLB invalidation of IOMMU IOTLB and use
> VHOST_UPDATE_IOTLB to invalidate the possible entry in vhost.

There's one problem here, and that is that VQs still do not undergo
translation.  In theory a VQ could be mapped in such a way
that it's not contiguous in userspace memory.


> Signed-off-by: Jason Wang <address@hidden>

What limits the number of entries that the kernel keeps around?
Do we want at least a module parameter for this?


> ---
>  drivers/vhost/net.c        |   6 +-
>  drivers/vhost/vhost.c      | 301 
> +++++++++++++++++++++++++++++++++++++++------
>  drivers/vhost/vhost.h      |  17 ++-
>  fs/eventfd.c               |   3 +-
>  include/uapi/linux/vhost.h |  35 ++++++
>  5 files changed, 320 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 481db96..7cbdeed 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -334,7 +334,7 @@ static void handle_tx(struct vhost_net *net)
>               head = vhost_get_vq_desc(vq, vq->iov,
>                                        ARRAY_SIZE(vq->iov),
>                                        &out, &in,
> -                                      NULL, NULL);
> +                                      NULL, NULL, VHOST_ACCESS_RO);
>               /* On error, stop handling until the next kick. */
>               if (unlikely(head < 0))
>                       break;
> @@ -470,7 +470,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>               }
>               r = vhost_get_vq_desc(vq, vq->iov + seg,
>                                     ARRAY_SIZE(vq->iov) - seg, &out,
> -                                   &in, log, log_num);
> +                                   &in, log, log_num, VHOST_ACCESS_WO);
>               if (unlikely(r < 0))
>                       goto err;
>  
> @@ -1083,7 +1083,7 @@ static long vhost_net_ioctl(struct file *f, unsigned 
> int ioctl,
>               r = vhost_dev_ioctl(&n->dev, ioctl, argp);
>               if (r == -ENOIOCTLCMD)
>                       r = vhost_vring_ioctl(&n->dev, ioctl, argp);
> -             else
> +             else if (ioctl != VHOST_UPDATE_IOTLB)
>                       vhost_net_flush(n);
>               mutex_unlock(&n->dev.mutex);
>               return r;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 32c35a9..1dd43e8 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -280,6 +280,10 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>       vq->call_ctx = NULL;
>       vq->call = NULL;
>       vq->log_ctx = NULL;
> +     vq->iotlb_call = NULL;
> +     vq->iotlb_call_ctx = NULL;
> +     vq->iotlb_request = NULL;
> +     vq->pending_request.flags.type = VHOST_IOTLB_INVALIDATE;
>       vq->umem = NULL;
>       vq->is_le = virtio_legacy_is_little_endian();
>       vhost_vq_reset_user_be(vq);
> @@ -387,8 +391,10 @@ void vhost_dev_init(struct vhost_dev *dev,
>       dev->log_ctx = NULL;
>       dev->log_file = NULL;
>       dev->umem = NULL;
> +     dev->iotlb = NULL;
>       dev->mm = NULL;
>       spin_lock_init(&dev->work_lock);
> +     spin_lock_init(&dev->iotlb_lock);
>       INIT_LIST_HEAD(&dev->work_list);
>       dev->worker = NULL;
>  
> @@ -537,6 +543,15 @@ void vhost_dev_stop(struct vhost_dev *dev)
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_stop);
>  
> +static void vhost_umem_free(struct vhost_umem *umem,
> +                         struct vhost_umem_node *node)
> +{
> +     vhost_umem_interval_tree_remove(node, &umem->umem_tree);
> +     list_del(&node->link);
> +     kfree(node);
> +     umem->numem--;
> +}
> +
>  static void vhost_umem_clean(struct vhost_umem *umem)
>  {
>       struct vhost_umem_node *node, *tmp;
> @@ -544,11 +559,9 @@ static void vhost_umem_clean(struct vhost_umem *umem)
>       if (!umem)
>               return;
>  
> -     list_for_each_entry_safe(node, tmp, &umem->umem_list, link) {
> -             vhost_umem_interval_tree_remove(node, &umem->umem_tree);
> -             list_del(&node->link);
> -             kvfree(node);
> -     }
> +     list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
> +             vhost_umem_free(umem, node);
> +
>       kvfree(umem);
>  }
>  
> @@ -580,6 +593,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
>       /* No one will access memory at this point */
>       vhost_umem_clean(dev->umem);
>       dev->umem = NULL;
> +     vhost_umem_clean(dev->iotlb);
> +     dev->iotlb = NULL;
>       WARN_ON(!list_empty(&dev->work_list));
>       if (dev->worker) {
>               kthread_stop(dev->worker);
> @@ -699,11 +714,61 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
>  }
>  EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
>  
> +static int vhost_new_umem_range(struct vhost_umem *umem,
> +                             u64 start, u64 size, u64 end,
> +                             u64 userspace_addr, int perm)
> +{
> +     struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC);
> +
> +     if (!node)
> +             return -ENOMEM;
> +
> +     if (umem->numem == VHOST_IOTLB_SIZE) {
> +             tmp = list_last_entry(&umem->umem_list, typeof(*tmp), link);
> +             vhost_umem_free(umem, tmp);
> +     }
> +
> +     node->start = start;
> +     node->size = size;
> +     node->last = end;
> +     node->userspace_addr = userspace_addr;
> +     node->perm = perm;
> +     INIT_LIST_HEAD(&node->link);
> +     list_add_tail(&node->link, &umem->umem_list);
> +     vhost_umem_interval_tree_insert(node, &umem->umem_tree);
> +     umem->numem++;
> +
> +     return 0;
> +}
> +
> +static void vhost_del_umem_range(struct vhost_umem *umem,
> +                              u64 start, u64 end)
> +{
> +     struct vhost_umem_node *node;
> +
> +     while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
> +                                                        start, end)))
> +             vhost_umem_free(umem, node);
> +}
> +
> +static struct vhost_umem *vhost_umem_alloc(void)
> +{
> +     struct vhost_umem *umem = vhost_kvzalloc(sizeof(*umem));
> +
> +     if (!umem)
> +             return NULL;
> +
> +     umem->umem_tree = RB_ROOT;
> +     umem->numem = 0;
> +     INIT_LIST_HEAD(&umem->umem_list);
> +
> +     return umem;
> +}
> +
>  static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user 
> *m)
>  {
>       struct vhost_memory mem, *newmem;
>       struct vhost_memory_region *region;
> -     struct vhost_umem_node *node;
>       struct vhost_umem *newumem, *oldumem;
>       unsigned long size = offsetof(struct vhost_memory, regions);
>       int i;
> @@ -725,28 +790,23 @@ static long vhost_set_memory(struct vhost_dev *d, 
> struct vhost_memory __user *m)
>               return -EFAULT;
>       }
>  
> -     newumem = vhost_kvzalloc(sizeof(*newumem));
> +     newumem = vhost_umem_alloc();
>       if (!newumem) {
>               kvfree(newmem);
>               return -ENOMEM;
>       }
>  
> -     newumem->umem_tree = RB_ROOT;
> -     INIT_LIST_HEAD(&newumem->umem_list);
> -
>       for (region = newmem->regions;
>            region < newmem->regions + mem.nregions;
>            region++) {
> -             node = vhost_kvzalloc(sizeof(*node));
> -             if (!node)
> +             if (vhost_new_umem_range(newumem,
> +                                      region->guest_phys_addr,
> +                                      region->memory_size,
> +                                      region->guest_phys_addr +
> +                                      region->memory_size - 1,
> +                                      region->userspace_addr,
> +                                      VHOST_ACCESS_RW))
>                       goto err;
> -             node->start = region->guest_phys_addr;
> -             node->size = region->memory_size;
> -             node->last = node->start + node->size - 1;
> -             node->userspace_addr = region->userspace_addr;
> -             INIT_LIST_HEAD(&node->link);
> -             list_add_tail(&node->link, &newumem->umem_list);
> -             vhost_umem_interval_tree_insert(node, &newumem->umem_tree);
>       }
>  
>       if (!memory_access_ok(d, newumem, 0))
> @@ -782,6 +842,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, 
> void __user *argp)
>       struct vhost_vring_state s;
>       struct vhost_vring_file f;
>       struct vhost_vring_addr a;
> +     struct vhost_vring_iotlb_entry e;
>       u32 idx;
>       long r;
>  
> @@ -910,6 +971,35 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, 
> void __user *argp)
>               } else
>                       filep = eventfp;
>               break;
> +     case VHOST_SET_VRING_IOTLB_REQUEST:
> +             r = -EFAULT;
> +             if (copy_from_user(&e, argp, sizeof e))
> +                     break;
> +             if (!access_ok(VERIFY_WRITE, e.userspace_addr,
> +                             sizeof(*vq->iotlb_request)))
> +                     break;
> +             r = 0;
> +             vq->iotlb_request = (struct vhost_iotlb_entry __user 
> *)e.userspace_addr;
> +             break;
> +     case VHOST_SET_VRING_IOTLB_CALL:
> +             if (copy_from_user(&f, argp, sizeof f)) {
> +                     r = -EFAULT;
> +                     break;
> +             }
> +             eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
> +             if (IS_ERR(eventfp)) {
> +                     r = PTR_ERR(eventfp);
> +                     break;
> +             }
> +             if (eventfp != vq->iotlb_call) {
> +                     filep = vq->iotlb_call;
> +                     ctx = vq->iotlb_call_ctx;
> +                     vq->iotlb_call = eventfp;
> +                     vq->iotlb_call_ctx = eventfp ?
> +                             eventfd_ctx_fileget(eventfp) : NULL;
> +             } else
> +                     filep = eventfp;
> +             break;
>       case VHOST_SET_VRING_CALL:
>               if (copy_from_user(&f, argp, sizeof f)) {
>                       r = -EFAULT;
> @@ -977,11 +1067,55 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, 
> void __user *argp)
>  }
>  EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
>  
> +static int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
> +{
> +     struct vhost_umem *niotlb, *oiotlb;
> +
> +     if (enabled) {
> +             niotlb = vhost_umem_alloc();
> +             if (!niotlb)
> +                     return -ENOMEM;
> +     } else
> +             niotlb = NULL;
> +
> +     spin_lock(&d->iotlb_lock);
> +     oiotlb = d->iotlb;
> +     d->iotlb = niotlb;
> +     spin_unlock(&d->iotlb_lock);
> +
> +     vhost_umem_clean(oiotlb);
> +
> +     return 0;
> +}
> +
> +static void vhost_complete_iotlb_update(struct vhost_dev *d,
> +                                     struct vhost_iotlb_entry *entry)
> +{
> +     struct vhost_iotlb_entry *req;
> +     struct vhost_virtqueue *vq;
> +     int i;
> +
> +
> +     for (i = 0; i < d->nvqs; i++) {
> +             vq = d->vqs[i];
> +             mutex_lock(&vq->mutex);
> +             req = &vq->pending_request;
> +             if (entry->iova <= req->iova &&
> +                 entry->iova + entry->size - 1 > req->iova &&
> +                 req->flags.type == VHOST_IOTLB_MISS) {
> +                     *req = *entry;
> +                     vhost_poll_queue(&vq->poll);
> +             }
> +             mutex_unlock(&vq->mutex);
> +     }
> +}
> +
>  /* Caller must have device mutex */
>  long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user 
> *argp)
>  {
>       struct file *eventfp, *filep = NULL;
>       struct eventfd_ctx *ctx = NULL;
> +     struct vhost_iotlb_entry entry;
>       u64 p;
>       long r;
>       int i, fd;
> @@ -1050,6 +1184,52 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int 
> ioctl, void __user *argp)
>               if (filep)
>                       fput(filep);
>               break;
> +     case VHOST_RUN_IOTLB:
> +             /* FIXME: enable and disabled */
> +             vhost_init_device_iotlb(d, true);
> +             break;
> +     case VHOST_UPDATE_IOTLB:
> +             r = copy_from_user(&entry, argp, sizeof(entry));
> +             if (r < 0) {
> +                     r = -EFAULT;
> +                     goto done;
> +             }
> +
> +             spin_lock(&d->iotlb_lock);
> +             if (!d->iotlb) {
> +                     spin_unlock(&d->iotlb_lock);
> +                     r = -EFAULT;
> +                     goto done;
> +             }
> +             switch (entry.flags.type) {
> +             case VHOST_IOTLB_UPDATE:
> +                     if (entry.flags.valid != VHOST_IOTLB_VALID) {
> +                             break;
> +                     }
> +                     if (vhost_new_umem_range(d->iotlb,
> +                                              entry.iova,
> +                                              entry.size,
> +                                              entry.iova + entry.size - 1,
> +                                                 entry.userspace_addr,
> +                                                 entry.flags.perm)) {
> +                             r = -ENOMEM;
> +                             break;
> +                     }
> +                     break;
> +             case VHOST_IOTLB_INVALIDATE:
> +                     vhost_del_umem_range(d->iotlb,
> +                                          entry.iova,
> +                                          entry.iova + entry.size - 1);
> +                     break;
> +             default:
> +                     r = -EINVAL;
> +             }
> +             spin_unlock(&d->iotlb_lock);
> +
> +             if (!r && entry.flags.type != VHOST_IOTLB_INVALIDATE)
> +                     vhost_complete_iotlb_update(d, &entry);
> +
> +             break;
>       default:
>               r = -ENOIOCTLCMD;
>               break;
> @@ -1197,27 +1377,69 @@ int vhost_init_used(struct vhost_virtqueue *vq)
>  }
>  EXPORT_SYMBOL_GPL(vhost_init_used);
>  
> +static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova)
> +{
> +     struct vhost_iotlb_entry *pending = &vq->pending_request;
> +     int ret;
> +
> +     if (!vq->iotlb_call_ctx)
> +             return -EOPNOTSUPP;
> +
> +     if (!vq->iotlb_request)
> +             return -EOPNOTSUPP;
> +
> +     if (pending->flags.type == VHOST_IOTLB_MISS) {
> +             return -EEXIST;
> +     }
> +
> +     pending->iova = iova;
> +     pending->flags.type = VHOST_IOTLB_MISS;
> +
> +     ret = __copy_to_user(vq->iotlb_request, pending,
> +                          sizeof(struct vhost_iotlb_entry));
> +     if (ret) {
> +             goto err;
> +     }
> +
> +     if (vq->iotlb_call_ctx)
> +             eventfd_signal(vq->iotlb_call_ctx, 1);
> +err:
> +     return ret;
> +}
> +
>  static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
> -                       struct iovec iov[], int iov_size)
> +                       struct iovec iov[], int iov_size, int access)
>  {
>       const struct vhost_umem_node *node;
> -     struct vhost_umem *umem = vq->umem;
> +     struct vhost_dev *dev = vq->dev;
> +     struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
>       struct iovec *_iov;
>       u64 s = 0;
>       int ret = 0;
>  
> +     spin_lock(&dev->iotlb_lock);
> +
>       while ((u64)len > s) {
>               u64 size;
>               if (unlikely(ret >= iov_size)) {
>                       ret = -ENOBUFS;
>                       break;
>               }
> +
>               node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
>                                                       addr, addr + len - 1);
>               if (node == NULL || node->start > addr) {
> -                     ret = -EFAULT;
> +                     if (umem != dev->iotlb) {
> +                             ret = -EFAULT;
> +                             break;
> +                     }
> +                     ret = -EAGAIN;
> +                     break;
> +             } else if (!(node->perm & access)) {
> +                     ret = -EPERM;
>                       break;
>               }
> +
>               _iov = iov + ret;
>               size = node->size - addr + node->start;
>               _iov->iov_len = min((u64)len - s, size);
> @@ -1228,6 +1450,10 @@ static int translate_desc(struct vhost_virtqueue *vq, 
> u64 addr, u32 len,
>               ++ret;
>       }
>  
> +     spin_unlock(&dev->iotlb_lock);
> +
> +     if (ret == -EAGAIN)
> +             vhost_iotlb_miss(vq, addr);
>       return ret;
>  }
>  
> @@ -1256,7 +1482,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
>                       struct iovec iov[], unsigned int iov_size,
>                       unsigned int *out_num, unsigned int *in_num,
>                       struct vhost_log *log, unsigned int *log_num,
> -                     struct vring_desc *indirect)
> +                     struct vring_desc *indirect, int access)
>  {
>       struct vring_desc desc;
>       unsigned int i = 0, count, found = 0;
> @@ -1274,9 +1500,10 @@ static int get_indirect(struct vhost_virtqueue *vq,
>       }
>  
>       ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, 
> vq->indirect,
> -                          UIO_MAXIOV);
> +                          UIO_MAXIOV, access);
>       if (unlikely(ret < 0)) {
> -             vq_err(vq, "Translation failure %d in indirect.\n", ret);
> +             if (ret != -EAGAIN)
> +                     vq_err(vq, "Translation failure %d in indirect.\n", 
> ret);
>               return ret;
>       }
>       iov_iter_init(&from, READ, vq->indirect, ret, len);
> @@ -1316,10 +1543,11 @@ static int get_indirect(struct vhost_virtqueue *vq,
>  
>               ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
>                                    vhost32_to_cpu(vq, desc.len), iov + 
> iov_count,
> -                                  iov_size - iov_count);
> +                                  iov_size - iov_count, access);
>               if (unlikely(ret < 0)) {
> -                     vq_err(vq, "Translation failure %d indirect idx %d\n",
> -                            ret, i);
> +                     if (ret != -EAGAIN)
> +                             vq_err(vq, "Translation failure %d indirect idx 
> %d\n",
> +                                     ret, i);
>                       return ret;
>               }
>               /* If this is an input descriptor, increment that count. */
> @@ -1355,7 +1583,8 @@ static int get_indirect(struct vhost_virtqueue *vq,
>  int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>                     struct iovec iov[], unsigned int iov_size,
>                     unsigned int *out_num, unsigned int *in_num,
> -                   struct vhost_log *log, unsigned int *log_num)
> +                   struct vhost_log *log, unsigned int *log_num,
> +                   int access)
>  {
>       struct vring_desc desc;
>       unsigned int i, head, found = 0;
> @@ -1433,10 +1662,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>               if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
>                       ret = get_indirect(vq, iov, iov_size,
>                                          out_num, in_num,
> -                                        log, log_num, &desc);
> +                                        log, log_num, &desc, access);
>                       if (unlikely(ret < 0)) {
> -                             vq_err(vq, "Failure detected "
> -                                    "in indirect descriptor at idx %d\n", i);
> +                             if (ret != -EAGAIN)
> +                                     vq_err(vq, "Failure detected "
> +                                             "in indirect descriptor at idx 
> %d\n", i);
>                               return ret;
>                       }
>                       continue;
> @@ -1444,10 +1674,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>  
>               ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
>                                    vhost32_to_cpu(vq, desc.len), iov + 
> iov_count,
> -                                  iov_size - iov_count);
> +                                  iov_size - iov_count, access);
>               if (unlikely(ret < 0)) {
> -                     vq_err(vq, "Translation failure %d descriptor idx %d\n",
> -                            ret, i);
> +                     if (ret != -EAGAIN)
> +                             vq_err(vq, "Translation failure %d descriptor 
> idx %d\n",
> +                                     ret, i);
>                       return ret;
>               }
>               if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 5d64393..4365104 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -62,13 +62,15 @@ struct vhost_umem_node {
>       __u64 last;
>       __u64 size;
>       __u64 userspace_addr;
> -     __u64 flags_padding;
> +     __u32 perm;
> +     __u32 flags_padding;
>       __u64 __subtree_last;
>  };
>  
>  struct vhost_umem {
>       struct rb_root umem_tree;
>       struct list_head umem_list;
> +     int numem;
>  };
>  
>  /* The virtqueue structure describes a queue attached to a device. */
> @@ -84,9 +86,13 @@ struct vhost_virtqueue {
>       struct file *kick;
>       struct file *call;
>       struct file *error;
> +     struct file *iotlb_call;
>       struct eventfd_ctx *call_ctx;
>       struct eventfd_ctx *error_ctx;
>       struct eventfd_ctx *log_ctx;
> +     struct eventfd_ctx *iotlb_call_ctx;
> +     struct vhost_iotlb_entry __user *iotlb_request;
> +     struct vhost_iotlb_entry pending_request;
>  
>       struct vhost_poll poll;
>  
> @@ -135,6 +141,8 @@ struct vhost_virtqueue {
>  #endif
>  };
>  
> +#define VHOST_IOTLB_SIZE 2048
> +
>  struct vhost_dev {
>       struct mm_struct *mm;
>       struct mutex mutex;
> @@ -146,6 +154,8 @@ struct vhost_dev {
>       struct list_head work_list;
>       struct task_struct *worker;
>       struct vhost_umem *umem;
> +     struct vhost_umem *iotlb;
> +     spinlock_t iotlb_lock;
>  };
>  
>  void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int 
> nvqs);
> @@ -164,7 +174,8 @@ int vhost_log_access_ok(struct vhost_dev *);
>  int vhost_get_vq_desc(struct vhost_virtqueue *,
>                     struct iovec iov[], unsigned int iov_count,
>                     unsigned int *out_num, unsigned int *in_num,
> -                   struct vhost_log *log, unsigned int *log_num);
> +                   struct vhost_log *log, unsigned int *log_num,
> +                   int access);
>  void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>  
>  int vhost_init_used(struct vhost_virtqueue *);
> @@ -183,7 +194,7 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct 
> vhost_log *log,
>                   unsigned int log_num, u64 len);
>  
>  #define vq_err(vq, fmt, ...) do {                                  \
> -             pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> +             printk(pr_fmt(fmt), ##__VA_ARGS__);       \
>               if ((vq)->error_ctx)                               \
>                               eventfd_signal((vq)->error_ctx, 1);\
>       } while (0)
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index 8d0c0df..5c0a22f 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -59,8 +59,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>       if (ULLONG_MAX - ctx->count < n)
>               n = ULLONG_MAX - ctx->count;
>       ctx->count += n;
> -     if (waitqueue_active(&ctx->wqh))
> +     if (waitqueue_active(&ctx->wqh)) {
>               wake_up_locked_poll(&ctx->wqh, POLLIN);
> +     }
>       spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>  
>       return n;
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index ab373191..5c35ab4 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -47,6 +47,32 @@ struct vhost_vring_addr {
>       __u64 log_guest_addr;
>  };
>  
> +struct vhost_iotlb_entry {
> +     __u64 iova;
> +     __u64 size;
> +     __u64 userspace_addr;

Alignment requirements?

> +     struct {
> +#define VHOST_ACCESS_RO      0x1
> +#define VHOST_ACCESS_WO      0x2
> +#define VHOST_ACCESS_RW      0x3
> +             __u8  perm;
> +#define VHOST_IOTLB_MISS           1
> +#define VHOST_IOTLB_UPDATE         2
> +#define VHOST_IOTLB_INVALIDATE     3
> +             __u8  type;
> +#define VHOST_IOTLB_INVALID        0x1
> +#define VHOST_IOTLB_VALID          0x2
> +             __u8  valid;

why do we need this flag?

> +             __u8  u8_padding;
> +             __u32 padding;
> +     } flags;
> +};
> +
> +struct vhost_vring_iotlb_entry {
> +     unsigned int index;
> +     __u64 userspace_addr;
> +};
> +
>  struct vhost_memory_region {
>       __u64 guest_phys_addr;
>       __u64 memory_size; /* bytes */
> @@ -127,6 +153,15 @@ struct vhost_memory {
>  /* Set eventfd to signal an error */
>  #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
>  
> +/* IOTLB */
> +/* Specify an eventfd file descriptor to signle on IOTLB miss */

typo: "signle" should be "signal"

> +#define VHOST_SET_VRING_IOTLB_CALL _IOW(VHOST_VIRTIO, 0x23, struct      \
> +                                        vhost_vring_file)
> +#define VHOST_SET_VRING_IOTLB_REQUEST _IOW(VHOST_VIRTIO, 0x25, struct   \
> +                                           vhost_vring_iotlb_entry)
> +#define VHOST_UPDATE_IOTLB _IOW(VHOST_VIRTIO, 0x24, struct vhost_iotlb_entry)
> +#define VHOST_RUN_IOTLB _IOW(VHOST_VIRTIO, 0x26, int)
> +

Is the assumption that userspace must dedicate a thread to running the iotlb?
I dislike this one.
Please support asynchronous APIs, at least optionally, to let
userspace make its own threading decisions.

>  /* VHOST_NET specific defines */
>  
>  /* Attach virtio net ring to a raw socket, or tap device.

Don't we need a feature bit for this?
Are we short on feature bits? If yes maybe it's time to add
something like PROTOCOL_FEATURES that we have in vhost-user.

> -- 
> 2.5.0



reply via email to

[Prev in Thread] Current Thread [Next in Thread]