From: Jike Song
Subject: Re: [Qemu-devel] [PATCH v7 3/4] vfio iommu: Add support for mediated devices
Date: Thu, 29 Sep 2016 10:17:23 +0800
User-agent: Mozilla/5.0 (X11; Linux i686 on x86_64; rv:17.0) Gecko/20130801 Thunderbird/17.0.8
+Guangrong
On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for devices which are IOMMU capable.
> Mediated devices only use the IOMMU APIs; the underlying hardware can be
> managed by an IOMMU domain.
>
> The aim of this change is:
> - To reuse most of the TYPE1 IOMMU driver code for mediated devices
> - To support directly assigned devices and mediated devices in a single module
>
> Added two new callback functions to struct vfio_iommu_driver_ops. A backend
> IOMMU module that supports pinning and unpinning pages for mdev devices
> should provide these functions.
> Added APIs for pinning and unpinning pages to the VFIO module. These call
> back into the backend iommu module to actually pin and unpin pages.
>
> This change adds pin and unpin support for mediated devices to the TYPE1
> IOMMU backend module. More details:
> - When an iommu_group of mediated devices is attached, the task structure
>   is cached; it is used later for pinning pages and page accounting.
> - It keeps track of pinned pages for the mediated domain. This data is used
>   to verify unpinning requests and to unpin any remaining pages while
>   detaching.
> - Page accounting uses the existing mechanism. If an iommu capable domain
>   exists in the container then all pages are already pinned and accounted.
>   Accounting for an mdev device is only done if there is no iommu capable
>   domain in the container.
>
> Tested by assigning the following combinations of devices to a single VM:
> - GPU pass-through only
> - vGPU device only
> - One GPU pass-through and one vGPU device
> - Two GPU pass-through devices
>
> Signed-off-by: Kirti Wankhede <address@hidden>
> Signed-off-by: Neo Jia <address@hidden>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> Reviewed-on: http://git-master/r/1175707
> Reviewed-by: Automatic_Commit_Validation_User
> ---
>  drivers/vfio/vfio.c             | 117 ++++++++++
>  drivers/vfio/vfio_iommu_type1.c | 498 ++++++++++++++++++++++++++++++++++++----
>  include/linux/vfio.h            |  13 +-
>  3 files changed, 580 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 6fd6fa5469de..e3e342861e04 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1782,6 +1782,123 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> }
> EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
>
> +static struct vfio_group *vfio_group_from_dev(struct device *dev)
> +{
> + struct vfio_device *device;
> + struct vfio_group *group;
> + int ret;
> +
> + device = vfio_device_get_from_dev(dev);
> + if (!device)
> + return ERR_PTR(-EINVAL);
> +
> + group = device->group;
> + if (!atomic_inc_not_zero(&group->container_users)) {
> + ret = -EINVAL;
> + goto err_ret;
> + }
> +
> + if (group->noiommu) {
> + atomic_dec(&group->container_users);
> + ret = -EPERM;
> + goto err_ret;
> + }
> +
> + if (!group->container->iommu_driver ||
> + !vfio_group_viable(group)) {
> + atomic_dec(&group->container_users);
> + ret = -EINVAL;
> + goto err_ret;
> + }
> +
> + vfio_device_put(device);
> + return group;
> +
> +err_ret:
> + vfio_device_put(device);
> + return ERR_PTR(ret);
> +}
> +
> +/*
> + * Pin a set of guest PFNs and return their associated host PFNs for local
> + * domain only.
> + * @dev [in] : device
> + * @user_pfn [in]: array of user/guest PFNs
> + * @npage [in]: count of array elements
> + * @prot [in] : protection flags
> + * @phys_pfn[out] : array of host PFNs
> + */
> +long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> + long npage, int prot, unsigned long *phys_pfn)
> +{
> + struct vfio_container *container;
> + struct vfio_group *group;
> + struct vfio_iommu_driver *driver;
> + ssize_t ret = -EINVAL;
> +
> + if (!dev || !user_pfn || !phys_pfn)
> + return -EINVAL;
> +
> + group = vfio_group_from_dev(dev);
> + if (IS_ERR(group))
> + return PTR_ERR(group);
> +
> + container = group->container;
> + if (IS_ERR(container))
> + return PTR_ERR(container);
> +
> + down_read(&container->group_lock);
> +
> + driver = container->iommu_driver;
> + if (likely(driver && driver->ops->pin_pages))
> + ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
> + npage, prot, phys_pfn);
> +
> + up_read(&container->group_lock);
> + vfio_group_try_dissolve_container(group);
> +
> + return ret;
> +
> +}
> +EXPORT_SYMBOL(vfio_pin_pages);
> +
> +/*
> + * Unpin a set of host PFNs for local domain only.
> + * @dev [in] : device
> + * @pfn [in] : array of host PFNs to be unpinned.
> + * @npage [in] : count of elements in array, that is number of pages.
> + */
> +long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage)
> +{
> + struct vfio_container *container;
> + struct vfio_group *group;
> + struct vfio_iommu_driver *driver;
> + ssize_t ret = -EINVAL;
> +
> + if (!dev || !pfn)
> + return -EINVAL;
> +
> + group = vfio_group_from_dev(dev);
> + if (IS_ERR(group))
> + return PTR_ERR(group);
> +
> + container = group->container;
> + if (IS_ERR(container))
> + return PTR_ERR(container);
> +
> + down_read(&container->group_lock);
> +
> + driver = container->iommu_driver;
> + if (likely(driver && driver->ops->unpin_pages))
> + ret = driver->ops->unpin_pages(container->iommu_data, pfn,
> + npage);
> +
> + up_read(&container->group_lock);
> + vfio_group_try_dissolve_container(group);
> + return ret;
> +}
> +EXPORT_SYMBOL(vfio_unpin_pages);
> +
> /**
> * Module/class support
> */
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..d52d75fd0f04 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages,
>
> struct vfio_iommu {
> struct list_head domain_list;
> + struct vfio_domain *local_domain;
> struct mutex lock;
> struct rb_root dma_list;
> bool v2;
> bool nesting;
> };
>
> +struct local_addr_space {
> + struct task_struct *task;
> + struct rb_root pfn_list; /* pinned Host pfn list */
> + struct mutex pfn_list_lock; /* mutex for pfn_list */
> +};
> +
> struct vfio_domain {
> struct iommu_domain *domain;
> struct list_head next;
> struct list_head group_list;
> int prot; /* IOMMU_CACHE */
> bool fgsp; /* Fine-grained super pages */
> + struct local_addr_space *local_addr_space;
> };
>
> struct vfio_dma {
> @@ -83,6 +91,22 @@ struct vfio_group {
> };
>
> /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> + struct rb_node node;
> + unsigned long vaddr; /* virtual addr */
> + dma_addr_t iova; /* IOVA */
> + unsigned long pfn; /* Host pfn */
> + size_t prot;
> + atomic_t ref_count;
> +};
> +
> +
> +#define IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu) \
> + (list_empty(&iommu->domain_list) ? false : true)
> +
> +/*
> * This code handles mapping and unmapping of user data buffers
> * into DMA'ble space using the IOMMU
> */
> @@ -130,6 +154,84 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> rb_erase(&old->node, &iommu->dma_list);
> }
>
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> + unsigned long pfn)
> +{
> + struct rb_node *node;
> + struct vfio_pfn *vpfn, *ret = NULL;
> +
> + node = domain->local_addr_space->pfn_list.rb_node;
> +
> + while (node) {
> + vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> + if (pfn < vpfn->pfn)
> + node = node->rb_left;
> + else if (pfn > vpfn->pfn)
> + node = node->rb_right;
> + else {
> + ret = vpfn;
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> + struct rb_node **link, *parent = NULL;
> + struct vfio_pfn *vpfn;
> +
> + link = &domain->local_addr_space->pfn_list.rb_node;
> + while (*link) {
> + parent = *link;
> + vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> + if (new->pfn < vpfn->pfn)
> + link = &(*link)->rb_left;
> + else
> + link = &(*link)->rb_right;
> + }
> +
> + rb_link_node(&new->node, parent, link);
> + rb_insert_color(&new->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> + rb_erase(&old->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> + dma_addr_t iova, unsigned long pfn, size_t prot)
> +{
> + struct vfio_pfn *vpfn;
> +
> + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> + if (!vpfn)
> + return -ENOMEM;
> +
> + vpfn->vaddr = vaddr;
> + vpfn->iova = iova;
> + vpfn->pfn = pfn;
> + vpfn->prot = prot;
> + atomic_set(&vpfn->ref_count, 1);
> + vfio_link_pfn(domain, vpfn);
> + return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> + struct vfio_pfn *vpfn)
> +{
> + vfio_unlink_pfn(domain, vpfn);
> + kfree(vpfn);
> +}
> +
> struct vwork {
> struct mm_struct *mm;
> long npage;
> @@ -150,17 +252,17 @@ static void vfio_lock_acct_bg(struct work_struct *work)
> kfree(vwork);
> }
>
> -static void vfio_lock_acct(long npage)
> +static void vfio_lock_acct(struct task_struct *task, long npage)
> {
> struct vwork *vwork;
> struct mm_struct *mm;
>
> - if (!current->mm || !npage)
> + if (!task->mm || !npage)
> return; /* process exited or nothing to do */
>
> - if (down_write_trylock(&current->mm->mmap_sem)) {
> - current->mm->locked_vm += npage;
> - up_write(&current->mm->mmap_sem);
> + if (down_write_trylock(&task->mm->mmap_sem)) {
> + task->mm->locked_vm += npage;
> + up_write(&task->mm->mmap_sem);
> return;
> }
>
> @@ -172,7 +274,7 @@ static void vfio_lock_acct(long npage)
> vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> if (!vwork)
> return;
> - mm = get_task_mm(current);
> + mm = get_task_mm(task);
> if (!mm) {
> kfree(vwork);
> return;
> @@ -228,20 +330,31 @@ static int put_pfn(unsigned long pfn, int prot)
> return 0;
> }
>
> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> + int prot, unsigned long *pfn)
> {
> struct page *page[1];
> struct vm_area_struct *vma;
> + struct mm_struct *local_mm = (mm ? mm : current->mm);
> int ret = -EFAULT;
>
> - if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> + if (mm) {
> + down_read(&local_mm->mmap_sem);
> + ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
> + !!(prot & IOMMU_WRITE), 0, page, NULL);
> + up_read(&local_mm->mmap_sem);
> + } else
> + ret = get_user_pages_fast(vaddr, 1,
> + !!(prot & IOMMU_WRITE), page);
> +
> + if (ret == 1) {
> *pfn = page_to_pfn(page[0]);
> return 0;
> }
>
> - down_read(&current->mm->mmap_sem);
> + down_read(&local_mm->mmap_sem);
>
> - vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> + vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
>
> if (vma && vma->vm_flags & VM_PFNMAP) {
> *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> @@ -249,7 +362,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> ret = 0;
> }
>
> - up_read(&current->mm->mmap_sem);
> + up_read(&local_mm->mmap_sem);
>
> return ret;
> }
> @@ -259,8 +372,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> * the iommu can only map chunks of consecutive pfns anyway, so get the
> * first page and all consecutive pages with the same locking.
> */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> - int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
> + int prot, unsigned long *pfn_base)
> {
> unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> bool lock_cap = capable(CAP_IPC_LOCK);
> @@ -270,7 +383,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> if (!current->mm)
> return -ENODEV;
>
> - ret = vaddr_get_pfn(vaddr, prot, pfn_base);
> + ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base);
> if (ret)
> return ret;
>
> @@ -285,7 +398,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>
> if (unlikely(disable_hugepages)) {
> if (!rsvd)
> - vfio_lock_acct(1);
> + vfio_lock_acct(current, 1);
> return 1;
> }
>
> @@ -293,7 +406,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
> unsigned long pfn = 0;
>
> - ret = vaddr_get_pfn(vaddr, prot, &pfn);
> + ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn);
> if (ret)
> break;
>
> @@ -313,13 +426,13 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> }
>
> if (!rsvd)
> - vfio_lock_acct(i);
> + vfio_lock_acct(current, i);
>
> return i;
> }
>
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> - int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(unsigned long pfn, long npage, int prot,
> + bool do_accounting)
> {
> unsigned long unlocked = 0;
> long i;
> @@ -328,7 +441,188 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
> unlocked += put_pfn(pfn++, prot);
>
> if (do_accounting)
> - vfio_lock_acct(-unlocked);
> + vfio_lock_acct(current, -unlocked);
> + return unlocked;
> +}
> +
> +static long __vfio_pin_pages_local(struct vfio_domain *domain,
> + unsigned long vaddr, int prot,
> + unsigned long *pfn_base,
> + bool do_accounting)
> +{
> + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> + bool lock_cap = capable(CAP_IPC_LOCK);
> + long ret;
> + bool rsvd;
> + struct task_struct *task = domain->local_addr_space->task;
> +
> + if (!task->mm)
> + return -ENODEV;
> +
> + ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
> + if (ret)
> + return ret;
> +
> + rsvd = is_invalid_reserved_pfn(*pfn_base);
> +
> + if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) {
> + put_pfn(*pfn_base, prot);
> + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> + limit << PAGE_SHIFT);
> + return -ENOMEM;
> + }
> +
> + if (!rsvd && do_accounting)
> + vfio_lock_acct(task, 1);
> +
> + return 1;
> +}
> +
> +static void __vfio_unpin_pages_local(struct vfio_domain *domain,
> + unsigned long pfn, int prot,
> + bool do_accounting)
> +{
> + put_pfn(pfn, prot);
> +
> + if (do_accounting)
> + vfio_lock_acct(domain->local_addr_space->task, -1);
> +}
> +
> +static int vfio_unpin_pfn(struct vfio_domain *domain,
> + struct vfio_pfn *vpfn, bool do_accounting)
> +{
> + __vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot,
> + do_accounting);
> +
> + if (atomic_dec_and_test(&vpfn->ref_count))
> + vfio_remove_from_pfn_list(domain, vpfn);
> +
> + return 1;
> +}
> +
> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
> + unsigned long *user_pfn,
> + long npage, int prot,
> + unsigned long *phys_pfn)
> +{
> + struct vfio_iommu *iommu = iommu_data;
> + struct vfio_domain *domain;
> + int i, j, ret;
> + long retpage;
> + unsigned long remote_vaddr;
> + unsigned long *pfn = phys_pfn;
> + struct vfio_dma *dma;
> + bool do_accounting = false;
> +
> + if (!iommu || !user_pfn || !phys_pfn)
> + return -EINVAL;
> +
> + mutex_lock(&iommu->lock);
> +
> + if (!iommu->local_domain) {
> + ret = -EINVAL;
> + goto pin_done;
> + }
> +
> + domain = iommu->local_domain;
> +
> + /*
> + * If an iommu capable domain exists in the container then all pages are
> + * already pinned and accounted. Accounting should be done if there is no
> + * iommu capable domain in the container.
> + */
> + do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
> +
> + for (i = 0; i < npage; i++) {
> + struct vfio_pfn *p;
> + dma_addr_t iova;
> +
> + iova = user_pfn[i] << PAGE_SHIFT;
> +
> + dma = vfio_find_dma(iommu, iova, 0);
> + if (!dma) {
> + ret = -EINVAL;
> + goto pin_unwind;
> + }
> +
> + remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> + retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
> + &pfn[i], do_accounting);
Hi Kirti,
Here you call __vfio_pin_pages_local() -> vaddr_get_pfn() -> GUP regardless
of whether the vaddr is already pinned or not. That probably means that if
the caller invokes vfio_pin_pages() with the same GPA multiple times, you
get memory leaks, since GUP increases the page refcount on every call.

FWIW, I would like to have the pfn_list keyed by iova, so you can always
try to find the PFN for a given iova first, and pin it only if not found.
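
Something like the below is what I have in mind -- a rough, untested sketch
against the structures in this patch. vfio_find_pfn_by_iova() is hypothetical
(it assumes the pfn_list rbtree is re-keyed by iova), and the unpin path
would need the matching change so that put_pfn() runs only when ref_count
drops to zero:

static long vfio_pin_one_iova(struct vfio_domain *domain, dma_addr_t iova,
			      unsigned long vaddr, int prot,
			      unsigned long *pfn, bool do_accounting)
{
	struct vfio_pfn *p;
	long ret;

	mutex_lock(&domain->local_addr_space->pfn_list_lock);

	/* Pinned already for this iova?  Just take another reference. */
	p = vfio_find_pfn_by_iova(domain, iova);
	if (p) {
		atomic_inc(&p->ref_count);
		*pfn = p->pfn;
		mutex_unlock(&domain->local_addr_space->pfn_list_lock);
		return 1;
	}

	/* Not found: this is the only path that reaches GUP. */
	ret = __vfio_pin_pages_local(domain, vaddr, prot, pfn, do_accounting);
	if (ret == 1) {
		ret = vfio_add_to_pfn_list(domain, vaddr, iova, *pfn, prot);
		if (ret)
			__vfio_unpin_pages_local(domain, *pfn, prot,
						 do_accounting);
		else
			ret = 1;
	}

	mutex_unlock(&domain->local_addr_space->pfn_list_lock);
	return ret;
}

The loop in vfio_iommu_type1_pin_pages() would then call this once per page,
and repeated pins of the same iova only bump ref_count instead of taking
extra GUP references.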
--
Thanks,
Jike
> + if (retpage <= 0) {
> + WARN_ON(!retpage);
> + ret = (int)retpage;
> + goto pin_unwind;
> + }
> +
> + mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> + /* search if pfn exists */
> + p = vfio_find_pfn(domain, pfn[i]);
> + if (p) {
> + atomic_inc(&p->ref_count);
> + mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> + continue;
> + }
> +
> + ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
> + pfn[i], prot);
> + mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> + if (ret) {
> + __vfio_unpin_pages_local(domain, pfn[i], prot,
> + do_accounting);
> + goto pin_unwind;
> + }
> + }
> +
> + ret = i;
> + goto pin_done;
> +
> +pin_unwind:
> + pfn[i] = 0;
> + mutex_lock(&domain->local_addr_space->pfn_list_lock);
> + for (j = 0; j < i; j++) {
> + struct vfio_pfn *p;
> +
> + p = vfio_find_pfn(domain, pfn[j]);
> + if (p)
> + vfio_unpin_pfn(domain, p, do_accounting);
> +
> + pfn[j] = 0;
> + }
> + mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +pin_done:
> + mutex_unlock(&iommu->lock);
> + return ret;
> +}
> +
> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
> + long npage)
> +{
> + struct vfio_iommu *iommu = iommu_data;
> + struct vfio_domain *domain = NULL;
> + long unlocked = 0;
> + int i;
> +
> + if (!iommu || !pfn)
> + return -EINVAL;
> +
> + domain = iommu->local_domain;
> +
> + for (i = 0; i < npage; i++) {
> + struct vfio_pfn *p;
> +
> + mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> + /* verify if pfn exists in pfn_list */
> + p = vfio_find_pfn(domain, pfn[i]);
> + if (p)
> + unlocked += vfio_unpin_pfn(domain, p, true);
> +
> + mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> + }
>
> return unlocked;
> }
> @@ -341,6 +635,9 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>
> if (!dma->size)
> return;
> +
> + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> + return;
> /*
> * We use the IOMMU to track the physical addresses, otherwise we'd
> * need a much more complicated tracking system. Unfortunately that
> @@ -382,15 +679,15 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> if (WARN_ON(!unmapped))
> break;
>
> - unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> - unmapped >> PAGE_SHIFT,
> - dma->prot, false);
> + unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
> + unmapped >> PAGE_SHIFT,
> + dma->prot, false);
> iova += unmapped;
>
> cond_resched();
> }
>
> - vfio_lock_acct(-unlocked);
> + vfio_lock_acct(current, -unlocked);
> }
>
> static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> @@ -611,10 +908,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> /* Insert zero-sized and grow as we map chunks of it */
> vfio_link_dma(iommu, dma);
>
> + /* Don't pin and map if container doesn't contain IOMMU capable domain */
> + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) {
> + dma->size = size;
> + goto map_done;
> + }
> +
> while (size) {
> /* Pin a contiguous chunk of memory */
> - npage = vfio_pin_pages(vaddr + dma->size,
> - size >> PAGE_SHIFT, prot, &pfn);
> + npage = __vfio_pin_pages_remote(vaddr + dma->size,
> + size >> PAGE_SHIFT, prot, &pfn);
> if (npage <= 0) {
> WARN_ON(!npage);
> ret = (int)npage;
> @@ -624,7 +927,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> /* Map it! */
> ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> if (ret) {
> - vfio_unpin_pages(pfn, npage, prot, true);
> + __vfio_unpin_pages_remote(pfn, npage, prot, true);
> break;
> }
>
> @@ -635,6 +938,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
> if (ret)
> vfio_remove_dma(iommu, dma);
>
> +map_done:
> mutex_unlock(&iommu->lock);
> return ret;
> }
> @@ -734,11 +1038,24 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
> __free_pages(pages, order);
> }
>
> +static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
> + struct iommu_group *iommu_group)
> +{
> + struct vfio_group *g;
> +
> + list_for_each_entry(g, &domain->group_list, next) {
> + if (g->iommu_group == iommu_group)
> + return g;
> + }
> +
> + return NULL;
> +}
> +
> static int vfio_iommu_type1_attach_group(void *iommu_data,
> struct iommu_group *iommu_group)
> {
> struct vfio_iommu *iommu = iommu_data;
> - struct vfio_group *group, *g;
> + struct vfio_group *group;
> struct vfio_domain *domain, *d;
> struct bus_type *bus = NULL;
> int ret;
> @@ -746,10 +1063,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> mutex_lock(&iommu->lock);
>
> list_for_each_entry(d, &iommu->domain_list, next) {
> - list_for_each_entry(g, &d->group_list, next) {
> - if (g->iommu_group != iommu_group)
> - continue;
> + if (find_iommu_group(d, iommu_group)) {
> + mutex_unlock(&iommu->lock);
> + return -EINVAL;
> + }
> + }
>
> + if (iommu->local_domain) {
> + if (find_iommu_group(iommu->local_domain, iommu_group)) {
> mutex_unlock(&iommu->lock);
> return -EINVAL;
> }
> @@ -769,6 +1090,33 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> if (ret)
> goto out_free;
>
> + if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
> + (bus == &mdev_bus_type)) {
> + if (iommu->local_domain) {
> + list_add(&group->next,
> + &iommu->local_domain->group_list);
> + kfree(domain);
> + mutex_unlock(&iommu->lock);
> + return 0;
> + }
> +
> + domain->local_addr_space = kzalloc(sizeof(*domain->local_addr_space),
> + GFP_KERNEL);
> + if (!domain->local_addr_space) {
> + ret = -ENOMEM;
> + goto out_free;
> + }
> +
> + domain->local_addr_space->task = current;
> + INIT_LIST_HEAD(&domain->group_list);
> + list_add(&group->next, &domain->group_list);
> + domain->local_addr_space->pfn_list = RB_ROOT;
> + mutex_init(&domain->local_addr_space->pfn_list_lock);
> + iommu->local_domain = domain;
> + mutex_unlock(&iommu->lock);
> + return 0;
> + }
> +
> domain->domain = iommu_domain_alloc(bus);
> if (!domain->domain) {
> ret = -EIO;
> @@ -859,6 +1207,18 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
> vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
> }
>
> +static void vfio_local_unpin_all(struct vfio_domain *domain)
> +{
> + struct rb_node *node;
> +
> + mutex_lock(&domain->local_addr_space->pfn_list_lock);
> + while ((node = rb_first(&domain->local_addr_space->pfn_list))) {
> + vfio_unpin_pfn(domain,
> + rb_entry(node, struct vfio_pfn, node), false);
> + }
> + mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +}
> +
> static void vfio_iommu_type1_detach_group(void *iommu_data,
> struct iommu_group *iommu_group)
> {
> @@ -868,31 +1228,52 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>
> mutex_lock(&iommu->lock);
>
> - list_for_each_entry(domain, &iommu->domain_list, next) {
> - list_for_each_entry(group, &domain->group_list, next) {
> - if (group->iommu_group != iommu_group)
> - continue;
> + if (iommu->local_domain) {
> + domain = iommu->local_domain;
> + group = find_iommu_group(domain, iommu_group);
> + if (group) {
> + list_del(&group->next);
> + kfree(group);
>
> + if (list_empty(&domain->group_list)) {
> + vfio_local_unpin_all(domain);
> + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> + vfio_iommu_unmap_unpin_all(iommu);
> + kfree(domain);
> + iommu->local_domain = NULL;
> + }
> + goto detach_group_done;
> + }
> + }
> +
> + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> + goto detach_group_done;
> +
> + list_for_each_entry(domain, &iommu->domain_list, next) {
> + group = find_iommu_group(domain, iommu_group);
> + if (group) {
> iommu_detach_group(domain->domain, iommu_group);
> list_del(&group->next);
> kfree(group);
> /*
> * Group ownership provides privilege, if the group
> * list is empty, the domain goes away. If it's the
> - * last domain, then all the mappings go away too.
> + * last domain with iommu and the local domain doesn't
> + * exist, then all the mappings go away too.
> */
> if (list_empty(&domain->group_list)) {
> - if (list_is_singular(&iommu->domain_list))
> + if (list_is_singular(&iommu->domain_list) &&
> + (!iommu->local_domain))
> vfio_iommu_unmap_unpin_all(iommu);
> iommu_domain_free(domain->domain);
> list_del(&domain->next);
> kfree(domain);
> }
> - goto done;
> + break;
> }
> }
>
> -done:
> +detach_group_done:
> mutex_unlock(&iommu->lock);
> }
>
> @@ -924,27 +1305,48 @@ static void *vfio_iommu_type1_open(unsigned long arg)
> return iommu;
> }
>
> +static void vfio_release_domain(struct vfio_domain *domain)
> +{
> + struct vfio_group *group, *group_tmp;
> +
> + list_for_each_entry_safe(group, group_tmp,
> + &domain->group_list, next) {
> + if (!domain->local_addr_space)
> + iommu_detach_group(domain->domain, group->iommu_group);
> + list_del(&group->next);
> + kfree(group);
> + }
> +
> + if (domain->local_addr_space)
> + vfio_local_unpin_all(domain);
> + else
> + iommu_domain_free(domain->domain);
> +}
> +
> static void vfio_iommu_type1_release(void *iommu_data)
> {
> struct vfio_iommu *iommu = iommu_data;
> struct vfio_domain *domain, *domain_tmp;
> - struct vfio_group *group, *group_tmp;
> +
> + if (iommu->local_domain) {
> + vfio_release_domain(iommu->local_domain);
> + kfree(iommu->local_domain);
> + iommu->local_domain = NULL;
> + }
>
> vfio_iommu_unmap_unpin_all(iommu);
>
> + if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> + goto release_exit;
> +
> list_for_each_entry_safe(domain, domain_tmp,
> &iommu->domain_list, next) {
> - list_for_each_entry_safe(group, group_tmp,
> - &domain->group_list, next) {
> - iommu_detach_group(domain->domain, group->iommu_group);
> - list_del(&group->next);
> - kfree(group);
> - }
> - iommu_domain_free(domain->domain);
> + vfio_release_domain(domain);
> list_del(&domain->next);
> kfree(domain);
> }
>
> +release_exit:
> kfree(iommu);
> }
>
> @@ -1048,6 +1450,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> .ioctl = vfio_iommu_type1_ioctl,
> .attach_group = vfio_iommu_type1_attach_group,
> .detach_group = vfio_iommu_type1_detach_group,
> + .pin_pages = vfio_iommu_type1_pin_pages,
> + .unpin_pages = vfio_iommu_type1_unpin_pages,
> };
>
> static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0ecae0b1cd34..0bd25ba6223d 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -17,6 +17,7 @@
> #include <linux/workqueue.h>
> #include <linux/poll.h>
> #include <uapi/linux/vfio.h>
> +#include <linux/mdev.h>
>
> /**
> * struct vfio_device_ops - VFIO bus driver device callbacks
> @@ -75,7 +76,11 @@ struct vfio_iommu_driver_ops {
> struct iommu_group *group);
> void (*detach_group)(void *iommu_data,
> struct iommu_group *group);
> -
> + long (*pin_pages)(void *iommu_data, unsigned long *user_pfn,
> + long npage, int prot,
> + unsigned long *phys_pfn);
> + long (*unpin_pages)(void *iommu_data, unsigned long *pfn,
> + long npage);
> };
>
> extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -127,6 +132,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
> }
> #endif /* CONFIG_EEH */
>
> +extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> + long npage, int prot, unsigned long *phys_pfn);
> +
> +extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
> + long npage);
> +
> /*
> * IRQfd - generic
> */
>