qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [RFC PATCH] kernel vfio: enabled and supported on power


From: Alexey Kardashevskiy
Subject: [Qemu-devel] [RFC PATCH] kernel vfio: enabled and supported on power
Date: Sat, 12 May 2012 17:31:29 +1000
User-agent: Mozilla/5.0 (X11; Linux i686; rv:11.0) Gecko/20120327 Thunderbird/11.0.1

The idea of the patch is to demonstrate what POWER needs to support VFIO.

Added support on POWER. That includes:

1) IOMMU API driver for POWER.
It also includes subsys_initcall_sync(power_pci_iommu_init), which walks through
all PCI devices, creates IOMMU groups, and adds the devices to these groups.

2) Prototype for an additional IOMMU API call tce_iommu_get_dma_window.

We need to tell the POWER guest a DMA window location and size. So,
I tried to add this to struct iommu_ops:

static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
                        phys_addr_t *start_address, size_t *allocated_size)

The idea is that it returns 32-bit DMA window for index==0 and 64-bit
DMA window for index==1. This is what we need now.

However I noticed a move to implement IOMMU chardev for every platform 
separately
(kernel: drivers/vfio/vfio_iommu_x86.c).

I. e. QEMU does ioctl to the host, this call gets into a _platform_specific_ 
IOMMU chardev,
then the chardev calls a _platform_independent_ IOMMU API function (let's call it
iommu_get_dma_window()), and this iommu_get_dma_window() calls a 
_platform_specific_
tce_iommu_get_dma_window().


Another example, DMA map/unmap implementation on X86 is split between 2 pieces 
of code -
drivers/vfio/vfio_iommu_x86.c and drivers/iommu/intel-iommu.c.

And drivers/vfio/vfio_iommu_x86.c works perfectly for POWER except for a DMA window 
setup
which I dropped for now and simply use a quite popular configuration on POWER 
(1 GB DMA window starting
from 0x0).

As for me, it is too complicated. We do not need either
- the platform-specific IOMMU chardev, or
- the IOMMU API at all.

What do I miss?


Signed-off-by: Alexey Kardashevskiy <address@hidden>
---
 arch/powerpc/include/asm/iommu.h |    3 +
 arch/powerpc/kernel/iommu.c      |  302 ++++++++++++++++++++++++++++++++++++++
 drivers/iommu/Kconfig            |    2 +
 drivers/vfio/Kconfig             |    5 +-
 4 files changed, 310 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index edfc980..92aeb57 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -66,6 +66,9 @@ struct iommu_table {
        unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
        spinlock_t     it_lock;      /* Protects it_map */
        unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+       /* One iommu_group per TCE table; set in power_pci_iommu_init() */
+       struct iommu_group *it_group;
+#endif
 };

 struct scatterlist;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0cfcf98..6e40870 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -39,6 +39,9 @@
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/kdump.h>
+#include <asm-generic/sizes.h>
+
+#include <linux/iommu.h>

 #define DBG(...)

@@ -677,3 +680,302 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
                free_pages((unsigned long)vaddr, get_order(size));
        }
 }
+
+#ifdef CONFIG_IOMMU_API
+
+/*
+ * IOMMU API implementation.
+ *
+ * Note: only one domain per IOMMU group is enabled at the moment.
+ *
+ */
+/* Per-domain private data, hung off iommu_domain::priv. */
+struct tce_domain {
+       /* TCE table of the group; filled in when the first device attaches */
+       struct iommu_table *tbl;
+};
+
+/*
+ * iommu_ops::domain_init - allocate the per-domain private data.
+ * Returns 0 on success, -ENOMEM on allocation failure.  The TCE table
+ * pointer stays NULL until the first device is attached.
+ */
+static int tce_iommu_domain_init(struct iommu_domain *dom)
+{
+       struct tce_domain *tcedom;
+
+       tcedom = kzalloc(sizeof(*tcedom), GFP_KERNEL);
+       if (!tcedom)
+               return -ENOMEM;
+
+       dom->priv = tcedom;
+       /* NOTE(review): debug printk without KERN_* level; also the label
+        * "IOMMU tcedom" actually prints @dom - confirm intended wording */
+       printk("TCE Domain %p (IOMMU tcedom %p) initialized\n", tcedom, dom);
+
+       return 0;
+}
+
+/*
+ * iommu_ops::domain_destroy - free the per-domain private data.
+ * The underlying TCE table is not cleared here yet (see TODO below);
+ * tolerates a NULL dom->priv.
+ */
+static void tce_iommu_domain_destroy(struct iommu_domain *dom)
+{
+       struct tce_domain *tcedom = dom->priv;
+
+       printk("TCE Domain %p (IOMMU tcedom %p) destroyed\n", tcedom, dom);
+       if (!tcedom)
+               return;
+
+       if (tcedom->tbl) {
+               /*
+                * At the moment the kernel cleans the TCE table up before use
+                * anyway but it would be just nice to clean when it is just
+                * released.
+                */
+               printk("TODO: clean the TCE table\n");
+       }
+
+       kfree(tcedom);
+
+       dom->priv = NULL;
+}
+
+/*
+ * iommu_ops::attach_dev - attach a device to the domain.
+ * The first device donates its iommu_table to the domain; every later
+ * device must share the same table (i.e. belong to the same group),
+ * otherwise the attach is rejected with -EBUSY.
+ */
+static int tce_iommu_attach_device(struct iommu_domain *dom,
+                                  struct device *dev)
+{
+       struct tce_domain *tcedom = dom->priv;
+
+       if (!tcedom->tbl) {
+               tcedom->tbl = get_iommu_table_base(dev);
+               printk("TCE Domain %p (IOMMU tcedom %p) - "
+                               "device %p is first in a domain\n",
+                               tcedom, dom, dev);
+       } else if (tcedom->tbl != get_iommu_table_base(dev)) {
+               printk("TCE Domain %p (IOMMU tcedom %p) - "
+                               "device %p NOT attached, wrong group\n",
+                               tcedom, dom, dev);
+               return -EBUSY;
+       }
+
+       printk("TCE Domain %p (IOMMU tcedom %p) - device %p attached\n",
+                       tcedom, dom, dev);
+
+       return 0;
+}
+
+/*
+ * iommu_ops::detach_dev - detach a device from the domain.
+ * Only sanity-checks that the device really uses this domain's TCE
+ * table; tcedom->tbl is left set since other group members may still
+ * reference it.
+ */
+static void tce_iommu_detach_device(struct iommu_domain *dom,
+                                   struct device *dev)
+{
+       struct tce_domain *tcedom = dom->priv;
+       struct iommu_table *tbl = tcedom->tbl;
+
+       printk("TCE Domain %p (IOMMU tcedom %p) - device %p DEtached\n",
+                       tcedom, dom, dev);
+       BUG_ON(tbl != get_iommu_table_base(dev));
+}
+
+/*
+ * iommu_ops::map - install one TCE translating @iova to @paddr.
+ * NOTE(review): @size and @prot are ignored; exactly one IOMMU page is
+ * mapped per call (pgsize_bitmap is SZ_4K so the core iterates per
+ * page) and the entry is always created DMA_BIDIRECTIONAL.
+ */
+static int tce_iommu_map(struct iommu_domain *dom, unsigned long iova,
+                  phys_addr_t paddr, size_t size, int prot)
+{
+       struct tce_domain *tcedom = dom->priv;
+       struct iommu_table *tbl = tcedom->tbl;
+       unsigned long entry, flags;
+       int build_fail;
+
+       spin_lock_irqsave(&(tbl->it_lock), flags);
+       entry = iova >> IOMMU_PAGE_SHIFT;
+       build_fail = ppc_md.tce_build(tbl, entry, 1/*pages*/,
+                       (unsigned long)paddr & IOMMU_PAGE_MASK,
+                       DMA_BIDIRECTIONAL, NULL/*attrs*/);
+
+       /* ppc_md.tce_build() only returns non-zero for transient errors;
+        * drop the lock and report -EFAULT to the IOMMU core in that case.
+        */
+       if (unlikely(build_fail)) {
+               printk("Failed to add TCE\n");
+               spin_unlock_irqrestore(&(tbl->it_lock), flags);
+               return -EFAULT;
+       }
+       /* Flush/invalidate TLB caches if necessary */
+       if (ppc_md.tce_flush)
+               ppc_md.tce_flush(tbl);
+
+       spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+       /* Make sure updates are seen by hardware */
+       mb();
+
+       return 0;
+}
+
+/*
+ * iommu_ops::unmap - free the TCE backing @iova.
+ * NOTE(review): exactly one entry is freed regardless of @size, yet
+ * @size is returned unconditionally; consistent with map() only while
+ * pgsize_bitmap limits calls to one 4K page at a time.
+ */
+static size_t tce_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+                    size_t size)
+{
+       struct tce_domain *tcedom = dom->priv;
+       struct iommu_table *tbl = tcedom->tbl;
+       unsigned long entry, flags;
+       entry = iova >> IOMMU_PAGE_SHIFT;
+
+       spin_lock_irqsave(&(tbl->it_lock), flags);
+       ppc_md.tce_free(tbl, entry, 1);
+       /* Flush/invalidate TLB caches if necessary */
+       if (ppc_md.tce_flush)
+               ppc_md.tce_flush(tbl);
+
+       spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+       /* Make sure updates are seen by hardware */
+       mb();
+
+       return size;
+}
+
+/*
+ * iommu_ops::iova_to_phys - look up the TCE for @iova.
+ * Returns 0 when the platform provides no tce_get hook.
+ * NOTE(review): the value from ppc_md.tce_get() is returned as-is;
+ * confirm whether low TCE control bits need masking off.
+ */
+static phys_addr_t tce_iommu_iova_to_phys(struct iommu_domain *dom,
+               unsigned long iova)
+{
+       struct tce_domain *tcedom = dom->priv;
+       struct iommu_table *tbl = tcedom->tbl;
+       unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
+       phys_addr_t ret = 0;
+
+       if (ppc_md.tce_get)
+               ret = ppc_md.tce_get(tbl, entry);
+
+       return ret;
+}
+
+/*
+ * iommu_ops::domain_has_cap - advertise cache coherency and interrupt
+ * remapping; everything else is reported as unsupported.  The
+ * commented-out IOMMU_CAP_SETUP_REQUIRED appears to be a proposed new
+ * capability - TODO confirm.
+ */
+static int tce_iommu_domain_has_cap(struct iommu_domain *dom,
+                                   unsigned long cap)
+{
+       switch (cap) {
+       case IOMMU_CAP_CACHE_COHERENCY:
+       case IOMMU_CAP_INTR_REMAP:
+               /* FIXME: not sure if these are correct */
+               return 1;
+/*     case IOMMU_CAP_SETUP_REQUIRED:
+               return 1;*/
+       }
+
+       return 0;
+}
+#if 0
+/*
+ * Proposed iommu_ops extension, currently compiled out: report a DMA
+ * window's start address and size to the caller.  Only index 0 (the
+ * window described by this group's TCE table) is supported.
+ */
+static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
+                       phys_addr_t *start_address, size_t *allocated_size)
+{
+       struct tce_domain *tcedom = dom->priv;
+       struct iommu_table *tbl = tcedom->tbl;
+
+       if (!tbl) {
+               printk(KERN_ERR"tce_iommu: not initialized\n");
+               return -EFAULT;
+       }
+       if (!allocated_size || !start_address) {
+               printk(KERN_ERR"tce_iommu: invalid parameters\n");
+               return -EFAULT;
+       }
+       /* NOTE(review): a negative index passes this check; also %u is
+        * used for a signed int */
+       if (index > 0) {
+               printk(KERN_ERR"tce_iommu: %u is out of boundary\n", index);
+               return -EINVAL;
+       }
+       *start_address = tbl->it_offset << IOMMU_PAGE_SHIFT;
+       *allocated_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+
+       return 0;
+}
+#endif
+/*
+ * Debug-only PCI bus notifier: logs every notification it receives and
+ * takes no action (always returns 0).  The commented-out code hints at
+ * a plan to create IOMMU groups from BUS_NOTIFY_ADD_DEVICE later.
+ */
+static int device_notifier(struct notifier_block *nb,
+                          unsigned long action, void *data)
+{
+       struct device *dev = data;
+
+       printk("device_notifier(%p) ", dev);
+       /*if (action == BUS_NOTIFY_ADD_DEVICE)
+               return add_iommu_group(dev, NULL);*/
+       switch (action) {
+/* Log the symbolic name plus the numeric value of a known action */
+#define __x(s) case s: printk("action=" #s " %u 0x%x\n", (s), (s)); break;
+               __x(BUS_NOTIFY_ADD_DEVICE);
+               __x(BUS_NOTIFY_DEL_DEVICE);
+               __x(BUS_NOTIFY_BIND_DRIVER);
+               __x(BUS_NOTIFY_BOUND_DRIVER);
+               __x(BUS_NOTIFY_UNBIND_DRIVER);
+               __x(BUS_NOTIFY_UNBOUND_DRIVER);
+               default: printk("action=%lu 0x%lx\n", action, action);
+       }
+       return 0;
+}
+
+/* Registered on pci_bus_type in power_pci_iommu_init() */
+static struct notifier_block device_nb = {
+       .notifier_call = device_notifier,
+};
+
+/*
+ * Placeholder for iommu_ops::add_device; group membership is set up in
+ * power_pci_iommu_init() instead, so this always fails with -EINVAL.
+ */
+static int tce_iommu_add_device_dummy(struct device *dev)
+{
+       printk(KERN_ERR"%s: not implemented!\n", __func__);
+       return -EINVAL;
+}
+
+/* Placeholder for iommu_ops::remove_device; intentionally a no-op. */
+static void tce_iommu_remove_device_dummy(struct device *dev)
+{
+       printk(KERN_ERR"%s: not implemented!\n", __func__);
+}
+
+/*
+ * POWER TCE implementation of the generic IOMMU API.  Only 4K pages are
+ * advertised via pgsize_bitmap, matching the IOMMU_PAGE_SHIFT math in
+ * map/unmap.  get_dma_window is the proposed extension, disabled above.
+ */
+static struct iommu_ops tce_iommu_ops = {
+       .domain_init    = tce_iommu_domain_init,
+       .domain_destroy = tce_iommu_domain_destroy,
+       .attach_dev     = tce_iommu_attach_device,
+       .detach_dev     = tce_iommu_detach_device,
+       .map            = tce_iommu_map,
+       .unmap          = tce_iommu_unmap,
+       .iova_to_phys   = tce_iommu_iova_to_phys,
+       .domain_has_cap = tce_iommu_domain_has_cap,
+       .add_device     = tce_iommu_add_device_dummy,
+       .remove_device  = tce_iommu_remove_device_dummy,
+/*     .get_dma_window = tce_iommu_get_dma_window,*/
+       .pgsize_bitmap  = SZ_4K /*| SZ_64K | SZ_1M | SZ_16M;*/
+};
+
+/*
+ * Set up the IOMMU API on POWER: register tce_iommu_ops and the debug
+ * bus notifier, then walk every PCI device and place it into an
+ * iommu_group.  One group is allocated per iommu_table, so all devices
+ * sharing a TCE table end up in the same group.
+ */
+static int __init power_pci_iommu_init(void)
+{
+       struct pci_dev *pdev = NULL;
+       struct iommu_table *tbl = NULL;
+       int ret = 0;
+
+       /* NOTE(review): return values of both registrations are ignored */
+       bus_set_iommu(&pci_bus_type, &tce_iommu_ops);
+       bus_register_notifier(&pci_bus_type, &device_nb);
+
+       for_each_pci_dev(pdev) {
+
+               /* Skip devices that have no TCE table behind them */
+               tbl = get_iommu_table_base(&pdev->dev);
+               if (NULL == tbl) {
+                       printk("Skipping device %s\n", pdev->dev.kobj.name);
+                       continue;
+               }
+               if (!tbl->it_group) {
+                       struct iommu_group *tmp = iommu_group_alloc();
+                       if (IS_ERR(tmp)) {
+                               printk("Failed to create new IOMMU group, "
+                                               "ret = %ld\n", PTR_ERR(tmp));
+                               /* NOTE(review): aborts the walk but the
+                                * function still returns 0 */
+                               break;
+                       }
+                       tbl->it_group = tmp;
+               }
+
+               ret = iommu_group_add_device(tbl->it_group, &pdev->dev);
+               if (ret < 0)
+                       printk("iommu_group_add_device(%s) failed with %d\n",
+                                       pdev->dev.kobj.name, ret);
+       }
+
+       return 0;
+}
+
+/*
+ * Must be initialized after subsys_initcall(iommu_init) and
+ * subsys_initcall(pcibios_init).
+ */
+subsys_initcall_sync(power_pci_iommu_init);
+
+#endif /* CONFIG_IOMMU_API */
+
+/*
+ * WORKAROUND: empty stub for kvm_iommu_map_pages.
+ * NOTE(review): presumably needed to satisfy a link-time reference when
+ * IOMMU/VFIO support is enabled on POWER - confirm and remove.
+ */
+struct kvm;
+struct kvm_memory_slot;
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+       return 0;
+}
+
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6bea696..885ebe1 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -1,5 +1,6 @@
 # IOMMU_API always gets selected by whoever wants it.
 config IOMMU_API
-       bool
+       bool "IOMMU API Support for PCI pass through"
+       default n
 
 menuconfig IOMMU_SUPPORT
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 77b754c..5788194 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -1,6 +1,7 @@
 config VFIO_IOMMU_X86
-       tristate
-       depends on VFIO && X86
+       tristate "X86 IOMMU API"
+       # RFC: X86 dependency dropped so this also builds on POWER
+       depends on VFIO
        default n
 
 menuconfig VFIO


-- 
Alexey



reply via email to

[Prev in Thread] Current Thread [Next in Thread]