[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [RFC PATCH] kernel vfio: enabled and supported on power
From: |
Alexey Kardashevskiy |
Subject: |
[Qemu-devel] [RFC PATCH] kernel vfio: enabled and supported on power |
Date: |
Sat, 12 May 2012 17:31:29 +1000 |
User-agent: |
Mozilla/5.0 (X11; Linux i686; rv:11.0) Gecko/20120327 Thunderbird/11.0.1 |
The idea of the patch is to demonstrate what POWER needs to support VFIO.
Added support on POWER. That includes:
1) IOMMU API driver for POWER.
It also includes subsys_initcall_sync(power_pci_iommu_init), which walks through
all PCI devices, creates IOMMU groups and adds devices to these groups.
2) Prototype for an additional IOMMU API call tce_iommu_get_dma_window.
We need to tell the POWER guest a DMA window location and size. So,
I tried to add this to struct iommu_ops:
static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
phys_addr_t *start_address, size_t *allocated_size)
The idea is that it returns 32-bit DMA window for index==0 and 64-bit
DMA window for index==1. This is what we need now.
However, I noticed a move to implement a separate IOMMU chardev for every
platform (kernel: drivers/vfio/vfio_iommu_x86.c).
I.e. QEMU does an ioctl to the host, this call gets into a _platform_specific_
IOMMU chardev, then the chardev calls a _platform_independent_ IOMMU API
function (let's call it iommu_get_dma_window()), and this
iommu_get_dma_window() calls a _platform_specific_
tce_iommu_get_dma_window().
Another example: the DMA map/unmap implementation on x86 is split between two
pieces of code - drivers/vfio/vfio_iommu_x86.c and drivers/iommu/intel-iommu.c.
And drivers/vfio/vfio_iommu_x86.c works perfectly for POWER except for the DMA
window setup, which I dropped for now; I simply use a quite popular
configuration on POWER (a 1GB DMA window starting from 0x0).
As for me, this is too complicated. We do not need either:
- a platform-specific IOMMU chardev, or
- the IOMMU API at all.
What am I missing?
Signed-off-by: Alexey Kardashevskiy <address@hidden>
---
arch/powerpc/include/asm/iommu.h | 3 +
arch/powerpc/kernel/iommu.c | 302 ++++++++++++++++++++++++++++++++++++++
drivers/iommu/Kconfig | 2 +
drivers/vfio/Kconfig | 5 +-
4 files changed, 310 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index edfc980..92aeb57 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -66,6 +66,9 @@ struct iommu_table {
unsigned long it_halfpoint; /* Breaking point for small/large allocs */
spinlock_t it_lock; /* Protects it_map */
unsigned long *it_map; /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+ struct iommu_group *it_group;
+#endif
};
struct scatterlist;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0cfcf98..6e40870 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -39,6 +39,9 @@
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/kdump.h>
+#include <asm-generic/sizes.h>
+
+#include <linux/iommu.h>
#define DBG(...)
@@ -677,3 +680,302 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t
size,
free_pages((unsigned long)vaddr, get_order(size));
}
}
+
+#ifdef CONFIG_IOMMU_API
+
+/*
+ * IOMMU API implementation.
+ *
+ * Note: only one domain per IOMMU group is enabled at the moment.
+ *
+ */
+/*
+ * Per-domain private data (hung off iommu_domain::priv): the single
+ * TCE table backing this domain.  Only one iommu_table per domain is
+ * supported at the moment (one domain per IOMMU group, see note above).
+ */
+struct tce_domain {
+ struct iommu_table *tbl;
+};
+
+/*
+ * iommu_ops::domain_init callback: allocate the tce_domain private data
+ * and attach it to the domain.  The TCE table itself is latched later,
+ * when the first device is attached.  Returns 0 or -ENOMEM.
+ *
+ * NOTE(review): printk has no KERN_* level, and the format labels look
+ * swapped - tcedom is printed under "TCE Domain" and dom under
+ * "IOMMU tcedom".
+ */
+static int tce_iommu_domain_init(struct iommu_domain *dom)
+{
+ struct tce_domain *tcedom;
+
+ tcedom = kzalloc(sizeof(*tcedom), GFP_KERNEL);
+ if (!tcedom)
+ return -ENOMEM;
+
+ dom->priv = tcedom;
+ printk("TCE Domain %p (IOMMU tcedom %p) initialized\n", tcedom, dom);
+
+ return 0;
+}
+
+/*
+ * iommu_ops::domain_destroy callback: free the tce_domain private data.
+ * The TCE table entries themselves are deliberately NOT cleared here
+ * (see the TODO below); the kernel cleans the table before reuse anyway.
+ */
+static void tce_iommu_domain_destroy(struct iommu_domain *dom)
+{
+ struct tce_domain *tcedom = dom->priv;
+
+ /* Logged before the NULL check - only the pointer values are printed */
+ printk("TCE Domain %p (IOMMU tcedom %p) destroyed\n", tcedom, dom);
+ if (!tcedom)
+ return;
+
+ if (tcedom->tbl) {
+ /*
+ * At the moment the kernel cleans the TCE table up before use
+ * anyway but it would be just nice to clean when it is just
+ * released.
+ */
+ printk("TODO: clean the TCE table\n");
+ }
+
+ kfree(tcedom);
+
+ dom->priv = NULL;
+}
+
+/*
+ * iommu_ops::attach_dev callback.  The first attached device latches its
+ * iommu_table as the domain's TCE table; every later device must share
+ * that same table (i.e. belong to the same group), otherwise -EBUSY.
+ *
+ * NOTE(review): the final "attached" printk runs on the first-device
+ * path too, so that path logs twice.
+ */
+static int tce_iommu_attach_device(struct iommu_domain *dom,
+ struct device *dev)
+{
+ struct tce_domain *tcedom = dom->priv;
+
+ if (!tcedom->tbl) {
+ tcedom->tbl = get_iommu_table_base(dev);
+ printk("TCE Domain %p (IOMMU tcedom %p) - "
+ "device %p is first in a domain\n",
+ tcedom, dom, dev);
+ } else if (tcedom->tbl != get_iommu_table_base(dev)) {
+ printk("TCE Domain %p (IOMMU tcedom %p) - "
+ "device %p NOT attached, wrong group\n",
+ tcedom, dom, dev);
+ return -EBUSY;
+ }
+
+ printk("TCE Domain %p (IOMMU tcedom %p) - device %p attached\n",
+ tcedom, dom, dev);
+
+ return 0;
+}
+
+/*
+ * iommu_ops::detach_dev callback.  No real teardown is done - the
+ * domain keeps its TCE table pointer; this only sanity-checks that the
+ * device being detached actually belongs to this domain's table.
+ */
+static void tce_iommu_detach_device(struct iommu_domain *dom,
+ struct device *dev)
+{
+ struct tce_domain *tcedom = dom->priv;
+ struct iommu_table *tbl = tcedom->tbl;
+
+ printk("TCE Domain %p (IOMMU tcedom %p) - device %p DEtached\n",
+ tcedom, dom, dev);
+ BUG_ON(tbl !=get_iommu_table_base(dev));
+}
+
+/*
+ * iommu_ops::map callback: install a single TCE for @iova -> @paddr
+ * via the platform tce_build() hook, under the table lock.
+ *
+ * NOTE(review): both @size and @prot are ignored - exactly one
+ * IOMMU_PAGE-sized entry is built with DMA_BIDIRECTIONAL permissions,
+ * regardless of the requested size or protection.  Presumably relying
+ * on pgsize_bitmap == SZ_4K so the core only ever asks for one page -
+ * TODO confirm.
+ */
+static int tce_iommu_map(struct iommu_domain *dom, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot)
+{
+ struct tce_domain *tcedom = dom->priv;
+ struct iommu_table *tbl = tcedom->tbl;
+ unsigned long entry, flags;
+ int build_fail;
+
+ spin_lock_irqsave(&(tbl->it_lock), flags);
+ entry = iova >> IOMMU_PAGE_SHIFT;
+ build_fail = ppc_md.tce_build(tbl, entry, 1/*pages*/,
+ (unsigned long)paddr & IOMMU_PAGE_MASK,
+ DMA_BIDIRECTIONAL, NULL/*attrs*/);
+
+ /* ppc_md.tce_build() only returns non-zero for transient errors.
+ * Clean up the table bitmap in this case and return
+ * DMA_ERROR_CODE. For all other errors the functionality is
+ * not altered.
+ */
+ if (unlikely(build_fail)) {
+ printk("Failed to add TCE\n");
+ spin_unlock_irqrestore(&(tbl->it_lock), flags);
+ return -EFAULT;
+ }
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+
+ return 0;
+}
+
+/*
+ * iommu_ops::unmap callback: free a single TCE at @iova via the platform
+ * tce_free() hook, under the table lock.
+ *
+ * NOTE(review): @size is ignored - exactly one entry is freed - yet the
+ * full @size is returned to the caller as the unmapped amount.  Same
+ * single-page assumption as tce_iommu_map() - TODO confirm.
+ */
+static size_t tce_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+ size_t size)
+{
+ struct tce_domain *tcedom = dom->priv;
+ struct iommu_table *tbl = tcedom->tbl;
+ unsigned long entry, flags;
+ entry = iova >> IOMMU_PAGE_SHIFT;
+
+ spin_lock_irqsave(&(tbl->it_lock), flags);
+ ppc_md.tce_free(tbl, entry, 1);
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+
+ return size;
+}
+
+/*
+ * iommu_ops::iova_to_phys callback: look up the TCE for @iova via the
+ * platform tce_get() hook.  Returns 0 when the platform provides no
+ * tce_get hook.
+ *
+ * NOTE(review): tcedom->tbl is not checked for NULL here - if no device
+ * was ever attached this passes a NULL table to tce_get(); verify the
+ * callers guarantee an attached domain.
+ */
+static phys_addr_t tce_iommu_iova_to_phys(struct iommu_domain *dom,
+ unsigned long iova)
+{
+ struct tce_domain *tcedom = dom->priv;
+ struct iommu_table *tbl = tcedom->tbl;
+ unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
+ phys_addr_t ret = 0;
+
+ if (ppc_md.tce_get)
+ ret = ppc_md.tce_get(tbl, entry);
+
+ return ret;
+}
+
+/*
+ * iommu_ops::domain_has_cap callback: advertise cache coherency and
+ * interrupt remapping support (both unverified, per the FIXME below);
+ * everything else is reported as unsupported.
+ */
+static int tce_iommu_domain_has_cap(struct iommu_domain *dom,
+ unsigned long cap)
+{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ case IOMMU_CAP_INTR_REMAP:
+ /* FIXME: not sure if these are correct */
+ return 1;
+/* case IOMMU_CAP_SETUP_REQUIRED:
+ return 1;*/
+ }
+
+ return 0;
+}
+#if 0
+/*
+ * Proposed (currently disabled) iommu_ops extension: report the DMA
+ * window of this domain's TCE table.  Only index 0 (the 32-bit window)
+ * is supported; the window location/size come from the iommu_table's
+ * it_offset/it_size, converted from IOMMU pages to bytes.
+ *
+ * NOTE(review): "%u" is used to print the signed int @index.
+ */
+static int tce_iommu_get_dma_window(struct iommu_domain *dom, int index,
+ phys_addr_t *start_address, size_t *allocated_size)
+{
+ struct tce_domain *tcedom = dom->priv;
+ struct iommu_table *tbl = tcedom->tbl;
+
+ if (!tbl) {
+ printk(KERN_ERR"tce_iommu: not initialized\n");
+ return -EFAULT;
+ }
+ if (!allocated_size || !start_address) {
+ printk(KERN_ERR"tce_iommu: invalid parameters\n");
+ return -EFAULT;
+ }
+ if (index > 0) {
+ printk(KERN_ERR"tce_iommu: %u is out of boundary\n", index);
+ return -EINVAL;
+ }
+ *start_address = tbl->it_offset << IOMMU_PAGE_SHIFT;
+ *allocated_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+
+ return 0;
+}
+#endif
+/*
+ * PCI bus notifier callback, debug-only: print which BUS_NOTIFY_* action
+ * fired for which device.  No action is taken (the add_iommu_group call
+ * is commented out); always returns 0.
+ */
+static int device_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ printk("device_notifier(%p) ", dev);
+ /*if (action == BUS_NOTIFY_ADD_DEVICE)
+ return add_iommu_group(dev, NULL);*/
+ switch (action) {
+/* Helper: one case label per action, printing its name and value */
+#define __x(s) case s: printk("action=" #s " %u 0x%x\n", (s), (s)); break;
+ __x(BUS_NOTIFY_ADD_DEVICE);
+ __x(BUS_NOTIFY_DEL_DEVICE);
+ __x(BUS_NOTIFY_BIND_DRIVER);
+ __x(BUS_NOTIFY_BOUND_DRIVER);
+ __x(BUS_NOTIFY_UNBIND_DRIVER);
+ __x(BUS_NOTIFY_UNBOUND_DRIVER);
+ default: printk("action=%lu 0x%lx\n", action, action);
+ }
+ return 0;
+}
+
+/* Notifier registered on the PCI bus in power_pci_iommu_init() */
+static struct notifier_block device_nb = {
+ .notifier_call = device_notifier,
+};
+
+/*
+ * Placeholder iommu_ops::add_device hook: group membership is set up
+ * eagerly in power_pci_iommu_init() instead, so this always fails.
+ */
+static int tce_iommu_add_device_dummy(struct device *dev)
+{
+ printk(KERN_ERR"%s: not implemented!\n", __func__);
+ return -EINVAL;
+}
+
+/* Placeholder iommu_ops::remove_device hook: logs and does nothing. */
+static void tce_iommu_remove_device_dummy(struct device *dev)
+{
+ printk(KERN_ERR"%s: not implemented!\n", __func__);
+}
+
+/*
+ * IOMMU API ops for the POWER TCE backend, registered for the PCI bus
+ * in power_pci_iommu_init().  Only 4K IOMMU pages are advertised for
+ * now (larger sizes commented out); the proposed get_dma_window hook
+ * is disabled along with its #if 0'd implementation above.
+ */
+static struct iommu_ops tce_iommu_ops = {
+ .domain_init = tce_iommu_domain_init,
+ .domain_destroy = tce_iommu_domain_destroy,
+ .attach_dev = tce_iommu_attach_device,
+ .detach_dev = tce_iommu_detach_device,
+ .map = tce_iommu_map,
+ .unmap = tce_iommu_unmap,
+ .iova_to_phys = tce_iommu_iova_to_phys,
+ .domain_has_cap = tce_iommu_domain_has_cap,
+ .add_device = tce_iommu_add_device_dummy,
+ .remove_device = tce_iommu_remove_device_dummy,
+/* .get_dma_window = tce_iommu_get_dma_window,*/
+ .pgsize_bitmap = SZ_4K /*| SZ_64K | SZ_1M | SZ_16M;*/
+};
+
+/*
+ * Setup IOMMU API.
+ *
+ * Register the TCE iommu_ops and debug notifier on the PCI bus, then
+ * walk all PCI devices: devices sharing an iommu_table share one IOMMU
+ * group (allocated lazily and cached in tbl->it_group); devices with no
+ * iommu_table are skipped.
+ *
+ * NOTE(review): always returns 0, even when iommu_group_alloc() failed
+ * and the loop broke early; the iommu_group_add_device() result is only
+ * logged.  bus_set_iommu()'s return value is also ignored.
+ */
+static int __init power_pci_iommu_init(void)
+{
+ struct pci_dev *pdev = NULL;
+ struct iommu_table *tbl = NULL;
+ int ret = 0;
+
+ bus_set_iommu(&pci_bus_type, &tce_iommu_ops);
+ bus_register_notifier(&pci_bus_type, &device_nb);
+
+ for_each_pci_dev(pdev) {
+
+ tbl = get_iommu_table_base(&pdev->dev);
+ if (NULL == tbl) {
+ printk("Skipping device %s\n", pdev->dev.kobj.name);
+ continue;
+ }
+ /* First device on this table creates the group for the table */
+ if (!tbl->it_group) {
+ struct iommu_group *tmp = iommu_group_alloc();
+ if (IS_ERR(tmp)) {
+ printk("Failed to create new IOMMU group, "
+ "ret = %ld\n", PTR_ERR(tmp));
+ break;
+ }
+ tbl->it_group = tmp;
+ }
+
+ ret = iommu_group_add_device(tbl->it_group, &pdev->dev);
+ if (ret < 0)
+ printk("iommu_group_add_device(%s) failed with %d\n",
+ pdev->dev.kobj.name, ret);
+ }
+
+ return 0;
+}
+
+/*
+ * Must be initialized after subsys_initcall(iommu_init) and
+ * subsys_initcall(pcibios_init).
+ */
+subsys_initcall_sync(power_pci_iommu_init);
+
+#endif /* CONFIG_IOMMU_API */
+
+/* WORKAROUND */
+/*
+ * Stub to satisfy a link-time reference to the (x86) KVM device
+ * assignment path, which does not exist on POWER.  Always succeeds.
+ * NOTE(review): RFC-only hack - deliberately non-static; should go away
+ * once the VFIO/KVM split is sorted out.
+ */
+struct kvm;
+struct kvm_memory_slot;
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ return 0;
+}
+
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6bea696..885ebe1 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -1,5 +1,7 @@
# IOMMU_API always gets selected by whoever wants it.
config IOMMU_API
+ bool "IOMMU API Support for PCI pass through"
+ default n
bool
menuconfig IOMMU_SUPPORT
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 77b754c..5788194 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -1,6 +1,7 @@
config VFIO_IOMMU_X86
- tristate
- depends on VFIO && X86
+ tristate "X86 IOMMU API"
+ depends on VFIO
+# && X86
default n
menuconfig VFIO
--
Alexey
- [Qemu-devel] [RFC PATCH] kernel vfio: enabled and supported on power,
Alexey Kardashevskiy <=