qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 2/2] vfio-powerpc: added VFIO support


From: Alex Williamson
Subject: Re: [Qemu-devel] [PATCH 2/2] vfio-powerpc: added VFIO support
Date: Tue, 10 Jul 2012 10:55:23 -0600

On Tue, 2012-07-10 at 15:51 +1000, Alexey Kardashevskiy wrote:
> The patch enables VFIO on POWER.
> 
> It literally does the following:
> 
> 1. POWERPC IOMMU support (the kernel counterpart is required)
> 
> 2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation.
> 
> 3. Added vfio_get_container_fd() to VFIO in order to initialize 1).
> 
> 4. Makefile fixed and "is_vfio" flag added into sPAPR PHB - required to
> distinguish VFIO's DMA context from the emulated one.
> 
> WIth the pathes posted today a bit earlier, this patch fully supports
> VFIO what includes MSIX as well,
> 
> 
> Signed-off-by: Alexey Kardashevskiy <address@hidden>
> ---
>  hw/ppc/Makefile.objs |    3 ++
>  hw/spapr.h           |    4 +++
>  hw/spapr_iommu.c     |   87 
> ++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/spapr_pci.c       |   23 ++++++++++++-
>  hw/spapr_pci.h       |    2 ++
>  hw/vfio_pci.c        |   76 +++++++++++++++++++++++++++++++++++++++++--
>  hw/vfio_pci.h        |    2 ++
>  7 files changed, 193 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index f573a95..c46a049 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>  # Xilinx PPC peripherals
>  obj-y += xilinx_ethlite.o
>  
> +# VFIO PCI device assignment
> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> +
>  obj-y := $(addprefix ../,$(obj-y))
> diff --git a/hw/spapr.h b/hw/spapr.h
> index b37f337..9dca704 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char 
> *propname,
>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>                        DMAContext *dma);
>  
> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size);
> +
>  #endif /* !defined (__HW_SPAPR_H__) */
> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> index 50c288d..0a194e8 100644
> --- a/hw/spapr_iommu.c
> +++ b/hw/spapr_iommu.c
> @@ -16,6 +16,8 @@
>   * You should have received a copy of the GNU Lesser General Public
>   * License along with this library; if not, see 
> <http://www.gnu.org/licenses/>.
>   */
> +#include <sys/ioctl.h>
> +
>  #include "hw.h"
>  #include "kvm.h"
>  #include "qdev.h"
> @@ -23,6 +25,7 @@
>  #include "dma.h"
>  
>  #include "hw/spapr.h"
> +#include "hw/linux-vfio.h"

I really need to move this into linux-headers.

>  
>  #include <libfdt.h>
>  
> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong 
> ioba, target_ulong tce)
>      return 0;
>  }
>  
> +/* -------- API for POWERPC IOMMU -------- */
> +
> +#define POWERPC_IOMMU           2
> +
> +struct tce_iommu_info {
> +    __u32 argsz;
> +    __u32 dma32_window_start;
> +    __u32 dma32_window_size;
> +};
> +
> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +struct tce_iommu_dma_map {
> +    __u32 argsz;
> +    __u64 va;
> +    __u64 dmaaddr;
> +};
> +
> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)

I assume this would eventually go into the kernel vfio.h with a VFIO_
prefix.  Add a flags field to the structures or it'll be hard to extend
them later.

> +typedef struct sPAPRVFIOTable {
> +    int fd;
> +    uint32_t liobn;
> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> +} sPAPRVFIOTable;
> +
> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> +
> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> +
> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
> +        return;
> +    }
> +    *dma32_window_start = info.dma32_window_start;
> +    *dma32_window_size = info.dma32_window_size;
> +
> +    t = g_malloc0(sizeof(*t));
> +    t->fd = fd;
> +    t->liobn = liobn;
> +
> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> +}
> +
> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_dma_map map = {
> +        .argsz = sizeof(map),
> +        .va = 0,
> +        .dmaaddr = ioba,
> +    };
> +
> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> +        if (t->liobn != liobn) {
> +            continue;
> +        }
> +        if (tce) {
> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
> +                return H_PARAMETER;
> +            }
> +        } else {
> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
> +                return H_PARAMETER;
> +            }
> +        }
> +        return H_SUCCESS;
> +    }
> +    return H_CONTINUE; /* positive non-zero value */
> +}
> +

I wish you could do this through a MemoryListener like we do on x86.

>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>                                target_ulong opcode, target_ulong *args)
>  {
> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, 
> sPAPREnvironment *spapr,
>      if (0 >= ret) {
>          return ret ? H_PARAMETER : H_SUCCESS;
>      }
> +    ret = put_tce_vfio(liobn, ioba, tce);
> +    if (0 >= ret) {
> +        return ret ? H_PARAMETER : H_SUCCESS;
> +    }
>  #ifdef DEBUG_TCE
>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 5f89003..3375c3f 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -29,6 +29,7 @@
>  #include "pci_host.h"
>  #include "hw/spapr.h"
>  #include "hw/spapr_pci.h"
> +#include "hw/vfio_pci.h"
>  #include "exec-memory.h"
>  #include <libfdt.h>
>  #include "trace.h"
> @@ -440,6 +441,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, 
> int level)
>                   level);
>  }
>  
> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> +{
> +    sPAPRPHBState *phb = opaque;
> +    return phb->lsi_table[irq_num].dt_irq;
> +}
> +
>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>                                unsigned size)
>  {
> @@ -567,7 +574,8 @@ static int spapr_phb_init(SysBusDevice *s)
>  
>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>                             phb->busname ? phb->busname : phb->dtbusname,
> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> +                           pci_spapr_map_irq, phb,
>                             &phb->memspace, &phb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>      phb->host_state.bus = bus;
> @@ -596,6 +604,7 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> +    DEFINE_PROP_UINT8("vfio", sPAPRPHBState, is_vfio, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -639,6 +648,18 @@ void spapr_create_phb(sPAPREnvironment *spapr,
>  /* Finalize PCI setup, called when all devices are already created */
>  int spapr_finalize_pci_setup(sPAPRPHBState *phb)
>  {
> +    if (phb->is_vfio) {
> +        int fd = vfio_get_container_fd(phb->host_state.bus);
> +
> +        if (fd < 0) {
> +            return fd;
> +        }
> +        spapr_vfio_init_dma(fd, phb->dma_liobn,
> +                            &phb->dma_window_start,
> +                            &phb->dma_window_size);
> +        return 0;
> +    }
> +
>      phb->dma_window_start = 0;
>      phb->dma_window_size = 0x40000000;
>      phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> index 3aae273..a4f031b 100644
> --- a/hw/spapr_pci.h
> +++ b/hw/spapr_pci.h
> @@ -57,6 +57,8 @@ typedef struct sPAPRPHBState {
>          int nvec;
>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>  
> +    uint8_t is_vfio;
> +
>      QLIST_ENTRY(sPAPRPHBState) list;
>  } sPAPRPHBState;
>  
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 1ac287f..cc0b974 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -21,7 +21,6 @@
>  #include <dirent.h>
>  #include <stdio.h>
>  #include <unistd.h>
> -#include <sys/io.h>
>  #include <sys/ioctl.h>
>  #include <sys/mman.h>
>  #include <sys/types.h>
> @@ -44,6 +43,17 @@
>  #include "vfio_pci.h"
>  #include "linux-vfio.h"
>  
> +#ifndef TARGET_PPC64
> +#include <sys/io.h>
> +#define VFIO_IOMMU_EXTENSION    VFIO_X86_IOMMU
> +#else
> +#include "hw/pci_internals.h"
> +#include "hw/xics.h"
> +#include "hw/spapr.h"
> +#define POWERPC_IOMMU           2
> +#define VFIO_IOMMU_EXTENSION    POWERPC_IOMMU
> +#endif
> +

VFIO_IOMMU_EXTENSION never gets used, POWER_IOMMU is redefined below.

>  //#define DEBUG_VFIO
>  #ifdef DEBUG_VFIO
>  #define DPRINTF(fmt, ...) \
> @@ -235,6 +245,7 @@ struct vfio_irq_set_fd {
>  
>  static void vfio_enable_intx_kvm(VFIODevice *vdev)
>  {
> +#ifndef TARGET_PPC64

Why do you need this, aren't the extension checks sufficient for this to
be a nop for you?

>  #ifdef CONFIG_KVM
>      struct vfio_irq_set_fd irq_set_fd = {
>       .irq_set = {
> @@ -298,10 +309,12 @@ fail:
>      qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
>      vfio_unmask_intx(vdev);
>  #endif
> +#endif
>  }
>  
>  static void vfio_disable_intx_kvm(VFIODevice *vdev)
>  {
> +#ifndef TARGET_PPC64

Same

>  #ifdef CONFIG_KVM
>      struct vfio_irq_set_fd irq_set_fd = {
>       .irq_set = {
> @@ -350,8 +363,10 @@ static void vfio_disable_intx_kvm(VFIODevice *vdev)
>      DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", __FUNCTION__,
>              vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
>  #endif
> +#endif
>  }
>  
> +#ifndef TARGET_PPC64
>  static void vfio_update_irq(Notifier *notify, void *data)
>  {
>      VFIODevice *vdev = container_of(notify, VFIODevice, intx.update_irq);
> @@ -381,6 +396,7 @@ static void vfio_update_irq(Notifier *notify, void *data)
>      /* Re-enable the interrupt in cased we missed an EOI */
>      vfio_eoi(&vdev->intx.eoi, NULL);
>  }
> +#endif
>  
>  static int vfio_enable_intx(VFIODevice *vdev)
>  {
> @@ -404,10 +420,14 @@ static int vfio_enable_intx(VFIODevice *vdev)
>      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
>      vdev->intx.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
>      vdev->intx.eoi.notify = vfio_eoi;
> +#ifndef TARGET_PPC64
>      ioapic_add_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);

This is really only a place holder for x86 too, I don't think my eoi
notifier as written is acceptable upstream.  We really need some common
infrastructure here.  I'm hoping to get the kvm acceleration in place
which would make vfio usable on x86 with kvm (the common case), then
work towards a generic eoi notifier.

>  
>      vdev->intx.update_irq.notify = vfio_update_irq;
>      pci_add_irq_update_notifier(&vdev->pdev, &vdev->intx.update_irq);

Can't you stub this out to make it safe to do on POWER too?

> +#else
> +    xics_add_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> +#endif
>  
>      if (event_notifier_init(&vdev->intx.interrupt, 0)) {
>          error_report("vfio: Error: event_notifier_init failed\n");
> @@ -440,8 +460,12 @@ static void vfio_disable_intx(VFIODevice *vdev)
>      vfio_disable_intx_kvm(vdev);
>      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
>  
> +#ifndef TARGET_PPC64
>      pci_remove_irq_update_notifier(&vdev->intx.update_irq);
>      ioapic_remove_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> +#else
> +    xics_remove_eoi_notifier(&vdev->intx.eoi);
> +#endif
>  
>      fd = event_notifier_get_fd(&vdev->intx.interrupt);
>      qemu_set_fd_handler(fd, NULL, NULL, vdev);
> @@ -543,7 +567,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>      }
>  
>      fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> -
> +#ifndef TARGET_PPC64
>      vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, 
> msg);
>      if (vdev->msi_vectors[vector].virq < 0 || 
>          kvm_irqchip_add_irqfd(kvm_state, fd,
> @@ -551,7 +575,11 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>          qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>                              &vdev->msi_vectors[vector]);
>      }
> -
> +#else
> +    vdev->msi_vectors[vector].virq = -1;
> +    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> +                        &vdev->msi_vectors[vector]);
> +#endif

This shouldn't be necessary once the abort is removed from
kvm_irqchip_add_msi_route.  It'll be merged next time the kvm uq tree
merges into qemu.

>      if (vdev->nr_vectors < vector + 1) {
>          int i;
>  
> @@ -692,6 +720,7 @@ retry:
>          fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
>  
>          msg = msi_get_msg(&vdev->pdev, i);
> +#ifndef TARGET_PPC64
>          vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, 
> msg);
>          if (vdev->msi_vectors[i].virq < 0 || 
>              kvm_irqchip_add_irqfd(kvm_state, fd,
> @@ -699,6 +728,12 @@ retry:
>              qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>                                  &vdev->msi_vectors[i]);
>          }
> +#else
> +        vdev->msi_vectors[i].virq = -1;
> +        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> +                            &vdev->msi_vectors[i]);
> +        msg = msg;
> +#endif

Same here

>      }
>      
>      ret = vfio_enable_vectors(vdev, false);
> @@ -1581,6 +1616,25 @@ static int vfio_connect_container(VFIOGroup *group)
>  
>          memory_listener_register(&container->listener, get_system_memory());
>  
> +#define POWERPC_IOMMU           2

Assume this will go in the kernel vfio.h at some point.  You may want to
pick a different name if there's a possibility of other powerpc iommu
implementations... thus the crappy type1 name for x86.

> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> +        if (ret) {
> +            error_report("vfio: failed to set group container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
> +
> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
> +        if (ret) {
> +            error_report("vfio: failed to set iommu for container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
>      } else {
>          error_report("vfio: No available IOMMU models\n");
>          g_free(container);
> @@ -2005,3 +2059,19 @@ static void register_vfio_pci_dev_type(void)
>  }
>  
>  type_init(register_vfio_pci_dev_type)
> +
> +int vfio_get_container_fd(struct PCIBus *pbus)
> +{
> +    BusChild *kid1st = QTAILQ_FIRST(&pbus->qbus.children);
> +    VFIODevice *vdev1st;
> +
> +    if (!kid1st) {
> +        printf("No device registered on PCI bus \"%s\", no DMA enabled\n",
> +               pbus->qbus.name);
> +        return -1;
> +    }
> +    vdev1st = container_of(kid1st->child, VFIODevice, pdev.qdev);
> +
> +    return vdev1st->group->container->fd;
> +}
> +

This is not a generic implementation.  x86 won't have all devices on a
bus be vfio devices and even if it did, there's no guarantee they all
belong to the same container.  This should probably at least take a
PCIDevice and some kind of POWER specific code will need to know that
the container is the same for the whole bus.  Thanks,

Alex

> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> index 226607c..0d13341 100644
> --- a/hw/vfio_pci.h
> +++ b/hw/vfio_pci.h
> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>  
> +int vfio_get_container_fd(struct PCIBus *pbus);
> +
>  #endif /* __VFIO_H__ */






reply via email to

[Prev in Thread] Current Thread [Next in Thread]