qemu-ppc
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-ppc] [PATCH qemu v8 14/14] spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)


From: David Gibson
Subject: Re: [Qemu-ppc] [PATCH qemu v8 14/14] spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)
Date: Tue, 23 Jun 2015 16:38:01 +1000
User-agent: Mutt/1.5.23 (2014-03-12)

On Thu, Jun 18, 2015 at 09:37:36PM +1000, Alexey Kardashevskiy wrote:
> This adds support for the Dynamic DMA Windows (DDW) option defined by
> the SPAPR specification, which allows having additional DMA window(s).
> 
> This implements DDW for emulated and VFIO devices. As all TCE root regions
> are mapped at 0 and 64bit long (and actual tables are child regions),
> this replaces memory_region_add_subregion() with _overlap() to make
> QEMU memory API happy.
> 
> This reserves RTAS token numbers for DDW calls.
> 
> This implements helpers to interact with VFIO kernel interface.
> 
> This changes the TCE table migration descriptor to support dynamic
> tables: from now on, a PHB will create as many stub TCE table objects
> as it can possibly support, but not all of them might be initialized at
> the time of migration because DDW might or might not be requested by
> the guest.
> 
> The "ddw" property is enabled by default on a PHB but for compatibility
> the pseries-2.3 machine and older disable it.
> 
> This implements DDW for VFIO. The host kernel support is required.
> This adds a "levels" property to PHB to control the number of levels
> in the actual TCE table allocated by the host kernel, 0 is the default
> value to tell QEMU to calculate the correct value. Current hardware
> supports up to 5 levels.
> 
> Existing Linux guests try to create one additional huge DMA window
> with 64K or 16MB pages and map the entire guest RAM to it. If this succeeds,
> the guest switches to dma_direct_ops and never calls TCE hypercalls
> (H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
> and not waste time on map/unmap later.
> 
> This adds 4 RTAS handlers:
> * ibm,query-pe-dma-window
> * ibm,create-pe-dma-window
> * ibm,remove-pe-dma-window
> * ibm,reset-pe-dma-window
> These are registered from type_init() callback.
> 
> These RTAS handlers are implemented in a separate file to avoid polluting
> spapr_iommu.c with PCI.
> 
> Signed-off-by: Alexey Kardashevskiy <address@hidden>

> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index c8ab06e..0b2ff6d 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -7,6 +7,9 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o
>  ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>  obj-y += spapr_pci_vfio.o
>  endif
> +ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES), yy)
> +obj-y += spapr_rtas_ddw.o
> +endif
>  # PowerPC 4xx boards
>  obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
>  obj-y += ppc4xx_pci.o
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 5ca817c..d50d50b 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1860,6 +1860,11 @@ static const TypeInfo spapr_machine_info = {
>              .driver   = "spapr-pci-host-bridge",\
>              .property = "dynamic-reconfiguration",\
>              .value    = "off",\
> +        },\
> +        {\
> +            .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
> +            .property = "ddw",\
> +            .value    = stringify(off),\
>          },
>  
>  #define SPAPR_COMPAT_2_2 \
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 5e6bdb4..eaa1943 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -136,6 +136,15 @@ static IOMMUTLBEntry 
> spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr,
>      return ret;
>  }
>  
> +static void spapr_tce_table_pre_save(void *opaque)
> +{
> +    sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
> +
> +    tcet->migtable = tcet->table;
> +}
> +
> +static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, bool vfio_accel);
> +
>  static int spapr_tce_table_post_load(void *opaque, int version_id)
>  {
>      sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
> @@ -144,22 +153,43 @@ static int spapr_tce_table_post_load(void *opaque, int 
> version_id)
>          spapr_vio_set_bypass(tcet->vdev, tcet->bypass);
>      }
>  
> +    if (!tcet->migtable) {
> +        return 0;
> +    }
> +
> +    if (tcet->enabled) {
> +        if (!tcet->table) {
> +            tcet->enabled = false;
> +            /* VFIO does not migrate so pass vfio_accel == false */
> +            spapr_tce_table_do_enable(tcet, false);
> +        }
> +        memcpy(tcet->table, tcet->migtable,
> +               tcet->nb_table * sizeof(tcet->table[0]));
> +        free(tcet->migtable);
> +        tcet->migtable = NULL;
> +    }
> +
>      return 0;
>  }
>  
>  static const VMStateDescription vmstate_spapr_tce_table = {
>      .name = "spapr_iommu",
> -    .version_id = 2,
> +    .version_id = 3,
>      .minimum_version_id = 2,
> +    .pre_save = spapr_tce_table_pre_save,
>      .post_load = spapr_tce_table_post_load,
>      .fields      = (VMStateField []) {
>          /* Sanity check */
>          VMSTATE_UINT32_EQUAL(liobn, sPAPRTCETable),
> -        VMSTATE_UINT32_EQUAL(nb_table, sPAPRTCETable),
>  
>          /* IOMMU state */
> +        VMSTATE_BOOL_V(enabled, sPAPRTCETable, 3),
> +        VMSTATE_UINT64_V(bus_offset, sPAPRTCETable, 3),
> +        VMSTATE_UINT32_V(page_shift, sPAPRTCETable, 3),
> +        VMSTATE_UINT32(nb_table, sPAPRTCETable),
>          VMSTATE_BOOL(bypass, sPAPRTCETable),
> -        VMSTATE_VARRAY_UINT32(table, sPAPRTCETable, nb_table, 0, 
> vmstate_info_uint64, uint64_t),
> +        VMSTATE_VARRAY_UINT32_ALLOC(migtable, sPAPRTCETable, nb_table, 0,
> +                                    vmstate_info_uint64, uint64_t),
>  
>          VMSTATE_END_OF_LIST()
>      },
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 1f980fa..ab2d650 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -719,6 +719,8 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, 
> void *opaque, int devfn)
>  static int spapr_phb_dma_update(Object *child, void *opaque)
>  {
>      int ret = 0;
> +    uint64_t bus_offset = 0;
> +    sPAPRPHBState *sphb = opaque;
>      sPAPRTCETable *tcet = (sPAPRTCETable *)
>          object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
>  
> @@ -726,6 +728,17 @@ static int spapr_phb_dma_update(Object *child, void 
> *opaque)
>          return 0;
>      }
>  
> +    ret = spapr_phb_vfio_dma_init_window(sphb,
> +                                         tcet->page_shift,
> +                                         tcet->nb_table << tcet->page_shift,
> +                                         &bus_offset);
> +    if (ret) {
> +        return ret;
> +    }
> +    if (bus_offset != tcet->bus_offset) {
> +        return -EFAULT;
> +    }
> +
>      if (tcet->fd >= 0) {
>          /*
>           * We got first vfio-pci device on accelerated table.
> @@ -749,6 +762,9 @@ static int 
> spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb)
>  
>      sphb->dma32_window_start = 0;
>      sphb->dma32_window_size = SPAPR_PCI_DMA32_SIZE;
> +    sphb->windows_supported = SPAPR_PCI_DMA_MAX_WINDOWS;
> +    sphb->page_size_mask = (1ULL << 12) | (1ULL << 16) | (1ULL << 24);
> +    sphb->dma64_window_size = pow2ceil(ram_size);

This should probably be maxram_size, so that we're ready for memory
hotplug - and the same applies in some other places too.

>  
>      ret = spapr_phb_vfio_dma_capabilities_update(sphb);
>      sphb->has_vfio = (ret == 0);
> @@ -756,12 +772,31 @@ static int 
> spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb)
>      return 0;
>  }
>  
> -static int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
> -                                     uint32_t liobn, uint32_t page_shift,
> -                                     uint64_t window_size)
> +int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
> +                              uint32_t liobn, uint32_t page_shift,
> +                              uint64_t window_size)
>  {
>      uint64_t bus_offset = sphb->dma32_window_start;
>      sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
> +    int ret;
> +
> +    if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
> +        return -1;
> +    }
> +
> +    if (sphb->ddw_enabled) {
> +        if (sphb->has_vfio) {
> +            ret = spapr_phb_vfio_dma_init_window(sphb,
> +                                                 page_shift, window_size,
> +                                                 &bus_offset);
> +            if (ret) {
> +                return ret;
> +            }
> +        } else if (SPAPR_PCI_DMA_WINDOW_NUM(liobn)) {
> +            /* No VFIO so we choose a huge window address */
> +            bus_offset = SPAPR_PCI_DMA64_START;

Won't this logic break if you hotplug a VFIO device onto a PHB that
previously didn't have any?

> +        }
> +    }
>  
>      spapr_tce_table_enable(tcet, bus_offset, page_shift,
>                             window_size >> page_shift,
> @@ -773,9 +808,14 @@ static int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
>  int spapr_phb_dma_remove_window(sPAPRPHBState *sphb,
>                                  sPAPRTCETable *tcet)
>  {
> +    int ret = 0;
> +
> +    if (sphb->has_vfio && sphb->ddw_enabled) {
> +        ret = spapr_phb_vfio_dma_remove_window(sphb, tcet);
> +    }
>      spapr_tce_table_disable(tcet);
>  
> -    return 0;
> +    return ret;
>  }
>  
>  static int spapr_phb_disable_dma_windows(Object *child, void *opaque)
> @@ -811,7 +851,7 @@ static int spapr_phb_hotplug_dma_sync(sPAPRPHBState *sphb)
>      spapr_phb_dma_capabilities_update(sphb);
>  
>      if (!had_vfio && sphb->has_vfio) {
> -        object_child_foreach(OBJECT(sphb), spapr_phb_dma_update, NULL);
> +        object_child_foreach(OBJECT(sphb), spapr_phb_dma_update, sphb);
>      }
>  
>      return ret;
> @@ -1357,15 +1397,17 @@ static void spapr_phb_realize(DeviceState *dev, Error 
> **errp)
>          }
>      }
>  
> -    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
> -    if (!tcet) {
> -            error_setg(errp, "failed to create TCE table");
> +    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
> +        tcet = spapr_tce_new_table(DEVICE(sphb),
> +                                   SPAPR_PCI_LIOBN(sphb->index, i));
> +        if (!tcet) {
> +            error_setg(errp, "spapr_tce_new_table failed");
>              return;
> +        }
> +        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
> +                                            spapr_tce_get_iommu(tcet), 0);
>      }
>  
> -    memory_region_add_subregion(&sphb->iommu_root, 0,
> -                                spapr_tce_get_iommu(tcet));
> -
>      sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, 
> g_free);
>  }
>  
> @@ -1400,6 +1442,8 @@ static Property spapr_phb_properties[] = {
>                         SPAPR_PCI_IO_WIN_SIZE),
>      DEFINE_PROP_BOOL("dynamic-reconfiguration", sPAPRPHBState, dr_enabled,
>                       true),
> +    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
> +    DEFINE_PROP_UINT8("levels", sPAPRPHBState, levels, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -1580,6 +1624,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      uint32_t interrupt_map_mask[] = {
>          cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
>      uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
> +    uint32_t ddw_applicable[] = {
> +        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
> +        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
> +        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
> +    };
> +    uint32_t ddw_extensions[] = {
> +        cpu_to_be32(1),
> +        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
> +    };
>      sPAPRTCETable *tcet;
>  
>      /* Start populating the FDT */
> @@ -1602,6 +1655,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
>      _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
>  
> +    /* Dynamic DMA window */
> +    if (phb->ddw_enabled) {
> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
> +                         sizeof(ddw_applicable)));
> +        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
> +                         &ddw_extensions, sizeof(ddw_extensions)));
> +    }
> +
>      /* Build the interrupt-map, this must matches what is done
>       * in pci_spapr_map_irq
>       */
> diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
> index 6df9a23..5102c72 100644
> --- a/hw/ppc/spapr_pci_vfio.c
> +++ b/hw/ppc/spapr_pci_vfio.c
> @@ -41,6 +41,86 @@ int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState 
> *sphb)
>      sphb->dma32_window_start = info.dma32_window_start;
>      sphb->dma32_window_size = info.dma32_window_size;
>  
> +    if (sphb->ddw_enabled && (info.flags & VFIO_IOMMU_SPAPR_INFO_DDW)) {
> +        sphb->windows_supported = info.ddw.max_dynamic_windows_supported;
> +        sphb->page_size_mask = info.ddw.pgsizes;
> +        sphb->dma64_window_size = pow2ceil(ram_size);
> +        sphb->max_levels = info.ddw.levels;
> +    } else {
> +        /* If VFIO_IOMMU_INFO_DDW is not set, disable DDW */
> +        sphb->ddw_enabled = false;
> +    }
> +
> +    return ret;
> +}
> +
> +static int spapr_phb_vfio_levels(uint32_t entries)
> +{
> +    unsigned pages = (entries * sizeof(uint64_t)) / getpagesize();
> +    int levels;
> +
> +    if (pages <= 64) {
> +        levels = 1;
> +    } else if (pages <= 64*64) {
> +        levels = 2;
> +    } else if (pages <= 64*64*64) {
> +        levels = 3;
> +    } else {
> +        levels = 4;
> +    }
> +
> +    return levels;
> +}
> +
> +int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb,
> +                                   uint32_t page_shift,
> +                                   uint64_t window_size,
> +                                   uint64_t *bus_offset)
> +{
> +    int ret;
> +    struct vfio_iommu_spapr_tce_create create = {
> +        .argsz = sizeof(create),
> +        .page_shift = page_shift,
> +        .window_size = window_size,
> +        .levels = sphb->levels,
> +        .start_addr = 0,
> +    };
> +
> +    /*
> +     * Dynamic windows are supported, that means that there is no
> +     * pre-created window and we have to create one.
> +     */
> +    if (!create.levels) {
> +        create.levels = spapr_phb_vfio_levels(create.window_size >>
> +                                              page_shift);
> +    }
> +
> +    if (create.levels > sphb->max_levels) {
> +        return -EINVAL;
> +    }
> +
> +    ret = vfio_container_ioctl(&sphb->iommu_as,
> +                               VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> +    if (ret) {
> +        return ret;
> +    }
> +    *bus_offset = create.start_addr;
> +
> +    return 0;
> +}
> +
> +int spapr_phb_vfio_dma_remove_window(sPAPRPHBState *sphb,
> +                                            sPAPRTCETable *tcet)
> +{
> +    struct vfio_iommu_spapr_tce_remove remove = {
> +        .argsz = sizeof(remove),
> +        .start_addr = tcet->bus_offset
> +    };
> +    int ret;
> +
> +    ret = vfio_container_ioctl(&sphb->iommu_as,
> +                               VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> +
>      return ret;
>  }
>  
> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
> new file mode 100644
> index 0000000..7539c6a
> --- /dev/null
> +++ b/hw/ppc/spapr_rtas_ddw.c
> @@ -0,0 +1,300 @@
> +/*
> + * QEMU sPAPR Dynamic DMA windows support
> + *
> + * Copyright (c) 2014 Alexey Kardashevskiy, IBM Corporation.
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License,
> + *  or (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License
> + *  along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/error-report.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/pci-host/spapr.h"
> +#include "trace.h"
> +
> +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
> +{
> +    sPAPRTCETable *tcet;
> +
> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, 
> TYPE_SPAPR_TCE_TABLE);
> +    if (tcet && tcet->enabled) {
> +        ++*(unsigned *)opaque;
> +    }
> +    return 0;
> +}
> +
> +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
> +{
> +    unsigned ret = 0;
> +
> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, 
> &ret);
> +
> +    return ret;
> +}
> +
> +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
> +{
> +    sPAPRTCETable *tcet;
> +
> +    tcet = (sPAPRTCETable *) object_dynamic_cast(child, 
> TYPE_SPAPR_TCE_TABLE);
> +    if (tcet && !tcet->enabled) {
> +        *(uint32_t *)opaque = tcet->liobn;
> +        return 1;
> +    }
> +    return 0;
> +}
> +
> +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
> +{
> +    uint32_t liobn = 0;
> +
> +    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
> +
> +    return liobn;
> +}
> +
> +static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
> +                                 uint64_t page_mask)
> +{
> +    int i, j;
> +    uint32_t mask = 0;
> +    const struct { int shift; uint32_t mask; } masks[] = {
> +        { 12, RTAS_DDW_PGSIZE_4K },
> +        { 16, RTAS_DDW_PGSIZE_64K },
> +        { 24, RTAS_DDW_PGSIZE_16M },
> +        { 25, RTAS_DDW_PGSIZE_32M },
> +        { 26, RTAS_DDW_PGSIZE_64M },
> +        { 27, RTAS_DDW_PGSIZE_128M },
> +        { 28, RTAS_DDW_PGSIZE_256M },
> +        { 34, RTAS_DDW_PGSIZE_16G },
> +    };
> +
> +    for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
> +        for (j = 0; j < ARRAY_SIZE(masks); ++j) {
> +            if ((sps[i].page_shift == masks[j].shift) &&
> +                    (page_mask & (1ULL << masks[j].shift))) {
> +                mask |= masks[j].mask;
> +            }
> +        }
> +    }
> +
> +    return mask;
> +}
> +
> +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
> +                                         sPAPRMachineState *spapr,
> +                                         uint32_t token, uint32_t nargs,
> +                                         target_ulong args,
> +                                         uint32_t nret, target_ulong rets)
> +{
> +    CPUPPCState *env = &cpu->env;
> +    sPAPRPHBState *sphb;
> +    uint64_t buid;
> +    uint32_t avail, addr, pgmask = 0;
> +    unsigned current;
> +
> +    if ((nargs != 3) || (nret != 5)) {
> +        goto param_error_exit;
> +    }
> +
> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> +    addr = rtas_ld(args, 0);
> +    sphb = spapr_pci_find_phb(spapr, buid);
> +    if (!sphb || !sphb->ddw_enabled) {
> +        goto param_error_exit;
> +    }
> +
> +    current = spapr_phb_get_active_win_num(sphb);
> +    avail = (sphb->windows_supported > current) ?
> +            (sphb->windows_supported - current) : 0;
> +
> +    /* Work out supported page masks */
> +    pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask);
> +
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +    rtas_st(rets, 1, avail);
> +
> +    /*
> +     * This is "Largest contiguous block of TCEs allocated specifically
> +     * for (that is, are reserved for) this PE".
> +     * Return the maximum number as all RAM was in 4K pages.
> +     */
> +    rtas_st(rets, 2, sphb->dma64_window_size >> SPAPR_TCE_PAGE_SHIFT);
> +    rtas_st(rets, 3, pgmask);
> +    rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
> +
> +    trace_spapr_iommu_ddw_query(buid, addr, avail, sphb->dma64_window_size,
> +                                pgmask);
> +    return;
> +
> +param_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +}
> +
> +static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
> +                                          sPAPRMachineState *spapr,
> +                                          uint32_t token, uint32_t nargs,
> +                                          target_ulong args,
> +                                          uint32_t nret, target_ulong rets)
> +{
> +    sPAPRPHBState *sphb;
> +    sPAPRTCETable *tcet = NULL;
> +    uint32_t addr, page_shift, window_shift, liobn;
> +    uint64_t buid;
> +    long ret;
> +
> +    if ((nargs != 5) || (nret != 4)) {
> +        goto param_error_exit;
> +    }
> +
> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> +    addr = rtas_ld(args, 0);
> +    sphb = spapr_pci_find_phb(spapr, buid);
> +    if (!sphb || !sphb->ddw_enabled) {
> +        goto param_error_exit;
> +    }
> +
> +    page_shift = rtas_ld(args, 3);
> +    window_shift = rtas_ld(args, 4);
> +    liobn = spapr_phb_get_free_liobn(sphb);
> +
> +    if (!liobn || !(sphb->page_size_mask & (1ULL << page_shift))) {
> +        goto hw_error_exit;
> +    }
> +
> +    ret = spapr_phb_dma_init_window(sphb, liobn, page_shift,
> +                                    1ULL << window_shift);
> +    tcet = spapr_tce_find_by_liobn(liobn);
> +    trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift,
> +                                 1ULL << window_shift,
> +                                 tcet ? tcet->bus_offset : 0xbaadf00d,
> +                                 liobn, ret);
> +    if (ret || !tcet) {
> +        goto hw_error_exit;
> +    }
> +
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +    rtas_st(rets, 1, liobn);
> +    rtas_st(rets, 2, tcet->bus_offset >> 32);
> +    rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1));
> +
> +    return;
> +
> +hw_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
> +    return;
> +
> +param_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +}
> +
> +static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu,
> +                                          sPAPRMachineState *spapr,
> +                                          uint32_t token, uint32_t nargs,
> +                                          target_ulong args,
> +                                          uint32_t nret, target_ulong rets)
> +{
> +    sPAPRPHBState *sphb;
> +    sPAPRTCETable *tcet;
> +    uint32_t liobn;
> +    long ret;
> +
> +    if ((nargs != 1) || (nret != 1)) {
> +        goto param_error_exit;
> +    }
> +
> +    liobn = rtas_ld(args, 0);
> +    tcet = spapr_tce_find_by_liobn(liobn);
> +    if (!tcet) {
> +        goto param_error_exit;
> +    }
> +
> +    sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent);
> +    if (!sphb || !sphb->ddw_enabled) {
> +        goto param_error_exit;
> +    }
> +
> +    ret = spapr_phb_dma_remove_window(sphb, tcet);
> +    trace_spapr_iommu_ddw_remove(liobn, ret);
> +    if (ret) {
> +        goto hw_error_exit;
> +    }
> +
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +    return;
> +
> +hw_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
> +    return;
> +
> +param_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +}
> +
> +static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu,
> +                                         sPAPRMachineState *spapr,
> +                                         uint32_t token, uint32_t nargs,
> +                                         target_ulong args,
> +                                         uint32_t nret, target_ulong rets)
> +{
> +    sPAPRPHBState *sphb;
> +    uint64_t buid;
> +    uint32_t addr;
> +    long ret;
> +
> +    if ((nargs != 3) || (nret != 1)) {
> +        goto param_error_exit;
> +    }
> +
> +    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
> +    addr = rtas_ld(args, 0);
> +    sphb = spapr_pci_find_phb(spapr, buid);
> +    if (!sphb || !sphb->ddw_enabled) {
> +        goto param_error_exit;
> +    }
> +
> +    ret = spapr_phb_dma_reset(sphb);
> +    trace_spapr_iommu_ddw_reset(buid, addr, ret);
> +    if (ret) {
> +        goto hw_error_exit;
> +    }
> +
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +
> +    return;
> +
> +hw_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
> +    return;
> +
> +param_error_exit:
> +    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +}
> +
> +static void spapr_rtas_ddw_init(void)
> +{
> +    spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
> +                        "ibm,query-pe-dma-window",
> +                        rtas_ibm_query_pe_dma_window);
> +    spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
> +                        "ibm,create-pe-dma-window",
> +                        rtas_ibm_create_pe_dma_window);
> +    spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
> +                        "ibm,remove-pe-dma-window",
> +                        rtas_ibm_remove_pe_dma_window);
> +    spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
> +                        "ibm,reset-pe-dma-window",
> +                        rtas_ibm_reset_pe_dma_window);
> +}
> +
> +type_init(spapr_rtas_ddw_init)
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 9e3e0b0..f915127 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -830,6 +830,8 @@ int vfio_container_ioctl(AddressSpace *as,
>      case VFIO_CHECK_EXTENSION:
>      case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
>      case VFIO_EEH_PE_OP:
> +    case VFIO_IOMMU_SPAPR_TCE_CREATE:
> +    case VFIO_IOMMU_SPAPR_TCE_REMOVE:
>          break;
>      default:
>          /* Return an error on unknown requests */
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index b2a8fc3..1313805 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -88,6 +88,12 @@ struct sPAPRPHBState {
>      uint32_t dma32_window_size;
>      bool has_vfio;
>      int32_t iommugroupid; /* obsolete */
> +    bool ddw_enabled;
> +    uint32_t windows_supported;
> +    uint64_t page_size_mask;
> +    uint64_t dma64_window_size;
> +    uint8_t max_levels;
> +    uint8_t levels;
>  
>      QLIST_ENTRY(sPAPRPHBState) list;
>  };
> @@ -110,6 +116,12 @@ struct sPAPRPHBState {
>  
>  #define SPAPR_PCI_DMA32_SIZE         0x40000000
>  
> +/* Default 64bit dynamic window offset */
> +#define SPAPR_PCI_DMA64_START        0x8000000000000000ULL
> +
> +/* Maximum allowed number of DMA windows for emulated PHB */
> +#define SPAPR_PCI_DMA_MAX_WINDOWS    2
> +
>  static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin)
>  {
>      sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> @@ -130,11 +142,20 @@ void spapr_pci_rtas_init(void);
>  sPAPRPHBState *spapr_pci_find_phb(sPAPRMachineState *spapr, uint64_t buid);
>  PCIDevice *spapr_pci_find_dev(sPAPRMachineState *spapr, uint64_t buid,
>                                uint32_t config_addr);
> +int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
> +                              uint32_t liobn, uint32_t page_shift,
> +                              uint64_t window_size);
>  int spapr_phb_dma_remove_window(sPAPRPHBState *sphb,
>                                  sPAPRTCETable *tcet);
>  int spapr_phb_dma_reset(sPAPRPHBState *sphb);
>  
>  int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb);
> +int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb,
> +                                   uint32_t page_shift,
> +                                   uint64_t window_size,
> +                                   uint64_t *bus_offset);
> +int spapr_phb_vfio_dma_remove_window(sPAPRPHBState *sphb,
> +                                     sPAPRTCETable *tcet);
>  int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
>                                    PCIDevice *pdev, int option);
>  int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state);
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 4645f16..5a58785 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -416,6 +416,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool 
> msi);
>  #define RTAS_OUT_NOT_SUPPORTED      -3
>  #define RTAS_OUT_NOT_AUTHORIZED     -9002
>  
> +/* DDW pagesize mask values from ibm,query-pe-dma-window */
> +#define RTAS_DDW_PGSIZE_4K       0x01
> +#define RTAS_DDW_PGSIZE_64K      0x02
> +#define RTAS_DDW_PGSIZE_16M      0x04
> +#define RTAS_DDW_PGSIZE_32M      0x08
> +#define RTAS_DDW_PGSIZE_64M      0x10
> +#define RTAS_DDW_PGSIZE_128M     0x20
> +#define RTAS_DDW_PGSIZE_256M     0x40
> +#define RTAS_DDW_PGSIZE_16G      0x80
> +
>  /* RTAS tokens */
>  #define RTAS_TOKEN_BASE      0x2000
>  
> @@ -457,8 +467,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool 
> msi);
>  #define RTAS_IBM_SET_SLOT_RESET                 (RTAS_TOKEN_BASE + 0x23)
>  #define RTAS_IBM_CONFIGURE_PE                   (RTAS_TOKEN_BASE + 0x24)
>  #define RTAS_IBM_SLOT_ERROR_DETAIL              (RTAS_TOKEN_BASE + 0x25)
> +#define RTAS_IBM_QUERY_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x26)
> +#define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
> +#define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
> +#define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
>  
> -#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x26)
> +#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
>  
>  /* RTAS ibm,get-system-parameter token values */
>  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
> @@ -558,6 +572,7 @@ struct sPAPRTCETable {
>      uint64_t bus_offset;
>      uint32_t page_shift;
>      uint64_t *table;
> +    uint64_t *migtable;
>      bool bypass;
>      int fd;
>      MemoryRegion root, iommu;
> diff --git a/trace-events b/trace-events
> index 3d1aeea..edd3164 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1365,6 +1365,10 @@ spapr_iommu_pci_indirect(uint64_t liobn, uint64_t 
> ioba, uint64_t tce, uint64_t i
>  spapr_iommu_pci_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce_value, 
> uint64_t npages, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" 
> tcevalue=0x%"PRIx64" npages=%"PRId64" ret=%"PRId64
>  spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned 
> perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u 
> mask=%x"
>  spapr_iommu_alloc_table(uint64_t liobn, void *table, int fd) 
> "liobn=%"PRIx64" table=%p fd=%d"
> +spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t 
> win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows 
> available, max window size=%"PRIx64", mask=%"PRIx32
> +spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, unsigned long long 
> pg_size, unsigned long long req_size, uint64_t start, uint32_t liobn, long 
> ret) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%llx, requested=0x%llx, 
> start addr=%"PRIx64", liobn=%"PRIx32", ret = %ld"
> +spapr_iommu_ddw_remove(uint32_t liobn, long ret) "liobn=%"PRIx32", ret = %ld"
> +spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr, long ret) 
> "buid=%"PRIx64" addr=%"PRIx32", ret = %ld"
>  
>  # hw/ppc/ppc.c
>  ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) 
> "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: pgpS7UodKAx7W.pgp
Description: PGP signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]