Re: [Qemu-devel] [PATCH qemu v16 19/19] spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)


From: Alexey Kardashevskiy
Subject: Re: [Qemu-devel] [PATCH qemu v16 19/19] spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)
Date: Mon, 16 May 2016 16:25:53 +1000
User-agent: Mozilla/5.0 (X11; Linux i686 on x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.0

On 05/13/2016 06:41 PM, Bharata B Rao wrote:
On Wed, May 4, 2016 at 12:22 PM, Alexey Kardashevskiy <address@hidden> wrote:
This adds support for the Dynamic DMA Windows (DDW) option defined by
the SPAPR specification, which allows having additional DMA window(s).

The "ddw" property is enabled by default on a PHB but for compatibility
the pseries-2.5 machine (TODO: update version) and older disable it.
This also creates a single DMA window for the older machines to
maintain backward migration.

This implements DDW for PHBs with emulated and VFIO devices. Host
kernel support is required. The advertised IOMMU page sizes are 4K and
64K; 16M pages are supported but not advertised by default. To enable
them, the user has to specify the "pgsz" property for the PHB and
enable huge pages for RAM.
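
(For illustration only, not part of the patch: the "pgsz" value is a bitmask
of supported IOMMU page sizes, with bit N set for a 2^N-byte page, so a
hypothetical mask that also advertises 16M pages would be:)

    /* default is 4K | 64K; adding 16M (bit 24) is an assumed example */
    uint64_t page_size_mask = (1ULL << 12) | (1ULL << 16) | (1ULL << 24);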

Existing Linux guests try to create one additional huge DMA window
with 64K or 16MB pages and map the entire guest RAM to it. If this
succeeds, the guest switches to dma_direct_ops and never calls TCE
hypercalls (H_PUT_TCE, ...) again. This enables VFIO devices to use the
entire RAM and not waste time on map/unmap later. This adds a
"dma64_win_addr" property which is the bus address of the 64bit window;
it defaults to 0x800.0000.0000.0000 as this is what modern POWER8
hardware uses, and it allows having emulated and VFIO devices on the
same bus.
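
(A sketch of the guest-side sequence described above; the call names are the
ones used in this series and LoPAPR, the numbering is only illustrative.)

    /* 1. ibm,query-pe-dma-window  -> available windows, max size, page sizes
     * 2. ibm,create-pe-dma-window -> a second, huge 64bit window
     * 3. H_PUT_TCE (or similar)   -> map all of guest RAM into it once
     * 4. switch to dma_direct_ops -> no further per-request TCE hypercalls */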

This adds 4 RTAS handlers:
* ibm,query-pe-dma-window
* ibm,create-pe-dma-window
* ibm,remove-pe-dma-window
* ibm,reset-pe-dma-window
These are registered from the type_init() callback.

These RTAS handlers are implemented in a separate file to avoid polluting
spapr_iommu.c with PCI.
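
(A minimal sketch, not quoted from the patch, of what that registration could
look like; the RTAS token names appear in this series, while the function body
and the remove/reset handler names are assumptions.)

    static void spapr_rtas_ddw_register_types(void)
    {
        spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
                            "ibm,query-pe-dma-window",
                            rtas_ibm_query_pe_dma_window);
        spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
                            "ibm,create-pe-dma-window",
                            rtas_ibm_create_pe_dma_window);
        spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
                            "ibm,remove-pe-dma-window",
                            rtas_ibm_remove_pe_dma_window);
        spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
                            "ibm,reset-pe-dma-window",
                            rtas_ibm_reset_pe_dma_window);
    }

    type_init(spapr_rtas_ddw_register_types)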

This changes sPAPRPHBState::dma_liobn to an array to allow 2 LIOBNs.

Signed-off-by: Alexey Kardashevskiy <address@hidden>
---
Changes:
v16:
* s/dma_liobn/dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS]/
* s/SPAPR_PCI_LIOBN()/dma_liobn[]/

v15:
* moved page mask filtering to PHB realize(), use "-mempath" to know
if there are huge pages
* fixed error reporting in RTAS handlers
* max window size accounts now hotpluggable memory boundaries
---
 hw/ppc/Makefile.objs        |   1 +
 hw/ppc/spapr.c              |   5 +
 hw/ppc/spapr_pci.c          |  75 +++++++++---
 hw/ppc/spapr_rtas_ddw.c     | 292 ++++++++++++++++++++++++++++++++++++++++++++
 include/hw/pci-host/spapr.h |   8 +-
 include/hw/ppc/spapr.h      |  16 ++-
 trace-events                |   4 +
 7 files changed, 381 insertions(+), 20 deletions(-)
 create mode 100644 hw/ppc/spapr_rtas_ddw.c

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index c1ffc77..986b36f 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -7,6 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
 ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
 obj-y += spapr_pci_vfio.o
 endif
+obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
 # PowerPC 4xx boards
 obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
 obj-y += ppc4xx_pci.o
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index b69995e..0206609 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2365,6 +2365,11 @@ DEFINE_SPAPR_MACHINE(2_6, "2.6", true);
         .driver   = "spapr-vlan", \
         .property = "use-rx-buffer-pools", \
         .value    = "off", \
+    }, \
+    {\
+        .driver   = TYPE_SPAPR_PCI_HOST_BRIDGE,\
+        .property = "ddw",\
+        .value    = stringify(off),\
     },

 static void spapr_machine_2_5_instance_options(MachineState *machine)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 51e7d56..aa414f2 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -35,6 +35,7 @@
 #include "hw/ppc/spapr.h"
 #include "hw/pci-host/spapr.h"
 #include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
 #include <libfdt.h>
 #include "trace.h"
 #include "qemu/error-report.h"
@@ -44,6 +45,7 @@
 #include "hw/pci/pci_bus.h"
 #include "hw/ppc/spapr_drc.h"
 #include "sysemu/device_tree.h"
+#include "sysemu/hostmem.h"

 #include "hw/vfio/vfio.h"

@@ -1305,11 +1307,14 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
     PCIBus *bus;
     uint64_t msi_window_size = 4096;
     sPAPRTCETable *tcet;
+    const unsigned windows_supported =
+        sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1;

     if (sphb->index != (uint32_t)-1) {
         hwaddr windows_base;

-        if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn != (uint32_t)-1)
+        if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn[0] != (uint32_t)-1)
+            || ((sphb->dma_liobn[1] != (uint32_t)-1) && (windows_supported > 1))
             || (sphb->mem_win_addr != (hwaddr)-1)
             || (sphb->io_win_addr != (hwaddr)-1)) {
             error_setg(errp, "Either \"index\" or other parameters must"
@@ -1324,7 +1329,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
         }

         sphb->buid = SPAPR_PCI_BASE_BUID + sphb->index;
-        sphb->dma_liobn = SPAPR_PCI_LIOBN(sphb->index, 0);
+        for (i = 0; i < windows_supported; ++i) {
+            sphb->dma_liobn[i] = SPAPR_PCI_LIOBN(sphb->index, i);
+        }

         windows_base = SPAPR_PCI_WINDOW_BASE
             + sphb->index * SPAPR_PCI_WINDOW_SPACING;
@@ -1337,8 +1344,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
         return;
     }

-    if (sphb->dma_liobn == (uint32_t)-1) {
-        error_setg(errp, "LIOBN not specified for PHB");
+    if ((sphb->dma_liobn[0] == (uint32_t)-1) ||
+        ((sphb->dma_liobn[1] == (uint32_t)-1) && (windows_supported > 1))) {
+        error_setg(errp, "LIOBN(s) not specified for PHB");
         return;
     }

@@ -1456,16 +1464,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
         }
     }

-    tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
-    if (!tcet) {
-        error_setg(errp, "Unable to create TCE table for %s",
-                   sphb->dtbusname);
-        return;
+    /* DMA setup */
+    for (i = 0; i < windows_supported; ++i) {
+        tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn[i]);
+        if (!tcet) {
+            error_setg(errp, "Creating window#%d failed for %s",
+                       i, sphb->dtbusname);
+            return;
+        }
+        memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
+                                            spapr_tce_get_iommu(tcet), 0);
     }

-    memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
-                                        spapr_tce_get_iommu(tcet), 0);
-
     sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
 }

@@ -1482,13 +1492,19 @@ static int spapr_phb_children_reset(Object *child, void *opaque)

 void spapr_phb_dma_reset(sPAPRPHBState *sphb)
 {
-    sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
+    int i;
+    sPAPRTCETable *tcet;

-    if (tcet && tcet->enabled) {
-        spapr_tce_table_disable(tcet);
+    for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+        tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[i]);
+
+        if (tcet && tcet->enabled) {
+            spapr_tce_table_disable(tcet);
+        }
     }

     /* Register default 32bit DMA window */
+    tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[0]);
     spapr_tce_table_enable(tcet, SPAPR_TCE_PAGE_SHIFT, sphb->dma_win_addr,
                            sphb->dma_win_size >> SPAPR_TCE_PAGE_SHIFT);
 }
@@ -1510,7 +1526,8 @@ static void spapr_phb_reset(DeviceState *qdev)
 static Property spapr_phb_properties[] = {
     DEFINE_PROP_UINT32("index", sPAPRPHBState, index, -1),
     DEFINE_PROP_UINT64("buid", sPAPRPHBState, buid, -1),
-    DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn, -1),
+    DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn[0], -1),
+    DEFINE_PROP_UINT32("liobn64", sPAPRPHBState, dma_liobn[1], -1),
     DEFINE_PROP_UINT64("mem_win_addr", sPAPRPHBState, mem_win_addr, -1),
     DEFINE_PROP_UINT64("mem_win_size", sPAPRPHBState, mem_win_size,
                        SPAPR_PCI_MMIO_WIN_SIZE),
@@ -1522,6 +1539,11 @@ static Property spapr_phb_properties[] = {
     /* Default DMA window is 0..1GB */
     DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
     DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 
0x40000000),
+    DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_addr,
+                       0x800000000000000ULL),
+    DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
+    DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
+                       (1ULL << 12) | (1ULL << 16)),
     DEFINE_PROP_END_OF_LIST(),
 };

@@ -1598,7 +1620,7 @@ static const VMStateDescription vmstate_spapr_pci = {
     .post_load = spapr_pci_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64_EQUAL(buid, sPAPRPHBState),
-        VMSTATE_UINT32_EQUAL(dma_liobn, sPAPRPHBState),
+        VMSTATE_UNUSED(4), /* dma_liobn */
         VMSTATE_UINT64_EQUAL(mem_win_addr, sPAPRPHBState),
         VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState),
         VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState),
@@ -1775,6 +1797,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
     uint32_t interrupt_map_mask[] = {
         cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
     uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
+    uint32_t ddw_applicable[] = {
+        cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
+        cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
+        cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
+    };
+    uint32_t ddw_extensions[] = {
+        cpu_to_be32(1),
+        cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
+    };
     sPAPRTCETable *tcet;
     PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
     sPAPRFDT s_fdt;
@@ -1799,6 +1830,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
     _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
     _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));

+    /* Dynamic DMA window */
+    if (phb->ddw_enabled) {
+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
+                         sizeof(ddw_applicable)));
+        _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
+                         &ddw_extensions, sizeof(ddw_extensions)));
+    }
+
     /* Build the interrupt-map, this must matches what is done
      * in pci_spapr_map_irq
      */
@@ -1822,7 +1861,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
     _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
                      sizeof(interrupt_map)));

-    tcet = spapr_tce_find_by_liobn(phb->dma_liobn);
+    tcet = spapr_tce_find_by_liobn(phb->dma_liobn[0]);
     if (!tcet) {
         return -1;
     }
diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
new file mode 100644
index 0000000..b4e0686
--- /dev/null
+++ b/hw/ppc/spapr_rtas_ddw.c
@@ -0,0 +1,292 @@
+/*
+ * QEMU sPAPR Dynamic DMA windows support
+ *
+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License,
+ *  or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "hw/ppc/spapr.h"
+#include "hw/pci-host/spapr.h"
+#include "trace.h"
+
+static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
+{
+    sPAPRTCETable *tcet;
+
+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+    if (tcet && tcet->enabled) {
+        ++*(unsigned *)opaque;
+    }
+    return 0;
+}
+
+static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
+{
+    unsigned ret = 0;
+
+    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
+
+    return ret;
+}
+
+static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
+{
+    sPAPRTCETable *tcet;
+
+    tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+    if (tcet && !tcet->enabled) {
+        *(uint32_t *)opaque = tcet->liobn;
+        return 1;
+    }
+    return 0;
+}
+
+static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
+{
+    uint32_t liobn = 0;
+
+    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
+
+    return liobn;
+}
+
+static uint32_t spapr_page_mask_to_query_mask(uint64_t page_mask)
+{
+    int i;
+    uint32_t mask = 0;
+    const struct { int shift; uint32_t mask; } masks[] = {
+        { 12, RTAS_DDW_PGSIZE_4K },
+        { 16, RTAS_DDW_PGSIZE_64K },
+        { 24, RTAS_DDW_PGSIZE_16M },
+        { 25, RTAS_DDW_PGSIZE_32M },
+        { 26, RTAS_DDW_PGSIZE_64M },
+        { 27, RTAS_DDW_PGSIZE_128M },
+        { 28, RTAS_DDW_PGSIZE_256M },
+        { 34, RTAS_DDW_PGSIZE_16G },
+    };
+
+    for (i = 0; i < ARRAY_SIZE(masks); ++i) {
+        if (page_mask & (1ULL << masks[i].shift)) {
+            mask |= masks[i].mask;
+        }
+    }
+
+    return mask;
+}
+
+static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
+                                         sPAPRMachineState *spapr,
+                                         uint32_t token, uint32_t nargs,
+                                         target_ulong args,
+                                         uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    uint64_t buid, max_window_size;
+    uint32_t avail, addr, pgmask = 0;
+    MachineState *machine = MACHINE(spapr);
+
+    if ((nargs != 3) || (nret != 5)) {
+        goto param_error_exit;
+    }
+
+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    addr = rtas_ld(args, 0);
+    sphb = spapr_pci_find_phb(spapr, buid);
+    if (!sphb || !sphb->ddw_enabled) {
+        goto param_error_exit;
+    }
+
+    /* Translate page mask to LoPAPR format */
+    pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask);
+
+    /*
+     * This is "Largest contiguous block of TCEs allocated specifically
+     * for (that is, are reserved for) this PE".
+     * Return the maximum number as maximum supported RAM size was in 4K pages.
+     */
+    if (machine->ram_size == machine->maxram_size) {
+        max_window_size = machine->ram_size >> SPAPR_TCE_PAGE_SHIFT;
+    } else {
+        MemoryHotplugState *hpms = &spapr->hotplug_memory;
+
+        max_window_size = hpms->base + memory_region_size(&hpms->mr);
+    }

I guess the SPAPR_TCE_PAGE_SHIFT right shift should be applied to
max_window_size in both instances (if and else)?

Yes it should. Thanks for noticing.
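
(For clarity, a sketch of what the agreed fix would presumably look like in
the hotplug-memory branch, assuming the shift is simply applied there too:)

    max_window_size = (hpms->base + memory_region_size(&hpms->mr))
                      >> SPAPR_TCE_PAGE_SHIFT;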



+
+    avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb);
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    rtas_st(rets, 1, avail);
+    rtas_st(rets, 2, max_window_size);
+    rtas_st(rets, 3, pgmask);
+    rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
+
+    trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
+    return;
+
+param_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
+                                          sPAPRMachineState *spapr,
+                                          uint32_t token, uint32_t nargs,
+                                          target_ulong args,
+                                          uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    sPAPRTCETable *tcet = NULL;
+    uint32_t addr, page_shift, window_shift, liobn;
+    uint64_t buid;
+
+    if ((nargs != 5) || (nret != 4)) {
+        goto param_error_exit;
+    }
+
+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    addr = rtas_ld(args, 0);
+    sphb = spapr_pci_find_phb(spapr, buid);
+    if (!sphb || !sphb->ddw_enabled) {
+        goto param_error_exit;
+    }
+
+    page_shift = rtas_ld(args, 3);
+    window_shift = rtas_ld(args, 4);

The guest kernel has a bug due to which a wrong window_shift gets passed
here. I have posted a possible fix here:
https://patchwork.ozlabs.org/patch/621497/

I have tried to work around this issue in QEMU too
https://lists.nongnu.org/archive/html/qemu-ppc/2016-04/msg00226.html

But the above workaround involves changing the memory representation
in the DT.

What is wrong with this workaround?


Hence I feel that until the guest kernel changes are available, a
simpler workaround would be to discard the window_shift value above
and recalculate the right value as below:

if (machine->ram_size == machine->maxram_size) {
    max_window_size = machine->ram_size;
} else {
    MemoryHotplugState *hpms = &spapr->hotplug_memory;
    max_window_size = hpms->base + memory_region_size(&hpms->mr);
}
window_shift = max_window_size >> SPAPR_TCE_PAGE_SHIFT;

and create the DDW based on this calculated window_shift value. Does that
sound reasonable?

The workaround should only do that for the second window, at least, or for the default one but with a page size of at least 64K; otherwise it is going to be a waste of memory (2MB for each 1GB of guest RAM).
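
(Rough arithmetic behind that figure, assuming 4K TCE pages and 8 bytes per
TCE entry:)

    uint64_t tce_table_bytes = ((1ULL << 30) >> 12) * 8; /* 2MB of TCE table per 1GB of RAM */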



--
Alexey


