qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH RFC 1/6] i386/pc: Account IOVA reserved ranges above 4G boundary


From: Joao Martins
Subject: [PATCH RFC 1/6] i386/pc: Account IOVA reserved ranges above 4G boundary
Date: Tue, 22 Jun 2021 16:49:00 +0100

It is assumed that the whole GPA space is available to be
DMA addressable, within a given address space limit. Since
v5.4 based that is not true, and VFIO will validate whether
the selected IOVA is indeed valid i.e. not reserved by IOMMU
on behalf of some specific devices or platform-defined.

AMD systems with an IOMMU are examples of such platforms and
particularly may export only these ranges as allowed:

        0000000000000000 - 00000000fedfffff (0      .. 3.982G)
        00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G)
        0000010000000000 - ffffffffffffffff (1Tb    .. 16Pb)

We already know of accounting for the 4G hole, albeit if the
guest is big enough we will fail to allocate a >1010G given
the ~12G hole at the 1Tb boundary, reserved for HyperTransport.

When creating the region above 4G, take into account what
IOVAs are allowed by defining the known allowed ranges
and search for the next free IOVA ranges. When finding a
invalid IOVA we mark them as reserved and proceed to the
next allowed IOVA region.

After accounting for the 1Tb hole on AMD hosts, mtree should
look like:

0000000100000000-000000fcffffffff (prio 0, i/o):
        alias ram-above-4g @pc.ram 0000000080000000-000000fc7fffffff
0000010000000000-000001037fffffff (prio 0, i/o):
        alias ram-above-1t @pc.ram 000000fc80000000-000000ffffffffff

Co-developed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 hw/i386/pc.c         | 103 +++++++++++++++++++++++++++++++++++++++----
 include/hw/i386/pc.h |  57 ++++++++++++++++++++++++
 target/i386/cpu.h    |   3 ++
 3 files changed, 154 insertions(+), 9 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index c6d8d0d84d91..52a5473ba846 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -91,6 +91,7 @@
 #include "qapi/qmp/qerror.h"
 #include "e820_memory_layout.h"
 #include "fw_cfg.h"
+#include "target/i386/cpu.h"
 #include "trace.h"
 #include CONFIG_DEVICES
 
@@ -860,6 +861,93 @@ void xen_load_linux(PCMachineState *pcms)
     x86ms->fw_cfg = fw_cfg;
 }
 
+struct GPARange usable_iova_ranges[] = {
+    { .start = 4 * GiB, .end = UINT64_MAX, .name = "ram-above-4g" },
+
+/*
+ * AMD systems with an IOMMU have an additional hole close to the
+ * 1Tb, which are special GPAs that cannot be DMA mapped. Depending
+ * on kernel version, VFIO may or may not let you DMA map those ranges.
+ * Starting v5.4 we validate it, and can't create guests on AMD machines
+ * with certain memory sizes. The range is:
+ *
+ * FD_0000_0000h - FF_FFFF_FFFFh
+ *
+ * The ranges represent the following:
+ *
+ * Base Address   Top Address  Use
+ *
+ * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space
+ * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl
+ * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK
+ * FD_F910_0000h FD_F91F_FFFFh System Management
+ * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables
+ * FD_FB00_0000h FD_FBFF_FFFFh Address Translation
+ * FD_FC00_0000h FD_FDFF_FFFFh I/O Space
+ * FD_FE00_0000h FD_FFFF_FFFFh Configuration
+ * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages
+ * FE_2000_0000h FF_FFFF_FFFFh Reserved
+ *
+ * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology",
+ * Table 3: Special Address Controls (GPA) for more information.
+ */
+#define DEFAULT_NR_USABLE_IOVAS 1
+#define AMD_MAX_PHYSADDR_BELOW_1TB  0xfcffffffff
+    { .start = 1 * TiB, .end = UINT64_MAX, .name = "ram-above-1t" },
+};
+
+ uint32_t nb_iova_ranges = DEFAULT_NR_USABLE_IOVAS;
+
+static void init_usable_iova_ranges(void)
+{
+    uint32_t eax, vendor[3];
+
+    host_cpuid(0x0, 0, &eax, &vendor[0], &vendor[2], &vendor[1]);
+    if (IS_AMD_VENDOR(vendor)) {
+        usable_iova_ranges[0].end = AMD_MAX_PHYSADDR_BELOW_1TB;
+        nb_iova_ranges++;
+    }
+}
+
+static void add_memory_region(MemoryRegion *system_memory, MemoryRegion *ram,
+                                hwaddr base, hwaddr size, hwaddr offset)
+{
+    hwaddr start, region_size, resv_start, resv_end;
+    struct GPARange *range;
+    MemoryRegion *region;
+    uint32_t index;
+
+    for_each_usable_range(index, base, size, range, start, region_size) {
+        region = g_malloc(sizeof(*region));
+        memory_region_init_alias(region, NULL, range->name, ram,
+                                 offset, region_size);
+        memory_region_add_subregion(system_memory, start, region);
+        e820_add_entry(start, region_size, E820_RAM);
+
+        assert(size >= region_size);
+        if (size == region_size) {
+            return;
+        }
+
+        /*
+         * There's memory left to create a region for, so there should be
+         * another valid IOVA range left.  Creating the reserved region
+         * would also be pointless.
+         */
+        if (index + 1 == nb_iova_ranges) {
+            return;
+        }
+
+        resv_start = start + region_size;
+        resv_end = usable_iova_ranges[index + 1].start;
+
+        /* Create a reserved region in the IOVA hole. */
+        e820_add_entry(resv_start, resv_end - resv_start, E820_RESERVED);
+
+        offset += region_size;
+    }
+}
+
 void pc_memory_init(PCMachineState *pcms,
                     MemoryRegion *system_memory,
                     MemoryRegion *rom_memory,
@@ -867,7 +955,7 @@ void pc_memory_init(PCMachineState *pcms,
 {
     int linux_boot, i;
     MemoryRegion *option_rom_mr;
-    MemoryRegion *ram_below_4g, *ram_above_4g;
+    MemoryRegion *ram_below_4g;
     FWCfgState *fw_cfg;
     MachineState *machine = MACHINE(pcms);
     MachineClass *mc = MACHINE_GET_CLASS(machine);
@@ -877,6 +965,8 @@ void pc_memory_init(PCMachineState *pcms,
     assert(machine->ram_size == x86ms->below_4g_mem_size +
                                 x86ms->above_4g_mem_size);
 
+    init_usable_iova_ranges();
+
     linux_boot = (machine->kernel_filename != NULL);
 
     /*
@@ -888,16 +978,11 @@ void pc_memory_init(PCMachineState *pcms,
     memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
                              0, x86ms->below_4g_mem_size);
     memory_region_add_subregion(system_memory, 0, ram_below_4g);
+
     e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
     if (x86ms->above_4g_mem_size > 0) {
-        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
-        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
-                                 machine->ram,
-                                 x86ms->below_4g_mem_size,
-                                 x86ms->above_4g_mem_size);
-        memory_region_add_subregion(system_memory, 0x100000000ULL,
-                                    ram_above_4g);
-        e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM);
+        add_memory_region(system_memory, machine->ram, 4 * GiB,
+                          x86ms->above_4g_mem_size, x86ms->below_4g_mem_size);
     }
 
     if (!pcmc->has_reserved_memory &&
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 1522a3359a93..73b8e2900c72 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -151,6 +151,63 @@ void pc_guest_info_init(PCMachineState *pcms);
 #define PCI_HOST_BELOW_4G_MEM_SIZE     "below-4g-mem-size"
 #define PCI_HOST_ABOVE_4G_MEM_SIZE     "above-4g-mem-size"
 
+struct GPARange {
+    uint64_t start;
+    uint64_t end;
+#define range_len(r) ((r).end - (r).start + 1)
+
+    const char *name;
+};
+
+extern uint32_t nb_iova_ranges;
+extern struct GPARange usable_iova_ranges[];
+
+static inline void next_iova_range_index(uint32_t i, hwaddr base, hwaddr size,
+                                         hwaddr *start, hwaddr *region_size,
+                                         uint32_t *index, struct GPARange **r)
+{
+    struct GPARange *range;
+
+    while (i < nb_iova_ranges) {
+        range = &usable_iova_ranges[i];
+        if (range->end >= base) {
+            break;
+        }
+        i++;
+    }
+
+    *index = i;
+    /* index is out of bounds or no region left to process */
+    if (i >= nb_iova_ranges || !size) {
+        return;
+    }
+
+    *start = MAX(range->start, base);
+    *region_size = MIN(range->end - *start + 1, size);
+    *r = range;
+}
+
+/*
+ * for_each_usable_range() - iterates over usable IOVA regions
+ *
+ * @i:      range index within usable_iova_ranges
+ * @base:   requested address we want to use
+ * @size:   total size of the region with @base
+ * @r:      range indexed by @i within usable_iova_ranges
+ * @s:      calculated usable iova address base
+ * @i_size: calculated usable iova region size starting @s
+ *
+ * Use this iterator to walk through usable GPA ranges. Platforms
+ * such as AMD with IOMMU capable hardware restrict certain address
+ * ranges for Hyper Transport. This iterator helper lets user avoid
+ * using those said reserved ranges.
+ */
+#define for_each_usable_range(i, base, size, r, s, i_size) \
+    for (s = 0, i_size = 0, r = NULL, \
+         next_iova_range_index(0, base, size, &s, &i_size, &i, &r); \
+         i < nb_iova_ranges && size != 0; \
+         size -= i_size, \
+         next_iova_range_index(i + 1, base, size, &s, &i_size, &i, &r))
 
 void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,
                             MemoryRegion *pci_address_space);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 1e11071d817b..f8f15a4523c6 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -869,6 +869,9 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
 #define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
                          (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
                          (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
+#define IS_AMD_VENDOR(vendor) ((vendor[0]) == CPUID_VENDOR_AMD_1 && \
+                         (vendor[1]) == CPUID_VENDOR_AMD_2 && \
+                         (vendor[2]) == CPUID_VENDOR_AMD_3)
 
 #define CPUID_MWAIT_IBE     (1U << 1) /* Interrupts can exit capability */
 #define CPUID_MWAIT_EMX     (1U << 0) /* enumeration supported */
-- 
2.17.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]