qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v4 9/9] intel-iommu: rework the page walk logic


From: Michael S. Tsirkin
Subject: Re: [Qemu-devel] [PATCH v4 9/9] intel-iommu: rework the page walk logic
Date: Wed, 23 May 2018 17:33:34 +0300

On Fri, May 18, 2018 at 03:25:17PM +0800, Peter Xu wrote:
> SECURITY IMPLICATION: this patch will fix a potential small window during
> which the DMA page table might be incomplete or invalid when the guest sends
> domain/context invalidations to a device.  It can cause random DMA
> errors for assigned devices.

So this is more of a correctness issue IMO. I don't see how
e.g. an application within the guest can cause any mischief
with this; it will just get a non-working device.

> 
> This is a major change to the VT-d shadow page walking logic. It
> includes but not limited to:
> 
> - For each VTDAddressSpace, now we maintain what IOVA ranges we have
>   mapped and what we have not.  With that information, now we only send
>   MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we
>   know we have already mapped the range, meanwhile we don't send UNMAP
>   notifies if we know we never mapped the range at all.
> 
> - Introduced vtd_sync_shadow_page_table[_range] APIs so that we can call
>   in any places to resync the shadow page table for a device.
> 
> - When we receive domain/context invalidation, we should not really run
>   the replay logic, instead we use the new sync shadow page table API to
>   resync the whole shadow page table without unmapping the whole
>   region.  After this change, we'll only do the page walk once for each
>   domain invalidations (before this, it can be multiple, depending on
>   number of notifiers per address space).
> 
> While at it, the page walking logic is also refactored to be simpler.
> 
> CC: QEMU Stable <address@hidden>
> Reported-by: Jintack Lim <address@hidden>
> Tested-by: Jintack Lim <address@hidden>
> Signed-off-by: Peter Xu <address@hidden>
> ---
>  include/hw/i386/intel_iommu.h |   2 +
>  hw/i386/intel_iommu.c         | 213 +++++++++++++++++++++++++---------
>  hw/i386/trace-events          |   3 +-
>  3 files changed, 159 insertions(+), 59 deletions(-)
> 
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 156f35e919..fbfedcb1c0 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -27,6 +27,7 @@
>  #include "hw/i386/ioapic.h"
>  #include "hw/pci/msi.h"
>  #include "hw/sysbus.h"
> +#include "qemu/iova-tree.h"
>  
>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>  #define INTEL_IOMMU_DEVICE(obj) \
> @@ -95,6 +96,7 @@ struct VTDAddressSpace {
>      QLIST_ENTRY(VTDAddressSpace) next;
>      /* Superset of notifier flags that this address space has */
>      IOMMUNotifierFlag notifier_flags;
> +    IOVATree *iova_tree;          /* Traces mapped IOVA ranges */
>  };
>  
>  struct VTDBus {
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 61bb3d31e7..b5a09b7908 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -769,10 +769,77 @@ typedef struct {
>  
>  static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
>  {
> +    VTDAddressSpace *as = info->as;
>      vtd_page_walk_hook hook_fn = info->hook_fn;
>      void *private = info->private;
> +    DMAMap target = {
> +        .iova = entry->iova,
> +        .size = entry->addr_mask,
> +        .translated_addr = entry->translated_addr,
> +        .perm = entry->perm,
> +    };
> +    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
> +
> +    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
> +        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
> +        return 0;
> +    }
>  
>      assert(hook_fn);
> +
> +    /* Update local IOVA mapped ranges */
> +    if (entry->perm) {
> +        if (mapped) {
> +            /* If it's exactly the same translation, skip */
> +            if (!memcmp(mapped, &target, sizeof(target))) {
> +                trace_vtd_page_walk_one_skip_map(entry->iova, 
> entry->addr_mask,
> +                                                 entry->translated_addr);
> +                return 0;
> +            } else {
> +                /*
> +                 * Translation changed.  Normally this should not
> +                 * happen, but it can happen when with buggy guest
> +                 * OSes.  Note that there will be a small window that
> +                 * we don't have map at all.  But that's the best
> +                 * effort we can do.  The ideal way to emulate this is
> +                 * atomically modify the PTE to follow what has
> +                 * changed, but we can't.  One example is that vfio
> +                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
> +                 * interface to modify a mapping (meanwhile it seems
> +                 * meaningless to even provide one).  Anyway, let's
> +                 * mark this as a TODO in case one day we'll have
> +                 * a better solution.
> +                 */
> +                IOMMUAccessFlags cache_perm = entry->perm;
> +                int ret;
> +
> +                /* Emulate an UNMAP */
> +                entry->perm = IOMMU_NONE;
> +                trace_vtd_page_walk_one(info->domain_id,
> +                                        entry->iova,
> +                                        entry->translated_addr,
> +                                        entry->addr_mask,
> +                                        entry->perm);
> +                ret = hook_fn(entry, private);
> +                if (ret) {
> +                    return ret;
> +                }
> +                /* Drop any existing mapping */
> +                iova_tree_remove(as->iova_tree, &target);
> +                /* Recover the correct permission */
> +                entry->perm = cache_perm;
> +            }
> +        }
> +        iova_tree_insert(as->iova_tree, &target);
> +    } else {
> +        if (!mapped) {
> +            /* Skip since we didn't map this range at all */
> +            trace_vtd_page_walk_one_skip_unmap(entry->iova, 
> entry->addr_mask);
> +            return 0;
> +        }
> +        iova_tree_remove(as->iova_tree, &target);
> +    }
> +
>      trace_vtd_page_walk_one(info->domain_id, entry->iova,
>                              entry->translated_addr, entry->addr_mask,
>                              entry->perm);
> @@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, 
> uint64_t start,
>           */
>          entry_valid = read_cur | write_cur;
>  
> -        entry.target_as = &address_space_memory;
> -        entry.iova = iova & subpage_mask;
> -        entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
> -        entry.addr_mask = ~subpage_mask;
> -
> -        if (vtd_is_last_slpte(slpte, level)) {
> -            /* NOTE: this is only meaningful if entry_valid == true */
> -            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
> -            if (!entry_valid && !info->notify_unmap) {
> -                trace_vtd_page_walk_skip_perm(iova, iova_next);
> -                goto next;
> -            }
> -            ret = vtd_page_walk_one(&entry, info);
> -            if (ret < 0) {
> -                return ret;
> -            }
> -        } else {
> -            if (!entry_valid) {
> -                if (info->notify_unmap) {
> -                    /*
> -                     * The whole entry is invalid; unmap it all.
> -                     * Translated address is meaningless, zero it.
> -                     */
> -                    entry.translated_addr = 0x0;
> -                    ret = vtd_page_walk_one(&entry, info);
> -                    if (ret < 0) {
> -                        return ret;
> -                    }
> -                } else {
> -                    trace_vtd_page_walk_skip_perm(iova, iova_next);
> -                }
> -                goto next;
> -            }
> +        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
> +            /*
> +             * This is a valid PDE (or even bigger than PDE).  We need
> +             * to walk one further level.
> +             */
>              ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
>                                        iova, MIN(iova_next, end), level - 1,
>                                        read_cur, write_cur, info);
> -            if (ret < 0) {
> -                return ret;
> -            }
> +        } else {
> +            /*
> +             * This means we are either:
> +             *
> +             * (1) the real page entry (either 4K page, or huge page)
> +             * (2) the whole range is invalid
> +             *
> +             * In either case, we send an IOTLB notification down.
> +             */
> +            entry.target_as = &address_space_memory;
> +            entry.iova = iova & subpage_mask;
> +            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
> +            entry.addr_mask = ~subpage_mask;
> +            /* NOTE: this is only meaningful if entry_valid == true */
> +            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
> +            ret = vtd_page_walk_one(&entry, info);
> +        }
> +
> +        if (ret < 0) {
> +            return ret;
>          }
>  
>  next:
> @@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, 
> uint8_t bus_num,
>      return 0;
>  }
>  
> +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
> +                                     void *private)
> +{
> +    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
> +    return 0;
> +}
> +
> +/* If context entry is NULL, we'll try to fetch it on our own. */
> +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
> +                                            VTDContextEntry *ce,
> +                                            hwaddr addr, hwaddr size)
> +{
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    vtd_page_walk_info info = {
> +        .hook_fn = vtd_sync_shadow_page_hook,
> +        .private = (void *)&vtd_as->iommu,
> +        .notify_unmap = true,
> +        .aw = s->aw_bits,
> +        .as = vtd_as,
> +    };
> +    VTDContextEntry ce_cache;
> +    int ret;
> +
> +    if (ce) {
> +        /* If the caller provided context entry, use it */
> +        ce_cache = *ce;
> +    } else {
> +        /* If the caller didn't provide ce, try to fetch */
> +        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
> +                                       vtd_as->devfn, &ce_cache);
> +        if (ret) {
> +            /*
> +             * This should not really happen, but in case it happens,
> +             * we just skip the sync for this time.  After all we even
> +             * don't have the root table pointer!
> +             */
> +            trace_vtd_err("Detected invalid context entry when "
> +                          "trying to sync shadow page table");
> +            return 0;
> +        }
> +    }
> +
> +    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
> +
> +    return vtd_page_walk(&ce_cache, addr, addr + size, &info);
> +}
> +
> +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
> +{
> +    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
> +}
> +
>  /*
>   * Fetch translation type for specific device. Returns <0 if error
>   * happens, otherwise return the shifted type to check against
> @@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
>      VTDAddressSpace *vtd_as;
>  
>      QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
> -        memory_region_iommu_replay_all(&vtd_as->iommu);
> +        vtd_sync_shadow_page_table(vtd_as);
>      }
>  }
>  
> @@ -1371,14 +1479,13 @@ static void 
> vtd_context_device_invalidate(IntelIOMMUState *s,
>                  vtd_switch_address_space(vtd_as);
>                  /*
>                   * So a device is moving out of (or moving into) a
> -                 * domain, a replay() suites here to notify all the
> -                 * IOMMU_NOTIFIER_MAP registers about this change.
> +                 * domain, resync the shadow page table.
>                   * This won't bring bad even if we have no such
>                   * notifier registered - the IOMMU notification
>                   * framework will skip MAP notifications if that
>                   * happened.
>                   */
> -                memory_region_iommu_replay_all(&vtd_as->iommu);
> +                vtd_sync_shadow_page_table(vtd_as);
>              }
>          }
>      }
> @@ -1436,18 +1543,11 @@ static void 
> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>          if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
>                                        vtd_as->devfn, &ce) &&
>              domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
> -            memory_region_iommu_replay_all(&vtd_as->iommu);
> +            vtd_sync_shadow_page_table(vtd_as);
>          }
>      }
>  }
>  
> -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
> -                                           void *private)
> -{
> -    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
> -    return 0;
> -}
> -
>  static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
>                                             uint16_t domain_id, hwaddr addr,
>                                             uint8_t am)
> @@ -1462,21 +1562,12 @@ static void 
> vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
>                                         vtd_as->devfn, &ce);
>          if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
>              if (vtd_as_has_map_notifier(vtd_as)) {
> -                vtd_page_walk_info info = {
> -                    .hook_fn = vtd_page_invalidate_notify_hook,
> -                    .private = (void *)&vtd_as->iommu,
> -                    .notify_unmap = true,
> -                    .aw = s->aw_bits,
> -                    .as = vtd_as,
> -                    .domain_id = domain_id,
> -                };
> -
>                  /*
>                   * As long as we have MAP notifications registered in
>                   * any of our IOMMU notifiers, we need to sync the
>                   * shadow page table.
>                   */
> -                vtd_page_walk(&ce, addr, addr + size, &info);
> +                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
>              } else {
>                  /*
>                   * For UNMAP-only notifiers, we don't need to walk the
> @@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
> PCIBus *bus, int devfn)
>          vtd_dev_as->devfn = (uint8_t)devfn;
>          vtd_dev_as->iommu_state = s;
>          vtd_dev_as->context_cache_entry.context_cache_gen = 0;
> +        vtd_dev_as->iova_tree = iova_tree_new();
>  
>          /*
>           * Memory region relationships looks like (Address range shows
> @@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace 
> *as, IOMMUNotifier *n)
>      hwaddr start = n->start;
>      hwaddr end = n->end;
>      IntelIOMMUState *s = as->iommu_state;
> +    DMAMap map;
>  
>      /*
>       * Note: all the codes in this function has a assumption that IOVA
> @@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace 
> *as, IOMMUNotifier *n)
>                               VTD_PCI_FUNC(as->devfn),
>                               entry.iova, size);
>  
> +    map.iova = entry.iova;
> +    map.size = entry.addr_mask;
> +    iova_tree_remove(as->iova_tree, &map);
> +
>      memory_region_notify_one(n, &entry);
>  }
>  
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index ca23ba9fad..e14d06ec83 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, 
> uint16_t domain, uint6
>  vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid 
> context device %02"PRIx8":%02"PRIx8".%02"PRIx8
>  vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t 
> end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 
> 0x%"PRIx64
>  vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t 
> mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 
> 0x%"PRIx64" perm %d"
> +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t 
> translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
> +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" 
> mask 0x%"PRIx64
>  vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 
> 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
> -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 
> 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
>  vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip 
> iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
>  vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) 
> "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
>  vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, 
> uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
> -- 
> 2.17.0



reply via email to

[Prev in Thread] Current Thread [Next in Thread]