qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v3 2/2] intel-iommu: extend VTD emulation to all


From: Michael S. Tsirkin
Subject: Re: [Qemu-devel] [PATCH v3 2/2] intel-iommu: extend VTD emulation to allow 57-bit IOVA address width.
Date: Tue, 18 Dec 2018 09:49:02 -0500

On Tue, Dec 18, 2018 at 09:45:41PM +0800, Yu Zhang wrote:
> On Tue, Dec 18, 2018 at 07:43:28AM -0500, Michael S. Tsirkin wrote:
> > On Tue, Dec 18, 2018 at 06:01:16PM +0800, Yu Zhang wrote:
> > > On Tue, Dec 18, 2018 at 05:47:14PM +0800, Yu Zhang wrote:
> > > > On Mon, Dec 17, 2018 at 02:29:02PM +0100, Igor Mammedov wrote:
> > > > > On Wed, 12 Dec 2018 21:05:39 +0800
> > > > > Yu Zhang <address@hidden> wrote:
> > > > > 
> > > > > > A 5-level paging capable VM may choose to use 57-bit IOVA address 
> > > > > > width.
> > > > > > E.g. guest applications may prefer to use its VA as IOVA when 
> > > > > > performing
> > > > > > VFIO map/unmap operations, to avoid the burden of managing the IOVA 
> > > > > > space.
> > > > > > 
> > > > > > This patch extends the current vIOMMU logic to cover the extended 
> > > > > > address
> > > > > > width. When creating a VM with 5-level paging feature, one can 
> > > > > > choose to
> > > > > > create a virtual VTD with 5-level paging capability, with 
> > > > > > configurations
> > > > > > like "-device intel-iommu,x-aw-bits=57".
> > > > > > 
> > > > > > Signed-off-by: Yu Zhang <address@hidden>
> > > > > > Reviewed-by: Peter Xu <address@hidden>
> > > > > > ---
> > > > > > Cc: "Michael S. Tsirkin" <address@hidden>
> > > > > > Cc: Marcel Apfelbaum <address@hidden>
> > > > > > Cc: Paolo Bonzini <address@hidden>
> > > > > > Cc: Richard Henderson <address@hidden>
> > > > > > Cc: Eduardo Habkost <address@hidden>
> > > > > > Cc: Peter Xu <address@hidden>
> > > > > > ---
> > > > > >  hw/i386/intel_iommu.c          | 53 
> > > > > > ++++++++++++++++++++++++++++++++----------
> > > > > >  hw/i386/intel_iommu_internal.h | 10 ++++++--
> > > > > >  include/hw/i386/intel_iommu.h  |  1 +
> > > > > >  3 files changed, 50 insertions(+), 14 deletions(-)
> > > > > > 
> > > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > > > index 0e88c63..871110c 100644
> > > > > > --- a/hw/i386/intel_iommu.c
> > > > > > +++ b/hw/i386/intel_iommu.c
> > > > > > @@ -664,16 +664,16 @@ static inline bool 
> > > > > > vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
> > > > > >  
> > > > > >  /*
> > > > > >   * Rsvd field masks for spte:
> > > > > > - *     Index [1] to [4] 4k pages
> > > > > > - *     Index [5] to [8] large pages
> > > > > > + *     Index [1] to [5] 4k pages
> > > > > > + *     Index [6] to [10] large pages
> > > > > >   */
> > > > > > -static uint64_t vtd_paging_entry_rsvd_field[9];
> > > > > > +static uint64_t vtd_paging_entry_rsvd_field[11];
> > > > > >  
> > > > > >  static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
> > > > > >  {
> > > > > >      if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) {
> > > > > >          /* Maybe large page */
> > > > > > -        return slpte & vtd_paging_entry_rsvd_field[level + 4];
> > > > > > +        return slpte & vtd_paging_entry_rsvd_field[level + 5];
> > > > > >      } else {
> > > > > >          return slpte & vtd_paging_entry_rsvd_field[level];
> > > > > >      }
> > > > > > @@ -3127,6 +3127,8 @@ static void vtd_init(IntelIOMMUState *s)
> > > > > >               VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
> > > > > >      if (s->aw_bits == VTD_AW_48BIT) {
> > > > > >          s->cap |= VTD_CAP_SAGAW_48bit;
> > > > > > +    } else if (s->aw_bits == VTD_AW_57BIT) {
> > > > > > +        s->cap |= VTD_CAP_SAGAW_57bit | VTD_CAP_SAGAW_48bit;
> > > > > >      }
> > > > > >      s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
> > > > > >      s->haw_bits = cpu->phys_bits;
> > > > > > @@ -3139,10 +3141,12 @@ static void vtd_init(IntelIOMMUState *s)
> > > > > >      vtd_paging_entry_rsvd_field[2] = 
> > > > > > VTD_SPTE_PAGE_L2_RSVD_MASK(s->haw_bits);
> > > > > >      vtd_paging_entry_rsvd_field[3] = 
> > > > > > VTD_SPTE_PAGE_L3_RSVD_MASK(s->haw_bits);
> > > > > >      vtd_paging_entry_rsvd_field[4] = 
> > > > > > VTD_SPTE_PAGE_L4_RSVD_MASK(s->haw_bits);
> > > > > > -    vtd_paging_entry_rsvd_field[5] = 
> > > > > > VTD_SPTE_LPAGE_L1_RSVD_MASK(s->haw_bits);
> > > > > > -    vtd_paging_entry_rsvd_field[6] = 
> > > > > > VTD_SPTE_LPAGE_L2_RSVD_MASK(s->haw_bits);
> > > > > > -    vtd_paging_entry_rsvd_field[7] = 
> > > > > > VTD_SPTE_LPAGE_L3_RSVD_MASK(s->haw_bits);
> > > > > > -    vtd_paging_entry_rsvd_field[8] = 
> > > > > > VTD_SPTE_LPAGE_L4_RSVD_MASK(s->haw_bits);
> > > > > > +    vtd_paging_entry_rsvd_field[5] = 
> > > > > > VTD_SPTE_PAGE_L5_RSVD_MASK(s->haw_bits);
> > > > > > +    vtd_paging_entry_rsvd_field[6] = 
> > > > > > VTD_SPTE_LPAGE_L1_RSVD_MASK(s->haw_bits);
> > > > > > +    vtd_paging_entry_rsvd_field[7] = 
> > > > > > VTD_SPTE_LPAGE_L2_RSVD_MASK(s->haw_bits);
> > > > > > +    vtd_paging_entry_rsvd_field[8] = 
> > > > > > VTD_SPTE_LPAGE_L3_RSVD_MASK(s->haw_bits);
> > > > > > +    vtd_paging_entry_rsvd_field[9] = 
> > > > > > VTD_SPTE_LPAGE_L4_RSVD_MASK(s->haw_bits);
> > > > > > +    vtd_paging_entry_rsvd_field[10] = 
> > > > > > VTD_SPTE_LPAGE_L5_RSVD_MASK(s->haw_bits);
> > > > > >  
> > > > > >      if (x86_iommu->intr_supported) {
> > > > > >          s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
> > > > > > @@ -3241,6 +3245,23 @@ static AddressSpace 
> > > > > > *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
> > > > > >      return &vtd_as->as;
> > > > > >  }
> > > > > >  
> > > > > > +static bool host_has_la57(void)
> > > > > > +{
> > > > > > +    uint32_t ecx, unused;
> > > > > > +
> > > > > > +    host_cpuid(7, 0, &unused, &unused, &ecx, &unused);
> > > > > > +    return ecx & CPUID_7_0_ECX_LA57;
> > > > > > +}
> > > > > > +
> > > > > > +static bool guest_has_la57(void)
> > > > > > +{
> > > > > > +    CPUState *cs = first_cpu;
> > > > > > +    X86CPU *cpu = X86_CPU(cs);
> > > > > > +    CPUX86State *env = &cpu->env;
> > > > > > +
> > > > > > +    return env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_LA57;
> > > > > > +}
> > > > > another direct access to CPU fields,
> > > > > I'd suggest to set this value when iommu is created
> > > > > i.e. add 'la57' property and set from iommu owner.
> > > > > 
> > > > 
> > > > Sorry, do you mean "-device intel-iommu,la57"? I think we do not need
> > > > that, because a 5-level capable vIOMMU can be created with properties
> > > > like "-device intel-iommu,x-aw-bits=57". 
> > > > 
> > > > The guest CPU fields are checked to make sure the VM has LA57 CPU 
> > > > feature,
> > > > because I believe there shall be no 5-level IOMMU on platforms without 
> > > > LA57
> > > > CPUs. 
> > 
> > I don't necessarily see why these need to be connected.
> > If yes pls add code to explain.
> 
> Sorry, do you mean the VM shall be able to see a 5-level IOMMU even it does 
> not
> have LA57 feature? I do not see any direct connection when asked to enable a 
> 5-level
> vIOMMU at first, but I was told(and checked) that DPDK in the VM may choose a 
> VA
> value as an IOVA.

Right but then that doesn't work on all hosts either.

> And if guest has LA57, we should create a 5-level vIOMMU to the VM.
> But if the VM even does not have LA57, any specific reason we should give it 
> a 5-level
> vIOMMU?

So the example you give is VTD address width < CPU aw. That is known
to be problematic for dpdk but not for other software and maybe dpdk
will learns how to cope. Given such hosts exist it might be
useful to support this at least for debugging.

Are there reasons to worry about VTD > CPU?


> > 
> > 
> > > > > >  static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
> > > > > >  {
> > > > > >      X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
> > > > > > @@ -3267,11 +3288,19 @@ static bool 
> > > > > > vtd_decide_config(IntelIOMMUState *s, Error **errp)
> > > > > >          }
> > > > > >      }
> > > > > >  
> > > > > > -    /* Currently only address widths supported are 39 and 48 bits 
> > > > > > */
> > > > > > +    /* Currently address widths supported are 39, 48, and 57 bits 
> > > > > > */
> > > > > >      if ((s->aw_bits != VTD_AW_39BIT) &&
> > > > > > -        (s->aw_bits != VTD_AW_48BIT)) {
> > > > > > -        error_setg(errp, "Supported values for x-aw-bits are: %d, 
> > > > > > %d",
> > > > > > -                   VTD_AW_39BIT, VTD_AW_48BIT);
> > > > > > +        (s->aw_bits != VTD_AW_48BIT) &&
> > > > > > +        (s->aw_bits != VTD_AW_57BIT)) {
> > > > > > +        error_setg(errp, "Supported values for x-aw-bits are: %d, 
> > > > > > %d, %d",
> > > > > > +                   VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > > > > +        return false;
> > > > > > +    }
> > > > > > +
> > > > > > +    if ((s->aw_bits == VTD_AW_57BIT) &&
> > > > > > +        !(host_has_la57() && guest_has_la57())) {
> > > > > Does iommu supposed to work in TCG mode?
> > > > > If yes then why it should care about host_has_la57()?
> > > > > 
> > > > 
> > > > Hmm... I did not take TCG mode into consideration. And host_has_la57() 
> > > > is
> > > > used to guarantee the host have la57 feature so that iommu shadowing 
> > > > works
> > > > for device assignment.
> > > > 
> > > > I guess iommu shall work in TCG mode(though I am not quite sure about 
> > > > this).
> > > > But I do not have any usage case of a 5-level vIOMMU in TCG in mind. So 
> > > > maybe
> > > > we can:
> > > > 1> check the 'ms->accel' in vtd_decide_config() and do not care about 
> > > > host
> > > > capability if it is TCG.
> > > 
> > > For choice 1, kvm_enabled() might be used instead of ms->accel. Thanks 
> > > Peter
> > > for the remind. :)
> > 
> > 
> > This needs a big comment with an explanation though.
> > And probably a TODO to make it work under TCG ...
> > 
> 
> Thanks, Michael. For choice 1, I believe it should work for TCG(will need test
> though), and the condition would be sth. like:
> 
>     if ((s->aw_bits == VTD_AW_57BIT) &&
>         kvm_enabled() &&
>         !host_has_la57())  {
> 
> As you can see, though I remove the check of guest_has_la57(), I still kept 
> the
> check against host when KVM is enabled. I'm still ready to be convinced for 
> any
> requirement why we do not need the guest check. :) 


okay but then (repeating myself, sorry) pls add a comment that explains
what happens if you do not add this limitation.


> > > > 2> Or, we can choose to keep as it is, and add the check when 5-level 
> > > > paging
> > > > vIOMMU does have usage in TCG?
> > > > 
> > > > But as to the check of guest capability, I still believe it is 
> > > > necessary. As
> > > > said, a VM without LA57 feature shall not see a VT-d with 5-level IOMMU.
> > > > 
> > > > > > +        error_setg(errp, "Do not support 57-bit DMA address, 
> > > > > > unless both "
> > > > > > +                         "host and guest are capable of 5-level 
> > > > > > paging");
> > > > > >          return false;
> > > > > >      }
> > > > > >  
> > > > > > diff --git a/hw/i386/intel_iommu_internal.h 
> > > > > > b/hw/i386/intel_iommu_internal.h
> > > > > > index d084099..2b29b6f 100644
> > > > > > --- a/hw/i386/intel_iommu_internal.h
> > > > > > +++ b/hw/i386/intel_iommu_internal.h
> > > > > > @@ -114,8 +114,8 @@
> > > > > >                                       VTD_INTERRUPT_ADDR_FIRST + 1)
> > > > > >  
> > > > > >  /* The shift of source_id in the key of IOTLB hash table */
> > > > > > -#define VTD_IOTLB_SID_SHIFT         36
> > > > > > -#define VTD_IOTLB_LVL_SHIFT         52
> > > > > > +#define VTD_IOTLB_SID_SHIFT         45
> > > > > > +#define VTD_IOTLB_LVL_SHIFT         61
> > > > > >  #define VTD_IOTLB_MAX_SIZE          1024    /* Max size of the 
> > > > > > hash table */
> > > > > >  
> > > > > >  /* IOTLB_REG */
> > > > > > @@ -212,6 +212,8 @@
> > > > > >  #define VTD_CAP_SAGAW_39bit         (0x2ULL << VTD_CAP_SAGAW_SHIFT)
> > > > > >   /* 48-bit AGAW, 4-level page-table */
> > > > > >  #define VTD_CAP_SAGAW_48bit         (0x4ULL << VTD_CAP_SAGAW_SHIFT)
> > > > > > + /* 57-bit AGAW, 5-level page-table */
> > > > > > +#define VTD_CAP_SAGAW_57bit         (0x8ULL << VTD_CAP_SAGAW_SHIFT)
> > > > > >  
> > > > > >  /* IQT_REG */
> > > > > >  #define VTD_IQT_QT(val)             (((val) >> 4) & 0x7fffULL)
> > > > > > @@ -379,6 +381,8 @@ typedef union VTDInvDesc VTDInvDesc;
> > > > > >          (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > >  #define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \
> > > > > >          (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > > +#define VTD_SPTE_PAGE_L5_RSVD_MASK(aw) \
> > > > > > +        (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > >  #define VTD_SPTE_LPAGE_L1_RSVD_MASK(aw) \
> > > > > >          (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > >  #define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw) \
> > > > > > @@ -387,6 +391,8 @@ typedef union VTDInvDesc VTDInvDesc;
> > > > > >          (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > >  #define VTD_SPTE_LPAGE_L4_RSVD_MASK(aw) \
> > > > > >          (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > > +#define VTD_SPTE_LPAGE_L5_RSVD_MASK(aw) \
> > > > > > +        (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
> > > > > >  
> > > > > >  /* Information about page-selective IOTLB invalidate */
> > > > > >  struct VTDIOTLBPageInvInfo {
> > > > > > diff --git a/include/hw/i386/intel_iommu.h 
> > > > > > b/include/hw/i386/intel_iommu.h
> > > > > > index 820451c..7474c4f 100644
> > > > > > --- a/include/hw/i386/intel_iommu.h
> > > > > > +++ b/include/hw/i386/intel_iommu.h
> > > > > > @@ -49,6 +49,7 @@
> > > > > >  #define DMAR_REG_SIZE               0x230
> > > > > >  #define VTD_AW_39BIT                39
> > > > > >  #define VTD_AW_48BIT                48
> > > > > > +#define VTD_AW_57BIT                57
> > > > > >  #define VTD_ADDRESS_WIDTH           VTD_AW_39BIT
> > > > > >  #define VTD_HAW_MASK(aw)            ((1ULL << (aw)) - 1)
> > > > > >  
> > > > > 
> > > > > 
> > > > 
> > > > B.R.
> > > > Yu
> > > > 
> 
> B.R.
> Yu



reply via email to

[Prev in Thread] Current Thread [Next in Thread]