
[RFC,v3,14/25] intel_iommu: add virtual command capability support

Message ID 1580300216-86172-15-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Series intel_iommu: expose Shared Virtual Addressing to VMs

Commit Message

Yi Liu Jan. 29, 2020, 12:16 p.m. UTC
From: Liu Yi L <yi.l.liu@intel.com>

This patch adds virtual command support to the Intel vIOMMU per the
Intel VT-d 3.1 spec, and adds two virtual commands: allocate PASID
and free PASID.

Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Yi Sun <yi.y.sun@linux.intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
---
 hw/i386/intel_iommu.c          | 163 ++++++++++++++++++++++++++++++++++++++++-
 hw/i386/intel_iommu_internal.h |  38 ++++++++++
 hw/i386/trace-events           |   1 +
 include/hw/i386/intel_iommu.h  |   6 +-
 4 files changed, 206 insertions(+), 2 deletions(-)
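
For reference, below is a minimal guest-side sketch of the virtual command
protocol this patch emulates: write a command to VCMD, wait for the IP bit
in VCRSP to clear, then read the status code and result. The register
offsets and bit layouts follow the patch; the vcmd_mmio_read64() and
vcmd_mmio_write64() helpers are assumed for illustration only and are not
part of the patch or of QEMU.

#include <stdint.h>

#define DMAR_VCMD_REG          0xE10
#define DMAR_VCRSP_REG         0xE20
#define VCMD_ALLOC_PASID       1
#define VCMD_FREE_PASID        2
#define VCRSP_IP(v)            ((v) & 0x1)            /* In Progress */
#define VCRSP_SC(v)            (((v) >> 1) & 0x3)     /* Status Code */
#define VCRSP_RESULT(v)        (((v) >> 8) & 0xfffff) /* allocated PASID */

/* Assumed MMIO helpers for accessing the remapping registers. */
extern uint64_t vcmd_mmio_read64(uint64_t off);
extern void vcmd_mmio_write64(uint64_t off, uint64_t val);

static int vcmd_alloc_pasid(uint32_t *pasid)
{
    uint64_t rsp;

    vcmd_mmio_write64(DMAR_VCMD_REG, VCMD_ALLOC_PASID);
    do {
        rsp = vcmd_mmio_read64(DMAR_VCRSP_REG);
    } while (VCRSP_IP(rsp));              /* wait until IP clears */

    if (VCRSP_SC(rsp)) {
        return -1;                        /* e.g. no PASID available */
    }
    *pasid = VCRSP_RESULT(rsp);
    return 0;
}

static int vcmd_free_pasid(uint32_t pasid)
{
    uint64_t rsp;

    vcmd_mmio_write64(DMAR_VCMD_REG,
                      ((uint64_t)pasid << 8) | VCMD_FREE_PASID);
    do {
        rsp = vcmd_mmio_read64(DMAR_VCRSP_REG);
    } while (VCRSP_IP(rsp));

    return VCRSP_SC(rsp) ? -1 : 0;        /* e.g. invalid PASID */
}

In the emulation added by this patch, the whole command completes
synchronously inside the trapped register write (IP is set and cleared
within vtd_handle_vcmd_write()), so the poll above normally sees IP
already clear.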

Comments

Peter Xu Feb. 11, 2020, 8:16 p.m. UTC | #1
On Wed, Jan 29, 2020 at 04:16:45AM -0800, Liu, Yi L wrote:
> From: Liu Yi L <yi.l.liu@intel.com>
> 
> This patch adds virtual command support to Intel vIOMMU per
> Intel VT-d 3.1 spec. And adds two virtual commands: allocate
> pasid and free pasid.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Yi Sun <yi.y.sun@linux.intel.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Richard Henderson <rth@twiddle.net>
> Cc: Eduardo Habkost <ehabkost@redhat.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> ---
>  hw/i386/intel_iommu.c          | 163 ++++++++++++++++++++++++++++++++++++++++-
>  hw/i386/intel_iommu_internal.h |  38 ++++++++++
>  hw/i386/trace-events           |   1 +
>  include/hw/i386/intel_iommu.h  |   6 +-
>  4 files changed, 206 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 33be40c..43a728f 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -2649,6 +2649,142 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
>      }
>  }
>  
> +static int vtd_request_pasid_alloc(IntelIOMMUState *s, uint32_t *pasid)
> +{
> +    VTDBus *vtd_bus;
> +    int bus_n, devfn, ret = -errno;
> +    VTDIOMMUContext *vtd_icx;
> +
> +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> +        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
> +        if (!vtd_bus) {
> +            continue;
> +        }
> +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> +            vtd_icx = vtd_bus->dev_icx[devfn];
> +            if (!vtd_icx) {
> +                continue;
> +            }
> +
> +            /*
> +             * We'll return the first valid result we got. It's
> +             * a bit hackish in that we don't have a good global
> +             * interface yet to talk to modules like vfio to deliver
> +             * this allocation request, so we're leveraging this
> +             * per-device iommu object to do the same thing just
> +             * to make sure the allocation happens only once.
> +             */
> +            ret = ds_iommu_pasid_alloc(vtd_icx->dsi_obj,
> +                         VTD_MIN_HPASID, VTD_MAX_HPASID, pasid);

Your indents are always strange to me for long funcalls...  Not a
complaint though, as long as no one else complains. :)

> +            if (!ret) {
> +                break;
> +            }
> +        }
> +    }
> +    return ret;
> +}
> +
> +static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
> +{
> +    VTDBus *vtd_bus;
> +    int bus_n, devfn, ret = -errno;
> +    VTDIOMMUContext *vtd_icx;
> +
> +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> +        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
> +        if (!vtd_bus) {
> +            continue;
> +        }
> +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> +            vtd_icx = vtd_bus->dev_icx[devfn];
> +            if (!vtd_icx) {
> +                continue;
> +            }
> +            /*
> +             * Similar with pasid allocation. We'll free the pasid
> +             * on the first successful free operation. It's a bit
> +             * hackish in that we don't have a good global interface
> +             * yet to talk to modules like vfio to deliver this pasid
> +             * free request, so we're leveraging this per-device iommu
> +             * object to do the same thing just to make sure the
> +             * free happens only once.
> +             */
> +            ret = ds_iommu_pasid_free(vtd_icx->dsi_obj, pasid);
> +            if (!ret) {
> +                break;
> +            }
> +        }
> +    }
> +    return ret;
> +}
> +
> +/*
> + * If IP is not set, set it and return 0
> + * If IP is already set, return -1

Out of date?  Instead, this could mention that it also implicitly resets
the reply status code to zero, so by default it will return success.

Other than that:

Reviewed-by: Peter Xu <peterx@redhat.com>

> + */
> +static void vtd_vcmd_set_ip(IntelIOMMUState *s)
> +{
> +    s->vcrsp = 1;
> +    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                     ((uint64_t) s->vcrsp));
> +}
> +
> +static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
> +{
> +    s->vcrsp &= (~((uint64_t)(0x1)));
> +    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
> +                     ((uint64_t) s->vcrsp));
> +}
> +
> +/* Handle write to Virtual Command Register */
> +static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
> +{
> +    uint32_t pasid;
> +    int ret = -1;
> +
> +    trace_vtd_reg_write_vcmd(s->vcrsp, val);
> +
> +    if (!(s->vccap & VTD_VCCAP_PAS) ||
> +         (s->vcrsp & 1)) {
> +        return -1;
> +    }
> +
> +    /*
> +     * Since vCPU should be blocked when the guest VMCD
> +     * write was trapped to here. Should be no other vCPUs
> +     * try to access VCMD if guest software is well written.
> +     * However, we still emulate the IP bit here in case of
> +     * bad guest software. Also align with the spec.
> +     */
> +    vtd_vcmd_set_ip(s);
> +
> +    switch (val & VTD_VCMD_CMD_MASK) {
> +    case VTD_VCMD_ALLOC_PASID:
> +        ret = vtd_request_pasid_alloc(s, &pasid);
> +        if (ret) {
> +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
> +        } else {
> +            s->vcrsp |= VTD_VCRSP_RSLT(pasid);
> +        }
> +        break;
> +
> +    case VTD_VCMD_FREE_PASID:
> +        pasid = VTD_VCMD_PASID_VALUE(val);
> +        ret = vtd_request_pasid_free(s, pasid);
> +        if (ret < 0) {
> +            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
> +        }
> +        break;
> +
> +    default:
> +        s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
> +        error_report_once("Virtual Command: unsupported command!!!");
> +        break;
> +    }
> +    vtd_vcmd_clear_ip(s);
> +    return 0;
> +}
> +
>  static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
>  {
>      IntelIOMMUState *s = opaque;
> @@ -2938,6 +3074,23 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
>          vtd_set_long(s, addr, val);
>          break;
>  
> +    case DMAR_VCMD_REG:
> +        if (!vtd_handle_vcmd_write(s, val)) {
> +            if (size == 4) {
> +                vtd_set_long(s, addr, val);
> +            } else {
> +                vtd_set_quad(s, addr, val);
> +            }
> +        }
> +        break;
> +
> +    case DMAR_VCMD_REG_HI:
> +        assert(size == 4);
> +        if (!vtd_handle_vcmd_write(s, val)) {
> +            vtd_set_long(s, addr, val);
> +        }
> +        break;
> +
>      default:
>          if (size == 4) {
>              vtd_set_long(s, addr, val);
> @@ -3712,7 +3865,8 @@ static void vtd_init(IntelIOMMUState *s)
>          s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
>      } else if (s->scalable_mode && s->scalable_modern) {
>          s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
> -                   | VTD_ECAP_FLTS | VTD_ECAP_PSS;
> +                   | VTD_ECAP_FLTS | VTD_ECAP_PSS | VTD_ECAP_VCS;
> +        s->vccap |= VTD_VCCAP_PAS;
>      }
>  
>      vtd_reset_caches(s);
> @@ -3768,6 +3922,13 @@ static void vtd_init(IntelIOMMUState *s)
>       * Interrupt remapping registers.
>       */
>      vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
> +
> +    /*
> +     * Virtual Command Definitions
> +     */
> +    vtd_define_quad(s, DMAR_VCCAP_REG, s->vccap, 0, 0);
> +    vtd_define_quad(s, DMAR_VCMD_REG, 0, 0xffffffffffffffffULL, 0);
> +    vtd_define_quad(s, DMAR_VCRSP_REG, 0, 0, 0);
>  }
>  
>  /* Should not reset address_spaces when reset because devices will still use
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index c4dbb2c..fb5fdc2 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -85,6 +85,12 @@
>  #define DMAR_MTRRCAP_REG_HI     0x104
>  #define DMAR_MTRRDEF_REG        0x108 /* MTRR default type */
>  #define DMAR_MTRRDEF_REG_HI     0x10c
> +#define DMAR_VCCAP_REG          0xE00 /* Virtual Command Capability Register */
> +#define DMAR_VCCAP_REG_HI       0xE04
> +#define DMAR_VCMD_REG           0xE10 /* Virtual Command Register */
> +#define DMAR_VCMD_REG_HI        0xE14
> +#define DMAR_VCRSP_REG          0xE20 /* Virtual Command Reponse Register */
> +#define DMAR_VCRSP_REG_HI       0xE24
>  
>  /* IOTLB registers */
>  #define DMAR_IOTLB_REG_OFFSET   0xf0 /* Offset to the IOTLB registers */
> @@ -193,6 +199,7 @@
>  #define VTD_ECAP_PSS                (19ULL << 35)
>  #define VTD_ECAP_PASID              (1ULL << 40)
>  #define VTD_ECAP_SMTS               (1ULL << 43)
> +#define VTD_ECAP_VCS                (1ULL << 44)
>  #define VTD_ECAP_SLTS               (1ULL << 46)
>  #define VTD_ECAP_FLTS               (1ULL << 47)
>  
> @@ -315,6 +322,37 @@ typedef enum VTDFaultReason {
>  
>  #define VTD_CONTEXT_CACHE_GEN_MAX       0xffffffffUL
>  
> +/* VCCAP_REG */
> +#define VTD_VCCAP_PAS               (1UL << 0)
> +
> +/*
> + * The basic idea is to let hypervisor to set a range for available
> + * PASIDs for VMs. One of the reasons is PASID #0 is reserved by
> + * RID_PASID usage. We have no idea how many reserved PASIDs in future,
> + * so here just an evaluated value. Honestly, set it as "1" is enough
> + * at current stage.
> + */
> +#define VTD_MIN_HPASID              1
> +#define VTD_MAX_HPASID              0xFFFFF
> +
> +/* Virtual Command Register */
> +enum {
> +     VTD_VCMD_NULL_CMD = 0,
> +     VTD_VCMD_ALLOC_PASID = 1,
> +     VTD_VCMD_FREE_PASID = 2,
> +     VTD_VCMD_CMD_NUM,
> +};
> +
> +#define VTD_VCMD_CMD_MASK           0xffUL
> +#define VTD_VCMD_PASID_VALUE(val)   (((val) >> 8) & 0xfffff)
> +
> +#define VTD_VCRSP_RSLT(val)         ((val) << 8)
> +#define VTD_VCRSP_SC(val)           (((val) & 0x3) << 1)
> +
> +#define VTD_VCMD_UNDEFINED_CMD         1ULL
> +#define VTD_VCMD_NO_AVAILABLE_PASID    2ULL
> +#define VTD_VCMD_FREE_INVALID_PASID    2ULL
> +
>  /* Interrupt Entry Cache Invalidation Descriptor: VT-d 6.5.2.7. */
>  struct VTDInvDescIEC {
>      uint32_t type:4;            /* Should always be 0x4 */
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index e48bef2..71536a7 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -51,6 +51,7 @@ vtd_reg_write_gcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"
>  vtd_reg_write_fectl(uint32_t value) "value 0x%"PRIx32
>  vtd_reg_write_iectl(uint32_t value) "value 0x%"PRIx32
>  vtd_reg_ics_clear_ip(void) ""
> +vtd_reg_write_vcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"PRIx32
>  vtd_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova, uint64_t gpa, uint64_t mask) "dev %02x:%02x.%02x iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64
>  vtd_dmar_enable(bool en) "enable %d"
>  vtd_dmar_fault(uint16_t sid, int fault, uint64_t addr, bool is_write) "sid 0x%"PRIx16" fault %d addr 0x%"PRIx64" write %d"
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 1ef2917..4158116 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -46,7 +46,7 @@
>  #define VTD_SID_TO_BUS(sid)         (((sid) >> 8) & 0xff)
>  #define VTD_SID_TO_DEVFN(sid)       ((sid) & 0xff)
>  
> -#define DMAR_REG_SIZE               0x230
> +#define DMAR_REG_SIZE               0xF00
>  #define VTD_HOST_AW_39BIT           39
>  #define VTD_HOST_AW_48BIT           48
>  #define VTD_HOST_ADDRESS_WIDTH      VTD_HOST_AW_39BIT
> @@ -285,6 +285,10 @@ struct IntelIOMMUState {
>      uint8_t aw_bits;                /* Host/IOVA address width (in bits) */
>      bool dma_drain;                 /* Whether DMA r/w draining enabled */
>  
> +    /* Virtual Command Register */
> +    uint64_t vccap;                 /* The value of vcmd capability reg */
> +    uint64_t vcrsp;                 /* Current value of VCMD RSP REG */
> +
>      /*
>       * Protects IOMMU states in general.  Currently it protects the
>       * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace.
> -- 
> 2.7.4
>
Peter Xu Feb. 11, 2020, 9:56 p.m. UTC | #2
On Wed, Jan 29, 2020 at 04:16:45AM -0800, Liu, Yi L wrote:
> +/*
> + * The basic idea is to let hypervisor to set a range for available
> + * PASIDs for VMs. One of the reasons is PASID #0 is reserved by
> + * RID_PASID usage. We have no idea how many reserved PASIDs in future,
> + * so here just an evaluated value. Honestly, set it as "1" is enough
> + * at current stage.
> + */
> +#define VTD_MIN_HPASID              1
> +#define VTD_MAX_HPASID              0xFFFFF

One more question: I see that PASID is defined as 20 bits long.  It's
fine.  However, I start to get confused about how the Scalable Mode PASID
Directory could service that many PASID entries.

I'm looking at spec 3.4.3, Figure 3-8.

Firstly, we only have two levels for a PASID table.  The context entry
of a device stores a pointer to the "Scalable Mode PASID Directory"
page. I see that there are 2^14 entries in the "Scalable Mode PASID
Directory" page, each of which is a "Scalable Mode PASID Table".
However... how do we fit into a 4K page if each entry is an x86_64
pointer (8 bytes) while there are 2^14 entries?  Simple math gives me
4K/8 = 512, which means the "Scalable Mode PASID Directory" page can
only hold 512 entries, so where does the 2^14 come from?  Hmm??

Apart from this: I also just noticed (when reading the latter part of
the series) that the time a PASID table walk can consume will depend on
this value too.  I'd suggest making this as small as we can, as long as
it satisfies the usage.  We can even bump it in the future.
Yi Liu Feb. 12, 2020, 7:32 a.m. UTC | #3
> From: Peter Xu <peterx@redhat.com>
> Sent: Wednesday, February 12, 2020 4:16 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v3 14/25] intel_iommu: add virtual command capability support
> 
> On Wed, Jan 29, 2020 at 04:16:45AM -0800, Liu, Yi L wrote:
> > From: Liu Yi L <yi.l.liu@intel.com>
> >
> > This patch adds virtual command support to Intel vIOMMU per Intel VT-d
> > 3.1 spec. And adds two virtual commands: allocate pasid and free
> > pasid.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Peter Xu <peterx@redhat.com>
> > Cc: Yi Sun <yi.y.sun@linux.intel.com>
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Richard Henderson <rth@twiddle.net>
> > Cc: Eduardo Habkost <ehabkost@redhat.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
> > ---
> >  hw/i386/intel_iommu.c          | 163
> ++++++++++++++++++++++++++++++++++++++++-
> >  hw/i386/intel_iommu_internal.h |  38 ++++++++++
> >  hw/i386/trace-events           |   1 +
> >  include/hw/i386/intel_iommu.h  |   6 +-
> >  4 files changed, 206 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index
> > 33be40c..43a728f 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -2649,6 +2649,142 @@ static void
> vtd_handle_iectl_write(IntelIOMMUState *s)
> >      }
> >  }
> >
> > +static int vtd_request_pasid_alloc(IntelIOMMUState *s, uint32_t
> > +*pasid) {
> > +    VTDBus *vtd_bus;
> > +    int bus_n, devfn, ret = -errno;
> > +    VTDIOMMUContext *vtd_icx;
> > +
> > +    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
> > +        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
> > +        if (!vtd_bus) {
> > +            continue;
> > +        }
> > +        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
> > +            vtd_icx = vtd_bus->dev_icx[devfn];
> > +            if (!vtd_icx) {
> > +                continue;
> > +            }
> > +
> > +            /*
> > +             * We'll return the first valid result we got. It's
> > +             * a bit hackish in that we don't have a good global
> > +             * interface yet to talk to modules like vfio to deliver
> > +             * this allocation request, so we're leveraging this
> > +             * per-device iommu object to do the same thing just
> > +             * to make sure the allocation happens only once.
> > +             */
> > +            ret = ds_iommu_pasid_alloc(vtd_icx->dsi_obj,
> > +                         VTD_MIN_HPASID, VTD_MAX_HPASID, pasid);
> 
> Your indents are always strange to me for long funcalls...  Not a complaint though,
> as long as no one else complains. :)

Yeah, I'm also not happy with them... I'll try to make the indents for
long function calls better.
Yi Liu Feb. 13, 2020, 2:40 a.m. UTC | #4
> From: Peter Xu <peterx@redhat.com>
> Sent: Wednesday, February 12, 2020 5:57 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v3 14/25] intel_iommu: add virtual command capability support
> 
> On Wed, Jan 29, 2020 at 04:16:45AM -0800, Liu, Yi L wrote:
> > +/*
> > + * The basic idea is to let hypervisor to set a range for available
> > + * PASIDs for VMs. One of the reasons is PASID #0 is reserved by
> > + * RID_PASID usage. We have no idea how many reserved PASIDs in future,
> > + * so here just an evaluated value. Honestly, set it as "1" is enough
> > + * at current stage.
> > + */
> > +#define VTD_MIN_HPASID              1
> > +#define VTD_MAX_HPASID              0xFFFFF
> 
> One more question: I see that PASID is defined as 20bits long.  It's
> fine.  However I start to get confused on how the Scalable Mode PASID
> Directory could service that much of PASID entries.
> 
> I'm looking at spec 3.4.3, Figure 3-8.
> 
> Firstly, we only have two levels for a PASID table.  The context entry
> of a device stores a pointer to the "Scalable Mode PASID Directory"
> page. I see that there're 2^14 entries in "Scalable Mode PASID
> Directory" page, each is a "Scalable Mode PASID Table".
> However... how do we fit in the 4K page if each entry is a pointer of
> x86_64 (8 bytes) while there're 2^14 entries?  A simple math gives me
> 4K/8 = 512, which means the "Scalable Mode PASID Directory" page can
> only have 512 entries, then how the 2^14 come from?  Hmm??

I checked with Kevin. The spec doesn't say the directory table is 4K; it
says 4K only for the PASID table. Also, if you look at 9.4, the
scalable-mode context entry includes a PDTS field to specify the actual
size of the directory table.
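
For reference, a rough sketch of the arithmetic behind this, assuming the
PDTS encoding of 2^(X+7) directory entries (the exact encoding is defined
by the spec; the helpers below are illustrative only and not part of this
series):

#include <stdint.h>

/*
 * A 20-bit PASID splits into a 14-bit directory index and a 6-bit
 * table index, since a 4KB PASID table holds 64 entries of 64 bytes
 * each in scalable mode. The directory itself is not limited to 4KB;
 * its size comes from PDTS, assumed here to encode 2^(pdts + 7)
 * directory entries.
 */
static inline uint64_t pasid_dir_entries(uint32_t pdts)
{
    return 1ULL << (pdts + 7);
}

static inline uint64_t pasid_dir_bytes(uint32_t pdts)
{
    return pasid_dir_entries(pdts) * 8;   /* 8-byte directory entries */
}

/*
 * pdts = 7 -> 2^14 entries -> 128KB directory, covering the full
 *             2^20 PASID space (2^14 tables * 64 entries each).
 * pdts = 0 -> 2^7 entries  -> 1KB directory, covering 8K PASIDs.
 */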

> Apart of this: also I just noticed (when reading the latter part of
> the series) that the time that a pasid table walk can consume will
> depend on this value too.  I'd suggest to make this as small as we
> can, as long as it satisfies the usage.  We can even bump it in the
> future.

I see. This looks to be an optimization, right? Instead of modifying the
value of this macro, I think we can do this optimization by tracking the
allocated PASIDs in QEMU. Thus, the PASID table walk would be more
efficient and would have no dependency on VTD_MAX_HPASID. Does that make
sense to you? :-)

Regards,
Yi Liu
Peter Xu Feb. 13, 2020, 2:31 p.m. UTC | #5
On Thu, Feb 13, 2020 at 02:40:45AM +0000, Liu, Yi L wrote:
> > From: Peter Xu <peterx@redhat.com>
> > Sent: Wednesday, February 12, 2020 5:57 AM
> > To: Liu, Yi L <yi.l.liu@intel.com>
> > Subject: Re: [RFC v3 14/25] intel_iommu: add virtual command capability support
> > 
> > On Wed, Jan 29, 2020 at 04:16:45AM -0800, Liu, Yi L wrote:
> > > +/*
> > > + * The basic idea is to let hypervisor to set a range for available
> > > + * PASIDs for VMs. One of the reasons is PASID #0 is reserved by
> > > + * RID_PASID usage. We have no idea how many reserved PASIDs in future,
> > > + * so here just an evaluated value. Honestly, set it as "1" is enough
> > > + * at current stage.
> > > + */
> > > +#define VTD_MIN_HPASID              1
> > > +#define VTD_MAX_HPASID              0xFFFFF
> > 
> > One more question: I see that PASID is defined as 20bits long.  It's
> > fine.  However I start to get confused on how the Scalable Mode PASID
> > Directory could service that much of PASID entries.
> > 
> > I'm looking at spec 3.4.3, Figure 3-8.
> > 
> > Firstly, we only have two levels for a PASID table.  The context entry
> > of a device stores a pointer to the "Scalable Mode PASID Directory"
> > page. I see that there're 2^14 entries in "Scalable Mode PASID
> > Directory" page, each is a "Scalable Mode PASID Table".
> > However... how do we fit in the 4K page if each entry is a pointer of
> > x86_64 (8 bytes) while there're 2^14 entries?  A simple math gives me
> > 4K/8 = 512, which means the "Scalable Mode PASID Directory" page can
> > only have 512 entries, then how the 2^14 come from?  Hmm??
> 
> I checked with Kevin. The spec doesn't say the dir table is 4K. It says 4K
> only for pasid table. Also, if you look at 9.4, scalabe-mode context entry
> includes a PDTS field to specify the actual size of the directory table.

Ah I see.  Then it seems to be missing from this series.  Say, I think
vtd_sm_pasid_table_walk() should also stop walking once it reaches the
size given there, and you need to fetch that size info from the context
entry before the walk starts.

> 
> > Apart of this: also I just noticed (when reading the latter part of
> > the series) that the time that a pasid table walk can consume will
> > depend on this value too.  I'd suggest to make this as small as we
> > can, as long as it satisfies the usage.  We can even bump it in the
> > future.
> 
> I see. This looks to be an optimization. right? Instead of modify the
> value of this macro,  I think we can do this optimization by tracking
> the allocated PASIDs in QEMU. Thus, the pasid table walk  would be more
> efficient and also no dependency on the VTD_MAX_HPASID. Does it make
> sense to you? :-)

Yeah sounds good. :)

Thanks,
Peter Xu Feb. 13, 2020, 3:08 p.m. UTC | #6
On Thu, Feb 13, 2020 at 09:31:10AM -0500, Peter Xu wrote:

[...]

> > > Apart of this: also I just noticed (when reading the latter part of
> > > the series) that the time that a pasid table walk can consume will
> > > depend on this value too.  I'd suggest to make this as small as we
> > > can, as long as it satisfies the usage.  We can even bump it in the
> > > future.
> > 
> > I see. This looks to be an optimization. right? Instead of modify the
> > value of this macro,  I think we can do this optimization by tracking
> > the allocated PASIDs in QEMU. Thus, the pasid table walk  would be more
> > efficient and also no dependency on the VTD_MAX_HPASID. Does it make
> > sense to you? :-)
> 
> Yeah sounds good. :)

Just to make sure it's safe even when the global allocation is not
happening (full emulation devices?  Do they need the PASID table walk
too?).  Anyway, be careful not to miss any valid PASID entries, or we
can still use MIN(PASID_MAX, CONTEXT_ENTRY_SIZE) to be safe as a
first version.  Thanks,
Yi Liu Feb. 15, 2020, 8:49 a.m. UTC | #7
> From: Peter Xu <peterx@redhat.com>
> Sent: Thursday, February 13, 2020 11:09 PM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v3 14/25] intel_iommu: add virtual command capability support
> 
> On Thu, Feb 13, 2020 at 09:31:10AM -0500, Peter Xu wrote:
> 
> [...]
> 
> > > > Apart of this: also I just noticed (when reading the latter part of
> > > > the series) that the time that a pasid table walk can consume will
> > > > depend on this value too.  I'd suggest to make this as small as we
> > > > can, as long as it satisfies the usage.  We can even bump it in the
> > > > future.
> > >
> > > I see. This looks to be an optimization. right? Instead of modify the
> > > value of this macro,  I think we can do this optimization by tracking
> > > the allocated PASIDs in QEMU. Thus, the pasid table walk  would be more
> > > efficient and also no dependency on the VTD_MAX_HPASID. Does it make
> > > sense to you? :-)
> >
> > Yeah sounds good. :)
> 
> Just to make sure it's safe even for when the global allocation is not
> happening (full emulation devices?  Do they need the PASID table walk
> too?). 

I'd say no. For full emulation devices, we just need to ensure the PASID
cache is up to date (do what the guest told us). Even if the invalidation
flushes too much cache, it only affects performance; there is no
correctness issue.  This is different for passthrough devices: if we
unbind too much, some passthrough devices may encounter DMA faults later.

> Anyway, be careful to not miss some valid PASID entries, or we
> can still use the MIN(PASID_MAX, CONTEXT_ENTRY_SIZE) to be safe as a
> first version.  Thanks,

Agreed. The first version will ensure it's 100% safe.

Regards,
Yi Liu
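
As a follow-up to the discussion above, here is a minimal sketch of what
tracking allocated PASIDs in QEMU could look like, so a PASID table walk
only visits PASIDs that were actually allocated instead of scanning up to
VTD_MAX_HPASID. The type and helper names are hypothetical and not part
of this series; in real code the tracking would live in IntelIOMMUState
and be updated where vtd_request_pasid_alloc()/free() succeed.

#include <glib.h>
#include <stdint.h>

/* Hypothetical per-IOMMU tracking state. */
typedef struct VTDPasidTracker {
    GHashTable *allocated;   /* set of guest-visible PASIDs */
} VTDPasidTracker;

static void vtd_pasid_tracker_init(VTDPasidTracker *t)
{
    t->allocated = g_hash_table_new(g_direct_hash, g_direct_equal);
}

static void vtd_pasid_track_alloc(VTDPasidTracker *t, uint32_t pasid)
{
    g_hash_table_add(t->allocated, GUINT_TO_POINTER(pasid));
}

static void vtd_pasid_track_free(VTDPasidTracker *t, uint32_t pasid)
{
    g_hash_table_remove(t->allocated, GUINT_TO_POINTER(pasid));
}

/*
 * A table walk can then iterate only over PASIDs known to be allocated
 * instead of walking from VTD_MIN_HPASID to VTD_MAX_HPASID.
 */
static void vtd_pasid_track_foreach(VTDPasidTracker *t,
                                    void (*fn)(uint32_t pasid, void *opaque),
                                    void *opaque)
{
    GHashTableIter iter;
    gpointer key;

    g_hash_table_iter_init(&iter, t->allocated);
    while (g_hash_table_iter_next(&iter, &key, NULL)) {
        fn(GPOINTER_TO_UINT(key), opaque);
    }
}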

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 33be40c..43a728f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2649,6 +2649,142 @@  static void vtd_handle_iectl_write(IntelIOMMUState *s)
     }
 }
 
+static int vtd_request_pasid_alloc(IntelIOMMUState *s, uint32_t *pasid)
+{
+    VTDBus *vtd_bus;
+    int bus_n, devfn, ret = -errno;
+    VTDIOMMUContext *vtd_icx;
+
+    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
+        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
+        if (!vtd_bus) {
+            continue;
+        }
+        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+            vtd_icx = vtd_bus->dev_icx[devfn];
+            if (!vtd_icx) {
+                continue;
+            }
+
+            /*
+             * We'll return the first valid result we got. It's
+             * a bit hackish in that we don't have a good global
+             * interface yet to talk to modules like vfio to deliver
+             * this allocation request, so we're leveraging this
+             * per-device iommu object to do the same thing just
+             * to make sure the allocation happens only once.
+             */
+            ret = ds_iommu_pasid_alloc(vtd_icx->dsi_obj,
+                         VTD_MIN_HPASID, VTD_MAX_HPASID, pasid);
+            if (!ret) {
+                break;
+            }
+        }
+    }
+    return ret;
+}
+
+static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
+{
+    VTDBus *vtd_bus;
+    int bus_n, devfn, ret = -errno;
+    VTDIOMMUContext *vtd_icx;
+
+    for (bus_n = 0; bus_n < PCI_BUS_MAX; bus_n++) {
+        vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
+        if (!vtd_bus) {
+            continue;
+        }
+        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+            vtd_icx = vtd_bus->dev_icx[devfn];
+            if (!vtd_icx) {
+                continue;
+            }
+            /*
+             * Similar with pasid allocation. We'll free the pasid
+             * on the first successful free operation. It's a bit
+             * hackish in that we don't have a good global interface
+             * yet to talk to modules like vfio to deliver this pasid
+             * free request, so we're leveraging this per-device iommu
+             * object to do the same thing just to make sure the
+             * free happens only once.
+             */
+            ret = ds_iommu_pasid_free(vtd_icx->dsi_obj, pasid);
+            if (!ret) {
+                break;
+            }
+        }
+    }
+    return ret;
+}
+
+/*
+ * Set the Interrupt Pending (IP) bit in VCRSP. This also implicitly
+ * clears the status code, so by default the response reads as success.
+ */
+static void vtd_vcmd_set_ip(IntelIOMMUState *s)
+{
+    s->vcrsp = 1;
+    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                     ((uint64_t) s->vcrsp));
+}
+
+static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
+{
+    s->vcrsp &= (~((uint64_t)(0x1)));
+    vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+                     ((uint64_t) s->vcrsp));
+}
+
+/* Handle write to Virtual Command Register */
+static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
+{
+    uint32_t pasid;
+    int ret = -1;
+
+    trace_vtd_reg_write_vcmd(s->vcrsp, val);
+
+    if (!(s->vccap & VTD_VCCAP_PAS) ||
+         (s->vcrsp & 1)) {
+        return -1;
+    }
+
+    /*
+     * The vCPU should be blocked while the guest VCMD write is
+     * trapped here, so no other vCPU should be accessing VCMD if
+     * the guest software is well written. However, we still
+     * emulate the IP bit here in case of bad guest software, and
+     * to align with the spec.
+     */
+    vtd_vcmd_set_ip(s);
+
+    switch (val & VTD_VCMD_CMD_MASK) {
+    case VTD_VCMD_ALLOC_PASID:
+        ret = vtd_request_pasid_alloc(s, &pasid);
+        if (ret) {
+            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
+        } else {
+            s->vcrsp |= VTD_VCRSP_RSLT(pasid);
+        }
+        break;
+
+    case VTD_VCMD_FREE_PASID:
+        pasid = VTD_VCMD_PASID_VALUE(val);
+        ret = vtd_request_pasid_free(s, pasid);
+        if (ret < 0) {
+            s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
+        }
+        break;
+
+    default:
+        s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
+        error_report_once("Virtual Command: unsupported command!!!");
+        break;
+    }
+    vtd_vcmd_clear_ip(s);
+    return 0;
+}
+
 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
 {
     IntelIOMMUState *s = opaque;
@@ -2938,6 +3074,23 @@  static void vtd_mem_write(void *opaque, hwaddr addr,
         vtd_set_long(s, addr, val);
         break;
 
+    case DMAR_VCMD_REG:
+        if (!vtd_handle_vcmd_write(s, val)) {
+            if (size == 4) {
+                vtd_set_long(s, addr, val);
+            } else {
+                vtd_set_quad(s, addr, val);
+            }
+        }
+        break;
+
+    case DMAR_VCMD_REG_HI:
+        assert(size == 4);
+        if (!vtd_handle_vcmd_write(s, val)) {
+            vtd_set_long(s, addr, val);
+        }
+        break;
+
     default:
         if (size == 4) {
             vtd_set_long(s, addr, val);
@@ -3712,7 +3865,8 @@  static void vtd_init(IntelIOMMUState *s)
         s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
     } else if (s->scalable_mode && s->scalable_modern) {
         s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID
-                   | VTD_ECAP_FLTS | VTD_ECAP_PSS;
+                   | VTD_ECAP_FLTS | VTD_ECAP_PSS | VTD_ECAP_VCS;
+        s->vccap |= VTD_VCCAP_PAS;
     }
 
     vtd_reset_caches(s);
@@ -3768,6 +3922,13 @@  static void vtd_init(IntelIOMMUState *s)
      * Interrupt remapping registers.
      */
     vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
+
+    /*
+     * Virtual Command Definitions
+     */
+    vtd_define_quad(s, DMAR_VCCAP_REG, s->vccap, 0, 0);
+    vtd_define_quad(s, DMAR_VCMD_REG, 0, 0xffffffffffffffffULL, 0);
+    vtd_define_quad(s, DMAR_VCRSP_REG, 0, 0, 0);
 }
 
 /* Should not reset address_spaces when reset because devices will still use
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index c4dbb2c..fb5fdc2 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -85,6 +85,12 @@ 
 #define DMAR_MTRRCAP_REG_HI     0x104
 #define DMAR_MTRRDEF_REG        0x108 /* MTRR default type */
 #define DMAR_MTRRDEF_REG_HI     0x10c
+#define DMAR_VCCAP_REG          0xE00 /* Virtual Command Capability Register */
+#define DMAR_VCCAP_REG_HI       0xE04
+#define DMAR_VCMD_REG           0xE10 /* Virtual Command Register */
+#define DMAR_VCMD_REG_HI        0xE14
+#define DMAR_VCRSP_REG          0xE20 /* Virtual Command Response Register */
+#define DMAR_VCRSP_REG_HI       0xE24
 
 /* IOTLB registers */
 #define DMAR_IOTLB_REG_OFFSET   0xf0 /* Offset to the IOTLB registers */
@@ -193,6 +199,7 @@ 
 #define VTD_ECAP_PSS                (19ULL << 35)
 #define VTD_ECAP_PASID              (1ULL << 40)
 #define VTD_ECAP_SMTS               (1ULL << 43)
+#define VTD_ECAP_VCS                (1ULL << 44)
 #define VTD_ECAP_SLTS               (1ULL << 46)
 #define VTD_ECAP_FLTS               (1ULL << 47)
 
@@ -315,6 +322,37 @@  typedef enum VTDFaultReason {
 
 #define VTD_CONTEXT_CACHE_GEN_MAX       0xffffffffUL
 
+/* VCCAP_REG */
+#define VTD_VCCAP_PAS               (1UL << 0)
+
+/*
+ * The basic idea is to let the hypervisor set a range of PASIDs
+ * available to VMs. One reason is that PASID #0 is reserved for
+ * RID_PASID usage. We don't know how many PASIDs will be reserved
+ * in the future, so this is just an estimated value. Honestly,
+ * setting it to 1 is enough at the current stage.
+ */
+#define VTD_MIN_HPASID              1
+#define VTD_MAX_HPASID              0xFFFFF
+
+/* Virtual Command Register */
+enum {
+     VTD_VCMD_NULL_CMD = 0,
+     VTD_VCMD_ALLOC_PASID = 1,
+     VTD_VCMD_FREE_PASID = 2,
+     VTD_VCMD_CMD_NUM,
+};
+
+#define VTD_VCMD_CMD_MASK           0xffUL
+#define VTD_VCMD_PASID_VALUE(val)   (((val) >> 8) & 0xfffff)
+
+#define VTD_VCRSP_RSLT(val)         ((val) << 8)
+#define VTD_VCRSP_SC(val)           (((val) & 0x3) << 1)
+
+#define VTD_VCMD_UNDEFINED_CMD         1ULL
+#define VTD_VCMD_NO_AVAILABLE_PASID    2ULL
+#define VTD_VCMD_FREE_INVALID_PASID    2ULL
+
 /* Interrupt Entry Cache Invalidation Descriptor: VT-d 6.5.2.7. */
 struct VTDInvDescIEC {
     uint32_t type:4;            /* Should always be 0x4 */
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index e48bef2..71536a7 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -51,6 +51,7 @@  vtd_reg_write_gcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"
 vtd_reg_write_fectl(uint32_t value) "value 0x%"PRIx32
 vtd_reg_write_iectl(uint32_t value) "value 0x%"PRIx32
 vtd_reg_ics_clear_ip(void) ""
+vtd_reg_write_vcmd(uint32_t status, uint32_t val) "status 0x%"PRIx32" value 0x%"PRIx32
 vtd_dmar_translate(uint8_t bus, uint8_t slot, uint8_t func, uint64_t iova, uint64_t gpa, uint64_t mask) "dev %02x:%02x.%02x iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64
 vtd_dmar_enable(bool en) "enable %d"
 vtd_dmar_fault(uint16_t sid, int fault, uint64_t addr, bool is_write) "sid 0x%"PRIx16" fault %d addr 0x%"PRIx64" write %d"
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 1ef2917..4158116 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -46,7 +46,7 @@ 
 #define VTD_SID_TO_BUS(sid)         (((sid) >> 8) & 0xff)
 #define VTD_SID_TO_DEVFN(sid)       ((sid) & 0xff)
 
-#define DMAR_REG_SIZE               0x230
+#define DMAR_REG_SIZE               0xF00
 #define VTD_HOST_AW_39BIT           39
 #define VTD_HOST_AW_48BIT           48
 #define VTD_HOST_ADDRESS_WIDTH      VTD_HOST_AW_39BIT
@@ -285,6 +285,10 @@  struct IntelIOMMUState {
     uint8_t aw_bits;                /* Host/IOVA address width (in bits) */
     bool dma_drain;                 /* Whether DMA r/w draining enabled */
 
+    /* Virtual Command Register */
+    uint64_t vccap;                 /* The value of vcmd capability reg */
+    uint64_t vcrsp;                 /* Current value of VCMD RSP REG */
+
     /*
      * Protects IOMMU states in general.  Currently it protects the
      * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace.