diff mbox series

[RFC,v2,09/22] vfio/pci: add iommu_context notifier for pasid alloc/free

Message ID 1571920483-3382-10-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series intel_iommu: expose Shared Virtual Addressing to VM | expand

Commit Message

Yi Liu Oct. 24, 2019, 12:34 p.m. UTC
This patch adds PASID alloc/free notifiers for vfio-pci. They are
supposed to be fired by the vIOMMU. VFIO then sends the PASID
allocation or free request to the host.

Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Eric Auger <eric.auger@redhat.com>
Cc: Yi Sun <yi.y.sun@linux.intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
---
 hw/vfio/common.c         |  9 ++++++
 hw/vfio/pci.c            | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/iommu/iommu.h | 15 +++++++++
 3 files changed, 105 insertions(+)

Comments

David Gibson Oct. 29, 2019, 12:15 p.m. UTC | #1
On Thu, Oct 24, 2019 at 08:34:30AM -0400, Liu Yi L wrote:
> This patch adds pasid alloc/free notifiers for vfio-pci. It is
> supposed to be fired by vIOMMU. VFIO then sends PASID allocation
> or free request to host.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Eric Auger <eric.auger@redhat.com>
> Cc: Yi Sun <yi.y.sun@linux.intel.com>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> ---
>  hw/vfio/common.c         |  9 ++++++
>  hw/vfio/pci.c            | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/iommu/iommu.h | 15 +++++++++
>  3 files changed, 105 insertions(+)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index d418527..e6ad21c 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -1436,6 +1436,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
>      if (QLIST_EMPTY(&container->group_list)) {
>          VFIOAddressSpace *space = container->space;
>          VFIOGuestIOMMU *giommu, *tmp;
> +        VFIOIOMMUContext *giommu_ctx, *ctx;
>  
>          QLIST_REMOVE(container, next);
>  
> @@ -1446,6 +1447,14 @@ static void vfio_disconnect_container(VFIOGroup *group)
>              g_free(giommu);
>          }
>  
> +        QLIST_FOREACH_SAFE(giommu_ctx, &container->iommu_ctx_list,
> +                                                   iommu_ctx_next, ctx) {
> +            iommu_ctx_notifier_unregister(giommu_ctx->iommu_ctx,
> +                                                      &giommu_ctx->n);
> +            QLIST_REMOVE(giommu_ctx, iommu_ctx_next);
> +            g_free(giommu_ctx);
> +        }
> +
>          trace_vfio_disconnect_container(container->fd);
>          close(container->fd);
>          g_free(container);
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 12fac39..8721ff6 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -2699,11 +2699,80 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
>      vdev->req_enabled = false;
>  }
>  
> +static void vfio_register_iommu_ctx_notifier(VFIOPCIDevice *vdev,
> +                                             IOMMUContext *iommu_ctx,
> +                                             IOMMUCTXNotifyFn fn,
> +                                             IOMMUCTXEvent event)
> +{
> +    VFIOContainer *container = vdev->vbasedev.group->container;
> +    VFIOIOMMUContext *giommu_ctx;
> +
> +    giommu_ctx = g_malloc0(sizeof(*giommu_ctx));
> +    giommu_ctx->container = container;
> +    giommu_ctx->iommu_ctx = iommu_ctx;
> +    QLIST_INSERT_HEAD(&container->iommu_ctx_list,
> +                      giommu_ctx,
> +                      iommu_ctx_next);
> +    iommu_ctx_notifier_register(iommu_ctx,
> +                                &giommu_ctx->n,
> +                                fn,
> +                                event);
> +}
> +
> +static void vfio_iommu_pasid_alloc_notify(IOMMUCTXNotifier *n,
> +                                          IOMMUCTXEventData *event_data)
> +{
> +    VFIOIOMMUContext *giommu_ctx = container_of(n, VFIOIOMMUContext, n);
> +    VFIOContainer *container = giommu_ctx->container;
> +    IOMMUCTXPASIDReqDesc *pasid_req =
> +                              (IOMMUCTXPASIDReqDesc *) event_data->data;
> +    struct vfio_iommu_type1_pasid_request req;
> +    unsigned long argsz;
> +    int pasid;
> +
> +    argsz = sizeof(req);
> +    req.argsz = argsz;
> +    req.flag = VFIO_IOMMU_PASID_ALLOC;
> +    req.min_pasid = pasid_req->min_pasid;
> +    req.max_pasid = pasid_req->max_pasid;
> +
> +    pasid = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
> +    if (pasid < 0) {
> +        error_report("%s: %d, alloc failed", __func__, -errno);
> +    }
> +    pasid_req->alloc_result = pasid;

Altering the event data from the notifier doesn't make sense.  By
definition there can be multiple notifiers on the chain, so in that
case which one is responsible for updating the writable field?

> +}
> +
> +static void vfio_iommu_pasid_free_notify(IOMMUCTXNotifier *n,
> +                                          IOMMUCTXEventData *event_data)
> +{
> +    VFIOIOMMUContext *giommu_ctx = container_of(n, VFIOIOMMUContext, n);
> +    VFIOContainer *container = giommu_ctx->container;
> +    IOMMUCTXPASIDReqDesc *pasid_req =
> +                              (IOMMUCTXPASIDReqDesc *) event_data->data;
> +    struct vfio_iommu_type1_pasid_request req;
> +    unsigned long argsz;
> +    int ret = 0;
> +
> +    argsz = sizeof(req);
> +    req.argsz = argsz;
> +    req.flag = VFIO_IOMMU_PASID_FREE;
> +    req.pasid = pasid_req->pasid;
> +
> +    ret = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
> +    if (ret != 0) {
> +        error_report("%s: %d, pasid %u free failed",
> +                   __func__, -errno, (unsigned) pasid_req->pasid);
> +    }
> +    pasid_req->free_result = ret;

Same problem here.

> +}
> +
>  static void vfio_realize(PCIDevice *pdev, Error **errp)
>  {
>      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
>      VFIODevice *vbasedev_iter;
>      VFIOGroup *group;
> +    IOMMUContext *iommu_context;
>      char *tmp, *subsys, group_path[PATH_MAX], *group_name;
>      Error *err = NULL;
>      ssize_t len;
> @@ -3000,6 +3069,18 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>      vfio_register_req_notifier(vdev);
>      vfio_setup_resetfn_quirk(vdev);
>  
> +    iommu_context = pci_device_iommu_context(pdev);
> +    if (iommu_context) {
> +        vfio_register_iommu_ctx_notifier(vdev,
> +                                         iommu_context,
> +                                         vfio_iommu_pasid_alloc_notify,
> +                                         IOMMU_CTX_EVENT_PASID_ALLOC);
> +        vfio_register_iommu_ctx_notifier(vdev,
> +                                         iommu_context,
> +                                         vfio_iommu_pasid_free_notify,
> +                                         IOMMU_CTX_EVENT_PASID_FREE);
> +    }
> +
>      return;
>  
>  out_teardown:
> diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h
> index c22c442..4352afd 100644
> --- a/include/hw/iommu/iommu.h
> +++ b/include/hw/iommu/iommu.h
> @@ -31,10 +31,25 @@
>  typedef struct IOMMUContext IOMMUContext;
>  
>  enum IOMMUCTXEvent {
> +    IOMMU_CTX_EVENT_PASID_ALLOC,
> +    IOMMU_CTX_EVENT_PASID_FREE,
>      IOMMU_CTX_EVENT_NUM,
>  };
>  typedef enum IOMMUCTXEvent IOMMUCTXEvent;
>  
> +union IOMMUCTXPASIDReqDesc {
> +    struct {
> +        uint32_t min_pasid;
> +        uint32_t max_pasid;
> +        int32_t alloc_result; /* pasid allocated for the alloc request */
> +    };
> +    struct {
> +        uint32_t pasid; /* pasid to be free */
> +        int free_result;
> +    };
> +};

Apart from the problem with writable fields, using a big union for
event data is pretty ugly.  If you need this different information for
the different events, it might make more sense to have a separate
notifier chain with a separate call interface for each event type,
rather than trying to multiplex them together.

> +typedef union IOMMUCTXPASIDReqDesc IOMMUCTXPASIDReqDesc;
> +
>  struct IOMMUCTXEventData {
>      IOMMUCTXEvent event;
>      uint64_t length;
Peter Xu Nov. 1, 2019, 5:26 p.m. UTC | #2
On Tue, Oct 29, 2019 at 01:15:44PM +0100, David Gibson wrote:
> > +union IOMMUCTXPASIDReqDesc {
> > +    struct {
> > +        uint32_t min_pasid;
> > +        uint32_t max_pasid;
> > +        int32_t alloc_result; /* pasid allocated for the alloc request */
> > +    };
> > +    struct {
> > +        uint32_t pasid; /* pasid to be free */
> > +        int free_result;
> > +    };
> > +};
> 
> Apart from theproblem with writable fields, using a big union for
> event data is pretty ugly.  If you need this different information for
> the different events, it might make more sense to have a separate
> notifier chain with a separate call interface for each event type,
> rather than trying to multiplex them together.

I have no issue with the union definition; however, I do agree that it's a
bit awkward to register one notifier for each event.

Instead of introducing even more notifier chains, I'm thinking whether
we can simply provide a single notifier hook for all the four events.
After all, I don't see in what case we'd register only some of the
events; e.g. we can't register alloc_pasid() without registering
free_pasid(), because otherwise it does not make sense.  And also you
have the wrapper struct ("IOMMUCTXEventData") which contains the event
type, so the notify() hook will know which message this is.

A side note is that I think you don't need the
IOMMUCTXEventData.length.  If you see the code, vtd_bind_guest_pasid()
does not even initialize length right now, and I think it could still
work only because none of the vfio notify() hook
(e.g. vfio_iommu_pasid_bind_notify) checks that length...
Yi Liu Nov. 6, 2019, 12:14 p.m. UTC | #3
> From: David Gibson [mailto:david@gibson.dropbear.id.au]
> Sent: Tuesday, October 29, 2019 8:16 PM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v2 09/22] vfio/pci: add iommu_context notifier for pasid alloc/free
> 
> On Thu, Oct 24, 2019 at 08:34:30AM -0400, Liu Yi L wrote:
> > This patch adds pasid alloc/free notifiers for vfio-pci. It is
> > supposed to be fired by vIOMMU. VFIO then sends PASID allocation
> > or free request to host.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Peter Xu <peterx@redhat.com>
> > Cc: Eric Auger <eric.auger@redhat.com>
> > Cc: Yi Sun <yi.y.sun@linux.intel.com>
> > Cc: David Gibson <david@gibson.dropbear.id.au>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > ---
> >  hw/vfio/common.c         |  9 ++++++
> >  hw/vfio/pci.c            | 81
> ++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/hw/iommu/iommu.h | 15 +++++++++
> >  3 files changed, 105 insertions(+)
> >
> > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > index d418527..e6ad21c 100644
> > --- a/hw/vfio/common.c
> > +++ b/hw/vfio/common.c
> > @@ -1436,6 +1436,7 @@ static void vfio_disconnect_container(VFIOGroup
> *group)
> >      if (QLIST_EMPTY(&container->group_list)) {
> >          VFIOAddressSpace *space = container->space;
> >          VFIOGuestIOMMU *giommu, *tmp;
> > +        VFIOIOMMUContext *giommu_ctx, *ctx;
> >
> >          QLIST_REMOVE(container, next);
> >
> > @@ -1446,6 +1447,14 @@ static void vfio_disconnect_container(VFIOGroup
> *group)
> >              g_free(giommu);
> >          }
> >
> > +        QLIST_FOREACH_SAFE(giommu_ctx, &container->iommu_ctx_list,
> > +                                                   iommu_ctx_next, ctx) {
> > +            iommu_ctx_notifier_unregister(giommu_ctx->iommu_ctx,
> > +                                                      &giommu_ctx->n);
> > +            QLIST_REMOVE(giommu_ctx, iommu_ctx_next);
> > +            g_free(giommu_ctx);
> > +        }
> > +
> >          trace_vfio_disconnect_container(container->fd);
> >          close(container->fd);
> >          g_free(container);
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index 12fac39..8721ff6 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -2699,11 +2699,80 @@ static void
> vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> >      vdev->req_enabled = false;
> >  }
> >
> > +static void vfio_register_iommu_ctx_notifier(VFIOPCIDevice *vdev,
> > +                                             IOMMUContext *iommu_ctx,
> > +                                             IOMMUCTXNotifyFn fn,
> > +                                             IOMMUCTXEvent event)
> > +{
> > +    VFIOContainer *container = vdev->vbasedev.group->container;
> > +    VFIOIOMMUContext *giommu_ctx;
> > +
> > +    giommu_ctx = g_malloc0(sizeof(*giommu_ctx));
> > +    giommu_ctx->container = container;
> > +    giommu_ctx->iommu_ctx = iommu_ctx;
> > +    QLIST_INSERT_HEAD(&container->iommu_ctx_list,
> > +                      giommu_ctx,
> > +                      iommu_ctx_next);
> > +    iommu_ctx_notifier_register(iommu_ctx,
> > +                                &giommu_ctx->n,
> > +                                fn,
> > +                                event);
> > +}
> > +
> > +static void vfio_iommu_pasid_alloc_notify(IOMMUCTXNotifier *n,
> > +                                          IOMMUCTXEventData *event_data)
> > +{
> > +    VFIOIOMMUContext *giommu_ctx = container_of(n, VFIOIOMMUContext, n);
> > +    VFIOContainer *container = giommu_ctx->container;
> > +    IOMMUCTXPASIDReqDesc *pasid_req =
> > +                              (IOMMUCTXPASIDReqDesc *) event_data->data;
> > +    struct vfio_iommu_type1_pasid_request req;
> > +    unsigned long argsz;
> > +    int pasid;
> > +
> > +    argsz = sizeof(req);
> > +    req.argsz = argsz;
> > +    req.flag = VFIO_IOMMU_PASID_ALLOC;
> > +    req.min_pasid = pasid_req->min_pasid;
> > +    req.max_pasid = pasid_req->max_pasid;
> > +
> > +    pasid = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
> > +    if (pasid < 0) {
> > +        error_report("%s: %d, alloc failed", __func__, -errno);
> > +    }
> > +    pasid_req->alloc_result = pasid;
> 
> Altering the event data from the notifier doesn't make sense.  By
> definition there can be multiple notifiers on the chain, so in that
> case which one is responsible for updating the writable field?

I guess you mean multiple pasid_alloc notifiers, right?

It works for VT-d now, as the Intel vIOMMU maintains the IOMMUContext
per-bdf, so there will be only one pasid_alloc notifier in the chain. But I
agree it is not good if another module shares an IOMMUContext across
devices; in that case there would definitely be multiple pasid_alloc notifiers.

How about making the IOMMUContext layer invoke only one successful
pasid_alloc/free notifier when a PASID_ALLOC/FREE event comes? pasid
alloc/free are really special as they require feedback. A potential
benefit is that pasid_alloc/free will not be affected by the hot plug
scenario: there will always be a notifier to do the pasid_alloc/free
work unless all passthru devices are hot-unplugged. What do you think? Or
do you have any other ideas?

> > +}
> > +
> > +static void vfio_iommu_pasid_free_notify(IOMMUCTXNotifier *n,
> > +                                          IOMMUCTXEventData *event_data)
> > +{
> > +    VFIOIOMMUContext *giommu_ctx = container_of(n, VFIOIOMMUContext, n);
> > +    VFIOContainer *container = giommu_ctx->container;
> > +    IOMMUCTXPASIDReqDesc *pasid_req =
> > +                              (IOMMUCTXPASIDReqDesc *) event_data->data;
> > +    struct vfio_iommu_type1_pasid_request req;
> > +    unsigned long argsz;
> > +    int ret = 0;
> > +
> > +    argsz = sizeof(req);
> > +    req.argsz = argsz;
> > +    req.flag = VFIO_IOMMU_PASID_FREE;
> > +    req.pasid = pasid_req->pasid;
> > +
> > +    ret = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
> > +    if (ret != 0) {
> > +        error_report("%s: %d, pasid %u free failed",
> > +                   __func__, -errno, (unsigned) pasid_req->pasid);
> > +    }
> > +    pasid_req->free_result = ret;
> 
> Same problem here.

yep, as above proposal.

> > +}
> > +
> >  static void vfio_realize(PCIDevice *pdev, Error **errp)
> >  {
> >      VFIOPCIDevice *vdev = PCI_VFIO(pdev);
> >      VFIODevice *vbasedev_iter;
> >      VFIOGroup *group;
> > +    IOMMUContext *iommu_context;
> >      char *tmp, *subsys, group_path[PATH_MAX], *group_name;
> >      Error *err = NULL;
> >      ssize_t len;
> > @@ -3000,6 +3069,18 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
> >      vfio_register_req_notifier(vdev);
> >      vfio_setup_resetfn_quirk(vdev);
> >
> > +    iommu_context = pci_device_iommu_context(pdev);
> > +    if (iommu_context) {
> > +        vfio_register_iommu_ctx_notifier(vdev,
> > +                                         iommu_context,
> > +                                         vfio_iommu_pasid_alloc_notify,
> > +                                         IOMMU_CTX_EVENT_PASID_ALLOC);
> > +        vfio_register_iommu_ctx_notifier(vdev,
> > +                                         iommu_context,
> > +                                         vfio_iommu_pasid_free_notify,
> > +                                         IOMMU_CTX_EVENT_PASID_FREE);
> > +    }
> > +
> >      return;
> >
> >  out_teardown:
> > diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h
> > index c22c442..4352afd 100644
> > --- a/include/hw/iommu/iommu.h
> > +++ b/include/hw/iommu/iommu.h
> > @@ -31,10 +31,25 @@
> >  typedef struct IOMMUContext IOMMUContext;
> >
> >  enum IOMMUCTXEvent {
> > +    IOMMU_CTX_EVENT_PASID_ALLOC,
> > +    IOMMU_CTX_EVENT_PASID_FREE,
> >      IOMMU_CTX_EVENT_NUM,
> >  };
> >  typedef enum IOMMUCTXEvent IOMMUCTXEvent;
> >
> > +union IOMMUCTXPASIDReqDesc {
> > +    struct {
> > +        uint32_t min_pasid;
> > +        uint32_t max_pasid;
> > +        int32_t alloc_result; /* pasid allocated for the alloc request */
> > +    };
> > +    struct {
> > +        uint32_t pasid; /* pasid to be free */
> > +        int free_result;
> > +    };
> > +};
> 
> Apart from theproblem with writable fields, using a big union for
> event data is pretty ugly.  If you need this different information for
> the different events, it might make more sense to have a separate
> notifier chain with a separate call interface for each event type,
> rather than trying to multiplex them together.

sure, I'll de-couple them. Nice catch.

Thanks,
Yi Liu
Yi Liu Nov. 6, 2019, 12:46 p.m. UTC | #4
> From: Peter Xu
> Sent: Saturday, November 2, 2019 1:26 AM
> To: David Gibson <david@gibson.dropbear.id.au>
> Subject: Re: [RFC v2 09/22] vfio/pci: add iommu_context notifier for pasid alloc/free
> 
> On Tue, Oct 29, 2019 at 01:15:44PM +0100, David Gibson wrote:
> > > +union IOMMUCTXPASIDReqDesc {
> > > +    struct {
> > > +        uint32_t min_pasid;
> > > +        uint32_t max_pasid;
> > > +        int32_t alloc_result; /* pasid allocated for the alloc request */
> > > +    };
> > > +    struct {
> > > +        uint32_t pasid; /* pasid to be free */
> > > +        int free_result;
> > > +    };
> > > +};
> >
> > Apart from theproblem with writable fields, using a big union for
> > event data is pretty ugly.  If you need this different information for
> > the different events, it might make more sense to have a separate
> > notifier chain with a separate call interface for each event type,
> > rather than trying to multiplex them together.
> 
> I have no issue on the union definiion, however I do agree that it's a
> bit awkward to register one notifier for each event.

Got it. I will fix it in the next version.

> Instead of introducing even more notifier chains, I'm thinking whether
> we can simply provide a single notifier hook for all the four events.
> After all I don't see in what case we'll only register some of the
> events, like we can't register alloc_pasid() without registering to
> free_pasid() because otherwise it does not make sense..  And also you
> have the wrapper struct ("IOMMUCTXEventData") which contains the event
> type, so the notify() hook will know which message is this.

I'm fine with this proposal. It makes the notifier chain smaller.

> A side note is that I think you don't need the
> IOMMUCTXEventData.length.  If you see the code, vtd_bind_guest_pasid()
> does not even initialize length right now, and I think it could still
> work only because none of the vfio notify() hook
> (e.g. vfio_iommu_pasid_bind_notify) checks that length...

yes, will fix it.

> --
> Peter Xu
David Gibson Nov. 20, 2019, 4:27 a.m. UTC | #5
On Wed, Nov 06, 2019 at 12:14:50PM +0000, Liu, Yi L wrote:
> > From: David Gibson [mailto:david@gibson.dropbear.id.au]
> > Sent: Tuesday, October 29, 2019 8:16 PM
> > To: Liu, Yi L <yi.l.liu@intel.com>
> > Subject: Re: [RFC v2 09/22] vfio/pci: add iommu_context notifier for pasid alloc/free
> > 
> > On Thu, Oct 24, 2019 at 08:34:30AM -0400, Liu Yi L wrote:
> > > This patch adds pasid alloc/free notifiers for vfio-pci. It is
> > > supposed to be fired by vIOMMU. VFIO then sends PASID allocation
> > > or free request to host.
> > >
> > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > Cc: Peter Xu <peterx@redhat.com>
> > > Cc: Eric Auger <eric.auger@redhat.com>
> > > Cc: Yi Sun <yi.y.sun@linux.intel.com>
> > > Cc: David Gibson <david@gibson.dropbear.id.au>
> > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > ---
> > >  hw/vfio/common.c         |  9 ++++++
> > >  hw/vfio/pci.c            | 81
> > ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  include/hw/iommu/iommu.h | 15 +++++++++
> > >  3 files changed, 105 insertions(+)
> > >
> > > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > > index d418527..e6ad21c 100644
> > > --- a/hw/vfio/common.c
> > > +++ b/hw/vfio/common.c
> > > @@ -1436,6 +1436,7 @@ static void vfio_disconnect_container(VFIOGroup
> > *group)
> > >      if (QLIST_EMPTY(&container->group_list)) {
> > >          VFIOAddressSpace *space = container->space;
> > >          VFIOGuestIOMMU *giommu, *tmp;
> > > +        VFIOIOMMUContext *giommu_ctx, *ctx;
> > >
> > >          QLIST_REMOVE(container, next);
> > >
> > > @@ -1446,6 +1447,14 @@ static void vfio_disconnect_container(VFIOGroup
> > *group)
> > >              g_free(giommu);
> > >          }
> > >
> > > +        QLIST_FOREACH_SAFE(giommu_ctx, &container->iommu_ctx_list,
> > > +                                                   iommu_ctx_next, ctx) {
> > > +            iommu_ctx_notifier_unregister(giommu_ctx->iommu_ctx,
> > > +                                                      &giommu_ctx->n);
> > > +            QLIST_REMOVE(giommu_ctx, iommu_ctx_next);
> > > +            g_free(giommu_ctx);
> > > +        }
> > > +
> > >          trace_vfio_disconnect_container(container->fd);
> > >          close(container->fd);
> > >          g_free(container);
> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index 12fac39..8721ff6 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -2699,11 +2699,80 @@ static void
> > vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
> > >      vdev->req_enabled = false;
> > >  }
> > >
> > > +static void vfio_register_iommu_ctx_notifier(VFIOPCIDevice *vdev,
> > > +                                             IOMMUContext *iommu_ctx,
> > > +                                             IOMMUCTXNotifyFn fn,
> > > +                                             IOMMUCTXEvent event)
> > > +{
> > > +    VFIOContainer *container = vdev->vbasedev.group->container;
> > > +    VFIOIOMMUContext *giommu_ctx;
> > > +
> > > +    giommu_ctx = g_malloc0(sizeof(*giommu_ctx));
> > > +    giommu_ctx->container = container;
> > > +    giommu_ctx->iommu_ctx = iommu_ctx;
> > > +    QLIST_INSERT_HEAD(&container->iommu_ctx_list,
> > > +                      giommu_ctx,
> > > +                      iommu_ctx_next);
> > > +    iommu_ctx_notifier_register(iommu_ctx,
> > > +                                &giommu_ctx->n,
> > > +                                fn,
> > > +                                event);
> > > +}
> > > +
> > > +static void vfio_iommu_pasid_alloc_notify(IOMMUCTXNotifier *n,
> > > +                                          IOMMUCTXEventData *event_data)
> > > +{
> > > +    VFIOIOMMUContext *giommu_ctx = container_of(n, VFIOIOMMUContext, n);
> > > +    VFIOContainer *container = giommu_ctx->container;
> > > +    IOMMUCTXPASIDReqDesc *pasid_req =
> > > +                              (IOMMUCTXPASIDReqDesc *) event_data->data;
> > > +    struct vfio_iommu_type1_pasid_request req;
> > > +    unsigned long argsz;
> > > +    int pasid;
> > > +
> > > +    argsz = sizeof(req);
> > > +    req.argsz = argsz;
> > > +    req.flag = VFIO_IOMMU_PASID_ALLOC;
> > > +    req.min_pasid = pasid_req->min_pasid;
> > > +    req.max_pasid = pasid_req->max_pasid;
> > > +
> > > +    pasid = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
> > > +    if (pasid < 0) {
> > > +        error_report("%s: %d, alloc failed", __func__, -errno);
> > > +    }
> > > +    pasid_req->alloc_result = pasid;
> > 
> > Altering the event data from the notifier doesn't make sense.  By
> > definition there can be multiple notifiers on the chain, so in that
> > case which one is responsible for updating the writable field?
> 
> I guess you mean multiple pasid_alloc nofitiers. right?
> 
> It works for VT-d now, as Intel vIOMMU maintains the IOMMUContext
> per-bdf. And there will be only 1 pasid_alloc notifier in the chain. But, I
> agree it is not good if other module just share an IOMMUConext across
> devices. Definitely, it would have multiple pasid_alloc notifiers.

Right.

> How about enforcing IOMMUContext layer to only invoke one successful
> pasid_alloc/free notifier if PASID_ALLOC/FREE event comes? pasid
> alloc/free are really special as it requires feedback. And a potential
> benefit is that the pasid_alloc/free will not be affected by hot plug
> scenario. There will be always a notifier to work for pasid_alloc/free
> work unless all passthru devices are hot plugged. How do you think? Or
> if any other idea?

Hrm, that still doesn't seem right to me.  I don't think a notifier is
really the right mechanism for something that needs to return values.
This seems like something where you need to find a _single_
responsible object and call a method / callback on that specifically.

But it seems to me there's a more fundamental problem here.  AIUI the
idea is that a single IOMMUContext could hold multiple devices.  But
if the devices are responsible for assigning their own pasid values
(by passing that decision on to the host through vfio) then that really
can't work.

I'm assuming it's impossible from the hardware side to virtualize the
pasids (so that we could assign them from qemu without host
intervention).

If so, then the pasid allocation really has to be a Context level, not
device level operation.  We'd have to wire the VFIO backend up to the
context itself, not a device... I'm not immediately sure how to do
that, though.
Yi Liu Nov. 26, 2019, 7:07 a.m. UTC | #6
Hi David,

> From: David Gibson < david@gibson.dropbear.id.au>
> Sent: Wednesday, November 20, 2019 12:28 PM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [RFC v2 09/22] vfio/pci: add iommu_context notifier for pasid alloc/free
> 
> On Wed, Nov 06, 2019 at 12:14:50PM +0000, Liu, Yi L wrote:
> > > From: David Gibson [mailto:david@gibson.dropbear.id.au]
> > > Sent: Tuesday, October 29, 2019 8:16 PM
> > > To: Liu, Yi L <yi.l.liu@intel.com>
> > > Subject: Re: [RFC v2 09/22] vfio/pci: add iommu_context notifier for pasid
> alloc/free
> > >
> > > On Thu, Oct 24, 2019 at 08:34:30AM -0400, Liu Yi L wrote:
> > > > This patch adds pasid alloc/free notifiers for vfio-pci. It is
> > > > supposed to be fired by vIOMMU. VFIO then sends PASID allocation
> > > > or free request to host.
> > > >
> > > > Cc: Kevin Tian <kevin.tian@intel.com>
> > > > Cc: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > > > Cc: Peter Xu <peterx@redhat.com>
> > > > Cc: Eric Auger <eric.auger@redhat.com>
> > > > Cc: Yi Sun <yi.y.sun@linux.intel.com>
> > > > Cc: David Gibson <david@gibson.dropbear.id.au>
> > > > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > > > ---
> > > >  hw/vfio/common.c         |  9 ++++++
> > > >  hw/vfio/pci.c            | 81
[...]
> > > > +
> > > > +static void vfio_iommu_pasid_alloc_notify(IOMMUCTXNotifier *n,
> > > > +                                          IOMMUCTXEventData *event_data)
> > > > +{
> > > > +    VFIOIOMMUContext *giommu_ctx = container_of(n, VFIOIOMMUContext,
> n);
> > > > +    VFIOContainer *container = giommu_ctx->container;
> > > > +    IOMMUCTXPASIDReqDesc *pasid_req =
> > > > +                              (IOMMUCTXPASIDReqDesc *) event_data->data;
> > > > +    struct vfio_iommu_type1_pasid_request req;
> > > > +    unsigned long argsz;
> > > > +    int pasid;
> > > > +
> > > > +    argsz = sizeof(req);
> > > > +    req.argsz = argsz;
> > > > +    req.flag = VFIO_IOMMU_PASID_ALLOC;
> > > > +    req.min_pasid = pasid_req->min_pasid;
> > > > +    req.max_pasid = pasid_req->max_pasid;
> > > > +
> > > > +    pasid = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
> > > > +    if (pasid < 0) {
> > > > +        error_report("%s: %d, alloc failed", __func__, -errno);
> > > > +    }
> > > > +    pasid_req->alloc_result = pasid;
> > >
> > > Altering the event data from the notifier doesn't make sense.  By
> > > definition there can be multiple notifiers on the chain, so in that
> > > case which one is responsible for updating the writable field?
> >
> > I guess you mean multiple pasid_alloc nofitiers. right?
> >
> > It works for VT-d now, as Intel vIOMMU maintains the IOMMUContext
> > per-bdf. And there will be only 1 pasid_alloc notifier in the chain. But, I
> > agree it is not good if other module just share an IOMMUConext across
> > devices. Definitely, it would have multiple pasid_alloc notifiers.
> 
> Right.
> 
> > How about enforcing IOMMUContext layer to only invoke one successful
> > pasid_alloc/free notifier if PASID_ALLOC/FREE event comes? pasid
> > alloc/free are really special as it requires feedback. And a potential
> > benefit is that the pasid_alloc/free will not be affected by hot plug
> > scenario. There will be always a notifier to work for pasid_alloc/free
> > work unless all passthru devices are hot plugged. How do you think? Or
> > if any other idea?
> 
> Hrm, that still doesn't seem right to me.  I don't think a notifier is
> really the right mechanism for something that needs to return values.
> This seems like something where you need to find a _single_
> responsible object and call a method / callback on that specifically.

Agreed. For alloc/free operations, we need an explicit call instead
of a notifier, which is usually a chained notification.

> But it seems to me there's a more fundamental problem here.  AIUI the
> idea is that a single IOMMUContext could hold multiple devices.  But
> if the devices are responsible for assigning their own pasid values
> (by passing that decisionon to the host through vfio) then that really
> can't work.
>
> I'm assuming it's impossible from the hardware side to virtualize the
> pasids (so that we could assign them from qemu without host
> intervention).

Actually, this is possible. On Intel platforms, we've introduced ENQCMD
to do PASID translation, which essentially supports PASID virtualization.
You may get more details in section 3.3 of the document linked below. This
is also why we want to have the host's intervention in PASID alloc/free.

https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf

> If so, then the pasid allocation really has to be a Context level, not
> device level operation.  We'd have to wire the VFIO backend up to the
> context itself, not a device... I'm not immediately sure how to do
> that, though.

I think we want pasid alloc/free to be a vfio container
operation, right? However, we cannot expose the vfio container outside of
vfio, or rather we don't want to do such a thing. So I'm wondering if we can
have a PASIDObject which is allocated on each container creation and registered
with the vIOMMU. The PASIDObject can provide pasid alloc/free ops. The vIOMMU
can use these ops to get a host pasid or free a host pasid.

As for the current IOMMUContext in this patchset, I think we may keep
it to support bind_gpasid and iommu_cache_invalidate. Also, as far as I
can see, we may want to extend it to support injecting host IOMMU translation
faults into the vIOMMU. This is also an important operation after configuring
nested translation for the vIOMMU (a.k.a. dual stage translation).

> --
> David Gibson                  | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au        | minimalist, thank you.  NOT _the_ _other_
>                               | _way_ _around_!
> http://www.ozlabs.org/~dgibson

Thanks,
Yi Liu
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index d418527..e6ad21c 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1436,6 +1436,7 @@  static void vfio_disconnect_container(VFIOGroup *group)
     if (QLIST_EMPTY(&container->group_list)) {
         VFIOAddressSpace *space = container->space;
         VFIOGuestIOMMU *giommu, *tmp;
+        VFIOIOMMUContext *giommu_ctx, *ctx;
 
         QLIST_REMOVE(container, next);
 
@@ -1446,6 +1447,14 @@  static void vfio_disconnect_container(VFIOGroup *group)
             g_free(giommu);
         }
 
+        QLIST_FOREACH_SAFE(giommu_ctx, &container->iommu_ctx_list,
+                                                   iommu_ctx_next, ctx) {
+            iommu_ctx_notifier_unregister(giommu_ctx->iommu_ctx,
+                                                      &giommu_ctx->n);
+            QLIST_REMOVE(giommu_ctx, iommu_ctx_next);
+            g_free(giommu_ctx);
+        }
+
         trace_vfio_disconnect_container(container->fd);
         close(container->fd);
         g_free(container);
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 12fac39..8721ff6 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2699,11 +2699,80 @@  static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
     vdev->req_enabled = false;
 }
 
+static void vfio_register_iommu_ctx_notifier(VFIOPCIDevice *vdev,
+                                             IOMMUContext *iommu_ctx,
+                                             IOMMUCTXNotifyFn fn,
+                                             IOMMUCTXEvent event)
+{
+    VFIOContainer *container = vdev->vbasedev.group->container;
+    VFIOIOMMUContext *giommu_ctx;
+
+    giommu_ctx = g_malloc0(sizeof(*giommu_ctx));
+    giommu_ctx->container = container;
+    giommu_ctx->iommu_ctx = iommu_ctx;
+    QLIST_INSERT_HEAD(&container->iommu_ctx_list,
+                      giommu_ctx,
+                      iommu_ctx_next);
+    iommu_ctx_notifier_register(iommu_ctx,
+                                &giommu_ctx->n,
+                                fn,
+                                event);
+}
+
+static void vfio_iommu_pasid_alloc_notify(IOMMUCTXNotifier *n,
+                                          IOMMUCTXEventData *event_data)
+{
+    VFIOIOMMUContext *giommu_ctx = container_of(n, VFIOIOMMUContext, n);
+    VFIOContainer *container = giommu_ctx->container;
+    IOMMUCTXPASIDReqDesc *pasid_req =
+                              (IOMMUCTXPASIDReqDesc *) event_data->data;
+    struct vfio_iommu_type1_pasid_request req;
+    unsigned long argsz;
+    int pasid;
+
+    argsz = sizeof(req);
+    req.argsz = argsz;
+    req.flag = VFIO_IOMMU_PASID_ALLOC;
+    req.min_pasid = pasid_req->min_pasid;
+    req.max_pasid = pasid_req->max_pasid;
+
+    pasid = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
+    if (pasid < 0) {
+        error_report("%s: %d, alloc failed", __func__, -errno);
+    }
+    pasid_req->alloc_result = pasid;
+}
+
+static void vfio_iommu_pasid_free_notify(IOMMUCTXNotifier *n,
+                                          IOMMUCTXEventData *event_data)
+{
+    VFIOIOMMUContext *giommu_ctx = container_of(n, VFIOIOMMUContext, n);
+    VFIOContainer *container = giommu_ctx->container;
+    IOMMUCTXPASIDReqDesc *pasid_req =
+                              (IOMMUCTXPASIDReqDesc *) event_data->data;
+    struct vfio_iommu_type1_pasid_request req;
+    unsigned long argsz;
+    int ret = 0;
+
+    argsz = sizeof(req);
+    req.argsz = argsz;
+    req.flag = VFIO_IOMMU_PASID_FREE;
+    req.pasid = pasid_req->pasid;
+
+    ret = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
+    if (ret != 0) {
+        error_report("%s: %d, pasid %u free failed",
+                   __func__, -errno, (unsigned) pasid_req->pasid);
+    }
+    pasid_req->free_result = ret;
+}
+
 static void vfio_realize(PCIDevice *pdev, Error **errp)
 {
     VFIOPCIDevice *vdev = PCI_VFIO(pdev);
     VFIODevice *vbasedev_iter;
     VFIOGroup *group;
+    IOMMUContext *iommu_context;
     char *tmp, *subsys, group_path[PATH_MAX], *group_name;
     Error *err = NULL;
     ssize_t len;
@@ -3000,6 +3069,18 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
 
+    iommu_context = pci_device_iommu_context(pdev);
+    if (iommu_context) {
+        vfio_register_iommu_ctx_notifier(vdev,
+                                         iommu_context,
+                                         vfio_iommu_pasid_alloc_notify,
+                                         IOMMU_CTX_EVENT_PASID_ALLOC);
+        vfio_register_iommu_ctx_notifier(vdev,
+                                         iommu_context,
+                                         vfio_iommu_pasid_free_notify,
+                                         IOMMU_CTX_EVENT_PASID_FREE);
+    }
+
     return;
 
 out_teardown:
diff --git a/include/hw/iommu/iommu.h b/include/hw/iommu/iommu.h
index c22c442..4352afd 100644
--- a/include/hw/iommu/iommu.h
+++ b/include/hw/iommu/iommu.h
@@ -31,10 +31,25 @@ 
 typedef struct IOMMUContext IOMMUContext;
 
 enum IOMMUCTXEvent {
+    IOMMU_CTX_EVENT_PASID_ALLOC,
+    IOMMU_CTX_EVENT_PASID_FREE,
     IOMMU_CTX_EVENT_NUM,
 };
 typedef enum IOMMUCTXEvent IOMMUCTXEvent;
 
+union IOMMUCTXPASIDReqDesc {
+    struct {
+        uint32_t min_pasid;
+        uint32_t max_pasid;
+        int32_t alloc_result; /* pasid allocated for the alloc request */
+    };
+    struct {
+        uint32_t pasid; /* pasid to be free */
+        int free_result;
+    };
+};
+typedef union IOMMUCTXPASIDReqDesc IOMMUCTXPASIDReqDesc;
+
 struct IOMMUCTXEventData {
     IOMMUCTXEvent event;
     uint64_t length;