Message ID | 20190526161004.25232-27-eric.auger@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | SMMUv3 Nested Stage Setup | expand |
On Sun, 26 May 2019 18:10:01 +0200 Eric Auger <eric.auger@redhat.com> wrote: > This patch registers a fault handler which records faults in > a circular buffer and then signals an eventfd. This buffer is > exposed within the fault region. > > Signed-off-by: Eric Auger <eric.auger@redhat.com> > > --- > > v3 -> v4: > - move iommu_unregister_device_fault_handler to vfio_pci_release > --- > drivers/vfio/pci/vfio_pci.c | 49 +++++++++++++++++++++++++++++ > drivers/vfio/pci/vfio_pci_private.h | 1 + > 2 files changed, 50 insertions(+) > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index f75f61127277..520999994ba8 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -30,6 +30,7 @@ > #include <linux/vfio.h> > #include <linux/vgaarb.h> > #include <linux/nospec.h> > +#include <linux/circ_buf.h> > > #include "vfio_pci_private.h" > > @@ -296,6 +297,46 @@ static const struct vfio_pci_regops vfio_pci_fault_prod_regops = { > .add_capability = vfio_pci_fault_prod_add_capability, > }; > > +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event *evt, void *data) > +{ > + struct vfio_pci_device *vdev = (struct vfio_pci_device *) data; > + struct vfio_region_fault_prod *prod_region = > + (struct vfio_region_fault_prod *)vdev->fault_pages; > + struct vfio_region_fault_cons *cons_region = > + (struct vfio_region_fault_cons *)(vdev->fault_pages + 2 * PAGE_SIZE); > + struct iommu_fault *new = > + (struct iommu_fault *)(vdev->fault_pages + prod_region->offset + > + prod_region->prod * prod_region->entry_size); > + int prod, cons, size; > + > + mutex_lock(&vdev->fault_queue_lock); > + > + if (!vdev->fault_abi) > + goto unlock; > + > + prod = prod_region->prod; > + cons = cons_region->cons; > + size = prod_region->nb_entries; > + > + if (CIRC_SPACE(prod, cons, size) < 1) > + goto unlock; > + > + *new = evt->fault; > + prod = (prod + 1) % size; > + prod_region->prod = prod; > + mutex_unlock(&vdev->fault_queue_lock); > + > + 
mutex_lock(&vdev->igate); > + if (vdev->dma_fault_trigger) > + eventfd_signal(vdev->dma_fault_trigger, 1); > + mutex_unlock(&vdev->igate); > + return 0; > + > +unlock: > + mutex_unlock(&vdev->fault_queue_lock); > + return -EINVAL; > +} > + > static int vfio_pci_init_fault_region(struct vfio_pci_device *vdev) > { > struct vfio_region_fault_prod *header; > @@ -328,6 +369,13 @@ static int vfio_pci_init_fault_region(struct vfio_pci_device *vdev) > header = (struct vfio_region_fault_prod *)vdev->fault_pages; > header->version = -1; > header->offset = PAGE_SIZE; > + > + ret = iommu_register_device_fault_handler(&vdev->pdev->dev, > + vfio_pci_iommu_dev_fault_handler, > + vdev); > + if (ret) > + goto out; > + > return 0; > out: > kfree(vdev->fault_pages); > @@ -570,6 +618,7 @@ static void vfio_pci_release(void *device_data) > if (!(--vdev->refcnt)) { > vfio_spapr_pci_eeh_release(vdev->pdev); > vfio_pci_disable(vdev); > + iommu_unregister_device_fault_handler(&vdev->pdev->dev); But this can fail if there are pending faults which leaves a device reference and then the system is broken :( > } > > mutex_unlock(&vdev->reflck->lock); > diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h > index 8e0a55682d3f..a9276926f008 100644 > --- a/drivers/vfio/pci/vfio_pci_private.h > +++ b/drivers/vfio/pci/vfio_pci_private.h > @@ -122,6 +122,7 @@ struct vfio_pci_device { > int ioeventfds_nr; > struct eventfd_ctx *err_trigger; > struct eventfd_ctx *req_trigger; > + struct eventfd_ctx *dma_fault_trigger; > struct mutex fault_queue_lock; > int fault_abi; > struct list_head dummy_resources_list;
Hi Alex, On 6/4/19 12:31 AM, Alex Williamson wrote: > On Sun, 26 May 2019 18:10:01 +0200 > Eric Auger <eric.auger@redhat.com> wrote: > >> This patch registers a fault handler which records faults in >> a circular buffer and then signals an eventfd. This buffer is >> exposed within the fault region. >> >> Signed-off-by: Eric Auger <eric.auger@redhat.com> >> >> --- >> >> v3 -> v4: >> - move iommu_unregister_device_fault_handler to vfio_pci_release >> --- >> drivers/vfio/pci/vfio_pci.c | 49 +++++++++++++++++++++++++++++ >> drivers/vfio/pci/vfio_pci_private.h | 1 + >> 2 files changed, 50 insertions(+) >> >> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c >> index f75f61127277..520999994ba8 100644 >> --- a/drivers/vfio/pci/vfio_pci.c >> +++ b/drivers/vfio/pci/vfio_pci.c >> @@ -30,6 +30,7 @@ >> #include <linux/vfio.h> >> #include <linux/vgaarb.h> >> #include <linux/nospec.h> >> +#include <linux/circ_buf.h> >> >> #include "vfio_pci_private.h" >> >> @@ -296,6 +297,46 @@ static const struct vfio_pci_regops vfio_pci_fault_prod_regops = { >> .add_capability = vfio_pci_fault_prod_add_capability, >> }; >> >> +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event *evt, void *data) >> +{ >> + struct vfio_pci_device *vdev = (struct vfio_pci_device *) data; >> + struct vfio_region_fault_prod *prod_region = >> + (struct vfio_region_fault_prod *)vdev->fault_pages; >> + struct vfio_region_fault_cons *cons_region = >> + (struct vfio_region_fault_cons *)(vdev->fault_pages + 2 * PAGE_SIZE); >> + struct iommu_fault *new = >> + (struct iommu_fault *)(vdev->fault_pages + prod_region->offset + >> + prod_region->prod * prod_region->entry_size); >> + int prod, cons, size; >> + >> + mutex_lock(&vdev->fault_queue_lock); >> + >> + if (!vdev->fault_abi) >> + goto unlock; >> + >> + prod = prod_region->prod; >> + cons = cons_region->cons; >> + size = prod_region->nb_entries; >> + >> + if (CIRC_SPACE(prod, cons, size) < 1) >> + goto unlock; >> + >> + *new = evt->fault; 
>> + prod = (prod + 1) % size; >> + prod_region->prod = prod; >> + mutex_unlock(&vdev->fault_queue_lock); >> + >> + mutex_lock(&vdev->igate); >> + if (vdev->dma_fault_trigger) >> + eventfd_signal(vdev->dma_fault_trigger, 1); >> + mutex_unlock(&vdev->igate); >> + return 0; >> + >> +unlock: >> + mutex_unlock(&vdev->fault_queue_lock); >> + return -EINVAL; >> +} >> + >> static int vfio_pci_init_fault_region(struct vfio_pci_device *vdev) >> { >> struct vfio_region_fault_prod *header; >> @@ -328,6 +369,13 @@ static int vfio_pci_init_fault_region(struct vfio_pci_device *vdev) >> header = (struct vfio_region_fault_prod *)vdev->fault_pages; >> header->version = -1; >> header->offset = PAGE_SIZE; >> + >> + ret = iommu_register_device_fault_handler(&vdev->pdev->dev, >> + vfio_pci_iommu_dev_fault_handler, >> + vdev); >> + if (ret) >> + goto out; >> + >> return 0; >> out: >> kfree(vdev->fault_pages); >> @@ -570,6 +618,7 @@ static void vfio_pci_release(void *device_data) >> if (!(--vdev->refcnt)) { >> vfio_spapr_pci_eeh_release(vdev->pdev); >> vfio_pci_disable(vdev); >> + iommu_unregister_device_fault_handler(&vdev->pdev->dev); > > > But this can fail if there are pending faults which leaves a device > reference and then the system is broken :( This series only features unrecoverable errors and for those the unregistration cannot fail. Now unrecoverable errors were added I admit this is confusing. We need to sort this out or clean the dependencies. 
Thanks Eric > >> } >> >> mutex_unlock(&vdev->reflck->lock); >> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h >> index 8e0a55682d3f..a9276926f008 100644 >> --- a/drivers/vfio/pci/vfio_pci_private.h >> +++ b/drivers/vfio/pci/vfio_pci_private.h >> @@ -122,6 +122,7 @@ struct vfio_pci_device { >> int ioeventfds_nr; >> struct eventfd_ctx *err_trigger; >> struct eventfd_ctx *req_trigger; >> + struct eventfd_ctx *dma_fault_trigger; >> struct mutex fault_queue_lock; >> int fault_abi; >> struct list_head dummy_resources_list; >
On Tue, 4 Jun 2019 18:11:08 +0200 Auger Eric <eric.auger@redhat.com> wrote: > Hi Alex, > > On 6/4/19 12:31 AM, Alex Williamson wrote: > > On Sun, 26 May 2019 18:10:01 +0200 > > Eric Auger <eric.auger@redhat.com> wrote: > > > >> This patch registers a fault handler which records faults in > >> a circular buffer and then signals an eventfd. This buffer is > >> exposed within the fault region. > >> > >> Signed-off-by: Eric Auger <eric.auger@redhat.com> > >> > >> --- > >> > >> v3 -> v4: > >> - move iommu_unregister_device_fault_handler to vfio_pci_release > >> --- > >> drivers/vfio/pci/vfio_pci.c | 49 > >> +++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h > >> | 1 + 2 files changed, 50 insertions(+) > >> > >> diff --git a/drivers/vfio/pci/vfio_pci.c > >> b/drivers/vfio/pci/vfio_pci.c index f75f61127277..520999994ba8 > >> 100644 --- a/drivers/vfio/pci/vfio_pci.c > >> +++ b/drivers/vfio/pci/vfio_pci.c > >> @@ -30,6 +30,7 @@ > >> #include <linux/vfio.h> > >> #include <linux/vgaarb.h> > >> #include <linux/nospec.h> > >> +#include <linux/circ_buf.h> > >> > >> #include "vfio_pci_private.h" > >> > >> @@ -296,6 +297,46 @@ static const struct vfio_pci_regops > >> vfio_pci_fault_prod_regops = { .add_capability = > >> vfio_pci_fault_prod_add_capability, }; > >> > >> +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event > >> *evt, void *data) +{ > >> + struct vfio_pci_device *vdev = (struct vfio_pci_device *) > >> data; > >> + struct vfio_region_fault_prod *prod_region = > >> + (struct vfio_region_fault_prod > >> *)vdev->fault_pages; > >> + struct vfio_region_fault_cons *cons_region = > >> + (struct vfio_region_fault_cons > >> *)(vdev->fault_pages + 2 * PAGE_SIZE); > >> + struct iommu_fault *new = > >> + (struct iommu_fault *)(vdev->fault_pages + > >> prod_region->offset + > >> + prod_region->prod * > >> prod_region->entry_size); > >> + int prod, cons, size; > >> + > >> + mutex_lock(&vdev->fault_queue_lock); > >> + > >> + if (!vdev->fault_abi) > >> + goto 
unlock; > >> + > >> + prod = prod_region->prod; > >> + cons = cons_region->cons; > >> + size = prod_region->nb_entries; > >> + > >> + if (CIRC_SPACE(prod, cons, size) < 1) > >> + goto unlock; > >> + > >> + *new = evt->fault; > >> + prod = (prod + 1) % size; > >> + prod_region->prod = prod; > >> + mutex_unlock(&vdev->fault_queue_lock); > >> + > >> + mutex_lock(&vdev->igate); > >> + if (vdev->dma_fault_trigger) > >> + eventfd_signal(vdev->dma_fault_trigger, 1); > >> + mutex_unlock(&vdev->igate); > >> + return 0; > >> + > >> +unlock: > >> + mutex_unlock(&vdev->fault_queue_lock); > >> + return -EINVAL; > >> +} > >> + > >> static int vfio_pci_init_fault_region(struct vfio_pci_device > >> *vdev) { > >> struct vfio_region_fault_prod *header; > >> @@ -328,6 +369,13 @@ static int vfio_pci_init_fault_region(struct > >> vfio_pci_device *vdev) header = (struct vfio_region_fault_prod > >> *)vdev->fault_pages; header->version = -1; > >> header->offset = PAGE_SIZE; > >> + > >> + ret = > >> iommu_register_device_fault_handler(&vdev->pdev->dev, > >> + > >> vfio_pci_iommu_dev_fault_handler, > >> + vdev); > >> + if (ret) > >> + goto out; > >> + > >> return 0; > >> out: > >> kfree(vdev->fault_pages); > >> @@ -570,6 +618,7 @@ static void vfio_pci_release(void *device_data) > >> if (!(--vdev->refcnt)) { > >> vfio_spapr_pci_eeh_release(vdev->pdev); > >> vfio_pci_disable(vdev); > >> + > >> iommu_unregister_device_fault_handler(&vdev->pdev->dev); > > > > > > But this can fail if there are pending faults which leaves a device > > reference and then the system is broken :( > This series only features unrecoverable errors and for those the > unregistration cannot fail. Now unrecoverable errors were added I > admit this is confusing. We need to sort this out or clean the > dependencies. As Alex pointed out in 4/29, we can make iommu_unregister_device_fault_handler() never fail and clean up all the pending faults in the host IOMMU belong to that device. 
But the problem is that if a fault, such as a PRQ, has already been injected into the guest, the page response may come back after the handler is unregistered and registered again. We need a way to reject such page responses belonging to the previous life of the handler. Perhaps a sync call to the guest with your fault queue eventfd? I am not sure. Jacob
On 05/06/2019 23:45, Jacob Pan wrote: > On Tue, 4 Jun 2019 18:11:08 +0200 > Auger Eric <eric.auger@redhat.com> wrote: > >> Hi Alex, >> >> On 6/4/19 12:31 AM, Alex Williamson wrote: >>> On Sun, 26 May 2019 18:10:01 +0200 >>> Eric Auger <eric.auger@redhat.com> wrote: >>> >>>> This patch registers a fault handler which records faults in >>>> a circular buffer and then signals an eventfd. This buffer is >>>> exposed within the fault region. >>>> >>>> Signed-off-by: Eric Auger <eric.auger@redhat.com> >>>> >>>> --- >>>> >>>> v3 -> v4: >>>> - move iommu_unregister_device_fault_handler to vfio_pci_release >>>> --- >>>> drivers/vfio/pci/vfio_pci.c | 49 >>>> +++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h >>>> | 1 + 2 files changed, 50 insertions(+) >>>> >>>> diff --git a/drivers/vfio/pci/vfio_pci.c >>>> b/drivers/vfio/pci/vfio_pci.c index f75f61127277..520999994ba8 >>>> 100644 --- a/drivers/vfio/pci/vfio_pci.c >>>> +++ b/drivers/vfio/pci/vfio_pci.c >>>> @@ -30,6 +30,7 @@ >>>> #include <linux/vfio.h> >>>> #include <linux/vgaarb.h> >>>> #include <linux/nospec.h> >>>> +#include <linux/circ_buf.h> >>>> >>>> #include "vfio_pci_private.h" >>>> >>>> @@ -296,6 +297,46 @@ static const struct vfio_pci_regops >>>> vfio_pci_fault_prod_regops = { .add_capability = >>>> vfio_pci_fault_prod_add_capability, }; >>>> >>>> +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event >>>> *evt, void *data) +{ >>>> + struct vfio_pci_device *vdev = (struct vfio_pci_device *) >>>> data; >>>> + struct vfio_region_fault_prod *prod_region = >>>> + (struct vfio_region_fault_prod >>>> *)vdev->fault_pages; >>>> + struct vfio_region_fault_cons *cons_region = >>>> + (struct vfio_region_fault_cons >>>> *)(vdev->fault_pages + 2 * PAGE_SIZE); >>>> + struct iommu_fault *new = >>>> + (struct iommu_fault *)(vdev->fault_pages + >>>> prod_region->offset + >>>> + prod_region->prod * >>>> prod_region->entry_size); >>>> + int prod, cons, size; >>>> + >>>> + mutex_lock(&vdev->fault_queue_lock); 
>>>> + >>>> + if (!vdev->fault_abi) >>>> + goto unlock; >>>> + >>>> + prod = prod_region->prod; >>>> + cons = cons_region->cons; >>>> + size = prod_region->nb_entries; >>>> + >>>> + if (CIRC_SPACE(prod, cons, size) < 1) >>>> + goto unlock; >>>> + >>>> + *new = evt->fault; >>>> + prod = (prod + 1) % size; >>>> + prod_region->prod = prod; >>>> + mutex_unlock(&vdev->fault_queue_lock); >>>> + >>>> + mutex_lock(&vdev->igate); >>>> + if (vdev->dma_fault_trigger) >>>> + eventfd_signal(vdev->dma_fault_trigger, 1); >>>> + mutex_unlock(&vdev->igate); >>>> + return 0; >>>> + >>>> +unlock: >>>> + mutex_unlock(&vdev->fault_queue_lock); >>>> + return -EINVAL; >>>> +} >>>> + >>>> static int vfio_pci_init_fault_region(struct vfio_pci_device >>>> *vdev) { >>>> struct vfio_region_fault_prod *header; >>>> @@ -328,6 +369,13 @@ static int vfio_pci_init_fault_region(struct >>>> vfio_pci_device *vdev) header = (struct vfio_region_fault_prod >>>> *)vdev->fault_pages; header->version = -1; >>>> header->offset = PAGE_SIZE; >>>> + >>>> + ret = >>>> iommu_register_device_fault_handler(&vdev->pdev->dev, >>>> + >>>> vfio_pci_iommu_dev_fault_handler, >>>> + vdev); >>>> + if (ret) >>>> + goto out; >>>> + >>>> return 0; >>>> out: >>>> kfree(vdev->fault_pages); >>>> @@ -570,6 +618,7 @@ static void vfio_pci_release(void *device_data) >>>> if (!(--vdev->refcnt)) { >>>> vfio_spapr_pci_eeh_release(vdev->pdev); >>>> vfio_pci_disable(vdev); >>>> + >>>> iommu_unregister_device_fault_handler(&vdev->pdev->dev); >>> >>> >>> But this can fail if there are pending faults which leaves a device >>> reference and then the system is broken :( >> This series only features unrecoverable errors and for those the >> unregistration cannot fail. Now unrecoverable errors were added I >> admit this is confusing. We need to sort this out or clean the >> dependencies. 
> As Alex pointed out in 4/29, we can make > iommu_unregister_device_fault_handler() never fail and clean up all the > pending faults in the host IOMMU belong to that device. But the problem > is that if a fault, such as PRQ, has already been injected into the > guest, the page response may come back after handler is unregistered > and registered again. I'm trying to figure out if that would be harmful in any way. I guess it can be a bit nasty if we handle the page response right after having injected a new page request that uses the same PRGI. In any other case we discard the page response, but here we forward it to the endpoint and: * If the response status is success, endpoint retries the translation. The guest probably hasn't had time to handle the new page request and translation will fail, which may lead the endpoint to give up (two unsuccessful translation requests). Or send a new request * otherwise the endpoint won't retry the access, and could also disable PRI if the status is failure. > We need a way to reject such page response belong > to the previous life of the handler. Perhaps a sync call to the guest > with your fault queue eventfd? I am not sure. We could simply expect the device driver not to send any page response after unregistering the fault handler. Is there any reason VFIO would need to unregister and re-register the fault handler on a live guest? Thanks, Jean
On Thu, 6 Jun 2019 19:54:05 +0100 Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote: > On 05/06/2019 23:45, Jacob Pan wrote: > > On Tue, 4 Jun 2019 18:11:08 +0200 > > Auger Eric <eric.auger@redhat.com> wrote: > > > >> Hi Alex, > >> > >> On 6/4/19 12:31 AM, Alex Williamson wrote: > >>> On Sun, 26 May 2019 18:10:01 +0200 > >>> Eric Auger <eric.auger@redhat.com> wrote: > >>> > >>>> This patch registers a fault handler which records faults in > >>>> a circular buffer and then signals an eventfd. This buffer is > >>>> exposed within the fault region. > >>>> > >>>> Signed-off-by: Eric Auger <eric.auger@redhat.com> > >>>> > >>>> --- > >>>> > >>>> v3 -> v4: > >>>> - move iommu_unregister_device_fault_handler to vfio_pci_release > >>>> --- > >>>> drivers/vfio/pci/vfio_pci.c | 49 > >>>> +++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h > >>>> | 1 + 2 files changed, 50 insertions(+) > >>>> > >>>> diff --git a/drivers/vfio/pci/vfio_pci.c > >>>> b/drivers/vfio/pci/vfio_pci.c index f75f61127277..520999994ba8 > >>>> 100644 --- a/drivers/vfio/pci/vfio_pci.c > >>>> +++ b/drivers/vfio/pci/vfio_pci.c > >>>> @@ -30,6 +30,7 @@ > >>>> #include <linux/vfio.h> > >>>> #include <linux/vgaarb.h> > >>>> #include <linux/nospec.h> > >>>> +#include <linux/circ_buf.h> > >>>> > >>>> #include "vfio_pci_private.h" > >>>> > >>>> @@ -296,6 +297,46 @@ static const struct vfio_pci_regops > >>>> vfio_pci_fault_prod_regops = { .add_capability = > >>>> vfio_pci_fault_prod_add_capability, }; > >>>> > >>>> +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event > >>>> *evt, void *data) +{ > >>>> + struct vfio_pci_device *vdev = (struct vfio_pci_device > >>>> *) data; > >>>> + struct vfio_region_fault_prod *prod_region = > >>>> + (struct vfio_region_fault_prod > >>>> *)vdev->fault_pages; > >>>> + struct vfio_region_fault_cons *cons_region = > >>>> + (struct vfio_region_fault_cons > >>>> *)(vdev->fault_pages + 2 * PAGE_SIZE); > >>>> + struct iommu_fault *new = > >>>> + (struct 
iommu_fault *)(vdev->fault_pages + > >>>> prod_region->offset + > >>>> + prod_region->prod * > >>>> prod_region->entry_size); > >>>> + int prod, cons, size; > >>>> + > >>>> + mutex_lock(&vdev->fault_queue_lock); > >>>> + > >>>> + if (!vdev->fault_abi) > >>>> + goto unlock; > >>>> + > >>>> + prod = prod_region->prod; > >>>> + cons = cons_region->cons; > >>>> + size = prod_region->nb_entries; > >>>> + > >>>> + if (CIRC_SPACE(prod, cons, size) < 1) > >>>> + goto unlock; > >>>> + > >>>> + *new = evt->fault; > >>>> + prod = (prod + 1) % size; > >>>> + prod_region->prod = prod; > >>>> + mutex_unlock(&vdev->fault_queue_lock); > >>>> + > >>>> + mutex_lock(&vdev->igate); > >>>> + if (vdev->dma_fault_trigger) > >>>> + eventfd_signal(vdev->dma_fault_trigger, 1); > >>>> + mutex_unlock(&vdev->igate); > >>>> + return 0; > >>>> + > >>>> +unlock: > >>>> + mutex_unlock(&vdev->fault_queue_lock); > >>>> + return -EINVAL; > >>>> +} > >>>> + > >>>> static int vfio_pci_init_fault_region(struct vfio_pci_device > >>>> *vdev) { > >>>> struct vfio_region_fault_prod *header; > >>>> @@ -328,6 +369,13 @@ static int vfio_pci_init_fault_region(struct > >>>> vfio_pci_device *vdev) header = (struct vfio_region_fault_prod > >>>> *)vdev->fault_pages; header->version = -1; > >>>> header->offset = PAGE_SIZE; > >>>> + > >>>> + ret = > >>>> iommu_register_device_fault_handler(&vdev->pdev->dev, > >>>> + > >>>> vfio_pci_iommu_dev_fault_handler, > >>>> + vdev); > >>>> + if (ret) > >>>> + goto out; > >>>> + > >>>> return 0; > >>>> out: > >>>> kfree(vdev->fault_pages); > >>>> @@ -570,6 +618,7 @@ static void vfio_pci_release(void > >>>> *device_data) if (!(--vdev->refcnt)) { > >>>> vfio_spapr_pci_eeh_release(vdev->pdev); > >>>> vfio_pci_disable(vdev); > >>>> + > >>>> iommu_unregister_device_fault_handler(&vdev->pdev->dev); > >>> > >>> > >>> But this can fail if there are pending faults which leaves a > >>> device reference and then the system is broken :( > >> This series only features unrecoverable errors 
and for those the > >> unregistration cannot fail. Now unrecoverable errors were added I > >> admit this is confusing. We need to sort this out or clean the > >> dependencies. > > As Alex pointed out in 4/29, we can make > > iommu_unregister_device_fault_handler() never fail and clean up all > > the pending faults in the host IOMMU belong to that device. But the > > problem is that if a fault, such as PRQ, has already been injected > > into the guest, the page response may come back after handler is > > unregistered and registered again. > > I'm trying to figure out if that would be harmful in any way. I guess > it can be a bit nasty if we handle the page response right after > having injected a new page request that uses the same PRGI. In any > other case we discard the page response, but here we forward it to > the endpoint and: > > * If the response status is success, endpoint retries the > translation. The guest probably hasn't had time to handle the new > page request and translation will fail, which may lead the endpoint > to give up (two unsuccessful translation requests). Or send a new > request > Good point, there shouldn't be any harm if the page response is a "fake" success. In fact it could happen in the normal operation when PRQs to two devices share the same non-leaf translation structure. The worst case is just a retry. I am not aware of the retry limit, is it in the PCIe spec? I cannot find it. I think we should just document it, similar to having a spurious interrupt. The PRQ trace event should capture that as well. > * otherwise the endpoint won't retry the access, and could also > disable PRI if the status is failure. > That would be true regardless this race condition with handler registration. So should be fine. > > We need a way to reject such page response belong > > to the previous life of the handler. Perhaps a sync call to the > > guest with your fault queue eventfd? I am not sure. 
> > We could simply expect the device driver not to send any page response > after unregistering the fault handler. Is there any reason VFIO would > need to unregister and re-register the fault handler on a live guest? > There is no reason for VFIO to unregister and register again, I was just thinking from a security perspective. Someone could write a VFIO app to do this attack. But I agree the damage is within the device, which may get PRI disabled as a result. So it seems we agree on the following: - iommu_unregister_device_fault_handler() will never fail - iommu driver cleans up all pending faults when handler is unregistered - assume device driver or guest not sending more page response _after_ handler is unregistered. - system will tolerate rare spurious response Sounds right? > Thanks, > Jean [Jacob Pan]
Hi Jean, Jacob, On 6/6/19 10:29 PM, Jacob Pan wrote: > On Thu, 6 Jun 2019 19:54:05 +0100 > Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote: > >> On 05/06/2019 23:45, Jacob Pan wrote: >>> On Tue, 4 Jun 2019 18:11:08 +0200 >>> Auger Eric <eric.auger@redhat.com> wrote: >>> >>>> Hi Alex, >>>> >>>> On 6/4/19 12:31 AM, Alex Williamson wrote: >>>>> On Sun, 26 May 2019 18:10:01 +0200 >>>>> Eric Auger <eric.auger@redhat.com> wrote: >>>>> >>>>>> This patch registers a fault handler which records faults in >>>>>> a circular buffer and then signals an eventfd. This buffer is >>>>>> exposed within the fault region. >>>>>> >>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com> >>>>>> >>>>>> --- >>>>>> >>>>>> v3 -> v4: >>>>>> - move iommu_unregister_device_fault_handler to vfio_pci_release >>>>>> --- >>>>>> drivers/vfio/pci/vfio_pci.c | 49 >>>>>> +++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h >>>>>> | 1 + 2 files changed, 50 insertions(+) >>>>>> >>>>>> diff --git a/drivers/vfio/pci/vfio_pci.c >>>>>> b/drivers/vfio/pci/vfio_pci.c index f75f61127277..520999994ba8 >>>>>> 100644 --- a/drivers/vfio/pci/vfio_pci.c >>>>>> +++ b/drivers/vfio/pci/vfio_pci.c >>>>>> @@ -30,6 +30,7 @@ >>>>>> #include <linux/vfio.h> >>>>>> #include <linux/vgaarb.h> >>>>>> #include <linux/nospec.h> >>>>>> +#include <linux/circ_buf.h> >>>>>> >>>>>> #include "vfio_pci_private.h" >>>>>> >>>>>> @@ -296,6 +297,46 @@ static const struct vfio_pci_regops >>>>>> vfio_pci_fault_prod_regops = { .add_capability = >>>>>> vfio_pci_fault_prod_add_capability, }; >>>>>> >>>>>> +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event >>>>>> *evt, void *data) +{ >>>>>> + struct vfio_pci_device *vdev = (struct vfio_pci_device >>>>>> *) data; >>>>>> + struct vfio_region_fault_prod *prod_region = >>>>>> + (struct vfio_region_fault_prod >>>>>> *)vdev->fault_pages; >>>>>> + struct vfio_region_fault_cons *cons_region = >>>>>> + (struct vfio_region_fault_cons >>>>>> *)(vdev->fault_pages + 2 * 
PAGE_SIZE); >>>>>> + struct iommu_fault *new = >>>>>> + (struct iommu_fault *)(vdev->fault_pages + >>>>>> prod_region->offset + >>>>>> + prod_region->prod * >>>>>> prod_region->entry_size); >>>>>> + int prod, cons, size; >>>>>> + >>>>>> + mutex_lock(&vdev->fault_queue_lock); >>>>>> + >>>>>> + if (!vdev->fault_abi) >>>>>> + goto unlock; >>>>>> + >>>>>> + prod = prod_region->prod; >>>>>> + cons = cons_region->cons; >>>>>> + size = prod_region->nb_entries; >>>>>> + >>>>>> + if (CIRC_SPACE(prod, cons, size) < 1) >>>>>> + goto unlock; >>>>>> + >>>>>> + *new = evt->fault; >>>>>> + prod = (prod + 1) % size; >>>>>> + prod_region->prod = prod; >>>>>> + mutex_unlock(&vdev->fault_queue_lock); >>>>>> + >>>>>> + mutex_lock(&vdev->igate); >>>>>> + if (vdev->dma_fault_trigger) >>>>>> + eventfd_signal(vdev->dma_fault_trigger, 1); >>>>>> + mutex_unlock(&vdev->igate); >>>>>> + return 0; >>>>>> + >>>>>> +unlock: >>>>>> + mutex_unlock(&vdev->fault_queue_lock); >>>>>> + return -EINVAL; >>>>>> +} >>>>>> + >>>>>> static int vfio_pci_init_fault_region(struct vfio_pci_device >>>>>> *vdev) { >>>>>> struct vfio_region_fault_prod *header; >>>>>> @@ -328,6 +369,13 @@ static int vfio_pci_init_fault_region(struct >>>>>> vfio_pci_device *vdev) header = (struct vfio_region_fault_prod >>>>>> *)vdev->fault_pages; header->version = -1; >>>>>> header->offset = PAGE_SIZE; >>>>>> + >>>>>> + ret = >>>>>> iommu_register_device_fault_handler(&vdev->pdev->dev, >>>>>> + >>>>>> vfio_pci_iommu_dev_fault_handler, >>>>>> + vdev); >>>>>> + if (ret) >>>>>> + goto out; >>>>>> + >>>>>> return 0; >>>>>> out: >>>>>> kfree(vdev->fault_pages); >>>>>> @@ -570,6 +618,7 @@ static void vfio_pci_release(void >>>>>> *device_data) if (!(--vdev->refcnt)) { >>>>>> vfio_spapr_pci_eeh_release(vdev->pdev); >>>>>> vfio_pci_disable(vdev); >>>>>> + >>>>>> iommu_unregister_device_fault_handler(&vdev->pdev->dev); >>>>> >>>>> >>>>> But this can fail if there are pending faults which leaves a >>>>> device reference and then the system is 
broken :( >>>> This series only features unrecoverable errors and for those the >>>> unregistration cannot fail. Now unrecoverable errors were added I >>>> admit this is confusing. We need to sort this out or clean the >>>> dependencies. >>> As Alex pointed out in 4/29, we can make >>> iommu_unregister_device_fault_handler() never fail and clean up all >>> the pending faults in the host IOMMU belong to that device. But the >>> problem is that if a fault, such as PRQ, has already been injected >>> into the guest, the page response may come back after handler is >>> unregistered and registered again. >> >> I'm trying to figure out if that would be harmful in any way. I guess >> it can be a bit nasty if we handle the page response right after >> having injected a new page request that uses the same PRGI. In any >> other case we discard the page response, but here we forward it to >> the endpoint and: >> >> * If the response status is success, endpoint retries the >> translation. The guest probably hasn't had time to handle the new >> page request and translation will fail, which may lead the endpoint >> to give up (two unsuccessful translation requests). Or send a new >> request >> > Good point, there shouldn't be any harm if the page response is a > "fake" success. In fact it could happen in the normal operation when > PRQs to two devices share the same non-leaf translation structure. The > worst case is just a retry. I am not aware of the retry limit, is it in > the PCIe spec? I cannot find it. > > I think we should just document it, similar to having a spurious > interrupt. The PRQ trace event should capture that as well. > >> * otherwise the endpoint won't retry the access, and could also >> disable PRI if the status is failure. >> > That would be true regardless this race condition with handler > registration. So should be fine. > >>> We need a way to reject such page response belong >>> to the previous life of the handler. 
Perhaps a sync call to the >>> guest with your fault queue eventfd? I am not sure. >> >> We could simply expect the device driver not to send any page response >> after unregistering the fault handler. Is there any reason VFIO would >> need to unregister and re-register the fault handler on a live guest? >> > There is no reason for VFIO to unregister and register again, I was > just thinking from security perspective. Someone could write a VFIO app > do this attack. But I agree the damage is within the device, may get > PRI disabled as a result. At the moment the handler unregistration is done on the vfio-pci release function() when the last reference is released so I am not sure this can even be achieved. > > So it seems we agree on the following: > - iommu_unregister_device_fault_handler() will never fail > - iommu driver cleans up all pending faults when handler is unregistered > - assume device driver or guest not sending more page response _after_ > handler is unregistered. > - system will tolerate rare spurious response > > Sounds right? sounds good for me Thanks Eric > >> Thanks, >> Jean > > [Jacob Pan] >
On 06/06/2019 21:29, Jacob Pan wrote: >>>>>> iommu_unregister_device_fault_handler(&vdev->pdev->dev); >>>>> >>>>> >>>>> But this can fail if there are pending faults which leaves a >>>>> device reference and then the system is broken :( >>>> This series only features unrecoverable errors and for those the >>>> unregistration cannot fail. Now unrecoverable errors were added I >>>> admit this is confusing. We need to sort this out or clean the >>>> dependencies. >>> As Alex pointed out in 4/29, we can make >>> iommu_unregister_device_fault_handler() never fail and clean up all >>> the pending faults in the host IOMMU belong to that device. But the >>> problem is that if a fault, such as PRQ, has already been injected >>> into the guest, the page response may come back after handler is >>> unregistered and registered again. >> >> I'm trying to figure out if that would be harmful in any way. I guess >> it can be a bit nasty if we handle the page response right after >> having injected a new page request that uses the same PRGI. In any >> other case we discard the page response, but here we forward it to >> the endpoint and: >> >> * If the response status is success, endpoint retries the >> translation. The guest probably hasn't had time to handle the new >> page request and translation will fail, which may lead the endpoint >> to give up (two unsuccessful translation requests). Or send a new >> request >> > Good point, there shouldn't be any harm if the page response is a > "fake" success. In fact it could happen in the normal operation when > PRQs to two devices share the same non-leaf translation structure. The > worst case is just a retry. I am not aware of the retry limit, is it in > the PCIe spec? I cannot find it. I don't think so, it's the implementation's choice. In general I don't think devices will have a retry limit, but it doesn't seem like the PCI spec prevents them from implementing one either. 
It could be useful to stop retrying after a certain number of faults, for preventing livelocks when the OS doesn't fix up the page tables and the device would just repeat the fault indefinitely. > I think we should just document it, similar to having a spurious > interrupt. The PRQ trace event should capture that as well. > >> * otherwise the endpoint won't retry the access, and could also >> disable PRI if the status is failure. >> > That would be true regardless this race condition with handler > registration. So should be fine. We do give an invalid response for the old PRG (because of unregistering), but also for the new one, which has a different address that the guest might be able to page in and would normally return success. >>> We need a way to reject such page response belong >>> to the previous life of the handler. Perhaps a sync call to the >>> guest with your fault queue eventfd? I am not sure. >> >> We could simply expect the device driver not to send any page response >> after unregistering the fault handler. Is there any reason VFIO would >> need to unregister and re-register the fault handler on a live guest? >> > There is no reason for VFIO to unregister and register again, I was > just thinking from security perspective. Someone could write a VFIO app > do this attack. But I agree the damage is within the device, may get > PRI disabled as a result. Yes I think the damage would always be contained within the misbehaving software > So it seems we agree on the following: > - iommu_unregister_device_fault_handler() will never fail > - iommu driver cleans up all pending faults when handler is unregistered > - assume device driver or guest not sending more page response _after_ > handler is unregistered. > - system will tolerate rare spurious response > > Sounds right? Yes, I'll add that to the fault series Thanks, Jean
On 26/05/2019 17:10, Eric Auger wrote: > +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event *evt, void *data) > +{ > + struct vfio_pci_device *vdev = (struct vfio_pci_device *) data; > + struct vfio_region_fault_prod *prod_region = > + (struct vfio_region_fault_prod *)vdev->fault_pages; > + struct vfio_region_fault_cons *cons_region = > + (struct vfio_region_fault_cons *)(vdev->fault_pages + 2 * PAGE_SIZE); > + struct iommu_fault *new = > + (struct iommu_fault *)(vdev->fault_pages + prod_region->offset + > + prod_region->prod * prod_region->entry_size); > + int prod, cons, size; > + > + mutex_lock(&vdev->fault_queue_lock); > + > + if (!vdev->fault_abi) > + goto unlock; > + > + prod = prod_region->prod; > + cons = cons_region->cons; > + size = prod_region->nb_entries; > + > + if (CIRC_SPACE(prod, cons, size) < 1) > + goto unlock; > + > + *new = evt->fault; Could you check fault.type and return an error if it's not UNRECOV here? If the fault is recoverable (very unlikely since the PRI capability is disabled, but allowed) and we return an error here, then the caller takes care of completing the fault. If we forward it to the guest instead, the producer will wait indefinitely for a response. Thanks, Jean > + prod = (prod + 1) % size; > + prod_region->prod = prod; > + mutex_unlock(&vdev->fault_queue_lock); > + > + mutex_lock(&vdev->igate); > + if (vdev->dma_fault_trigger) > + eventfd_signal(vdev->dma_fault_trigger, 1); > + mutex_unlock(&vdev->igate); > + return 0; > + > +unlock: > + mutex_unlock(&vdev->fault_queue_lock); > + return -EINVAL; > +}
Hi Jean, On 6/7/19 2:48 PM, Jean-Philippe Brucker wrote: > On 26/05/2019 17:10, Eric Auger wrote: >> +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event *evt, void *data) >> +{ >> + struct vfio_pci_device *vdev = (struct vfio_pci_device *) data; >> + struct vfio_region_fault_prod *prod_region = >> + (struct vfio_region_fault_prod *)vdev->fault_pages; >> + struct vfio_region_fault_cons *cons_region = >> + (struct vfio_region_fault_cons *)(vdev->fault_pages + 2 * PAGE_SIZE); >> + struct iommu_fault *new = >> + (struct iommu_fault *)(vdev->fault_pages + prod_region->offset + >> + prod_region->prod * prod_region->entry_size); >> + int prod, cons, size; >> + >> + mutex_lock(&vdev->fault_queue_lock); >> + >> + if (!vdev->fault_abi) >> + goto unlock; >> + >> + prod = prod_region->prod; >> + cons = cons_region->cons; >> + size = prod_region->nb_entries; >> + >> + if (CIRC_SPACE(prod, cons, size) < 1) >> + goto unlock; >> + >> + *new = evt->fault; > > Could you check fault.type and return an error if it's not UNRECOV here? > If the fault is recoverable (very unlikely since the PRI capability is > disabled, but allowed) and we return an error here, then the caller > takes care of completing the fault. If we forward it to the guest > instead, the producer will wait indefinitely for a response. Sure I will add that check in the next version. Thanks Eric > > Thanks, > Jean > >> + prod = (prod + 1) % size; >> + prod_region->prod = prod; >> + mutex_unlock(&vdev->fault_queue_lock); >> + >> + mutex_lock(&vdev->igate); >> + if (vdev->dma_fault_trigger) >> + eventfd_signal(vdev->dma_fault_trigger, 1); >> + mutex_unlock(&vdev->igate); >> + return 0; >> + >> +unlock: >> + mutex_unlock(&vdev->fault_queue_lock); >> + return -EINVAL; >> +}
On Fri, 7 Jun 2019 11:28:13 +0100 Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote: > On 06/06/2019 21:29, Jacob Pan wrote: > >>>>>> iommu_unregister_device_fault_handler(&vdev->pdev->dev); > >>>>> > >>>>> > >>>>> But this can fail if there are pending faults which leaves a > >>>>> device reference and then the system is broken :( > >>>> This series only features unrecoverable errors and for those the > >>>> unregistration cannot fail. Now unrecoverable errors were added I > >>>> admit this is confusing. We need to sort this out or clean the > >>>> dependencies. > >>> As Alex pointed out in 4/29, we can make > >>> iommu_unregister_device_fault_handler() never fail and clean up > >>> all the pending faults in the host IOMMU belong to that device. > >>> But the problem is that if a fault, such as PRQ, has already been > >>> injected into the guest, the page response may come back after > >>> handler is unregistered and registered again. > >> > >> I'm trying to figure out if that would be harmful in any way. I > >> guess it can be a bit nasty if we handle the page response right > >> after having injected a new page request that uses the same PRGI. > >> In any other case we discard the page response, but here we > >> forward it to the endpoint and: > >> > >> * If the response status is success, endpoint retries the > >> translation. The guest probably hasn't had time to handle the new > >> page request and translation will fail, which may lead the endpoint > >> to give up (two unsuccessful translation requests). Or send a new > >> request > >> > > Good point, there shouldn't be any harm if the page response is a > > "fake" success. In fact it could happen in the normal operation when > > PRQs to two devices share the same non-leaf translation structure. > > The worst case is just a retry. I am not aware of the retry limit, > > is it in the PCIe spec? I cannot find it. > > I don't think so, it's the implementation's choice. 
In general I don't > think devices will have a retry limit, but it doesn't seem like the > PCI spec prevents them from implementing one either. It could be > useful to stop retrying after a certain number of faults, for > preventing livelocks when the OS doesn't fix up the page tables and > the device would just repeat the fault indefinitely. > > > I think we should just document it, similar to having a spurious > > interrupt. The PRQ trace event should capture that as well. > > > >> * otherwise the endpoint won't retry the access, and could also > >> disable PRI if the status is failure. > >> > > That would be true regardless this race condition with handler > > registration. So should be fine. > > We do give an invalid response for the old PRG (because of > unregistering), but also for the new one, which has a different > address that the guest might be able to page in and would normally > return success. > > >>> We need a way to reject such page response belong > >>> to the previous life of the handler. Perhaps a sync call to the > >>> guest with your fault queue eventfd? I am not sure. > >> > >> We could simply expect the device driver not to send any page > >> response after unregistering the fault handler. Is there any > >> reason VFIO would need to unregister and re-register the fault > >> handler on a live guest? > > There is no reason for VFIO to unregister and register again, I was > > just thinking from security perspective. Someone could write a VFIO > > app do this attack. But I agree the damage is within the device, > > may get PRI disabled as a result. > > Yes I think the damage would always be contained within the > misbehaving software > > > So it seems we agree on the following: > > - iommu_unregister_device_fault_handler() will never fail > > - iommu driver cleans up all pending faults when handler is > > unregistered > > - assume device driver or guest not sending more page response > > _after_ handler is unregistered. 
> > - system will tolerate rare spurious response > > > > Sounds right? > > Yes, I'll add that to the fault series Hold on a second please, I think we need more clarifications. Ashok pointed out to me that the spurious response can be harmful to other devices when it comes to mdev, where PRQ group id is not per PASID, device may reuse the group number and receiving spurious page response can confuse the entire PF. Having spurious page response is also not abiding the PCIe spec. exactly. We have two options here: 1. unregister handler will get -EBUSY if outstanding fault exists. -PROs: block offending device unbind only, eventually timeout will clear. -CONs: flooded faults can prevent clearing 2. unregister handle will block until all faults are clear in the host. Never fails unregistration -PROs: simple flow for VFIO, no need to worry about device holding reference. -CONs: spurious page response may come from misbehaving/malicious guest if guest does unregister and register back to back. It seems the only way to prevent spurious page response is to introduce a SW token or sequence# for each PRQ that needs a response. I still think option 2 is good. Consider the following time line: decoding PR#: page request G#: group # P#: PASID S#: sequence # A#: address PS#: page response (F): Fail (S): Success # Dev Host VFIO/QEMU Guest =========================================================== 1 <-reg(handler) 2 PR1G1S1A1 -> inject -> PR1G1S1A1 3 PR2G1S2A2 -> inject -> PR2G1S2A2 4. <-unreg(handler) 5. <-PR1G1S1A1(F) | 6. <-PR2G1S2A2(F) V 7. <-unreg(handler) 8. <-reg(handler) 9 PR3G1S3A1 -> inject -> PR3G1S3A1 10. <-PS1G1S1A1 11. <reject S1> 11. <accept S3> <-PS3G1S3A1 12.PS3G1S3A1(S) The spurious page response comes in at step 10 where the guest sends response for the request in step 1. But since the sequence # is 1, host IOMMU driver will reject it. At step 11, we accept page response for the matching sequence # then respond SUCCESS to the device. 
So would it be OK to add this sequence# to iommu_fault and page response, or could we even reuse the time stamp for that purpose? Jacob
On 07/06/2019 18:43, Jacob Pan wrote: >>> So it seems we agree on the following: >>> - iommu_unregister_device_fault_handler() will never fail >>> - iommu driver cleans up all pending faults when handler is >>> unregistered >>> - assume device driver or guest not sending more page response >>> _after_ handler is unregistered. >>> - system will tolerate rare spurious response >>> >>> Sounds right? >> >> Yes, I'll add that to the fault series > Hold on a second please, I think we need more clarifications. Ashok > pointed out to me that the spurious response can be harmful to other > devices when it comes to mdev, where PRQ group id is not per PASID, > device may reuse the group number and receiving spurious page response > can confuse the entire PF. I don't understand how mdev differs from the non-mdev situation (but I also still don't fully get how mdev+PASID will be implemented). Is the following the case you're worried about? M#: mdev # # Dev Host mdev drv VFIO/QEMU Guest ==================================================================== 1 <- reg(handler) 2 PR1 G1 P1 -> M1 PR1 G1 inject -> M1 PR1 G1 3 <- unreg(handler) 4 <- PS1 G1 P1 (F) | 5 unreg(handler) 6 <- reg(handler) 7 PR2 G1 P1 -> M2 PR2 G1 inject -> M2 PR2 G1 8 <- M1 PS1 G1 9 accept ?? <- PS1 G1 P1 10 <- M2 PS2 G1 11 accept <- PS2 G1 P1 Step 2 injects PR1 for mdev#1. Step 4 auto-responds to PR1. Between steps 5 and 6, we re-allocate PASID #1 for mdev #2. At step 7, we inject PR2 for mdev #2. Step 8 is the spurious Page Response for PR1. But I don't think step 9 is possible, because the mdev driver knows that mdev #1 isn't using PASID #1 anymore. If the configuration is valid at all (a page response channel still exists for mdev #1), then mdev #1 now has a different PASID, e.g. #2, and step 9 would be "<- PS1 G1 P2" which is rejected by iommu.c (no such pending page request). And step 11 will be accepted. 
If PASIDs are allocated through VCMD, then the situation seems similar: at step 2 you inject "M1 PR1 G1 P1" into the guest, and at step 8 the spurious response is "M1 PS1 G1 P1". If mdev #1 doesn't have PASID #1 anymore, then the mdev driver can check that the PASID is invalid and can reject the page response. > Having spurious page response is also not > abiding the PCIe spec. exactly. We are following the PCI spec though, in that we don't send page responses for PRGIs that aren't in flight. > We have two options here: > 1. unregister handler will get -EBUSY if outstanding fault exists. > -PROs: block offending device unbind only, eventually timeout > will clear. > -CONs: flooded faults can prevent clearing > 2. unregister handle will block until all faults are clear in the host. > Never fails unregistration Here the host completes the faults itself or wait for a response from the guest? I'm slightly confused by the word "blocking". I'd rather we don't introduce an uninterruptible sleep in the IOMMU core, since it's unlikely to ever finish if we rely on the guest to complete things. > -PROs: simple flow for VFIO, no need to worry about device > holding reference. > -CONs: spurious page response may come from > misbehaving/malicious guest if guest does unregister and > register back to back. > It seems the only way to prevent spurious page response is to introduce > a SW token or sequence# for each PRQ that needs a response. I still > think option 2 is good. > > Consider the following time line: > decoding > PR#: page request > G#: group # > P#: PASID > S#: sequence # > A#: address > PS#: page response > (F): Fail > (S): Success > > # Dev Host VFIO/QEMU Guest > =========================================================== > 1 <-reg(handler) > 2 PR1G1S1A1 -> inject -> PR1G1S1A1 > 3 PR2G1S2A2 -> inject -> PR2G1S2A2 > 4. <-unreg(handler) > 5. <-PR1G1S1A1(F) | > 6. <-PR2G1S2A2(F) V > 7. <-unreg(handler) > 8. <-reg(handler) > 9 PR3G1S3A1 -> inject -> PR3G1S3A1 > 10. 
<-PS1G1S1A1 > 11. <reject S1> > 11. <accept S3> <-PS3G1S3A1 > 12.PS3G1S3A1(S) > > The spurious page response comes in at step 10 where the guest sends > response for the request in step 1. But since the sequence # is 1, host > IOMMU driver will reject it. At step 11, we accept page response for > the matching sequence # then respond SUCCESS to the device. > > So would it be OK to add this sequence# to iommu_fault and page > response, or could event reuse the time stamp for that purpose. With a PV interface we can do what we want, but it can't work with an IOMMU emulation that only has 9 bits for the PRGI. I suppose we can add the sequence number but we'll have to handle the case where it isn't present in the page response (ie. accept it anyway). Thanks, Jean
On Mon, 10 Jun 2019 13:45:02 +0100 Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote: > On 07/06/2019 18:43, Jacob Pan wrote: > >>> So it seems we agree on the following: > >>> - iommu_unregister_device_fault_handler() will never fail > >>> - iommu driver cleans up all pending faults when handler is > >>> unregistered > >>> - assume device driver or guest not sending more page response > >>> _after_ handler is unregistered. > >>> - system will tolerate rare spurious response > >>> > >>> Sounds right? > >> > >> Yes, I'll add that to the fault series > > Hold on a second please, I think we need more clarifications. Ashok > > pointed out to me that the spurious response can be harmful to other > > devices when it comes to mdev, where PRQ group id is not per PASID, > > device may reuse the group number and receiving spurious page > > response can confuse the entire PF. > > I don't understand how mdev differs from the non-mdev situation (but I > also still don't fully get how mdev+PASID will be implemented). Is the > following the case you're worried about? > > M#: mdev # > > # Dev Host mdev drv VFIO/QEMU Guest > ==================================================================== > 1 <- reg(handler) > 2 PR1 G1 P1 -> M1 PR1 G1 inject -> M1 PR1 G1 > 3 <- unreg(handler) > 4 <- PS1 G1 P1 (F) | > 5 unreg(handler) > 6 <- reg(handler) > 7 PR2 G1 P1 -> M2 PR2 G1 inject -> M2 PR2 G1 > 8 <- M1 PS1 G1 > 9 accept ?? <- PS1 G1 P1 > 10 <- M2 PS2 G1 > 11 accept <- PS2 G1 P1 > Not really. I am not worried about PASID reuse or unbind. Just within the same PASID bind lifetime of a single mdev, back to back register/unregister fault handler. After Step 4, device will think G1 is done. Device could reuse G1 for the next PR, if we accept PS1 in step 9, device will terminate G1 before the real G1 PS arrives in Step 11. The real G1 PS might have a different response code. Then we just drop the PS in Step 11? 
If the device does not reuse G1 immediately, the spurious response to G1 will get dropped no issue there. > > Step 2 injects PR1 for mdev#1. Step 4 auto-responds to PR1. Between > steps 5 and 6, we re-allocate PASID #1 for mdev #2. At step 7, we > inject PR2 for mdev #2. Step 8 is the spurious Page Response for PR1. > > But I don't think step 9 is possible, because the mdev driver knows > that mdev #1 isn't using PASID #1 anymore. If the configuration is > valid at all (a page response channel still exists for mdev #1), then > mdev #1 now has a different PASID, e.g. #2, and step 9 would be "<- > PS1 G1 P2" which is rejected by iommu.c (no such pending page > request). And step 11 will be accepted. > > If PASIDs are allocated through VCMD, then the situation seems > similar: at step 2 you inject "M1 PR1 G1 P1" into the guest, and at > step 8 the spurious response is "M1 PS1 G1 P1". If mdev #1 doesn't > have PASID #1 anymore, then the mdev driver can check that the PASID > is invalid and can reject the page response. > > > Having spurious page response is also not > > abiding the PCIe spec. exactly. > > We are following the PCI spec though, in that we don't send page > responses for PRGIs that aren't in flight. > You are right, the worst case of the spurious PS is to terminate the group prematurely. Need to know the scope of the HW damage in case of mdev where group IDs can be shared among mdevs belong to the same PF. > > We have two options here: > > 1. unregister handler will get -EBUSY if outstanding fault exists. > > -PROs: block offending device unbind only, eventually > > timeout will clear. > > -CONs: flooded faults can prevent clearing > > 2. unregister handle will block until all faults are clear in the > > host. Never fails unregistration > > Here the host completes the faults itself or wait for a response from > the guest? I'm slightly confused by the word "blocking". 
I'd rather we > don't introduce an uninterruptible sleep in the IOMMU core, since it's > unlikely to ever finish if we rely on the guest to complete things. > No uninterruptible sleep, I meant unregister_handler is a sync call. But no wait for guest's response. > > -PROs: simple flow for VFIO, no need to worry about device > > holding reference. > > -CONs: spurious page response may come from > > misbehaving/malicious guest if guest does unregister and > > register back to back. > > > It seems the only way to prevent spurious page response is to > > introduce a SW token or sequence# for each PRQ that needs a > > response. I still think option 2 is good. > > > > Consider the following time line: > > decoding > > PR#: page request > > G#: group # > > P#: PASID > > S#: sequence # > > A#: address > > PS#: page response > > (F): Fail > > (S): Success > > > > # Dev Host VFIO/QEMU Guest > > =========================================================== > > 1 <-reg(handler) > > 2 PR1G1S1A1 -> inject -> > > PR1G1S1A1 3 PR2G1S2A2 -> > > inject -> PR2G1S2A2 4. > > <-unreg(handler) 5. <-PR1G1S1A1(F) | > > 6. <-PR2G1S2A2(F) V > > 7. <-unreg(handler) > > 8. <-reg(handler) > > 9 PR3G1S3A1 -> inject -> > > PR3G1S3A1 10. > > <-PS1G1S1A1 11. <reject S1> > > 11. <accept S3> <-PS3G1S3A1 > > 12.PS3G1S3A1(S) > > > > The spurious page response comes in at step 10 where the guest sends > > response for the request in step 1. But since the sequence # is 1, > > host IOMMU driver will reject it. At step 11, we accept page > > response for the matching sequence # then respond SUCCESS to the > > device. > > > > So would it be OK to add this sequence# to iommu_fault and page > > response, or could event reuse the time stamp for that purpose. > > With a PV interface we can do what we want, but it can't work with an > IOMMU emulation that only has 9 bits for the PRGI. I suppose we can > add the sequence number but we'll have to handle the case where it > isn't present in the page response (ie. 
accept it anyway). > For VT-d emulation, we might be able to use the private data as sequence# in vIOMMU. Keep the real private data in the host. Need Yi's input. If private data is not present, then accept it anyway. > Thanks, > Jean
On 10/06/2019 22:31, Jacob Pan wrote: > On Mon, 10 Jun 2019 13:45:02 +0100 > Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote: > >> On 07/06/2019 18:43, Jacob Pan wrote: >>>>> So it seems we agree on the following: >>>>> - iommu_unregister_device_fault_handler() will never fail >>>>> - iommu driver cleans up all pending faults when handler is >>>>> unregistered >>>>> - assume device driver or guest not sending more page response >>>>> _after_ handler is unregistered. >>>>> - system will tolerate rare spurious response >>>>> >>>>> Sounds right? >>>> >>>> Yes, I'll add that to the fault series >>> Hold on a second please, I think we need more clarifications. Ashok >>> pointed out to me that the spurious response can be harmful to other >>> devices when it comes to mdev, where PRQ group id is not per PASID, >>> device may reuse the group number and receiving spurious page >>> response can confuse the entire PF. >> >> I don't understand how mdev differs from the non-mdev situation (but I >> also still don't fully get how mdev+PASID will be implemented). Is the >> following the case you're worried about? >> >> M#: mdev # >> >> # Dev Host mdev drv VFIO/QEMU Guest >> ==================================================================== >> 1 <- reg(handler) >> 2 PR1 G1 P1 -> M1 PR1 G1 inject -> M1 PR1 G1 >> 3 <- unreg(handler) >> 4 <- PS1 G1 P1 (F) | >> 5 unreg(handler) >> 6 <- reg(handler) >> 7 PR2 G1 P1 -> M2 PR2 G1 inject -> M2 PR2 G1 >> 8 <- M1 PS1 G1 >> 9 accept ?? <- PS1 G1 P1 >> 10 <- M2 PS2 G1 >> 11 accept <- PS2 G1 P1 >> > Not really. I am not worried about PASID reuse or unbind. Just within > the same PASID bind lifetime of a single mdev, back to back > register/unregister fault handler. > After Step 4, device will think G1 is done. Device could reuse G1 for > the next PR, if we accept PS1 in step 9, device will terminate G1 before > the real G1 PS arrives in Step 11. The real G1 PS might have a > different response code. 
Then we just drop the PS in Step 11? Yes, I think we do. Two possibilities: * G1 is reused at step 7 for the same PASID context, which means that it is for the same mdev. The problem is then identical to the non-mdev case, new page faults and old page response may cross: # Dev Host mdev drv VFIO/QEMU Guest ==================================================================== 7 PR2 G1 P1 --. 8 \ .------------- M1 PS1 G1 9 '-----> PR2 G1 P1 -> / inject --> M1 PR2 G1 10 accept <--- PS1 G1 P1 <--' 11 reject <--- PS2 G1 P1 <------------------ M1 PS2 G1 And the incorrect page response is returned to the guest. However it affects a single mdev/guest context, it doesn't affect other mdevs. * Or G1 is reused at step 7 for a different PASID. At step 10 the fault handler rejects the page response because the PASID is different, and step 11 is accepted. >>> Having spurious page response is also not >>> abiding the PCIe spec. exactly. >> >> We are following the PCI spec though, in that we don't send page >> responses for PRGIs that aren't in flight. >> > You are right, the worst case of the spurious PS is to terminate the > group prematurely. Need to know the scope of the HW damage in case of mdev > where group IDs can be shared among mdevs belong to the same PF. But from the IOMMU fault API point of view, the full page request is identified by both PRGI and PASID. Given that each mdev has its own set of PASIDs, it should be easy to isolate page responses per mdev. Thanks, Jean
On Tue, 11 Jun 2019 14:14:33 +0100 Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote: > On 10/06/2019 22:31, Jacob Pan wrote: > > On Mon, 10 Jun 2019 13:45:02 +0100 > > Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote: > > > >> On 07/06/2019 18:43, Jacob Pan wrote: > >>>>> So it seems we agree on the following: > >>>>> - iommu_unregister_device_fault_handler() will never fail > >>>>> - iommu driver cleans up all pending faults when handler is > >>>>> unregistered > >>>>> - assume device driver or guest not sending more page response > >>>>> _after_ handler is unregistered. > >>>>> - system will tolerate rare spurious response > >>>>> > >>>>> Sounds right? > >>>> > >>>> Yes, I'll add that to the fault series > >>> Hold on a second please, I think we need more clarifications. > >>> Ashok pointed out to me that the spurious response can be harmful > >>> to other devices when it comes to mdev, where PRQ group id is not > >>> per PASID, device may reuse the group number and receiving > >>> spurious page response can confuse the entire PF. > >> > >> I don't understand how mdev differs from the non-mdev situation > >> (but I also still don't fully get how mdev+PASID will be > >> implemented). Is the following the case you're worried about? > >> > >> M#: mdev # > >> > >> # Dev Host mdev drv VFIO/QEMU Guest > >> ==================================================================== > >> 1 <- reg(handler) > >> 2 PR1 G1 P1 -> M1 PR1 G1 inject -> M1 PR1 G1 > >> 3 <- unreg(handler) > >> 4 <- PS1 G1 P1 (F) | > >> 5 unreg(handler) > >> 6 <- reg(handler) > >> 7 PR2 G1 P1 -> M2 PR2 G1 inject -> M2 PR2 G1 > >> 8 <- M1 PS1 G1 > >> 9 accept ?? <- PS1 G1 P1 > >> 10 <- M2 PS2 G1 > >> 11 accept <- PS2 G1 P1 > >> > > Not really. I am not worried about PASID reuse or unbind. Just > > within the same PASID bind lifetime of a single mdev, back to back > > register/unregister fault handler. > > After Step 4, device will think G1 is done. 
Device could reuse G1 > > for the next PR, if we accept PS1 in step 9, device will terminate > > G1 before the real G1 PS arrives in Step 11. The real G1 PS might > > have a different response code. Then we just drop the PS in Step > > 11? > > Yes, I think we do. Two possibilities: > > * G1 is reused at step 7 for the same PASID context, which means that > it is for the same mdev. The problem is then identical to the non-mdev > case, new page faults and old page response may cross: > > # Dev Host mdev drv VFIO/QEMU Guest > ==================================================================== > 7 PR2 G1 P1 --. > 8 \ .------------- M1 PS1 G1 > 9 '-----> PR2 G1 P1 -> / inject --> M1 PR2 G1 > 10 accept <--- PS1 G1 P1 <--' > 11 reject <--- PS2 G1 P1 <------------------ M1 PS2 G1 > > And the incorrect page response is returned to the guest. However it > affects a single mdev/guest context, it doesn't affect other mdevs. > > * Or G1 is reused at step 7 for a different PASID. At step 10 the > fault handler rejects the page response because the PASID is > different, and step 11 is accepted. > > > >>> Having spurious page response is also not > >>> abiding the PCIe spec. exactly. > >> > >> We are following the PCI spec though, in that we don't send page > >> responses for PRGIs that aren't in flight. > >> > > You are right, the worst case of the spurious PS is to terminate the > > group prematurely. Need to know the scope of the HW damage in case > > of mdev where group IDs can be shared among mdevs belong to the > > same PF. > > But from the IOMMU fault API point of view, the full page request is > identified by both PRGI and PASID. Given that each mdev has its own > set of PASIDs, it should be easy to isolate page responses per mdev. > On Intel platform, devices sending page request with private data must receive page response with matching private data. If we solely depend on PRGI and PASID, we may send stale private data to the device in those incorrect page response. 
Since private data may represent PF device wide contexts, the consequence of sending page response with wrong private data may affect other mdev/PASID. One solution we are thinking to do is to inject the sequence #(e.g. ktime raw mono clock) as vIOMMU private data into to the guest. Guest would return this fake private data in page response, then host will send page response back to the device that matches PRG1 and PASID and private_data. This solution does not expose HW context related private data to the guest but need to extend page response in iommu uapi. /** * struct iommu_page_response - Generic page response information * @version: API version of this structure * @flags: encodes whether the corresponding fields are valid * (IOMMU_FAULT_PAGE_RESPONSE_* values) * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @code: response code from &enum iommu_page_response_code * @private_data: private data for the matching page request */ struct iommu_page_response { #define IOMMU_PAGE_RESP_VERSION_1 1 __u32 version; #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) #define IOMMU_PAGE_RESP_PRIVATE_DATA (1 << 1) __u32 flags; __u32 pasid; __u32 grpid; __u32 code; __u32 padding; __u64 private_data[2]; }; There is also the change needed for separating storage for the real and fake private data. Sorry for the last minute change, did not realize the HW implications. I see this as a future extension due to limited testing, perhaps for now, can you add paddings similar to page request? Make it 64B as well. struct iommu_page_response { #define IOMMU_PAGE_RESP_VERSION_1 1 __u32 version; #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) __u32 flags; __u32 pasid; __u32 grpid; __u32 code; __u8 padding[44]; }; Thanks! Jacob
On 12/06/2019 19:53, Jacob Pan wrote: >>> You are right, the worst case of the spurious PS is to terminate the >>> group prematurely. Need to know the scope of the HW damage in case >>> of mdev where group IDs can be shared among mdevs belong to the >>> same PF. >> >> But from the IOMMU fault API point of view, the full page request is >> identified by both PRGI and PASID. Given that each mdev has its own >> set of PASIDs, it should be easy to isolate page responses per mdev. >> > On Intel platform, devices sending page request with private data must > receive page response with matching private data. If we solely depend > on PRGI and PASID, we may send stale private data to the device in > those incorrect page response. Since private data may represent PF > device wide contexts, the consequence of sending page response with > wrong private data may affect other mdev/PASID. > > One solution we are thinking to do is to inject the sequence #(e.g. > ktime raw mono clock) as vIOMMU private data into to the guest. Guest > would return this fake private data in page response, then host will > send page response back to the device that matches PRG1 and PASID and > private_data. > > This solution does not expose HW context related private data to the > guest but need to extend page response in iommu uapi. 
> > /** > * struct iommu_page_response - Generic page response information > * @version: API version of this structure > * @flags: encodes whether the corresponding fields are valid > * (IOMMU_FAULT_PAGE_RESPONSE_* values) > * @pasid: Process Address Space ID > * @grpid: Page Request Group Index > * @code: response code from &enum iommu_page_response_code > * @private_data: private data for the matching page request > */ > struct iommu_page_response { > #define IOMMU_PAGE_RESP_VERSION_1 1 > __u32 version; > #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) > #define IOMMU_PAGE_RESP_PRIVATE_DATA (1 << 1) > __u32 flags; > __u32 pasid; > __u32 grpid; > __u32 code; > __u32 padding; > __u64 private_data[2]; > }; > > There is also the change needed for separating storage for the real and > fake private data. > > Sorry for the last minute change, did not realize the HW implications. > > I see this as a future extension due to limited testing, I'm wondering how we deal with: (1) old userspace that won't fill the new private_data field in page_response. A new kernel still has to support it. (2) old kernel that won't recognize the new PRIVATE_DATA flag. Currently iommu_page_response() rejects page responses with unknown flags. I guess we'll need a two-way negotiation, where userspace queries whether the kernel supports the flag (2), and the kernel learns whether it should expect the private data to come back (1). > perhaps for > now, can you add paddings similar to page request? Make it 64B as well. I don't think padding is necessary, because iommu_page_response is sent by userspace to the kernel, unlike iommu_fault which is allocated by userspace and filled by the kernel. 
Page response looks a lot more like existing VFIO mechanisms, so I suppose we'll wrap the iommu_page_response structure and include an argsz parameter at the top: struct vfio_iommu_page_response { u32 argsz; struct iommu_page_response pr; }; struct vfio_iommu_page_response vpr = { .argsz = sizeof(vpr), .pr = ... ... }; ioctl(devfd, VFIO_IOMMU_PAGE_RESPONSE, &vpr); In that case supporting private data can be done by simply appending a field at the end (plus the negotiation above). Thanks, Jean
On Tue, 18 Jun 2019 15:04:36 +0100 Jean-Philippe Brucker <jean-philippe.brucker@arm.com> wrote: > On 12/06/2019 19:53, Jacob Pan wrote: > >>> You are right, the worst case of the spurious PS is to terminate > >>> the group prematurely. Need to know the scope of the HW damage in > >>> case of mdev where group IDs can be shared among mdevs belong to > >>> the same PF. > >> > >> But from the IOMMU fault API point of view, the full page request > >> is identified by both PRGI and PASID. Given that each mdev has its > >> own set of PASIDs, it should be easy to isolate page responses per > >> mdev. > > On Intel platform, devices sending page request with private data > > must receive page response with matching private data. If we solely > > depend on PRGI and PASID, we may send stale private data to the > > device in those incorrect page response. Since private data may > > represent PF device wide contexts, the consequence of sending page > > response with wrong private data may affect other mdev/PASID. > > > > One solution we are thinking to do is to inject the sequence #(e.g. > > ktime raw mono clock) as vIOMMU private data into to the guest. > > Guest would return this fake private data in page response, then > > host will send page response back to the device that matches PRG1 > > and PASID and private_data. > > > > This solution does not expose HW context related private data to the > > guest but need to extend page response in iommu uapi. 
> > > > /** > > * struct iommu_page_response - Generic page response information > > * @version: API version of this structure > > * @flags: encodes whether the corresponding fields are valid > > * (IOMMU_FAULT_PAGE_RESPONSE_* values) > > * @pasid: Process Address Space ID > > * @grpid: Page Request Group Index > > * @code: response code from &enum iommu_page_response_code > > * @private_data: private data for the matching page request > > */ > > struct iommu_page_response { > > #define IOMMU_PAGE_RESP_VERSION_1 1 > > __u32 version; > > #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) > > #define IOMMU_PAGE_RESP_PRIVATE_DATA (1 << 1) > > __u32 flags; > > __u32 pasid; > > __u32 grpid; > > __u32 code; > > __u32 padding; > > __u64 private_data[2]; > > }; > > > > There is also the change needed for separating storage for the real > > and fake private data. > > > > Sorry for the last minute change, did not realize the HW > > implications. > > > > I see this as a future extension due to limited testing, > > I'm wondering how we deal with: > (1) old userspace that won't fill the new private_data field in > page_response. A new kernel still has to support it. > (2) old kernel that won't recognize the new PRIVATE_DATA flag. > Currently iommu_page_response() rejects page responses with unknown > flags. > > I guess we'll need a two-way negotiation, where userspace queries > whether the kernel supports the flag (2), and the kernel learns > whether it should expect the private data to come back (1). > I am not sure case (1) exist in that there is no existing user space supports PRQ w/o private data. Am I missing something? For VT-d emulation, private data is always part of the scalable mode PASID capability. If vIOMMU query host supports PASID and scalable mode, it will always support private data once PRQ is enabled. So I think we only need to negotiate (2) which should be covered by VT-d PASID cap. > > perhaps for > > now, can you add paddings similar to page request? 
Make it 64B as > > well. > > I don't think padding is necessary, because iommu_page_response is > sent by userspace to the kernel, unlike iommu_fault which is > allocated by userspace and filled by the kernel. > > Page response looks a lot more like existing VFIO mechanisms, so I > suppose we'll wrap the iommu_page_response structure and include an > argsz parameter at the top: > > struct vfio_iommu_page_response { > u32 argsz; > struct iommu_page_response pr; > }; > > struct vfio_iommu_page_response vpr = { > .argsz = sizeof(vpr), > .pr = ... > ... > }; > > ioctl(devfd, VFIO_IOMMU_PAGE_RESPONSE, &vpr); > > In that case supporting private data can be done by simply appending a > field at the end (plus the negotiation above). > Do you mean at the end of struct vfio_iommu_page_response{}? or at the end of struct iommu_page_response{}? The consumer of the private data is the iommu driver, not vfio. So I think you want to add the new field at the end of struct iommu_page_response, right? I think that would work, just to clarify.
On 19/06/2019 01:19, Jacob Pan wrote: >>> I see this as a future extension due to limited testing, >> >> I'm wondering how we deal with: >> (1) old userspace that won't fill the new private_data field in >> page_response. A new kernel still has to support it. >> (2) old kernel that won't recognize the new PRIVATE_DATA flag. >> Currently iommu_page_response() rejects page responses with unknown >> flags. >> >> I guess we'll need a two-way negotiation, where userspace queries >> whether the kernel supports the flag (2), and the kernel learns >> whether it should expect the private data to come back (1). >> > I am not sure case (1) exist in that there is no existing user space > supports PRQ w/o private data. Am I missing something? > > For VT-d emulation, private data is always part of the scalable mode > PASID capability. If vIOMMU query host supports PASID and scalable > mode, it will always support private data once PRQ is enabled. Right if VT-d won't ever support page_response without private data then I don't think we have to worry about (1). > So I think we only need to negotiate (2) which should be covered by > VT-d PASID cap. > >>> perhaps for >>> now, can you add paddings similar to page request? Make it 64B as >>> well. >> >> I don't think padding is necessary, because iommu_page_response is >> sent by userspace to the kernel, unlike iommu_fault which is >> allocated by userspace and filled by the kernel. >> >> Page response looks a lot more like existing VFIO mechanisms, so I >> suppose we'll wrap the iommu_page_response structure and include an >> argsz parameter at the top: >> >> struct vfio_iommu_page_response { >> u32 argsz; >> struct iommu_page_response pr; >> }; >> >> struct vfio_iommu_page_response vpr = { >> .argsz = sizeof(vpr), >> .pr = ... >> ... >> }; >> >> ioctl(devfd, VFIO_IOMMU_PAGE_RESPONSE, &vpr); >> >> In that case supporting private data can be done by simply appending a >> field at the end (plus the negotiation above). 
>> > Do you mean at the end of struct vfio_iommu_page_response{}? or at > the end of that seems struct iommu_page_response{}? > > The consumer of the private data is iommu driver not vfio. So I think > you want to add the new field at the end of struct iommu_page_response, > right? Yes that's what I meant Thanks, Jean
Hi Jean, Jacob, On 6/18/19 4:04 PM, Jean-Philippe Brucker wrote: > On 12/06/2019 19:53, Jacob Pan wrote: >>>> You are right, the worst case of the spurious PS is to terminate the >>>> group prematurely. Need to know the scope of the HW damage in case >>>> of mdev where group IDs can be shared among mdevs belong to the >>>> same PF. >>> >>> But from the IOMMU fault API point of view, the full page request is >>> identified by both PRGI and PASID. Given that each mdev has its own >>> set of PASIDs, it should be easy to isolate page responses per mdev. >>> >> On Intel platform, devices sending page request with private data must >> receive page response with matching private data. If we solely depend >> on PRGI and PASID, we may send stale private data to the device in >> those incorrect page response. Since private data may represent PF >> device wide contexts, the consequence of sending page response with >> wrong private data may affect other mdev/PASID. >> >> One solution we are thinking to do is to inject the sequence #(e.g. >> ktime raw mono clock) as vIOMMU private data into to the guest. Guest >> would return this fake private data in page response, then host will >> send page response back to the device that matches PRG1 and PASID and >> private_data. >> >> This solution does not expose HW context related private data to the >> guest but need to extend page response in iommu uapi. 
>> >> /** >> * struct iommu_page_response - Generic page response information >> * @version: API version of this structure >> * @flags: encodes whether the corresponding fields are valid >> * (IOMMU_FAULT_PAGE_RESPONSE_* values) >> * @pasid: Process Address Space ID >> * @grpid: Page Request Group Index >> * @code: response code from &enum iommu_page_response_code >> * @private_data: private data for the matching page request >> */ >> struct iommu_page_response { >> #define IOMMU_PAGE_RESP_VERSION_1 1 >> __u32 version; >> #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) >> #define IOMMU_PAGE_RESP_PRIVATE_DATA (1 << 1) >> __u32 flags; >> __u32 pasid; >> __u32 grpid; >> __u32 code; >> __u32 padding; >> __u64 private_data[2]; >> }; >> >> There is also the change needed for separating storage for the real and >> fake private data. >> >> Sorry for the last minute change, did not realize the HW implications. >> >> I see this as a future extension due to limited testing, > > I'm wondering how we deal with: > (1) old userspace that won't fill the new private_data field in > page_response. A new kernel still has to support it. > (2) old kernel that won't recognize the new PRIVATE_DATA flag. Currently > iommu_page_response() rejects page responses with unknown flags. > > I guess we'll need a two-way negotiation, where userspace queries > whether the kernel supports the flag (2), and the kernel learns whether > it should expect the private data to come back (1). > >> perhaps for >> now, can you add paddings similar to page request? Make it 64B as well. > > I don't think padding is necessary, because iommu_page_response is sent > by userspace to the kernel, unlike iommu_fault which is allocated by > userspace and filled by the kernel. 
> > Page response looks a lot more like existing VFIO mechanisms, so I > suppose we'll wrap the iommu_page_response structure and include an > argsz parameter at the top: > > struct vfio_iommu_page_response { > u32 argsz; > struct iommu_page_response pr; > }; > > struct vfio_iommu_page_response vpr = { > .argsz = sizeof(vpr), > .pr = ... > ... > }; > > ioctl(devfd, VFIO_IOMMU_PAGE_RESPONSE, &vpr); > > In that case supporting private data can be done by simply appending a > field at the end (plus the negotiation above). Sorry I did not quite follow the spurious response discussion but I just noticed we still do have, upstream, in iommu_unregister_device_fault_handler: /* we cannot unregister handler if there are pending faults */ if (!list_empty(&param->fault_param->faults)) { ret = -EBUSY; goto unlock; } So did you eventually decide to let iommu_unregister_device_fault_handler fail or is it an oversight? Thanks Eric > > Thanks, > Jean > _______________________________________________ > iommu mailing list > iommu@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/iommu >
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index f75f61127277..520999994ba8 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -30,6 +30,7 @@ #include <linux/vfio.h> #include <linux/vgaarb.h> #include <linux/nospec.h> +#include <linux/circ_buf.h> #include "vfio_pci_private.h" @@ -296,6 +297,46 @@ static const struct vfio_pci_regops vfio_pci_fault_prod_regops = { .add_capability = vfio_pci_fault_prod_add_capability, }; +int vfio_pci_iommu_dev_fault_handler(struct iommu_fault_event *evt, void *data) +{ + struct vfio_pci_device *vdev = (struct vfio_pci_device *) data; + struct vfio_region_fault_prod *prod_region = + (struct vfio_region_fault_prod *)vdev->fault_pages; + struct vfio_region_fault_cons *cons_region = + (struct vfio_region_fault_cons *)(vdev->fault_pages + 2 * PAGE_SIZE); + struct iommu_fault *new = + (struct iommu_fault *)(vdev->fault_pages + prod_region->offset + + prod_region->prod * prod_region->entry_size); + int prod, cons, size; + + mutex_lock(&vdev->fault_queue_lock); + + if (!vdev->fault_abi) + goto unlock; + + prod = prod_region->prod; + cons = cons_region->cons; + size = prod_region->nb_entries; + + if (CIRC_SPACE(prod, cons, size) < 1) + goto unlock; + + *new = evt->fault; + prod = (prod + 1) % size; + prod_region->prod = prod; + mutex_unlock(&vdev->fault_queue_lock); + + mutex_lock(&vdev->igate); + if (vdev->dma_fault_trigger) + eventfd_signal(vdev->dma_fault_trigger, 1); + mutex_unlock(&vdev->igate); + return 0; + +unlock: + mutex_unlock(&vdev->fault_queue_lock); + return -EINVAL; +} + static int vfio_pci_init_fault_region(struct vfio_pci_device *vdev) { struct vfio_region_fault_prod *header; @@ -328,6 +369,13 @@ static int vfio_pci_init_fault_region(struct vfio_pci_device *vdev) header = (struct vfio_region_fault_prod *)vdev->fault_pages; header->version = -1; header->offset = PAGE_SIZE; + + ret = iommu_register_device_fault_handler(&vdev->pdev->dev, + vfio_pci_iommu_dev_fault_handler, + 
vdev); + if (ret) + goto out; + return 0; out: kfree(vdev->fault_pages); @@ -570,6 +618,7 @@ static void vfio_pci_release(void *device_data) if (!(--vdev->refcnt)) { vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_disable(vdev); + iommu_unregister_device_fault_handler(&vdev->pdev->dev); } mutex_unlock(&vdev->reflck->lock); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 8e0a55682d3f..a9276926f008 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -122,6 +122,7 @@ struct vfio_pci_device { int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *dma_fault_trigger; struct mutex fault_queue_lock; int fault_abi; struct list_head dummy_resources_list;
This patch registers a fault handler which records faults in a circular buffer and then signals an eventfd. This buffer is exposed within the fault region. Signed-off-by: Eric Auger <eric.auger@redhat.com> --- v3 -> v4: - move iommu_unregister_device_fault_handler to vfio_pci_release --- drivers/vfio/pci/vfio_pci.c | 49 +++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h | 1 + 2 files changed, 50 insertions(+)