diff mbox series

[v2,15/23] vfio-user: forward msix BAR accesses to server

Message ID 0ad69e4ea3d1f37246ce5e32ba833d6c871e99b1.1675228037.git.john.g.johnson@oracle.com (mailing list archive)
State New, archived
Headers show
Series vfio-user client | expand

Commit Message

John Johnson Feb. 2, 2023, 5:55 a.m. UTC
Server holds the device's current pending interrupt state
Use irq masking commands in socket case

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 hw/vfio/pci.h                 |  1 +
 include/hw/vfio/vfio-common.h |  3 ++
 hw/vfio/ccw.c                 |  1 +
 hw/vfio/common.c              | 26 ++++++++++++++++++
 hw/vfio/pci.c                 | 23 +++++++++++++++-
 hw/vfio/platform.c            |  1 +
 hw/vfio/user-pci.c            | 64 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 118 insertions(+), 1 deletion(-)

Comments

Alex Williamson Feb. 6, 2023, 8:33 p.m. UTC | #1
On Wed,  1 Feb 2023 21:55:51 -0800
John Johnson <john.g.johnson@oracle.com> wrote:

> Server holds device current device pending state
> Use irq masking commands in socket case
> 
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  hw/vfio/pci.h                 |  1 +
>  include/hw/vfio/vfio-common.h |  3 ++
>  hw/vfio/ccw.c                 |  1 +
>  hw/vfio/common.c              | 26 ++++++++++++++++++
>  hw/vfio/pci.c                 | 23 +++++++++++++++-
>  hw/vfio/platform.c            |  1 +
>  hw/vfio/user-pci.c            | 64 +++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 118 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 4f70664..d3e5d5f 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
>      uint32_t table_offset;
>      uint32_t pba_offset;
>      unsigned long *pending;
> +    MemoryRegion *pba_region;
>  } VFIOMSIXInfo;
>  
>  /*
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index bbc4b15..2c58d7d 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -143,6 +143,7 @@ typedef struct VFIODevice {
>      bool ram_block_discard_allowed;
>      bool enable_migration;
>      bool use_regfds;
> +    bool can_mask_irq;

How can this be a device level property?  vfio-pci (kernel) supports
masking of INTx.  It seems like there needs to be a per-index info or
probe here to support this.


>      VFIODeviceOps *ops;
>      VFIODeviceIO *io;
>      unsigned int num_irqs;
> @@ -239,6 +240,8 @@ void vfio_put_base_device(VFIODevice *vbasedev);
>  void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
>  void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
>  void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index);
> +void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq);
> +void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq);
>  int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
>                             int action, int fd, Error **errp);
>  void vfio_region_write(void *opaque, hwaddr addr,
> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
> index 00605bd..bf67670 100644
> --- a/hw/vfio/ccw.c
> +++ b/hw/vfio/ccw.c
> @@ -616,6 +616,7 @@ static void vfio_ccw_get_device(VFIOGroup *group, VFIOCCWDevice *vcdev,
>      vcdev->vdev.dev = &vcdev->cdev.parent_obj.parent_obj;
>      vcdev->vdev.io = &vfio_dev_io_ioctl;
>      vcdev->vdev.use_regfds = false;
> +    vcdev->vdev.can_mask_irq = false;
>  
>      return;
>  
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index de64e53..0c1cb21 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -102,6 +102,32 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
>      vbasedev->io->set_irqs(vbasedev, &irq_set);
>  }
>  
> +void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> +        .index = index,
> +        .start = irq,
> +        .count = 1,
> +    };
> +
> +    vbasedev->io->set_irqs(vbasedev, &irq_set);
> +}
> +
> +void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
> +        .index = index,
> +        .start = irq,
> +        .count = 1,
> +    };
> +
> +    vbasedev->io->set_irqs(vbasedev, &irq_set);
> +}
> +
>  static inline const char *action_to_str(int action)
>  {
>      switch (action) {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 42e7c82..7b16f8f 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -477,6 +477,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>  {
>      VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
>      VFIOMSIVector *vector;
> +    bool new_vec = false;
>      int ret;
>  
>      trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
> @@ -490,6 +491,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>              error_report("vfio: Error: event_notifier_init failed");
>          }
>          vector->use = true;
> +        new_vec = true;
>          msix_vector_use(pdev, nr);
>      }
>  
> @@ -516,6 +518,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>                  kvm_irqchip_commit_route_changes(&vfio_route_change);
>                  vfio_connect_kvm_msi_virq(vector);
>              }
> +            new_vec = true;

This looks wrong to me, can't we get here when a previously used vector
is unmasked?

>          }
>      }
>  
> @@ -523,6 +526,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>       * We don't want to have the host allocate all possible MSI vectors
>       * for a device if they're not in use, so we shutdown and incrementally
>       * increase them as needed.
> +     * Otherwise, unmask the vector if the vector is already setup (and we can
> +     * do so) or send the fd if not.
>       */
>      if (vdev->nr_vectors < nr + 1) {
>          vdev->nr_vectors = nr + 1;
> @@ -533,6 +538,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>                  error_report("vfio: failed to enable vectors, %d", ret);
>              }
>          }
> +    } else if (vdev->vbasedev.can_mask_irq && !new_vec) {
> +        vfio_unmask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);

The correctness of @new_vec is doubtful here, but masking support must
be in reference to a specific IRQ index.  INTx masking is implicit, but
we'd probably need something on the VFIOPCIDevice to indicate support
for MSI and MSIX masking support, separately.

>      } else {
>          Error *err = NULL;
>          int32_t fd;
> @@ -574,6 +581,12 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>  
>      trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
>  
> +    /* just mask vector if peer supports it */
> +    if (vdev->vbasedev.can_mask_irq) {
> +        vfio_mask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
> +        return;
> +    }
> +
>      /*
>       * There are still old guests that mask and unmask vectors on every
>       * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
> @@ -644,7 +657,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>          if (ret) {
>              error_report("vfio: failed to enable vectors, %d", ret);
>          }
> -    } else {
> +    } else if (!vdev->vbasedev.can_mask_irq) {
>          /*
>           * Some communication channels between VF & PF or PF & fw rely on the
>           * physical state of the device and expect that enabling MSI-X from the
> @@ -660,6 +673,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>           */
>          vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
>          vfio_msix_vector_release(&vdev->pdev, 0);
> +    } else {
> +        /*
> +         * If we can use irq masking, send an invalid fd on vector 0
> +         * to enable MSI-X without any vectors enabled.
> +         */
> +        vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, 0,
> +                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);

What does this have to do with masking?  I'm not entirely sure this
doesn't also work on vfio-pci (kernel).

In general this feels like a special cased version of MSI/X masking
support rather than actually reworking things to support underlying
masking support for these indexes.  Thanks,

Alex
John Johnson Feb. 8, 2023, 6:38 a.m. UTC | #2
> On Feb 6, 2023, at 12:33 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Wed,  1 Feb 2023 21:55:51 -0800
> John Johnson <john.g.johnson@oracle.com> wrote:
> 
>> Server holds device current device pending state
>> Use irq masking commands in socket case
>> 
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> hw/vfio/pci.h                 |  1 +
>> include/hw/vfio/vfio-common.h |  3 ++
>> hw/vfio/ccw.c                 |  1 +
>> hw/vfio/common.c              | 26 ++++++++++++++++++
>> hw/vfio/pci.c                 | 23 +++++++++++++++-
>> hw/vfio/platform.c            |  1 +
>> hw/vfio/user-pci.c            | 64 +++++++++++++++++++++++++++++++++++++++++++
>> 7 files changed, 118 insertions(+), 1 deletion(-)
>> 
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index 4f70664..d3e5d5f 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
>>     uint32_t table_offset;
>>     uint32_t pba_offset;
>>     unsigned long *pending;
>> +    MemoryRegion *pba_region;
>> } VFIOMSIXInfo;
>> 
>> /*
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index bbc4b15..2c58d7d 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -143,6 +143,7 @@ typedef struct VFIODevice {
>>     bool ram_block_discard_allowed;
>>     bool enable_migration;
>>     bool use_regfds;
>> +    bool can_mask_irq;
> 
> How can this be a device level property?  vfio-pci (kernel) supports
> masking of INTx.  It seems like there needs to be a per-index info or
> probe here to to support this.
> 

	It is only used for MSIX masking.  MSI masking is done with
config space stores, and vfio-kernel and vfio-user handle them the
same by forwarding the config space writes to the server so it can
mask interrupts.

	MSIX is trickier because the mask bits are in a memory region.
vfio-kernel doesn’t support SET_IRQS on MSIX vectors, so if the host
doesn’t allow client mapping of the MSIX table to do the masking, vfio
switches a masked vector’s event FD destination from KVM to QEMU, then
uses the QEMU PCI emulation to mask the vector.

	vfio-user has to use messages to mask MSIX vectors, since there
is no host config space to map.  I originally forwarded the MSIX table
writes to the server to do the masking, but the feedback on the vfio-user
server changes was to add SET_IRQS support.

	I can make this a PCI MSIX-specific bool if that helps.


> 
>>     VFIODeviceOps *ops;
>>     VFIODeviceIO *io;
>>     unsigned int num_irqs;
>> @@ -239,6 +240,8 @@ void vfio_put_base_device(VFIODevice *vbasedev);
>> void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
>> void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
>> void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index);
>> +void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq);
>> +void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq);
>> int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
>>                            int action, int fd, Error **errp);
>> void vfio_region_write(void *opaque, hwaddr addr,
>> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
>> index 00605bd..bf67670 100644
>> --- a/hw/vfio/ccw.c
>> +++ b/hw/vfio/ccw.c
>> @@ -616,6 +616,7 @@ static void vfio_ccw_get_device(VFIOGroup *group, VFIOCCWDevice *vcdev,
>>     vcdev->vdev.dev = &vcdev->cdev.parent_obj.parent_obj;
>>     vcdev->vdev.io = &vfio_dev_io_ioctl;
>>     vcdev->vdev.use_regfds = false;
>> +    vcdev->vdev.can_mask_irq = false;
>> 
>>     return;
>> 
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index de64e53..0c1cb21 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -102,6 +102,32 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
>>     vbasedev->io->set_irqs(vbasedev, &irq_set);
>> }
>> 
>> +void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq)
>> +{
>> +    struct vfio_irq_set irq_set = {
>> +        .argsz = sizeof(irq_set),
>> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
>> +        .index = index,
>> +        .start = irq,
>> +        .count = 1,
>> +    };
>> +
>> +    vbasedev->io->set_irqs(vbasedev, &irq_set);
>> +}
>> +
>> +void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq)
>> +{
>> +    struct vfio_irq_set irq_set = {
>> +        .argsz = sizeof(irq_set),
>> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
>> +        .index = index,
>> +        .start = irq,
>> +        .count = 1,
>> +    };
>> +
>> +    vbasedev->io->set_irqs(vbasedev, &irq_set);
>> +}
>> +
>> static inline const char *action_to_str(int action)
>> {
>>     switch (action) {
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 42e7c82..7b16f8f 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -477,6 +477,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>> {
>>     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
>>     VFIOMSIVector *vector;
>> +    bool new_vec = false;
>>     int ret;
>> 
>>     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
>> @@ -490,6 +491,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>             error_report("vfio: Error: event_notifier_init failed");
>>         }
>>         vector->use = true;
>> +        new_vec = true;
>>         msix_vector_use(pdev, nr);
>>     }
>> 
>> @@ -516,6 +518,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>                 kvm_irqchip_commit_route_changes(&vfio_route_change);
>>                 vfio_connect_kvm_msi_virq(vector);
>>             }
>> +            new_vec = true;
> 
> This looks wrong to me, can't we get here when a previously used vector
> is unmasked?
> 

	It’s for the case where we upgrade the vector from using an event FD
that terminates in QEMU to one that terminates in KVM.  In that case we want
to send the FD again, not just unmask the current FD.



>>         }
>>     }
>> 
>> @@ -523,6 +526,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>      * We don't want to have the host allocate all possible MSI vectors
>>      * for a device if they're not in use, so we shutdown and incrementally
>>      * increase them as needed.
>> +     * Otherwise, unmask the vector if the vector is already setup (and we can
>> +     * do so) or send the fd if not.
>>      */
>>     if (vdev->nr_vectors < nr + 1) {
>>         vdev->nr_vectors = nr + 1;
>> @@ -533,6 +538,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>                 error_report("vfio: failed to enable vectors, %d", ret);
>>             }
>>         }
>> +    } else if (vdev->vbasedev.can_mask_irq && !new_vec) {
>> +        vfio_unmask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
> 
> The correctness of @new_vec is doubtful here, but masking support must
> be in reference to a specific IRQ index.  INTx masking is implicit, but
> we'd probably need something on the VFIOPCIDevice to indicate support
> for MSI and MSIX masking support, separately.
> 

	MSI masking uses config writes, so this is only for MSIX.


>>     } else {
>>         Error *err = NULL;
>>         int32_t fd;
>> @@ -574,6 +581,12 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>> 
>>     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
>> 
>> +    /* just mask vector if peer supports it */
>> +    if (vdev->vbasedev.can_mask_irq) {
>> +        vfio_mask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
>> +        return;
>> +    }
>> +
>>     /*
>>      * There are still old guests that mask and unmask vectors on every
>>      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
>> @@ -644,7 +657,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>         if (ret) {
>>             error_report("vfio: failed to enable vectors, %d", ret);
>>         }
>> -    } else {
>> +    } else if (!vdev->vbasedev.can_mask_irq) {
>>         /*
>>          * Some communication channels between VF & PF or PF & fw rely on the
>>          * physical state of the device and expect that enabling MSI-X from the
>> @@ -660,6 +673,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>          */
>>         vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
>>         vfio_msix_vector_release(&vdev->pdev, 0);
>> +    } else {
>> +        /*
>> +         * If we can use irq masking, send an invalid fd on vector 0
>> +         * to enable MSI-X without any vectors enabled.
>> +         */
>> +        vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, 0,
>> +                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
> 
> What does this have to do with masking?  I'm not entirely sure this
> doesn't also work on vfio-pci (kernel).
> 
> In general this feels like a special cased version of MSI/X masking
> support rather than actually reworking things to support underlying
> masking support for these indexes.  Thanks,
> 


	The comment in the !can_mask_irq clause sounded like vfio-kernel
needs a use/release cycle, so I didn’t want to change its behavior.

								JJ
Alex Williamson Feb. 8, 2023, 9:30 p.m. UTC | #3
On Wed, 8 Feb 2023 06:38:30 +0000
John Johnson <john.g.johnson@oracle.com> wrote:

> > On Feb 6, 2023, at 12:33 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> > 
> > On Wed,  1 Feb 2023 21:55:51 -0800
> > John Johnson <john.g.johnson@oracle.com> wrote:
> >   
> >> Server holds device current device pending state
> >> Use irq masking commands in socket case
> >> 
> >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> >> ---
> >> hw/vfio/pci.h                 |  1 +
> >> include/hw/vfio/vfio-common.h |  3 ++
> >> hw/vfio/ccw.c                 |  1 +
> >> hw/vfio/common.c              | 26 ++++++++++++++++++
> >> hw/vfio/pci.c                 | 23 +++++++++++++++-
> >> hw/vfio/platform.c            |  1 +
> >> hw/vfio/user-pci.c            | 64 +++++++++++++++++++++++++++++++++++++++++++
> >> 7 files changed, 118 insertions(+), 1 deletion(-)
> >> 
> >> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> >> index 4f70664..d3e5d5f 100644
> >> --- a/hw/vfio/pci.h
> >> +++ b/hw/vfio/pci.h
> >> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
> >>     uint32_t table_offset;
> >>     uint32_t pba_offset;
> >>     unsigned long *pending;
> >> +    MemoryRegion *pba_region;
> >> } VFIOMSIXInfo;
> >> 
> >> /*
> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >> index bbc4b15..2c58d7d 100644
> >> --- a/include/hw/vfio/vfio-common.h
> >> +++ b/include/hw/vfio/vfio-common.h
> >> @@ -143,6 +143,7 @@ typedef struct VFIODevice {
> >>     bool ram_block_discard_allowed;
> >>     bool enable_migration;
> >>     bool use_regfds;
> >> +    bool can_mask_irq;  
> > 
> > How can this be a device level property?  vfio-pci (kernel) supports
> > masking of INTx.  It seems like there needs to be a per-index info or
> > probe here to to support this.
> >   
> 
> 	It is only used for MSIX masking.  MSI masking is done with
> config space stores, and vfio-kernel and vfio-user handle them the
> same by forwarding the config space writes to the server so it can
> mask interrupts.

I suppose this does slip through on vfio-kernel, though support via
SET_IRQS would really be the uAPI mechanism we'd expect to use for
masking, just as it is for enabling/disabling MSI.  MSI is not well
used enough to have flushed that out and it seems harmless, but is not
the intended API.

> 	MSIX is trickier because the mask bits are in a memory region.
> vfio-kernel doesn’t support SET_IRQS on MSIX vectors, so if the host
> doesn’t allow client mapping of the MSIX table to do the masking, vfio
> switches a masked vector’s event FD destination from KVM to QEMU, then
> uses the QEMU PCI emulation to mask the vector.

Lack of support for MSIX ACTION_[UN]MASK is not an API limitation, it's
an implementation issue in the kernel.  Same for MSI.  I believe this
is resolved now, that there are mask/unmask APIs available within the
kernel, as well as mechanisms to avoid the NORESIZE behavior now, so
the intention would be to implement that support, along with a
mechanism for the user to know the support is present.  We already have
that for NORESIZE via IRQ_INFO, so I suspect the right answer might be
to add a new VFIO_IRQ_INFO_MSI_MASK to advertise masking support.
 
> 	vfio-user has to use messages to mask MSIX vectors, since there
> is no host config space to map.  I originally forwarded the MSIX table
> writes to the server to do the masking, but the feedback on the vfio-user
> server changes was to add SET_IRQS support.

Which is what I'm describing above, QEMU should know via the VFIO uAPI
whether MSI/X masking is supported and use that to configure the code
to make use of it rather than some inferred value based on the
interface type.  Thanks,

Alex
John Johnson Feb. 10, 2023, 5:28 a.m. UTC | #4
> On Feb 8, 2023, at 1:30 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> On Wed, 8 Feb 2023 06:38:30 +0000
> John Johnson <john.g.johnson@oracle.com> wrote:
> 
>>> On Feb 6, 2023, at 12:33 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>> 
>>> On Wed,  1 Feb 2023 21:55:51 -0800
>>> John Johnson <john.g.johnson@oracle.com> wrote:
>>> 
>>>> Server holds device current device pending state
>>>> Use irq masking commands in socket case
>>>> 
>>>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>>>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>>>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>>>> ---
>>>> hw/vfio/pci.h                 |  1 +
>>>> include/hw/vfio/vfio-common.h |  3 ++
>>>> hw/vfio/ccw.c                 |  1 +
>>>> hw/vfio/common.c              | 26 ++++++++++++++++++
>>>> hw/vfio/pci.c                 | 23 +++++++++++++++-
>>>> hw/vfio/platform.c            |  1 +
>>>> hw/vfio/user-pci.c            | 64 +++++++++++++++++++++++++++++++++++++++++++
>>>> 7 files changed, 118 insertions(+), 1 deletion(-)
>>>> 
>>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>>>> index 4f70664..d3e5d5f 100644
>>>> --- a/hw/vfio/pci.h
>>>> +++ b/hw/vfio/pci.h
>>>> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
>>>>    uint32_t table_offset;
>>>>    uint32_t pba_offset;
>>>>    unsigned long *pending;
>>>> +    MemoryRegion *pba_region;
>>>> } VFIOMSIXInfo;
>>>> 
>>>> /*
>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>>> index bbc4b15..2c58d7d 100644
>>>> --- a/include/hw/vfio/vfio-common.h
>>>> +++ b/include/hw/vfio/vfio-common.h
>>>> @@ -143,6 +143,7 @@ typedef struct VFIODevice {
>>>>    bool ram_block_discard_allowed;
>>>>    bool enable_migration;
>>>>    bool use_regfds;
>>>> +    bool can_mask_irq;  
>>> 
>>> How can this be a device level property?  vfio-pci (kernel) supports
>>> masking of INTx.  It seems like there needs to be a per-index info or
>>> probe here to to support this.
>>> 
>> 
>> 	It is only used for MSIX masking.  MSI masking is done with
>> config space stores, and vfio-kernel and vfio-user handle them the
>> same by forwarding the config space writes to the server so it can
>> mask interrupts.
> 
> I suppose this does slip through on vfio-kernel, though support via
> SET_IRQS would really be the uAPI mechanism we'd expect to use for
> masking, just as it is for enabling/disabling MSI.  MSI is not well
> used enough to have flushed that out and it seems harmless, but is not
> the intended API.
> 
>> 	MSIX is trickier because the mask bits are in a memory region.
>> vfio-kernel doesn’t support SET_IRQS on MSIX vectors, so if the host
>> doesn’t allow client mapping of the MSIX table to do the masking, vfio
>> switches a masked vector’s event FD destination from KVM to QEMU, then
>> uses the QEMU PCI emulation to mask the vector.
> 
> Lack of support for MSIX ACTION_[UN]MASK is not an API limitation, it's
> an implementation issue in the kernel.  Same for MSI.  I believe this
> is resolved now, that there are mask/unmask APIs available within the
> kernel, as well as mechanisms to avoid the NORESIZE behavior now, so
> the intention would be to implement that support, along with a
> mechanism for the user to know the support is present.  We already have
> that for NORESIZE via IRQ_INFO, so I suspect the right answer might be
> to add a new VFIO_IRQ_INFO_MSI_MASK to advertise masking support.
> 

	I looked at the linux next vfio kernel driver, and it doesn't seem
to support MSI/X masking.


>> 	vfio-user has to use messages to mask MSIX vectors, since there
>> is no host config space to map.  I originally forwarded the MSIX table
>> writes to the server to do the masking, but the feedback on the vfio-user
>> server changes was to add SET_IRQS support.
> 
> Which is what I'm describing above, QEMU should know via the VFIO uAPI
> whether MSI/X masking is supported and use that to configure the code
> to make use of it rather than some inferred value based on the
> interface type.  Thanks,
> 

	The kernel doesn’t advertise it as working either, so I think I can
key it off the presence of VFIO_IRQ_INFO_MASKABLE in irq_info

							JJ
diff mbox series

Patch

diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 4f70664..d3e5d5f 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -113,6 +113,7 @@  typedef struct VFIOMSIXInfo {
     uint32_t table_offset;
     uint32_t pba_offset;
     unsigned long *pending;
+    MemoryRegion *pba_region;
 } VFIOMSIXInfo;
 
 /*
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index bbc4b15..2c58d7d 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -143,6 +143,7 @@  typedef struct VFIODevice {
     bool ram_block_discard_allowed;
     bool enable_migration;
     bool use_regfds;
+    bool can_mask_irq;
     VFIODeviceOps *ops;
     VFIODeviceIO *io;
     unsigned int num_irqs;
@@ -239,6 +240,8 @@  void vfio_put_base_device(VFIODevice *vbasedev);
 void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index);
+void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq);
+void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq);
 int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                            int action, int fd, Error **errp);
 void vfio_region_write(void *opaque, hwaddr addr,
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 00605bd..bf67670 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -616,6 +616,7 @@  static void vfio_ccw_get_device(VFIOGroup *group, VFIOCCWDevice *vcdev,
     vcdev->vdev.dev = &vcdev->cdev.parent_obj.parent_obj;
     vcdev->vdev.io = &vfio_dev_io_ioctl;
     vcdev->vdev.use_regfds = false;
+    vcdev->vdev.can_mask_irq = false;
 
     return;
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index de64e53..0c1cb21 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -102,6 +102,32 @@  void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
     vbasedev->io->set_irqs(vbasedev, &irq_set);
 }
 
+void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+        .index = index,
+        .start = irq,
+        .count = 1,
+    };
+
+    vbasedev->io->set_irqs(vbasedev, &irq_set);
+}
+
+void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+        .index = index,
+        .start = irq,
+        .count = 1,
+    };
+
+    vbasedev->io->set_irqs(vbasedev, &irq_set);
+}
+
 static inline const char *action_to_str(int action)
 {
     switch (action) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 42e7c82..7b16f8f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -477,6 +477,7 @@  static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
 {
     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
     VFIOMSIVector *vector;
+    bool new_vec = false;
     int ret;
 
     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
@@ -490,6 +491,7 @@  static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
             error_report("vfio: Error: event_notifier_init failed");
         }
         vector->use = true;
+        new_vec = true;
         msix_vector_use(pdev, nr);
     }
 
@@ -516,6 +518,7 @@  static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                 kvm_irqchip_commit_route_changes(&vfio_route_change);
                 vfio_connect_kvm_msi_virq(vector);
             }
+            new_vec = true;
         }
     }
 
@@ -523,6 +526,8 @@  static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
      * We don't want to have the host allocate all possible MSI vectors
      * for a device if they're not in use, so we shutdown and incrementally
      * increase them as needed.
+     * Otherwise, unmask the vector if the vector is already setup (and we can
+     * do so) or send the fd if not.
      */
     if (vdev->nr_vectors < nr + 1) {
         vdev->nr_vectors = nr + 1;
@@ -533,6 +538,8 @@  static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                 error_report("vfio: failed to enable vectors, %d", ret);
             }
         }
+    } else if (vdev->vbasedev.can_mask_irq && !new_vec) {
+        vfio_unmask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
     } else {
         Error *err = NULL;
         int32_t fd;
@@ -574,6 +581,12 @@  static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 
     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
 
+    /* just mask vector if peer supports it */
+    if (vdev->vbasedev.can_mask_irq) {
+        vfio_mask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
+        return;
+    }
+
     /*
      * There are still old guests that mask and unmask vectors on every
      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
@@ -644,7 +657,7 @@  static void vfio_msix_enable(VFIOPCIDevice *vdev)
         if (ret) {
             error_report("vfio: failed to enable vectors, %d", ret);
         }
-    } else {
+    } else if (!vdev->vbasedev.can_mask_irq) {
         /*
          * Some communication channels between VF & PF or PF & fw rely on the
          * physical state of the device and expect that enabling MSI-X from the
@@ -660,6 +673,13 @@  static void vfio_msix_enable(VFIOPCIDevice *vdev)
          */
         vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
         vfio_msix_vector_release(&vdev->pdev, 0);
+    } else {
+        /*
+         * If we can use irq masking, send an invalid fd on vector 0
+         * to enable MSI-X without any vectors enabled.
+         */
+        vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, 0,
+                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
     }
 
     trace_vfio_msix_enable(vdev->vbasedev.name);
@@ -3040,6 +3060,7 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
     vbasedev->dev = DEVICE(vdev);
     vbasedev->io = &vfio_dev_io_ioctl;
     vbasedev->use_regfds = false;
+    vbasedev->can_mask_irq = false;
 
     tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
     len = readlink(tmp, group_path, sizeof(group_path));
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 8ddfcca..3387ec4 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -623,6 +623,7 @@  static void vfio_platform_realize(DeviceState *dev, Error **errp)
     vbasedev->ops = &vfio_platform_ops;
     vbasedev->io = &vfio_dev_io_ioctl;
     vbasedev->use_regfds = false;
+    vbasedev->can_mask_irq = false;
 
     qemu_mutex_init(&vdev->intp_mutex);
 
diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c
index 55ffe7f..bc1d01a 100644
--- a/hw/vfio/user-pci.c
+++ b/hw/vfio/user-pci.c
@@ -45,6 +45,62 @@  struct VFIOUserPCIDevice {
 };
 
 /*
+ * The server maintains the device's pending interrupts,
+ * via its MSIX table and PBA, so we treat these accesses
+ * like PCI config space and forward them.
+ */
+static uint64_t vfio_user_pba_read(void *opaque, hwaddr addr,
+                                   unsigned size)
+{
+    VFIOPCIDevice *vdev = opaque;
+    VFIORegion *region = &vdev->bars[vdev->msix->pba_bar].region;
+    uint64_t data;
+
+    /* server copy is what matters */
+    data = vfio_region_read(region, addr + vdev->msix->pba_offset, size);
+    return data;
+}
+
+static void vfio_user_pba_write(void *opaque, hwaddr addr,
+                                  uint64_t data, unsigned size)
+{
+    /* dropped */
+}
+
+static const MemoryRegionOps vfio_user_pba_ops = {
+    .read = vfio_user_pba_read,
+    .write = vfio_user_pba_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_user_msix_setup(VFIOPCIDevice *vdev)
+{
+    MemoryRegion *vfio_reg, *msix_reg, *pba_reg;
+
+    pba_reg = g_new0(MemoryRegion, 1);
+    vdev->msix->pba_region = pba_reg;
+
+    vfio_reg = vdev->bars[vdev->msix->pba_bar].mr;
+    msix_reg = &vdev->pdev.msix_pba_mmio;
+    memory_region_init_io(pba_reg, OBJECT(vdev), &vfio_user_pba_ops, vdev,
+                          "VFIO MSIX PBA", int128_get64(msix_reg->size));
+    memory_region_add_subregion_overlap(vfio_reg, vdev->msix->pba_offset,
+                                        pba_reg, 1);
+}
+
+static void vfio_user_msix_teardown(VFIOPCIDevice *vdev)
+{
+    MemoryRegion *mr, *sub;
+
+    mr = vdev->bars[vdev->msix->pba_bar].mr;
+    sub = vdev->msix->pba_region;
+    memory_region_del_subregion(mr, sub);
+
+    g_free(vdev->msix->pba_region);
+    vdev->msix->pba_region = NULL;
+}
+
+/*
  * Incoming request message callback.
  *
  * Runs off main loop, so BQL held.
@@ -122,6 +178,7 @@  static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
     vbasedev->dev = DEVICE(vdev);
     vbasedev->io = &vfio_dev_io_sock;
     vbasedev->use_regfds = true;
+    vbasedev->can_mask_irq = true;
 
     ret = vfio_user_get_device(vbasedev, errp);
     if (ret) {
@@ -159,6 +216,9 @@  static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
     if (ret) {
         goto out_teardown;
     }
+    if (vdev->msix != NULL) {
+        vfio_user_msix_setup(vdev);
+    }
 
     ret = vfio_interrupt_setup(vdev, errp);
     if (ret) {
@@ -186,6 +246,10 @@  static void vfio_user_instance_finalize(Object *obj)
     g_free(vdev->emulated_config_bits);
     g_free(vdev->rom);
 
+    if (vdev->msix != NULL) {
+        vfio_user_msix_teardown(vdev);
+    }
+
     vfio_put_device(vdev);
 
     if (vbasedev->proxy != NULL) {