Message ID: 0ad69e4ea3d1f37246ce5e32ba833d6c871e99b1.1675228037.git.john.g.johnson@oracle.com
State: New, archived
Series: vfio-user client
On Wed, 1 Feb 2023 21:55:51 -0800
John Johnson <john.g.johnson@oracle.com> wrote:

> Server holds the device's current pending state
> Use irq masking commands in socket case
>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
>  hw/vfio/pci.h                 |  1 +
>  include/hw/vfio/vfio-common.h |  3 ++
>  hw/vfio/ccw.c                 |  1 +
>  hw/vfio/common.c              | 26 ++++++++++++++++++
>  hw/vfio/pci.c                 | 23 +++++++++++++++-
>  hw/vfio/platform.c            |  1 +
>  hw/vfio/user-pci.c            | 64 +++++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 118 insertions(+), 1 deletion(-)
>
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 4f70664..d3e5d5f 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
>      uint32_t table_offset;
>      uint32_t pba_offset;
>      unsigned long *pending;
> +    MemoryRegion *pba_region;
>  } VFIOMSIXInfo;
>
>  /*
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index bbc4b15..2c58d7d 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -143,6 +143,7 @@ typedef struct VFIODevice {
>      bool ram_block_discard_allowed;
>      bool enable_migration;
>      bool use_regfds;
> +    bool can_mask_irq;

How can this be a device level property? vfio-pci (kernel) supports
masking of INTx. It seems like there needs to be a per-index info or
probe here to support this.

>      VFIODeviceOps *ops;
>      VFIODeviceIO *io;
>      unsigned int num_irqs;
> @@ -239,6 +240,8 @@ void vfio_put_base_device(VFIODevice *vbasedev);
>  void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
>  void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
>  void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index);
> +void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq);
> +void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq);
>  int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
>                             int action, int fd, Error **errp);
>  void vfio_region_write(void *opaque, hwaddr addr,
> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
> index 00605bd..bf67670 100644
> --- a/hw/vfio/ccw.c
> +++ b/hw/vfio/ccw.c
> @@ -616,6 +616,7 @@ static void vfio_ccw_get_device(VFIOGroup *group, VFIOCCWDevice *vcdev,
>      vcdev->vdev.dev = &vcdev->cdev.parent_obj.parent_obj;
>      vcdev->vdev.io = &vfio_dev_io_ioctl;
>      vcdev->vdev.use_regfds = false;
> +    vcdev->vdev.can_mask_irq = false;
>
>      return;
>
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index de64e53..0c1cb21 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -102,6 +102,32 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
>      vbasedev->io->set_irqs(vbasedev, &irq_set);
>  }
>
> +void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
> +        .index = index,
> +        .start = irq,
> +        .count = 1,
> +    };
> +
> +    vbasedev->io->set_irqs(vbasedev, &irq_set);
> +}
> +
> +void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq)
> +{
> +    struct vfio_irq_set irq_set = {
> +        .argsz = sizeof(irq_set),
> +        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
> +        .index = index,
> +        .start = irq,
> +        .count = 1,
> +    };
> +
> +    vbasedev->io->set_irqs(vbasedev, &irq_set);
> +}
> +
>  static inline const char *action_to_str(int action)
>  {
>      switch (action) {
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 42e7c82..7b16f8f 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -477,6 +477,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>  {
>      VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
>      VFIOMSIVector *vector;
> +    bool new_vec = false;
>      int ret;
>
>      trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
> @@ -490,6 +491,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>              error_report("vfio: Error: event_notifier_init failed");
>          }
>          vector->use = true;
> +        new_vec = true;
>          msix_vector_use(pdev, nr);
>      }
>
> @@ -516,6 +518,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>              kvm_irqchip_commit_route_changes(&vfio_route_change);
>              vfio_connect_kvm_msi_virq(vector);
>          }
> +        new_vec = true;

This looks wrong to me, can't we get here when a previously used vector
is unmasked?

>      }
>  }
>
> @@ -523,6 +526,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>       * We don't want to have the host allocate all possible MSI vectors
>       * for a device if they're not in use, so we shutdown and incrementally
>       * increase them as needed.
> +     * Otherwise, unmask the vector if the vector is already setup (and we can
> +     * do so) or send the fd if not.
>       */
>      if (vdev->nr_vectors < nr + 1) {
>          vdev->nr_vectors = nr + 1;
> @@ -533,6 +538,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>                  error_report("vfio: failed to enable vectors, %d", ret);
>              }
>          }
> +    } else if (vdev->vbasedev.can_mask_irq && !new_vec) {
> +        vfio_unmask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);

The correctness of @new_vec is doubtful here, but masking support must
be in reference to a specific IRQ index. INTx masking is implicit, but
we'd probably need something on the VFIOPCIDevice to indicate MSI and
MSIX masking support separately.

>      } else {
>          Error *err = NULL;
>          int32_t fd;
> @@ -574,6 +581,12 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
>
>      trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
>
> +    /* just mask vector if peer supports it */
> +    if (vdev->vbasedev.can_mask_irq) {
> +        vfio_mask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
> +        return;
> +    }
> +
>      /*
>       * There are still old guests that mask and unmask vectors on every
>       * interrupt. If we're using QEMU bypass with a KVM irqfd, leave all of
> @@ -644,7 +657,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>          if (ret) {
>              error_report("vfio: failed to enable vectors, %d", ret);
>          }
> -    } else {
> +    } else if (!vdev->vbasedev.can_mask_irq) {
>          /*
>           * Some communication channels between VF & PF or PF & fw rely on the
>           * physical state of the device and expect that enabling MSI-X from the
> @@ -660,6 +673,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>           */
>          vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
>          vfio_msix_vector_release(&vdev->pdev, 0);
> +    } else {
> +        /*
> +         * If we can use irq masking, send an invalid fd on vector 0
> +         * to enable MSI-X without any vectors enabled.
> +         */
> +        vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, 0,
> +                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);

What does this have to do with masking? I'm not entirely sure this
doesn't also work on vfio-pci (kernel).

In general this feels like a special cased version of MSI/X masking
support rather than actually reworking things to support underlying
masking support for these indexes. Thanks,

Alex
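The INTx masking Alex points to already flows through the same
VFIO_DEVICE_SET_IRQS interface these patches wrap. A minimal sketch against
the raw kernel uAPI, assuming a device fd obtained via
VFIO_GROUP_GET_DEVICE_FD and eliding error handling:

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Mask or unmask the (single) INTx interrupt of a vfio-pci device. */
static int vfio_intx_set_masked(int device_fd, bool masked)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE |
                 (masked ? VFIO_IRQ_SET_ACTION_MASK :
                           VFIO_IRQ_SET_ACTION_UNMASK),
        .index = VFIO_PCI_INTX_IRQ_INDEX,
        .start = 0,
        .count = 1,
    };

    return ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}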
> On Feb 6, 2023, at 12:33 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
>
> On Wed, 1 Feb 2023 21:55:51 -0800
> John Johnson <john.g.johnson@oracle.com> wrote:
>
>> Server holds the device's current pending state
>> Use irq masking commands in socket case
>>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>

[...]

>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index bbc4b15..2c58d7d 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -143,6 +143,7 @@ typedef struct VFIODevice {
>>      bool ram_block_discard_allowed;
>>      bool enable_migration;
>>      bool use_regfds;
>> +    bool can_mask_irq;
>
> How can this be a device level property? vfio-pci (kernel) supports
> masking of INTx. It seems like there needs to be a per-index info or
> probe here to support this.
>

It is only used for MSIX masking. MSI masking is done with config
space stores, and vfio-kernel and vfio-user handle them the same by
forwarding the config space writes to the server so it can mask
interrupts.

MSIX is trickier because the mask bits are in a memory region.
vfio-kernel doesn't support SET_IRQS on MSIX vectors, so if the host
doesn't allow client mapping of the MSIX table to do the masking, vfio
switches a masked vector's event FD destination from KVM to QEMU, then
uses the QEMU PCI emulation to mask the vector.

vfio-user has to use messages to mask MSIX vectors, since there is no
host config space to map. I originally forwarded the MSIX table writes
to the server to do the masking, but the feedback on the vfio-user
server changes was to add SET_IRQS support.

I can make this a PCI MSIX-specific bool if that helps.

[...]

>> @@ -516,6 +518,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>              kvm_irqchip_commit_route_changes(&vfio_route_change);
>>              vfio_connect_kvm_msi_virq(vector);
>>          }
>> +        new_vec = true;
>
> This looks wrong to me, can't we get here when a previously used vector
> is unmasked?
>

It's for the case where we upgrade the vector from using an event FD
that terminates in QEMU to one that terminates in KVM. In that case we
want to send the FD again, not just unmask the current FD.

[...]

>> @@ -533,6 +538,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
>>                  error_report("vfio: failed to enable vectors, %d", ret);
>>              }
>>          }
>> +    } else if (vdev->vbasedev.can_mask_irq && !new_vec) {
>> +        vfio_unmask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
>
> The correctness of @new_vec is doubtful here, but masking support must
> be in reference to a specific IRQ index. INTx masking is implicit, but
> we'd probably need something on the VFIOPCIDevice to indicate MSI and
> MSIX masking support separately.
>

MSI masking uses config writes, so this is only for MSIX.

[...]

>> @@ -660,6 +673,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
>>           */
>>          vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
>>          vfio_msix_vector_release(&vdev->pdev, 0);
>> +    } else {
>> +        /*
>> +         * If we can use irq masking, send an invalid fd on vector 0
>> +         * to enable MSI-X without any vectors enabled.
>> +         */
>> +        vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, 0,
>> +                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
>
> What does this have to do with masking? I'm not entirely sure this
> doesn't also work on vfio-pci (kernel).
>
> In general this feels like a special cased version of MSI/X masking
> support rather than actually reworking things to support underlying
> masking support for these indexes. Thanks,
>

The comment in the !can_mask_irq clause sounded like vfio-kernel needs
a use/release cycle, so I didn't want to change its behavior.

JJ
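To make the config-space path John describes concrete: with
PCI_MSI_FLAGS_MASKBIT set, per-vector MSI masking is a plain config write to
the capability's Mask Bits register, which both vfio-kernel and vfio-user
already forward to the device. A sketch using QEMU's config-space accessors
and the standard pci_regs.h offsets; msi_vector_set_masked is a hypothetical
helper for illustration, not part of this series:

#include "qemu/osdep.h"
#include "hw/pci/pci.h"
#include "standard-headers/linux/pci_regs.h"

/*
 * Vector "nr" is masked by setting bit "nr" of the Mask Bits register
 * in the MSI capability at config offset "cap".  The register offset
 * depends on whether the capability uses 64-bit addressing.
 */
static void msi_vector_set_masked(uint8_t *config, uint16_t cap,
                                  unsigned int nr, bool masked)
{
    uint16_t flags = pci_get_word(config + cap + PCI_MSI_FLAGS);
    unsigned int off = cap + ((flags & PCI_MSI_FLAGS_64BIT) ?
                              PCI_MSI_MASK_64 : PCI_MSI_MASK_32);
    uint32_t mask = pci_get_long(config + off);

    mask = masked ? (mask | (1u << nr)) : (mask & ~(1u << nr));
    pci_set_long(config + off, mask);
}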
On Wed, 8 Feb 2023 06:38:30 +0000
John Johnson <john.g.johnson@oracle.com> wrote:

>> On Feb 6, 2023, at 12:33 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
>>
>> On Wed, 1 Feb 2023 21:55:51 -0800
>> John Johnson <john.g.johnson@oracle.com> wrote:

[...]

>>> @@ -143,6 +143,7 @@ typedef struct VFIODevice {
>>>      bool ram_block_discard_allowed;
>>>      bool enable_migration;
>>>      bool use_regfds;
>>> +    bool can_mask_irq;
>>
>> How can this be a device level property? vfio-pci (kernel) supports
>> masking of INTx. It seems like there needs to be a per-index info or
>> probe here to support this.
>
> It is only used for MSIX masking. MSI masking is done with config
> space stores, and vfio-kernel and vfio-user handle them the same by
> forwarding the config space writes to the server so it can mask
> interrupts.

I suppose this does slip through on vfio-kernel, though support via
SET_IRQS would really be the uAPI mechanism we'd expect to use for
masking, just as it is for enabling/disabling MSI. MSI is not well
used enough to have flushed that out and it seems harmless, but is not
the intended API.

> MSIX is trickier because the mask bits are in a memory region.
> vfio-kernel doesn't support SET_IRQS on MSIX vectors, so if the host
> doesn't allow client mapping of the MSIX table to do the masking, vfio
> switches a masked vector's event FD destination from KVM to QEMU, then
> uses the QEMU PCI emulation to mask the vector.

Lack of support for MSIX ACTION_[UN]MASK is not an API limitation, it's
an implementation issue in the kernel. Same for MSI. I believe this is
resolved now, that there are mask/unmask APIs available within the
kernel, as well as mechanisms to avoid the NORESIZE behavior, so the
intention would be to implement that support, along with a mechanism
for the user to know the support is present. We already have that for
NORESIZE via IRQ_INFO, so I suspect the right answer might be to add a
new VFIO_IRQ_INFO_MSI_MASK to advertise masking support.

> vfio-user has to use messages to mask MSIX vectors, since there is no
> host config space to map. I originally forwarded the MSIX table writes
> to the server to do the masking, but the feedback on the vfio-user
> server changes was to add SET_IRQS support.

Which is what I'm describing above: QEMU should know via the VFIO uAPI
whether MSI/X masking is supported, and use that to configure the code
to make use of it rather than some inferred value based on the
interface type. Thanks,

Alex
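For reference, the vfio_irq_info flags currently defined in linux/vfio.h end
at VFIO_IRQ_INFO_NORESIZE (1 << 3); the advertisement Alex suggests would be
a new bit along these lines (hypothetical, not in the uAPI today):

/* Hypothetical sketch of the flag Alex suggests adding to linux/vfio.h */
#define VFIO_IRQ_INFO_MSI_MASK      (1 << 4)  /* per-vector MASK/UNMASK honored */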
> On Feb 8, 2023, at 1:30 PM, Alex Williamson <alex.williamson@redhat.com> wrote:
>
> On Wed, 8 Feb 2023 06:38:30 +0000
> John Johnson <john.g.johnson@oracle.com> wrote:
>
>>> On Feb 6, 2023, at 12:33 PM, Alex Williamson <alex.williamson@redhat.com> wrote:

[...]

>>>> @@ -143,6 +143,7 @@ typedef struct VFIODevice {
>>>>      bool ram_block_discard_allowed;
>>>>      bool enable_migration;
>>>>      bool use_regfds;
>>>> +    bool can_mask_irq;
>>>
>>> How can this be a device level property? vfio-pci (kernel) supports
>>> masking of INTx. It seems like there needs to be a per-index info or
>>> probe here to support this.
>>
>> It is only used for MSIX masking. MSI masking is done with config
>> space stores, and vfio-kernel and vfio-user handle them the same by
>> forwarding the config space writes to the server so it can mask
>> interrupts.
>
> I suppose this does slip through on vfio-kernel, though support via
> SET_IRQS would really be the uAPI mechanism we'd expect to use for
> masking, just as it is for enabling/disabling MSI. MSI is not well
> used enough to have flushed that out and it seems harmless, but is not
> the intended API.
>
>> MSIX is trickier because the mask bits are in a memory region.
>> vfio-kernel doesn't support SET_IRQS on MSIX vectors, so if the host
>> doesn't allow client mapping of the MSIX table to do the masking, vfio
>> switches a masked vector's event FD destination from KVM to QEMU, then
>> uses the QEMU PCI emulation to mask the vector.
>
> Lack of support for MSIX ACTION_[UN]MASK is not an API limitation, it's
> an implementation issue in the kernel. Same for MSI. I believe this is
> resolved now, that there are mask/unmask APIs available within the
> kernel, as well as mechanisms to avoid the NORESIZE behavior, so the
> intention would be to implement that support, along with a mechanism
> for the user to know the support is present. We already have that for
> NORESIZE via IRQ_INFO, so I suspect the right answer might be to add a
> new VFIO_IRQ_INFO_MSI_MASK to advertise masking support.
>

I looked at the linux-next vfio kernel driver, and it doesn't seem to
support MSI/X masking.

>> vfio-user has to use messages to mask MSIX vectors, since there is no
>> host config space to map. I originally forwarded the MSIX table writes
>> to the server to do the masking, but the feedback on the vfio-user
>> server changes was to add SET_IRQS support.
>
> Which is what I'm describing above: QEMU should know via the VFIO uAPI
> whether MSI/X masking is supported, and use that to configure the code
> to make use of it rather than some inferred value based on the
> interface type. Thanks,
>

The kernel doesn't advertise it as working either, so I think I can key
it off the presence of VFIO_IRQ_INFO_MASKABLE in irq_info.

JJ
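The probe John describes maps naturally onto VFIO_DEVICE_GET_IRQ_INFO. A
sketch keying support off VFIO_IRQ_INFO_MASKABLE for the MSI-X index, shown
here with the raw kernel ioctl; a vfio-user client would issue the equivalent
get-irq-info message over its socket instead (untested sketch, minimal error
handling):

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Returns true if the kernel/server advertises per-vector MSI-X masking. */
static bool vfio_msix_index_maskable(int device_fd)
{
    struct vfio_irq_info irq_info = {
        .argsz = sizeof(irq_info),
        .index = VFIO_PCI_MSIX_IRQ_INDEX,
    };

    if (ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0) {
        return false;
    }
    return !!(irq_info.flags & VFIO_IRQ_INFO_MASKABLE);
}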
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 4f70664..d3e5d5f 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
     uint32_t table_offset;
     uint32_t pba_offset;
     unsigned long *pending;
+    MemoryRegion *pba_region;
 } VFIOMSIXInfo;
 
 /*
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index bbc4b15..2c58d7d 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -143,6 +143,7 @@ typedef struct VFIODevice {
     bool ram_block_discard_allowed;
     bool enable_migration;
     bool use_regfds;
+    bool can_mask_irq;
     VFIODeviceOps *ops;
     VFIODeviceIO *io;
     unsigned int num_irqs;
@@ -239,6 +240,8 @@ void vfio_put_base_device(VFIODevice *vbasedev);
 void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index);
+void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq);
+void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq);
 int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                            int action, int fd, Error **errp);
 void vfio_region_write(void *opaque, hwaddr addr,
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 00605bd..bf67670 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -616,6 +616,7 @@ static void vfio_ccw_get_device(VFIOGroup *group, VFIOCCWDevice *vcdev,
     vcdev->vdev.dev = &vcdev->cdev.parent_obj.parent_obj;
     vcdev->vdev.io = &vfio_dev_io_ioctl;
     vcdev->vdev.use_regfds = false;
+    vcdev->vdev.can_mask_irq = false;
 
     return;
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index de64e53..0c1cb21 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -102,6 +102,32 @@ void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
     vbasedev->io->set_irqs(vbasedev, &irq_set);
 }
 
+void vfio_mask_single_irq(VFIODevice *vbasedev, int index, int irq)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
+        .index = index,
+        .start = irq,
+        .count = 1,
+    };
+
+    vbasedev->io->set_irqs(vbasedev, &irq_set);
+}
+
+void vfio_unmask_single_irq(VFIODevice *vbasedev, int index, int irq)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+        .index = index,
+        .start = irq,
+        .count = 1,
+    };
+
+    vbasedev->io->set_irqs(vbasedev, &irq_set);
+}
+
 static inline const char *action_to_str(int action)
 {
     switch (action) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 42e7c82..7b16f8f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -477,6 +477,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
 {
     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
     VFIOMSIVector *vector;
+    bool new_vec = false;
     int ret;
 
     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
@@ -490,6 +491,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
             error_report("vfio: Error: event_notifier_init failed");
         }
         vector->use = true;
+        new_vec = true;
         msix_vector_use(pdev, nr);
     }
 
@@ -516,6 +518,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
             kvm_irqchip_commit_route_changes(&vfio_route_change);
             vfio_connect_kvm_msi_virq(vector);
         }
+        new_vec = true;
     }
 }
 
@@ -523,6 +526,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
      * We don't want to have the host allocate all possible MSI vectors
      * for a device if they're not in use, so we shutdown and incrementally
      * increase them as needed.
+     * Otherwise, unmask the vector if the vector is already setup (and we can
+     * do so) or send the fd if not.
      */
     if (vdev->nr_vectors < nr + 1) {
         vdev->nr_vectors = nr + 1;
@@ -533,6 +538,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
                 error_report("vfio: failed to enable vectors, %d", ret);
             }
         }
+    } else if (vdev->vbasedev.can_mask_irq && !new_vec) {
+        vfio_unmask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
     } else {
         Error *err = NULL;
         int32_t fd;
@@ -574,6 +581,12 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 
     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
 
+    /* just mask vector if peer supports it */
+    if (vdev->vbasedev.can_mask_irq) {
+        vfio_mask_single_irq(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr);
+        return;
+    }
+
     /*
      * There are still old guests that mask and unmask vectors on every
      * interrupt. If we're using QEMU bypass with a KVM irqfd, leave all of
@@ -644,7 +657,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
         if (ret) {
             error_report("vfio: failed to enable vectors, %d", ret);
         }
-    } else {
+    } else if (!vdev->vbasedev.can_mask_irq) {
         /*
          * Some communication channels between VF & PF or PF & fw rely on the
          * physical state of the device and expect that enabling MSI-X from the
@@ -660,6 +673,13 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
          */
         vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
         vfio_msix_vector_release(&vdev->pdev, 0);
+    } else {
+        /*
+         * If we can use irq masking, send an invalid fd on vector 0
+         * to enable MSI-X without any vectors enabled.
+         */
+        vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, 0,
+                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, NULL);
     }
 
     trace_vfio_msix_enable(vdev->vbasedev.name);
@@ -3040,6 +3060,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
     vbasedev->dev = DEVICE(vdev);
     vbasedev->io = &vfio_dev_io_ioctl;
     vbasedev->use_regfds = false;
+    vbasedev->can_mask_irq = false;
 
     tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
     len = readlink(tmp, group_path, sizeof(group_path));
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 8ddfcca..3387ec4 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -623,6 +623,7 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp)
     vbasedev->ops = &vfio_platform_ops;
     vbasedev->io = &vfio_dev_io_ioctl;
     vbasedev->use_regfds = false;
+    vbasedev->can_mask_irq = false;
 
     qemu_mutex_init(&vdev->intp_mutex);
 
diff --git a/hw/vfio/user-pci.c b/hw/vfio/user-pci.c
index 55ffe7f..bc1d01a 100644
--- a/hw/vfio/user-pci.c
+++ b/hw/vfio/user-pci.c
@@ -45,6 +45,62 @@ struct VFIOUserPCIDevice {
 };
 
 /*
+ * The server maintains the device's pending interrupts,
+ * via its MSIX table and PBA, so we treat these accesses
+ * like PCI config space and forward them.
+ */
+static uint64_t vfio_user_pba_read(void *opaque, hwaddr addr,
+                                   unsigned size)
+{
+    VFIOPCIDevice *vdev = opaque;
+    VFIORegion *region = &vdev->bars[vdev->msix->pba_bar].region;
+    uint64_t data;
+
+    /* server copy is what matters */
+    data = vfio_region_read(region, addr + vdev->msix->pba_offset, size);
+    return data;
+}
+
+static void vfio_user_pba_write(void *opaque, hwaddr addr,
+                                uint64_t data, unsigned size)
+{
+    /* dropped */
+}
+
+static const MemoryRegionOps vfio_user_pba_ops = {
+    .read = vfio_user_pba_read,
+    .write = vfio_user_pba_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_user_msix_setup(VFIOPCIDevice *vdev)
+{
+    MemoryRegion *vfio_reg, *msix_reg, *pba_reg;
+
+    pba_reg = g_new0(MemoryRegion, 1);
+    vdev->msix->pba_region = pba_reg;
+
+    vfio_reg = vdev->bars[vdev->msix->pba_bar].mr;
+    msix_reg = &vdev->pdev.msix_pba_mmio;
+    memory_region_init_io(pba_reg, OBJECT(vdev), &vfio_user_pba_ops, vdev,
+                          "VFIO MSIX PBA", int128_get64(msix_reg->size));
+    memory_region_add_subregion_overlap(vfio_reg, vdev->msix->pba_offset,
+                                        pba_reg, 1);
+}
+
+static void vfio_user_msix_teardown(VFIOPCIDevice *vdev)
+{
+    MemoryRegion *mr, *sub;
+
+    mr = vdev->bars[vdev->msix->pba_bar].mr;
+    sub = vdev->msix->pba_region;
+    memory_region_del_subregion(mr, sub);
+
+    g_free(vdev->msix->pba_region);
+    vdev->msix->pba_region = NULL;
+}
+
+/*
  * Incoming request message callback.
  *
  * Runs off main loop, so BQL held.
@@ -122,6 +178,7 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
     vbasedev->dev = DEVICE(vdev);
     vbasedev->io = &vfio_dev_io_sock;
     vbasedev->use_regfds = true;
+    vbasedev->can_mask_irq = true;
 
     ret = vfio_user_get_device(vbasedev, errp);
     if (ret) {
@@ -159,6 +216,9 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
     if (ret) {
         goto out_teardown;
     }
+    if (vdev->msix != NULL) {
+        vfio_user_msix_setup(vdev);
+    }
 
     ret = vfio_interrupt_setup(vdev, errp);
     if (ret) {
@@ -186,6 +246,10 @@ static void vfio_user_instance_finalize(Object *obj)
     g_free(vdev->emulated_config_bits);
     g_free(vdev->rom);
 
+    if (vdev->msix != NULL) {
+        vfio_user_msix_teardown(vdev);
+    }
+
     vfio_put_device(vdev);
 
     if (vbasedev->proxy != NULL) {
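As a usage note on the forwarded PBA region: the PCI spec fixes the bit
layout, so the pending state of vector nr lives at bit (nr % 64) of the
64-bit PBA entry at byte offset (nr / 64) * 8. A sketch of a hypothetical
helper layered on the vfio_user_pba_read path above, for illustration only:

/* Test whether vector "nr" is pending in the server's PBA copy. */
static bool vfio_user_msix_is_pending(VFIOPCIDevice *vdev, unsigned int nr)
{
    uint64_t entry = vfio_user_pba_read(vdev, (nr / 64) * 8, 8);

    return !!((entry >> (nr % 64)) & 1);
}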