Message ID | 1669167756-196788-1-git-send-email-chenxiang66@hisilicon.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v2] vfio/pci: Verify each MSI vector to avoid invalid MSI vectors | expand |
On Wed, 23 Nov 2022 01:42:36 +0000, chenxiang <chenxiang66@hisilicon.com> wrote: > > From: Xiang Chen <chenxiang66@hisilicon.com> > > Currently the number of MSI vectors comes from register PCI_MSI_FLAGS > which should be power-of-2 in qemu, in some scenaries it is not the same as > the number that driver requires in guest, for example, a PCI driver wants > to allocate 6 MSI vecotrs in guest, but as the limitation, it will allocate > 8 MSI vectors. So it requires 8 MSI vectors in qemu while the driver in > guest only wants to allocate 6 MSI vectors. > > When GICv4.1 is enabled, it iterates over all possible MSIs and enable the > forwarding while the guest has only created some of mappings in the virtual > ITS, so some calls fail. The exception print is as following: > vfio-pci 0000:3a:00.1: irq bypass producer (token 000000008f08224d) registration > fails:66311 > > To avoid the issue, verify each MSI vector, skip some operations such as > request_irq() and irq_bypass_register_producer() for those invalid MSI vectors. 
> > Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com> > --- > I reported the issue at the link: > https://lkml.kernel.org/lkml/87cze9lcut.wl-maz@kernel.org/T/ > > Change Log: > v1 -> v2: > Verify each MSI vector in kernel instead of adding systemcall according to > Mar's suggestion > --- > arch/arm64/kvm/vgic/vgic-irqfd.c | 13 +++++++++++++ > arch/arm64/kvm/vgic/vgic-its.c | 36 ++++++++++++++++++++++++++++++++++++ > arch/arm64/kvm/vgic/vgic.h | 1 + > drivers/vfio/pci/vfio_pci_intrs.c | 33 +++++++++++++++++++++++++++++++++ > include/linux/kvm_host.h | 2 ++ > 5 files changed, 85 insertions(+) > > diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c > index 475059b..71f6af57 100644 > --- a/arch/arm64/kvm/vgic/vgic-irqfd.c > +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c > @@ -98,6 +98,19 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, > return vgic_its_inject_msi(kvm, &msi); > } > > +int kvm_verify_msi(struct kvm *kvm, > + struct kvm_kernel_irq_routing_entry *irq_entry) > +{ > + struct kvm_msi msi; > + > + if (!vgic_has_its(kvm)) > + return -ENODEV; > + > + kvm_populate_msi(irq_entry, &msi); > + > + return vgic_its_verify_msi(kvm, &msi); > +} > + > /** > * kvm_arch_set_irq_inatomic: fast-path for irqfd injection > */ > diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c > index 94a666d..8312a4a 100644 > --- a/arch/arm64/kvm/vgic/vgic-its.c > +++ b/arch/arm64/kvm/vgic/vgic-its.c > @@ -767,6 +767,42 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) > return 0; > } > > +int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi) > +{ > + struct vgic_its *its; > + struct its_ite *ite; > + struct kvm_vcpu *vcpu; > + int ret = 0; > + > + if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) > + return -EINVAL; > + > + if (!vgic_has_its(kvm)) > + return -ENODEV; > + > + its = vgic_msi_to_its(kvm, msi); > + if (IS_ERR(its)) > + return PTR_ERR(its); > + > + 
mutex_lock(&its->its_lock); > + if (!its->enabled) { > + ret = -EBUSY; > + goto unlock; > + } > + ite = find_ite(its, msi->devid, msi->data); > + if (!ite || !its_is_collection_mapped(ite->collection)) { > + ret = E_ITS_INT_UNMAPPED_INTERRUPT; > + goto unlock; > + } > + > + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); > + if (!vcpu) > + ret = E_ITS_INT_UNMAPPED_INTERRUPT; I'm sorry, but what does this mean to the caller? This should never leak outside of the ITS code. > +unlock: > + mutex_unlock(&its->its_lock); > + return ret; > +} > + > /* > * Queries the KVM IO bus framework to get the ITS pointer from the given > * doorbell address. > diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h > index 0c8da72..d452150 100644 > --- a/arch/arm64/kvm/vgic/vgic.h > +++ b/arch/arm64/kvm/vgic/vgic.h > @@ -240,6 +240,7 @@ int kvm_vgic_register_its_device(void); > void vgic_enable_lpis(struct kvm_vcpu *vcpu); > void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); > int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); > +int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi); > int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); > int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, > int offset, u32 *val); > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c > index 40c3d7c..3027805 100644 > --- a/drivers/vfio/pci/vfio_pci_intrs.c > +++ b/drivers/vfio/pci/vfio_pci_intrs.c > @@ -19,6 +19,7 @@ > #include <linux/vfio.h> > #include <linux/wait.h> > #include <linux/slab.h> > +#include <linux/kvm_irqfd.h> > > #include "vfio_pci_priv.h" > > @@ -315,6 +316,28 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi > return 0; > } > > +static int vfio_pci_verify_msi_entry(struct vfio_pci_core_device *vdev, > + struct eventfd_ctx *trigger) > +{ > + struct kvm *kvm = vdev->vdev.kvm; > + struct kvm_kernel_irqfd *tmp; > + struct 
kvm_kernel_irq_routing_entry irq_entry; > + int ret = -ENODEV; > + > + spin_lock_irq(&kvm->irqfds.lock); > + list_for_each_entry(tmp, &kvm->irqfds.items, list) { > + if (trigger == tmp->eventfd) { > + ret = 0; > + break; > + } > + } > + spin_unlock_irq(&kvm->irqfds.lock); > + if (ret) > + return ret; > + irq_entry = tmp->irq_entry; > + return kvm_verify_msi(kvm, &irq_entry); How does this work on !arm64? Why do we need an on-stack version of tmp->irq_entry? > +} > + > static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > int vector, int fd, bool msix) > { > @@ -355,6 +378,16 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > return PTR_ERR(trigger); > } > > + if (!msix) { > + ret = vfio_pci_verify_msi_entry(vdev, trigger); > + if (ret) { > + kfree(vdev->ctx[vector].name); > + eventfd_ctx_put(trigger); > + if (ret > 0) > + ret = 0; > + return ret; > + } > + } Honestly, the whole things seems really complicated to avoid something that is only a harmless warning . How about just toning down the message instead? M.
On Wed, 23 Nov 2022 12:08:05 +0000 Marc Zyngier <maz@kernel.org> wrote: > On Wed, 23 Nov 2022 01:42:36 +0000, > chenxiang <chenxiang66@hisilicon.com> wrote: > > > > From: Xiang Chen <chenxiang66@hisilicon.com> > > > > Currently the number of MSI vectors comes from register PCI_MSI_FLAGS > > which should be power-of-2 in qemu, in some scenaries it is not the same as > > the number that driver requires in guest, for example, a PCI driver wants > > to allocate 6 MSI vecotrs in guest, but as the limitation, it will allocate > > 8 MSI vectors. So it requires 8 MSI vectors in qemu while the driver in > > guest only wants to allocate 6 MSI vectors. > > > > When GICv4.1 is enabled, it iterates over all possible MSIs and enable the > > forwarding while the guest has only created some of mappings in the virtual > > ITS, so some calls fail. The exception print is as following: > > vfio-pci 0000:3a:00.1: irq bypass producer (token 000000008f08224d) registration > > fails:66311 > > > > To avoid the issue, verify each MSI vector, skip some operations such as > > request_irq() and irq_bypass_register_producer() for those invalid MSI vectors. 
> > > > Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com> > > --- > > I reported the issue at the link: > > https://lkml.kernel.org/lkml/87cze9lcut.wl-maz@kernel.org/T/ > > > > Change Log: > > v1 -> v2: > > Verify each MSI vector in kernel instead of adding systemcall according to > > Mar's suggestion > > --- > > arch/arm64/kvm/vgic/vgic-irqfd.c | 13 +++++++++++++ > > arch/arm64/kvm/vgic/vgic-its.c | 36 ++++++++++++++++++++++++++++++++++++ > > arch/arm64/kvm/vgic/vgic.h | 1 + > > drivers/vfio/pci/vfio_pci_intrs.c | 33 +++++++++++++++++++++++++++++++++ > > include/linux/kvm_host.h | 2 ++ > > 5 files changed, 85 insertions(+) > > > > diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c > > index 475059b..71f6af57 100644 > > --- a/arch/arm64/kvm/vgic/vgic-irqfd.c > > +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c > > @@ -98,6 +98,19 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, > > return vgic_its_inject_msi(kvm, &msi); > > } > > > > +int kvm_verify_msi(struct kvm *kvm, > > + struct kvm_kernel_irq_routing_entry *irq_entry) > > +{ > > + struct kvm_msi msi; > > + > > + if (!vgic_has_its(kvm)) > > + return -ENODEV; > > + > > + kvm_populate_msi(irq_entry, &msi); > > + > > + return vgic_its_verify_msi(kvm, &msi); > > +} > > + > > /** > > * kvm_arch_set_irq_inatomic: fast-path for irqfd injection > > */ > > diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c > > index 94a666d..8312a4a 100644 > > --- a/arch/arm64/kvm/vgic/vgic-its.c > > +++ b/arch/arm64/kvm/vgic/vgic-its.c > > @@ -767,6 +767,42 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) > > return 0; > > } > > > > +int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi) > > +{ > > + struct vgic_its *its; > > + struct its_ite *ite; > > + struct kvm_vcpu *vcpu; > > + int ret = 0; > > + > > + if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) > > + return -EINVAL; > > + > > + if (!vgic_has_its(kvm)) > > + 
return -ENODEV; > > + > > + its = vgic_msi_to_its(kvm, msi); > > + if (IS_ERR(its)) > > + return PTR_ERR(its); > > + > > + mutex_lock(&its->its_lock); > > + if (!its->enabled) { > > + ret = -EBUSY; > > + goto unlock; > > + } > > + ite = find_ite(its, msi->devid, msi->data); > > + if (!ite || !its_is_collection_mapped(ite->collection)) { > > + ret = E_ITS_INT_UNMAPPED_INTERRUPT; > > + goto unlock; > > + } > > + > > + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); > > + if (!vcpu) > > + ret = E_ITS_INT_UNMAPPED_INTERRUPT; > > I'm sorry, but what does this mean to the caller? This should never > leak outside of the ITS code. > > > +unlock: > > + mutex_unlock(&its->its_lock); > > + return ret; > > +} > > + > > /* > > * Queries the KVM IO bus framework to get the ITS pointer from the given > > * doorbell address. > > diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h > > index 0c8da72..d452150 100644 > > --- a/arch/arm64/kvm/vgic/vgic.h > > +++ b/arch/arm64/kvm/vgic/vgic.h > > @@ -240,6 +240,7 @@ int kvm_vgic_register_its_device(void); > > void vgic_enable_lpis(struct kvm_vcpu *vcpu); > > void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); > > int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); > > +int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi); > > int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); > > int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, > > int offset, u32 *val); > > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c > > index 40c3d7c..3027805 100644 > > --- a/drivers/vfio/pci/vfio_pci_intrs.c > > +++ b/drivers/vfio/pci/vfio_pci_intrs.c > > @@ -19,6 +19,7 @@ > > #include <linux/vfio.h> > > #include <linux/wait.h> > > #include <linux/slab.h> > > +#include <linux/kvm_irqfd.h> > > > > #include "vfio_pci_priv.h" > > > > @@ -315,6 +316,28 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi > > return 
0; > > } > > > > +static int vfio_pci_verify_msi_entry(struct vfio_pci_core_device *vdev, > > + struct eventfd_ctx *trigger) > > +{ > > + struct kvm *kvm = vdev->vdev.kvm; > > + struct kvm_kernel_irqfd *tmp; > > + struct kvm_kernel_irq_routing_entry irq_entry; > > + int ret = -ENODEV; > > + > > + spin_lock_irq(&kvm->irqfds.lock); > > + list_for_each_entry(tmp, &kvm->irqfds.items, list) { > > + if (trigger == tmp->eventfd) { > > + ret = 0; > > + break; > > + } > > + } > > + spin_unlock_irq(&kvm->irqfds.lock); > > + if (ret) > > + return ret; > > + irq_entry = tmp->irq_entry; > > + return kvm_verify_msi(kvm, &irq_entry); > > How does this work on !arm64? Why do we need an on-stack version of > tmp->irq_entry? Not only on !arm64, but in any scenario that doesn't involve KVM. There cannot be a hard dependency between vfio and kvm. Thanks, Alex PS - What driver/device actually cares about more than 1 MSI vector and doesn't implement MSI-X? > > > +} > > + > > static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > > int vector, int fd, bool msix) > > { > > @@ -355,6 +378,16 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > > return PTR_ERR(trigger); > > } > > > > + if (!msix) { > > + ret = vfio_pci_verify_msi_entry(vdev, trigger); > > + if (ret) { > > + kfree(vdev->ctx[vector].name); > > + eventfd_ctx_put(trigger); > > + if (ret > 0) > > + ret = 0; > > + return ret; > > + } > > + } > > Honestly, the whole things seems really complicated to avoid something > that is only a harmless warning . How about just toning down the > message instead? > > M. >
On Wed, 23 Nov 2022 19:55:14 +0000, Alex Williamson <alex.williamson@redhat.com> wrote: > > On Wed, 23 Nov 2022 12:08:05 +0000 > Marc Zyngier <maz@kernel.org> wrote: > > > On Wed, 23 Nov 2022 01:42:36 +0000, > > chenxiang <chenxiang66@hisilicon.com> wrote: > > > > > > +static int vfio_pci_verify_msi_entry(struct vfio_pci_core_device *vdev, > > > + struct eventfd_ctx *trigger) > > > +{ > > > + struct kvm *kvm = vdev->vdev.kvm; > > > + struct kvm_kernel_irqfd *tmp; > > > + struct kvm_kernel_irq_routing_entry irq_entry; > > > + int ret = -ENODEV; > > > + > > > + spin_lock_irq(&kvm->irqfds.lock); > > > + list_for_each_entry(tmp, &kvm->irqfds.items, list) { > > > + if (trigger == tmp->eventfd) { > > > + ret = 0; > > > + break; > > > + } > > > + } > > > + spin_unlock_irq(&kvm->irqfds.lock); > > > + if (ret) > > > + return ret; > > > + irq_entry = tmp->irq_entry; > > > + return kvm_verify_msi(kvm, &irq_entry); > > > > How does this work on !arm64? Why do we need an on-stack version of > > tmp->irq_entry? > > Not only on !arm64, but in any scenario that doesn't involve KVM. > There cannot be a hard dependency between vfio and kvm. Thanks, Yup, good point. > > Alex > > PS - What driver/device actually cares about more than 1 MSI vector and > doesn't implement MSI-X? Unfortunately, there is a metric ton of crap that fits in that description: 01:00.0 Network controller: Broadcom Inc. and subsidiaries Device 4433 (rev 07) Subsystem: Apple Inc. Device 4387 Device tree node: /sys/firmware/devicetree/base/soc/pcie@690000000/pci@0,0/wifi@0,0 Flags: bus master, fast devsel, latency 0, IRQ 97, IOMMU group 4 Memory at 6c1400000 (64-bit, non-prefetchable) [size=64K] Memory at 6c0000000 (64-bit, non-prefetchable) [size=16M] Capabilities: [48] Power Management version 3 Capabilities: [58] MSI: Enable+ Count=1/32 Maskable- 64bit+ ... and no MSI-X in sight. Pass this to a VM, and you'll see exactly what is described here. And that's not old stuff either. This is brand new HW. 
Do we need to care? I don't think so. M.
On Wed, Nov 23, 2022 at 09:42:36AM +0800, chenxiang via wrote: > From: Xiang Chen <chenxiang66@hisilicon.com> > > Currently the number of MSI vectors comes from register PCI_MSI_FLAGS > which should be power-of-2 in qemu, in some scenaries it is not the same as > the number that driver requires in guest, for example, a PCI driver wants > to allocate 6 MSI vecotrs in guest, but as the limitation, it will allocate > 8 MSI vectors. So it requires 8 MSI vectors in qemu while the driver in > guest only wants to allocate 6 MSI vectors. > > When GICv4.1 is enabled, it iterates over all possible MSIs and enable the > forwarding while the guest has only created some of mappings in the virtual > ITS, so some calls fail. The exception print is as following: > vfio-pci 0000:3a:00.1: irq bypass producer (token 000000008f08224d) registration > fails:66311 With Thomas's series to make MSI more dynamic this could spell future problems, as future kernels might have different ordering. It is just architecturally wrong to tie the MSI programming at the PCI level with the current state of the guest's virtual interrupt controller. Physical hardware doesn't do this, virtual emulation shouldn't either. People are taking too many liberties with trapping the PCI MSI registers through VFIO. :( Jason
在 2022/11/23 20:08, Marc Zyngier 写道: > On Wed, 23 Nov 2022 01:42:36 +0000, > chenxiang <chenxiang66@hisilicon.com> wrote: >> From: Xiang Chen <chenxiang66@hisilicon.com> >> >> Currently the number of MSI vectors comes from register PCI_MSI_FLAGS >> which should be power-of-2 in qemu, in some scenaries it is not the same as >> the number that driver requires in guest, for example, a PCI driver wants >> to allocate 6 MSI vecotrs in guest, but as the limitation, it will allocate >> 8 MSI vectors. So it requires 8 MSI vectors in qemu while the driver in >> guest only wants to allocate 6 MSI vectors. >> >> When GICv4.1 is enabled, it iterates over all possible MSIs and enable the >> forwarding while the guest has only created some of mappings in the virtual >> ITS, so some calls fail. The exception print is as following: >> vfio-pci 0000:3a:00.1: irq bypass producer (token 000000008f08224d) registration >> fails:66311 >> >> To avoid the issue, verify each MSI vector, skip some operations such as >> request_irq() and irq_bypass_register_producer() for those invalid MSI vectors. 
>> >> Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com> >> --- >> I reported the issue at the link: >> https://lkml.kernel.org/lkml/87cze9lcut.wl-maz@kernel.org/T/ >> >> Change Log: >> v1 -> v2: >> Verify each MSI vector in kernel instead of adding systemcall according to >> Mar's suggestion >> --- >> arch/arm64/kvm/vgic/vgic-irqfd.c | 13 +++++++++++++ >> arch/arm64/kvm/vgic/vgic-its.c | 36 ++++++++++++++++++++++++++++++++++++ >> arch/arm64/kvm/vgic/vgic.h | 1 + >> drivers/vfio/pci/vfio_pci_intrs.c | 33 +++++++++++++++++++++++++++++++++ >> include/linux/kvm_host.h | 2 ++ >> 5 files changed, 85 insertions(+) >> >> diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c >> index 475059b..71f6af57 100644 >> --- a/arch/arm64/kvm/vgic/vgic-irqfd.c >> +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c >> @@ -98,6 +98,19 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, >> return vgic_its_inject_msi(kvm, &msi); >> } >> >> +int kvm_verify_msi(struct kvm *kvm, >> + struct kvm_kernel_irq_routing_entry *irq_entry) >> +{ >> + struct kvm_msi msi; >> + >> + if (!vgic_has_its(kvm)) >> + return -ENODEV; >> + >> + kvm_populate_msi(irq_entry, &msi); >> + >> + return vgic_its_verify_msi(kvm, &msi); >> +} >> + >> /** >> * kvm_arch_set_irq_inatomic: fast-path for irqfd injection >> */ >> diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c >> index 94a666d..8312a4a 100644 >> --- a/arch/arm64/kvm/vgic/vgic-its.c >> +++ b/arch/arm64/kvm/vgic/vgic-its.c >> @@ -767,6 +767,42 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) >> return 0; >> } >> >> +int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi) >> +{ >> + struct vgic_its *its; >> + struct its_ite *ite; >> + struct kvm_vcpu *vcpu; >> + int ret = 0; >> + >> + if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) >> + return -EINVAL; >> + >> + if (!vgic_has_its(kvm)) >> + return -ENODEV; >> + >> + its = vgic_msi_to_its(kvm, msi); >> + 
if (IS_ERR(its)) >> + return PTR_ERR(its); >> + >> + mutex_lock(&its->its_lock); >> + if (!its->enabled) { >> + ret = -EBUSY; >> + goto unlock; >> + } >> + ite = find_ite(its, msi->devid, msi->data); >> + if (!ite || !its_is_collection_mapped(ite->collection)) { >> + ret = E_ITS_INT_UNMAPPED_INTERRUPT; >> + goto unlock; >> + } >> + >> + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); >> + if (!vcpu) >> + ret = E_ITS_INT_UNMAPPED_INTERRUPT; > I'm sorry, but what does this mean to the caller? This should never > leak outside of the ITS code. Actually it is already leak outside of ITS code, and please see the exception printk (E_ITS_INT_UNMAPPED_INTERRUPT is 0x10307 which is equal to 66311): vfio-pci 0000:3a:00.1: irq bypass producer (token 000000008f08224d) registration fails:66311 > >> +unlock: >> + mutex_unlock(&its->its_lock); >> + return ret; >> +} >> + >> /* >> * Queries the KVM IO bus framework to get the ITS pointer from the given >> * doorbell address. >> diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h >> index 0c8da72..d452150 100644 >> --- a/arch/arm64/kvm/vgic/vgic.h >> +++ b/arch/arm64/kvm/vgic/vgic.h >> @@ -240,6 +240,7 @@ int kvm_vgic_register_its_device(void); >> void vgic_enable_lpis(struct kvm_vcpu *vcpu); >> void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); >> int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); >> +int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi); >> int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); >> int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, >> int offset, u32 *val); >> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c >> index 40c3d7c..3027805 100644 >> --- a/drivers/vfio/pci/vfio_pci_intrs.c >> +++ b/drivers/vfio/pci/vfio_pci_intrs.c >> @@ -19,6 +19,7 @@ >> #include <linux/vfio.h> >> #include <linux/wait.h> >> #include <linux/slab.h> >> +#include <linux/kvm_irqfd.h> >> >> #include 
"vfio_pci_priv.h" >> >> @@ -315,6 +316,28 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi >> return 0; >> } >> >> +static int vfio_pci_verify_msi_entry(struct vfio_pci_core_device *vdev, >> + struct eventfd_ctx *trigger) >> +{ >> + struct kvm *kvm = vdev->vdev.kvm; >> + struct kvm_kernel_irqfd *tmp; >> + struct kvm_kernel_irq_routing_entry irq_entry; >> + int ret = -ENODEV; >> + >> + spin_lock_irq(&kvm->irqfds.lock); >> + list_for_each_entry(tmp, &kvm->irqfds.items, list) { >> + if (trigger == tmp->eventfd) { >> + ret = 0; >> + break; >> + } >> + } >> + spin_unlock_irq(&kvm->irqfds.lock); >> + if (ret) >> + return ret; >> + irq_entry = tmp->irq_entry; >> + return kvm_verify_msi(kvm, &irq_entry); > How does this work on !arm64? Why do we need an on-stack version of > tmp->irq_entry? Right, i didn't notice that it is common code and need to be work on other platforms. > >> +} >> + >> static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, >> int vector, int fd, bool msix) >> { >> @@ -355,6 +378,16 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, >> return PTR_ERR(trigger); >> } >> >> + if (!msix) { >> + ret = vfio_pci_verify_msi_entry(vdev, trigger); >> + if (ret) { >> + kfree(vdev->ctx[vector].name); >> + eventfd_ctx_put(trigger); >> + if (ret > 0) >> + ret = 0; >> + return ret; >> + } >> + } > Honestly, the whole things seems really complicated to avoid something > that is only a harmless warning . It seems also waste some interrupts. Allocating and requesting some interrupts but not used. > How about just toning down the > message instead? ok
On Sat, 26 Nov 2022 06:33:15 +0000, "chenxiang (M)" <chenxiang66@hisilicon.com> wrote: > > > 在 2022/11/23 20:08, Marc Zyngier 写道: > > On Wed, 23 Nov 2022 01:42:36 +0000, > > chenxiang <chenxiang66@hisilicon.com> wrote: > >> From: Xiang Chen <chenxiang66@hisilicon.com> > >> > >> Currently the number of MSI vectors comes from register PCI_MSI_FLAGS > >> which should be power-of-2 in qemu, in some scenaries it is not the same as > >> the number that driver requires in guest, for example, a PCI driver wants > >> to allocate 6 MSI vecotrs in guest, but as the limitation, it will allocate > >> 8 MSI vectors. So it requires 8 MSI vectors in qemu while the driver in > >> guest only wants to allocate 6 MSI vectors. > >> > >> When GICv4.1 is enabled, it iterates over all possible MSIs and enable the > >> forwarding while the guest has only created some of mappings in the virtual > >> ITS, so some calls fail. The exception print is as following: > >> vfio-pci 0000:3a:00.1: irq bypass producer (token 000000008f08224d) registration > >> fails:66311 > >> > >> To avoid the issue, verify each MSI vector, skip some operations such as > >> request_irq() and irq_bypass_register_producer() for those invalid MSI vectors. 
> >> > >> Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com> > >> --- > >> I reported the issue at the link: > >> https://lkml.kernel.org/lkml/87cze9lcut.wl-maz@kernel.org/T/ > >> > >> Change Log: > >> v1 -> v2: > >> Verify each MSI vector in kernel instead of adding systemcall according to > >> Mar's suggestion > >> --- > >> arch/arm64/kvm/vgic/vgic-irqfd.c | 13 +++++++++++++ > >> arch/arm64/kvm/vgic/vgic-its.c | 36 ++++++++++++++++++++++++++++++++++++ > >> arch/arm64/kvm/vgic/vgic.h | 1 + > >> drivers/vfio/pci/vfio_pci_intrs.c | 33 +++++++++++++++++++++++++++++++++ > >> include/linux/kvm_host.h | 2 ++ > >> 5 files changed, 85 insertions(+) > >> > >> diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c > >> index 475059b..71f6af57 100644 > >> --- a/arch/arm64/kvm/vgic/vgic-irqfd.c > >> +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c > >> @@ -98,6 +98,19 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, > >> return vgic_its_inject_msi(kvm, &msi); > >> } > >> +int kvm_verify_msi(struct kvm *kvm, > >> + struct kvm_kernel_irq_routing_entry *irq_entry) > >> +{ > >> + struct kvm_msi msi; > >> + > >> + if (!vgic_has_its(kvm)) > >> + return -ENODEV; > >> + > >> + kvm_populate_msi(irq_entry, &msi); > >> + > >> + return vgic_its_verify_msi(kvm, &msi); > >> +} > >> + > >> /** > >> * kvm_arch_set_irq_inatomic: fast-path for irqfd injection > >> */ > >> diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c > >> index 94a666d..8312a4a 100644 > >> --- a/arch/arm64/kvm/vgic/vgic-its.c > >> +++ b/arch/arm64/kvm/vgic/vgic-its.c > >> @@ -767,6 +767,42 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) > >> return 0; > >> } > >> +int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi) > >> +{ > >> + struct vgic_its *its; > >> + struct its_ite *ite; > >> + struct kvm_vcpu *vcpu; > >> + int ret = 0; > >> + > >> + if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) > >> + return 
-EINVAL; > >> + > >> + if (!vgic_has_its(kvm)) > >> + return -ENODEV; > >> + > >> + its = vgic_msi_to_its(kvm, msi); > >> + if (IS_ERR(its)) > >> + return PTR_ERR(its); > >> + > >> + mutex_lock(&its->its_lock); > >> + if (!its->enabled) { > >> + ret = -EBUSY; > >> + goto unlock; > >> + } > >> + ite = find_ite(its, msi->devid, msi->data); > >> + if (!ite || !its_is_collection_mapped(ite->collection)) { > >> + ret = E_ITS_INT_UNMAPPED_INTERRUPT; > >> + goto unlock; > >> + } > >> + > >> + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); > >> + if (!vcpu) > >> + ret = E_ITS_INT_UNMAPPED_INTERRUPT; > > I'm sorry, but what does this mean to the caller? This should never > > leak outside of the ITS code. > > Actually it is already leak outside of ITS code, and please see the > exception printk (E_ITS_INT_UNMAPPED_INTERRUPT is 0x10307 which is > equal to 66311): > > vfio-pci 0000:3a:00.1: irq bypass producer (token 000000008f08224d) registration fails:66311 > But that's hardly interpreted, which is the whole point. Only zero is considered a success value. > > Honestly, the whole things seems really complicated to avoid something > > that is only a harmless warning . > > It seems also waste some interrupts. Allocating and requesting some > interrupts but not used. What makes you think they are not used? A guest can install a mapping for those at any point. They won't be directly injected, but they will be delivered to the guest via the normal SW injection mechanism. M.
On Thu, 24 Nov 2022 18:00:44 +0000, Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Wed, Nov 23, 2022 at 09:42:36AM +0800, chenxiang via wrote: > > From: Xiang Chen <chenxiang66@hisilicon.com> > > > > Currently the number of MSI vectors comes from register PCI_MSI_FLAGS > > which should be power-of-2 in qemu, in some scenaries it is not the same as > > the number that driver requires in guest, for example, a PCI driver wants > > to allocate 6 MSI vecotrs in guest, but as the limitation, it will allocate > > 8 MSI vectors. So it requires 8 MSI vectors in qemu while the driver in > > guest only wants to allocate 6 MSI vectors. > > > > When GICv4.1 is enabled, it iterates over all possible MSIs and enable the > > forwarding while the guest has only created some of mappings in the virtual > > ITS, so some calls fail. The exception print is as following: > > vfio-pci 0000:3a:00.1: irq bypass producer (token 000000008f08224d) registration > > fails:66311 > > With Thomas's series to make MSI more dynamic this could spell future > problems, as future kernels might have different ordering. Enabling MSIs on the endpoint before they are programmed in the interrupt controller? I don't think that's a realistic outcome. > It is just architecturally wrong to tie the MSI programming at the PCI > level with the current state of the guest's virtual interrupt > controller. There is no architectural ties between the two at all. There is an optimisation that allows direct injection if you do it in a non braindead order. Nothing breaks if you don't, you just have wasted memory, performance, power and area. You're welcome. > Physical hardware doesn't do this, virtual emulation shouldn't either. If you want to fix VFIO, be my guest. My rambling about the sorry state of this has been in the kernel for 5 years (ed8703a506a8). > People are taking too many liberties with trapping the PCI MSI > registers through VFIO. :( Do you really want to leave access to the MSI BAR to userspace? 
The number of ways this can go wrong is mind-boggling. Starting with having to rebuild the interrupt translation tables on the host side to follow what the guest does, instead of keeping the two independent. M.
On Sat, Nov 26, 2022 at 11:15:14AM +0000, Marc Zyngier wrote: > > Physical hardware doesn't do this, virtual emulation shouldn't either. > > If you want to fix VFIO, be my guest. My rambling about the sorry > state of this has been in the kernel for 5 years (ed8703a506a8). We are talking about things. Stuff we want to do doesn't work, or is completely insane right now. > > People are taking too many liberties with trapping the PCI MSI > > registers through VFIO. :( > > Do you really want to leave access to the MSI BAR to userspace? The > number of ways this can go wrong is mind-boggling. Yeah, actually I do. This is basically mandatory to do something like IMS, SIOV, etc. > Starting with having to rebuild the interrupt translation tables on > the host side to follow what the guest does, instead of keeping the > two independent. At least on x86 most of the discussion has been about teaching the interrupt controller to go to the hypervisor to get help when establishing interrupts. The hypervisor can tell the guest what the real MSI data is. This is following the example of hyperv which plugs in a hyper call to HVCALL_MAP_DEVICE_INTERRUPT in its remapping irq_chip. This allows the hypervisor to tell the guest a real addr/data pair and the hypervisor does not have to involve itself in the device programming. We haven't reached a point of thinking in detail about ARM, but I would guess the general theme would still apply. Jason
diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 475059b..71f6af57 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -98,6 +98,19 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return vgic_its_inject_msi(kvm, &msi); } +int kvm_verify_msi(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct kvm_msi msi; + + if (!vgic_has_its(kvm)) + return -ENODEV; + + kvm_populate_msi(irq_entry, &msi); + + return vgic_its_verify_msi(kvm, &msi); +} + /** * kvm_arch_set_irq_inatomic: fast-path for irqfd injection */ diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 94a666d..8312a4a 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -767,6 +767,42 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) return 0; } +int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + struct vgic_its *its; + struct its_ite *ite; + struct kvm_vcpu *vcpu; + int ret = 0; + + if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) + return -EINVAL; + + if (!vgic_has_its(kvm)) + return -ENODEV; + + its = vgic_msi_to_its(kvm, msi); + if (IS_ERR(its)) + return PTR_ERR(its); + + mutex_lock(&its->its_lock); + if (!its->enabled) { + ret = -EBUSY; + goto unlock; + } + ite = find_ite(its, msi->devid, msi->data); + if (!ite || !its_is_collection_mapped(ite->collection)) { + ret = E_ITS_INT_UNMAPPED_INTERRUPT; + goto unlock; + } + + vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); + if (!vcpu) + ret = E_ITS_INT_UNMAPPED_INTERRUPT; +unlock: + mutex_unlock(&its->its_lock); + return ret; +} + /* * Queries the KVM IO bus framework to get the ITS pointer from the given * doorbell address. 
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 0c8da72..d452150 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -240,6 +240,7 @@ int kvm_vgic_register_its_device(void);
 void vgic_enable_lpis(struct kvm_vcpu *vcpu);
 void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
 int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_its_verify_msi(struct kvm *kvm, struct kvm_msi *msi);
 int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
 int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
 			 int offset, u32 *val);
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 40c3d7c..3027805 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -19,6 +19,7 @@
 #include <linux/vfio.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/kvm_irqfd.h>
 
 #include "vfio_pci_priv.h"
 
@@ -315,6 +316,28 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi
 	return 0;
 }
 
+static int vfio_pci_verify_msi_entry(struct vfio_pci_core_device *vdev,
+				     struct eventfd_ctx *trigger)
+{
+	struct kvm *kvm = vdev->vdev.kvm;
+	struct kvm_kernel_irqfd *tmp;
+	struct kvm_kernel_irq_routing_entry irq_entry;
+	int ret = -ENODEV;
+
+	spin_lock_irq(&kvm->irqfds.lock);
+	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
+		if (trigger == tmp->eventfd) {
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(&kvm->irqfds.lock);
+	if (ret)
+		return ret;
+	irq_entry = tmp->irq_entry;
+	return kvm_verify_msi(kvm, &irq_entry);
+}
+
 static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
 				      int vector, int fd, bool msix)
 {
@@ -355,6 +378,16 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
 		return PTR_ERR(trigger);
 	}
 
+	if (!msix) {
+		ret = vfio_pci_verify_msi_entry(vdev, trigger);
+		if (ret) {
+			kfree(vdev->ctx[vector].name);
+			eventfd_ctx_put(trigger);
+			if (ret > 0)
+				ret = 0;
+			return ret;
+		}
+	}
 	/*
 	 * The MSIx vector table resides in device memory which may be cleared
 	 * via backdoor resets. We don't allow direct access to the vector
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1cd9a22..3c8f22a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1611,6 +1611,8 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
+int kvm_verify_msi(struct kvm *kvm,
+		   struct kvm_kernel_irq_routing_entry *irq_entry);
 
 /*
  * Returns a pointer to the memslot if it contains gfn.