diff mbox

[v4,12/16] KVM: kvm-vfio: implement the VFIO skeleton for VT-d Posted-Interrupts

Message ID 1434019912-15423-13-git-send-email-feng.wu@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wu, Feng June 11, 2015, 10:51 a.m. UTC
This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
When guests update MSI/MSI-x information for an assigned-device,
QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup
IRTE for VT-d PI. Userspace program can also use
KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping mode.
This patch implements these IRQ attributes.

Signed-off-by: Feng Wu <feng.wu@intel.com>
---
 include/linux/kvm_host.h |  22 +++++++++
 virt/kvm/vfio.c          | 126 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)

Comments

Alex Williamson June 11, 2015, 5:15 p.m. UTC | #1
On Thu, 2015-06-11 at 18:51 +0800, Feng Wu wrote:
> This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
> When guests update MSI/MSI-x information for an assigned-device,
> QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup
> IRTE for VT-d PI. Userspace program can also use
> KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping mode.
> This patch implements these IRQ attributes.
> 
> Signed-off-by: Feng Wu <feng.wu@intel.com>
> ---
>  include/linux/kvm_host.h |  22 +++++++++
>  virt/kvm/vfio.c          | 126 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 148 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f591f7c..69f8711 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1073,6 +1073,28 @@ extern struct kvm_device_ops kvm_xics_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
>  
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> +/*
> + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> + *
> + * @kvm: kvm
> + * @host_irq: host irq of the interrupt
> + * @guest_irq: gsi of the interrupt
> + * @set: set or unset PI
> + * returns 0 on success, < 0 on failure
> + */
> +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> +				 uint32_t guest_irq, bool set);
> +#else
> +static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm,
> +					       unsigned int host_irq,
> +					       uint32_t guest_irq,
> +					       bool set)
> +{
> +	return 0;
> +}

The code below can't get to this function without
__KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an
error if not implemented.

> +#endif
> +
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>  
>  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> index 80a45e4..547fc51 100644
> --- a/virt/kvm/vfio.c
> +++ b/virt/kvm/vfio.c
> @@ -18,6 +18,7 @@
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
> +#include <asm/irq_remapping.h>

This only exists on x86.   Are we also getting lucky with some of the
include chains that give us the PCI related defines?  It looks like
we're implicitly assuming CONFIG_PCI

>  #include "vfio.h"
>  
>  struct kvm_vfio_group {
> @@ -276,12 +277,128 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
>  	return -ENXIO;
>  }
>  
> +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> +{
> +	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> +		u8 pin;
> +
> +		pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> +		if (pin)
> +			return 1;
> +	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
> +		return pci_msi_vec_count(pdev);
> +	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
> +		return pci_msix_vec_count(pdev);
> +	}
> +
> +	return 0;
> +}
> +
> +static int kvm_vfio_control_pi(struct kvm_device *kdev,
> +			       int32_t __user *argp, bool set)
> +{
> +	struct kvm_vfio_dev_irq pi_info;
> +	uint32_t *gsi;
> +	unsigned long minsz;
> +	struct vfio_device *vdev;
> +	struct msi_desc *entry;
> +	struct device *dev;
> +	struct pci_dev *pdev;
> +	int i, max, ret;
> +
> +	minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> +
> +	if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> +		return -EFAULT;
> +
> +	if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
> +		return -EINVAL;

Could we also abort on pi_info.count == 0?

> +
> +	vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> +	if (IS_ERR(vdev))
> +		return PTR_ERR(vdev);
> +
> +	dev = kvm_vfio_external_base_device(vdev);
> +	if (!dev || !dev_is_pci(dev)) {
> +		ret = -EFAULT;
> +		goto put_vfio_device;
> +	}
> +
> +	pdev = to_pci_dev(dev);
> +
> +	max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> +	if (max <= 0) {
> +		ret = -EFAULT;
> +		goto put_vfio_device;
> +	}
> +
> +	if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) ||
> +	    pi_info.start >= max || pi_info.start + pi_info.count > max) {
> +		ret = -EINVAL;
> +		goto put_vfio_device;
> +	}
> +
> +	gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> +			   pi_info.count * sizeof(u32));
> +	if (IS_ERR(gsi)) {
> +		ret = PTR_ERR(gsi);
> +		goto put_vfio_device;
> +	}
> +
> +#ifdef CONFIG_PCI_MSI
> +	for (i = 0; i < pi_info.count; i++) {
> +		list_for_each_entry(entry, &pdev->msi_list, list) {

Should we be able to get here for INTx?

> +			if (entry->msi_attrib.entry_nr != pi_info.start+i)
> +				continue;
> +
> +			ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> +							   entry->irq,
> +							   gsi[i],
> +							   set);
> +			if (ret)
> +				goto free_gsi;
> +		}
> +	}
> +#endif
> +
> +	ret = 0;

So if we didn't do anything, return success?  That seems strange.
Should we also be doing some unwind on failure?  Thanks,

Alex

> +
> +free_gsi:
> +	kfree(gsi);
> +
> +put_vfio_device:
> +	kvm_vfio_put_vfio_device(vdev);
> +	return ret;
> +}
> +
> +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> +{
> +	int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> +	int ret;
> +
> +	switch (attr) {
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> +	case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> +		ret = kvm_vfio_control_pi(kdev, argp, 1);
> +		break;
> +	case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> +		ret = kvm_vfio_control_pi(kdev, argp, 0);
> +		break;
> +#endif
> +	default:
> +		ret = -ENXIO;
> +	}
> +	return ret;
> +}
> +
>  static int kvm_vfio_set_attr(struct kvm_device *dev,
>  			     struct kvm_device_attr *attr)
>  {
>  	switch (attr->group) {
>  	case KVM_DEV_VFIO_GROUP:
>  		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
> +	case KVM_DEV_VFIO_DEVICE:
> +		return kvm_vfio_set_device(dev, attr->attr, attr->addr);
>  	}
>  
>  	return -ENXIO;
> @@ -299,6 +416,15 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
>  		}
>  
>  		break;
> +	case KVM_DEV_VFIO_DEVICE:
> +		switch (attr->attr) {
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> +		case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> +		case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> +			return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO;
> +#endif
> +		}
> +		break;
>  	}
>  
>  	return -ENXIO;



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wu, Feng June 12, 2015, 4:54 a.m. UTC | #2
> -----Original Message-----

> From: Alex Williamson [mailto:alex.williamson@redhat.com]

> Sent: Friday, June 12, 2015 1:15 AM

> To: Wu, Feng

> Cc: kvm@vger.kernel.org; linux-kernel@vger.kernel.org; pbonzini@redhat.com;

> mtosatti@redhat.com; eric.auger@linaro.org

> Subject: Re: [v4 12/16] KVM: kvm-vfio: implement the VFIO skeleton for VT-d

> Posted-Interrupts

> 

> On Thu, 2015-06-11 at 18:51 +0800, Feng Wu wrote:

> > This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.

> > When guests update MSI/MSI-x information for an assigned-device,

> > QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup

> > IRTE for VT-d PI. Userspace program can also use

> > KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping

> mode.

> > This patch implements these IRQ attributes.

> >

> > Signed-off-by: Feng Wu <feng.wu@intel.com>

> > ---

> >  include/linux/kvm_host.h |  22 +++++++++

> >  virt/kvm/vfio.c          | 126

> +++++++++++++++++++++++++++++++++++++++++++++++

> >  2 files changed, 148 insertions(+)

> >

> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

> > index f591f7c..69f8711 100644

> > --- a/include/linux/kvm_host.h

> > +++ b/include/linux/kvm_host.h

> > @@ -1073,6 +1073,28 @@ extern struct kvm_device_ops kvm_xics_ops;

> >  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;

> >  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;

> >

> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST

> > +/*

> > + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts

> > + *

> > + * @kvm: kvm

> > + * @host_irq: host irq of the interrupt

> > + * @guest_irq: gsi of the interrupt

> > + * @set: set or unset PI

> > + * returns 0 on success, < 0 on failure

> > + */

> > +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,

> > +				 uint32_t guest_irq, bool set);

> > +#else

> > +static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm,

> > +					       unsigned int host_irq,

> > +					       uint32_t guest_irq,

> > +					       bool set)

> > +{

> > +	return 0;

> > +}

> 

> The code below can't get to this function without

> __KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an

> error if not implemented.


kvm_arch_vfio_update_pi_irte() is called by kvm_vfio_control_pi(). If we remove the
dummy definition of kvm_arch_vfio_update_pi_irte(), kvm_vfio_control_pi() also
needs to be placed under __KVM_HAVE_ARCH_KVM_VFIO_POST. I will handle this
in the next version.

> 

> > +#endif

> > +

> >  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT

> >

> >  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool

> val)

> > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c

> > index 80a45e4..547fc51 100644

> > --- a/virt/kvm/vfio.c

> > +++ b/virt/kvm/vfio.c

> > @@ -18,6 +18,7 @@

> >  #include <linux/slab.h>

> >  #include <linux/uaccess.h>

> >  #include <linux/vfio.h>

> > +#include <asm/irq_remapping.h>

> 

> This only exists on x86.   


But in kvm_vfio_has_attr(), we can only return 0 when posted interrupts are supported,
which we check by calling "irq_remapping_cap(IRQ_POSTING_CAP)", and that needs this header file.
How do you think I should handle this?

> Are we also getting lucky with some of the

> include chains that give us the PCI related defines?  It looks like

> we're implicitly assuming CONFIG_PCI


Yes, I think the PCI related header files are included implicitly here. Anyway
I can add "#include <linux/pci.h>" explicitly.

> >  #include "vfio.h"

> >

> >  struct kvm_vfio_group {

> > @@ -276,12 +277,128 @@ static int kvm_vfio_set_group(struct kvm_device

> *dev, long attr, u64 arg)

> >  	return -ENXIO;

> >  }

> >

> > +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)

> > +{

> > +	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {

> > +		u8 pin;

> > +

> > +		pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);

> > +		if (pin)

> > +			return 1;

> > +	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {

> > +		return pci_msi_vec_count(pdev);

> > +	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {

> > +		return pci_msix_vec_count(pdev);

> > +	}

> > +

> > +	return 0;

> > +}

> > +

> > +static int kvm_vfio_control_pi(struct kvm_device *kdev,

> > +			       int32_t __user *argp, bool set)

> > +{

> > +	struct kvm_vfio_dev_irq pi_info;

> > +	uint32_t *gsi;

> > +	unsigned long minsz;

> > +	struct vfio_device *vdev;

> > +	struct msi_desc *entry;

> > +	struct device *dev;

> > +	struct pci_dev *pdev;

> > +	int i, max, ret;

> > +

> > +	minsz = offsetofend(struct kvm_vfio_dev_irq, count);

> > +

> > +	if (copy_from_user(&pi_info, (void __user *)argp, minsz))

> > +		return -EFAULT;

> > +

> > +	if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)

> > +		return -EINVAL;

> 

> Could we also abort on pi_info.count == 0?


Yes, that is a good point.

> 

> > +

> > +	vdev = kvm_vfio_get_vfio_device(pi_info.fd);

> > +	if (IS_ERR(vdev))

> > +		return PTR_ERR(vdev);

> > +

> > +	dev = kvm_vfio_external_base_device(vdev);

> > +	if (!dev || !dev_is_pci(dev)) {

> > +		ret = -EFAULT;

> > +		goto put_vfio_device;

> > +	}

> > +

> > +	pdev = to_pci_dev(dev);

> > +

> > +	max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);

> > +	if (max <= 0) {

> > +		ret = -EFAULT;

> > +		goto put_vfio_device;

> > +	}

> > +

> > +	if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) ||

> > +	    pi_info.start >= max || pi_info.start + pi_info.count > max) {

> > +		ret = -EINVAL;

> > +		goto put_vfio_device;

> > +	}

> > +

> > +	gsi = memdup_user((void __user *)((unsigned long)argp + minsz),

> > +			   pi_info.count * sizeof(u32));

> > +	if (IS_ERR(gsi)) {

> > +		ret = PTR_ERR(gsi);

> > +		goto put_vfio_device;

> > +	}

> > +

> > +#ifdef CONFIG_PCI_MSI

> > +	for (i = 0; i < pi_info.count; i++) {

> > +		list_for_each_entry(entry, &pdev->msi_list, list) {

> 

> Should we be able to get here for INTx?


We only support PI for MSI/MSI-X. I think I can return earlier in this function if
the index is not VFIO_PCI_MSI_IRQ_INDEX or VFIO_PCI_MSIX_IRQ_INDEX.
Is this okay with you?

> 

> > +			if (entry->msi_attrib.entry_nr != pi_info.start+i)

> > +				continue;

> > +

> > +			ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,

> > +							   entry->irq,

> > +							   gsi[i],

> > +							   set);

> > +			if (ret)

> > +				goto free_gsi;

> > +		}

> > +	}

> > +#endif

> > +

> > +	ret = 0;

> 

> So if we didn't do anything, return success?  That seems strange.

> Should we also be doing some unwind on failure?  Thanks,

> 

I can't think of what I need to do on failure. Do you have any ideas?

Thanks,
Feng

> Alex

> 

> > +

> > +free_gsi:

> > +	kfree(gsi);

> > +

> > +put_vfio_device:

> > +	kvm_vfio_put_vfio_device(vdev);

> > +	return ret;

> > +}

> > +

> > +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)

> > +{

> > +	int32_t __user *argp = (int32_t __user *)(unsigned long)arg;

> > +	int ret;

> > +

> > +	switch (attr) {

> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST

> > +	case KVM_DEV_VFIO_DEVICE_POST_IRQ:

> > +		ret = kvm_vfio_control_pi(kdev, argp, 1);

> > +		break;

> > +	case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:

> > +		ret = kvm_vfio_control_pi(kdev, argp, 0);

> > +		break;

> > +#endif

> > +	default:

> > +		ret = -ENXIO;

> > +	}

> > +	return ret;

> > +}

> > +

> >  static int kvm_vfio_set_attr(struct kvm_device *dev,

> >  			     struct kvm_device_attr *attr)

> >  {

> >  	switch (attr->group) {

> >  	case KVM_DEV_VFIO_GROUP:

> >  		return kvm_vfio_set_group(dev, attr->attr, attr->addr);

> > +	case KVM_DEV_VFIO_DEVICE:

> > +		return kvm_vfio_set_device(dev, attr->attr, attr->addr);

> >  	}

> >

> >  	return -ENXIO;

> > @@ -299,6 +416,15 @@ static int kvm_vfio_has_attr(struct kvm_device

> *dev,

> >  		}

> >

> >  		break;

> > +	case KVM_DEV_VFIO_DEVICE:

> > +		switch (attr->attr) {

> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST

> > +		case KVM_DEV_VFIO_DEVICE_POST_IRQ:

> > +		case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:

> > +			return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO;

> > +#endif

> > +		}

> > +		break;

> >  	}

> >

> >  	return -ENXIO;

> 

>
Alex Williamson June 12, 2015, 2:51 p.m. UTC | #3
On Fri, 2015-06-12 at 04:54 +0000, Wu, Feng wrote:
> 
> 
> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Friday, June 12, 2015 1:15 AM
> > To: Wu, Feng
> > Cc: kvm@vger.kernel.org; linux-kernel@vger.kernel.org; pbonzini@redhat.com;
> > mtosatti@redhat.com; eric.auger@linaro.org
> > Subject: Re: [v4 12/16] KVM: kvm-vfio: implement the VFIO skeleton for VT-d
> > Posted-Interrupts
> > 
> > On Thu, 2015-06-11 at 18:51 +0800, Feng Wu wrote:
> > > This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
> > > When guests update MSI/MSI-x information for an assigned-device,
> > > QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup
> > > IRTE for VT-d PI. Userspace program can also use
> > > KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping
> > mode.
> > > This patch implements these IRQ attributes.
> > >
> > > Signed-off-by: Feng Wu <feng.wu@intel.com>
> > > ---
> > >  include/linux/kvm_host.h |  22 +++++++++
> > >  virt/kvm/vfio.c          | 126
> > +++++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 148 insertions(+)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index f591f7c..69f8711 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -1073,6 +1073,28 @@ extern struct kvm_device_ops kvm_xics_ops;
> > >  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> > >  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
> > >
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > > +/*
> > > + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> > > + *
> > > + * @kvm: kvm
> > > + * @host_irq: host irq of the interrupt
> > > + * @guest_irq: gsi of the interrupt
> > > + * @set: set or unset PI
> > > + * returns 0 on success, < 0 on failure
> > > + */
> > > +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> > > +				 uint32_t guest_irq, bool set);
> > > +#else
> > > +static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm,
> > > +					       unsigned int host_irq,
> > > +					       uint32_t guest_irq,
> > > +					       bool set)
> > > +{
> > > +	return 0;
> > > +}
> > 
> > The code below can't get to this function without
> > __KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an
> > error if not implemented.
> 
> kvm_arch_vfio_update_pi_irte() is called by kvm_vfio_control_pi(), if we remove the
> dummy definition of kvm_arch_vfio_update_pi_irte(), kvm_vfio_control_pi() is also
> needed to be included in __KVM_HAVE_ARCH_KVM_VFIO_POST, I will handle this
> in the next version.

But typically we wouldn't return success for a function that's not
implemented.

> > > +#endif
> > > +
> > >  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> > >
> > >  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool
> > val)
> > > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> > > index 80a45e4..547fc51 100644
> > > --- a/virt/kvm/vfio.c
> > > +++ b/virt/kvm/vfio.c
> > > @@ -18,6 +18,7 @@
> > >  #include <linux/slab.h>
> > >  #include <linux/uaccess.h>
> > >  #include <linux/vfio.h>
> > > +#include <asm/irq_remapping.h>
> > 
> > This only exists on x86.   
> 
> But in kvm_vfio_has_attr(), we can only return 0 when posted interrupt is supported
> via calling " irq_remapping_cap(IRQ_POSTING_CAP)" which needs this header file.
> Do you think how can I handle this?

Further abstraction, #ifdef...  This file is not x86 specific.

> > Are we also getting lucky with some of the
> > include chains that give us the PCI related defines?  It looks like
> > we're implicitly assuming CONFIG_PCI
> 
> Yes, I think the PCI related header files are included implicitly here. Anyway
> I can add "#include <linux/pci.h>" explicitly.

Ok, but that doesn't solve the problem that this file should not assume
CONFIG_PCI.  Posted interrupts functionality may require PCI, but
neither KVM nor VFIO necessarily do and therefore the kvm-vfio interface
cannot assume it.

> > >  #include "vfio.h"
> > >
> > >  struct kvm_vfio_group {
> > > @@ -276,12 +277,128 @@ static int kvm_vfio_set_group(struct kvm_device
> > *dev, long attr, u64 arg)
> > >  	return -ENXIO;
> > >  }
> > >
> > > +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> > > +{
> > > +	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> > > +		u8 pin;
> > > +
> > > +		pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> > > +		if (pin)
> > > +			return 1;
> > > +	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
> > > +		return pci_msi_vec_count(pdev);
> > > +	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
> > > +		return pci_msix_vec_count(pdev);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int kvm_vfio_control_pi(struct kvm_device *kdev,
> > > +			       int32_t __user *argp, bool set)
> > > +{
> > > +	struct kvm_vfio_dev_irq pi_info;
> > > +	uint32_t *gsi;
> > > +	unsigned long minsz;
> > > +	struct vfio_device *vdev;
> > > +	struct msi_desc *entry;
> > > +	struct device *dev;
> > > +	struct pci_dev *pdev;
> > > +	int i, max, ret;
> > > +
> > > +	minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> > > +
> > > +	if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> > > +		return -EFAULT;
> > > +
> > > +	if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
> > > +		return -EINVAL;
> > 
> > Could we also abort on pi_info.count == 0?
> 
> Yes, that is a good point.
> 
> > 
> > > +
> > > +	vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> > > +	if (IS_ERR(vdev))
> > > +		return PTR_ERR(vdev);
> > > +
> > > +	dev = kvm_vfio_external_base_device(vdev);
> > > +	if (!dev || !dev_is_pci(dev)) {
> > > +		ret = -EFAULT;
> > > +		goto put_vfio_device;
> > > +	}
> > > +
> > > +	pdev = to_pci_dev(dev);
> > > +
> > > +	max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> > > +	if (max <= 0) {
> > > +		ret = -EFAULT;
> > > +		goto put_vfio_device;
> > > +	}
> > > +
> > > +	if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) ||
> > > +	    pi_info.start >= max || pi_info.start + pi_info.count > max) {
> > > +		ret = -EINVAL;
> > > +		goto put_vfio_device;
> > > +	}
> > > +
> > > +	gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> > > +			   pi_info.count * sizeof(u32));
> > > +	if (IS_ERR(gsi)) {
> > > +		ret = PTR_ERR(gsi);
> > > +		goto put_vfio_device;
> > > +	}
> > > +
> > > +#ifdef CONFIG_PCI_MSI
> > > +	for (i = 0; i < pi_info.count; i++) {
> > > +		list_for_each_entry(entry, &pdev->msi_list, list) {
> > 
> > Should we be able to get here for INTx?
> 
> We only support PI for MSI/MSIx. I think I can return earlier in this function if
> the index is not VFIO_PCI_MSI_IRQ_INDEX or VFIO_PCI_MSIX_IRQ_INDEX,
> is this okay for you?

Yes, preferably early enough that you don't need to read the pin
register to determine if the device supports INTx.

> > 
> > > +			if (entry->msi_attrib.entry_nr != pi_info.start+i)
> > > +				continue;
> > > +
> > > +			ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> > > +							   entry->irq,
> > > +							   gsi[i],
> > > +							   set);
> > > +			if (ret)
> > > +				goto free_gsi;
> > > +		}
> > > +	}
> > > +#endif
> > > +
> > > +	ret = 0;
> > 
> > So if we didn't do anything, return success?  That seems strange.
> > Should we also be doing some unwind on failure?  Thanks,
> > 
> I can't think of what I need to do on failure. Do you have any ideas?

This is a user interface, so I think a user would expect that success
means that all of the parameters were correct and all of the indexes
were configured.  If something generated an error, everything should be
returned to the initial state and an error returned.  That may mean that
we need to reverse the loop and unset things that we set, or set things
that were unset.  Thanks,

Alex

> > > +free_gsi:
> > > +	kfree(gsi);
> > > +
> > > +put_vfio_device:
> > > +	kvm_vfio_put_vfio_device(vdev);
> > > +	return ret;
> > > +}
> > > +
> > > +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> > > +{
> > > +	int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> > > +	int ret;
> > > +
> > > +	switch (attr) {
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > > +	case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> > > +		ret = kvm_vfio_control_pi(kdev, argp, 1);
> > > +		break;
> > > +	case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> > > +		ret = kvm_vfio_control_pi(kdev, argp, 0);
> > > +		break;
> > > +#endif
> > > +	default:
> > > +		ret = -ENXIO;
> > > +	}
> > > +	return ret;
> > > +}
> > > +
> > >  static int kvm_vfio_set_attr(struct kvm_device *dev,
> > >  			     struct kvm_device_attr *attr)
> > >  {
> > >  	switch (attr->group) {
> > >  	case KVM_DEV_VFIO_GROUP:
> > >  		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
> > > +	case KVM_DEV_VFIO_DEVICE:
> > > +		return kvm_vfio_set_device(dev, attr->attr, attr->addr);
> > >  	}
> > >
> > >  	return -ENXIO;
> > > @@ -299,6 +416,15 @@ static int kvm_vfio_has_attr(struct kvm_device
> > *dev,
> > >  		}
> > >
> > >  		break;
> > > +	case KVM_DEV_VFIO_DEVICE:
> > > +		switch (attr->attr) {
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > > +		case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> > > +		case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> > > +			return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO;
> > > +#endif
> > > +		}
> > > +		break;
> > >  	}
> > >
> > >  	return -ENXIO;
> > 
> > 
> 
> NrybX?v^)?{.n+h?}?z&j:+vzZ++zfh~izw?&)?f



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini June 23, 2015, 4 p.m. UTC | #4
On 12/06/2015 16:51, Alex Williamson wrote:
> > > The code below can't get to this function without
> > > __KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an
> > > error if not implemented.
> > 
> > kvm_arch_vfio_update_pi_irte() is called by kvm_vfio_control_pi(), if we remove the
> > dummy definition of kvm_arch_vfio_update_pi_irte(), kvm_vfio_control_pi() is also
> > needed to be included in __KVM_HAVE_ARCH_KVM_VFIO_POST, I will handle this
> > in the next version.
> 
> But typically we wouldn't return success for a function that's not
> implemented.

I agree.  Moving the implementation to vmx.c would also let you make
kvm_arch_update_pi_irte() the only entry-point to the functionality in
vmx.c.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f591f7c..69f8711 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1073,6 +1073,28 @@  extern struct kvm_device_ops kvm_xics_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
 
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
+/*
+ * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+				 uint32_t guest_irq, bool set);
+#else
+static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm,
+					       unsigned int host_irq,
+					       uint32_t guest_irq,
+					       bool set)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
 static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 80a45e4..547fc51 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -18,6 +18,7 @@ 
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
+#include <asm/irq_remapping.h>
 #include "vfio.h"
 
 struct kvm_vfio_group {
@@ -276,12 +277,128 @@  static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 	return -ENXIO;
 }
 
+static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
+{
+	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
+		u8 pin;
+
+		pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
+		if (pin)
+			return 1;
+	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
+		return pci_msi_vec_count(pdev);
+	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
+		return pci_msix_vec_count(pdev);
+	}
+
+	return 0;
+}
+
+static int kvm_vfio_control_pi(struct kvm_device *kdev,
+			       int32_t __user *argp, bool set)
+{
+	struct kvm_vfio_dev_irq pi_info;
+	uint32_t *gsi;
+	unsigned long minsz;
+	struct vfio_device *vdev;
+	struct msi_desc *entry;
+	struct device *dev;
+	struct pci_dev *pdev;
+	int i, max, ret;
+
+	minsz = offsetofend(struct kvm_vfio_dev_irq, count);
+
+	if (copy_from_user(&pi_info, (void __user *)argp, minsz))
+		return -EFAULT;
+
+	if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
+		return -EINVAL;
+
+	vdev = kvm_vfio_get_vfio_device(pi_info.fd);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	dev = kvm_vfio_external_base_device(vdev);
+	if (!dev || !dev_is_pci(dev)) {
+		ret = -EFAULT;
+		goto put_vfio_device;
+	}
+
+	pdev = to_pci_dev(dev);
+
+	max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
+	if (max <= 0) {
+		ret = -EFAULT;
+		goto put_vfio_device;
+	}
+
+	if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) ||
+	    pi_info.start >= max || pi_info.start + pi_info.count > max) {
+		ret = -EINVAL;
+		goto put_vfio_device;
+	}
+
+	gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
+			   pi_info.count * sizeof(u32));
+	if (IS_ERR(gsi)) {
+		ret = PTR_ERR(gsi);
+		goto put_vfio_device;
+	}
+
+#ifdef CONFIG_PCI_MSI
+	for (i = 0; i < pi_info.count; i++) {
+		list_for_each_entry(entry, &pdev->msi_list, list) {
+			if (entry->msi_attrib.entry_nr != pi_info.start+i)
+				continue;
+
+			ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
+							   entry->irq,
+							   gsi[i],
+							   set);
+			if (ret)
+				goto free_gsi;
+		}
+	}
+#endif
+
+	ret = 0;
+
+free_gsi:
+	kfree(gsi);
+
+put_vfio_device:
+	kvm_vfio_put_vfio_device(vdev);
+	return ret;
+}
+
+static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
+{
+	int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
+	int ret;
+
+	switch (attr) {
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
+	case KVM_DEV_VFIO_DEVICE_POST_IRQ:
+		ret = kvm_vfio_control_pi(kdev, argp, 1);
+		break;
+	case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
+		ret = kvm_vfio_control_pi(kdev, argp, 0);
+		break;
+#endif
+	default:
+		ret = -ENXIO;
+	}
+	return ret;
+}
+
 static int kvm_vfio_set_attr(struct kvm_device *dev,
 			     struct kvm_device_attr *attr)
 {
 	switch (attr->group) {
 	case KVM_DEV_VFIO_GROUP:
 		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
+	case KVM_DEV_VFIO_DEVICE:
+		return kvm_vfio_set_device(dev, attr->attr, attr->addr);
 	}
 
 	return -ENXIO;
@@ -299,6 +416,15 @@  static int kvm_vfio_has_attr(struct kvm_device *dev,
 		}
 
 		break;
+	case KVM_DEV_VFIO_DEVICE:
+		switch (attr->attr) {
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
+		case KVM_DEV_VFIO_DEVICE_POST_IRQ:
+		case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
+			return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO;
+#endif
+		}
+		break;
 	}
 
 	return -ENXIO;