diff mbox series

[v6,05/20] vfio: mdev: common lib code for setting up Interrupt Message Store

Message ID 162164277624.261970.7989190254803052804.stgit@djiang5-desk3.ch.intel.com (mailing list archive)
State New, archived
Headers show
Series Add VFIO mediated device support and DEV-MSI support for the idxd driver | expand

Commit Message

Dave Jiang May 22, 2021, 12:19 a.m. UTC
Add common helper code to setup IMS once the MSI domain has been
setup by the device driver. The main helper function is
mdev_ims_set_msix_trigger() that is called by the VFIO ioctl
VFIO_DEVICE_SET_IRQS. The function deals with the setup and
teardown of emulated and IMS backed eventfd that gets exported
to the guest kernel via VFIO as MSIX vectors.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/vfio/mdev/Kconfig     |   12 ++
 drivers/vfio/mdev/Makefile    |    3 
 drivers/vfio/mdev/mdev_irqs.c |  318 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mdev.h          |   51 +++++++
 4 files changed, 384 insertions(+)
 create mode 100644 drivers/vfio/mdev/mdev_irqs.c

Comments

Jason Gunthorpe May 24, 2021, 12:02 a.m. UTC | #1
On Fri, May 21, 2021 at 05:19:36PM -0700, Dave Jiang wrote:
> Add common helper code to setup IMS once the MSI domain has been
> setup by the device driver. The main helper function is
> mdev_ims_set_msix_trigger() that is called by the VFIO ioctl
> VFIO_DEVICE_SET_IRQS. The function deals with the setup and
> teardown of emulated and IMS backed eventfd that gets exported
> to the guest kernel via VFIO as MSIX vectors.
> 
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/vfio/mdev/Kconfig     |   12 ++
>  drivers/vfio/mdev/Makefile    |    3 
>  drivers/vfio/mdev/mdev_irqs.c |  318 +++++++++++++++++++++++++++++++++++++++++
>  include/linux/mdev.h          |   51 +++++++
>  4 files changed, 384 insertions(+)
>  create mode 100644 drivers/vfio/mdev/mdev_irqs.c

IMS is not mdev specific, do not entangle it with mdev code. This
should be generic VFIO stuff.

> +static int mdev_msix_set_vector_signal(struct mdev_irq *mdev_irq, int vector, int fd)
> +{
> +	int rc, irq;
> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
> +	struct mdev_irq_entry *entry;
> +	struct device *dev = &mdev->dev;
> +	struct eventfd_ctx *trigger;
> +	char *name;
> +	bool pasid_en;
> +	u32 auxval;
> +
> +	if (vector < 0 || vector >= mdev_irq->num)
> +		return -EINVAL;
> +
> +	entry = &mdev_irq->irq_entries[vector];
> +
> +	if (entry->ims)
> +		irq = dev_msi_irq_vector(dev, entry->ims_id);
> +	else
> +		irq = 0;
> +
> +	pasid_en = mdev_irq->pasid != INVALID_IOASID ? true : false;
> +
> +	/* IMS and invalid pasid is not a valid configuration */
> +	if (entry->ims && !pasid_en)
> +		return -EINVAL;
> +
> +	if (entry->trigger) {
> +		if (irq) {
> +			irq_bypass_unregister_producer(&entry->producer);
> +			free_irq(irq, entry->trigger);
> +			if (pasid_en) {
> +				auxval = ims_ctrl_pasid_aux(0, false);
> +				irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
> +			}
> +		}
> +		kfree(entry->name);
> +		eventfd_ctx_put(entry->trigger);
> +		entry->trigger = NULL;
> +	}
> +
> +	if (fd < 0)
> +		return 0;
> +
> +	name = kasprintf(GFP_KERNEL, "vfio-mdev-irq[%d](%s)", vector, dev_name(dev));
> +	if (!name)
> +		return -ENOMEM;
> +
> +	trigger = eventfd_ctx_fdget(fd);
> +	if (IS_ERR(trigger)) {
> +		kfree(name);
> +		return PTR_ERR(trigger);
> +	}
> +
> +	entry->name = name;
> +	entry->trigger = trigger;
> +
> +	if (!irq)
> +		return 0;
> +
> +	if (pasid_en) {
> +		auxval = ims_ctrl_pasid_aux(mdev_irq->pasid, true);
> +		rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
> +		if (rc < 0)
> +			goto err;

Why is anything to do with PASID here? Something has gone wrong with
the layers I suspect..

Oh yes. drivers/irqchip/irq-ims-msi.c is dxd specific and shouldn't be
pretending to be common code.

The protocol to stuff the pasid and other stuff into the auxdata is
also compeltely idxd specific and is just a hacky way to communicate
from this code to the IDXD irq-chip.

So this doesn't belong here either. Pass in the auxdata from the idxd
code and I'd rename the irq-ims-msi to irq-ims-idxd

> +static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
> +{
> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
> +	struct device *dev;
> +	int rc;
> +
> +	if (nvec != mdev_irq->num)
> +		return -EINVAL;
> +
> +	if (mdev_irq->ims_num) {
> +		dev = &mdev->dev;
> +		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);

Huh? The PCI device should be the only device touching IRQ stuff. I'm
nervous to see you mix in the mdev struct device into this function.

Isn't the msi_domain just idxd->ims_domain?

Jason
Dave Jiang May 28, 2021, 1:49 a.m. UTC | #2
On 5/23/2021 5:02 PM, Jason Gunthorpe wrote:
> On Fri, May 21, 2021 at 05:19:36PM -0700, Dave Jiang wrote:
>> Add common helper code to setup IMS once the MSI domain has been
>> setup by the device driver. The main helper function is
>> mdev_ims_set_msix_trigger() that is called by the VFIO ioctl
>> VFIO_DEVICE_SET_IRQS. The function deals with the setup and
>> teardown of emulated and IMS backed eventfd that gets exported
>> to the guest kernel via VFIO as MSIX vectors.
>>
>> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> ---
>>   drivers/vfio/mdev/Kconfig     |   12 ++
>>   drivers/vfio/mdev/Makefile    |    3
>>   drivers/vfio/mdev/mdev_irqs.c |  318 +++++++++++++++++++++++++++++++++++++++++
>>   include/linux/mdev.h          |   51 +++++++
>>   4 files changed, 384 insertions(+)
>>   create mode 100644 drivers/vfio/mdev/mdev_irqs.c
> IMS is not mdev specific, do not entangle it with mdev code. This
> should be generic VFIO stuff.
>
>> +static int mdev_msix_set_vector_signal(struct mdev_irq *mdev_irq, int vector, int fd)
>> +{
>> +	int rc, irq;
>> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
>> +	struct mdev_irq_entry *entry;
>> +	struct device *dev = &mdev->dev;
>> +	struct eventfd_ctx *trigger;
>> +	char *name;
>> +	bool pasid_en;
>> +	u32 auxval;
>> +
>> +	if (vector < 0 || vector >= mdev_irq->num)
>> +		return -EINVAL;
>> +
>> +	entry = &mdev_irq->irq_entries[vector];
>> +
>> +	if (entry->ims)
>> +		irq = dev_msi_irq_vector(dev, entry->ims_id);
>> +	else
>> +		irq = 0;
>> +
>> +	pasid_en = mdev_irq->pasid != INVALID_IOASID ? true : false;
>> +
>> +	/* IMS and invalid pasid is not a valid configuration */
>> +	if (entry->ims && !pasid_en)
>> +		return -EINVAL;
>> +
>> +	if (entry->trigger) {
>> +		if (irq) {
>> +			irq_bypass_unregister_producer(&entry->producer);
>> +			free_irq(irq, entry->trigger);
>> +			if (pasid_en) {
>> +				auxval = ims_ctrl_pasid_aux(0, false);
>> +				irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
>> +			}
>> +		}
>> +		kfree(entry->name);
>> +		eventfd_ctx_put(entry->trigger);
>> +		entry->trigger = NULL;
>> +	}
>> +
>> +	if (fd < 0)
>> +		return 0;
>> +
>> +	name = kasprintf(GFP_KERNEL, "vfio-mdev-irq[%d](%s)", vector, dev_name(dev));
>> +	if (!name)
>> +		return -ENOMEM;
>> +
>> +	trigger = eventfd_ctx_fdget(fd);
>> +	if (IS_ERR(trigger)) {
>> +		kfree(name);
>> +		return PTR_ERR(trigger);
>> +	}
>> +
>> +	entry->name = name;
>> +	entry->trigger = trigger;
>> +
>> +	if (!irq)
>> +		return 0;
>> +
>> +	if (pasid_en) {
>> +		auxval = ims_ctrl_pasid_aux(mdev_irq->pasid, true);
>> +		rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
>> +		if (rc < 0)
>> +			goto err;
> Why is anything to do with PASID here? Something has gone wrong with
> the layers I suspect..
>
> Oh yes. drivers/irqchip/irq-ims-msi.c is dxd specific and shouldn't be
> pretending to be common code.
>
> The protocol to stuff the pasid and other stuff into the auxdata is
> also compeltely idxd specific and is just a hacky way to communicate
> from this code to the IDXD irq-chip.
>
> So this doesn't belong here either. Pass in the auxdata from the idxd
> code and I'd rename the irq-ims-msi to irq-ims-idxd
>
>> +static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
>> +{
>> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
>> +	struct device *dev;
>> +	int rc;
>> +
>> +	if (nvec != mdev_irq->num)
>> +		return -EINVAL;
>> +
>> +	if (mdev_irq->ims_num) {
>> +		dev = &mdev->dev;
>> +		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);
> Huh? The PCI device should be the only device touching IRQ stuff. I'm
> nervous to see you mix in the mdev struct device into this function.

As we talked about in the other thread. We have a single IMS domain per 
device. The domain is set to the mdev 'struct device' and we allocate 
the vectors to each mdev 'struct device' so we can manage those IMS 
vectors specifically for that mdev.

>
> Isn't the msi_domain just idxd->ims_domain?

Yes


>
> Jason
Jason Gunthorpe May 28, 2021, 12:21 p.m. UTC | #3
On Thu, May 27, 2021 at 06:49:59PM -0700, Dave Jiang wrote:
> > > +static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
> > > +{
> > > +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
> > > +	struct device *dev;
> > > +	int rc;
> > > +
> > > +	if (nvec != mdev_irq->num)
> > > +		return -EINVAL;
> > > +
> > > +	if (mdev_irq->ims_num) {
> > > +		dev = &mdev->dev;
> > > +		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);
> > Huh? The PCI device should be the only device touching IRQ stuff. I'm
> > nervous to see you mix in the mdev struct device into this function.
> 
> As we talked about in the other thread. We have a single IMS domain per
> device. The domain is set to the mdev 'struct device' and we allocate the
> vectors to each mdev 'struct device' so we can manage those IMS vectors
> specifically for that mdev.

That is not the point, I'm asking if you should be calling
dev_set_msi_domain(mdev) at all

Jason
Dave Jiang May 28, 2021, 4:37 p.m. UTC | #4
On 5/28/2021 5:21 AM, Jason Gunthorpe wrote:
> On Thu, May 27, 2021 at 06:49:59PM -0700, Dave Jiang wrote:
>>>> +static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
>>>> +{
>>>> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
>>>> +	struct device *dev;
>>>> +	int rc;
>>>> +
>>>> +	if (nvec != mdev_irq->num)
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (mdev_irq->ims_num) {
>>>> +		dev = &mdev->dev;
>>>> +		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);
>>> Huh? The PCI device should be the only device touching IRQ stuff. I'm
>>> nervous to see you mix in the mdev struct device into this function.
>> As we talked about in the other thread. We have a single IMS domain per
>> device. The domain is set to the mdev 'struct device' and we allocate the
>> vectors to each mdev 'struct device' so we can manage those IMS vectors
>> specifically for that mdev.
> That is not the point, I'm asking if you should be calling
> dev_set_msi_domain(mdev) at all

I'm not familiar with the standard way of doing this. Should I not set 
the domain to the mdev 'struct device' because I can have multiple mdev 
using the same domain? With the domain set, I am able to retrieve it and 
call the msi_domain_alloc_irqs() in common code. Alternatively we can 
pass in the domain during init and not rely on dev->msi_domain.


>
> Jason
Jason Gunthorpe May 28, 2021, 4:39 p.m. UTC | #5
On Fri, May 28, 2021 at 09:37:56AM -0700, Dave Jiang wrote:
> 
> On 5/28/2021 5:21 AM, Jason Gunthorpe wrote:
> > On Thu, May 27, 2021 at 06:49:59PM -0700, Dave Jiang wrote:
> > > > > +static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
> > > > > +{
> > > > > +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
> > > > > +	struct device *dev;
> > > > > +	int rc;
> > > > > +
> > > > > +	if (nvec != mdev_irq->num)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (mdev_irq->ims_num) {
> > > > > +		dev = &mdev->dev;
> > > > > +		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);
> > > > Huh? The PCI device should be the only device touching IRQ stuff. I'm
> > > > nervous to see you mix in the mdev struct device into this function.
> > > As we talked about in the other thread. We have a single IMS domain per
> > > device. The domain is set to the mdev 'struct device' and we allocate the
> > > vectors to each mdev 'struct device' so we can manage those IMS vectors
> > > specifically for that mdev.
> > That is not the point, I'm asking if you should be calling
> > dev_set_msi_domain(mdev) at all
> 
> I'm not familiar with the standard way of doing this. Should I not set the
> domain to the mdev 'struct device' because I can have multiple mdev using
> the same domain? With the domain set, I am able to retrieve it and call the
> msi_domain_alloc_irqs() in common code. Alternatively we can pass in the
> domain during init and not rely on dev->msi_

Honestly, I don't know. I would prefer Thomas confirm what is the
correct way to use the msi_domain as IDXD is going to be the reference
everyone copies.

Jason
Thomas Gleixner May 31, 2021, 10:41 a.m. UTC | #6
On Fri, May 28 2021 at 13:39, Jason Gunthorpe wrote:
> On Fri, May 28, 2021 at 09:37:56AM -0700, Dave Jiang wrote:
>> On 5/28/2021 5:21 AM, Jason Gunthorpe wrote:
>> > On Thu, May 27, 2021 at 06:49:59PM -0700, Dave Jiang wrote:
>> > > > > +static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
>> > > > > +{
>> > > > > +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
>> > > > > +	struct device *dev;
>> > > > > +	int rc;
>> > > > > +
>> > > > > +	if (nvec != mdev_irq->num)
>> > > > > +		return -EINVAL;
>> > > > > +
>> > > > > +	if (mdev_irq->ims_num) {
>> > > > > +		dev = &mdev->dev;
>> > > > > +		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);
>> > > >
>> > > > Huh? The PCI device should be the only device touching IRQ stuff. I'm
>> > > > nervous to see you mix in the mdev struct device into this function.
>> > >
>> > > As we talked about in the other thread. We have a single IMS domain per
>> > > device. The domain is set to the mdev 'struct device' and we allocate the
>> > > vectors to each mdev 'struct device' so we can manage those IMS vectors
>> > > specifically for that mdev.
>> >
>> > That is not the point, I'm asking if you should be calling
>> > dev_set_msi_domain(mdev) at all
>> 
>> I'm not familiar with the standard way of doing this. Should I not set the
>> domain to the mdev 'struct device' because I can have multiple mdev using
>> the same domain? With the domain set, I am able to retrieve it and call the
>> msi_domain_alloc_irqs() in common code. Alternatively we can pass in the
>> domain during init and not rely on dev->msi_
>
> Honestly, I don't know. I would prefer Thomas confirm what is the
> correct way to use the msi_domain as IDXD is going to be the reference
> everyone copies.

The general expectation is that the MSI irqdomain is retrievable from
struct device for any device which supports MSI.

Thanks,

        tglx
Thomas Gleixner May 31, 2021, 1:48 p.m. UTC | #7
On Fri, May 21 2021 at 17:19, Dave Jiang wrote:
> Add common helper code to setup IMS once the MSI domain has been
> setup by the device driver. The main helper function is
> mdev_ims_set_msix_trigger() that is called by the VFIO ioctl
> VFIO_DEVICE_SET_IRQS. The function deals with the setup and
> teardown of emulated and IMS backed eventfd that gets exported
> to the guest kernel via VFIO as MSIX vectors.

So this talks about IMS, but the functionality is all named mdev_msix*
and mdev_irqs*. Confused.

> +/*
> + * Mediate device IMS library code

Mediated?

> +static int mdev_msix_set_vector_signal(struct mdev_irq *mdev_irq, int vector, int fd)
> +{
> +	int rc, irq;
> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
> +	struct mdev_irq_entry *entry;
> +	struct device *dev = &mdev->dev;
> +	struct eventfd_ctx *trigger;
> +	char *name;
> +	bool pasid_en;
> +	u32 auxval;
> +
> +	if (vector < 0 || vector >= mdev_irq->num)
> +		return -EINVAL;
> +
> +	entry = &mdev_irq->irq_entries[vector];
> +
> +	if (entry->ims)
> +		irq = dev_msi_irq_vector(dev, entry->ims_id);
> +	else
> +		irq = 0;

I have no idea what this does. Comments are overrated...

Aside of that dev_msi_irq_vector() seems to be a gross misnomer. AFAICT
it retrieves the Linux interrupt number and not some vector.

> +	pasid_en = mdev_irq->pasid != INVALID_IOASID ? true : false;
> +
> +	/* IMS and invalid pasid is not a valid configuration */
> +	if (entry->ims && !pasid_en)
> +		return -EINVAL;

Why is this not validated already?

> +	if (entry->trigger) {
> +		if (irq) {
> +			irq_bypass_unregister_producer(&entry->producer);
> +			free_irq(irq, entry->trigger);
> +			if (pasid_en) {
> +				auxval = ims_ctrl_pasid_aux(0, false);
> +				irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);

Why can't this be done in the irq chip when the interrupt is torn down?
Just because the irq chip driver, which is thankfully not merged yet,
has been implemented that way?

I did this aux dance because someone explained to me that this has to be
handled seperately and has to be changed independent of all the
interrupt setup and whatever. But looking at the actual usage now that's
clearly not the case.

What's the exact order of all this? I assume so:

    1) mdev_irqs_init()
    2) mdev_irqs_set_pasid()
    3) mdev_set_msix_trigger()

Right? See below.

> +}
> +EXPORT_SYMBOL_GPL(mdev_irqs_set_pasid);

> +	if (fd < 0)
> +		return 0;
> +
> +	name = kasprintf(GFP_KERNEL, "vfio-mdev-irq[%d](%s)", vector, dev_name(dev));
> +	if (!name)
> +		return -ENOMEM;
> +
> +	trigger = eventfd_ctx_fdget(fd);
> +	if (IS_ERR(trigger)) {
> +		kfree(name);
> +		return PTR_ERR(trigger);
> +	}
> +
> +	entry->name = name;
> +	entry->trigger = trigger;
> +
> +	if (!irq)
> +		return 0;

These exit conditions are completely confusing.

> +	if (pasid_en) {
> +		auxval = ims_ctrl_pasid_aux(mdev_irq->pasid, true);
> +		rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
> +		if (rc < 0)
> +			goto err;

Again. This can be handled in the interrupt chip when the interrupt is
set up through request_irq().

> +static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
> +{
> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
> +	struct device *dev;
> +	int rc;
> +
> +	if (nvec != mdev_irq->num)
> +		return -EINVAL;
> +
> +	if (mdev_irq->ims_num) {
> +		dev = &mdev->dev;
> +		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);

The allocation of the interrupts happens _after_ PASID has been
set and PASID is per device, right?

So the obvious place to store PASID is in struct device because the
device pointer is for one stored in the msi entry descriptor and it is
also handed down to the irq domain allocation function. So this can be
checked at allocation time already.

What's unclear to me is under which circumstances does the IMS interrupt
require a PASID.

   1) Always
   2) Use case dependent

Thanks,

        tglx
Jason Gunthorpe May 31, 2021, 3:24 p.m. UTC | #8
On Mon, May 31, 2021 at 03:48:35PM +0200, Thomas Gleixner wrote:

> What's unclear to me is under which circumstances does the IMS interrupt
> require a PASID.
> 
>    1) Always
>    2) Use case dependent

It is just a weird IDXD thing. The PASID is serving as some VM
identifier in that HW, somehow, AFAIK.

Jason
Dave Jiang June 8, 2021, 3:57 p.m. UTC | #9
On 5/31/2021 6:48 AM, Thomas Gleixner wrote:
> On Fri, May 21 2021 at 17:19, Dave Jiang wrote:
>> Add common helper code to setup IMS once the MSI domain has been
>> setup by the device driver. The main helper function is
>> mdev_ims_set_msix_trigger() that is called by the VFIO ioctl
>> VFIO_DEVICE_SET_IRQS. The function deals with the setup and
>> teardown of emulated and IMS backed eventfd that gets exported
>> to the guest kernel via VFIO as MSIX vectors.
> So this talks about IMS, but the functionality is all named mdev_msix*
> and mdev_irqs*. Confused.

Jason mentioned this as well. Will move to vfio_ims* common code.


>> +/*
>> + * Mediate device IMS library code
> Mediated?
>
>> +static int mdev_msix_set_vector_signal(struct mdev_irq *mdev_irq, int vector, int fd)
>> +{
>> +	int rc, irq;
>> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
>> +	struct mdev_irq_entry *entry;
>> +	struct device *dev = &mdev->dev;
>> +	struct eventfd_ctx *trigger;
>> +	char *name;
>> +	bool pasid_en;
>> +	u32 auxval;
>> +
>> +	if (vector < 0 || vector >= mdev_irq->num)
>> +		return -EINVAL;
>> +
>> +	entry = &mdev_irq->irq_entries[vector];
>> +
>> +	if (entry->ims)
>> +		irq = dev_msi_irq_vector(dev, entry->ims_id);
>> +	else
>> +		irq = 0;
> I have no idea what this does. Comments are overrated...
>
> Aside of that dev_msi_irq_vector() seems to be a gross misnomer. AFAICT
> it retrieves the Linux interrupt number and not some vector.

Will change function name to dev_msi_irq().


>
>> +	pasid_en = mdev_irq->pasid != INVALID_IOASID ? true : false;
>> +
>> +	/* IMS and invalid pasid is not a valid configuration */
>> +	if (entry->ims && !pasid_en)
>> +		return -EINVAL;
> Why is this not validated already?

Will remove.

>
>> +	if (entry->trigger) {
>> +		if (irq) {
>> +			irq_bypass_unregister_producer(&entry->producer);
>> +			free_irq(irq, entry->trigger);
>> +			if (pasid_en) {
>> +				auxval = ims_ctrl_pasid_aux(0, false);
>> +				irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
> Why can't this be done in the irq chip when the interrupt is torn down?
> Just because the irq chip driver, which is thankfully not merged yet,
> has been implemented that way?
>
> I did this aux dance because someone explained to me that this has to be
> handled seperately and has to be changed independent of all the
> interrupt setup and whatever. But looking at the actual usage now that's
> clearly not the case.
>
> What's the exact order of all this? I assume so:
>
>      1) mdev_irqs_init()
>      2) mdev_irqs_set_pasid()
>      3) mdev_set_msix_trigger()
>
> Right? See below.

I'll provide more info below. But yes we can add pasid to 'struct 
device'. The work on auxdata is appreciated and is still needed.


>
>> +}
>> +EXPORT_SYMBOL_GPL(mdev_irqs_set_pasid);
>> +	if (fd < 0)
>> +		return 0;
>> +
>> +	name = kasprintf(GFP_KERNEL, "vfio-mdev-irq[%d](%s)", vector, dev_name(dev));
>> +	if (!name)
>> +		return -ENOMEM;
>> +
>> +	trigger = eventfd_ctx_fdget(fd);
>> +	if (IS_ERR(trigger)) {
>> +		kfree(name);
>> +		return PTR_ERR(trigger);
>> +	}
>> +
>> +	entry->name = name;
>> +	entry->trigger = trigger;
>> +
>> +	if (!irq)
>> +		return 0;
> These exit conditions are completely confusing.

I will add more comments. For the IMS path, some vectors are emulated 
and some are backed by IMS. Thus the early exit.


>
>> +	if (pasid_en) {
>> +		auxval = ims_ctrl_pasid_aux(mdev_irq->pasid, true);
>> +		rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
>> +		if (rc < 0)
>> +			goto err;
> Again. This can be handled in the interrupt chip when the interrupt is
> set up through request_irq().
>
>> +static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
>> +{
>> +	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
>> +	struct device *dev;
>> +	int rc;
>> +
>> +	if (nvec != mdev_irq->num)
>> +		return -EINVAL;
>> +
>> +	if (mdev_irq->ims_num) {
>> +		dev = &mdev->dev;
>> +		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);
> The allocation of the interrupts happens _after_ PASID has been
> set and PASID is per device, right?
>
> So the obvious place to store PASID is in struct device because the
> device pointer is for one stored in the msi entry descriptor and it is
> also handed down to the irq domain allocation function. So this can be
> checked at allocation time already.
>
> What's unclear to me is under which circumstances does the IMS interrupt
> require a PASID.
>
>     1) Always
>     2) Use case dependent
>
Thomas, thank you for the review. I'll try to provide a summary below with what's going on with IMS after taking in yours and Jason's comments.

Interrupt Message Store (IMS) was designed to provide a more scalable means of interrupt storage compared to industry standard PCI-MSIx.
These can be defined in a device specific way per the SIOV spec.
* Not limited to 2048 vectors as specified by PCIe spec.
* Not limited how different parts of the interrupt, such as masking, pending bit array are located so facilitate different hardware configurations
   that allows devices to layout in a more device friendly way.
https://software.intel.com/content/www/us/en/develop/download/intel-scalable-io-virtualization-technical-specification.html

Two variants were created by your code:
https://lore.kernel.org/linux-hyperv/20200826112335.202234502@linutronix.de/
* IMS array – Mimics how DSA lays its IMS layout in hardware.
* IMS queue – free format memory based, devices such as graphics for e.g. could locate the IMS store in context maintained by system memory for interrupt
               message and data.

Devices such as Intel DSA provides ability for unprivileged software, host user space, or guests to submit work. The intent to notify when work is complete
is part of the work descriptor submitted to the DSA hardware.

                         Generic Descriptor Format
     +----------------------------------------------------------------+
     |                                                |    PASID      |
     +----------------------------------------------------------------+
     |                                                                |
     +----------------------------------------------------------------+
     |                                                                |
     +----------------------------------------------------------------+
     |            | Interrupt handle |                                |
     +----------------------------------------------------------------+
     |                                                                |
     +----------------------------------------------------------------+
     |                                                                |
     +----------------------------------------------------------------+
     |                                                                |
     +----------------------------------------------------------------+
     |                                                                |
     +----------------------------------------------------------------+
     |                                                                |
     +----------------------------------------------------------------+

DSA provides ability to allocate interrupt handles that are internally tied to HW irq. For IMS, the interrupt handle is the index for the IMS entries table.
This handle is submitted via work descriptors from unprivileged software. Either from host user space, or from other Virtual Machines. This allows ultimate
flexibility to software submitting the work, so hardware can notify completion via one of the interrupt handles. In order to ensure unprivileged software
doesn’t use a handle that doesn’t belong to it, DSA provides a facility for system software to associate a PASID with an interrupt handle, and DSA will ensure
the entity submitting the work is authorized to generate interrupt via this interrupt handle (PASID stored in IMS array should match PASID in descriptor).
The fact that the interrupt handle is tied to a PASID is implementation specific. The consumer of this interface doesn’t have any need to allocate a PASID
explicitly and is managed by privileged software.

DSA provides a way to skip PASID validation for IMS handles. This can be used if host kernel is the *only* agent generating work. Host usages without IOMMU
scalable mode are not currently implemented.

The PASID field in IMS entry is used to verify against the PASID that is associated with the submitted descriptor. The combination of the interrupt handle
(device IMS index) and the PASID verifies if the descriptor can generate the interrupt. On mismatch, invalid interrupt handle error (0x19) is generated by
the device in the software error register.
  
For a dedicated wq (dwq), the PASID is programmed into the WQ config register. When the descriptor is submitted to the WQ portal, the PASID from WQCFG is
compared the IMS entry as well as the interrupt handle that is programmed in the descriptor.

For a shared wq (swq), the PASID is either programmed in the descriptor for ENQCMDS or retrieved from the MSR in the case of ENQCMD. That PASID and the
interrupt handle is compared with the what is in the IMS entry.

With a match, the IMS interrupt is generated.

The following is the call flow for mdev without vSVM support:
1.    idxd host driver sets PASID from iommu_aux_get_pasid() to ‘struct device’
2.    idxd guest driver calls request_irq()
3.    VFIO calls VFIO_DEVICE_SET_IRQS ioctl
4.    idxd host driver calls vfio_set_ims_trigger() (newly created common helper function)
	a.    VFIO calls msi_domain_alloc_irqs() and programs valid 'struct device' PASID as auxdata to IMS entry
	b.    Host driver calls request_irq() for IMS interrupts

With a default pasid programmed to 'struct device', for this use case above we shouldn't have the need of programming pasid outside of irqchip.

For the use case of mdev with vSVM enabled, the code is staged for upstream submission after the current "mdev" series. When the guest idxd driver binds a
supervisor PASID, this guest PASID maps to a different host PASID and is not the default host PASID that is programmed to be programmed to the ‘struct device’.
The guest PASID is passed to the host driver through vendor specific means. The host driver needs to retrieve the host PASID that is mapped to this guest PASID
and program the that host PASID to the IMS entry. This is where the auxdata helper is needed and the PASID cannot be set via the common code path. The idxd
driver does this by having the guest driver fill the virtual MSIX permission table (device specific), which contains a PASID entry for each of the MSIX vectors
when SVA is turned on. The MMIO write to the guest vMSIXPERM table allows the host driver MMIO emulation code to retrieve the guest PASID and attempt to match
that with the host PASID. That host PASID is programmed to the IMS entry that is backing the guest MSIX vector. This cannot be done via the common path and
therefore requires the auxdata helper function to program the IMS PASID fields.

The following is the call flow for mdev with vSVM support:
1.    idxd host driver sets PASID to mdev ‘struct device’ via iommu_aux_get_PASID()
2.    idxd guest driver binds supervisor pasid
3.    idxd guest driver calls request_irq()
4.    VFIO calls VFIO_DEVICE_SET_IRQS ioctl
5.    idxd host driver calls vfio_set_ims_trigger()
	a.    VFIO calls msi_domain_alloc_irqs() and programs PASID as auxdata to IMS entry
	b.    Host driver calls request_irq() for IMS interrupts
6.    idxd guest driver programs virtual device MSIX permission table with guest PASID.
7.    Host driver mdev MMIO emulation retrieves guest PASID from vdev MSIXPERM table and matches to host PASID via ioasid_find_by_spid().
	a.    Host driver calls irq_set_auxdata() to change to the new PASID for IMS entry.
Jason Gunthorpe June 8, 2021, 5:22 p.m. UTC | #10
On Tue, Jun 08, 2021 at 08:57:35AM -0700, Dave Jiang wrote:

> In order to ensure unprivileged software doesn’t use a handle that
> doesn’t belong to it, DSA provides a facility for system software to
> associate a PASID with an interrupt handle, and DSA will ensure the
> entity submitting the work is authorized to generate interrupt via
> this interrupt handle (PASID stored in IMS array should match PASID
> in descriptor).

How does a SVA userspace allocate interrupt handles and make them
affine to the proper CPU(s)?

IIRC interrupts are quite limited per-CPU due to the x86 IDT,
generally I would expect a kernel driver to allocate at most one IRQ
per CPU.

However here you say each process using SVA needs a unique interrupt
handle with its PASID encoded in it. Since the IMS irqchip you are
using can't share IRQs between interrupt handles does this mean that
every time userspace creates a SVA it triggers consumption of an IMS
and IDT entry on some CPU? How is this secure against DOS of limited
kernel resources?

> driver does this by having the guest driver fill the virtual MSIX
> permission table (device specific), which contains a PASID entry for
> each of the MSIX vectors when SVA is turned on. The MMIO write to
> the guest vMSIXPERM table allows the host driver MMIO emulation code
> to retrieve the guest PASID and attempt to match that with the host
> PASID. That host PASID is programmed to the IMS entry that is
> backing the guest MSIX vector. This cannot be done via the common
> path and therefore requires the auxdata helper function to program
> the IMS PASID fields.

So a VM guest gets a SW emulated vMSIXPERM table along side MSI-X, but
the physical driver went down this IMS adventure?

And you had to do this because, as discussed earlier, true IMS is not
usable in the guest due to other platform problems?

Jason
Thomas Gleixner June 10, 2021, 1 p.m. UTC | #11
On Tue, Jun 08 2021 at 08:57, Dave Jiang wrote:
> On 5/31/2021 6:48 AM, Thomas Gleixner wrote:
>> What's unclear to me is under which circumstances does the IMS interrupt
>> require a PASID.
>>
>>     1) Always
>>     2) Use case dependent
>>
> Thomas, thank you for the review. I'll try to provide a summary below
> with what's going on with IMS after taking in yours and Jason's
> comments.

<snip>

No need to paste the manuals into mail.

</snip>

> DSA provides a way to skip PASID validation for IMS handles. This can
> be used if host kernel is the *only* agent generating work. Host
> usages without IOMMU scalable mode are not currently implemented.

So the IMS irq chip driver can do:

ims_array_alloc_msi_store(domain, dev)
{
        struct msi_domain_info *info = domain->host_data;
        struct ims_array_data *ims = info->data;

        if (ims->flags & VALIDATE_PASID) {
        	if (!valid_pasid(dev))
                	return -EINVAL;
        }

or something like that.

> The following is the call flow for mdev without vSVM support:
> 1.    idxd host driver sets PASID from iommu_aux_get_pasid() to ‘struct device’

Why needs every driver to implement that?

That should be part of the iommu management to store that.

> 2.    idxd guest driver calls request_irq()
> 3.    VFIO calls VFIO_DEVICE_SET_IRQS ioctl

How does the guest driver request_irq() end up in the VFIO ioctl on the
host?

> 4.    idxd host driver calls vfio_set_ims_trigger() (newly created common helper function)
> 	a.    VFIO calls msi_domain_alloc_irqs() and programs valid 'struct device' PASID as auxdata to IMS entry

VFIO does not program anything into the IMS entry.

The IMS irq chip driver retrieves PASID from struct device and does
that. That can be part of the domain allocation function, but there is
no requirement to do so. It can be done later, e.g. when the interrupt
is started up.

> 	b.    Host driver calls request_irq() for IMS interrupts
>
> With a default pasid programmed to 'struct device', for this use case
> above we shouldn't have the need of programming pasid outside of
> irqchip.

s/shouldn't/do not/

> The following is the call flow for mdev with vSVM support:
> 1. idxd host driver sets PASID to mdev ‘struct device’ via iommu_aux_get_PASID()
> 2. idxd guest driver binds supervisor pasid
> 3. idxd guest driver calls request_irq()
> 4. VFIO calls VFIO_DEVICE_SET_IRQS ioctl
> 5. idxd host driver calls vfio_set_ims_trigger()
>    a. VFIO calls msi_domain_alloc_irqs() and programs PASID as auxdata to IMS entry
>    b. Host driver calls request_irq() for IMS interrupts
> 6. idxd guest driver programs virtual device MSIX permission table with guest PASID.
> 7. Host driver mdev MMIO emulation retrieves guest PASID from vdev
>    MSIXPERM table and matches to host PASID via ioasid_find_by_spid().
>    a. Host driver calls irq_set_auxdata() to change to the new PASID
>       for IMS entry.

What enforces this ordering? Certainly not the hardware.

The guest driver knows the guest PASID _before_ interrupts are allocated
or requested for the device. So it can store the guest PASID _before_ it
triggers the mechanism which makes vfio/host initialize the interrupts.

So no. It's not needed at all. It's pretty much the same as the host
side driver except for the that MSIXPERM stuff.

And just for the record. Setting MSIXPERM _after_ request_irq()
completed is just wrong because if an interrupt is raised _before_ that
MSIXPERM muck is set up, then it will fire with the host PASID and not
with the guest's.

This whole IDXD stuff has been a monstrous layering violation from the
very beginning and unfortunately this hasn't changed much since then.

Thanks,

        tglx
diff mbox series

Patch

diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
index 763c877a1318..82f79d99a7db 100644
--- a/drivers/vfio/mdev/Kconfig
+++ b/drivers/vfio/mdev/Kconfig
@@ -9,3 +9,15 @@  config VFIO_MDEV
 	  See Documentation/driver-api/vfio-mediated-device.rst for more details.
 
 	  If you don't know what do here, say N.
+
+config VFIO_MDEV_IRQS
+	bool "Mediated device driver common lib code for interrupts"
+	depends on VFIO_MDEV
+	select IMS_MSI_ARRAY
+	select IRQ_BYPASS_MANAGER
+	default n
+	help
+	  Provide common library code to deal with IMS interrupts for mediated
+	  devices.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
index 7c236ba1b90e..c3f160cae192 100644
--- a/drivers/vfio/mdev/Makefile
+++ b/drivers/vfio/mdev/Makefile
@@ -2,4 +2,7 @@ 
 
 mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
 
+mdev-$(CONFIG_VFIO_MDEV_IRQS) += mdev_irqs.o
+
 obj-$(CONFIG_VFIO_MDEV) += mdev.o
+
diff --git a/drivers/vfio/mdev/mdev_irqs.c b/drivers/vfio/mdev/mdev_irqs.c
new file mode 100644
index 000000000000..ed2d11a7c729
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_irqs.c
@@ -0,0 +1,318 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Mediate device IMS library code
+ *
+ * Copyright (c) 2021 Intel Corp. All rights reserved.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/irqchip/irq-ims-msi.h>
+#include <linux/eventfd.h>
+#include <linux/irqreturn.h>
+#include <linux/msi.h>
+#include <linux/vfio.h>
+#include <linux/irqbypass.h>
+#include <linux/mdev.h>
+
+static irqreturn_t mdev_irq_handler(int irq, void *arg)
+{
+	struct eventfd_ctx *trigger = arg;
+
+	eventfd_signal(trigger, 1);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Common helper routine to send signal to the eventfd that has been setup.
+ *
+ * @mdev_irq [in]		: struct mdev_irq context
+ * @vector [in]			: vector index for eventfd
+ *
+ * No return value.
+ */
+void mdev_msix_send_signal(struct mdev_device *mdev, int vector)
+{
+	struct mdev_irq *mdev_irq = &mdev->mdev_irq;
+	struct eventfd_ctx *trigger = mdev_irq->irq_entries[vector].trigger;
+
+	if (!mdev_irq->irq_entries || !trigger) {
+		dev_warn(&mdev->dev, "EventFD %d trigger not setup, can't send!\n", vector);
+		return;
+	}
+	mdev_irq_handler(0, (void *)trigger);
+}
+EXPORT_SYMBOL_GPL(mdev_msix_send_signal);
+
+static int mdev_msix_set_vector_signal(struct mdev_irq *mdev_irq, int vector, int fd)
+{
+	int rc, irq;
+	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
+	struct mdev_irq_entry *entry;
+	struct device *dev = &mdev->dev;
+	struct eventfd_ctx *trigger;
+	char *name;
+	bool pasid_en;
+	u32 auxval;
+
+	if (vector < 0 || vector >= mdev_irq->num)
+		return -EINVAL;
+
+	entry = &mdev_irq->irq_entries[vector];
+
+	if (entry->ims)
+		irq = dev_msi_irq_vector(dev, entry->ims_id);
+	else
+		irq = 0;
+
+	pasid_en = mdev_irq->pasid != INVALID_IOASID ? true : false;
+
+	/* IMS and invalid pasid is not a valid configuration */
+	if (entry->ims && !pasid_en)
+		return -EINVAL;
+
+	if (entry->trigger) {
+		if (irq) {
+			irq_bypass_unregister_producer(&entry->producer);
+			free_irq(irq, entry->trigger);
+			if (pasid_en) {
+				auxval = ims_ctrl_pasid_aux(0, false);
+				irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
+			}
+		}
+		kfree(entry->name);
+		eventfd_ctx_put(entry->trigger);
+		entry->trigger = NULL;
+	}
+
+	if (fd < 0)
+		return 0;
+
+	name = kasprintf(GFP_KERNEL, "vfio-mdev-irq[%d](%s)", vector, dev_name(dev));
+	if (!name)
+		return -ENOMEM;
+
+	trigger = eventfd_ctx_fdget(fd);
+	if (IS_ERR(trigger)) {
+		kfree(name);
+		return PTR_ERR(trigger);
+	}
+
+	entry->name = name;
+	entry->trigger = trigger;
+
+	if (!irq)
+		return 0;
+
+	if (pasid_en) {
+		auxval = ims_ctrl_pasid_aux(mdev_irq->pasid, true);
+		rc = irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
+		if (rc < 0)
+			goto err;
+	}
+
+	rc = request_irq(irq, mdev_irq_handler, 0, name, trigger);
+	if (rc < 0)
+		goto irq_err;
+
+	entry->producer.token = trigger;
+	entry->producer.irq = irq;
+	rc = irq_bypass_register_producer(&entry->producer);
+	if (unlikely(rc)) {
+		dev_warn(dev, "irq bypass producer (token %p) registration fails: %d\n",
+			 &entry->producer.token, rc);
+		entry->producer.token = NULL;
+	}
+
+	return 0;
+
+ irq_err:
+	if (pasid_en) {
+		auxval = ims_ctrl_pasid_aux(0, false);
+		irq_set_auxdata(irq, IMS_AUXDATA_CONTROL_WORD, auxval);
+	}
+ err:
+	kfree(name);
+	eventfd_ctx_put(trigger);
+	entry->trigger = NULL;
+	return rc;
+}
+
+static int mdev_msix_set_vector_signals(struct mdev_irq *mdev_irq, unsigned int start,
+					unsigned int count, int *fds)
+{
+	int i, j, rc = 0;
+
+	if (start >= mdev_irq->num || start + count > mdev_irq->num)
+		return -EINVAL;
+
+	for (i = 0, j = start; j < count && !rc; i++, j++) {
+		int fd = fds ? fds[i] : -1;
+
+		rc = mdev_msix_set_vector_signal(mdev_irq, j, fd);
+	}
+
+	if (rc) {
+		for (--j; j >= (int)start; j--)
+			mdev_msix_set_vector_signal(mdev_irq, j, -1);
+	}
+
+	return rc;
+}
+
+static int mdev_msix_enable(struct mdev_irq *mdev_irq, int nvec)
+{
+	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
+	struct device *dev;
+	int rc;
+
+	if (nvec != mdev_irq->num)
+		return -EINVAL;
+
+	if (mdev_irq->ims_num) {
+		dev = &mdev->dev;
+		rc = msi_domain_alloc_irqs(dev_get_msi_domain(dev), dev, mdev_irq->ims_num);
+		if (rc < 0)
+			return rc;
+	}
+
+	mdev_irq->irq_type = VFIO_PCI_MSIX_IRQ_INDEX;
+	return 0;
+}
+
+static int mdev_msix_disable(struct mdev_irq *mdev_irq)
+{
+	struct mdev_device *mdev = irq_to_mdev(mdev_irq);
+	struct device *dev = &mdev->dev;
+	struct irq_domain *irq_domain;
+
+	mdev_msix_set_vector_signals(mdev_irq, 0, mdev_irq->num, NULL);
+	irq_domain = dev_get_msi_domain(&mdev->dev);
+	if (irq_domain)
+		msi_domain_free_irqs(irq_domain, dev);
+	mdev_irq->irq_type = VFIO_PCI_NUM_IRQS;
+	return 0;
+}
+
+/*
+ * Common helper function that sets up the MSIX vectors for the mdev device that are
+ * Interrupt Message Store (IMS) backed. Certain mdev devices can have the first
+ * vector emulated rather than backed by IMS.
+ *
+ *  @mdev [in]		: mdev device
+ *  @index [in]		: type of VFIO vectors to setup
+ *  @start [in]		: start position of the vector index
+ *  @count [in]		: number of vectors
+ *  @flags [in]		: VFIO_IRQ action to be taken
+ *  @data [in]		: data accompanied for the call
+ *  Return error code on failure or 0 on success.
+ */
+
+int mdev_set_msix_trigger(struct mdev_device *mdev, unsigned int index,
+			  unsigned int start, unsigned int count, u32 flags,
+			  void *data)
+{
+	struct mdev_irq *mdev_irq = &mdev->mdev_irq;
+	int i, rc = 0;
+
+	if (count > mdev_irq->num)
+		count = mdev_irq->num;
+
+	if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
+		mdev_msix_disable(mdev_irq);
+		return 0;
+	}
+
+	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		int *fds = data;
+
+		if (mdev_irq->irq_type == index)
+			return mdev_msix_set_vector_signals(mdev_irq, start, count, fds);
+
+		rc = mdev_msix_enable(mdev_irq, start + count);
+		if (rc < 0)
+			return rc;
+
+		rc = mdev_msix_set_vector_signals(mdev_irq, start, count, fds);
+		if (rc < 0)
+			mdev_msix_disable(mdev_irq);
+
+		return rc;
+	}
+
+	if (start + count > mdev_irq->num)
+		return -EINVAL;
+
+	for (i = start; i < start + count; i++) {
+		if (!mdev_irq->irq_entries[i].trigger)
+			continue;
+		if (flags & VFIO_IRQ_SET_DATA_NONE) {
+			eventfd_signal(mdev_irq->irq_entries[i].trigger, 1);
+		} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+			u8 *bools = data;
+
+			if (bools[i - start])
+				eventfd_signal(mdev_irq->irq_entries[i].trigger, 1);
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mdev_set_msix_trigger);
+
+void mdev_irqs_set_pasid(struct mdev_device *mdev, u32 pasid)
+{
+	mdev->mdev_irq.pasid = pasid;
+}
+EXPORT_SYMBOL_GPL(mdev_irqs_set_pasid);
+
+/*
+ * Initialize and setup the mdev_irq context under mdev.
+ *
+ * @mdev [in]		: mdev device
+ * @num [in]		: number of vectors
+ * @ims_map [in]	: bool array that indicates whether a guest MSIX vector is
+ *			  backed by an IMS vector or emulated
+ * Return error code on failure or 0 on success.
+ */
+int mdev_irqs_init(struct mdev_device *mdev, int num, bool *ims_map)
+{
+	struct mdev_irq *mdev_irq = &mdev->mdev_irq;
+	int i;
+
+	if (num < 1)
+		return -EINVAL;
+
+	mdev_irq->irq_type = VFIO_PCI_NUM_IRQS;
+	mdev_irq->num = num;
+	mdev_irq->pasid = INVALID_IOASID;
+
+	mdev_irq->irq_entries = kcalloc(num, sizeof(*mdev_irq->irq_entries), GFP_KERNEL);
+	if (!mdev_irq->irq_entries)
+		return -ENOMEM;
+
+	for (i = 0; i < num; i++) {
+		mdev_irq->irq_entries[i].ims = ims_map[i];
+		if (ims_map[i]) {
+			mdev_irq->irq_entries[i].ims_id = mdev_irq->ims_num;
+			mdev_irq->ims_num++;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mdev_irqs_init);
+
+/*
+ * Free allocated memory in mdev_irq
+ *
+ * @mdev [in]		: mdev device
+ */
+void mdev_irqs_free(struct mdev_device *mdev)
+{
+	kfree(mdev->mdev_irq.irq_entries);
+	memset(&mdev->mdev_irq, 0, sizeof(mdev->mdev_irq));
+}
+EXPORT_SYMBOL_GPL(mdev_irqs_free);
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 0cd8db2d3422..035c021e8068 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -10,8 +10,26 @@ 
 #ifndef MDEV_H
 #define MDEV_H
 
+#include <linux/irqbypass.h>
+
 struct mdev_type;
 
+struct mdev_irq_entry {
+	struct eventfd_ctx *trigger;
+	struct irq_bypass_producer producer;
+	char *name;
+	bool ims;
+	int ims_id;
+};
+
+struct mdev_irq {
+	struct mdev_irq_entry *irq_entries;
+	int num;
+	int ims_num;
+	int irq_type;
+	int pasid;
+};
+
 struct mdev_device {
 	struct device dev;
 	guid_t uuid;
@@ -19,8 +37,14 @@  struct mdev_device {
 	struct mdev_type *type;
 	struct device *iommu_device;
 	struct mutex creation_lock;
+	struct mdev_irq mdev_irq;
 };
 
+static inline struct mdev_device *irq_to_mdev(struct mdev_irq *mdev_irq)
+{
+	return container_of(mdev_irq, struct mdev_device, mdev_irq);
+}
+
 static inline struct mdev_device *to_mdev_device(struct device *dev)
 {
 	return container_of(dev, struct mdev_device, dev);
@@ -99,4 +123,31 @@  static inline struct mdev_device *mdev_from_dev(struct device *dev)
 	return dev->bus == &mdev_bus_type ? to_mdev_device(dev) : NULL;
 }
 
+#if IS_ENABLED(CONFIG_VFIO_MDEV_IRQS)
+int mdev_set_msix_trigger(struct mdev_device *mdev, unsigned int index,
+			  unsigned int start, unsigned int count, u32 flags,
+			  void *data);
+void mdev_msix_send_signal(struct mdev_device *mdev, int vector);
+int mdev_irqs_init(struct mdev_device *mdev, int num, bool *ims_map);
+void mdev_irqs_free(struct mdev_device *mdev);
+void mdev_irqs_set_pasid(struct mdev_device *mdev, u32 pasid);
+#else
+static inline int mdev_set_msix_trigger(struct mdev_device *mdev, unsigned int index,
+					unsigned int start, unsigned int count, u32 flags,
+					void *data)
+{
+	return -EOPNOTSUPP;
+}
+
+void mdev_msix_send_signal(struct mdev_device *mdev, int vector) {}
+
+static inline int mdev_irqs_init(struct mdev_device *mdev, int num, bool *ims_map)
+{
+	return -EOPNOTSUPP;
+}
+
+void mdev_irqs_free(struct mdev_device *mdev) {}
+void mdev_irqs_set_pasid(struct mdev_device *mdev, u32 pasid) {}
+#endif /* CONFIG_VFIO_MDEV_IMS */
+
 #endif /* MDEV_H */