diff mbox series

[v3,05/18] dmaengine: idxd: add IMS support in base driver

Message ID 160021248979.67751.3799965857372703876.stgit@djiang5-desk3.ch.intel.com
State Not Applicable
Headers show
Series [v3,01/18] irqchip: Add IMS (Interrupt Message Storage) driver | expand

Commit Message

Dave Jiang Sept. 15, 2020, 11:28 p.m. UTC
In preparation for support of VFIO mediated device for idxd driver, the
enabling for Interrupt Message Store (IMS) interrupts is added for the idxd
With IMS support the idxd driver can dynamically allocate interrupts on a
per mdev basis based on how many IMS vectors that are mapped to the mdev
device. This commit only provides the support functions in the base driver
and not the VFIO mdev code utilization.

The commit has some portal related changes. A "portal" is a special
location within the MMIO BAR2 of the DSA device where descriptors are
submitted via the CPU command MOVDIR64B or ENQCMD(S). The offset for the
portal address determines whether the submitted descriptor is for MSI-X
or IMS notification.

See Intel SIOV spec for more details:
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dma/idxd/cdev.c   |    4 ++-
 drivers/dma/idxd/idxd.h   |   14 +++++++++---
 drivers/dma/idxd/init.c   |   52 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/dma/idxd/submit.c |   11 ++++++++--
 drivers/dma/idxd/sysfs.c  |   11 ++++++++++
 5 files changed, 84 insertions(+), 8 deletions(-)

Comments

Thomas Gleixner Sept. 30, 2020, 6:47 p.m. UTC | #1
On Tue, Sep 15 2020 at 16:28, Dave Jiang wrote:
>  struct idxd_device {
> @@ -170,6 +171,7 @@ struct idxd_device {
>  
>  	int num_groups;
>  
> +	u32 ims_offset;
>  	u32 msix_perm_offset;
>  	u32 wqcfg_offset;
>  	u32 grpcfg_offset;
> @@ -177,6 +179,7 @@ struct idxd_device {
>  
>  	u64 max_xfer_bytes;
>  	u32 max_batch_size;
> +	int ims_size;
>  	int max_groups;
>  	int max_engines;
>  	int max_tokens;
> @@ -196,6 +199,7 @@ struct idxd_device {
>  	struct work_struct work;
>  
>  	int *int_handles;
> +	struct sbitmap ims_sbmap;

This bitmap is needed for what?

> --- a/drivers/dma/idxd/init.c
> +++ b/drivers/dma/idxd/init.c
> @@ -231,10 +231,51 @@ static void idxd_read_table_offsets(struct idxd_device *idxd)
>  	idxd->msix_perm_offset = offsets.msix_perm * 0x100;
>  	dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n",
>  		idxd->msix_perm_offset);
> +	idxd->ims_offset = offsets.ims * 0x100;

Magic constant pulled out of thin air. #define ....

> +	dev_dbg(dev, "IDXD IMS Offset: %#x\n", idxd->ims_offset);
>  	idxd->perfmon_offset = offsets.perfmon * 0x100;
>  	dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset);
>  }
>  
> +#define PCI_DEVSEC_CAP		0x23
> +#define SIOVDVSEC1(offset)	((offset) + 0x4)
> +#define SIOVDVSEC2(offset)	((offset) + 0x8)
> +#define DVSECID			0x5
> +#define SIOVCAP(offset)		((offset) + 0x14)
> +
> +static void idxd_check_siov(struct idxd_device *idxd)
> +{
> +	struct pci_dev *pdev = idxd->pdev;
> +	struct device *dev = &pdev->dev;
> +	int dvsec;
> +	u16 val16;
> +	u32 val32;
> +
> +	dvsec = pci_find_ext_capability(pdev, PCI_DEVSEC_CAP);
> +	pci_read_config_word(pdev, SIOVDVSEC1(dvsec), &val16);
> +	if (val16 != PCI_VENDOR_ID_INTEL) {
> +		dev_dbg(&pdev->dev, "DVSEC vendor id is not Intel\n");
> +		return;
> +	}
> +
> +	pci_read_config_word(pdev, SIOVDVSEC2(dvsec), &val16);
> +	if (val16 != DVSECID) {
> +		dev_dbg(&pdev->dev, "DVSEC ID is not SIOV\n");
> +		return;
> +	}
> +
> +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
> +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
> +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
> +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
> +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
> +		dev_dbg(&pdev->dev, "IMS supported for device\n");
> +		return;
> +	}
> +
> +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");

It's really hard to find the code inside all of this dev_dbg()
noise. But why is this capability check done in this driver? Is this
capability stuff really IDXD specific or is the next device which
supports this going to copy and pasta the above?

>  static void idxd_read_caps(struct idxd_device *idxd)
>  {
>  	struct device *dev = &idxd->pdev->dev;
> @@ -253,6 +294,7 @@ static void idxd_read_caps(struct idxd_device *idxd)
>  	dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes);
>  	idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift;
>  	dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size);
> +	idxd_check_siov(idxd);
>  	if (idxd->hw.gen_cap.config_en)
>  		set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags);
>  
> @@ -347,9 +389,19 @@ static int idxd_probe(struct idxd_device *idxd)
>  
>  	idxd->major = idxd_cdev_get_major(idxd);
>  
> +	if (idxd->ims_size) {
> +		rc = sbitmap_init_node(&idxd->ims_sbmap, idxd->ims_size, -1,
> +				       GFP_KERNEL, dev_to_node(dev));
> +		if (rc < 0)
> +			goto sbitmap_fail;
> +	}

Ah, here the bitmap is allocated, but it's still completely unclear what
it is used for.

The subject line is misleading as hell. This does not add support, it's
doing some magic capability checks and allocates stuff which nobody
knows what it is used for.

Thanks,

        tglx
Jason Gunthorpe Sept. 30, 2020, 6:51 p.m. UTC | #2
On Wed, Sep 30, 2020 at 08:47:00PM +0200, Thomas Gleixner wrote:

> > +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
> > +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
> > +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
> > +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
> > +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
> > +		dev_dbg(&pdev->dev, "IMS supported for device\n");
> > +		return;
> > +	}
> > +
> > +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
> 
> It's really hard to find the code inside all of this dev_dbg()
> noise. But why is this capability check done in this driver? Is this
> capability stuff really IDXD specific or is the next device which
> supports this going to copy and pasta the above?

It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
SIOV cookbook, but as far as I can see it serves no purpose at
all.

Last time I asked I got some unclear mumbling about "OEMs".

I expect you'll see all Intel drivers copying this code.

Jason
Thomas Gleixner Sept. 30, 2020, 9:48 p.m. UTC | #3
On Wed, Sep 30 2020 at 15:51, Jason Gunthorpe wrote:

> On Wed, Sep 30, 2020 at 08:47:00PM +0200, Thomas Gleixner wrote:
>
>> > +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
>> > +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
>> > +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
>> > +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
>> > +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
>> > +		dev_dbg(&pdev->dev, "IMS supported for device\n");
>> > +		return;
>> > +	}
>> > +
>> > +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
>> 
>> It's really hard to find the code inside all of this dev_dbg()
>> noise. But why is this capability check done in this driver? Is this
>> capability stuff really IDXD specific or is the next device which
>> supports this going to copy and pasta the above?
>
> It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
> SIOV cookbook, but as far as I can see it serves no purpose at
> all.

Why am I not surprised?

> Last time I asked I got some unclear mumbling about "OEMs".

See above.

But it reads the IMS storage array size out of this capability, so it
looks like it has some value.

> I expect you'll see all Intel drivers copying this code.

Just to set the expectations straight:

   1) Has this capability stuff any value aside of being mentioned in
      the SIOV cookbook?

   2) If it has no value, then just remove the mess

   3) If it has value then this wants to go to the PCI core and fill in
      some SIOV specific data structure when PCI evaluates the
      capabilities. Or at least have a generic function which can be
      called by the magic SIOV capable drivers.

Thanks,

        tglx
Raj, Ashok Sept. 30, 2020, 9:49 p.m. UTC | #4
Hi Jason

On Wed, Sep 30, 2020 at 03:51:03PM -0300, Jason Gunthorpe wrote:
> On Wed, Sep 30, 2020 at 08:47:00PM +0200, Thomas Gleixner wrote:
> 
> > > +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
> > > +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
> > > +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
> > > +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
> > > +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
> > > +		dev_dbg(&pdev->dev, "IMS supported for device\n");
> > > +		return;
> > > +	}
> > > +
> > > +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
> > 
> > It's really hard to find the code inside all of this dev_dbg()
> > noise. But why is this capability check done in this driver? Is this
> > capability stuff really IDXD specific or is the next device which
> > supports this going to copy and pasta the above?
> 
> It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
> SIOV cookbook, but as far as I can see it serves no purpose at
> all.
> 
> Last time I asked I got some unclear mumbling about "OEMs".
> 
One of the parameters it has is the "supported system page-sizes" which is
usually there in the SRIOV properties. So it needed a place holder for
that. 

IMS is a device specific capability, and I almost forgot why we needed
until I had to checking internally. Remember when a device is given to a
guest, MSIX routing via Interrupt Remapping is automatic via the VFIO/IRQFD
and such.

When we provision an entire PCI device that is IMS capable. The guest
driver does know it can update the IMS entries directly without going to
the host. But in order to do remapping we need something like how we manage
PASID allocation from guest, so an IRTE entry can be allocated and the host
driver can write the proper values for IMS.

Hope this helps clear things up.

Cheers,
Ashok
Thomas Gleixner Sept. 30, 2020, 9:57 p.m. UTC | #5
On Wed, Sep 30 2020 at 14:49, Ashok Raj wrote:
>> It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
>> SIOV cookbook, but as far as I can see it serves no purpose at
>> all.
>> 
>> Last time I asked I got some unclear mumbling about "OEMs".
>> 
> One of the parameters it has is the "supported system page-sizes" which is
> usually there in the SRIOV properties. So it needed a place holder for
> that. 
>
> IMS is a device specific capability, and I almost forgot why we needed
> until I had to checking internally. Remember when a device is given to a
> guest, MSIX routing via Interrupt Remapping is automatic via the VFIO/IRQFD
> and such.

-ENOPARSE

> When we provision an entire PCI device that is IMS capable. The guest
> driver does know it can update the IMS entries directly without going to
> the host. But in order to do remapping we need something like how we manage
> PASID allocation from guest, so an IRTE entry can be allocated and the host
> driver can write the proper values for IMS.

And how is that related to that capbility thing?

Also this stuff is host side and not guest side. I seriously doubt that
you want to hand in the whole PCI device which contains the IMS
thing. The whole point of IMS was as far as I was told that you can
create gazillions of subdevices and have seperate MSI interrupts for
each of them.

Thanks,

        tglx
Jason Gunthorpe Sept. 30, 2020, 10:38 p.m. UTC | #6
On Wed, Sep 30, 2020 at 02:49:41PM -0700, Raj, Ashok wrote:

> One of the parameters it has is the "supported system page-sizes" which is
> usually there in the SRIOV properties. So it needed a place holder for
> that. 

No idea why this would be a PCI cap. It is certainly not something so
universal it needs standardizing. There are many ways a device can
manage a BAR to match a required protection granularity.

> When we provision an entire PCI device that is IMS capable. The guest
> driver does know it can update the IMS entries directly without going to
> the host. But in order to do remapping we need something like how we manage
> PASID allocation from guest, so an IRTE entry can be allocated and the host
> driver can write the proper values for IMS.

Look at the architecture we ended up with.

You need to make pci_subdevice_msi_create_irq_domain() fail if the
platform can't provide the functionality.

Working around that with PCI caps is pretty gross.

Jason
Raj, Ashok Oct. 1, 2020, 1:07 a.m. UTC | #7
Hi Thomas,

On Wed, Sep 30, 2020 at 11:57:22PM +0200, Thomas Gleixner wrote:
> On Wed, Sep 30 2020 at 14:49, Ashok Raj wrote:
> >> It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
> >> SIOV cookbook, but as far as I can see it serves no purpose at
> >> all.
> >> 
> >> Last time I asked I got some unclear mumbling about "OEMs".
> >> 
> > One of the parameters it has is the "supported system page-sizes" which is
> > usually there in the SRIOV properties. So it needed a place holder for
> > that. 
> >
> > IMS is a device specific capability, and I almost forgot why we needed
> > until I had to checking internally. Remember when a device is given to a
> > guest, MSIX routing via Interrupt Remapping is automatic via the VFIO/IRQFD
> > and such.
> 
> -ENOPARSE

Let me try again to see if this will turn into ESUCCESS :-)

Devices exposed to guest need host OS support for programming interrupt
entries in the IOMMU interrupt remapping table. VFIO provides those 
services for standard interrupt schemes like MSI/MSIx for instance. 
Since IMS is device specific VFIO can't provide an intercept when 
IMS entries are programmed by the guest OS. 

If the virtualisation software doesn't expose vIOMMU with virtual
capabilities to allocate IRTE entries and support for vIRTE in guest
then its expected to turn off the IMS capability. Hence driver running 
in guest will not enable IMS.

> 
> > When we provision an entire PCI device that is IMS capable. The guest
> > driver does know it can update the IMS entries directly without going to
> > the host. But in order to do remapping we need something like how we manage
> > PASID allocation from guest, so an IRTE entry can be allocated and the host
> > driver can write the proper values for IMS.
> 
> And how is that related to that capbility thing?
> 
> Also this stuff is host side and not guest side. I seriously doubt that
> you want to hand in the whole PCI device which contains the IMS

You are right, but nothing prevents a user from simply taking a full PCI
device and assign to guest. 

> thing. The whole point of IMS was as far as I was told that you can
> create gazillions of subdevices and have seperate MSI interrupts for
> each of them.


Cheers,
Ashok
Thomas Gleixner Oct. 1, 2020, 8:44 a.m. UTC | #8
On Wed, Sep 30 2020 at 18:07, Ashok Raj wrote:
> On Wed, Sep 30, 2020 at 11:57:22PM +0200, Thomas Gleixner wrote:
>
> Devices exposed to guest need host OS support for programming interrupt
> entries in the IOMMU interrupt remapping table. VFIO provides those 
> services for standard interrupt schemes like MSI/MSIx for instance. 
> Since IMS is device specific VFIO can't provide an intercept when 
> IMS entries are programmed by the guest OS.

Why is IMS exposed to the guest in the first place? You expose a
subdevice to a guest, right? And that subdevice should not even know
that IMS exists simply because IMS is strictly host specific.

The obvious emulation here is to make the subdevice look like a PCI
device and expose emulated MSIX (not MSI) which is intercepted when
accessing the MSIX table and then redirected to the proper place along
with IRTE and PASID and whatever.

>> Also this stuff is host side and not guest side. I seriously doubt that
>> you want to hand in the whole PCI device which contains the IMS
>
> You are right, but nothing prevents a user from simply taking a full PCI
> device and assign to guest. 

You surely can and should prevent that because it makes no sense and
cannot work.

That's why you want a generic check for 'this device is magic SIOV or
whatever' and not something burried deep into a driver..

Thanks,

        tglx
Dave Jiang Oct. 1, 2020, 8:48 p.m. UTC | #9
On 9/30/2020 11:47 AM, Thomas Gleixner wrote:
> On Tue, Sep 15 2020 at 16:28, Dave Jiang wrote:
>>   struct idxd_device {
>> @@ -170,6 +171,7 @@ struct idxd_device {
>>   
>>   	int num_groups;
>>   
>> +	u32 ims_offset;
>>   	u32 msix_perm_offset;
>>   	u32 wqcfg_offset;
>>   	u32 grpcfg_offset;
>> @@ -177,6 +179,7 @@ struct idxd_device {
>>   
>>   	u64 max_xfer_bytes;
>>   	u32 max_batch_size;
>> +	int ims_size;
>>   	int max_groups;
>>   	int max_engines;
>>   	int max_tokens;
>> @@ -196,6 +199,7 @@ struct idxd_device {
>>   	struct work_struct work;
>>   
>>   	int *int_handles;
>> +	struct sbitmap ims_sbmap;
> 
> This bitmap is needed for what?

Nothing anymore. I forgot to remove. All this is handled by MSI core now with 
code from you.

> 
>> --- a/drivers/dma/idxd/init.c
>> +++ b/drivers/dma/idxd/init.c
>> @@ -231,10 +231,51 @@ static void idxd_read_table_offsets(struct idxd_device *idxd)
>>   	idxd->msix_perm_offset = offsets.msix_perm * 0x100;
>>   	dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n",
>>   		idxd->msix_perm_offset);
>> +	idxd->ims_offset = offsets.ims * 0x100;
> 
> Magic constant pulled out of thin air. #define ....

Will fix

> 
>> +	dev_dbg(dev, "IDXD IMS Offset: %#x\n", idxd->ims_offset);
>>   	idxd->perfmon_offset = offsets.perfmon * 0x100;
>>   	dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset);
>>   }
>>   
>> +#define PCI_DEVSEC_CAP		0x23
>> +#define SIOVDVSEC1(offset)	((offset) + 0x4)
>> +#define SIOVDVSEC2(offset)	((offset) + 0x8)
>> +#define DVSECID			0x5
>> +#define SIOVCAP(offset)		((offset) + 0x14)
>> +
>> +static void idxd_check_siov(struct idxd_device *idxd)
>> +{
>> +	struct pci_dev *pdev = idxd->pdev;
>> +	struct device *dev = &pdev->dev;
>> +	int dvsec;
>> +	u16 val16;
>> +	u32 val32;
>> +
>> +	dvsec = pci_find_ext_capability(pdev, PCI_DEVSEC_CAP);
>> +	pci_read_config_word(pdev, SIOVDVSEC1(dvsec), &val16);
>> +	if (val16 != PCI_VENDOR_ID_INTEL) {
>> +		dev_dbg(&pdev->dev, "DVSEC vendor id is not Intel\n");
>> +		return;
>> +	}
>> +
>> +	pci_read_config_word(pdev, SIOVDVSEC2(dvsec), &val16);
>> +	if (val16 != DVSECID) {
>> +		dev_dbg(&pdev->dev, "DVSEC ID is not SIOV\n");
>> +		return;
>> +	}
>> +
>> +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
>> +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
>> +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
>> +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
>> +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
>> +		dev_dbg(&pdev->dev, "IMS supported for device\n");
>> +		return;
>> +	}
>> +
>> +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
> 
> It's really hard to find the code inside all of this dev_dbg()
> noise. But why is this capability check done in this driver? Is this
> capability stuff really IDXD specific or is the next device which
> supports this going to copy and pasta the above?

Will look into move this into a common detection function for all similar 
devices. This should be common for all Intel devices that support SIOV.

> 
>>   static void idxd_read_caps(struct idxd_device *idxd)
>>   {
>>   	struct device *dev = &idxd->pdev->dev;
>> @@ -253,6 +294,7 @@ static void idxd_read_caps(struct idxd_device *idxd)
>>   	dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes);
>>   	idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift;
>>   	dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size);
>> +	idxd_check_siov(idxd);
>>   	if (idxd->hw.gen_cap.config_en)
>>   		set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags);
>>   
>> @@ -347,9 +389,19 @@ static int idxd_probe(struct idxd_device *idxd)
>>   
>>   	idxd->major = idxd_cdev_get_major(idxd);
>>   
>> +	if (idxd->ims_size) {
>> +		rc = sbitmap_init_node(&idxd->ims_sbmap, idxd->ims_size, -1,
>> +				       GFP_KERNEL, dev_to_node(dev));
>> +		if (rc < 0)
>> +			goto sbitmap_fail;
>> +	}
> 
> Ah, here the bitmap is allocated, but it's still completely unclear what
> it is used for.

Need to remove.

> 
> The subject line is misleading as hell. This does not add support, it's
> doing some magic capability checks and allocates stuff which nobody
> knows what it is used for.

With the unneeded code removal and moving the SIOV detection code to common 
implementation, it should be more clear.

> 
> Thanks,
> 
>          tglx
> 
>
diff mbox series

Patch

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index c3976156db2f..06fed39ff17f 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -157,8 +157,8 @@  static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma)
 		return rc;
 
 	vma->vm_flags |= VM_DONTCOPY;
-	pfn = (base + idxd_get_wq_portal_full_offset(wq->id,
-				IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT;
+	pfn = (base + idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED,
+						     IDXD_IRQ_MSIX)) >> PAGE_SHIFT;
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	vma->vm_private_data = ctx;
 
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 2cd541c5faab..acc444ad9db6 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -146,6 +146,7 @@  enum idxd_device_state {
 enum idxd_device_flag {
 	IDXD_FLAG_CONFIGURABLE = 0,
 	IDXD_FLAG_CMD_RUNNING,
+	IDXD_FLAG_SIOV_SUPPORTED,
 };
 
 struct idxd_device {
@@ -170,6 +171,7 @@  struct idxd_device {
 
 	int num_groups;
 
+	u32 ims_offset;
 	u32 msix_perm_offset;
 	u32 wqcfg_offset;
 	u32 grpcfg_offset;
@@ -177,6 +179,7 @@  struct idxd_device {
 
 	u64 max_xfer_bytes;
 	u32 max_batch_size;
+	int ims_size;
 	int max_groups;
 	int max_engines;
 	int max_tokens;
@@ -196,6 +199,7 @@  struct idxd_device {
 	struct work_struct work;
 
 	int *int_handles;
+	struct sbitmap ims_sbmap;
 };
 
 /* IDXD software descriptor */
@@ -233,15 +237,17 @@  enum idxd_interrupt_type {
 	IDXD_IRQ_IMS,
 };
 
-static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot)
+static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot,
+					    enum idxd_interrupt_type irq_type)
 {
-	return prot * 0x1000;
+	return prot * 0x1000 + irq_type * 0x2000;
 }
 
 static inline int idxd_get_wq_portal_full_offset(int wq_id,
-						 enum idxd_portal_prot prot)
+						 enum idxd_portal_prot prot,
+						 enum idxd_interrupt_type irq_type)
 {
-	return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot);
+	return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot, irq_type);
 }
 
 static inline void idxd_set_type(struct idxd_device *idxd)
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index fb8b1001d35a..7d08f2b92de7 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -231,10 +231,51 @@  static void idxd_read_table_offsets(struct idxd_device *idxd)
 	idxd->msix_perm_offset = offsets.msix_perm * 0x100;
 	dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n",
 		idxd->msix_perm_offset);
+	idxd->ims_offset = offsets.ims * 0x100;
+	dev_dbg(dev, "IDXD IMS Offset: %#x\n", idxd->ims_offset);
 	idxd->perfmon_offset = offsets.perfmon * 0x100;
 	dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset);
 }
 
+#define PCI_DEVSEC_CAP		0x23
+#define SIOVDVSEC1(offset)	((offset) + 0x4)
+#define SIOVDVSEC2(offset)	((offset) + 0x8)
+#define DVSECID			0x5
+#define SIOVCAP(offset)		((offset) + 0x14)
+
+static void idxd_check_siov(struct idxd_device *idxd)
+{
+	struct pci_dev *pdev = idxd->pdev;
+	struct device *dev = &pdev->dev;
+	int dvsec;
+	u16 val16;
+	u32 val32;
+
+	dvsec = pci_find_ext_capability(pdev, PCI_DEVSEC_CAP);
+	pci_read_config_word(pdev, SIOVDVSEC1(dvsec), &val16);
+	if (val16 != PCI_VENDOR_ID_INTEL) {
+		dev_dbg(&pdev->dev, "DVSEC vendor id is not Intel\n");
+		return;
+	}
+
+	pci_read_config_word(pdev, SIOVDVSEC2(dvsec), &val16);
+	if (val16 != DVSECID) {
+		dev_dbg(&pdev->dev, "DVSEC ID is not SIOV\n");
+		return;
+	}
+
+	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
+	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
+		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
+		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
+		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
+		dev_dbg(&pdev->dev, "IMS supported for device\n");
+		return;
+	}
+
+	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
+}
+
 static void idxd_read_caps(struct idxd_device *idxd)
 {
 	struct device *dev = &idxd->pdev->dev;
@@ -253,6 +294,7 @@  static void idxd_read_caps(struct idxd_device *idxd)
 	dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes);
 	idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift;
 	dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size);
+	idxd_check_siov(idxd);
 	if (idxd->hw.gen_cap.config_en)
 		set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags);
 
@@ -347,9 +389,19 @@  static int idxd_probe(struct idxd_device *idxd)
 
 	idxd->major = idxd_cdev_get_major(idxd);
 
+	if (idxd->ims_size) {
+		rc = sbitmap_init_node(&idxd->ims_sbmap, idxd->ims_size, -1,
+				       GFP_KERNEL, dev_to_node(dev));
+		if (rc < 0)
+			goto sbitmap_fail;
+	}
 	dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id);
 	return 0;
 
+ sbitmap_fail:
+	mutex_lock(&idxd_idr_lock);
+	idr_remove(&idxd_idrs[idxd->type], idxd->id);
+	mutex_unlock(&idxd_idr_lock);
  err_idr_fail:
 	idxd_mask_error_interrupts(idxd);
 	idxd_mask_msix_vectors(idxd);
diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c
index 2b3c2f132af7..8dd72cf3c838 100644
--- a/drivers/dma/idxd/submit.c
+++ b/drivers/dma/idxd/submit.c
@@ -27,7 +27,13 @@  static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu)
 		desc->hw->int_handle = wq->vec_ptr;
 	} else {
 		desc->vec_ptr = wq->vec_ptr;
-		desc->hw->int_handle = idxd->int_handles[desc->vec_ptr];
+		/*
+		 * int_handles are only for descriptor completion. However for device
+		 * MSIX enumeration, vec 0 is used for misc interrupts. Therefore even
+		 * though we are rotating through 1...N for descriptor interrupts, we
+		 * need to acqurie the int_handles from 0..N-1.
+		 */
+		desc->hw->int_handle = idxd->int_handles[desc->vec_ptr - 1];
 	}
 
 	return desc;
@@ -87,7 +93,8 @@  int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
 	if (idxd->state != IDXD_DEV_ENABLED)
 		return -EIO;
 
-	portal = wq->dportal + idxd_get_wq_portal_offset(IDXD_PORTAL_UNLIMITED);
+	portal = wq->dportal + idxd_get_wq_portal_offset(IDXD_PORTAL_LIMITED, IDXD_IRQ_MSIX);
+
 	/*
 	 * The wmb() flushes writes to coherent DMA data before possibly
 	 * triggering a DMA read. The wmb() is necessary even on UP because
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 00d2347399bb..4914316ae493 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1237,6 +1237,16 @@  static ssize_t numa_node_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(numa_node);
 
+static ssize_t ims_size_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct idxd_device *idxd =
+		container_of(dev, struct idxd_device, conf_dev);
+
+	return sprintf(buf, "%u\n", idxd->ims_size);
+}
+static DEVICE_ATTR_RO(ims_size);
+
 static ssize_t max_batch_size_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
@@ -1422,6 +1432,7 @@  static struct attribute *idxd_device_attributes[] = {
 	&dev_attr_max_work_queues_size.attr,
 	&dev_attr_max_engines.attr,
 	&dev_attr_numa_node.attr,
+	&dev_attr_ims_size.attr,
 	&dev_attr_max_batch_size.attr,
 	&dev_attr_max_transfer_size.attr,
 	&dev_attr_op_cap.attr,