
PCI: dwc: Chain the set IRQ affinity request back to the parent

Message ID: 20250303070501.2740392-1-danielsftsai@google.com (mailing list archive)
State: Changes Requested
Delegated to: Krzysztof Wilczyński
Series: PCI: dwc: Chain the set IRQ affinity request back to the parent

Commit Message

Tsai Sung-Fu March 3, 2025, 7:05 a.m. UTC
From: Tsai Sung-Fu <danielsftsai@google.com>

In our use case, we want each MSI controller to have an assigned CPU
affinity for throughput/performance reasons.

Therefore, we chain the set-affinity request passed down from the
endpoint device back to the parent by implementing
dw_pci_msi_set_affinity().

We are aware of the concerns about breaking the userspace ABI that were
raised in past discussions, so the implementation tries to address those
ABI questions: the algorithm rejects affinity requests that are
incompatible with the other MSI vectors sharing the same parent IRQ
line, and it updates their effective affinities on a successful request
so that the correct affinity setting is reflected in the sysfs
interface.

Here is the flow:

1. Map the current MSI vector to the MSI controller it belongs to, so we
   know its parent IRQ number, stored in pp->msi_irq[].
2. Check whether this is a valid request by calling
   dw_pci_check_mask_compatibility(): do cpumask_and() with all the MSI
   vectors bound to this MSI controller, and as long as the result is not
   empty, treat the request as valid.
3. Call the irq_set_affinity() callback of the parent to update the CPU
   affinity if the request is valid.
4. Update the effective_mask of all the MSI vectors bound to this MSI
   controller by calling dw_pci_update_effective_affinity(), so the
   correct value is reflected to user space.

Give the DesignWare MSI IRQs their own IRQ lock class to avoid a
recursive-lock warning when running a kernel built with the debug
configuration enabled.

Signed-off-by: Tsai Sung-Fu <danielsftsai@google.com>
---
 .../pci/controller/dwc/pcie-designware-host.c | 131 +++++++++++++++++-
 1 file changed, 128 insertions(+), 3 deletions(-)

Comments

Thomas Gleixner March 3, 2025, 9:10 a.m. UTC | #1
On Mon, Mar 03 2025 at 07:05, Daniel Tsai wrote:
> +/*
> + * The algo here honor if there is any intersection of mask of
> + * the existing MSI vectors and the requesting MSI vector. So we
> + * could handle both narrow (1 bit set mask) and wide (0xffff...)
> + * cases, return -EINVAL and reject the request if the result of
> + * cpumask is empty, otherwise return 0 and have the calculated
> + * result on the mask_to_check to pass down to the irq_chip.
> + */
> +static int dw_pci_check_mask_compatibility(struct dw_pcie_rp *pp,
> +					   unsigned long ctrl,
> +					   unsigned long hwirq_to_check,
> +					   struct cpumask *mask_to_check)
> +{
> +	unsigned long end, hwirq;
> +	const struct cpumask *mask;
> +	unsigned int virq;
> +
> +	hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
> +	end = hwirq + MAX_MSI_IRQS_PER_CTRL;
> +	for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
> +		if (hwirq == hwirq_to_check)
> +			continue;
> +		virq = irq_find_mapping(pp->irq_domain, hwirq);
> +		if (!virq)
> +			continue;
> +		mask = irq_get_affinity_mask(virq);

What protects @mask against a concurrent modification?

> +		if (!cpumask_and(mask_to_check, mask, mask_to_check))
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static void dw_pci_update_effective_affinity(struct dw_pcie_rp *pp,
> +					     unsigned long ctrl,
> +					     const struct cpumask *effective_mask,
> +					     unsigned long hwirq_to_check)
> +{
> +	struct irq_desc *desc_downstream;
> +	unsigned int virq_downstream;
> +	unsigned long end, hwirq;
> +
> +	/*
> +	 * Update all the irq_data's effective mask
> +	 * bind to this MSI controller, so the correct
> +	 * affinity would reflect on
> +	 * /proc/irq/XXX/effective_affinity
> +	 */
> +	hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
> +	end = hwirq + MAX_MSI_IRQS_PER_CTRL;
> +	for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
> +		virq_downstream = irq_find_mapping(pp->irq_domain, hwirq);
> +		if (!virq_downstream)
> +			continue;
> +		desc_downstream = irq_to_desc(virq_downstream);
> +		irq_data_update_effective_affinity(&desc_downstream->irq_data,
> +						   effective_mask);

Same here.

> +	}
> +}
> +
> +static int dw_pci_msi_set_affinity(struct irq_data *d,
> +				   const struct cpumask *mask, bool force)
> +{
> +	struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
> +	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> +	int ret;
> +	int virq_parent;
> +	unsigned long hwirq = d->hwirq;
> +	unsigned long flags, ctrl;
> +	struct irq_desc *desc_parent;
> +	const struct cpumask *effective_mask;
> +	cpumask_var_t mask_result;
> +
> +	ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL;
> +	if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC))
> +		return -ENOMEM;

This does not work on a RT enabled kernel. Allocations with a raw spin
lock held are not possible.

> +	/*
> +	 * Loop through all possible MSI vector to check if the
> +	 * requested one is compatible with all of them
> +	 */
> +	raw_spin_lock_irqsave(&pp->lock, flags);
> +	cpumask_copy(mask_result, mask);
> +	ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result);
> +	if (ret) {
> +		dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n",
> +			cpumask_pr_args(mask), d->irq);
> +		goto unlock;
> +	}
> +
> +	dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n",
> +		cpumask_pr_args(mask_result), d->irq);
> +
> +	virq_parent = pp->msi_irq[ctrl];
> +	desc_parent = irq_to_desc(virq_parent);
> +	ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data,
> +							   mask_result, force);

Again. Completely unserialized.

Thanks,

        tglx
kernel test robot March 3, 2025, 9:25 a.m. UTC | #2
Hi Daniel,

kernel test robot noticed the following build errors:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus mani-mhi/mhi-next linus/master v6.14-rc5 next-20250228]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Daniel-Tsai/PCI-dwc-Chain-the-set-IRQ-affinity-request-back-to-the-parent/20250303-150704
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20250303070501.2740392-1-danielsftsai%40google.com
patch subject: [PATCH] PCI: dwc: Chain the set IRQ affinity request back to the parent
config: x86_64-buildonly-randconfig-006-20250303 (https://download.01.org/0day-ci/archive/20250303/202503031726.KLKoMMZk-lkp@intel.com/config)
compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250303/202503031726.KLKoMMZk-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503031726.KLKoMMZk-lkp@intel.com/

All errors (new ones prefixed by >>):

>> drivers/pci/controller/dwc/pcie-designware-host.c:223:45: error: no member named 'affinity' in 'struct irq_common_data'
     223 |                 cpumask_copy(desc_parent->irq_common_data.affinity, mask);
         |                              ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^
   drivers/pci/controller/dwc/pcie-designware-host.c:507:30: warning: shift count >= width of type [-Wshift-count-overflow]
     507 |                 dma_set_coherent_mask(dev, DMA_BIT_MASK(64));
         |                                            ^~~~~~~~~~~~~~~~
   include/linux/dma-mapping.h:73:54: note: expanded from macro 'DMA_BIT_MASK'
      73 | #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
         |                                                      ^ ~~~
   1 warning and 1 error generated.


vim +223 drivers/pci/controller/dwc/pcie-designware-host.c

   178	
   179	static int dw_pci_msi_set_affinity(struct irq_data *d,
   180					   const struct cpumask *mask, bool force)
   181	{
   182		struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
   183		struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
   184		int ret;
   185		int virq_parent;
   186		unsigned long hwirq = d->hwirq;
   187		unsigned long flags, ctrl;
   188		struct irq_desc *desc_parent;
   189		const struct cpumask *effective_mask;
   190		cpumask_var_t mask_result;
   191	
   192		ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL;
   193		if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC))
   194			return -ENOMEM;
   195	
   196		/*
   197		 * Loop through all possible MSI vector to check if the
   198		 * requested one is compatible with all of them
   199		 */
   200		raw_spin_lock_irqsave(&pp->lock, flags);
   201		cpumask_copy(mask_result, mask);
   202		ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result);
   203		if (ret) {
   204			dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n",
   205				cpumask_pr_args(mask), d->irq);
   206			goto unlock;
   207		}
   208	
   209		dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n",
   210			cpumask_pr_args(mask_result), d->irq);
   211	
   212		virq_parent = pp->msi_irq[ctrl];
   213		desc_parent = irq_to_desc(virq_parent);
   214		ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data,
   215								   mask_result, force);
   216	
   217		if (ret < 0)
   218			goto unlock;
   219	
   220		switch (ret) {
   221		case IRQ_SET_MASK_OK:
   222		case IRQ_SET_MASK_OK_DONE:
 > 223			cpumask_copy(desc_parent->irq_common_data.affinity, mask);
   224			fallthrough;
   225		case IRQ_SET_MASK_OK_NOCOPY:
   226			break;
   227		}
   228	
   229		effective_mask = irq_data_get_effective_affinity_mask(&desc_parent->irq_data);
   230		dw_pci_update_effective_affinity(pp, ctrl, effective_mask, hwirq);
   231	
   232	unlock:
   233		free_cpumask_var(mask_result);
   234		raw_spin_unlock_irqrestore(&pp->lock, flags);
   235		return ret < 0 ? ret : IRQ_SET_MASK_OK_NOCOPY;
   236	}
   237
kernel test robot March 3, 2025, 10:13 a.m. UTC | #3
Hi Daniel,

kernel test robot noticed the following build errors:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus mani-mhi/mhi-next linus/master v6.14-rc5 next-20250228]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Daniel-Tsai/PCI-dwc-Chain-the-set-IRQ-affinity-request-back-to-the-parent/20250303-150704
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20250303070501.2740392-1-danielsftsai%40google.com
patch subject: [PATCH] PCI: dwc: Chain the set IRQ affinity request back to the parent
config: sparc64-randconfig-002-20250303 (https://download.01.org/0day-ci/archive/20250303/202503031759.oiGkE9fl-lkp@intel.com/config)
compiler: sparc64-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250303/202503031759.oiGkE9fl-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503031759.oiGkE9fl-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/pci/controller/dwc/pcie-designware-host.c: In function 'dw_pci_msi_set_affinity':
>> drivers/pci/controller/dwc/pcie-designware-host.c:223:58: error: 'struct irq_common_data' has no member named 'affinity'
     223 |                 cpumask_copy(desc_parent->irq_common_data.affinity, mask);
         |                                                          ^


vim +223 drivers/pci/controller/dwc/pcie-designware-host.c

   178	
   179	static int dw_pci_msi_set_affinity(struct irq_data *d,
   180					   const struct cpumask *mask, bool force)
   181	{
   182		struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
   183		struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
   184		int ret;
   185		int virq_parent;
   186		unsigned long hwirq = d->hwirq;
   187		unsigned long flags, ctrl;
   188		struct irq_desc *desc_parent;
   189		const struct cpumask *effective_mask;
   190		cpumask_var_t mask_result;
   191	
   192		ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL;
   193		if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC))
   194			return -ENOMEM;
   195	
   196		/*
   197		 * Loop through all possible MSI vector to check if the
   198		 * requested one is compatible with all of them
   199		 */
   200		raw_spin_lock_irqsave(&pp->lock, flags);
   201		cpumask_copy(mask_result, mask);
   202		ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result);
   203		if (ret) {
   204			dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n",
   205				cpumask_pr_args(mask), d->irq);
   206			goto unlock;
   207		}
   208	
   209		dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n",
   210			cpumask_pr_args(mask_result), d->irq);
   211	
   212		virq_parent = pp->msi_irq[ctrl];
   213		desc_parent = irq_to_desc(virq_parent);
   214		ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data,
   215								   mask_result, force);
   216	
   217		if (ret < 0)
   218			goto unlock;
   219	
   220		switch (ret) {
   221		case IRQ_SET_MASK_OK:
   222		case IRQ_SET_MASK_OK_DONE:
 > 223			cpumask_copy(desc_parent->irq_common_data.affinity, mask);
   224			fallthrough;
   225		case IRQ_SET_MASK_OK_NOCOPY:
   226			break;
   227		}
   228	
   229		effective_mask = irq_data_get_effective_affinity_mask(&desc_parent->irq_data);
   230		dw_pci_update_effective_affinity(pp, ctrl, effective_mask, hwirq);
   231	
   232	unlock:
   233		free_cpumask_var(mask_result);
   234		raw_spin_unlock_irqrestore(&pp->lock, flags);
   235		return ret < 0 ? ret : IRQ_SET_MASK_OK_NOCOPY;
   236	}
   237
Tsai Sung-Fu March 4, 2025, 5:48 a.m. UTC | #4
On Mon, Mar 3, 2025 at 5:10 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Mon, Mar 03 2025 at 07:05, Daniel Tsai wrote:
> > +/*
> > + * The algo here honor if there is any intersection of mask of
> > + * the existing MSI vectors and the requesting MSI vector. So we
> > + * could handle both narrow (1 bit set mask) and wide (0xffff...)
> > + * cases, return -EINVAL and reject the request if the result of
> > + * cpumask is empty, otherwise return 0 and have the calculated
> > + * result on the mask_to_check to pass down to the irq_chip.
> > + */
> > +static int dw_pci_check_mask_compatibility(struct dw_pcie_rp *pp,
> > +                                        unsigned long ctrl,
> > +                                        unsigned long hwirq_to_check,
> > +                                        struct cpumask *mask_to_check)
> > +{
> > +     unsigned long end, hwirq;
> > +     const struct cpumask *mask;
> > +     unsigned int virq;
> > +
> > +     hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
> > +     end = hwirq + MAX_MSI_IRQS_PER_CTRL;
> > +     for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
> > +             if (hwirq == hwirq_to_check)
> > +                     continue;
> > +             virq = irq_find_mapping(pp->irq_domain, hwirq);
> > +             if (!virq)
> > +                     continue;
> > +             mask = irq_get_affinity_mask(virq);
>
> What protects @mask against a concurrent modification?

We hold the &pp->lock in the dw_pci_msi_set_affinity before calling
this function
so that should help to protect against concurrent modification ?

>
> > +             if (!cpumask_and(mask_to_check, mask, mask_to_check))
> > +                     return -EINVAL;
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +static void dw_pci_update_effective_affinity(struct dw_pcie_rp *pp,
> > +                                          unsigned long ctrl,
> > +                                          const struct cpumask *effective_mask,
> > +                                          unsigned long hwirq_to_check)
> > +{
> > +     struct irq_desc *desc_downstream;
> > +     unsigned int virq_downstream;
> > +     unsigned long end, hwirq;
> > +
> > +     /*
> > +      * Update all the irq_data's effective mask
> > +      * bind to this MSI controller, so the correct
> > +      * affinity would reflect on
> > +      * /proc/irq/XXX/effective_affinity
> > +      */
> > +     hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
> > +     end = hwirq + MAX_MSI_IRQS_PER_CTRL;
> > +     for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
> > +             virq_downstream = irq_find_mapping(pp->irq_domain, hwirq);
> > +             if (!virq_downstream)
> > +                     continue;
> > +             desc_downstream = irq_to_desc(virq_downstream);
> > +             irq_data_update_effective_affinity(&desc_downstream->irq_data,
> > +                                                effective_mask);
>
> Same here.

We hold the &pp->lock in the dw_pci_msi_set_affinity before calling
here, so that could help I think ?

>
> > +     }
> > +}
> > +
> > +static int dw_pci_msi_set_affinity(struct irq_data *d,
> > +                                const struct cpumask *mask, bool force)
> > +{
> > +     struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
> > +     struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> > +     int ret;
> > +     int virq_parent;
> > +     unsigned long hwirq = d->hwirq;
> > +     unsigned long flags, ctrl;
> > +     struct irq_desc *desc_parent;
> > +     const struct cpumask *effective_mask;
> > +     cpumask_var_t mask_result;
> > +
> > +     ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL;
> > +     if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC))
> > +             return -ENOMEM;
>
> This does not work on a RT enabled kernel. Allocations with a raw spin
> lock held are not possible.
>

Even with the GFP_ATOMIC flag? I thought that meant it would be safe
to allocate in atomic context?
I haven't worked with an RT-enabled kernel before, so please enlighten
me if I am wrong, if you don't mind.

> > +     /*
> > +      * Loop through all possible MSI vector to check if the
> > +      * requested one is compatible with all of them
> > +      */
> > +     raw_spin_lock_irqsave(&pp->lock, flags);
> > +     cpumask_copy(mask_result, mask);
> > +     ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result);
> > +     if (ret) {
> > +             dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n",
> > +                     cpumask_pr_args(mask), d->irq);
> > +             goto unlock;
> > +     }
> > +
> > +     dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n",
> > +             cpumask_pr_args(mask_result), d->irq);
> > +
> > +     virq_parent = pp->msi_irq[ctrl];
> > +     desc_parent = irq_to_desc(virq_parent);
> > +     ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data,
> > +                                                        mask_result, force);
>
> Again. Completely unserialized.
>

The reason why we remove the desc_parent->lock protection is because
the chained IRQ structure didn't export parent IRQ to the user space, so we
think this call path should be the only caller. And since we have &pp->lock hold
at the beginning, so that should help to protect against concurrent
modification here ?


> Thanks,
>
>         tglx

Sorry for the test build error, I will fix it and re-send a new patch.

Thanks
Thomas Gleixner March 4, 2025, 9:46 a.m. UTC | #5
On Tue, Mar 04 2025 at 13:48, Tsai Sung-Fu wrote:
> On Mon, Mar 3, 2025 at 5:10 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>> > +     hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
>> > +     end = hwirq + MAX_MSI_IRQS_PER_CTRL;
>> > +     for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
>> > +             if (hwirq == hwirq_to_check)
>> > +                     continue;
>> > +             virq = irq_find_mapping(pp->irq_domain, hwirq);
>> > +             if (!virq)
>> > +                     continue;
>> > +             mask = irq_get_affinity_mask(virq);
>>
>> What protects @mask against a concurrent modification?
>
> We hold the &pp->lock in the dw_pci_msi_set_affinity before calling
> this function
> so that should help to protect against concurrent modification ?

So that's the same domain, right?

>> > +     /*
>> > +      * Update all the irq_data's effective mask
>> > +      * bind to this MSI controller, so the correct
>> > +      * affinity would reflect on
>> > +      * /proc/irq/XXX/effective_affinity
>> > +      */
>> > +     hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
>> > +     end = hwirq + MAX_MSI_IRQS_PER_CTRL;
>> > +     for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
>> > +             virq_downstream = irq_find_mapping(pp->irq_domain, hwirq);
>> > +             if (!virq_downstream)
>> > +                     continue;
>> > +             desc_downstream = irq_to_desc(virq_downstream);
>> > +             irq_data_update_effective_affinity(&desc_downstream->irq_data,
>> > +                                                effective_mask);
>>
>> Same here.
>
> We hold the &pp->lock in the dw_pci_msi_set_affinity before calling
> here, so that could help I think ?

A comment would be helpful for the casual reader.

>>
>> > +     }
>> > +}
>> > +
>> > +static int dw_pci_msi_set_affinity(struct irq_data *d,
>> > +                                const struct cpumask *mask, bool force)
>> > +{
>> > +     struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
>> > +     struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
>> > +     int ret;
>> > +     int virq_parent;
>> > +     unsigned long hwirq = d->hwirq;
>> > +     unsigned long flags, ctrl;
>> > +     struct irq_desc *desc_parent;
>> > +     const struct cpumask *effective_mask;
>> > +     cpumask_var_t mask_result;
>> > +
>> > +     ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL;
>> > +     if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC))
>> > +             return -ENOMEM;
>>
>> This does not work on a RT enabled kernel. Allocations with a raw spin
>> lock held are not possible.
>>
>
> Even with the GFP_ATOMIC flag? I thought that meant it would be safe
> to allocate in atomic context?
> I haven't worked with an RT-enabled kernel before, so please enlighten
> me if I am wrong, if you don't mind.

https://www.kernel.org/doc/html/latest/locking/locktypes.html#preempt-rt-caveats
https://www.kernel.org/doc/html/latest/locking/locktypes.html#raw-spinlock-t-on-rt

>> > +     /*
>> > +      * Loop through all possible MSI vector to check if the
>> > +      * requested one is compatible with all of them
>> > +      */
>> > +     raw_spin_lock_irqsave(&pp->lock, flags);
>> > +     cpumask_copy(mask_result, mask);
>> > +     ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result);
>> > +     if (ret) {
>> > +             dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n",
>> > +                     cpumask_pr_args(mask), d->irq);
>> > +             goto unlock;
>> > +     }
>> > +
>> > +     dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n",
>> > +             cpumask_pr_args(mask_result), d->irq);
>> > +
>> > +     virq_parent = pp->msi_irq[ctrl];
>> > +     desc_parent = irq_to_desc(virq_parent);
>> > +     ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data,
>> > +                                                        mask_result, force);
>>
>> Again. Completely unserialized.
>>
>
> The reason why we remove the desc_parent->lock protection is because
> the chained IRQ structure didn't export parent IRQ to the user space, so we
> think this call path should be the only caller. And since we have &pp->lock hold
> at the beginning, so that should help to protect against concurrent
> modification here ?

"Should be the only caller" is not really a technical argument. If you
make assumptions like this, then you have to come up with a proper
explanation why this is correct under all circumstances and with all
possible variants of parent interrupt chips.

Aside of that fiddling with the internals of interrupt descriptors is
not really justified here. What's wrong with using the existing
irq_set_affinity() mechanism?

Thanks,

        tglx
Tsai Sung-Fu March 5, 2025, 11:21 a.m. UTC | #6
On Tue, Mar 4, 2025 at 5:46 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Tue, Mar 04 2025 at 13:48, Tsai Sung-Fu wrote:
> > On Mon, Mar 3, 2025 at 5:10 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >> > +     hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
> >> > +     end = hwirq + MAX_MSI_IRQS_PER_CTRL;
> >> > +     for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
> >> > +             if (hwirq == hwirq_to_check)
> >> > +                     continue;
> >> > +             virq = irq_find_mapping(pp->irq_domain, hwirq);
> >> > +             if (!virq)
> >> > +                     continue;
> >> > +             mask = irq_get_affinity_mask(virq);
> >>
> >> What protects @mask against a concurrent modification?
> >
> > We hold the &pp->lock in the dw_pci_msi_set_affinity before calling
> > this function
> > so that should help to protect against concurrent modification ?
>
> So that's the same domain, right?
>

Yes, they are all from pp->irq_domain, which the driver allocates by
calling irq_domain_create_linear().

I will add the lockdep_assert_held() check.
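
A minimal sketch of what that check could look like (an assumption about
the next revision, not posted code; lockdep_assert_held() works on a
raw_spinlock_t as well):

static int dw_pci_check_mask_compatibility(struct dw_pcie_rp *pp,
					   unsigned long ctrl,
					   unsigned long hwirq_to_check,
					   struct cpumask *mask_to_check)
{
	/* Caller (dw_pci_msi_set_affinity()) must hold pp->lock */
	lockdep_assert_held(&pp->lock);

	/* ... existing loop over pp->msi_irq_in_use stays unchanged ... */
	return 0;
}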

> >> > +     /*
> >> > +      * Update all the irq_data's effective mask
> >> > +      * bind to this MSI controller, so the correct
> >> > +      * affinity would reflect on
> >> > +      * /proc/irq/XXX/effective_affinity
> >> > +      */
> >> > +     hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
> >> > +     end = hwirq + MAX_MSI_IRQS_PER_CTRL;
> >> > +     for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
> >> > +             virq_downstream = irq_find_mapping(pp->irq_domain, hwirq);
> >> > +             if (!virq_downstream)
> >> > +                     continue;
> >> > +             desc_downstream = irq_to_desc(virq_downstream);
> >> > +             irq_data_update_effective_affinity(&desc_downstream->irq_data,
> >> > +                                                effective_mask);
> >>
> >> Same here.
> >
> > We hold the &pp->lock in the dw_pci_msi_set_affinity before calling
> > here, so that could help I think ?
>
> A comment would be helpful for the casual reader.

Sure, will also add the lockdep_assert_held() check

>
> >>
> >> > +     }
> >> > +}
> >> > +
> >> > +static int dw_pci_msi_set_affinity(struct irq_data *d,
> >> > +                                const struct cpumask *mask, bool force)
> >> > +{
> >> > +     struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
> >> > +     struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> >> > +     int ret;
> >> > +     int virq_parent;
> >> > +     unsigned long hwirq = d->hwirq;
> >> > +     unsigned long flags, ctrl;
> >> > +     struct irq_desc *desc_parent;
> >> > +     const struct cpumask *effective_mask;
> >> > +     cpumask_var_t mask_result;
> >> > +
> >> > +     ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL;
> >> > +     if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC))
> >> > +             return -ENOMEM;
> >>
> >> This does not work on a RT enabled kernel. Allocations with a raw spin
> >> lock held are not possible.
> >>
> >
> > Even with the GFP_ATOMIC flag? I thought that meant it would be safe
> > to allocate in atomic context?
> > I haven't worked with an RT-enabled kernel before, so please enlighten
> > me if I am wrong, if you don't mind.
>
> https://www.kernel.org/doc/html/latest/locking/locktypes.html#preempt-rt-caveats
> https://www.kernel.org/doc/html/latest/locking/locktypes.html#raw-spinlock-t-on-rt
>

I will remove the allocation and move the cpumask into the device structure.
Thanks for the docs.
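
One possible shape, as a sketch only (the member name below is made up,
not from any posted patch): keep a per-port scratch mask so nothing is
allocated while the raw spinlock pp->lock is held.

/* in struct dw_pcie_rp (pcie-designware.h) */
	struct cpumask		msi_affinity_scratch;	/* protected by pp->lock */

/* in dw_pci_msi_set_affinity(), replacing alloc_cpumask_var()/free_cpumask_var() */
	raw_spin_lock_irqsave(&pp->lock, flags);
	cpumask_copy(&pp->msi_affinity_scratch, mask);
	ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq,
					      &pp->msi_affinity_scratch);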

> >> > +     /*
> >> > +      * Loop through all possible MSI vector to check if the
> >> > +      * requested one is compatible with all of them
> >> > +      */
> >> > +     raw_spin_lock_irqsave(&pp->lock, flags);
> >> > +     cpumask_copy(mask_result, mask);
> >> > +     ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result);
> >> > +     if (ret) {
> >> > +             dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n",
> >> > +                     cpumask_pr_args(mask), d->irq);
> >> > +             goto unlock;
> >> > +     }
> >> > +
> >> > +     dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n",
> >> > +             cpumask_pr_args(mask_result), d->irq);
> >> > +
> >> > +     virq_parent = pp->msi_irq[ctrl];
> >> > +     desc_parent = irq_to_desc(virq_parent);
> >> > +     ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data,
> >> > +                                                        mask_result, force);
> >>
> >> Again. Completely unserialized.
> >>
> >
> > The reason why we remove the desc_parent->lock protection is because
> > the chained IRQ structure didn't export parent IRQ to the user space, so we
> > think this call path should be the only caller. And since we have &pp->lock hold
> > at the beginning, so that should help to protect against concurrent
> > modification here ?
>
> "Should be the only caller" is not really a technical argument. If you
> make assumptions like this, then you have to come up with a proper
> explanation why this is correct under all circumstances and with all
> possible variants of parent interrupt chips.
>
> Aside of that fiddling with the internals of interrupt descriptors is
> not really justified here. What's wrong with using the existing
> irq_set_affinity() mechanism?
>
> Thanks,
>
>         tglx
>

Using irq_set_affinity() would give us some circular deadlock warning
at the probe stage.
irq_startup() flow would hold the global lock from irq_setup_affinity(), so
deadlock scenario like this below might happen. The irq_set_affinity()
would try to acquire &irq_desc_lock_class
while the dw_pci_msi_set_affinity() already hold the &pp->lock. To
resolve that we would like to reuse the idea to
replace the global lock in irq_set_affinity() with PERCPU structure
from this patch ->
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=64b6d1d7a84538de34c22a6fc92a7dcc2b196b64
Just would like to know if this looks feasible to you ?

  Chain exists of:
  &irq_desc_lock_class --> mask_lock --> &pp->lock
  Possible unsafe locking scenario:
        CPU0                    CPU1
        ----                    ----
   lock(&pp->lock);
                                lock(mask_lock);
                                lock(&pp->lock);
   lock(&irq_desc_lock_class);

  *** DEADLOCK ***

Thanks
Thomas Gleixner March 6, 2025, 7:44 a.m. UTC | #7
On Wed, Mar 05 2025 at 19:21, Tsai Sung-Fu wrote:
> On Tue, Mar 4, 2025 at 5:46 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>> >> > +     ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data,
>> >> > +                                                        mask_result, force);
>> >>
>> >> Again. Completely unserialized.
>> >>
>> >
>> > The reason why we remove the desc_parent->lock protection is because
>> > the chained IRQ structure didn't export parent IRQ to the user space, so we
>> > think this call path should be the only caller. And since we have &pp->lock hold
>> > at the beginning, so that should help to protect against concurrent
>> > modification here ?
>>
>> "Should be the only caller" is not really a technical argument. If you
>> make assumptions like this, then you have to come up with a proper
>> explanation why this is correct under all circumstances and with all
>> possible variants of parent interrupt chips.
>>
>> Aside of that fiddling with the internals of interrupt descriptors is
>> not really justified here. What's wrong with using the existing
>> irq_set_affinity() mechanism?
>>
> Using irq_set_affinity() would give us some circular deadlock warning
> at the probe stage.
> irq_startup() flow would hold the global lock from irq_setup_affinity(), so
> deadlock scenario like this below might happen. The irq_set_affinity()
> would try to acquire &irq_desc_lock_class
> while the dw_pci_msi_set_affinity() already hold the &pp->lock. To
> resolve that we would like to reuse the idea to
> replace the global lock in irq_set_affinity() with PERCPU structure
> from this patch ->
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=64b6d1d7a84538de34c22a6fc92a7dcc2b196b64
> Just would like to know if this looks feasible to you ?

It won't help. See below.

>   Chain exists of:
>   &irq_desc_lock_class --> mask_lock --> &pp->lock
>   Possible unsafe locking scenario:
>         CPU0                            CPU1
>         ----                                   ----
>    lock(&pp->lock);
>                                                lock(mask_lock);
>                                                lock(&pp->lock);
>    lock(&irq_desc_lock_class);

Indeed.

Back to irq_set_affinity(). I did not think about the more problematic
issue with calling irq_set_affinity(): the per-CPU mask is then used
recursively, which is a bad idea.

But looking deeper, your implementation breaks some other things, like
e.g. the affinity coupling of interrupt threads on the MSI side. They
have to be coupled to the effective affinity of the underlying hard
interrupt. Solvable problem, but not in the driver.

Aside of the fact that this hardware design is a trainwreck, which
defeats the intended flexibility of MSI, solving this at the chip driver
level ends up in an incomplete and hacky solution. Also as this is not
the only IP block, which was cobbled together mindlessly that way, we
really don't want to proliferate this horrorshow all over the place.

This really wants a generic infrastructure which handles all aspects of
this correctly once and for all of these hardware trainwrecks. That
needs some thoughts and effort, but that's definitely preferred over a
half baked driver specific hackery, which fiddles in the guts of the
interrupt descriptors as it sees fit. From a layering POV, an interrupt
chip driver should not even care about the fact that an interrupt
descriptor exists. We've spent a lot of effort to decouple these things
and just because C allows it, does not mean it's a correct or a good
idea to ignore the layering.

The approach you have taken has a massive downside:

    It forces all (32) MSI interrupts, which are demultiplexed from the
    underlying parent interrupt, to have the same affinity and the user
    space interface in /proc/irq/*/affinity becomes more than
    confusing.

    This is not what user space expects from an interface which controls
    the affinity of an individual interrupt. No user and no tool expects
    that it has to deal with conflicting settings for MSI interrupts,
    which are independent of each other.

    Tools like irqbalanced will be pretty unhappy about the behaviour
    and you have to teach them about the severe limitations of this.

I'm absolutely not convinced that this solves more problems than it
creates.

In fact, you can achieve the very same outcome with a clean and
straightforward ten-line change:

   Instead of the hidden chained handlers, request regular interrupts
   for demultiplexing and give them proper names 'DWC_MSI_DEMUX-0-31',
   'DWC_MSI_DEMUX-32-63'...

   That provides exactly the same mechanism w/o all the hackery and
   creates the same (solvable) problem vs. interrupt thread affinity,
   but provides an understandable user interface.

No?
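
A rough sketch of that change, assuming the existing dw_handle_msi_irq()
helper and the pp->msi_irq[] array (the function names below are
illustrative only, not from a posted patch):

static irqreturn_t dw_pcie_msi_demux(int irq, void *data)
{
	struct dw_pcie_rp *pp = data;

	/* Same demultiplex work the hidden chained handler does today */
	return dw_handle_msi_irq(pp);
}

static int dw_pcie_request_msi_demux_irqs(struct dw_pcie_rp *pp)
{
	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
	u32 ctrl, num_ctrls = pp->num_vectors / MAX_MSI_IRQS_PER_CTRL;
	char *name;
	int ret;

	for (ctrl = 0; ctrl < num_ctrls; ctrl++) {
		if (pp->msi_irq[ctrl] <= 0)
			continue;

		name = devm_kasprintf(pci->dev, GFP_KERNEL, "DWC_MSI_DEMUX-%u-%u",
				      ctrl * MAX_MSI_IRQS_PER_CTRL,
				      (ctrl + 1) * MAX_MSI_IRQS_PER_CTRL - 1);
		if (!name)
			return -ENOMEM;

		/* Visible in /proc/irq/ and affinity-settable, unlike a chained handler */
		ret = devm_request_irq(pci->dev, pp->msi_irq[ctrl], dw_pcie_msi_demux,
				       IRQF_NO_THREAD, name, pp);
		if (ret)
			return ret;
	}

	return 0;
}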


If you really want to provide a useful and flexible mechanism to steer
the affinity of the individual MSI interrupts, which are handled by each
control block, then you have to think about a completely different
approach.

The underlying interrupt per control block will always be affine to a
particular CPU. But it's not written in stone, that the demultiplexing
interrupt actually has to handle the pending interrupts in the context
of the demultiplexer, at least not for MSI interrupts, which are edge
type and from the PCI device side considered 'fire and forget'.

So right now the demultiplex handler does:

   bits = read_dbi(...);
   for_each_bit(bit, bits)
        generic_handle_domain_irq(domain, base + bit);

generic_handle_domain_irq() gets the interrupt mapping for the hardware
interrupt number and invokes the relevant flow handler directly from the
same context, which means that the affinity of all those interrupts is
bound to the affinity of the underlying demultiplex interrupt.

Ignoring the devil in the details, this can also be handled by
delegating the handling to a different CPU:

   bits = read_dbi(...);
   for_each_bit(bit, bits)
        generic_handle_demux_domain_irq(domain, base + bit);

where generic_handle_demux_domain_irq() does:

      desc = irq_resolve_mapping(domain, hwirq);

      if (desc->target_cpu == smp_processor_id()) {
      	   handle_irq_desc(desc);
      } else {
          // Issue DWC ack() here? - See below
          irq_work_queue_on(&desc->irq_work, desc->target_cpu);
      }

The generic irq_work handler does:

      desc = container_of(work, struct irq_desc, irq_work);
      handle_irq_desc(desc);

That's obviously asynchronous, but in case of edge type MSI interrupts,
that's not a problem at all.
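
Pulled together, and purely as a sketch of the proposed (not mainline)
infrastructure (the target_cpu and irq_work members on struct irq_desc
are assumptions taken from this mail), the demux side might look like:

static void irq_redirect_work(struct irq_work *work)
{
	struct irq_desc *desc = container_of(work, struct irq_desc, irq_work);

	handle_irq_desc(desc);
}

static void generic_handle_demux_domain_irq(struct irq_domain *domain,
					    irq_hw_number_t hwirq)
{
	struct irq_desc *desc = irq_resolve_mapping(domain, hwirq);

	if (!desc)
		return;

	if (READ_ONCE(desc->target_cpu) == smp_processor_id()) {
		handle_irq_desc(desc);
	} else {
		/* Possibly issue the DWC ack() here, see #1 below */
		irq_work_queue_on(&desc->irq_work, READ_ONCE(desc->target_cpu));
	}
}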

The real problems are:

    1) Handling the pending bit of the interrupt in the DBI, i.e. what
       dw_pci_bottom_ack() does.

    2) Life time of the interrupt descriptor

    3) Slightly increased latency of the rerouted interrupts

    4) Affinity of the demultiplex interrupt

#1 Needs some thoughts. From the top of my head I think it would require
   that dw_pci_bottom_ack() is issued in the context of the demultiplex
   handler as it would otherwise be retriggered, but that just needs
   some research in the datasheet and experimental validation.

   The actual interrupt flow handler for the MSI interrupt is then no
   longer handle_edge_irq() as it should not issue the ack() again.

   From a correctness POV vs. interrupt handling of a edge type
   interrupt, that's perfectly fine.

#2 Trivial enough to solve by flushing and invalidating the irq_work
   before shutdown/removal.

#3 That might be an issue for some extreme scenarios, but in the general
   case I claim that it does not matter at all.

   Those scenarios where it really matters can affine the interrupt to
   the affinity of the demultiplexing interrupt, which leads to #4

#4 That's a trivial problem to solve. There is absolutely no technical
   reason to make them hidden. Demultiplexing just works fine from the
   context of regular interrupts.

There are some other minor issues like CPU hotplug, but I can't see
anything truly complex in this design right now. Famous last words :)

At the driver level this becomes really simple. Affinity changes are
just selecting a target CPU and storing it for a trivial comparison to avoid
a CPU mask check in the demultiplex path. No fiddling with sibling
or parent interrupts, no locking issues. Thread affinity and other
things just work as expected.

I'm really tempted to play with that. Such infrastructure would allow
supporting PCI/multi-MSI on x86 without interrupt remapping, which has
been on the wishlist of a lot of people for many years.

Thanks,

        tglx
Tsai Sung-Fu March 7, 2025, 11:10 a.m. UTC | #8
Thanks for your detailed explanation and feedback, I am a bit confused about the
#4 you mentioned here ->

>     4) Affinity of the demultiplex interrupt

Are you saying there is a chance to queue this demultiplexing IRQ event
to the current running CPU ?
would it fall into the first if clause ?

>       if (desc->target_cpu == smp_processor_id()) {
>            handle_irq_desc(desc);

And that's really an approach worth to try, I will work on it.

Thanks
Thomas Gleixner March 7, 2025, 7:49 p.m. UTC | #9
On Fri, Mar 07 2025 at 19:10, Tsai Sung-Fu wrote:
> Thanks for your detailed explanation and feedback, I am a bit confused about the
> #4 you mentioned here ->
>
>>     4) Affinity of the demultiplex interrupt
>
> Are you saying there is a chance to queue this demultiplexing IRQ event
> to the current running CPU ?

The demultiplexing interrupt (currently a chained handler, which is
hidden from /proc/irq/) stays at the affinity which the kernel decided
to assign to it at startup. That means it can't be steered to a
particular CPU and nobody knows to which CPU it is affine. You can only
guess it from /proc/interrupts by observing where the associated
demultiplex interrupts are affine to.

So ideally you want to be able to affine the demultiplexing interrupt
too. That requires to switch it to a regular interrupt for
simplicity. We could expose those hidden chained handlers affinity too,
but that needs some surgery vs. locking etc.

> And that's really an approach worth to try, I will work on it.

I've played with this on top of variant of Marc's changes to use MSI
parent interrupts for such controllers too:

  https://lore.kernel.org/all/20241204124549.607054-1-maz@kernel.org/

A completely untested and largely uncompiled preview is here:

     https://tglx.de/~tglx/patches.tar

The MSI parent parts are in flux. Marc will look at them in the next
weeks, but I picked them up because it simplifies the whole business a
lot. If you find bugs in that series, you can keep them :)

Thanks,

        tglx
Tsai Sung-Fu March 11, 2025, 9:52 a.m. UTC | #10
Running some basic tests with this patch (
https://tglx.de/~tglx/patches.tar ) applied on my device, at first
glance, the affinity feature is working.

I didn't run stress test to test the stability, and the Kernel version
we used is a bit old, so I only applied change in this 2 patches

-> Subject: genirq: Add interrupt redirection infrastructure
-> Subject: PCI: dwc: Enable MSI affinity support

And adding if check on irq_chip_redirect_set_affinity() and
irq_set_redirect_target() to avoid cpumask_first() return nr_cpu_ids

int irq_chip_redirect_set_affinity(struct irq_data *data,
                                   const struct cpumask *dest, bool force)
{
     ....
     ....
     if (target >= nr_cpu_ids)
          target = smp_processor_id();
     irq_data_update_effective_affinity(data, cpumask_of(target));
}

static void irq_set_redirect_target(struct irq_desc *desc)
{
    const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data);
    unsigned int cpu = cpumask_first(m);

    if (cpu >= nr_cpu_ids)
       cpu = smp_processor_id();

    WRITE_ONCE(desc->redirect.target_cpu, cpu);
}

May I ask, would this patch be officially added to the 6.14 kernel ?

Thanks

On Sat, Mar 8, 2025 at 3:49 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Fri, Mar 07 2025 at 19:10, Tsai Sung-Fu wrote:
> > Thanks for your detailed explanation and feedback, I am a bit confused about the
> > #4 you mentioned here ->
> >
> >>     4) Affinity of the demultiplex interrupt
> >
> > Are you saying there is a chance to queue this demultiplexing IRQ event
> > to the current running CPU ?
>
> The demultiplexing interrupt (currently a chained handler, which is
> hidden from /proc/irq/) stays at the affinity which the kernel decided
> to assign to it at startup. That means it can't be steered to a
> particular CPU and nobody knows to which CPU it is affine. You can only
> guess it from /proc/interrupts by observing where the associated
> demultiplex interrupts are affine to.
>
> So ideally you want to be able to affine the demultiplexing interrupt
> too. That requires to switch it to a regular interrupt for
> simplicity. We could expose those hidden chained handlers affinity too,
> but that needs some surgery vs. locking etc.
>
> > And that's really an approach worth to try, I will work on it.
>
> I've played with this on top of variant of Marc's changes to use MSI
> parent interrupts for such controllers too:
>
>   https://lore.kernel.org/all/20241204124549.607054-1-maz@kernel.org/
>
> A completely untested and largely uncompiled preview is here:
>
>      https://tglx.de/~tglx/patches.tar
>
> The MSI parent parts are in flux. Marc will look at them in the next
> weeks, but I picked them up because it simplifies the whole business a
> lot. If you find bugs in that series, you can keep them :)
>
> Thanks,
>
>         tglx
Thomas Gleixner March 11, 2025, 2:05 p.m. UTC | #11
On Tue, Mar 11 2025 at 17:52, Tsai Sung-Fu wrote:

Please do not top-post and trim your replies.

> Running some basic tests with this patch (
> https://tglx.de/~tglx/patches.tar ) applied on my device, at first
> glance, the affinity feature is working.
>
> I didn't run stress test to test the stability, and the Kernel version
> we used is a bit old, so I only applied change in this 2 patches

I don't care about old kernels and what you can apply or not. Kernel
development happens against upstream and not against randomly chosen
private kernel versions.

> And adding if check on irq_chip_redirect_set_affinity() and
> irq_set_redirect_target() to avoid cpumask_first() return nr_cpu_ids

I assume you know how diff works.

> May I ask, would this patch be officially added to the 6.14 kernel ?

You may ask. But you should know the answer already, no?

The merge window for 6.14 closed on February 2nd with the release of
6.14-rc1. Anything which goes into Linus tree between rc1 and the final
release is fixes only.

This is new infrastructure, which has neither been posted nor reviewed
nor properly tested. There are also no numbers about the overhead and
no analysis whether that overhead causes regressions on existing setups.

These changes want to be:

   1) Put into a series with proper change logs

   2) Posted on the relevant mailing list

   3) Tested and proper numbers provided

So they are not even close to be ready for the 6.15 merge window, simply
because the irq tree is going to freeze at 6.14-rc7, i.e. by the end of
this week.

I'm not planning to work on them. Feel free to take the PoC patches,
polish them up and post them according to the documented process.

Thanks,

        tglx

Patch

diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
index d2291c3ceb8be..4cab9f05c8813 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -49,8 +49,7 @@  static struct irq_chip dw_pcie_msi_irq_chip = {
 
 static struct msi_domain_info dw_pcie_msi_domain_info = {
 	.flags	= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-		  MSI_FLAG_NO_AFFINITY | MSI_FLAG_PCI_MSIX |
-		  MSI_FLAG_MULTI_PCI_MSI,
+		  MSI_FLAG_PCI_MSIX | MSI_FLAG_MULTI_PCI_MSI,
 	.chip	= &dw_pcie_msi_irq_chip,
 };
 
@@ -117,6 +116,125 @@  static void dw_pci_setup_msi_msg(struct irq_data *d, struct msi_msg *msg)
 		(int)d->hwirq, msg->address_hi, msg->address_lo);
 }
 
+/*
+ * The algo here honor if there is any intersection of mask of
+ * the existing MSI vectors and the requesting MSI vector. So we
+ * could handle both narrow (1 bit set mask) and wide (0xffff...)
+ * cases, return -EINVAL and reject the request if the result of
+ * cpumask is empty, otherwise return 0 and have the calculated
+ * result on the mask_to_check to pass down to the irq_chip.
+ */
+static int dw_pci_check_mask_compatibility(struct dw_pcie_rp *pp,
+					   unsigned long ctrl,
+					   unsigned long hwirq_to_check,
+					   struct cpumask *mask_to_check)
+{
+	unsigned long end, hwirq;
+	const struct cpumask *mask;
+	unsigned int virq;
+
+	hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
+	end = hwirq + MAX_MSI_IRQS_PER_CTRL;
+	for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
+		if (hwirq == hwirq_to_check)
+			continue;
+		virq = irq_find_mapping(pp->irq_domain, hwirq);
+		if (!virq)
+			continue;
+		mask = irq_get_affinity_mask(virq);
+		if (!cpumask_and(mask_to_check, mask, mask_to_check))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void dw_pci_update_effective_affinity(struct dw_pcie_rp *pp,
+					     unsigned long ctrl,
+					     const struct cpumask *effective_mask,
+					     unsigned long hwirq_to_check)
+{
+	struct irq_desc *desc_downstream;
+	unsigned int virq_downstream;
+	unsigned long end, hwirq;
+
+	/*
+	 * Update all the irq_data's effective mask
+	 * bind to this MSI controller, so the correct
+	 * affinity would reflect on
+	 * /proc/irq/XXX/effective_affinity
+	 */
+	hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL;
+	end = hwirq + MAX_MSI_IRQS_PER_CTRL;
+	for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) {
+		virq_downstream = irq_find_mapping(pp->irq_domain, hwirq);
+		if (!virq_downstream)
+			continue;
+		desc_downstream = irq_to_desc(virq_downstream);
+		irq_data_update_effective_affinity(&desc_downstream->irq_data,
+						   effective_mask);
+	}
+}
+
+static int dw_pci_msi_set_affinity(struct irq_data *d,
+				   const struct cpumask *mask, bool force)
+{
+	struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	int ret;
+	int virq_parent;
+	unsigned long hwirq = d->hwirq;
+	unsigned long flags, ctrl;
+	struct irq_desc *desc_parent;
+	const struct cpumask *effective_mask;
+	cpumask_var_t mask_result;
+
+	ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL;
+	if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/*
+	 * Loop through all possible MSI vector to check if the
+	 * requested one is compatible with all of them
+	 */
+	raw_spin_lock_irqsave(&pp->lock, flags);
+	cpumask_copy(mask_result, mask);
+	ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result);
+	if (ret) {
+		dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n",
+			cpumask_pr_args(mask), d->irq);
+		goto unlock;
+	}
+
+	dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n",
+		cpumask_pr_args(mask_result), d->irq);
+
+	virq_parent = pp->msi_irq[ctrl];
+	desc_parent = irq_to_desc(virq_parent);
+	ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data,
+							   mask_result, force);
+
+	if (ret < 0)
+		goto unlock;
+
+	switch (ret) {
+	case IRQ_SET_MASK_OK:
+	case IRQ_SET_MASK_OK_DONE:
+		cpumask_copy(desc_parent->irq_common_data.affinity, mask);
+		fallthrough;
+	case IRQ_SET_MASK_OK_NOCOPY:
+		break;
+	}
+
+	effective_mask = irq_data_get_effective_affinity_mask(&desc_parent->irq_data);
+	dw_pci_update_effective_affinity(pp, ctrl, effective_mask, hwirq);
+
+unlock:
+	free_cpumask_var(mask_result);
+	raw_spin_unlock_irqrestore(&pp->lock, flags);
+	return ret < 0 ? ret : IRQ_SET_MASK_OK_NOCOPY;
+}
+
 static void dw_pci_bottom_mask(struct irq_data *d)
 {
 	struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
@@ -172,10 +290,14 @@  static struct irq_chip dw_pci_msi_bottom_irq_chip = {
 	.name = "DWPCI-MSI",
 	.irq_ack = dw_pci_bottom_ack,
 	.irq_compose_msi_msg = dw_pci_setup_msi_msg,
+	.irq_set_affinity = dw_pci_msi_set_affinity,
 	.irq_mask = dw_pci_bottom_mask,
 	.irq_unmask = dw_pci_bottom_unmask,
 };
 
+static struct lock_class_key dw_pci_irq_lock_class;
+static struct lock_class_key dw_pci_irq_request_class;
+
 static int dw_pcie_irq_domain_alloc(struct irq_domain *domain,
 				    unsigned int virq, unsigned int nr_irqs,
 				    void *args)
@@ -195,11 +317,14 @@  static int dw_pcie_irq_domain_alloc(struct irq_domain *domain,
 	if (bit < 0)
 		return -ENOSPC;
 
-	for (i = 0; i < nr_irqs; i++)
+	for (i = 0; i < nr_irqs; i++) {
+		irq_set_lockdep_class(virq + i, &dw_pci_irq_lock_class,
+				      &dw_pci_irq_request_class);
 		irq_domain_set_info(domain, virq + i, bit + i,
 				    pp->msi_irq_chip,
 				    pp, handle_edge_irq,
 				    NULL, NULL);
+	}
 
 	return 0;
 }