[v3,2/2] irqchip/gic-v3-its: Balance initial LPI affinity across CPUs

Message ID 20200316115433.9017-3-maz@kernel.org
State New, archived
Series irqchip/gic-v3-its: Balance LPI affinity across CPUs

Commit Message

Marc Zyngier March 16, 2020, 11:54 a.m. UTC
When mapping an LPI, the ITS driver picks the first possible
affinity, which is in most cases CPU0, assuming that if
that's not suitable, someone will come and set the affinity
to something more interesting.

That apparently isn't the case, and people complain of poor
performance when many interrupts are glued to the same CPU.
So let's place the interrupts by finding the "least loaded"
CPU (that is, the one that has the fewest LPIs mapped to it).
So-called 'managed' interrupts are an interesting case where
the affinity is actually dictated by the kernel itself, and
we should honor this.

Reported-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/1575642904-58295-1-git-send-email-john.garry@huawei.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 drivers/irqchip/irq-gic-v3-its.c | 118 ++++++++++++++++++++++++-------
 1 file changed, 92 insertions(+), 26 deletions(-)

Comments

John Garry March 16, 2020, 1:02 p.m. UTC | #1
On 16/03/2020 11:54, Marc Zyngier wrote:
> When mapping a LPI, the ITS driver picks the first possible
> affinity, which is in most cases CPU0, assuming that if
> that's not suitable, someone will come and set the affinity
> to something more interesting.
> 
> It apparently isn't the case, and people complain of poor
> performance when many interrupts are glued to the same CPU.
> So let's place the interrupts by finding the "least loaded"
> CPU (that is, the one that has the fewer LPIs mapped to it).
> So called 'managed' interrupts are an interesting case where
> the affinity is actually dictated by the kernel itself, and
> we should honor this.
> 
> Reported-by: John Garry <john.garry@huawei.com>
> Link: https://lore.kernel.org/r/1575642904-58295-1-git-send-email-john.garry@huawei.com
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>   drivers/irqchip/irq-gic-v3-its.c | 118 ++++++++++++++++++++++++-------
>   1 file changed, 92 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index 941786e1e8f7..7f1b731c04bb 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -1531,31 +1531,107 @@ static void its_dec_lpi_count(struct irq_data *d, int cpu)
>   		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
>   }
>   
> +static unsigned int cpumask_pick_least_loaded(struct irq_data *d,
> +					      const struct cpumask *cpu_mask)
> +{
> +	unsigned int cpu = nr_cpu_ids, tmp;
> +	int count = S32_MAX;
> +
> +	for_each_cpu(tmp, cpu_mask) {

Hi Marc,

> +		int this_count = its_read_lpi_count(d, tmp);

Not sure if it's intentional, but now there seems to be a subtle 
difference to what Thomas described for non-managed interrupts - for 
non-managed interrupts, x86 selects the CPU based on the total interrupt 
load per CPU (or, more specifically, lowest vector allocation count), 
and not just the non-managed load. Or maybe I misread it.

Anyway, we can test this now for NVMe with its managed interrupts.

Cheers,
John

> +		if (this_count < count) {
> +			cpu = tmp;
> +		        count = this_count;
> +		}
> +	}
> +
> +	return cpu;
> +}
> +
> +/*
> + * As suggested by Thomas Gleixner in:
> + * https://lore.kernel.org/r/87h80q2aoc.fsf@nanos.tec.linutronix.de
> + */
> +static int its_select_cpu(struct irq_data *d,
> +			  const struct cpumask *aff_mask)
> +{
> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
> +	cpumask_var_t tmpmask;
> +	int cpu, node;
> +
> +	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	node = its_dev->its->numa_node;
> +
> +	if (!irqd_affinity_is_managed(d)) {
> +		/* First try the NUMA node */
> +		if (node != NUMA_NO_NODE) {
> +			/*
> +			 * Try the intersection of the affinity mask and the
> +			 * node mask (and the online mask, just to be safe).
> +			 */
> +			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
> +			cpumask_and(tmpmask, tmpmask, cpu_online_mask);
> +
> +			/* If that doesn't work, try the nodemask itself */
> +			if (cpumask_empty(tmpmask))
> +				cpumask_and(tmpmask, cpumask_of_node(node), cpu_online_mask);
> +
> +			cpu = cpumask_pick_least_loaded(d, tmpmask);
> +			if (cpu < nr_cpu_ids)
> +				goto out;
> +
> +			/* If we can't cross sockets, give up */
> +			if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144))
> +				goto out;
> +
> +			/* If the above failed, expand the search */
> +		}
> +
> +		/* Try the intersection of the affinity and online masks */
> +		cpumask_and(tmpmask, aff_mask, cpu_online_mask);
> +
> +		/* If that doesn't fly, the online mask is the last resort */
> +		if (cpumask_empty(tmpmask))
> +			cpumask_copy(tmpmask, cpu_online_mask);
> +
> +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> +	} else {
> +		cpumask_and(tmpmask, irq_data_get_affinity_mask(d), cpu_online_mask);
> +
> +		/* If we cannot cross sockets, limit the search to that node */
> +		if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) &&
> +		    node != NUMA_NO_NODE)
> +			cpumask_and(tmpmask, tmpmask, cpumask_of_node(node));
> +
> +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> +	}
> +out:
> +	free_cpumask_var(tmpmask);
> +
> +	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, cpumask_pr_args(aff_mask), cpu);
> +	return cpu;
> +}
> +
>   static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
>   			    bool force)
>   {
> -	unsigned int cpu;
> -	const struct cpumask *cpu_mask = cpu_online_mask;
>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>   	struct its_collection *target_col;
>   	u32 id = its_get_event_id(d);
> +	int cpu;
>   
>   	/* A forwarded interrupt should use irq_set_vcpu_affinity */
>   	if (irqd_is_forwarded_to_vcpu(d))
>   		return -EINVAL;
>   
> -       /* lpi cannot be routed to a redistributor that is on a foreign node */
> -	if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
> -		if (its_dev->its->numa_node >= 0) {
> -			cpu_mask = cpumask_of_node(its_dev->its->numa_node);
> -			if (!cpumask_intersects(mask_val, cpu_mask))
> -				return -EINVAL;
> -		}
> -	}
> -
> -	cpu = cpumask_any_and(mask_val, cpu_mask);
> +	if (!force)
> +		cpu = its_select_cpu(d, mask_val);
> +	else
> +		cpu = cpumask_pick_least_loaded(d, mask_val);
>   
> -	if (cpu >= nr_cpu_ids)
> +	if (cpu < 0 || cpu >= nr_cpu_ids)
>   		return -EINVAL;
>   
>   	/* don't set the affinity when the target cpu is same as current one */
> @@ -3455,21 +3531,11 @@ static int its_irq_domain_activate(struct irq_domain *domain,
>   {
>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>   	u32 event = its_get_event_id(d);
> -	const struct cpumask *cpu_mask = cpu_online_mask;
>   	int cpu;
>   
> -	/* get the cpu_mask of local node */
> -	if (its_dev->its->numa_node >= 0)
> -		cpu_mask = cpumask_of_node(its_dev->its->numa_node);
> -
> -	/* Bind the LPI to the first possible CPU */
> -	cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
> -	if (cpu >= nr_cpu_ids) {
> -		if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
> -			return -EINVAL;
> -
> -		cpu = cpumask_first(cpu_online_mask);
> -	}
> +	cpu = its_select_cpu(d, cpu_online_mask);
> +	if (cpu < 0 || cpu >= nr_cpu_ids)
> +		return -EINVAL;
>   
>   	its_inc_lpi_count(d, cpu);
>   	its_dev->event_map.col_map[event] = cpu;
>
Marc Zyngier March 16, 2020, 1:14 p.m. UTC | #2
On 2020-03-16 13:02, John Garry wrote:

Hi John,

> Hi Marc,
> 
>> +		int this_count = its_read_lpi_count(d, tmp);
> 
> Not sure if it's intentional, but now there seems to be a subtle
> difference to what Thomas described for non-managed interrupts - for
> non-managed interrupts, x86 selects the CPU based on the total
> interrupt load per CPU (or, more specifically, lowest vector
> allocation count), and not just the non-managed load. Or maybe I
> misread it.

So far, I'm trying to keep the two allocation paths separate, as the
two systems I have access to have very different behaviours: D05 has
no managed interrupts to speak of, and my top-secret work machine
has almost no unmanaged interrupts, so the two sets are almost
completely disjoint.

Also, it all depends on the interrupt allocation order, and whether
something will rebalance the non-managed interrupts at a later time.
At least, these two patches make it easy to alter the placement policy
(the behaviour you describe above is a 2 line change).
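
For reference, a minimal sketch of what that could look like, assuming
the per-CPU managed/unmanaged counters introduced in patch 1/2
(illustration only, not the posted code):

static u32 its_read_lpi_count(struct irq_data *d, int cpu)
{
	struct cpu_lpi_count *pcpu_count = per_cpu_ptr(&cpu_lpi_count, cpu);

	/*
	 * Hypothetical variant: compare CPUs on their total LPI load
	 * (managed + unmanaged), mirroring the x86 behaviour described
	 * above, instead of keeping the two pools separate.
	 */
	return atomic_read(&pcpu_count->managed) +
	       atomic_read(&pcpu_count->unmanaged);
}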

> Anyway, we can test this now for NVMe with its managed interrupts.

Looking forward to hearing from you!

         M.
John Garry March 17, 2020, 6:43 p.m. UTC | #3
>>
>>> +        int this_count = its_read_lpi_count(d, tmp);
>>
>> Not sure if it's intentional, but now there seems to be a subtle
>> difference to what Thomas described for non-managed interrupts - for
>> non-managed interrupts, x86 selects the CPU based on the total
>> interrupt load per CPU (or, more specifically, lowest vector
>> allocation count), and not just the non-managed load. Or maybe I
>> misread it.
> 
> So far, I'm trying to keep the two allocation paths separate, as the
> two systems I have access to have very different behaviours: D05 has
> no managed interrupts to speak of, and my top-secret work machine
> has almost no unmanaged interrupts, so the two sets are almost
> completely disjoint.

Sure, but I'd say that it would be a more common scenario to have a 
mixture of both.

> 
> Also, it all depends on the interrupt allocation order, and whether
> something will rebalance the non-managed interrupts at a later time.
> At least, these two patches make it easy to alter the placement policy
> (the behaviour you describe above is a 2 line change).
> 
>> Anyway, we can test this now for NVMe with its managed interrupts.
> 
> Looking forward to hearing from you!
>

On my D06CS board (128 core), there seems to be something wrong, as the 
q0 affinity mask looks incorrect:

PCI name is 81:00.0: nvme0n1
         irq 322, cpu list 69, effective list 69
         irq 325, cpu list 32-38, effective list 32
         irq 326, cpu list 39-45, effective list 40
         irq 327, cpu list 46-51, effective list 47
         irq 328, cpu list 52-57, effective list 53
         irq 329, cpu list 58-63, effective list 59

And something stranger for my colleague Luo Jiaxing, specifically the 
effective affinity:

PCI name is 85:00.0: nvme2n1
irq 196, cpu list 0-31, effective list 82
irq 377, cpu list 32-38, effective list 32
irq 378, cpu list 39-45, effective list 39
irq 379, cpu list 46-51, effective list 46

But then v5.6-rc5 vanilla also looks to have this issue when I tested on 
my board:

john@ubuntu:~$ more /proc/irq/322/smp_affinity_list
69

My D06ES (96 core) board looks sensible for the affinity in this regard 
(I did not try vanilla v5.6-rc5, but only with your patches on top). 
I'll need to debug this.

Cheers,
John
John Garry March 18, 2020, 12:22 p.m. UTC | #4
I may have an idea about this:
irq 196, cpu list 0-31, effective list 82

Just going back to comment on the code:

> +/*
> + * As suggested by Thomas Gleixner in:
> + * https://lore.kernel.org/r/87h80q2aoc.fsf@nanos.tec.linutronix.de
> + */
> +static int its_select_cpu(struct irq_data *d,
> +			  const struct cpumask *aff_mask)
> +{
> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
> +	cpumask_var_t tmpmask;
> +	int cpu, node;
> +
> +	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	node = its_dev->its->numa_node;
> +
> +	if (!irqd_affinity_is_managed(d)) {
> +		/* First try the NUMA node */
> +		if (node != NUMA_NO_NODE) {
> +			/*
> +			 * Try the intersection of the affinity mask and the
> +			 * node mask (and the online mask, just to be safe).
> +			 */
> +			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
> +			cpumask_and(tmpmask, tmpmask, cpu_online_mask);
> +
> +			/* If that doesn't work, try the nodemask itself */

So if tmpmask is empty...

> +			if (cpumask_empty(tmpmask))
> +				cpumask_and(tmpmask, cpumask_of_node(node), cpu_online_mask);

  now the tmpmask may have no intersection with the aff_mask...

> +
> +			cpu = cpumask_pick_least_loaded(d, tmpmask);
> +			if (cpu < nr_cpu_ids)
> +				goto out;
> +
> +			/* If we can't cross sockets, give up */
> +			if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144))
> +				goto out;
> +
> +			/* If the above failed, expand the search */
> +		}

SNIP

> +out:
> +	free_cpumask_var(tmpmask);
> +
> +	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, cpumask_pr_args(aff_mask), cpu);
> +	return cpu;
> +}
> +
>   static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
>   			    bool force)
>   {
> -	unsigned int cpu;
> -	const struct cpumask *cpu_mask = cpu_online_mask;
>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>   	struct its_collection *target_col;
>   	u32 id = its_get_event_id(d);
> +	int cpu;
>   
>   	/* A forwarded interrupt should use irq_set_vcpu_affinity */
>   	if (irqd_is_forwarded_to_vcpu(d))
>   		return -EINVAL;
>   
> -       /* lpi cannot be routed to a redistributor that is on a foreign node */
> -	if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
> -		if (its_dev->its->numa_node >= 0) {
> -			cpu_mask = cpumask_of_node(its_dev->its->numa_node);
> -			if (!cpumask_intersects(mask_val, cpu_mask))
> -				return -EINVAL;
> -		}
> -	}
> -
> -	cpu = cpumask_any_and(mask_val, cpu_mask);
> +	if (!force)
> +		cpu = its_select_cpu(d, mask_val);
> +	else
> +		cpu = cpumask_pick_least_loaded(d, mask_val);
>   
> -	if (cpu >= nr_cpu_ids)
> +	if (cpu < 0 || cpu >= nr_cpu_ids)
>   		return -EINVAL;

Annotate missing code:

	if (cpu < 0 || cpu >= nr_cpu_ids)
		return -EINVAL;

	if (cpu != its_dev->event_map.col_map[id]) {
		its_inc_lpi_count(d, cpu);
		its_dec_lpi_count(d, its_dev->event_map.col_map[id]);
		target_col = &its_dev->its->collections[cpu];
		its_send_movi(its_dev, target_col, id);
		its_dev->event_map.col_map[id] = cpu;
		irq_data_update_effective_affinity(d, cpumask_of(cpu));
	}

So cpu may not be a member of mask_val. Hence the inconsistency between
the affinity list and the effective affinity. We could just drop the AND
of the ITS node mask in its_select_cpu().
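
For illustration, a rough sketch of that idea (untested): in the
non-managed path, keep aff_mask in the intersection and simply fall
through when it turns out empty, rather than widening the search to the
bare node mask:

		/* First try the NUMA node */
		if (node != NUMA_NO_NODE) {
			/*
			 * Sketch: only consider CPUs that are in both the
			 * node mask and the requested affinity mask, so
			 * that the effective affinity stays a subset of
			 * what was asked for.
			 */
			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
			cpumask_and(tmpmask, tmpmask, cpu_online_mask);

			cpu = cpumask_pick_least_loaded(d, tmpmask);
			if (cpu < nr_cpu_ids)
				goto out;

			/*
			 * Empty intersection: fall through to the
			 * aff_mask/online search below instead of picking
			 * a CPU outside the requested mask.
			 */
		}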

Anyway, I don't think that this should stop us testing.

Cheers,
John
Marc Zyngier March 18, 2020, 2:04 p.m. UTC | #5
On 2020-03-18 12:22, John Garry wrote:
> I may have an idea about this:
> irq 196, cpu list 0-31, effective list 82
> 
> Just going back to comment on the code:
> 
>> +/*
>> + * As suggested by Thomas Gleixner in:
>> + * https://lore.kernel.org/r/87h80q2aoc.fsf@nanos.tec.linutronix.de
>> + */
>> +static int its_select_cpu(struct irq_data *d,
>> +			  const struct cpumask *aff_mask)
>> +{
>> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>> +	cpumask_var_t tmpmask;
>> +	int cpu, node;
>> +
>> +	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
>> +		return -ENOMEM;
>> +
>> +	node = its_dev->its->numa_node;
>> +
>> +	if (!irqd_affinity_is_managed(d)) {
>> +		/* First try the NUMA node */
>> +		if (node != NUMA_NO_NODE) {
>> +			/*
>> +			 * Try the intersection of the affinity mask and the
>> +			 * node mask (and the online mask, just to be safe).
>> +			 */
>> +			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
>> +			cpumask_and(tmpmask, tmpmask, cpu_online_mask);
>> +
>> +			/* If that doesn't work, try the nodemask itself */
> 
> So if tmpmsk is empty...

Which means the proposed affinity mask isn't part of the node mask in
the first place.
Why did we get such an affinity in the first place?

> 
>> +			if (cpumask_empty(tmpmask))
>> +				cpumask_and(tmpmask, cpumask_of_node(node), cpu_online_mask);
> 
>  now the tmpmask may have no intersection with the aff_mask...

But it has the mask for CPUs that are best suited for this interrupt, 
right?
If I understand the topology of your machine, it has an ITS per 64 CPUs,
and this device is connected to the ITS that serves the second socket.

> 
>> +
>> +			cpu = cpumask_pick_least_loaded(d, tmpmask);
>> +			if (cpu < nr_cpu_ids)
>> +				goto out;
>> +
>> +			/* If we can't cross sockets, give up */
>> +			if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144))
>> +				goto out;
>> +
>> +			/* If the above failed, expand the search */
>> +		}
> 
> SNIP
> 
>> +out:
>> +	free_cpumask_var(tmpmask);
>> +
>> +	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, 
>> cpumask_pr_args(aff_mask), cpu);
>> +	return cpu;
>> +}
>> +
>>   static int its_set_affinity(struct irq_data *d, const struct cpumask 
>> *mask_val,
>>   			    bool force)
>>   {
>> -	unsigned int cpu;
>> -	const struct cpumask *cpu_mask = cpu_online_mask;
>>   	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>>   	struct its_collection *target_col;
>>   	u32 id = its_get_event_id(d);
>> +	int cpu;
>>     	/* A forwarded interrupt should use irq_set_vcpu_affinity */
>>   	if (irqd_is_forwarded_to_vcpu(d))
>>   		return -EINVAL;
>>   -       /* lpi cannot be routed to a redistributor that is on a 
>> foreign node */
>> -	if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
>> -		if (its_dev->its->numa_node >= 0) {
>> -			cpu_mask = cpumask_of_node(its_dev->its->numa_node);
>> -			if (!cpumask_intersects(mask_val, cpu_mask))
>> -				return -EINVAL;
>> -		}
>> -	}
>> -
>> -	cpu = cpumask_any_and(mask_val, cpu_mask);
>> +	if (!force)
>> +		cpu = its_select_cpu(d, mask_val);
>> +	else
>> +		cpu = cpumask_pick_least_loaded(d, mask_val);
>>   -	if (cpu >= nr_cpu_ids)
>> +	if (cpu < 0 || cpu >= nr_cpu_ids)
>>   		return -EINVAL;
> 
> Annotate missing code:
> 
> 	if (cpu < 0 || cpu >= nr_cpu_ids)
> 		return -EINVAL;
> 
> 	if (cpu != its_dev->event_map.col_map[id]) {
> 		its_inc_lpi_count(d, cpu);
> 		its_dec_lpi_count(d, its_dev->event_map.col_map[id]);
> 		target_col = &its_dev->its->collections[cpu];
> 		its_send_movi(its_dev, target_col, id);
> 		its_dev->event_map.col_map[id] = cpu;
> 		irq_data_update_effective_affinity(d, cpumask_of(cpu));
> 	}
> 
> So cpu may not be a member of mask_val. Hence the inconsistency of the
> affinity list and effective affinity. We could just drop the AND of
> the ITS node mask in its_select_cpu().

That would be a departure from the algorithm Thomas proposed, which made
a lot of sense in my opinion. What its_select_cpu() does in this case is
probably the best that can be achieved from a latency perspective,
as it keeps the interrupt local to the socket that generated it.

What I wonder is how we end up with this silly aff_mask in the first place.

> Anyway, I don't think that this should stop us testing.

Agreed.

         M.
Marc Zyngier March 18, 2020, 2:16 p.m. UTC | #6
On 2020-03-17 18:43, John Garry wrote:
>>> 
>>>> +        int this_count = its_read_lpi_count(d, tmp);
>>> 
>>> Not sure if it's intentional, but now there seems to be a subtle
>>> difference to what Thomas described for non-managed interrupts - for
>>> non-managed interrupts, x86 selects the CPU based on the total
>>> interrupt load per CPU (or, more specifically, lowest vector
>>> allocation count), and not just the non-managed load. Or maybe I
>>> misread it.
>> 
>> So far, I'm trying to keep the two allocation paths separate, as the
>> two systems I have access to have very different behaviours: D05 has
>> no managed interrupts to speak of, and my top-secret work machine
>> has almost no unmanaged interrupts, so the two sets are almost
>> completely disjoint.
> 
> Sure, but I'd say that it would be a more common scenario to have a
> mixture of both.
> 
>> 
>> Also, it all depends on the interrupt allocation order, and whether
>> something will rebalance the non-managed interrupts at a later time.
>> At least, these two patches make it easy to alter the placement policy
>> (the behaviour you describe above is a 2 line change).
>> 
>>> Anyway, we can test this now for NVMe with its managed interrupts.
>> 
>> Looking forward to hearing from you!
>> 
> 
> On my D06CS board (128 core), there seems to be something wrong, as
> the q0 affinity mask looks incorrect:
> 
> PCI name is 81:00.0: nvme0n1
> 
> 
>         irq 322, cpu list 69, effective list 69
> 
> 
>         irq 325, cpu list 32-38, effective list 32
> 
> 
>         irq 326, cpu list 39-45, effective list 40
> 
> 
>         irq 327, cpu list 46-51, effective list 47
> 
> 
>         irq 328, cpu list 52-57, effective list 53
> 
> 
>         irq 329, cpu list 58-63, effective list 59


Sorry, can you explain in more detail what you find wrong in this log?
Is it that interrupt 322 has a single CPU affinity instead of a list?

> And something stranger for my colleague Luo Jiaxing, specifically the
> effective affinity:
> 
> PCI name is 85:00.0: nvme2n1
> irq 196, cpu list 0-31, effective list 82

Right, this one we have seen in your other email. Being a non-managed
interrupt, it lands on the closest socket.

> irq 377, cpu list 32-38, effective list 32
> irq 378, cpu list 39-45, effective list 39
> irq 379, cpu list 46-51, effective list 46
> 
> But then v5.6-rc5 vanilla also looks to have this issue when I tested
> on my board:
> 
> john@ubuntu:~$ more /proc/irq/322/smp_affinity_list
> 
> 
> 69
> 
> My D06ES (96 core) board looks sensible for the affinity in this
> regard (I did not try vanilla v5.6-rc5, but only with your patches on
> top). I'll need to debug this.

Thanks,

         M.
John Garry March 18, 2020, 2:25 p.m. UTC | #7
On 18/03/2020 14:16, Marc Zyngier wrote:
>>
>> On my D06CS board (128 core), there seems to be something wrong, as
>> the q0 affinity mask looks incorrect:
>>
>> PCI name is 81:00.0: nvme0n1
>>
>>
>>         irq 322, cpu list 69, effective list 69
>>
>>

...

> 
> Sorry, can you explain in more detail what you find wrong in this log?
> Is it that interrupt 322 has a single CPU affinity instead of a list?
> 
>> And something stranger for my colleague Luo Jiaxing, specifically the 

Hi Marc,

Sorry, ignore this. I just realized afterwards that the NVMe PCI driver
reserves the queue 0 vector without affinity spreading, i.e. it is
non-managed.
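
For context, this is roughly how a driver ends up with such a vector,
using the irq_affinity pre-vector mechanism (illustrative function name
and parameters, not the exact NVMe code):

static int example_setup_vectors(struct pci_dev *pdev, unsigned int max_vecs)
{
	/*
	 * One pre-vector is excluded from managed affinity spreading,
	 * so it behaves like a regular non-managed IRQ and can be
	 * moved by userspace/irqbalance.
	 */
	struct irq_affinity affd = {
		.pre_vectors = 1,
	};

	return pci_alloc_irq_vectors_affinity(pdev, 2, max_vecs,
					      PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
					      &affd);
}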

Cheers,
John
John Garry March 18, 2020, 3:34 p.m. UTC | #8
>>> +static int its_select_cpu(struct irq_data *d,
>>> +			  const struct cpumask *aff_mask)
>>> +{
>>> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>>> +	cpumask_var_t tmpmask;
>>> +	int cpu, node;
>>> +
>>> +	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
>>> +		return -ENOMEM;
>>> +
>>> +	node = its_dev->its->numa_node;
>>> +
>>> +	if (!irqd_affinity_is_managed(d)) {
>>> +		/* First try the NUMA node */
>>> +		if (node != NUMA_NO_NODE) {
>>> +			/*
>>> +			 * Try the intersection of the affinity mask and the
>>> +			 * node mask (and the online mask, just to be safe).
>>> +			 */
>>> +			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
>>> +			cpumask_and(tmpmask, tmpmask, cpu_online_mask);
>>> +
>>> +			/* If that doesn't work, try the nodemask itself */
>>
>> So if tmpmsk is empty...
> 
> Which means the proposed affinity mask isn't part of the node mask the
> first place.
> Why did we get such an affinity the first place?

It seems to be just irqbalance setting the affinity mask via procfs:

[44.782116] Calltrace:
[44.782119] its_select_cpu+0x420/0x6e0
[44.782121] its_set_affinity+0x180/0x208
[44.782126] msi_domain_set_affinity+0x44/0xb8
[44.782130] irq_do_set_affinity+0x48/0x190
[44.782132] irq_set_affinity_locked+0xc0/0xe8
[44.782134] __irq_set_affinity+0x48/0x78
[44.782136] write_irq_affinity.isra.8+0xec/0x110
[44.782138] irq_affinity_proc_write+0x1c/0x28
[44.782142] proc_reg_write+0x70/0xb8
[44.782147] __vfs_write+0x18/0x40
[44.782149] vfs_write+0xb0/0x1d0
[44.782151] ksys_write+0x64/0xe8
[44.782154] __arm64_sys_write+0x18/0x20
[44.782157] el0_svc_common.constprop.2+0x88/0x150
[44.782159] do_el0_svc+0x20/0x80
[44.782162] el0_sync_handler+0x118/0x188
[44.782164] el0_sync+0x140/0x180

And for some reason fancied cpu62.

> 
>>
>>> +			if (cpumask_empty(tmpmask))
>>> +				cpumask_and(tmpmask, cpumask_of_node(node), cpu_online_mask);
>>
>>   now the tmpmask may have no intersection with the aff_mask...
> 
> But it has the mask for CPUs that are best suited for this interrupt,
> right?
> If I understand the topology of your machine, it has an ITS per 64 CPUs,
> and
> this device is connected to the ITS that serves the second socket.

No, this one (D06ES) has a single ITS:

john@ubuntu:~/kernel-dev$ dmesg | grep ITS
[    0.000000] SRAT: PXM 0 -> ITS 0 -> Node 0
[    0.000000] ITS [mem 0x202100000-0x20211ffff]
[    0.000000] ITS@0x0000000202100000: Using ITS number 0
[    0.000000] ITS@0x0000000202100000: allocated 8192 Devices 
@23ea9f0000 (indirect, esz 8, psz 16K, shr 1)
[    0.000000] ITS@0x0000000202100000: allocated 2048 Virtual CPUs 
@23ea9d8000 (indirect, esz 16, psz 4K, shr 1)
[    0.000000] ITS@0x0000000202100000: allocated 256 Interrupt 
Collections @23ea9d3000 (flat, esz 16, psz 4K, shr 1)
[    0.000000] ITS: Using DirectLPI for VPE invalidation
[    0.000000] ITS: Enabling GICv4 support
[    0.044034] Platform MSI: ITS@0x202100000 domain created
[    0.044042] PCI/MSI: ITS@0x202100000 domain created

D06CS has 2x ITS, as you may know :)

And, FWIW, the device is on the 2nd socket, numa node #2.

So the cpu mask of node #0 (where the ITS lives) is 0-23. So no 
intersection with what userspace requested.

>> 	if (cpu < 0 || cpu >= nr_cpu_ids)
>> 		return -EINVAL;
>>
>> 	if (cpu != its_dev->event_map.col_map[id]) {
>> 		its_inc_lpi_count(d, cpu);
>> 		its_dec_lpi_count(d, its_dev->event_map.col_map[id]);
>> 		target_col = &its_dev->its->collections[cpu];
>> 		its_send_movi(its_dev, target_col, id);
>> 		its_dev->event_map.col_map[id] = cpu;
>> 		irq_data_update_effective_affinity(d, cpumask_of(cpu));
>> 	}
>>
>> So cpu may not be a member of mask_val. Hence the inconsistency of the
>> affinity list and effective affinity. We could just drop the AND of
>> the ITS node mask in its_select_cpu().
> 
> That would be a departure from the algorithm Thomas proposed, which made
> a lot of sense in my opinion. What its_select_cpu() does in this case is
> probably the best that can be achieved from a latency perspective,
> as it keeps the interrupt local to the socket that generated it.

We seem to be following what Thomas described for a non-managed 
interrupt bound to a node. But is this interrupt bound to the node?

Regardless of that, what you're saying seems right - keep the interrupt
local to its node. But the problem is that userspace is doing its own
thing.

> 
> What I wonder is how we end-up with this silly aff_mask the first place.

Cheers,
John

BTW, sorry if any text formatting is mangled. I have to improve my WFH 
setup....
Marc Zyngier March 18, 2020, 5:30 p.m. UTC | #9
On 2020-03-18 15:34, John Garry wrote:
>>>> +static int its_select_cpu(struct irq_data *d,
>>>> +			  const struct cpumask *aff_mask)
>>>> +{
>>>> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
>>>> +	cpumask_var_t tmpmask;
>>>> +	int cpu, node;
>>>> +
>>>> +	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
>>>> +		return -ENOMEM;
>>>> +
>>>> +	node = its_dev->its->numa_node;
>>>> +
>>>> +	if (!irqd_affinity_is_managed(d)) {
>>>> +		/* First try the NUMA node */
>>>> +		if (node != NUMA_NO_NODE) {
>>>> +			/*
>>>> +			 * Try the intersection of the affinity mask and the
>>>> +			 * node mask (and the online mask, just to be safe).
>>>> +			 */
>>>> +			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
>>>> +			cpumask_and(tmpmask, tmpmask, cpu_online_mask);
>>>> +
>>>> +			/* If that doesn't work, try the nodemask itself */
>>> 
>>> So if tmpmsk is empty...
>> 
>> Which means the proposed affinity mask isn't part of the node mask the
>> first place.
>> Why did we get such an affinity the first place?
> 
> It seems to be just irqbalance setting the affinity mask via sysfs:
> 
> [44.782116] Calltrace:
> [44.782119] its_select_cpu+0x420/0x6e0
> [44.782121] its_set_affinity+0x180/0x208
> [44.782126] msi_domain_set_affinity+0x44/0xb8
> [44.782130] irq_do_set_affinity+0x48/0x190
> [44.782132] irq_set_affinity_locked+0xc0/0xe8
> [44.782134] __irq_set_affinity+0x48/0x78
> [44.782136] write_irq_affinity.isra.8+0xec/0x110
> [44.782138] irq_affinity_proc_write+0x1c/0x28
> [44.782142] proc_reg_write+0x70/0xb8
> [44.782147] __vfs_write+0x18/0x40
> [44.782149] vfs_write+0xb0/0x1d0
> [44.782151] ksys_write+0x64/0xe8
> [44.782154] __arm64_sys_write+0x18/0x20
> [44.782157] el0_svc_common.constprop.2+0x88/0x150
> [44.782159] do_el0_svc+0x20/0x80
> [44.782162] el0_sync_handler+0x118/0x188
> [44.782164] el0_sync+0x140/0x180
> 
> And for some reason fancied cpu62.

Hmmm. OK. I'm surprised that irqbalance tries to set a range of CPUs
instead of a particular CPU, though.

> 
>> 
>>> 
>>>> +			if (cpumask_empty(tmpmask))
>>>> +				cpumask_and(tmpmask, cpumask_of_node(node), cpu_online_mask);
>>> 
>>>   now the tmpmask may have no intersection with the aff_mask...
>> 
>> But it has the mask for CPUs that are best suited for this interrupt,
>> right?
>> If I understand the topology of your machine, it has an ITS per 64 
>> CPUs,
>> and
>> this device is connected to the ITS that serves the second socket.
> 
> No, this one (D06ES) has a single ITS:
> 
> john@ubuntu:~/kernel-dev$ dmesg | grep ITS
> [    0.000000] SRAT: PXM 0 -> ITS 0 -> Node 0
> [    0.000000] ITS [mem 0x202100000-0x20211ffff]
> [    0.000000] ITS@0x0000000202100000: Using ITS number 0
> [    0.000000] ITS@0x0000000202100000: allocated 8192 Devices
> @23ea9f0000 (indirect, esz 8, psz 16K, shr 1)
> [    0.000000] ITS@0x0000000202100000: allocated 2048 Virtual CPUs
> @23ea9d8000 (indirect, esz 16, psz 4K, shr 1)
> [    0.000000] ITS@0x0000000202100000: allocated 256 Interrupt
> Collections @23ea9d3000 (flat, esz 16, psz 4K, shr 1)
> [    0.000000] ITS: Using DirectLPI for VPE invalidation
> [    0.000000] ITS: Enabling GICv4 support
> [    0.044034] Platform MSI: ITS@0x202100000 domain created
> [    0.044042] PCI/MSI: ITS@0x202100000 domain created

There's something I'm missing here. If there's a single ITS in the
system, node affinity must cover the whole system, not half of it.

> D06CS has 2x ITS, as you may know :)
> 
> And, FWIW, the device is on the 2nd socket, numa node #2.

You've lost me. Single ITS, but two sockets?

> 
> So the cpu mask of node #0 (where the ITS lives) is 0-23. So no
> intersection with what userspace requested.
> 
>>> 	if (cpu < 0 || cpu >= nr_cpu_ids)
>>> 		return -EINVAL;
>>> 
>>> 	if (cpu != its_dev->event_map.col_map[id]) {
>>> 		its_inc_lpi_count(d, cpu);
>>> 		its_dec_lpi_count(d, its_dev->event_map.col_map[id]);
>>> 		target_col = &its_dev->its->collections[cpu];
>>> 		its_send_movi(its_dev, target_col, id);
>>> 		its_dev->event_map.col_map[id] = cpu;
>>> 		irq_data_update_effective_affinity(d, cpumask_of(cpu));
>>> 	}
>>> 
>>> So cpu may not be a member of mask_val. Hence the inconsistency of 
>>> the
>>> affinity list and effective affinity. We could just drop the AND of
>>> the ITS node mask in its_select_cpu().
>> 
>> That would be a departure from the algorithm Thomas proposed, which 
>> made
>> a lot of sense in my opinion. What its_select_cpu() does in this case 
>> is
>> probably the best that can be achieved from a latency perspective,
>> as it keeps the interrupt local to the socket that generated it.
> 
> We seem to be following what Thomas described for a non-managed
> interrupt bound to a node. But is this interrupt bound to the node?

If the ITS advertizes affinity to a node (through SRAT, for example),
we should use that. And that's what we have in this patch.

> Regardless of that, what you're saying seems right - keep local
> interrupt bound to the node. But the problem is that userspace is
> doing its own thing.

Unless you tell the interrupt subsystem that userspace cannot balance
this interrupt, it can happen.
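
For instance (illustration only), a driver can take an interrupt out of
userspace balancing altogether, in which case writes to
/proc/irq/<n>/smp_affinity are rejected:

	irq_set_status_flags(irq, IRQ_NO_BALANCING);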

> BTW, sorry if any text formatting is mangled. I have to improve my WFH 
> setup....

You're doing fine! ;-)

         M.
John Garry March 18, 2020, 7 p.m. UTC | #10
Hi Marc,

>> And for some reason fancied cpu62.
> 
> Hmmm. OK. I'm surprised that irqbalance dries to set a range of CPUs, 
> instead of
> a particular CPU though.

It does seem strange. But also quite consistent. I will check again on that.

>>>
>>> But it has the mask for CPUs that are best suited for this interrupt,
>>> right?
>>> If I understand the topology of your machine, it has an ITS per 64 CPUs,
>>> and
>>> this device is connected to the ITS that serves the second socket.
>>
>> No, this one (D06ES) has a single ITS:
>>
>> john@ubuntu:~/kernel-dev$ dmesg | grep ITS
>> [    0.000000] SRAT: PXM 0 -> ITS 0 -> Node 0
>> [    0.000000] ITS [mem 0x202100000-0x20211ffff]
>> [    0.000000] ITS@0x0000000202100000: Using ITS number 0
>> [    0.000000] ITS@0x0000000202100000: allocated 8192 Devices
>> @23ea9f0000 (indirect, esz 8, psz 16K, shr 1)
>> [    0.000000] ITS@0x0000000202100000: allocated 2048 Virtual CPUs
>> @23ea9d8000 (indirect, esz 16, psz 4K, shr 1)
>> [    0.000000] ITS@0x0000000202100000: allocated 256 Interrupt
>> Collections @23ea9d3000 (flat, esz 16, psz 4K, shr 1)
>> [    0.000000] ITS: Using DirectLPI for VPE invalidation
>> [    0.000000] ITS: Enabling GICv4 support
>> [    0.044034] Platform MSI: ITS@0x202100000 domain created
>> [    0.044042] PCI/MSI: ITS@0x202100000 domain created
> 
> There's something I'm missing here. If there's a single ITS in the system,
> node affinity must cover the whole system, not half of it.
> 
>> D06CS has 2x ITS, as you may know :)
>>
>> And, FWIW, the device is on the 2nd socket, numa node #2.
> 
> You've lost me. Single ITS, but two sockets?

Yeah, right, so I think that a single ITS is used due to some HW bug in 
the ES chip, fixed in the CS chip.

And some more background on the D05, D06ES, D06CS topology:

Even though the system is 2x socket, we model it as 4x NUMA nodes, i.e.
2x nodes per socket. This is because each node has an associated memory
controller in the socket, i.e. 2x memory controllers per socket. As
such, for this D06ES system, a NUMA node is 24 cores.

I will be the first to admit that it does make things more complicated,
especially (and arguably broken) when we need to assign a proximity
domain to devices in either socket, considering they are equidistant
from either memory controller/CPU cluster in that socket.

> 
>>
>> So the cpu mask of node #0 (where the ITS lives) is 0-23. So no
>> intersection with what userspace requested.
>>
>>>>     if (cpu < 0 || cpu >= nr_cpu_ids)
>>>>         return -EINVAL;
>>>>
>>>>     if (cpu != its_dev->event_map.col_map[id]) {
>>>>         its_inc_lpi_count(d, cpu);
>>>>         its_dec_lpi_count(d, its_dev->event_map.col_map[id]);
>>>>         target_col = &its_dev->its->collections[cpu];
>>>>         its_send_movi(its_dev, target_col, id);
>>>>         its_dev->event_map.col_map[id] = cpu;
>>>>         irq_data_update_effective_affinity(d, cpumask_of(cpu));
>>>>     }
>>>>
>>>> So cpu may not be a member of mask_val. Hence the inconsistency of the
>>>> affinity list and effective affinity. We could just drop the AND of
>>>> the ITS node mask in its_select_cpu().
>>>
>>> That would be a departure from the algorithm Thomas proposed, which made
>>> a lot of sense in my opinion. What its_select_cpu() does in this case is
>>> probably the best that can be achieved from a latency perspective,
>>> as it keeps the interrupt local to the socket that generated it.
>>
>> We seem to be following what Thomas described for a non-managed
>> interrupt bound to a node. But is this interrupt bound to the node?
> 
> If the ITS advertizes affinity to a node (through SRAT, for example),
> we should use that. And that's what we have in this patch.

Right, but my system is incompatible. Reason being, SRAT says the ITS
is in NUMA node #0 (I think choosing node #0 over #1 may be just
arbitrary), and the cpu mask for NUMA node #0 is 0-23, as above. And I
figure that even D06CS, with 2x ITS, is incompatible for the same
reason.

So your expectation for a single ITS system would be that the NUMA node 
cpu mask for the ITS would cover all cpus. Sadly, it doesn't here...

Much appreciated,
John
John Garry March 27, 2020, 5:52 p.m. UTC | #11
> +
> +/*
> + * As suggested by Thomas Gleixner in:
> + * https://lore.kernel.org/r/87h80q2aoc.fsf@nanos.tec.linutronix.de
> + */
> +static int its_select_cpu(struct irq_data *d,
> +			  const struct cpumask *aff_mask)
> +{
> +	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
> +	cpumask_var_t tmpmask;
> +	int cpu, node;
> +
> +	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	node = its_dev->its->numa_node;
> +
> +	if (!irqd_affinity_is_managed(d)) {
> +		/* First try the NUMA node */
> +		if (node != NUMA_NO_NODE) {
> +			/*
> +			 * Try the intersection of the affinity mask and the
> +			 * node mask (and the online mask, just to be safe).
> +			 */
> +			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
> +			cpumask_and(tmpmask, tmpmask, cpu_online_mask);
> +
> +			/* If that doesn't work, try the nodemask itself */
> +			if (cpumask_empty(tmpmask))
> +				cpumask_and(tmpmask, cpumask_of_node(node), cpu_online_mask);
> +
> +			cpu = cpumask_pick_least_loaded(d, tmpmask);
> +			if (cpu < nr_cpu_ids)
> +				goto out;
> +
> +			/* If we can't cross sockets, give up */
> +			if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144))
> +				goto out;
> +
> +			/* If the above failed, expand the search */
> +		}
> +
> +		/* Try the intersection of the affinity and online masks */
> +		cpumask_and(tmpmask, aff_mask, cpu_online_mask);
> +
> +		/* If that doesn't fly, the online mask is the last resort */
> +		if (cpumask_empty(tmpmask))
> +			cpumask_copy(tmpmask, cpu_online_mask);
> +
> +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> +	} else {


Hi Marc,


> +		cpumask_and(tmpmask, irq_data_get_affinity_mask(d), cpu_online_mask);
> +

Please consider this flow:

- in its_irq_domain_activate()->its_select_cpu(), for a managed
interrupt we select the target cpu from the interrupt affinity mask

- then in the its_set_affinity() call for the same interrupt, we may
needlessly reselect the target cpu. This is because in the
its_set_affinity()->its_select_cpu() call, we account for that interrupt
in the load of the current target cpu, and may find a lesser-loaded cpu
in the mask and switch.

For example, from mask 0-5 we select cpu0 initially. Then on the 2nd
call, we find cpu0 has a greater load (1) than cpu1 (0), and switch to
cpu1.
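
One way I could imagine addressing that (just a sketch, not a tested
patch) is to drop the interrupt's own contribution before re-running the
selection, so that staying put and moving are compared on equal terms:

	int prev_cpu = its_dev->event_map.col_map[id];

	/* Remove this interrupt from its current CPU's count first */
	its_dec_lpi_count(d, prev_cpu);

	if (!force)
		cpu = its_select_cpu(d, mask_val);
	else
		cpu = cpumask_pick_least_loaded(d, mask_val);

	if (cpu < 0 || cpu >= nr_cpu_ids) {
		/* Restore the count and bail out */
		its_inc_lpi_count(d, prev_cpu);
		return -EINVAL;
	}

	/* Re-account on the chosen CPU (possibly the same one as before) */
	its_inc_lpi_count(d, cpu);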

Cheers,
John


> +		/* If we cannot cross sockets, limit the search to that node */
> +		if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) &&
> +		    node != NUMA_NO_NODE)
> +			cpumask_and(tmpmask, tmpmask, cpumask_of_node(node));
> +
> +		cpu = cpumask_pick_least_loaded(d, tmpmask);
> +	}
> +out:
> +	free_cpumask_var(tmpmask);
> +
> +	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, cpumask_pr_args(aff_mask), cpu);
> +	return cpu;
> +}

Patch

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 941786e1e8f7..7f1b731c04bb 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1531,31 +1531,107 @@  static void its_dec_lpi_count(struct irq_data *d, int cpu)
 		atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->unmanaged);
 }
 
+static unsigned int cpumask_pick_least_loaded(struct irq_data *d,
+					      const struct cpumask *cpu_mask)
+{
+	unsigned int cpu = nr_cpu_ids, tmp;
+	int count = S32_MAX;
+
+	for_each_cpu(tmp, cpu_mask) {
+		int this_count = its_read_lpi_count(d, tmp);
+		if (this_count < count) {
+			cpu = tmp;
+		        count = this_count;
+		}
+	}
+
+	return cpu;
+}
+
+/*
+ * As suggested by Thomas Gleixner in:
+ * https://lore.kernel.org/r/87h80q2aoc.fsf@nanos.tec.linutronix.de
+ */
+static int its_select_cpu(struct irq_data *d,
+			  const struct cpumask *aff_mask)
+{
+	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
+	cpumask_var_t tmpmask;
+	int cpu, node;
+
+	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	node = its_dev->its->numa_node;
+
+	if (!irqd_affinity_is_managed(d)) {
+		/* First try the NUMA node */
+		if (node != NUMA_NO_NODE) {
+			/*
+			 * Try the intersection of the affinity mask and the
+			 * node mask (and the online mask, just to be safe).
+			 */
+			cpumask_and(tmpmask, cpumask_of_node(node), aff_mask);
+			cpumask_and(tmpmask, tmpmask, cpu_online_mask);
+
+			/* If that doesn't work, try the nodemask itself */
+			if (cpumask_empty(tmpmask))
+				cpumask_and(tmpmask, cpumask_of_node(node), cpu_online_mask);
+
+			cpu = cpumask_pick_least_loaded(d, tmpmask);
+			if (cpu < nr_cpu_ids)
+				goto out;
+
+			/* If we can't cross sockets, give up */
+			if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144))
+				goto out;
+
+			/* If the above failed, expand the search */
+		}
+
+		/* Try the intersection of the affinity and online masks */
+		cpumask_and(tmpmask, aff_mask, cpu_online_mask);
+
+		/* If that doesn't fly, the online mask is the last resort */
+		if (cpumask_empty(tmpmask))
+			cpumask_copy(tmpmask, cpu_online_mask);
+
+		cpu = cpumask_pick_least_loaded(d, tmpmask);
+	} else {
+		cpumask_and(tmpmask, irq_data_get_affinity_mask(d), cpu_online_mask);
+
+		/* If we cannot cross sockets, limit the search to that node */
+		if ((its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) &&
+		    node != NUMA_NO_NODE)
+			cpumask_and(tmpmask, tmpmask, cpumask_of_node(node));
+
+		cpu = cpumask_pick_least_loaded(d, tmpmask);
+	}
+out:
+	free_cpumask_var(tmpmask);
+
+	pr_debug("IRQ%d -> %*pbl CPU%d\n", d->irq, cpumask_pr_args(aff_mask), cpu);
+	return cpu;
+}
+
 static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
 			    bool force)
 {
-	unsigned int cpu;
-	const struct cpumask *cpu_mask = cpu_online_mask;
 	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
 	struct its_collection *target_col;
 	u32 id = its_get_event_id(d);
+	int cpu;
 
 	/* A forwarded interrupt should use irq_set_vcpu_affinity */
 	if (irqd_is_forwarded_to_vcpu(d))
 		return -EINVAL;
 
-       /* lpi cannot be routed to a redistributor that is on a foreign node */
-	if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
-		if (its_dev->its->numa_node >= 0) {
-			cpu_mask = cpumask_of_node(its_dev->its->numa_node);
-			if (!cpumask_intersects(mask_val, cpu_mask))
-				return -EINVAL;
-		}
-	}
-
-	cpu = cpumask_any_and(mask_val, cpu_mask);
+	if (!force)
+		cpu = its_select_cpu(d, mask_val);
+	else
+		cpu = cpumask_pick_least_loaded(d, mask_val);
 
-	if (cpu >= nr_cpu_ids)
+	if (cpu < 0 || cpu >= nr_cpu_ids)
 		return -EINVAL;
 
 	/* don't set the affinity when the target cpu is same as current one */
@@ -3455,21 +3531,11 @@  static int its_irq_domain_activate(struct irq_domain *domain,
 {
 	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
 	u32 event = its_get_event_id(d);
-	const struct cpumask *cpu_mask = cpu_online_mask;
 	int cpu;
 
-	/* get the cpu_mask of local node */
-	if (its_dev->its->numa_node >= 0)
-		cpu_mask = cpumask_of_node(its_dev->its->numa_node);
-
-	/* Bind the LPI to the first possible CPU */
-	cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
-	if (cpu >= nr_cpu_ids) {
-		if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
-			return -EINVAL;
-
-		cpu = cpumask_first(cpu_online_mask);
-	}
+	cpu = its_select_cpu(d, cpu_online_mask);
+	if (cpu < 0 || cpu >= nr_cpu_ids)
+		return -EINVAL;
 
 	its_inc_lpi_count(d, cpu);
 	its_dev->event_map.col_map[event] = cpu;