diff mbox series

net/mlx5: Add affinity for each irq

Message ID 20220606071351.3550997-1-yajun.deng@linux.dev (mailing list archive)
State Rejected
Headers show
Series net/mlx5: Add affinity for each irq | expand

Commit Message

Yajun Deng June 6, 2022, 7:13 a.m. UTC
The mlx5 driver allocates at least one IRQ per CPU, so we can bind each
IRQ to a CPU to improve interrupt performance.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
---
 .../net/ethernet/mellanox/mlx5/core/pci_irq.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

Comments

Shay Drory June 6, 2022, 8:31 a.m. UTC | #1
On 6/6/2022 10:13, Yajun Deng wrote:
> The mlx5 would allocate no less than one irq for per cpu, we can bond each
> irq to a cpu to improve interrupt performance.

The maximum number of affinity sets is hard-coded to 4. In case
nvec > 4 * (num_CPUs)[1], we will hit the following WARN[2].
Also, we hit an oops following this WARN...

[1]
mlx5 supports up to 2K MSIX vectors (depending on the HW). E.g., if we max out
the mlx5 MSIX capability, we will cross this limit on any machine, at least
any that I know of.

[2]

This is a machine with 10 CPUs and 350 MSIX

[    1.633436] ------------[ cut here ]------------
  [    1.633437] WARNING: CPU: 2 PID: 194 at kernel/irq/affinity.c:443 irq_create_affinity_masks+0x175/0x270
  [    1.633467] Modules linked in: mlx5_core(+)
  [    1.633474] CPU: 2 PID: 194 Comm: systemd-modules Not tainted 5.18.0+ #1
  [    1.633480] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  [    1.633483] RIP: 0010:irq_create_affinity_masks+0x175/0x270
  [    1.633492] Code: 5c 41 5d 41 5e 41 5f c3 48 c7 46 20 90 6d 19 81 48 c7 c0 90 6d 19 81 8b 34 24 4c 89 ef ff d0 41 83 7d 08 04 0f 86 de fe ff ff <0f> 0b 45 31 f6 eb c5 45 8b 5d 00 8b 34 24 43 8d 04 1f 42 8d 0c 1e
  [    1.633497] RSP: 0018:ffff88810716bac0 EFLAGS: 00010202
  [    1.633501] RAX: 000000000000000a RBX: 0000000000000001 RCX: 0000000000000200
  [    1.633504] RDX: ffffffff82605000 RSI: ffffffff82605000 RDI: 0000000000000000
  [    1.633507] RBP: ffff88810716bbd0 R08: 000000000000000a R09: ffffffff82604fc0
  [    1.633510] R10: 0000000000000008 R11: 000ffffffffff000 R12: 0000000000000000
  [    1.633513] R13: ffff88810716bbd0 R14: 0000000000000160 R15: 0000000000000160
  [    1.633516] FS:  00007f8d72994b80(0000) GS:ffff88852c900000(0000) knlGS:0000000000000000
  [    1.633525] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [    1.633528] CR2: 00007f8d73ba4490 CR3: 0000000103fce001 CR4: 0000000000370ea0
  [    1.633531] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [    1.633534] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [    1.633536] Call Trace:
  [    1.633549]  <TASK>
  [    1.633553]  __pci_enable_msix_range+0x2b9/0x4c0
  [    1.633572]  pci_alloc_irq_vectors_affinity+0xa5/0x100
  [    1.633579]  mlx5_irq_table_create.cold+0x6d/0x22f [mlx5_core]
  [    1.634032]  ? probe_one+0x1aa/0x280 [mlx5_core]
  [    1.634193]  ? pci_device_probe+0xa4/0x140
  [    1.634201]  ? really_probe+0xc9/0x350
  [    1.634205]  ? pm_runtime_barrier+0x43/0x80
  [    1.634213]  ? __driver_probe_device+0x80/0x170
  [    1.634218]  ? driver_probe_device+0x1e/0x90
  [    1.634223]  ? __driver_attach+0xcd/0x1b0
  [    1.634226]  ? __device_attach_driver+0xf0/0xf0
  [    1.634231]  ? __device_attach_driver+0xf0/0xf0
  [    1.634235]  ? bus_for_each_dev+0x77/0xc0
  [    1.634243]  ? bus_add_driver+0x184/0x1f0
  [    1.634247]  ? driver_register+0x8f/0xe0
  [    1.634251]  ? 0xffffffffa0180000
  [    1.634256]  ? init+0x62/0x1000 [mlx5_core]
  [    1.634413]  ? do_one_initcall+0x4a/0x1e0
  [    1.634418]  ? kmem_cache_alloc_trace+0x33/0x420
  [    1.634426]  ? do_init_module+0x72/0x260
  [    1.634434]  ? __do_sys_finit_module+0xbb/0x130
  [    1.634443]  ? do_syscall_64+0x3d/0x90
  [    1.634452]  ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
  [    1.634461]  </TASK>
  [    1.634463] ---[ end trace 0000000000000000 ]---
  [  OK  ] Finished udev Coldplug all Devices.
  [    1.713428] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: mlx5_irq_table_create+0x9c/0xa0 [mlx5_core]
  [    1.715521] CPU: 2 PID: 194 Comm: systemd-modules Tainted: G        W         5.18.0+ #1
  [    1.715524] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  [    1.715525] Call Trace:
  [    1.715532]  <TASK>
  [    1.715533]  dump_stack_lvl+0x34/0x44
  [    1.715538]  panic+0x100/0x255
  [    1.715542]  ? mlx5_irq_table_create+0x9c/0xa0 [mlx5_core]
  [    1.715602]  __stack_chk_fail+0x10/0x10
  [    1.715607]  mlx5_irq_table_create+0x9c/0xa0 [mlx5_core]
  [    1.715662]  ? probe_one+0x1aa/0x280 [mlx5_core]
  [    1.715709]  ? pci_device_probe+0xa4/0x140
  [    1.715712]  ? really_probe+0xc9/0x350
  [    1.715715]  ? pm_runtime_barrier+0x43/0x80
  [    1.715718]  ? __driver_probe_device+0x80/0x170
  [    1.715719]  ? driver_probe_device+0x1e/0x90
  [    1.715721]  ? __driver_attach+0xcd/0x1b0
  [    1.715722]  ? __device_attach_driver+0xf0/0xf0
  [    1.715723]  ? __device_attach_driver+0xf0/0xf0
  [    1.715724]  ? bus_for_each_dev+0x77/0xc0
  [    1.715727]  ? bus_add_driver+0x184/0x1f0
  [    1.715728]  ? driver_register+0x8f/0xe0
  [    1.715730]  ? 0xffffffffa0180000
  [    1.715731]  ? init+0x62/0x1000 [mlx5_core]
  [    1.715778]  ? do_one_initcall+0x4a/0x1e0
  [    1.715781]  ? kmem_cache_alloc_trace+0x33/0x420
  [    1.715784]  ? do_init_module+0x72/0x260
  [    1.715788]  ? __do_sys_finit_module+0xbb/0x130
  [    1.715790]  ? do_syscall_64+0x3d/0x90
  [    1.715792]  ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
  [    1.715796]  </TASK>
  [    1.715938] Kernel Offset: disabled
  [    1.732563] ---[ end Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: mlx5_irq_table_create+0x9c/0xa0 [mlx5_core] ]---

>
> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
> ---
>   .../net/ethernet/mellanox/mlx5/core/pci_irq.c | 19 ++++++++++++++++++-
>   1 file changed, 18 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
> index 662f1d55e30e..d13fc403fe78 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
> @@ -624,11 +624,27 @@ int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
>   	return table->pf_pool->xa_num_irqs.max - table->pf_pool->xa_num_irqs.min;
>   }
>   
> +static void mlx5_calc_sets(struct irq_affinity *affd, unsigned int nvecs)
> +{
> +	int i;
> +
> +	affd->nr_sets = (nvecs - 1) / num_possible_cpus() + 1;
> +
> +	for (i = 0; i < affd->nr_sets; i++) {
> +		affd->set_size[i] = min(nvecs, num_possible_cpus());
> +		nvecs -= num_possible_cpus();
> +	}
> +}
> +
>   int mlx5_irq_table_create(struct mlx5_core_dev *dev)
>   {
>   	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
>   		      MLX5_CAP_GEN(dev, max_num_eqs) :
>   		      1 << MLX5_CAP_GEN(dev, log_max_eq);
> +	struct irq_affinity affd = {
> +		.pre_vectors = 0,
> +		.calc_sets   = mlx5_calc_sets,
> +	};
>   	int total_vec;
>   	int pf_vec;
>   	int err;
> @@ -644,7 +660,8 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev)
>   		total_vec += MLX5_IRQ_CTRL_SF_MAX +
>   			MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev);
>   
> -	total_vec = pci_alloc_irq_vectors(dev->pdev, 1, total_vec, PCI_IRQ_MSIX);
> +	total_vec = pci_alloc_irq_vectors_affinity(dev->pdev, 1, total_vec,
> +						   PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &affd);
>   	if (total_vec < 0)
>   		return total_vec;
>   	pf_vec = min(pf_vec, total_vec);
Yajun Deng June 6, 2022, 10:29 a.m. UTC | #2
June 6, 2022 4:31 PM, "Shay Drory" <shayd@nvidia.com> wrote:

> On 6/6/2022 10:13, Yajun Deng wrote:
> 
>> The mlx5 would allocate no less than one irq for per cpu, we can bond each
>> irq to a cpu to improve interrupt performance.
> 
> The maximum number of affinity set is hard coded to 4. in case nvec > 4 * (num_CPUs)[1]
> we will hit the following WARN[2].
> Also, we hit an oops following this WARN...
> 
> [1]
> mlx5 support up to 2K MSIX (depends on the HW). e.g.: if we max out mlx5 MSIX capability,
> we will cross this limit on any machine, at least that I know of.
> 

Oh, I didn't expect so many MSIX. Thank you!
> [2]
> 
> This is a machine with 10 CPUs and 350 MSIX
> 
> [ 1.633436] ------------[ cut here ]------------
> [ 1.633437] WARNING: CPU: 2 PID: 194 at kernel/irq/affinity.c:443
> irq_create_affinity_masks+0x175/0x270
> [ 1.633467] Modules linked in: mlx5_core(+)
> [ 1.633474] CPU: 2 PID: 194 Comm: systemd-modules Not tainted 5.18.0+ #1
> [ 1.633480] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
> rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
> [ 1.633483] RIP: 0010:irq_create_affinity_masks+0x175/0x270
> [ 1.633492] Code: 5c 41 5d 41 5e 41 5f c3 48 c7 46 20 90 6d 19 81 48 c7 c0 90 6d 19 81 8b 34 24 4c
> 89 ef ff d0 41 83 7d 08 04 0f 86 de fe ff ff <0f> 0b 45 31 f6 eb c5 45 8b 5d 00 8b 34 24 43 8d 04
> 1f 42 8d 0c 1e
> [ 1.633497] RSP: 0018:ffff88810716bac0 EFLAGS: 00010202
> [ 1.633501] RAX: 000000000000000a RBX: 0000000000000001 RCX: 0000000000000200
> [ 1.633504] RDX: ffffffff82605000 RSI: ffffffff82605000 RDI: 0000000000000000
> [ 1.633507] RBP: ffff88810716bbd0 R08: 000000000000000a R09: ffffffff82604fc0
> [ 1.633510] R10: 0000000000000008 R11: 000ffffffffff000 R12: 0000000000000000
> [ 1.633513] R13: ffff88810716bbd0 R14: 0000000000000160 R15: 0000000000000160
> [ 1.633516] FS: 00007f8d72994b80(0000) GS:ffff88852c900000(0000) knlGS:0000000000000000
> [ 1.633525] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1.633528] CR2: 00007f8d73ba4490 CR3: 0000000103fce001 CR4: 0000000000370ea0
> [ 1.633531] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 1.633534] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [ 1.633536] Call Trace:
> [ 1.633549] <TASK>
> [ 1.633553] __pci_enable_msix_range+0x2b9/0x4c0
> [ 1.633572] pci_alloc_irq_vectors_affinity+0xa5/0x100
> [ 1.633579] mlx5_irq_table_create.cold+0x6d/0x22f [mlx5_core]
> [ 1.634032] ? probe_one+0x1aa/0x280 [mlx5_core]
> [ 1.634193] ? pci_device_probe+0xa4/0x140
> [ 1.634201] ? really_probe+0xc9/0x350
> [ 1.634205] ? pm_runtime_barrier+0x43/0x80
> [ 1.634213] ? __driver_probe_device+0x80/0x170
> [ 1.634218] ? driver_probe_device+0x1e/0x90
> [ 1.634223] ? __driver_attach+0xcd/0x1b0
> [ 1.634226] ? __device_attach_driver+0xf0/0xf0
> [ 1.634231] ? __device_attach_driver+0xf0/0xf0
> [ 1.634235] ? bus_for_each_dev+0x77/0xc0
> [ 1.634243] ? bus_add_driver+0x184/0x1f0
> [ 1.634247] ? driver_register+0x8f/0xe0
> [ 1.634251] ? 0xffffffffa0180000
> [ 1.634256] ? init+0x62/0x1000 [mlx5_core]
> [ 1.634413] ? do_one_initcall+0x4a/0x1e0
> [ 1.634418] ? kmem_cache_alloc_trace+0x33/0x420
> [ 1.634426] ? do_init_module+0x72/0x260
> [ 1.634434] ? __do_sys_finit_module+0xbb/0x130
> [ 1.634443] ? do_syscall_64+0x3d/0x90
> [ 1.634452] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
> [ 1.634461] </TASK>
> [ 1.634463] ---[ end trace 0000000000000000 ]---
> [ OK ] Finished udev Coldplug all Devices.
> [ 1.713428] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in:
> mlx5_irq_table_create+0x9c/0xa0 [mlx5_core]
> [ 1.715521] CPU: 2 PID: 194 Comm: systemd-modules Tainted: G W 5.18.0+ #1
> [ 1.715524] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
> rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
> [ 1.715525] Call Trace:
> [ 1.715532] <TASK>
> [ 1.715533] dump_stack_lvl+0x34/0x44
> [ 1.715538] panic+0x100/0x255
> [ 1.715542] ? mlx5_irq_table_create+0x9c/0xa0 [mlx5_core]
> [ 1.715602] __stack_chk_fail+0x10/0x10
> [ 1.715607] mlx5_irq_table_create+0x9c/0xa0 [mlx5_core]
> [ 1.715662] ? probe_one+0x1aa/0x280 [mlx5_core]
> [ 1.715709] ? pci_device_probe+0xa4/0x140
> [ 1.715712] ? really_probe+0xc9/0x350
> [ 1.715715] ? pm_runtime_barrier+0x43/0x80
> [ 1.715718] ? __driver_probe_device+0x80/0x170
> [ 1.715719] ? driver_probe_device+0x1e/0x90
> [ 1.715721] ? __driver_attach+0xcd/0x1b0
> [ 1.715722] ? __device_attach_driver+0xf0/0xf0
> [ 1.715723] ? __device_attach_driver+0xf0/0xf0
> [ 1.715724] ? bus_for_each_dev+0x77/0xc0
> [ 1.715727] ? bus_add_driver+0x184/0x1f0
> [ 1.715728] ? driver_register+0x8f/0xe0
> [ 1.715730] ? 0xffffffffa0180000
> [ 1.715731] ? init+0x62/0x1000 [mlx5_core]
> [ 1.715778] ? do_one_initcall+0x4a/0x1e0
> [ 1.715781] ? kmem_cache_alloc_trace+0x33/0x420
> [ 1.715784] ? do_init_module+0x72/0x260
> [ 1.715788] ? __do_sys_finit_module+0xbb/0x130
> [ 1.715790] ? do_syscall_64+0x3d/0x90
> [ 1.715792] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
> [ 1.715796] </TASK>
> [ 1.715938] Kernel Offset: disabled
> [ 1.732563] ---[ end Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in:
> mlx5_irq_table_create+0x9c/0xa0 [mlx5_core] ]---
> 
>> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
>> ---
>> .../net/ethernet/mellanox/mlx5/core/pci_irq.c | 19 ++++++++++++++++++-
>> 1 file changed, 18 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
>> b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
>> index 662f1d55e30e..d13fc403fe78 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
>> @@ -624,11 +624,27 @@ int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
>> return table->pf_pool->xa_num_irqs.max - table->pf_pool->xa_num_irqs.min;
>> }
>>
>> +static void mlx5_calc_sets(struct irq_affinity *affd, unsigned int nvecs)
>> +{
>> + int i;
>> +
>> + affd->nr_sets = (nvecs - 1) / num_possible_cpus() + 1;
>> +
>> + for (i = 0; i < affd->nr_sets; i++) {
>> + affd->set_size[i] = min(nvecs, num_possible_cpus());
>> + nvecs -= num_possible_cpus();
>> + }
>> +}
>> +
>> int mlx5_irq_table_create(struct mlx5_core_dev *dev)
>> {
>> int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
>> MLX5_CAP_GEN(dev, max_num_eqs) :
>> 1 << MLX5_CAP_GEN(dev, log_max_eq);
>> + struct irq_affinity affd = {
>> + .pre_vectors = 0,
>> + .calc_sets = mlx5_calc_sets,
>> + };
>> int total_vec;
>> int pf_vec;
>> int err;
>> @@ -644,7 +660,8 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev)
>> total_vec += MLX5_IRQ_CTRL_SF_MAX +
>> MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev);
>>
>> - total_vec = pci_alloc_irq_vectors(dev->pdev, 1, total_vec, PCI_IRQ_MSIX);
>> + total_vec = pci_alloc_irq_vectors_affinity(dev->pdev, 1, total_vec,
>> + PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &affd);
>> if (total_vec < 0)
>> return total_vec;
>> pf_vec = min(pf_vec, total_vec);
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 662f1d55e30e..d13fc403fe78 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -624,11 +624,27 @@  int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
 	return table->pf_pool->xa_num_irqs.max - table->pf_pool->xa_num_irqs.min;
 }
 
+static void mlx5_calc_sets(struct irq_affinity *affd, unsigned int nvecs)
+{
+	int i;
+
+	affd->nr_sets = (nvecs - 1) / num_possible_cpus() + 1;
+
+	for (i = 0; i < affd->nr_sets; i++) {
+		affd->set_size[i] = min(nvecs, num_possible_cpus());
+		nvecs -= num_possible_cpus();
+	}
+}
+
 int mlx5_irq_table_create(struct mlx5_core_dev *dev)
 {
 	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
 		      MLX5_CAP_GEN(dev, max_num_eqs) :
 		      1 << MLX5_CAP_GEN(dev, log_max_eq);
+	struct irq_affinity affd = {
+		.pre_vectors = 0,
+		.calc_sets   = mlx5_calc_sets,
+	};
 	int total_vec;
 	int pf_vec;
 	int err;
@@ -644,7 +660,8 @@  int mlx5_irq_table_create(struct mlx5_core_dev *dev)
 		total_vec += MLX5_IRQ_CTRL_SF_MAX +
 			MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev);
 
-	total_vec = pci_alloc_irq_vectors(dev->pdev, 1, total_vec, PCI_IRQ_MSIX);
+	total_vec = pci_alloc_irq_vectors_affinity(dev->pdev, 1, total_vec,
+						   PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &affd);
 	if (total_vec < 0)
 		return total_vec;
 	pf_vec = min(pf_vec, total_vec);