diff mbox series

[v1,net-next,2/6] net: napi: add CPU affinity to napi->config

Message ID 20241210002626.366878-3-ahmed.zaki@intel.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series net: napi: add CPU affinity to napi->config | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 39 this patch: 39
netdev/build_tools success Errors and warnings before: 0 (+23) this patch: 0 (+23)
netdev/cc_maintainers warning 1 maintainers not CCed: horms@kernel.org
netdev/build_clang success Errors and warnings before: 63 this patch: 63
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4101 this patch: 4101
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 81 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 95 this patch: 95
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-12-11--03-01 (tests: 763)

Commit Message

Ahmed Zaki Dec. 10, 2024, 12:26 a.m. UTC
A common task for most drivers is to remember the user's CPU affinity to
its IRQs. On each netdev reset, the driver must then re-assign the
user's setting to the IRQs.

Add CPU affinity mask to napi->config. To delegate the CPU affinity
management to the core, drivers must:
 1 - add a persistent napi config:     netif_napi_add_config()
 2 - bind an IRQ to the napi instance: netif_napi_set_irq()

The core will then make sure to re-assign the user's affinity setting to
the napi's IRQ.

The default affinity mask assigned to all IRQs is the set of all online CPUs.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
---
 include/linux/netdevice.h |  6 ++++++
 net/core/dev.c            | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

Comments

Joe Damato Dec. 10, 2024, 1:29 a.m. UTC | #1
On Mon, Dec 09, 2024 at 05:26:22PM -0700, Ahmed Zaki wrote:
> A common task for most drivers is to remember the user's CPU affinity to
> its IRQs. On each netdev reset, the driver must then re-assign the
> user's setting to the IRQs.
> 
> Add CPU affinity mask to napi->config. To delegate the CPU affinity
> management to the core, drivers must:
>  1 - add a persistent napi config:     netif_napi_add_config()
>  2 - bind an IRQ to the napi instance: netif_napi_set_irq()
> 
> the core will then make sure to use re-assign affinity to the napi's
> IRQ.
> 
> The default mask set to all IRQs is all online CPUs.
> 
> Suggested-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
> ---

[...]

> diff --git a/net/core/dev.c b/net/core/dev.c
> index 6ef9eb401fb2..778ba27d2b83 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6699,11 +6699,35 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
>  }
>  EXPORT_SYMBOL(netif_queue_set_napi);
>  
> +static void
> +netif_napi_affinity_notify(struct irq_affinity_notify *notify,
> +			   const cpumask_t *mask)
> +{
> +	struct napi_struct *napi =
> +		container_of(notify, struct napi_struct, affinity_notify);
> +
> +	if (napi->config)
> +		cpumask_copy(&napi->config->affinity_mask, mask);
> +}
> +
> +static void
> +netif_napi_affinity_release(struct kref __always_unused *ref)
> +{
> +}
> +
>  static void napi_restore_config(struct napi_struct *n)
>  {
>  	n->defer_hard_irqs = n->config->defer_hard_irqs;
>  	n->gro_flush_timeout = n->config->gro_flush_timeout;
>  	n->irq_suspend_timeout = n->config->irq_suspend_timeout;
> +
> +	if (n->irq > 0 && n->irq_flags & NAPIF_F_IRQ_AFFINITY) {
> +		n->affinity_notify.notify = netif_napi_affinity_notify;
> +		n->affinity_notify.release = netif_napi_affinity_release;
> +		irq_set_affinity_notifier(n->irq, &n->affinity_notify);
> +		irq_set_affinity(n->irq, &n->config->affinity_mask);
> +	}
> +
>  	/* a NAPI ID might be stored in the config, if so use it. if not, use
>  	 * napi_hash_add to generate one for us. It will be saved to the config
>  	 * in napi_disable.
> @@ -6720,6 +6744,8 @@ static void napi_save_config(struct napi_struct *n)
>  	n->config->gro_flush_timeout = n->gro_flush_timeout;
>  	n->config->irq_suspend_timeout = n->irq_suspend_timeout;
>  	n->config->napi_id = n->napi_id;
> +	if (n->irq > 0 && n->irq_flags & NAPIF_F_IRQ_AFFINITY)
> +		irq_set_affinity_notifier(n->irq, NULL);

My understanding when I attempted this was that using generic IRQ
notifiers breaks ARFS [1], because IRQ notifiers only support a
single notifier and so drivers with ARFS can't _also_ set their own
notifiers for that.

Two ideas were proposed in the thread I mentioned:
  1. Have multiple notifiers per IRQ so that having a generic core
     based notifier wouldn't break ARFS.
  2. Jakub mentioned calling cpu_rmap_update from the core so that a
     generic solution wouldn't be blocked.

I don't know anything about option 1, so I looked at option 2.

At the time when I read the code, it seemed that cpu_rmap_update
required some state be passed in (struct irq_glue), so in that case,
the only way to call cpu_rmap_update from the core would be to
maintain some state about ARFS in the core, too, so that drivers
which support ARFS won't be broken by this change.

At that time there was no persistent per-NAPI config, but since
there is now, there might be a way to solve this.

Just guessing here, but maybe one way to solve this would be to move
ARFS into the core by:
  - Adding a new bit in addition to NAPIF_F_IRQ_AFFINITY... I don't
    know NAPIF_F_ARFS_AFFINITY or something? so that drivers
    could express that they support ARFS.
  - Remove the driver calls to irq_cpu_rmap_add and make sure to
    pass the new bit in for drivers that support ARFS (in your
    changeset, I believe that would be at least ice, mlx4, and
    bnxt... possibly more?).
  - In the generic core code, if the ARFS bit is set then you pass
    in the state needed for ARFS to work, otherwise do what the
    proposed code is doing now.

But, that's just a guess. Maybe there's a better way.

In any case, the proposed change as-is, I think, breaks ARFS in the
same way my proposed change did, so I think any further iterations
on this might need to also include something to avoid breaking ARFS.

FWIW, I was trying to solve a slightly different problem which was
removing a repeated check in driver napi poll functions to determine
if the IRQ affinity had changed.

But with the persistent NAPI state, it might be possible to solve
all of these problems in one go since they are all related?

[1]: https://lore.kernel.org/lkml/701eb84c-8d26-4945-8af3-55a70e05b09c@nvidia.com/
Jakub Kicinski Dec. 11, 2024, 4:21 a.m. UTC | #2
On Mon, 9 Dec 2024 17:29:00 -0800 Joe Damato wrote:
> My understanding when I attempted this was that using generic IRQ
> notifiers breaks ARFS [1], because IRQ notifiers only support a
> single notifier and so drivers with ARFS can't _also_ set their own
> notifiers for that.

Ah, you are so right, I forgot the details and was grepping for
notifier registration :S

> Two ideas were proposed in the thread I mentioned:
>   1. Have multiple notifiers per IRQ so that having a generic core
>      based notifier wouldn't break ARFS.
>   2. Jakub mentioned calling cpu_rmap_update from the core so that a
>      generic solution wouldn't be blocked.
> 
> I don't know anything about option 1, so I looked at option 2.
> 
> At the time when I read the code, it seemed that cpu_rmap_update
> required some state be passed in (struct irq_glue), so in that case,
> the only way to call cpu_rmap_update from the core would be to
> maintain some state about ARFS in the core, too, so that drivers
> which support ARFS won't be broken by this change.
> 
> At that time there was no persistent per-NAPI config, but since
> there is now, there might be a way to solve this.
> 
> Just guessing here, but maybe one way to solve this would be to move
> ARFS into the core by:
>   - Adding a new bit in addition to NAPIF_F_IRQ_AFFINITY... I don't
>     know NAPIF_F_ARFS_AFFINITY or something? so that drivers
>     could express that they support ARFS.
>   - Remove the driver calls to irq_cpu_rmap_add and make sure to
>     pass the new bit in for drivers that support ARFS (in your
>     changeset, I believe that would be at least ice, mlx4, and
>     bnxt... possibly more?).
>   - In the generic core code, if the ARFS bit is set then you pass
>     in the state needed for ARFS to work, otherwise do what the
>     proposed code is doing now.

SG, maybe I'd put the flag(s) in struct net_device, so that we are sure 
the whole device either wants the new behavior or not.

I say flag(s) because idpf probably wants just to opt in for affinity
mask management, without ARFS. So pretty close to Ahmed's existing code.
ARFS support will require another flag to ask for rmap management in
the core as you describe.
Ahmed Zaki Dec. 11, 2024, 4:33 p.m. UTC | #3
On 2024-12-09 6:29 p.m., Joe Damato wrote:
> On Mon, Dec 09, 2024 at 05:26:22PM -0700, Ahmed Zaki wrote:
>> A common task for most drivers is to remember the user's CPU affinity to
>> its IRQs. On each netdev reset, the driver must then re-assign the
>> user's setting to the IRQs.
>>
>> Add CPU affinity mask to napi->config. To delegate the CPU affinity
>> management to the core, drivers must:
>>   1 - add a persistent napi config:     netif_napi_add_config()
>>   2 - bind an IRQ to the napi instance: netif_napi_set_irq()
>>
>> the core will then make sure to use re-assign affinity to the napi's
>> IRQ.
>>
>> The default mask set to all IRQs is all online CPUs.
>>
>> Suggested-by: Jakub Kicinski <kuba@kernel.org>
>> Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
>> ---
> 
> [...]
> 
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 6ef9eb401fb2..778ba27d2b83 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -6699,11 +6699,35 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
>>   }
>>   EXPORT_SYMBOL(netif_queue_set_napi);
>>   
>> +static void
>> +netif_napi_affinity_notify(struct irq_affinity_notify *notify,
>> +			   const cpumask_t *mask)
>> +{
>> +	struct napi_struct *napi =
>> +		container_of(notify, struct napi_struct, affinity_notify);
>> +
>> +	if (napi->config)
>> +		cpumask_copy(&napi->config->affinity_mask, mask);
>> +}
>> +
>> +static void
>> +netif_napi_affinity_release(struct kref __always_unused *ref)
>> +{
>> +}
>> +
>>   static void napi_restore_config(struct napi_struct *n)
>>   {
>>   	n->defer_hard_irqs = n->config->defer_hard_irqs;
>>   	n->gro_flush_timeout = n->config->gro_flush_timeout;
>>   	n->irq_suspend_timeout = n->config->irq_suspend_timeout;
>> +
>> +	if (n->irq > 0 && n->irq_flags & NAPIF_F_IRQ_AFFINITY) {
>> +		n->affinity_notify.notify = netif_napi_affinity_notify;
>> +		n->affinity_notify.release = netif_napi_affinity_release;
>> +		irq_set_affinity_notifier(n->irq, &n->affinity_notify);
>> +		irq_set_affinity(n->irq, &n->config->affinity_mask);
>> +	}
>> +
>>   	/* a NAPI ID might be stored in the config, if so use it. if not, use
>>   	 * napi_hash_add to generate one for us. It will be saved to the config
>>   	 * in napi_disable.
>> @@ -6720,6 +6744,8 @@ static void napi_save_config(struct napi_struct *n)
>>   	n->config->gro_flush_timeout = n->gro_flush_timeout;
>>   	n->config->irq_suspend_timeout = n->irq_suspend_timeout;
>>   	n->config->napi_id = n->napi_id;
>> +	if (n->irq > 0 && n->irq_flags & NAPIF_F_IRQ_AFFINITY)
>> +		irq_set_affinity_notifier(n->irq, NULL);
> 
> My understanding when I attempted this was that using generic IRQ
> notifiers breaks ARFS [1], because IRQ notifiers only support a
> single notifier and so drivers with ARFS can't _also_ set their own
> notifiers for that.

Thank you for the review and reply. I was wondering why some drivers 
check for ARFS (in buggy ways) before setting affinity notifiers. I now 
have a better understanding.

> 
> Two ideas were proposed in the thread I mentioned:
>    1. Have multiple notifiers per IRQ so that having a generic core
>       based notifier wouldn't break ARFS.
>    2. Jakub mentioned calling cpu_rmap_update from the core so that a
>       generic solution wouldn't be blocked.
> 
> I don't know anything about option 1, so I looked at option 2.
> 
> At the time when I read the code, it seemed that cpu_rmap_update
> required some state be passed in (struct irq_glue), so in that case,
> the only way to call cpu_rmap_update from the core would be to
> maintain some state about ARFS in the core, too, so that drivers
> which support ARFS won't be broken by this change.
> 
> At that time there was no persistent per-NAPI config, but since
> there is now, there might be a way to solve this.
> 
> Just guessing here, but maybe one way to solve this would be to move
> ARFS into the core by:
>    - Adding a new bit in addition to NAPIF_F_IRQ_AFFINITY... I don't
>      know NAPIF_F_ARFS_AFFINITY or something? so that drivers
>      could express that they support ARFS.
>    - Remove the driver calls to irq_cpu_rmap_add and make sure to
>      pass the new bit in for drivers that support ARFS (in your
>      changeset, I believe that would be at least ice, mlx4, and
>      bnxt... possibly more?).
>    - In the generic core code, if the ARFS bit is set then you pass
>      in the state needed for ARFS to work, otherwise do what the
>      proposed code is doing now.
> 
> But, that's just a guess. Maybe there's a better way.

I will look into all of this and send a new version. But yes, it is clear 
that the core needs to manage ARFS rmap creation and updates, besides the 
affinity restoration.

Ahmed
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b598de335d26..9bf91c3aca8d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -350,9 +350,14 @@  struct napi_config {
 	u64 gro_flush_timeout;
 	u64 irq_suspend_timeout;
 	u32 defer_hard_irqs;
+	cpumask_t affinity_mask;
 	unsigned int napi_id;
 };
 
+enum {
+	NAPIF_F_IRQ_AFFINITY		= BIT(0)
+};
+
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
@@ -394,6 +399,7 @@  struct napi_struct {
 	unsigned long		irq_flags;
 	int			index;
 	struct napi_config	*config;
+	struct irq_affinity_notify affinity_notify;
 };
 
 enum {
diff --git a/net/core/dev.c b/net/core/dev.c
index 6ef9eb401fb2..778ba27d2b83 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6699,11 +6699,35 @@  void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 }
 EXPORT_SYMBOL(netif_queue_set_napi);
 
+static void
+netif_napi_affinity_notify(struct irq_affinity_notify *notify,
+			   const cpumask_t *mask)
+{
+	struct napi_struct *napi =
+		container_of(notify, struct napi_struct, affinity_notify);
+
+	if (napi->config)
+		cpumask_copy(&napi->config->affinity_mask, mask);
+}
+
+static void
+netif_napi_affinity_release(struct kref __always_unused *ref)
+{
+}
+
 static void napi_restore_config(struct napi_struct *n)
 {
 	n->defer_hard_irqs = n->config->defer_hard_irqs;
 	n->gro_flush_timeout = n->config->gro_flush_timeout;
 	n->irq_suspend_timeout = n->config->irq_suspend_timeout;
+
+	if (n->irq > 0 && n->irq_flags & NAPIF_F_IRQ_AFFINITY) {
+		n->affinity_notify.notify = netif_napi_affinity_notify;
+		n->affinity_notify.release = netif_napi_affinity_release;
+		irq_set_affinity_notifier(n->irq, &n->affinity_notify);
+		irq_set_affinity(n->irq, &n->config->affinity_mask);
+	}
+
 	/* a NAPI ID might be stored in the config, if so use it. if not, use
 	 * napi_hash_add to generate one for us. It will be saved to the config
 	 * in napi_disable.
@@ -6720,6 +6744,8 @@  static void napi_save_config(struct napi_struct *n)
 	n->config->gro_flush_timeout = n->gro_flush_timeout;
 	n->config->irq_suspend_timeout = n->irq_suspend_timeout;
 	n->config->napi_id = n->napi_id;
+	if (n->irq > 0 && n->irq_flags & NAPIF_F_IRQ_AFFINITY)
+		irq_set_affinity_notifier(n->irq, NULL);
 	napi_hash_del(n);
 }
 
@@ -11184,7 +11210,7 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 {
 	struct net_device *dev;
 	size_t napi_config_sz;
-	unsigned int maxqs;
+	unsigned int maxqs, i;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
@@ -11280,6 +11306,9 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
 	if (!dev->napi_config)
 		goto free_all;
+	for (i = 0; i < maxqs; i++)
+		cpumask_copy(&dev->napi_config[i].affinity_mask,
+			     cpu_online_mask);
 
 	strscpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;