Message ID | 20250211210657.428439-4-ahmed.zaki@intel.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net: napi: add CPU affinity to napi->config | expand |
On 2/11/25 10:06 PM, Ahmed Zaki wrote: > @@ -394,10 +395,8 @@ struct napi_struct { > struct list_head dev_list; > struct hlist_node napi_hash_node; > int irq; > -#ifdef CONFIG_RFS_ACCEL > struct irq_affinity_notify notify; > int napi_rmap_idx; > -#endif I'm sorry for the late doubt, but it's not clear to me why you need to add the #ifdef in the previous patch ?!? > diff --git a/net/core/dev.c b/net/core/dev.c > index 209296cef3cd..d2c942bbd5e6 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -6871,28 +6871,39 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, > } > EXPORT_SYMBOL(netif_queue_set_napi); > > -#ifdef CONFIG_RFS_ACCEL > static void > -netif_irq_cpu_rmap_notify(struct irq_affinity_notify *notify, > - const cpumask_t *mask) > +netif_napi_irq_notify(struct irq_affinity_notify *notify, > + const cpumask_t *mask) > { > struct napi_struct *napi = > container_of(notify, struct napi_struct, notify); > +#ifdef CONFIG_RFS_ACCEL > struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; > int err; > +#endif > > - err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); > - if (err) > - netdev_warn(napi->dev, "RMAP update failed (%d)\n", > - err); > + if (napi->config && napi->dev->irq_affinity_auto) > + cpumask_copy(&napi->config->affinity_mask, mask); > + > +#ifdef CONFIG_RFS_ACCEL > + if (napi->dev->rx_cpu_rmap_auto) { > + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); > + if (err) > + netdev_warn(napi->dev, "RMAP update failed (%d)\n", > + err); > + } > +#endif Minor nit: if you provide a netif_rx_cpu_rmap() helper returning dev->rx_cpu_rmap or NULL for !CONFIG_RFS_ACCEL build, you can avoid the above 2 ifdefs and possibly more below. 
> @@ -6915,7 +6926,6 @@ static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq) > if (rc) > goto err_set; > > - set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); Minor nit: I think it would be better if the previous patch added this line directly in netif_napi_set_irq_locked() (avoiding the removal here). /P
On 2025-02-13 5:26 a.m., Paolo Abeni wrote: > On 2/11/25 10:06 PM, Ahmed Zaki wrote: >> @@ -394,10 +395,8 @@ struct napi_struct { >> struct list_head dev_list; >> struct hlist_node napi_hash_node; >> int irq; >> -#ifdef CONFIG_RFS_ACCEL >> struct irq_affinity_notify notify; >> int napi_rmap_idx; >> -#endif > > I'm sorry for the late doubt, but it's not clear to me why you need to > add the #ifdef in the previous patch ?!? It was there to make the code consistent, since the rmap and the notifier were only needed for ARFS. It can be removed, although I am not sure if there would be any warnings since on !CONFIG_RFS_ACCEL the fields would never be used. > >> diff --git a/net/core/dev.c b/net/core/dev.c >> index 209296cef3cd..d2c942bbd5e6 100644 >> --- a/net/core/dev.c >> +++ b/net/core/dev.c >> @@ -6871,28 +6871,39 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, >> } >> EXPORT_SYMBOL(netif_queue_set_napi); >> >> -#ifdef CONFIG_RFS_ACCEL >> static void >> -netif_irq_cpu_rmap_notify(struct irq_affinity_notify *notify, >> - const cpumask_t *mask) >> +netif_napi_irq_notify(struct irq_affinity_notify *notify, >> + const cpumask_t *mask) >> { >> struct napi_struct *napi = >> container_of(notify, struct napi_struct, notify); >> +#ifdef CONFIG_RFS_ACCEL >> struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; >> int err; >> +#endif >> >> - err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); >> - if (err) >> - netdev_warn(napi->dev, "RMAP update failed (%d)\n", >> - err); >> + if (napi->config && napi->dev->irq_affinity_auto) >> + cpumask_copy(&napi->config->affinity_mask, mask); >> + >> +#ifdef CONFIG_RFS_ACCEL >> + if (napi->dev->rx_cpu_rmap_auto) { >> + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); >> + if (err) >> + netdev_warn(napi->dev, "RMAP update failed (%d)\n", >> + err); >> + } >> +#endif > > Minor nit: if you provide a netif_rx_cpu_rmap() helper returning > dev->rx_cpu_rmap or NULL for !CONFIG_RFS_ACCEL build, you can avoid
the > above 2 ifdefs and possibly more below. > Thanks, I will add this if there is a new version. >> @@ -6915,7 +6926,6 @@ static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq) >> if (rc) >> goto err_set; >> >> - set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); > > Minor nit: I think it would be better if the previous patch would add > directly this line in netif_napi_set_irq_locked() (avoiding the removal > here). > Yes, it just made more sense for that patch.
On Tue, 11 Feb 2025 14:06:54 -0700 Ahmed Zaki wrote: > @@ -11575,9 +11615,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, > void (*setup)(struct net_device *), > unsigned int txqs, unsigned int rxqs) > { > + unsigned int maxqs, i, numa; > struct net_device *dev; > size_t napi_config_sz; > - unsigned int maxqs; > > BUG_ON(strlen(name) >= sizeof(dev->name)); > > @@ -11679,6 +11719,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, > if (!dev->napi_config) > goto free_all; > > + numa = dev_to_node(&dev->dev); Does this work? dev->dev is the "software" device, IOW the netdev itself. The HW dev is dev->dev.parent but it won't be set at this stage, we'd need to move the init to register, or maybe netif_enable_cpu_rmap() / netif_set_affinity_auto() ? > + for (i = 0; i < maxqs; i++) > + cpumask_set_cpu(cpumask_local_spread(i, numa), > + &dev->napi_config[i].affinity_mask);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9344d9b632d4..63fb392558b3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -352,6 +352,7 @@ struct napi_config { u64 gro_flush_timeout; u64 irq_suspend_timeout; u32 defer_hard_irqs; + cpumask_t affinity_mask; unsigned int napi_id; }; @@ -394,10 +395,8 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; int irq; -#ifdef CONFIG_RFS_ACCEL struct irq_affinity_notify notify; int napi_rmap_idx; -#endif int index; struct napi_config *config; }; @@ -1995,6 +1994,12 @@ enum netdev_reg_state { * * @threaded: napi threaded mode is enabled * + * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ + * affinity. Set by netif_enable_irq_affinity(), then + * the driver must create a persistent napi by + * netif_napi_add_config() and finally bind the napi to + * IRQ (via netif_napi_set_irq()). + * * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap. * Set by calling netif_enable_cpu_rmap(). 
* @@ -2405,6 +2410,7 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; bool proto_down; bool threaded; + bool irq_affinity_auto; bool rx_cpu_rmap_auto; /* priv_flags_slow, ungrouped to save space */ @@ -2665,6 +2671,11 @@ static inline void netdev_set_ml_priv(struct net_device *dev, dev->ml_priv_type = type; } +static inline void netif_set_affinity_auto(struct net_device *dev) +{ + dev->irq_affinity_auto = true; +} + /* * Net namespace inlines */ diff --git a/net/core/dev.c b/net/core/dev.c index 209296cef3cd..d2c942bbd5e6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6871,28 +6871,39 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); -#ifdef CONFIG_RFS_ACCEL static void -netif_irq_cpu_rmap_notify(struct irq_affinity_notify *notify, - const cpumask_t *mask) +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) { struct napi_struct *napi = container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; int err; +#endif - err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); - if (err) - netdev_warn(napi->dev, "RMAP update failed (%d)\n", - err); + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif } +#ifdef CONFIG_RFS_ACCEL static void netif_napi_affinity_release(struct kref *ref) { struct napi_struct *napi = container_of(ref, struct napi_struct, notify.kref); struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + if (!napi->dev->rx_cpu_rmap_auto) + return; rmap->obj[napi->napi_rmap_idx] = NULL; napi->napi_rmap_idx = -1; cpu_rmap_put(rmap); @@ -6903,7 +6914,7 @@ static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq) struct 
cpu_rmap *rmap = napi->dev->rx_cpu_rmap; int rc; - napi->notify.notify = netif_irq_cpu_rmap_notify; + napi->notify.notify = netif_napi_irq_notify; napi->notify.release = netif_napi_affinity_release; cpu_rmap_get(rmap); rc = cpu_rmap_add(rmap, napi); @@ -6915,7 +6926,6 @@ static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq) if (rc) goto err_set; - set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); return 0; err_set: @@ -6954,6 +6964,10 @@ static void netif_del_cpu_rmap(struct net_device *dev) } #else +static void netif_napi_affinity_release(struct kref *ref) +{ +} + static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq) { return 0; @@ -6977,7 +6991,7 @@ void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) if (napi->irq == irq) return; - /* Remove existing rmap entries */ + /* Remove existing resources */ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) irq_set_affinity_notifier(napi->irq, NULL); @@ -6985,9 +6999,30 @@ void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) if (irq < 0) return; - rc = napi_irq_cpu_rmap_add(napi, irq); - if (rc) - netdev_warn(napi->dev, "Unable to update aRFS map (%d)\n", rc); + if (napi->dev->rx_cpu_rmap_auto) { + rc = napi_irq_cpu_rmap_add(napi, irq); + if (rc) { + netdev_warn(napi->dev, "Unable to update ARFS map (%d)\n", + rc); + return; + } + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + + } else if (napi->dev->irq_affinity_auto) { + if (WARN_ON_ONCE(!napi->config)) + return; + + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + return; + } + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + } } EXPORT_SYMBOL(netif_napi_set_irq_locked); @@ -6996,6 +7031,11 @@ static void napi_restore_config(struct napi_struct *n) n->defer_hard_irqs = n->config->defer_hard_irqs; 
n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; + + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. */ @@ -11575,9 +11615,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { + unsigned int maxqs, i, numa; struct net_device *dev; size_t napi_config_sz; - unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -11679,6 +11719,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->napi_config) goto free_all; + numa = dev_to_node(&dev->dev); + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP;
A common task for most drivers is to remember the user-set CPU affinity of their IRQs. On each netdev reset, the driver should re-assign the user's settings to the IRQs. Add CPU affinity mask to napi_config. To delegate the CPU affinity management to the core, drivers must: 1 - set the new netdev flag "irq_affinity_auto": netif_set_affinity_auto(netdev) 2 - create the napi with persistent config: netif_napi_add_config() 3 - bind an IRQ to the napi instance: netif_napi_set_irq() The core will then make sure to re-assign the affinity to the napi's IRQ. The default IRQ mask is set to one CPU, starting from the closest NUMA node. Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com> --- include/linux/netdevice.h | 15 ++++++-- net/core/dev.c | 73 +++++++++++++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 16 deletions(-)