diff mbox series

[net-next,v5,1/6] net: move ARFS rmap management to core

Message ID 20250113171042.158123-2-ahmed.zaki@intel.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net: napi: add CPU affinity to napi->config | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 40 this patch: 40
netdev/build_tools success Errors and warnings before: 0 (+1) this patch: 0 (+1)
netdev/cc_maintainers warning 2 maintainers not CCed: saeedb@amazon.com akiyano@amazon.com
netdev/build_clang success Errors and warnings before: 6616 this patch: 6616
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4097 this patch: 4097
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 254 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 105 this patch: 105
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2025-01-14--03-00 (tests: 885)

Commit Message

Ahmed Zaki Jan. 13, 2025, 5:10 p.m. UTC
Add a new netdev flag "rx_cpu_rmap_auto". Drivers supporting ARFS should
set the flag via netif_enable_cpu_rmap() and core will allocate and manage
the ARFS rmap. Freeing the rmap is also done by core when the netdev is
freed.

Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 38 ++---------------
 drivers/net/ethernet/broadcom/bnxt/bnxt.c    | 27 ++----------
 drivers/net/ethernet/intel/ice/ice_arfs.c    | 17 +-------
 include/linux/netdevice.h                    | 15 +++++--
 net/core/dev.c                               | 44 ++++++++++++++++++++
 5 files changed, 63 insertions(+), 78 deletions(-)

Comments

Arinzon, David Jan. 14, 2025, 9:33 a.m. UTC | #1
> Add a new netdev flag "rx_cpu_rmap_auto". Drivers supporting ARFS should
> set the flag via netif_enable_cpu_rmap() and core will allocate and manage
> the ARFS rmap. Freeing the rmap is also done by core when the netdev is
> freed.
> 
> Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
> ---
>  drivers/net/ethernet/amazon/ena/ena_netdev.c | 38 ++---------------
>  drivers/net/ethernet/broadcom/bnxt/bnxt.c    | 27 ++----------
>  drivers/net/ethernet/intel/ice/ice_arfs.c    | 17 +-------
>  include/linux/netdevice.h                    | 15 +++++--
>  net/core/dev.c                               | 44 ++++++++++++++++++++
>  5 files changed, 63 insertions(+), 78 deletions(-)
> 
> diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c
> b/drivers/net/ethernet/amazon/ena/ena_netdev.c
> index c1295dfad0d0..a3fceaa83cd5 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
> +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
> @@ -5,9 +5,6 @@
> 
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> 
> -#ifdef CONFIG_RFS_ACCEL
> -#include <linux/cpu_rmap.h>
> -#endif /* CONFIG_RFS_ACCEL */
>  #include <linux/ethtool.h>
>  #include <linux/kernel.h>
>  #include <linux/module.h>
> @@ -165,25 +162,10 @@ int ena_xmit_common(struct ena_adapter
> *adapter,  static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter)  {
> #ifdef CONFIG_RFS_ACCEL
> -       u32 i;
> -       int rc;
> -
> -       adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter-
> >num_io_queues);
> -       if (!adapter->netdev->rx_cpu_rmap)
> -               return -ENOMEM;
> -       for (i = 0; i < adapter->num_io_queues; i++) {
> -               int irq_idx = ENA_IO_IRQ_IDX(i);
> -
> -               rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap,
> -                                     pci_irq_vector(adapter->pdev, irq_idx));
> -               if (rc) {
> -                       free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap);
> -                       adapter->netdev->rx_cpu_rmap = NULL;
> -                       return rc;
> -               }
> -       }
> -#endif /* CONFIG_RFS_ACCEL */
> +       return netif_enable_cpu_rmap(adapter->netdev,
> +adapter->num_io_queues); #else
>         return 0;
> +#endif /* CONFIG_RFS_ACCEL */
>  }
> 
>  static void ena_init_io_rings_common(struct ena_adapter *adapter, @@ -
> 1742,13 +1724,6 @@ static void ena_free_io_irq(struct ena_adapter
> *adapter)
>         struct ena_irq *irq;
>         int i;
> 
> -#ifdef CONFIG_RFS_ACCEL
> -       if (adapter->msix_vecs >= 1) {
> -               free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap);
> -               adapter->netdev->rx_cpu_rmap = NULL;
> -       }
> -#endif /* CONFIG_RFS_ACCEL */
> -
>         for (i = ENA_IO_IRQ_FIRST_IDX; i <
> ENA_MAX_MSIX_VEC(io_queue_count); i++) {
>                 irq = &adapter->irq_tbl[i];
>                 irq_set_affinity_hint(irq->vector, NULL); @@ -4131,13 +4106,6 @@
> static void __ena_shutoff(struct pci_dev *pdev, bool shutdown)
>         ena_dev = adapter->ena_dev;
>         netdev = adapter->netdev;
> 
> -#ifdef CONFIG_RFS_ACCEL
> -       if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) {
> -               free_irq_cpu_rmap(netdev->rx_cpu_rmap);
> -               netdev->rx_cpu_rmap = NULL;
> -       }
> -
> -#endif /* CONFIG_RFS_ACCEL */
>         /* Make sure timer and reset routine won't be called after
>          * freeing device resources.
>          */
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 884d42db5554..1f50bc715038 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -49,7 +49,6 @@
>  #include <linux/cache.h>
>  #include <linux/log2.h>
>  #include <linux/bitmap.h>
> -#include <linux/cpu_rmap.h>
>  #include <linux/cpumask.h>
>  #include <net/pkt_cls.h>
>  #include <net/page_pool/helpers.h>
> @@ -10861,7 +10860,7 @@ static int bnxt_set_real_num_queues(struct bnxt
> *bp)
> 
>  #ifdef CONFIG_RFS_ACCEL
>         if (bp->flags & BNXT_FLAG_RFS)
> -               dev->rx_cpu_rmap = alloc_irq_cpu_rmap(bp->rx_nr_rings);
> +               return netif_enable_cpu_rmap(dev, bp->rx_nr_rings);
>  #endif
> 
>         return rc;
> @@ -11215,10 +11214,6 @@ static void bnxt_free_irq(struct bnxt *bp)
>         struct bnxt_irq *irq;
>         int i;
> 
> -#ifdef CONFIG_RFS_ACCEL
> -       free_irq_cpu_rmap(bp->dev->rx_cpu_rmap);
> -       bp->dev->rx_cpu_rmap = NULL;
> -#endif
>         if (!bp->irq_tbl || !bp->bnapi)
>                 return;
> 
> @@ -11241,11 +11236,8 @@ static void bnxt_free_irq(struct bnxt *bp)
> 
>  static int bnxt_request_irq(struct bnxt *bp)  {
> -       int i, j, rc = 0;
> +       int i, rc = 0;
>         unsigned long flags = 0;
> -#ifdef CONFIG_RFS_ACCEL
> -       struct cpu_rmap *rmap;
> -#endif
> 
>         rc = bnxt_setup_int_mode(bp);
>         if (rc) {
> @@ -11253,22 +11245,11 @@ static int bnxt_request_irq(struct bnxt *bp)
>                            rc);
>                 return rc;
>         }
> -#ifdef CONFIG_RFS_ACCEL
> -       rmap = bp->dev->rx_cpu_rmap;
> -#endif
> -       for (i = 0, j = 0; i < bp->cp_nr_rings; i++) {
> +
> +       for (i = 0; i < bp->cp_nr_rings; i++) {
>                 int map_idx = bnxt_cp_num_to_irq_num(bp, i);
>                 struct bnxt_irq *irq = &bp->irq_tbl[map_idx];
> 
> -#ifdef CONFIG_RFS_ACCEL
> -               if (rmap && bp->bnapi[i]->rx_ring) {
> -                       rc = irq_cpu_rmap_add(rmap, irq->vector);
> -                       if (rc)
> -                               netdev_warn(bp->dev, "failed adding irq rmap for ring
> %d\n",
> -                                           j);
> -                       j++;
> -               }
> -#endif
>                 rc = request_irq(irq->vector, irq->handler, flags, irq->name,
>                                  bp->bnapi[i]);
>                 if (rc)
> diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c
> b/drivers/net/ethernet/intel/ice/ice_arfs.c
> index 7cee365cc7d1..3b1b892e6958 100644
> --- a/drivers/net/ethernet/intel/ice/ice_arfs.c
> +++ b/drivers/net/ethernet/intel/ice/ice_arfs.c
> @@ -584,9 +584,6 @@ void ice_free_cpu_rx_rmap(struct ice_vsi *vsi)
>         netdev = vsi->netdev;
>         if (!netdev || !netdev->rx_cpu_rmap)
>                 return;
> -
> -       free_irq_cpu_rmap(netdev->rx_cpu_rmap);
> -       netdev->rx_cpu_rmap = NULL;
>  }
> 
>  /**
> @@ -597,7 +594,6 @@ int ice_set_cpu_rx_rmap(struct ice_vsi *vsi)  {
>         struct net_device *netdev;
>         struct ice_pf *pf;
> -       int i;
> 
>         if (!vsi || vsi->type != ICE_VSI_PF)
>                 return 0;
> @@ -610,18 +606,7 @@ int ice_set_cpu_rx_rmap(struct ice_vsi *vsi)
>         netdev_dbg(netdev, "Setup CPU RMAP: vsi type 0x%x, ifname %s,
> q_vectors %d\n",
>                    vsi->type, netdev->name, vsi->num_q_vectors);
> 
> -       netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(vsi->num_q_vectors);
> -       if (unlikely(!netdev->rx_cpu_rmap))
> -               return -EINVAL;
> -
> -       ice_for_each_q_vector(vsi, i)
> -               if (irq_cpu_rmap_add(netdev->rx_cpu_rmap,
> -                                    vsi->q_vectors[i]->irq.virq)) {
> -                       ice_free_cpu_rx_rmap(vsi);
> -                       return -EINVAL;
> -               }
> -
> -       return 0;
> +       return netif_enable_cpu_rmap(netdev, vsi->num_q_vectors);
>  }
> 
>  /**
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index
> aeb4a6cff171..7e95e9ee36dd 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1991,6 +1991,9 @@ enum netdev_reg_state {
>   *
>   *     @threaded:      napi threaded mode is enabled
>   *
> + *     @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap.
> + *                        Set by calling netif_enable_cpu_rmap().
> + *
>   *     @see_all_hwtstamp_requests: device wants to see calls to
>   *                     ndo_hwtstamp_set() for all timestamp requests
>   *                     regardless of source, even if those aren't
> @@ -2398,6 +2401,9 @@ struct net_device {
>         struct lock_class_key   *qdisc_tx_busylock;
>         bool                    proto_down;
>         bool                    threaded;
> +#ifdef CONFIG_RFS_ACCEL
> +       bool                    rx_cpu_rmap_auto;
> +#endif
> 
>         /* priv_flags_slow, ungrouped to save space */
>         unsigned long           see_all_hwtstamp_requests:1;
> @@ -2671,10 +2677,7 @@ void netif_queue_set_napi(struct net_device
> *dev, unsigned int queue_index,
>                           enum netdev_queue_type type,
>                           struct napi_struct *napi);
> 
> -static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) -{
> -       napi->irq = irq;
> -}
> +void netif_napi_set_irq(struct napi_struct *napi, int irq);
> 
>  /* Default NAPI poll() weight
>   * Device drivers are strongly advised to not use bigger value @@ -2765,6
> +2768,10 @@ static inline void netif_napi_del(struct napi_struct *napi)
>         synchronize_net();
>  }
> 
> +#ifdef CONFIG_RFS_ACCEL
> +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int
> +num_irqs);
> +
> +#endif
>  struct packet_type {
>         __be16                  type;   /* This is really htons(ether_type). */
>         bool                    ignore_outgoing;
> diff --git a/net/core/dev.c b/net/core/dev.c index
> 1a90ed8cc6cc..3ee7a514dca8 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6745,6 +6745,46 @@ void netif_queue_set_napi(struct net_device
> *dev, unsigned int queue_index,  }
> EXPORT_SYMBOL(netif_queue_set_napi);
> 
> +#ifdef CONFIG_RFS_ACCEL
> +static void netif_disable_cpu_rmap(struct net_device *dev) {
> +       free_irq_cpu_rmap(dev->rx_cpu_rmap);
> +       dev->rx_cpu_rmap = NULL;
> +       dev->rx_cpu_rmap_auto = false;
> +}
> +
> +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int
> +num_irqs) {
> +       dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
> +       if (!dev->rx_cpu_rmap)
> +               return -ENOMEM;
> +
> +       dev->rx_cpu_rmap_auto = true;
> +       return 0;
> +}
> +EXPORT_SYMBOL(netif_enable_cpu_rmap);
> +#endif
> +
> +void netif_napi_set_irq(struct napi_struct *napi, int irq) { #ifdef
> +CONFIG_RFS_ACCEL
> +       int rc;
> +#endif
> +       napi->irq = irq;
> +
> +#ifdef CONFIG_RFS_ACCEL
> +       if (napi->dev->rx_cpu_rmap && napi->dev->rx_cpu_rmap_auto) {
> +               rc = irq_cpu_rmap_add(napi->dev->rx_cpu_rmap, irq);
> +               if (rc) {
> +                       netdev_warn(napi->dev, "Unable to update ARFS map (%d)\n",
> +                                   rc);
> +                       netif_disable_cpu_rmap(napi->dev);
> +               }
> +       }
> +#endif
> +}
> +EXPORT_SYMBOL(netif_napi_set_irq);
> +
>  static void napi_restore_config(struct napi_struct *n)  {
>         n->defer_hard_irqs = n->config->defer_hard_irqs; @@ -11421,6
> +11461,10 @@ void free_netdev(struct net_device *dev)
>         /* Flush device addresses */
>         dev_addr_flush(dev);
> 
> +#ifdef CONFIG_RFS_ACCEL
> +       if (dev->rx_cpu_rmap && dev->rx_cpu_rmap_auto)
> +               netif_disable_cpu_rmap(dev); #endif
>         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>                 netif_napi_del(p);
> 
> --
> 2.43.0

Thanks for making the change in the ENA driver.

Acked-by: David Arinzon <darinzon@amazon.com>
Jakub Kicinski Jan. 14, 2025, 10:08 p.m. UTC | #2
On Mon, 13 Jan 2025 10:10:37 -0700 Ahmed Zaki wrote:
> -#endif /* CONFIG_RFS_ACCEL */
> +	return netif_enable_cpu_rmap(adapter->netdev, adapter->num_io_queues);
> +#else
>  	return 0;
> +#endif /* CONFIG_RFS_ACCEL */

Let's try to eliminate some of the ifdef-ery on the driver side.
netif_enable_cpu_rmap() should simply do nothing if !CONFIG_RFS_ACCEL

> @@ -2398,6 +2401,9 @@ struct net_device {
> 	struct lock_class_key	*qdisc_tx_busylock;
> 	bool			proto_down;
> 	bool			threaded;
> +#ifdef CONFIG_RFS_ACCEL
> +	bool			rx_cpu_rmap_auto;
> +#endif

similar point, don't hide it, it's just one byte and we can just leave
it as false if !CONFIG_RFS_ACCEL. It can save us a bunch of other ifdefs

> +#ifdef CONFIG_RFS_ACCEL
> +static void netif_disable_cpu_rmap(struct net_device *dev)
> +{
> +	free_irq_cpu_rmap(dev->rx_cpu_rmap);
> +	dev->rx_cpu_rmap = NULL;
> +	dev->rx_cpu_rmap_auto = false;
> +}

Better to do:

static void netif_disable_cpu_rmap(struct net_device *dev)
{
#ifdef CONFIG_RFS_ACCEL
	free_irq_cpu_rmap(dev->rx_cpu_rmap);
	dev->rx_cpu_rmap = NULL;
	dev->rx_cpu_rmap_auto = false;
#endif
}

IOW if not relevant the function should do nothing

> +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
> +{
> +	dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
> +	if (!dev->rx_cpu_rmap)
> +		return -ENOMEM;
> +
> +	dev->rx_cpu_rmap_auto = true;
> +	return 0;
> +}
> +EXPORT_SYMBOL(netif_enable_cpu_rmap);

here you can depend on dead code elimination:

int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
{
	if (!IS_ENABLED(CONFIG_RFS_ACCEL))
		return 0;

	...
}

> +#endif
> +
> +void netif_napi_set_irq(struct napi_struct *napi, int irq)
> +{
> +#ifdef CONFIG_RFS_ACCEL
> +	int rc;
> +#endif
> +	napi->irq = irq;
> +
> +#ifdef CONFIG_RFS_ACCEL
> +	if (napi->dev->rx_cpu_rmap && napi->dev->rx_cpu_rmap_auto) {
> +		rc = irq_cpu_rmap_add(napi->dev->rx_cpu_rmap, irq);
> +		if (rc) {
> +			netdev_warn(napi->dev, "Unable to update ARFS map (%d)\n",
> +				    rc);
> +			netif_disable_cpu_rmap(napi->dev);
> +		}
> +	}
> +#endif

Declare rc inside the if to avoid the extra ifdef on variable decl

> +}
> +EXPORT_SYMBOL(netif_napi_set_irq);
> +
>  static void napi_restore_config(struct napi_struct *n)
>  {
>  	n->defer_hard_irqs = n->config->defer_hard_irqs;
> @@ -11421,6 +11461,10 @@ void free_netdev(struct net_device *dev)
>  	/* Flush device addresses */
>  	dev_addr_flush(dev);
>  
> +#ifdef CONFIG_RFS_ACCEL
> +	if (dev->rx_cpu_rmap && dev->rx_cpu_rmap_auto)

don't check dev->rx_cpu_rmap, dev->rx_cpu_rmap_auto is enough

> +		netif_disable_cpu_rmap(dev);
> +#endif
>  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>  		netif_napi_del(p);
>  

IRQs are often allocated in ndo_open and freed in ndo_stop, so
you need to catch netif_napi_del or napi_disable and remove
the IRQ from the map.

Similarly netif_napi_set_irq() may get called with -1 to clear
the IRQ number, which you currently treat as a real IRQ id, AFAICT.
Ahmed Zaki Jan. 15, 2025, 1 a.m. UTC | #3
On 2025-01-14 3:08 p.m., Jakub Kicinski wrote:
> On Mon, 13 Jan 2025 10:10:37 -0700 Ahmed Zaki wrote:
>> -#endif /* CONFIG_RFS_ACCEL */
>> +	return netif_enable_cpu_rmap(adapter->netdev, adapter->num_io_queues);
>> +#else
>>   	return 0;
>> +#endif /* CONFIG_RFS_ACCEL */
> 
> Let's try to eliminate some of the ifdef-ery on the driver side.
> netif_enable_cpu_rmap() should simply do nothing if !CONFIG_RFS_ACCEL
> 
>> @@ -2398,6 +2401,9 @@ struct net_device {
>> 	struct lock_class_key	*qdisc_tx_busylock;
>> 	bool			proto_down;
>> 	bool			threaded;
>> +#ifdef CONFIG_RFS_ACCEL
>> +	bool			rx_cpu_rmap_auto;
>> +#endif
> 
> similar point, don't hide it, it's just one byte and we can just leave
> it as false if !CONFIG_RFS_ACCEL. It can save us a bunch of other ifdefs

Ok, makes sense.

> 
>> +#ifdef CONFIG_RFS_ACCEL
>> +static void netif_disable_cpu_rmap(struct net_device *dev)
>> +{
>> +	free_irq_cpu_rmap(dev->rx_cpu_rmap);
>> +	dev->rx_cpu_rmap = NULL;
>> +	dev->rx_cpu_rmap_auto = false;
>> +}
> 
> Better do do:
> 
> static void netif_disable_cpu_rmap(struct net_device *dev)
> {
> #ifdef CONFIG_RFS_ACCEL
> 	free_irq_cpu_rmap(dev->rx_cpu_rmap);
> 	dev->rx_cpu_rmap = NULL;
> 	dev->rx_cpu_rmap_auto = false;
> #endif
> }

Sure.

> 
> IOW if not relevant the function should do nothing
> 
>> +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
>> +{
>> +	dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
>> +	if (!dev->rx_cpu_rmap)
>> +		return -ENOMEM;
>> +
>> +	dev->rx_cpu_rmap_auto = true;
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL(netif_enable_cpu_rmap);
> 
> here you can depend on dead code elimination:
> 
> int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
> {
> 	if (!IS_ENABLED(CONFIG_RFS_ACCEL))
> 		return 0;
> 
> 	...
> }
> 

netdev->rx_cpu_rmap is declared inside #ifdef CONFIG_RFS_ACCEL, so I 
still need #ifdef inside netif_enable_cpu_rmap(). I will do the same as
in netif_disable_cpu_rmap() though, that will keep the function visible.

>> +#endif
>> +
>> +void netif_napi_set_irq(struct napi_struct *napi, int irq)
>> +{
>> +#ifdef CONFIG_RFS_ACCEL
>> +	int rc;
>> +#endif
>> +	napi->irq = irq;
>> +
>> +#ifdef CONFIG_RFS_ACCEL
>> +	if (napi->dev->rx_cpu_rmap && napi->dev->rx_cpu_rmap_auto) {
>> +		rc = irq_cpu_rmap_add(napi->dev->rx_cpu_rmap, irq);
>> +		if (rc) {
>> +			netdev_warn(napi->dev, "Unable to update ARFS map (%d)\n",
>> +				    rc);
>> +			netif_disable_cpu_rmap(napi->dev);
>> +		}
>> +	}
>> +#endif
> 
> Declare rc inside the if to avoid the extra ifdef on variable decl

The CONFIG_RFS_ACCEL is removed in a later patch (3) when the 
irq_affinity_auto is introduced and rc is re-used.

Instead, I will move "napi->irq = irq;" to the end and merge the 2 
RFS_ACCL blocks.

> 
>> +}
>> +EXPORT_SYMBOL(netif_napi_set_irq);
>> +
>>   static void napi_restore_config(struct napi_struct *n)
>>   {
>>   	n->defer_hard_irqs = n->config->defer_hard_irqs;
>> @@ -11421,6 +11461,10 @@ void free_netdev(struct net_device *dev)
>>   	/* Flush device addresses */
>>   	dev_addr_flush(dev);
>>   
>> +#ifdef CONFIG_RFS_ACCEL
>> +	if (dev->rx_cpu_rmap && dev->rx_cpu_rmap_auto)
> 
> don't check dev->rx_cpu_rmap, dev->rx_cpu_rmap_auto is enough

yes, also a good point.


> 
>> +		netif_disable_cpu_rmap(dev);
>> +#endif
>>   	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>>   		netif_napi_del(p);
>>   
> 
> IRQs are often allocated in ndo_open and freed in ndo_stop, so
> you need to catch netif_napi_del or napi_disable and remove
> the IRQ from the map.

Ok, I will look into that too.

> 
> Similarly netif_napi_set_irq() may get called with -1 to clear
> the IRQ number, which you currently treat as a real IRQ id, AFAICT.

Correct, there is no handling for irq = -1. So netif_napi_set_irq() needs 
to add the irq to the rmap only if it is > 0.

I need to clarify expectation of netif_napi_set_irq() because I only see 
it called with irq = -1 in napi_add_weight. But you say it can be called 
with irq = -1 to "clear" the IRQ.

Does this mean that, if irq = -1, we need to "delete" the irq from rmap 
if a valid irq already existed (which means this can happen as an 
alternative to napi_del()/napi_diable())? or just skip adding irq to 
rmap is enough?

Thanks,
Ahmed
Jakub Kicinski Jan. 15, 2025, 1:38 a.m. UTC | #4
On Tue, 14 Jan 2025 18:00:30 -0700 Ahmed Zaki wrote:
> > Similarly netif_napi_set_irq() may get called with -1 to clear
> > the IRQ number, which you currently treat as a real IRQ id, AFAICT.  
> 
> Correct, there is no handling for irq = -1. So netif_napi_set_irq() needs 
> to add the irq to the rmap only if it is > 0.
> 
> I need to clarify expectation of netif_napi_set_irq() because I only see 
> it called with irq = -1 in napi_add_weight. But you say it can be called 
> with irq = -1 to "clear" the IRQ.

I _think_ that's what Amritha had in mind. For queue <> NAPI linking
similarly we are expected to call the same helper with a NULL param.

> Does this mean that, if irq = -1, we need to "delete" the irq from rmap 
> if a valid irq already existed (which means this can happen as an 
> alternative to napi_del()/napi_diable())? or just skip adding irq to 
> rmap is enough?

I'm afraid we need both. Most drivers today simply never clear the IRQ,
they will just delete the NAPI and kfree() its memory. So we need to
"catch" NAPIs with IRQs assigned getting deleted and clean up the IRQ.

In the future some drivers may explicitly call the set with -1,
especially now that the IRQ has more implications than just getting
reported via netlink. We need to support that, too.

And for good measure we should also throw in a warning if a driver
tries to set the IRQ but the IRQ is already set in the NAPI (not -1).
diff mbox series

Patch

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index c1295dfad0d0..a3fceaa83cd5 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -5,9 +5,6 @@ 
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#ifdef CONFIG_RFS_ACCEL
-#include <linux/cpu_rmap.h>
-#endif /* CONFIG_RFS_ACCEL */
 #include <linux/ethtool.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -165,25 +162,10 @@  int ena_xmit_common(struct ena_adapter *adapter,
 static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter)
 {
 #ifdef CONFIG_RFS_ACCEL
-	u32 i;
-	int rc;
-
-	adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_io_queues);
-	if (!adapter->netdev->rx_cpu_rmap)
-		return -ENOMEM;
-	for (i = 0; i < adapter->num_io_queues; i++) {
-		int irq_idx = ENA_IO_IRQ_IDX(i);
-
-		rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap,
-				      pci_irq_vector(adapter->pdev, irq_idx));
-		if (rc) {
-			free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap);
-			adapter->netdev->rx_cpu_rmap = NULL;
-			return rc;
-		}
-	}
-#endif /* CONFIG_RFS_ACCEL */
+	return netif_enable_cpu_rmap(adapter->netdev, adapter->num_io_queues);
+#else
 	return 0;
+#endif /* CONFIG_RFS_ACCEL */
 }
 
 static void ena_init_io_rings_common(struct ena_adapter *adapter,
@@ -1742,13 +1724,6 @@  static void ena_free_io_irq(struct ena_adapter *adapter)
 	struct ena_irq *irq;
 	int i;
 
-#ifdef CONFIG_RFS_ACCEL
-	if (adapter->msix_vecs >= 1) {
-		free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap);
-		adapter->netdev->rx_cpu_rmap = NULL;
-	}
-#endif /* CONFIG_RFS_ACCEL */
-
 	for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) {
 		irq = &adapter->irq_tbl[i];
 		irq_set_affinity_hint(irq->vector, NULL);
@@ -4131,13 +4106,6 @@  static void __ena_shutoff(struct pci_dev *pdev, bool shutdown)
 	ena_dev = adapter->ena_dev;
 	netdev = adapter->netdev;
 
-#ifdef CONFIG_RFS_ACCEL
-	if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) {
-		free_irq_cpu_rmap(netdev->rx_cpu_rmap);
-		netdev->rx_cpu_rmap = NULL;
-	}
-
-#endif /* CONFIG_RFS_ACCEL */
 	/* Make sure timer and reset routine won't be called after
 	 * freeing device resources.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 884d42db5554..1f50bc715038 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -49,7 +49,6 @@ 
 #include <linux/cache.h>
 #include <linux/log2.h>
 #include <linux/bitmap.h>
-#include <linux/cpu_rmap.h>
 #include <linux/cpumask.h>
 #include <net/pkt_cls.h>
 #include <net/page_pool/helpers.h>
@@ -10861,7 +10860,7 @@  static int bnxt_set_real_num_queues(struct bnxt *bp)
 
 #ifdef CONFIG_RFS_ACCEL
 	if (bp->flags & BNXT_FLAG_RFS)
-		dev->rx_cpu_rmap = alloc_irq_cpu_rmap(bp->rx_nr_rings);
+		return netif_enable_cpu_rmap(dev, bp->rx_nr_rings);
 #endif
 
 	return rc;
@@ -11215,10 +11214,6 @@  static void bnxt_free_irq(struct bnxt *bp)
 	struct bnxt_irq *irq;
 	int i;
 
-#ifdef CONFIG_RFS_ACCEL
-	free_irq_cpu_rmap(bp->dev->rx_cpu_rmap);
-	bp->dev->rx_cpu_rmap = NULL;
-#endif
 	if (!bp->irq_tbl || !bp->bnapi)
 		return;
 
@@ -11241,11 +11236,8 @@  static void bnxt_free_irq(struct bnxt *bp)
 
 static int bnxt_request_irq(struct bnxt *bp)
 {
-	int i, j, rc = 0;
+	int i, rc = 0;
 	unsigned long flags = 0;
-#ifdef CONFIG_RFS_ACCEL
-	struct cpu_rmap *rmap;
-#endif
 
 	rc = bnxt_setup_int_mode(bp);
 	if (rc) {
@@ -11253,22 +11245,11 @@  static int bnxt_request_irq(struct bnxt *bp)
 			   rc);
 		return rc;
 	}
-#ifdef CONFIG_RFS_ACCEL
-	rmap = bp->dev->rx_cpu_rmap;
-#endif
-	for (i = 0, j = 0; i < bp->cp_nr_rings; i++) {
+
+	for (i = 0; i < bp->cp_nr_rings; i++) {
 		int map_idx = bnxt_cp_num_to_irq_num(bp, i);
 		struct bnxt_irq *irq = &bp->irq_tbl[map_idx];
 
-#ifdef CONFIG_RFS_ACCEL
-		if (rmap && bp->bnapi[i]->rx_ring) {
-			rc = irq_cpu_rmap_add(rmap, irq->vector);
-			if (rc)
-				netdev_warn(bp->dev, "failed adding irq rmap for ring %d\n",
-					    j);
-			j++;
-		}
-#endif
 		rc = request_irq(irq->vector, irq->handler, flags, irq->name,
 				 bp->bnapi[i]);
 		if (rc)
diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c
index 7cee365cc7d1..3b1b892e6958 100644
--- a/drivers/net/ethernet/intel/ice/ice_arfs.c
+++ b/drivers/net/ethernet/intel/ice/ice_arfs.c
@@ -584,9 +584,6 @@  void ice_free_cpu_rx_rmap(struct ice_vsi *vsi)
 	netdev = vsi->netdev;
 	if (!netdev || !netdev->rx_cpu_rmap)
 		return;
-
-	free_irq_cpu_rmap(netdev->rx_cpu_rmap);
-	netdev->rx_cpu_rmap = NULL;
 }
 
 /**
@@ -597,7 +594,6 @@  int ice_set_cpu_rx_rmap(struct ice_vsi *vsi)
 {
 	struct net_device *netdev;
 	struct ice_pf *pf;
-	int i;
 
 	if (!vsi || vsi->type != ICE_VSI_PF)
 		return 0;
@@ -610,18 +606,7 @@  int ice_set_cpu_rx_rmap(struct ice_vsi *vsi)
 	netdev_dbg(netdev, "Setup CPU RMAP: vsi type 0x%x, ifname %s, q_vectors %d\n",
 		   vsi->type, netdev->name, vsi->num_q_vectors);
 
-	netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(vsi->num_q_vectors);
-	if (unlikely(!netdev->rx_cpu_rmap))
-		return -EINVAL;
-
-	ice_for_each_q_vector(vsi, i)
-		if (irq_cpu_rmap_add(netdev->rx_cpu_rmap,
-				     vsi->q_vectors[i]->irq.virq)) {
-			ice_free_cpu_rx_rmap(vsi);
-			return -EINVAL;
-		}
-
-	return 0;
+	return netif_enable_cpu_rmap(netdev, vsi->num_q_vectors);
 }
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index aeb4a6cff171..7e95e9ee36dd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1991,6 +1991,9 @@  enum netdev_reg_state {
  *
  *	@threaded:	napi threaded mode is enabled
  *
+ *	@rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap.
+ *	                   Set by calling netif_enable_cpu_rmap().
+ *
  *	@see_all_hwtstamp_requests: device wants to see calls to
  *			ndo_hwtstamp_set() for all timestamp requests
  *			regardless of source, even if those aren't
@@ -2398,6 +2401,9 @@  struct net_device {
 	struct lock_class_key	*qdisc_tx_busylock;
 	bool			proto_down;
 	bool			threaded;
+#ifdef CONFIG_RFS_ACCEL
+	bool			rx_cpu_rmap_auto;
+#endif
 
 	/* priv_flags_slow, ungrouped to save space */
 	unsigned long		see_all_hwtstamp_requests:1;
@@ -2671,10 +2677,7 @@  void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 			  enum netdev_queue_type type,
 			  struct napi_struct *napi);
 
-static inline void netif_napi_set_irq(struct napi_struct *napi, int irq)
-{
-	napi->irq = irq;
-}
+void netif_napi_set_irq(struct napi_struct *napi, int irq);
 
 /* Default NAPI poll() weight
  * Device drivers are strongly advised to not use bigger value
@@ -2765,6 +2768,10 @@  static inline void netif_napi_del(struct napi_struct *napi)
 	synchronize_net();
 }
 
+#ifdef CONFIG_RFS_ACCEL
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs);
+
+#endif
 struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
 	bool			ignore_outgoing;
diff --git a/net/core/dev.c b/net/core/dev.c
index 1a90ed8cc6cc..3ee7a514dca8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6745,6 +6745,46 @@  void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 }
 EXPORT_SYMBOL(netif_queue_set_napi);
 
+#ifdef CONFIG_RFS_ACCEL
+static void netif_disable_cpu_rmap(struct net_device *dev)
+{
+	free_irq_cpu_rmap(dev->rx_cpu_rmap);
+	dev->rx_cpu_rmap = NULL;
+	dev->rx_cpu_rmap_auto = false;
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+	dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
+	if (!dev->rx_cpu_rmap)
+		return -ENOMEM;
+
+	dev->rx_cpu_rmap_auto = true;
+	return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+#endif
+
+void netif_napi_set_irq(struct napi_struct *napi, int irq)
+{
+#ifdef CONFIG_RFS_ACCEL
+	int rc;
+#endif
+	napi->irq = irq;
+
+#ifdef CONFIG_RFS_ACCEL
+	if (napi->dev->rx_cpu_rmap && napi->dev->rx_cpu_rmap_auto) {
+		rc = irq_cpu_rmap_add(napi->dev->rx_cpu_rmap, irq);
+		if (rc) {
+			netdev_warn(napi->dev, "Unable to update ARFS map (%d)\n",
+				    rc);
+			netif_disable_cpu_rmap(napi->dev);
+		}
+	}
+#endif
+}
+EXPORT_SYMBOL(netif_napi_set_irq);
+
 static void napi_restore_config(struct napi_struct *n)
 {
 	n->defer_hard_irqs = n->config->defer_hard_irqs;
@@ -11421,6 +11461,10 @@  void free_netdev(struct net_device *dev)
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
+#ifdef CONFIG_RFS_ACCEL
+	if (dev->rx_cpu_rmap && dev->rx_cpu_rmap_auto)
+		netif_disable_cpu_rmap(dev);
+#endif
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);