diff mbox series

[RFC,net-next,v3,5/9] net: napi: Add napi_config

Message ID 20240912100738.16567-6-jdamato@fastly.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series Add support for per-NAPI config via netlink | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next, async
netdev/apply fail Patch does not apply to net-next-0

Commit Message

Joe Damato Sept. 12, 2024, 10:07 a.m. UTC
Add a persistent NAPI config area for NAPI configuration to the core.
Drivers opt-in to setting the storage for a NAPI by passing an index
when calling netif_napi_add_storage.

napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
(after the NAPIs are deleted), and set to 0 when napi_enable is called.

Drivers which call netif_napi_add_storage will have persistent
NAPI IDs.

Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 .../networking/net_cachelines/net_device.rst  |  1 +
 include/linux/netdevice.h                     | 34 +++++++++
 net/core/dev.c                                | 74 +++++++++++++++++--
 net/core/dev.h                                | 12 +++
 4 files changed, 113 insertions(+), 8 deletions(-)

Comments

Joe Damato Sept. 12, 2024, 3:03 p.m. UTC | #1
Several comments on different things below for this patch that I just noticed.

On Thu, Sep 12, 2024 at 10:07:13AM +0000, Joe Damato wrote:
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the storage for a NAPI by passing an index
> when calling netif_napi_add_storage.
> 
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted), and set to 0 when napi_enable is called.

Forgot to re-read all the commit messages. I will do that for rfcv4
and make sure they are all correct; this message is not correct.
 
> Drivers which implement call netif_napi_add_storage will have persistent
> NAPI IDs.
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>  .../networking/net_cachelines/net_device.rst  |  1 +
>  include/linux/netdevice.h                     | 34 +++++++++
>  net/core/dev.c                                | 74 +++++++++++++++++--
>  net/core/dev.h                                | 12 +++
>  4 files changed, 113 insertions(+), 8 deletions(-)
>

[...]

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3e07ab8e0295..08afc96179f9 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h

[...]

> @@ -2685,6 +2717,8 @@ void __netif_napi_del(struct napi_struct *napi);
>   */
>  static inline void netif_napi_del(struct napi_struct *napi)
>  {
> +	napi->config = NULL;
> +	napi->index = -1;
>  	__netif_napi_del(napi);
>  	synchronize_net();
>  }

I don't quite see how, but occasionally when changing the queue
count with ethtool -L, I seem to trigger a page_pool issue.

I assumed it was related to either my changes above in netif_napi_del, or 
below in __netif_napi_del, but I'm not so sure because the issue does not
happen reliably and I can't seem to figure out how my change would cause this.

When it does happen, this is the stack trace:

  page_pool_empty_ring() page_pool refcnt -30528 violation
  ------------[ cut here ]------------
  Negative(-1) inflight packet-pages
  WARNING: CPU: 1 PID: 5117 at net/core/page_pool.c:617 page_pool_inflight+0x4c/0x90

  [...]

  CPU: 1 UID: 0 PID: 5117 Comm: ethtool Tainted: G        W          [...]

  RIP: 0010:page_pool_inflight+0x4c/0x90
  Code: e4 b8 00 00 00 00 44 0f 48 e0 44 89 e0 41 5c e9 8a c1 1b 00 66 90 45 85 e4 79 ef 44 89 e6 48 c7 c7 78 56 26 82 e8 14 63 78 ff <0f> 0b eb dc 65 8b 05 b5 af 71 7e 48 0f a3 05 21 d9 3b 01 73 d7 48
  RSP: 0018:ffffc9001d01b640 EFLAGS: 00010282
  RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
  RDX: 0000000000000027 RSI: 00000000ffefffff RDI: ffff88bf4f45c988
  RBP: ffff8900da55a800 R08: 0000000000000000 R09: ffffc9001d01b480
  R10: 0000000000000001 R11: 0000000000000001 R12: 00000000ffffffff
  R13: ffffffff82cd35b0 R14: ffff890062550f00 R15: ffff8881b0e85400
  FS:  00007fa9cb382740(0000) GS:ffff88bf4f440000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000558baa9d3b38 CR3: 000000011222a000 CR4: 0000000000350ef0
  Call Trace:
   <TASK>
   ? __warn+0x80/0x110
   ? page_pool_inflight+0x4c/0x90
   ? report_bug+0x19c/0x1b0
   ? handle_bug+0x3c/0x70
   ? exc_invalid_op+0x18/0x70
   ? asm_exc_invalid_op+0x1a/0x20
   ? page_pool_inflight+0x4c/0x90
   page_pool_release+0x10e/0x1d0
   page_pool_destroy+0x66/0x160
   mlx5e_free_rq+0x69/0xb0 [mlx5_core]
   mlx5e_close_queues+0x39/0x150 [mlx5_core]
   mlx5e_close_channel+0x1c/0x60 [mlx5_core]
   mlx5e_close_channels+0x49/0xa0 [mlx5_core]
   mlx5e_switch_priv_channels+0xa9/0x120 [mlx5_core]
   ? __pfx_mlx5e_num_channels_changed_ctx+0x10/0x10 [mlx5_core]
   mlx5e_safe_switch_params+0xb8/0xf0 [mlx5_core]
   mlx5e_ethtool_set_channels+0x17a/0x290 [mlx5_core]
   ethnl_set_channels+0x243/0x310
   ethnl_default_set_doit+0xc1/0x170
   genl_family_rcv_msg_doit+0xd9/0x130
   genl_rcv_msg+0x18f/0x2c0
   ? __pfx_ethnl_default_set_doit+0x10/0x10
   ? __pfx_genl_rcv_msg+0x10/0x10
   netlink_rcv_skb+0x5a/0x110
   genl_rcv+0x28/0x40
   netlink_unicast+0x1aa/0x260
   netlink_sendmsg+0x1e9/0x420
   __sys_sendto+0x1d5/0x1f0
   ? handle_mm_fault+0x1cb/0x290
   ? do_user_addr_fault+0x558/0x7c0
   __x64_sys_sendto+0x29/0x30
   do_syscall_64+0x5d/0x170
   entry_SYSCALL_64_after_hwframe+0x76/0x7e

> diff --git a/net/core/dev.c b/net/core/dev.c
> index f2fd503516de..ca2227d0b8ed 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c

[...]

> @@ -6736,7 +6776,13 @@ void __netif_napi_del(struct napi_struct *napi)
>  	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
>  		return;
>  
> -	napi_hash_del(napi);
> +	if (!napi->config) {
> +		napi_hash_del(napi);
> +	} else {
> +		napi->index = -1;
> +		napi->config = NULL;
> +	}
> +

See above; perhaps something related to this change is triggering the page pool
warning occasionally.

>  	list_del_rcu(&napi->dev_list);
>  	napi_free_frags(napi);
>  
> @@ -11049,6 +11095,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>  		unsigned int txqs, unsigned int rxqs)
>  {
>  	struct net_device *dev;
> +	size_t napi_config_sz;
> +	unsigned int maxqs;
>  
>  	BUG_ON(strlen(name) >= sizeof(dev->name));
>  
> @@ -11062,6 +11110,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>  		return NULL;
>  	}
>  
> +	WARN_ON_ONCE(txqs != rxqs);

This warning triggers for me on boot every time with mlx5 NICs.

The code in mlx5 seems to get the rxq and txq maximums in:
  drivers/net/ethernet/mellanox/mlx5/core/en_main.c
    mlx5e_create_netdev

  which does:

    txqs = mlx5e_get_max_num_txqs(mdev, profile);
    rxqs = mlx5e_get_max_num_rxqs(mdev, profile);

    netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), txqs, rxqs);

In my case for my device, txqs: 760, rxqs: 63.

I would guess that this warning will trigger every time for mlx5 NICs
and would be quite annoying.

We may just want to replace the allocation logic to allocate
txqs+rxqs, remove the WARN_ON_ONCE, and be OK with some wasted
space?

> +	maxqs = max(txqs, rxqs);
> +
>  	dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
>  		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
>  	if (!dev)
> @@ -11136,6 +11187,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>  	if (!dev->ethtool)
>  		goto free_all;
>  
> +	napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
> +	dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
> +	if (!dev->napi_config)
> +		goto free_all;
> +

[...]

> diff --git a/net/core/dev.h b/net/core/dev.h
> index a9d5f678564a..9eb3f559275c 100644
> --- a/net/core/dev.h
> +++ b/net/core/dev.h
> @@ -167,11 +167,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
>  static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
>  					      u32 defer)
>  {
> +	unsigned int count = max(netdev->num_rx_queues,
> +				 netdev->num_tx_queues);
>  	struct napi_struct *napi;
> +	int i;
>  
>  	WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
>  	list_for_each_entry(napi, &netdev->napi_list, dev_list)
>  		napi_set_defer_hard_irqs(napi, defer);
> +
> +	for (i = 0; i < count; i++)
> +		netdev->napi_config[i].defer_hard_irqs = defer;

The above is incorrect. Some devices may have netdev->napi_config =
NULL if they don't call the add_storage wrapper.

Unless there is major feedback/changes requested that affect this
code, in the rfcv4 branch I plan to fix this by adding a NULL check:

  if (netdev->napi_config)
    for (....)
      netdev->napi_config[i]....

>  
>  /**
> @@ -206,11 +212,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
>  static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
>  						unsigned long timeout)
>  {
> +	unsigned int count = max(netdev->num_rx_queues,
> +				 netdev->num_tx_queues);
>  	struct napi_struct *napi;
> +	int i;
>  
>  	WRITE_ONCE(netdev->gro_flush_timeout, timeout);
>  	list_for_each_entry(napi, &netdev->napi_list, dev_list)
>  		napi_set_gro_flush_timeout(napi, timeout);
> +
> +	for (i = 0; i < count; i++)
> +		netdev->napi_config[i].gro_flush_timeout = timeout;

Same as above.
Stanislav Fomichev Sept. 13, 2024, 5:42 p.m. UTC | #2
On 09/12, Joe Damato wrote:
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the storage for a NAPI by passing an index
> when calling netif_napi_add_storage.
> 
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted), and set to 0 when napi_enable is called.
> 
> Drivers which implement call netif_napi_add_storage will have persistent
> NAPI IDs.
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>  .../networking/net_cachelines/net_device.rst  |  1 +
>  include/linux/netdevice.h                     | 34 +++++++++
>  net/core/dev.c                                | 74 +++++++++++++++++--
>  net/core/dev.h                                | 12 +++
>  4 files changed, 113 insertions(+), 8 deletions(-)
> 
> diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
> index 3d02ae79c850..11d659051f5e 100644
> --- a/Documentation/networking/net_cachelines/net_device.rst
> +++ b/Documentation/networking/net_cachelines/net_device.rst
> @@ -183,3 +183,4 @@ struct hlist_head                   page_pools
>  struct dim_irq_moder*               irq_moder
>  unsigned_long                       gro_flush_timeout
>  u32                                 napi_defer_hard_irqs
> +struct napi_config*                 napi_config
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3e07ab8e0295..08afc96179f9 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -342,6 +342,15 @@ struct gro_list {
>   */
>  #define GRO_HASH_BUCKETS	8
>  
> +/*
> + * Structure for per-NAPI storage
> + */
> +struct napi_config {
> +	u64 gro_flush_timeout;
> +	u32 defer_hard_irqs;
> +	unsigned int napi_id;
> +};
> +
>  /*
>   * Structure for NAPI scheduling similar to tasklet but with weighting
>   */
> @@ -379,6 +388,8 @@ struct napi_struct {
>  	int			irq;
>  	unsigned long		gro_flush_timeout;
>  	u32			defer_hard_irqs;
> +	int			index;
> +	struct napi_config	*config;
>  };
>  
>  enum {
> @@ -2011,6 +2022,9 @@ enum netdev_reg_state {
>   *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
>   *		   where the clock is recovered.
>   *
> + *	@napi_config: An array of napi_config structures containing per-NAPI
> + *		      settings.
> + *
>   *	FIXME: cleanup struct net_device such that network protocol info
>   *	moves out.
>   */
> @@ -2400,6 +2414,7 @@ struct net_device {
>  	struct dim_irq_moder	*irq_moder;
>  	unsigned long		gro_flush_timeout;
>  	u32			napi_defer_hard_irqs;
> +	struct napi_config	*napi_config;
>  
>  	u8			priv[] ____cacheline_aligned
>  				       __counted_by(priv_len);
> @@ -2650,6 +2665,23 @@ netif_napi_add_tx_weight(struct net_device *dev,
>  	netif_napi_add_weight(dev, napi, poll, weight);
>  }
>  
> +/**
> + * netif_napi_add_storage - initialize a NAPI context and set storage area
> + * @dev: network device
> + * @napi: NAPI context
> + * @poll: polling function
> + * @weight: the poll weight of this NAPI
> + * @index: the NAPI index
> + */
> +static inline void
> +netif_napi_add_storage(struct net_device *dev, struct napi_struct *napi,
> +		       int (*poll)(struct napi_struct *, int), int index)
> +{
> +	napi->index = index;
> +	napi->config = &dev->napi_config[index];
> +	netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
> +}
> +
>  /**
>   * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
>   * @dev:  network device
> @@ -2685,6 +2717,8 @@ void __netif_napi_del(struct napi_struct *napi);
>   */
>  static inline void netif_napi_del(struct napi_struct *napi)
>  {
> +	napi->config = NULL;
> +	napi->index = -1;
>  	__netif_napi_del(napi);
>  	synchronize_net();
>  }
> diff --git a/net/core/dev.c b/net/core/dev.c
> index f2fd503516de..ca2227d0b8ed 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6493,6 +6493,18 @@ EXPORT_SYMBOL(napi_busy_loop);
>  
>  #endif /* CONFIG_NET_RX_BUSY_POLL */
>  
> +static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> +{
> +	spin_lock(&napi_hash_lock);
> +
> +	napi->napi_id = napi_id;
> +
> +	hlist_add_head_rcu(&napi->napi_hash_node,
> +			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> +
> +	spin_unlock(&napi_hash_lock);
> +}
> +
>  static void napi_hash_add(struct napi_struct *napi)
>  {
>  	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
> @@ -6505,12 +6517,13 @@ static void napi_hash_add(struct napi_struct *napi)
>  		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
>  			napi_gen_id = MIN_NAPI_ID;
>  	} while (napi_by_id(napi_gen_id));

[..]

> -	napi->napi_id = napi_gen_id;
> -
> -	hlist_add_head_rcu(&napi->napi_hash_node,
> -			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
>  
>  	spin_unlock(&napi_hash_lock);
> +
> +	napi_hash_add_with_id(napi, napi_gen_id);

nit: it is very unlikely that napi_gen_id is gonna wrap around after the
spin_unlock above, but maybe it's safer to have the following?

static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
{
	napi->napi_id = napi_id;
	hlist_add_head_rcu(&napi->napi_hash_node,
			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
}

static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
{
	spin_lock(&napi_hash_lock);
	__napi_hash_add_with_id(...);
	spin_unlock(&napi_hash_lock);
}

And use __napi_hash_add_with_id here before spin_unlock?
Joe Damato Sept. 13, 2024, 6:55 p.m. UTC | #3
On Fri, Sep 13, 2024 at 10:42:37AM -0700, Stanislav Fomichev wrote:
> On 09/12, Joe Damato wrote:

[...]

> > @@ -6505,12 +6517,13 @@ static void napi_hash_add(struct napi_struct *napi)
> >  		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
> >  			napi_gen_id = MIN_NAPI_ID;
> >  	} while (napi_by_id(napi_gen_id));
> 
> [..]
> 
> > -	napi->napi_id = napi_gen_id;
> > -
> > -	hlist_add_head_rcu(&napi->napi_hash_node,
> > -			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> >  
> >  	spin_unlock(&napi_hash_lock);
> > +
> > +	napi_hash_add_with_id(napi, napi_gen_id);
> 
> nit: it is very unlikely that napi_gen_id is gonna wrap around after the
> spin_unlock above, but maybe it's safer to have the following?
> 
> static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> {
> 	napi->napi_id = napi_id;
> 	hlist_add_head_rcu(&napi->napi_hash_node,
> 			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> }
> 
> static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> {
> 	spin_lock(&napi_hash_lock);
> 	__napi_hash_add_with_id(...);
> 	spin_unlock(&napi_hash_lock);
> }
> 
> And use __napi_hash_add_with_id here before spin_unlock?

Thanks for taking a look.

Sure, that seems reasonable. I can add that for the rfcv4.

I'll probably hold off on posting the rfcv4 until either after LPC
and/or after I have some time to debug the mlx5/page_pool thing.
Joe Damato Sept. 13, 2024, 7:39 p.m. UTC | #4
On Fri, Sep 13, 2024 at 10:42:37AM -0700, Stanislav Fomichev wrote:
> On 09/12, Joe Damato wrote:

[...]

> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -6493,6 +6493,18 @@ EXPORT_SYMBOL(napi_busy_loop);
> >  
> >  #endif /* CONFIG_NET_RX_BUSY_POLL */
> >  
> > +static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> > +{
> > +	spin_lock(&napi_hash_lock);
> > +
> > +	napi->napi_id = napi_id;
> > +
> > +	hlist_add_head_rcu(&napi->napi_hash_node,
> > +			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> > +
> > +	spin_unlock(&napi_hash_lock);
> > +}
> > +
> >  static void napi_hash_add(struct napi_struct *napi)
> >  {
> >  	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
> > @@ -6505,12 +6517,13 @@ static void napi_hash_add(struct napi_struct *napi)
> >  		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
> >  			napi_gen_id = MIN_NAPI_ID;
> >  	} while (napi_by_id(napi_gen_id));
> 
> [..]
> 
> > -	napi->napi_id = napi_gen_id;
> > -
> > -	hlist_add_head_rcu(&napi->napi_hash_node,
> > -			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> >  
> >  	spin_unlock(&napi_hash_lock);
> > +
> > +	napi_hash_add_with_id(napi, napi_gen_id);
> 
> nit: it is very unlikely that napi_gen_id is gonna wrap around after the
> spin_unlock above, but maybe it's safer to have the following?
> 
> static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> {
> 	napi->napi_id = napi_id;
> 	hlist_add_head_rcu(&napi->napi_hash_node,
> 			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> }
> 
> static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> {
> 	spin_lock(&napi_hash_lock);
> 	__napi_hash_add_with_id(...);
> 	spin_unlock(&napi_hash_lock);
> }
> 
> And use __napi_hash_add_with_id here before spin_unlock?

After making this change and re-testing on a couple reboots, I haven't
been able to reproduce the page pool issue I mentioned in the other
email [1].

Not sure if I've just been... "getting lucky" or if this fixed
something that I won't fully grasp until I read the mlx5 source
again.

Will test it a few more times, though.

[1]: https://lore.kernel.org/netdev/ZuMC2fYPPtWggB2w@LQ3V64L9R2.homenet.telecomitalia.it/
Willem de Bruijn Sept. 13, 2024, 9:44 p.m. UTC | #5
Joe Damato wrote:
> Several comments on different things below for this patch that I just noticed.
> 
> On Thu, Sep 12, 2024 at 10:07:13AM +0000, Joe Damato wrote:
> > Add a persistent NAPI config area for NAPI configuration to the core.
> > Drivers opt-in to setting the storage for a NAPI by passing an index
> > when calling netif_napi_add_storage.
> > 
> > napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> > (after the NAPIs are deleted), and set to 0 when napi_enable is called.
> 
> Forgot to re-read all the commit messages. I will do that for rfcv4
> and make sure they are all correct; this message is not correct.
>  
> > Drivers which implement call netif_napi_add_storage will have persistent
> > NAPI IDs.
> > 
> > Signed-off-by: Joe Damato <jdamato@fastly.com>

> > @@ -11062,6 +11110,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
> >  		return NULL;
> >  	}
> >  
> > +	WARN_ON_ONCE(txqs != rxqs);
> 
> This warning triggers for me on boot every time with mlx5 NICs.
> 
> The code in mlx5 seems to get the rxq and txq maximums in:
>   drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>     mlx5e_create_netdev
> 
>   which does:
> 
>     txqs = mlx5e_get_max_num_txqs(mdev, profile);
>     rxqs = mlx5e_get_max_num_rxqs(mdev, profile);
> 
>     netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), txqs, rxqs);
> 
> In my case for my device, txqs: 760, rxqs: 63.
> 
> I would guess that this warning will trigger everytime for mlx5 NICs
> and would be quite annoying.
> 
> We may just want to replace the allocation logic to allocate
> txqs+rxqs, remove the WARN_ON_ONCE, and be OK with some wasted
> space?

I was about to say that txqs == rxqs is not necessary.

The number of napi config structs you want depends on whether the
driver configures separate IRQs for Tx and Rx or not.

Allocating the max of the two is perhaps sufficient for now.
Joe Damato Sept. 13, 2024, 10:10 p.m. UTC | #6
On Fri, Sep 13, 2024 at 05:44:07PM -0400, Willem de Bruijn wrote:
> Joe Damato wrote:
> > Several comments on different things below for this patch that I just noticed.
> > 
> > On Thu, Sep 12, 2024 at 10:07:13AM +0000, Joe Damato wrote:
> > > Add a persistent NAPI config area for NAPI configuration to the core.
> > > Drivers opt-in to setting the storage for a NAPI by passing an index
> > > when calling netif_napi_add_storage.
> > > 
> > > napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> > > (after the NAPIs are deleted), and set to 0 when napi_enable is called.
> > 
> > Forgot to re-read all the commit messages. I will do that for rfcv4
> > and make sure they are all correct; this message is not correct.
> >  
> > > Drivers which implement call netif_napi_add_storage will have persistent
> > > NAPI IDs.
> > > 
> > > Signed-off-by: Joe Damato <jdamato@fastly.com>
> 
> > > @@ -11062,6 +11110,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
> > >  		return NULL;
> > >  	}
> > >  
> > > +	WARN_ON_ONCE(txqs != rxqs);
> > 
> > This warning triggers for me on boot every time with mlx5 NICs.
> > 
> > The code in mlx5 seems to get the rxq and txq maximums in:
> >   drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> >     mlx5e_create_netdev
> > 
> >   which does:
> > 
> >     txqs = mlx5e_get_max_num_txqs(mdev, profile);
> >     rxqs = mlx5e_get_max_num_rxqs(mdev, profile);
> > 
> >     netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), txqs, rxqs);
> > 
> > In my case for my device, txqs: 760, rxqs: 63.
> > 
> > I would guess that this warning will trigger everytime for mlx5 NICs
> > and would be quite annoying.
> > 
> > We may just want to replace the allocation logic to allocate
> > txqs+rxqs, remove the WARN_ON_ONCE, and be OK with some wasted
> > space?
> 
> I was about to say that txqs == rxqs is not necessary.

Correct.

> The number of napi config structs you want depends on whether the
> driver configures separate IRQs for Tx and Rx or not.

Correct. This is why I included the mlx4 patch.

> Allocating the max of the two is perhaps sufficient for now.

I don't think I agree. The max of the two means you'll always be
missing some config space if the maximum number of both are
allocated by the user/device.

The WARN_ON_ONCE was added as suggested from a previous conversation
[1], but due to the imbalance in mlx5 (and probably other devices)
the warning will be more of a nuisance and will likely trigger on
every boot for at least mlx5, but probably others.

Regardless of how many we decide to allocate: the point I was making
above was that the WARN_ON_ONCE should likely be removed.

[1]: https://lore.kernel.org/lkml/20240902174944.293dfe4b@kernel.org/
Joe Damato Sept. 14, 2024, 1 p.m. UTC | #7
On Fri, Sep 13, 2024 at 09:39:49PM +0200, Joe Damato wrote:
> On Fri, Sep 13, 2024 at 10:42:37AM -0700, Stanislav Fomichev wrote:
> > On 09/12, Joe Damato wrote:
> 
> [...]
> 
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -6493,6 +6493,18 @@ EXPORT_SYMBOL(napi_busy_loop);
> > >  
> > >  #endif /* CONFIG_NET_RX_BUSY_POLL */
> > >  
> > > +static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> > > +{
> > > +	spin_lock(&napi_hash_lock);
> > > +
> > > +	napi->napi_id = napi_id;
> > > +
> > > +	hlist_add_head_rcu(&napi->napi_hash_node,
> > > +			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> > > +
> > > +	spin_unlock(&napi_hash_lock);
> > > +}
> > > +
> > >  static void napi_hash_add(struct napi_struct *napi)
> > >  {
> > >  	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
> > > @@ -6505,12 +6517,13 @@ static void napi_hash_add(struct napi_struct *napi)
> > >  		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
> > >  			napi_gen_id = MIN_NAPI_ID;
> > >  	} while (napi_by_id(napi_gen_id));
> > 
> > [..]
> > 
> > > -	napi->napi_id = napi_gen_id;
> > > -
> > > -	hlist_add_head_rcu(&napi->napi_hash_node,
> > > -			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> > >  
> > >  	spin_unlock(&napi_hash_lock);
> > > +
> > > +	napi_hash_add_with_id(napi, napi_gen_id);
> > 
> > nit: it is very unlikely that napi_gen_id is gonna wrap around after the
> > spin_unlock above, but maybe it's safer to have the following?
> > 
> > static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> > {
> > 	napi->napi_id = napi_id;
> > 	hlist_add_head_rcu(&napi->napi_hash_node,
> > 			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> > }
> > 
> > static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
> > {
> > 	spin_lock(&napi_hash_lock);
> > 	__napi_hash_add_with_id(...);
> > 	spin_unlock(&napi_hash_lock);
> > }
> > 
> > And use __napi_hash_add_with_id here before spin_unlock?
> 
> After making this change and re-testing on a couple reboots, I haven't
> been able to reproduce the page pool issue I mentioned in the other
> email [1].
> 
> Not sure if I've just been... "getting lucky" or if this fixed
> something that I won't fully grasp until I read the mlx5 source
> again.
> 
> Will test it a few more times, though.
> 
> [1]: https://lore.kernel.org/netdev/ZuMC2fYPPtWggB2w@LQ3V64L9R2.homenet.telecomitalia.it/

I was able to reproduce the issue by running a simple script
(suggested by Martin):

  for ((i=0;i<100;i++)); do for q in 1 2 4 8 16 32; do sudo ethtool -L eth4 combined $q ;done;done

It does not seem to reproduce on net-next/main, so it is almost certainly a bug
introduced in patch 5 and the suggested changes above didn't solve the problem.

That said, the changes I have queued for the RFCv4:
  - Updated commit messages of most patches
  - Renamed netif_napi_add_storage to netif_napi_add_config in patch 5 and
    updated the driver patches.
  - Added a NULL check in netdev_set_defer_hard_irqs and
    netdev_set_gro_flush_timeout for netdev->napi_config in patch 5
  - Add Stanislav's suggestion for more safe handling of napi_gen_id in
    patch 5
  - Fixed merge conflicts in patch 6 so it applies cleanly

The outstanding items at this point are:
  - Getting rid of the WARN_ON_ONCE (assuming we all agree on this)
  - Deciding if we are allocating max(rxqs, txqs) or something else
  - Debugging the page pool issue

Jakub suggested the first two items on the list above, so I'm hoping to hear
his thoughts on them :) I have no strong preference on those two.

Once those two are solved, I can send an RFCv4 to see where we are at and I'll
call out the outstanding page pool issue in the cover letter.

I likely won't have time to debug the page pool thing until after LPC, though
:(
diff mbox series

Patch

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 3d02ae79c850..11d659051f5e 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -183,3 +183,4 @@  struct hlist_head                   page_pools
 struct dim_irq_moder*               irq_moder
 unsigned_long                       gro_flush_timeout
 u32                                 napi_defer_hard_irqs
+struct napi_config*                 napi_config
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3e07ab8e0295..08afc96179f9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,15 @@  struct gro_list {
  */
 #define GRO_HASH_BUCKETS	8
 
+/*
+ * Structure for per-NAPI storage
+ */
+struct napi_config {
+	u64 gro_flush_timeout;
+	u32 defer_hard_irqs;
+	unsigned int napi_id;
+};
+
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
@@ -379,6 +388,8 @@  struct napi_struct {
 	int			irq;
 	unsigned long		gro_flush_timeout;
 	u32			defer_hard_irqs;
+	int			index;
+	struct napi_config	*config;
 };
 
 enum {
@@ -2011,6 +2022,9 @@  enum netdev_reg_state {
  *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
  *		   where the clock is recovered.
  *
+ *	@napi_config: An array of napi_config structures containing per-NAPI
+ *		      settings.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2400,6 +2414,7 @@  struct net_device {
 	struct dim_irq_moder	*irq_moder;
 	unsigned long		gro_flush_timeout;
 	u32			napi_defer_hard_irqs;
+	struct napi_config	*napi_config;
 
 	u8			priv[] ____cacheline_aligned
 				       __counted_by(priv_len);
@@ -2650,6 +2665,23 @@  netif_napi_add_tx_weight(struct net_device *dev,
 	netif_napi_add_weight(dev, napi, poll, weight);
 }
 
+/**
+ * netif_napi_add_storage - initialize a NAPI context and set storage area
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: the poll weight of this NAPI
+ * @index: the NAPI index
+ */
+static inline void
+netif_napi_add_storage(struct net_device *dev, struct napi_struct *napi,
+		       int (*poll)(struct napi_struct *, int), int index)
+{
+	napi->index = index;
+	napi->config = &dev->napi_config[index];
+	netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
+}
+
 /**
  * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
  * @dev:  network device
@@ -2685,6 +2717,8 @@  void __netif_napi_del(struct napi_struct *napi);
  */
 static inline void netif_napi_del(struct napi_struct *napi)
 {
+	napi->config = NULL;
+	napi->index = -1;
 	__netif_napi_del(napi);
 	synchronize_net();
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index f2fd503516de..ca2227d0b8ed 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6493,6 +6493,18 @@  EXPORT_SYMBOL(napi_busy_loop);
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
+static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id)
+{
+	spin_lock(&napi_hash_lock);
+
+	napi->napi_id = napi_id;
+
+	hlist_add_head_rcu(&napi->napi_hash_node,
+			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+	spin_unlock(&napi_hash_lock);
+}
+
 static void napi_hash_add(struct napi_struct *napi)
 {
 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
@@ -6505,12 +6517,13 @@  static void napi_hash_add(struct napi_struct *napi)
 		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 			napi_gen_id = MIN_NAPI_ID;
 	} while (napi_by_id(napi_gen_id));
-	napi->napi_id = napi_gen_id;
-
-	hlist_add_head_rcu(&napi->napi_hash_node,
-			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 
 	spin_unlock(&napi_hash_lock);
+
+	napi_hash_add_with_id(napi, napi_gen_id);
+
+	if (napi->config)
+		napi->config->napi_id = napi_gen_id;
 }
 
 /* Warning : caller is responsible to make sure rcu grace period
@@ -6631,6 +6644,21 @@  void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 }
 EXPORT_SYMBOL(netif_queue_set_napi);
 
+static void napi_restore_config(struct napi_struct *n)
+{
+	n->defer_hard_irqs = n->config->defer_hard_irqs;
+	n->gro_flush_timeout = n->config->gro_flush_timeout;
+	napi_hash_add_with_id(n, n->config->napi_id);
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+	n->config->defer_hard_irqs = n->defer_hard_irqs;
+	n->config->gro_flush_timeout = n->gro_flush_timeout;
+	n->config->napi_id = n->napi_id;
+	napi_hash_del(n);
+}
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -6641,8 +6669,6 @@  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	INIT_HLIST_NODE(&napi->napi_hash_node);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
-	napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
-	napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
 	init_gro_hash(napi);
 	napi->skb = NULL;
 	INIT_LIST_HEAD(&napi->rx_list);
@@ -6660,7 +6686,15 @@  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
-	napi_hash_add(napi);
+	/* if there is no config associated with this NAPI, generate a fresh
+	 * NAPI ID and hash it. Otherwise, settings will be restored in napi_enable.
+	 */
+	if (!napi->config || (napi->config && !napi->config->napi_id)) {
+		napi_hash_add(napi);
+		napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+		napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+	}
+
 	napi_get_frags_check(napi);
 	/* Create kthread for this napi if dev->threaded is set.
 	 * Clear dev->threaded if kthread creation failed so that
@@ -6692,6 +6726,9 @@  void napi_disable(struct napi_struct *n)
 
 	hrtimer_cancel(&n->timer);
 
+	if (n->config)
+		napi_save_config(n);
+
 	clear_bit(NAPI_STATE_DISABLE, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
@@ -6714,6 +6751,9 @@  void napi_enable(struct napi_struct *n)
 		if (n->dev->threaded && n->thread)
 			new |= NAPIF_STATE_THREADED;
 	} while (!try_cmpxchg(&n->state, &val, new));
+
+	if (n->config)
+		napi_restore_config(n);
 }
 EXPORT_SYMBOL(napi_enable);
 
@@ -6736,7 +6776,13 @@  void __netif_napi_del(struct napi_struct *napi)
 	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 		return;
 
-	napi_hash_del(napi);
+	if (!napi->config) {
+		napi_hash_del(napi);
+	} else {
+		napi->index = -1;
+		napi->config = NULL;
+	}
+
 	list_del_rcu(&napi->dev_list);
 	napi_free_frags(napi);
 
@@ -11049,6 +11095,8 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		unsigned int txqs, unsigned int rxqs)
 {
 	struct net_device *dev;
+	size_t napi_config_sz;
+	unsigned int maxqs;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
@@ -11062,6 +11110,9 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
+	WARN_ON_ONCE(txqs != rxqs);
+	maxqs = max(txqs, rxqs);
+
 	dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
 		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 	if (!dev)
@@ -11136,6 +11187,11 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (!dev->ethtool)
 		goto free_all;
 
+	napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+	dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+	if (!dev->napi_config)
+		goto free_all;
+
 	strscpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;
 	dev->group = INIT_NETDEV_GROUP;
@@ -11197,6 +11253,8 @@  void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	kvfree(dev->napi_config);
+
 	ref_tracker_dir_exit(&dev->refcnt_tracker);
 #ifdef CONFIG_PCPU_DEV_REFCNT
 	free_percpu(dev->pcpu_refcnt);
diff --git a/net/core/dev.h b/net/core/dev.h
index a9d5f678564a..9eb3f559275c 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -167,11 +167,17 @@  static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
 static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
 					      u32 defer)
 {
+	unsigned int count = max(netdev->num_rx_queues,
+				 netdev->num_tx_queues);
 	struct napi_struct *napi;
+	int i;
 
 	WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
 	list_for_each_entry(napi, &netdev->napi_list, dev_list)
 		napi_set_defer_hard_irqs(napi, defer);
+
+	for (i = 0; i < count; i++)
+		netdev->napi_config[i].defer_hard_irqs = defer;
 }
 
 /**
@@ -206,11 +212,17 @@  static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
 static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
 						unsigned long timeout)
 {
+	unsigned int count = max(netdev->num_rx_queues,
+				 netdev->num_tx_queues);
 	struct napi_struct *napi;
+	int i;
 
 	WRITE_ONCE(netdev->gro_flush_timeout, timeout);
 	list_for_each_entry(napi, &netdev->napi_list, dev_list)
 		napi_set_gro_flush_timeout(napi, timeout);
+
+	for (i = 0; i < count; i++)
+		netdev->napi_config[i].gro_flush_timeout = timeout;
 }
 
 int rps_cpumask_housekeeping(struct cpumask *mask);