diff mbox series

[net-next,v5,5/9] net: napi: Add napi_config

Message ID 20241009005525.13651-6-jdamato@fastly.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series Add support for per-NAPI config via netlink | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; GEN HAS DIFF 2 files changed, 93 insertions(+);
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 43 this patch: 43
netdev/build_tools success Errors and warnings before: 0 (+2) this patch: 0 (+2)
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 83 this patch: 83
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4144 this patch: 4144
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 264 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 101 this patch: 101
netdev/source_inline success Was 0 now: 0

Commit Message

Joe Damato Oct. 9, 2024, 12:54 a.m. UTC
Add a persistent NAPI config area for NAPI configuration to the core.
Drivers opt-in to setting the persistent config for a NAPI by passing an
index when calling netif_napi_add_config.

napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
(after the NAPIs are deleted).

Drivers which call netif_napi_add_config will have persistent per-NAPI
settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.

Per-NAPI settings are saved in napi_disable and restored in napi_enable.

Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 .../networking/net_cachelines/net_device.rst  |  1 +
 include/linux/netdevice.h                     | 36 ++++++++-
 net/core/dev.c                                | 79 +++++++++++++++++--
 net/core/dev.h                                | 12 +++
 4 files changed, 118 insertions(+), 10 deletions(-)

Comments

Eric Dumazet Oct. 10, 2024, 4:20 a.m. UTC | #1
On Wed, Oct 9, 2024 at 2:56 AM Joe Damato <jdamato@fastly.com> wrote:
>
> Add a persistent NAPI config area for NAPI configuration to the core.
> Drivers opt-in to setting the persistent config for a NAPI by passing an
> index when calling netif_napi_add_config.
>
> napi_config is allocated in alloc_netdev_mqs, freed in free_netdev
> (after the NAPIs are deleted).
>
> Drivers which call netif_napi_add_config will have persistent per-NAPI
> settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings.
>
> Per-NAPI settings are saved in napi_disable and restored in napi_enable.
>
> Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>  .../networking/net_cachelines/net_device.rst  |  1 +
>  include/linux/netdevice.h                     | 36 ++++++++-
>  net/core/dev.c                                | 79 +++++++++++++++++--
>  net/core/dev.h                                | 12 +++
>  4 files changed, 118 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
> index 3ab663b6cf16..9d86720cb722 100644
> --- a/Documentation/networking/net_cachelines/net_device.rst
> +++ b/Documentation/networking/net_cachelines/net_device.rst
> @@ -183,5 +183,6 @@ struct_dpll_pin*                    dpll_pin
>  struct hlist_head                   page_pools
>  struct dim_irq_moder*               irq_moder
>  u64                                 max_pacing_offload_horizon
> +struct_napi_config*                 napi_config
>  unsigned_long                       gro_flush_timeout
>  u32                                 napi_defer_hard_irqs
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 4239a4a9d295..b65a901ab4e7 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -342,6 +342,15 @@ struct gro_list {
>   */
>  #define GRO_HASH_BUCKETS       8
>
> +/*
> + * Structure for per-NAPI config
> + */
> +struct napi_config {
> +       u64 gro_flush_timeout;
> +       u32 defer_hard_irqs;
> +       unsigned int napi_id;
> +};
> +
>  /*
>   * Structure for NAPI scheduling similar to tasklet but with weighting
>   */
> @@ -379,6 +388,8 @@ struct napi_struct {
>         struct list_head        dev_list;
>         struct hlist_node       napi_hash_node;
>         int                     irq;
> +       int                     index;
> +       struct napi_config      *config;
>  };
>
>  enum {
> @@ -1860,9 +1871,6 @@ enum netdev_reg_state {
>   *                             allocated at register_netdev() time
>   *     @real_num_rx_queues:    Number of RX queues currently active in device
>   *     @xdp_prog:              XDP sockets filter program pointer
> - *     @gro_flush_timeout:     timeout for GRO layer in NAPI
> - *     @napi_defer_hard_irqs:  If not zero, provides a counter that would
> - *                             allow to avoid NIC hard IRQ, on busy queues.
>   *
>   *     @rx_handler:            handler for received packets
>   *     @rx_handler_data:       XXX: need comments on this one
> @@ -2012,6 +2020,11 @@ enum netdev_reg_state {
>   *                where the clock is recovered.
>   *
>   *     @max_pacing_offload_horizon: max EDT offload horizon in nsec.
> + *     @napi_config: An array of napi_config structures containing per-NAPI
> + *                   settings.
> + *     @gro_flush_timeout:     timeout for GRO layer in NAPI
> + *     @napi_defer_hard_irqs:  If not zero, provides a counter that would
> + *                             allow to avoid NIC hard IRQ, on busy queues.
>   *
>   *     FIXME: cleanup struct net_device such that network protocol info
>   *     moves out.
> @@ -2405,6 +2418,7 @@ struct net_device {
>         struct dim_irq_moder    *irq_moder;
>
>         u64                     max_pacing_offload_horizon;
> +       struct napi_config      *napi_config;
>         unsigned long           gro_flush_timeout;
>         u32                     napi_defer_hard_irqs;
>
> @@ -2657,6 +2671,22 @@ netif_napi_add_tx_weight(struct net_device *dev,
>         netif_napi_add_weight(dev, napi, poll, weight);
>  }
>
> +/**
> + * netif_napi_add_config - initialize a NAPI context with persistent config
> + * @dev: network device
> + * @napi: NAPI context
> + * @poll: polling function
> + * @index: the NAPI index
> + */
> +static inline void
> +netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
> +                     int (*poll)(struct napi_struct *, int), int index)
> +{
> +       napi->index = index;
> +       napi->config = &dev->napi_config[index];
> +       netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
> +}
> +
>  /**
>   * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
>   * @dev:  network device
> diff --git a/net/core/dev.c b/net/core/dev.c
> index fca2295f4d95..bd87232f7b37 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6503,6 +6503,22 @@ EXPORT_SYMBOL(napi_busy_loop);
>
>  #endif /* CONFIG_NET_RX_BUSY_POLL */
>
> +static void __napi_hash_add_with_id(struct napi_struct *napi,
> +                                   unsigned int napi_id)
> +{
> +       napi->napi_id = napi_id;
> +       hlist_add_head_rcu(&napi->napi_hash_node,
> +                          &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> +}
> +
> +static void napi_hash_add_with_id(struct napi_struct *napi,
> +                                 unsigned int napi_id)
> +{
> +       spin_lock(&napi_hash_lock);
> +       __napi_hash_add_with_id(napi, napi_id);

Hmmm... there is no check if 'napi_id' is already used and hashed.

I would add

WARN_ON_ONCE(napi_by_id(napi_id));

> +       spin_unlock(&napi_hash_lock);
> +}
> +
>  static void napi_hash_add(struct napi_struct *napi)
>  {
>         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
> @@ -6515,10 +6531,8 @@ static void napi_hash_add(struct napi_struct *napi)
>                 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
>                         napi_gen_id = MIN_NAPI_ID;
>         } while (napi_by_id(napi_gen_id));
> -       napi->napi_id = napi_gen_id;
>
> -       hlist_add_head_rcu(&napi->napi_hash_node,
> -                          &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
> +       __napi_hash_add_with_id(napi, napi_gen_id);
>
>         spin_unlock(&napi_hash_lock);
>  }
>
Jakub Kicinski Oct. 10, 2024, 4:08 p.m. UTC | #2
On Thu, 10 Oct 2024 08:59:29 -0700 Joe Damato wrote:
> Jakub: I hope that it's OK if I retain your Reviewed-by tag as the
> change Eric suggested is a minor one?

Yes
diff mbox series

Patch

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 3ab663b6cf16..9d86720cb722 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -183,5 +183,6 @@  struct_dpll_pin*                    dpll_pin
 struct hlist_head                   page_pools
 struct dim_irq_moder*               irq_moder
 u64                                 max_pacing_offload_horizon
+struct_napi_config*                 napi_config
 unsigned_long                       gro_flush_timeout
 u32                                 napi_defer_hard_irqs
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4239a4a9d295..b65a901ab4e7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,15 @@  struct gro_list {
  */
 #define GRO_HASH_BUCKETS	8
 
+/*
+ * Structure for per-NAPI config
+ */
+struct napi_config {
+	u64 gro_flush_timeout;
+	u32 defer_hard_irqs;
+	unsigned int napi_id;
+};
+
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
@@ -379,6 +388,8 @@  struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	int			irq;
+	int			index;
+	struct napi_config	*config;
 };
 
 enum {
@@ -1860,9 +1871,6 @@  enum netdev_reg_state {
  *				allocated at register_netdev() time
  *	@real_num_rx_queues: 	Number of RX queues currently active in device
  *	@xdp_prog:		XDP sockets filter program pointer
- *	@gro_flush_timeout:	timeout for GRO layer in NAPI
- *	@napi_defer_hard_irqs:	If not zero, provides a counter that would
- *				allow to avoid NIC hard IRQ, on busy queues.
  *
  *	@rx_handler:		handler for received packets
  *	@rx_handler_data: 	XXX: need comments on this one
@@ -2012,6 +2020,11 @@  enum netdev_reg_state {
  *		   where the clock is recovered.
  *
  *	@max_pacing_offload_horizon: max EDT offload horizon in nsec.
+ *	@napi_config: An array of napi_config structures containing per-NAPI
+ *		      settings.
+ *	@gro_flush_timeout:	timeout for GRO layer in NAPI
+ *	@napi_defer_hard_irqs:	If not zero, provides a counter that would
+ *				allow to avoid NIC hard IRQ, on busy queues.
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2405,6 +2418,7 @@  struct net_device {
 	struct dim_irq_moder	*irq_moder;
 
 	u64			max_pacing_offload_horizon;
+	struct napi_config	*napi_config;
 	unsigned long		gro_flush_timeout;
 	u32			napi_defer_hard_irqs;
 
@@ -2657,6 +2671,22 @@  netif_napi_add_tx_weight(struct net_device *dev,
 	netif_napi_add_weight(dev, napi, poll, weight);
 }
 
+/**
+ * netif_napi_add_config - initialize a NAPI context with persistent config
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @index: the NAPI index
+ */
+static inline void
+netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
+		      int (*poll)(struct napi_struct *, int), int index)
+{
+	napi->index = index;
+	napi->config = &dev->napi_config[index];
+	netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
+}
+
 /**
  * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
  * @dev:  network device
diff --git a/net/core/dev.c b/net/core/dev.c
index fca2295f4d95..bd87232f7b37 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6503,6 +6503,22 @@  EXPORT_SYMBOL(napi_busy_loop);
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+				    unsigned int napi_id)
+{
+	napi->napi_id = napi_id;
+	hlist_add_head_rcu(&napi->napi_hash_node,
+			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+				  unsigned int napi_id)
+{
+	spin_lock(&napi_hash_lock);
+	__napi_hash_add_with_id(napi, napi_id);
+	spin_unlock(&napi_hash_lock);
+}
+
 static void napi_hash_add(struct napi_struct *napi)
 {
 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
@@ -6515,10 +6531,8 @@  static void napi_hash_add(struct napi_struct *napi)
 		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 			napi_gen_id = MIN_NAPI_ID;
 	} while (napi_by_id(napi_gen_id));
-	napi->napi_id = napi_gen_id;
 
-	hlist_add_head_rcu(&napi->napi_hash_node,
-			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+	__napi_hash_add_with_id(napi, napi_gen_id);
 
 	spin_unlock(&napi_hash_lock);
 }
@@ -6641,6 +6655,28 @@  void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 }
 EXPORT_SYMBOL(netif_queue_set_napi);
 
+static void napi_restore_config(struct napi_struct *n)
+{
+	n->defer_hard_irqs = n->config->defer_hard_irqs;
+	n->gro_flush_timeout = n->config->gro_flush_timeout;
+	/* a NAPI ID might be stored in the config, if so use it. if not, use
+	 * napi_hash_add to generate one for us. It will be saved to the config
+	 * in napi_disable.
+	 */
+	if (n->config->napi_id)
+		napi_hash_add_with_id(n, n->config->napi_id);
+	else
+		napi_hash_add(n);
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+	n->config->defer_hard_irqs = n->defer_hard_irqs;
+	n->config->gro_flush_timeout = n->gro_flush_timeout;
+	n->config->napi_id = n->napi_id;
+	napi_hash_del(n);
+}
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -6651,8 +6687,6 @@  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	INIT_HLIST_NODE(&napi->napi_hash_node);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
-	napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
-	napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
 	init_gro_hash(napi);
 	napi->skb = NULL;
 	INIT_LIST_HEAD(&napi->rx_list);
@@ -6670,7 +6704,13 @@  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
-	napi_hash_add(napi);
+
+	/* default settings from sysfs are applied to all NAPIs. any per-NAPI
+	 * configuration will be loaded in napi_enable
+	 */
+	napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+	napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
 	napi_get_frags_check(napi);
 	/* Create kthread for this napi if dev->threaded is set.
 	 * Clear dev->threaded if kthread creation failed so that
@@ -6702,6 +6742,11 @@  void napi_disable(struct napi_struct *n)
 
 	hrtimer_cancel(&n->timer);
 
+	if (n->config)
+		napi_save_config(n);
+	else
+		napi_hash_del(n);
+
 	clear_bit(NAPI_STATE_DISABLE, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
@@ -6717,6 +6762,11 @@  void napi_enable(struct napi_struct *n)
 {
 	unsigned long new, val = READ_ONCE(n->state);
 
+	if (n->config)
+		napi_restore_config(n);
+	else
+		napi_hash_add(n);
+
 	do {
 		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
 
@@ -6746,7 +6796,11 @@  void __netif_napi_del(struct napi_struct *napi)
 	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 		return;
 
-	napi_hash_del(napi);
+	if (napi->config) {
+		napi->index = -1;
+		napi->config = NULL;
+	}
+
 	list_del_rcu(&napi->dev_list);
 	napi_free_frags(napi);
 
@@ -11083,6 +11137,8 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		unsigned int txqs, unsigned int rxqs)
 {
 	struct net_device *dev;
+	size_t napi_config_sz;
+	unsigned int maxqs;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
@@ -11096,6 +11152,8 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
+	maxqs = max(txqs, rxqs);
+
 	dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
 		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 	if (!dev)
@@ -11170,6 +11228,11 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (!dev->ethtool)
 		goto free_all;
 
+	napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+	dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+	if (!dev->napi_config)
+		goto free_all;
+
 	strscpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;
 	dev->group = INIT_NETDEV_GROUP;
@@ -11231,6 +11294,8 @@  void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	kvfree(dev->napi_config);
+
 	ref_tracker_dir_exit(&dev->refcnt_tracker);
 #ifdef CONFIG_PCPU_DEV_REFCNT
 	free_percpu(dev->pcpu_refcnt);
diff --git a/net/core/dev.h b/net/core/dev.h
index 26e598aa56c3..f22cb532de7a 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -167,11 +167,17 @@  static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
 static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
 					      u32 defer)
 {
+	unsigned int count = max(netdev->num_rx_queues,
+				 netdev->num_tx_queues);
 	struct napi_struct *napi;
+	int i;
 
 	WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
 	list_for_each_entry(napi, &netdev->napi_list, dev_list)
 		napi_set_defer_hard_irqs(napi, defer);
+
+	for (i = 0; i < count; i++)
+		netdev->napi_config[i].defer_hard_irqs = defer;
 }
 
 /**
@@ -207,11 +213,17 @@  static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
 static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
 						unsigned long timeout)
 {
+	unsigned int count = max(netdev->num_rx_queues,
+				 netdev->num_tx_queues);
 	struct napi_struct *napi;
+	int i;
 
 	WRITE_ONCE(netdev->gro_flush_timeout, timeout);
 	list_for_each_entry(napi, &netdev->napi_list, dev_list)
 		napi_set_gro_flush_timeout(napi, timeout);
+
+	for (i = 0; i < count; i++)
+		netdev->napi_config[i].gro_flush_timeout = timeout;
 }
 
 int rps_cpumask_housekeeping(struct cpumask *mask);