diff mbox series

[net-next,1/6] net: Add sysfs parameter irq_suspend_timeout

Message ID 20240823173103.94978-2-jdamato@fastly.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series Suspend IRQs during application busy periods | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 54 this patch: 54
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 4 of 4 maintainers
netdev/build_clang success Errors and warnings before: 100 this patch: 100
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4064 this patch: 4064
netdev/checkpatch warning CHECK: Please use a blank line after function/struct/union/enum declarations WARNING: line length of 98 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 101 this patch: 101
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-08-25--21-00 (tests: 714)

Commit Message

Joe Damato Aug. 23, 2024, 5:30 p.m. UTC
From: Martin Karsten <mkarsten@uwaterloo.ca>

This patch doesn't change any behavior but prepares the code for other
changes in the following commits which use irq_suspend_timeout as a
timeout for IRQ suspension.

Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Co-developed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Tested-by: Joe Damato <jdamato@fastly.com>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
---
 rfc -> v1:
   - Removed napi.rst documentation from this patch; added to patch 6.

 include/linux/netdevice.h |  2 ++
 net/core/dev.c            |  3 ++-
 net/core/net-sysfs.c      | 18 ++++++++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

Comments

Eric Dumazet Aug. 23, 2024, 5:39 p.m. UTC | #1
On Fri, Aug 23, 2024 at 7:31 PM Joe Damato <jdamato@fastly.com> wrote:
>
> From: Martin Karsten <mkarsten@uwaterloo.ca>
>
> This patch doesn't change any behavior but prepares the code for other
> changes in the following commits which use irq_suspend_timeout as a
> timeout for IRQ suspension.
>
> Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> Co-developed-by: Joe Damato <jdamato@fastly.com>
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> Tested-by: Joe Damato <jdamato@fastly.com>
> Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
> ---
>  rfc -> v1:
>    - Removed napi.rst documentation from this patch; added to patch 6.
>
>  include/linux/netdevice.h |  2 ++
>  net/core/dev.c            |  3 ++-
>  net/core/net-sysfs.c      | 18 ++++++++++++++++++
>  3 files changed, 22 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 0ef3eaa23f4b..31867bb2ff65 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1857,6 +1857,7 @@ enum netdev_reg_state {
>   *     @gro_flush_timeout:     timeout for GRO layer in NAPI
>   *     @napi_defer_hard_irqs:  If not zero, provides a counter that would
>   *                             allow to avoid NIC hard IRQ, on busy queues.
> + *     @irq_suspend_timeout:   IRQ suspension timeout
>   *
>   *     @rx_handler:            handler for received packets
>   *     @rx_handler_data:       XXX: need comments on this one
> @@ -2060,6 +2061,7 @@ struct net_device {
>         struct netdev_rx_queue  *_rx;
>         unsigned long           gro_flush_timeout;
>         int                     napi_defer_hard_irqs;
> +       unsigned long           irq_suspend_timeout;
>         unsigned int            gro_max_size;
>         unsigned int            gro_ipv4_max_size;
>         rx_handler_func_t __rcu *rx_handler;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index e7260889d4cb..3bf325ec25a3 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -11945,6 +11945,7 @@ static void __init net_dev_struct_check(void)
>         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
>         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
>         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
> +       CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, irq_suspend_timeout);
>         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
>         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
>         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
> @@ -11956,7 +11957,7 @@ static void __init net_dev_struct_check(void)
>  #ifdef CONFIG_NET_XGRESS
>         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
>  #endif
> -       CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
> +       CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 112);
>  }
>
>  /*
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 0e2084ce7b75..fb6f3327310f 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -440,6 +440,23 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev,
>  }
>  NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
>
> +static int change_irq_suspend_timeout(struct net_device *dev, unsigned long val)
> +{
> +       WRITE_ONCE(dev->irq_suspend_timeout, val);
> +       return 0;
> +}
> +
> +static ssize_t irq_suspend_timeout_store(struct device *dev,
> +                                        struct device_attribute *attr,
> +                                        const char *buf, size_t len)
> +{
> +       if (!capable(CAP_NET_ADMIN))
> +               return -EPERM;
> +
> +       return netdev_store(dev, attr, buf, len, change_irq_suspend_timeout);
> +}
> +NETDEVICE_SHOW_RW(irq_suspend_timeout, fmt_ulong);
> +
>  static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
>                              const char *buf, size_t len)
>  {
> @@ -664,6 +681,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
>         &dev_attr_tx_queue_len.attr,
>         &dev_attr_gro_flush_timeout.attr,
>         &dev_attr_napi_defer_hard_irqs.attr,
> +       &dev_attr_irq_suspend_timeout.attr,
>         &dev_attr_phys_port_id.attr,
>         &dev_attr_phys_port_name.attr,
>         &dev_attr_phys_switch_id.attr,
> --
> 2.25.1


Please no more per-device sysfs entry, shared by all the users of the device.

Let's not repeat past mistakes.

Nowadays, we need/want per receive-queue tuning, preferably set with netlink.
Joe Damato Aug. 23, 2024, 8:15 p.m. UTC | #2
On Fri, Aug 23, 2024 at 07:39:56PM +0200, Eric Dumazet wrote:
> On Fri, Aug 23, 2024 at 7:31 PM Joe Damato <jdamato@fastly.com> wrote:
> >
> > From: Martin Karsten <mkarsten@uwaterloo.ca>
> >
> > This patch doesn't change any behavior but prepares the code for other
> > changes in the following commits which use irq_suspend_timeout as a
> > timeout for IRQ suspension.
> >
> > Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > Co-developed-by: Joe Damato <jdamato@fastly.com>
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > Tested-by: Joe Damato <jdamato@fastly.com>
> > Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
> > ---
> >  rfc -> v1:
> >    - Removed napi.rst documentation from this patch; added to patch 6.
> >
> >  include/linux/netdevice.h |  2 ++
> >  net/core/dev.c            |  3 ++-
> >  net/core/net-sysfs.c      | 18 ++++++++++++++++++
> >  3 files changed, 22 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 0ef3eaa23f4b..31867bb2ff65 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -1857,6 +1857,7 @@ enum netdev_reg_state {
> >   *     @gro_flush_timeout:     timeout for GRO layer in NAPI
> >   *     @napi_defer_hard_irqs:  If not zero, provides a counter that would
> >   *                             allow to avoid NIC hard IRQ, on busy queues.
> > + *     @irq_suspend_timeout:   IRQ suspension timeout
> >   *
> >   *     @rx_handler:            handler for received packets
> >   *     @rx_handler_data:       XXX: need comments on this one
> > @@ -2060,6 +2061,7 @@ struct net_device {
> >         struct netdev_rx_queue  *_rx;
> >         unsigned long           gro_flush_timeout;
> >         int                     napi_defer_hard_irqs;
> > +       unsigned long           irq_suspend_timeout;
> >         unsigned int            gro_max_size;
> >         unsigned int            gro_ipv4_max_size;
> >         rx_handler_func_t __rcu *rx_handler;
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index e7260889d4cb..3bf325ec25a3 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -11945,6 +11945,7 @@ static void __init net_dev_struct_check(void)
> >         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
> >         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
> >         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
> > +       CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, irq_suspend_timeout);
> >         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
> >         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
> >         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
> > @@ -11956,7 +11957,7 @@ static void __init net_dev_struct_check(void)
> >  #ifdef CONFIG_NET_XGRESS
> >         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
> >  #endif
> > -       CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
> > +       CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 112);
> >  }
> >
> >  /*
> > diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> > index 0e2084ce7b75..fb6f3327310f 100644
> > --- a/net/core/net-sysfs.c
> > +++ b/net/core/net-sysfs.c
> > @@ -440,6 +440,23 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev,
> >  }
> >  NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
> >
> > +static int change_irq_suspend_timeout(struct net_device *dev, unsigned long val)
> > +{
> > +       WRITE_ONCE(dev->irq_suspend_timeout, val);
> > +       return 0;
> > +}
> > +
> > +static ssize_t irq_suspend_timeout_store(struct device *dev,
> > +                                        struct device_attribute *attr,
> > +                                        const char *buf, size_t len)
> > +{
> > +       if (!capable(CAP_NET_ADMIN))
> > +               return -EPERM;
> > +
> > +       return netdev_store(dev, attr, buf, len, change_irq_suspend_timeout);
> > +}
> > +NETDEVICE_SHOW_RW(irq_suspend_timeout, fmt_ulong);
> > +
> >  static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
> >                              const char *buf, size_t len)
> >  {
> > @@ -664,6 +681,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
> >         &dev_attr_tx_queue_len.attr,
> >         &dev_attr_gro_flush_timeout.attr,
> >         &dev_attr_napi_defer_hard_irqs.attr,
> > +       &dev_attr_irq_suspend_timeout.attr,
> >         &dev_attr_phys_port_id.attr,
> >         &dev_attr_phys_port_name.attr,
> >         &dev_attr_phys_switch_id.attr,
> > --
> > 2.25.1
> 
> 
> Please no more per-device sysfs entry, shared by all the users of the device.
> 
> Let's not repeat past mistakes.
> 
> Nowadays, we need/want per receive-queue tuning, preferably set with netlink.

Thanks for the feedback, Eric. We appreciate your consideration
and guidance.

May we ask what your thoughts are, overall, about getting a
mechanism like this accepted?

We want to make sure that this, in principle, is acceptable before
iterating further and going down the path of netlink, if required.

On the specific netlink bit in your comment, we agree in principle,
however:

  1. Our code integrates directly with existing sysfs parameters.
     If we make our parameter settable via netlink, but the others
     remain as sysfs parameters then the interface for users
     becomes a bit cumbersome.

     And, so the urge will be to move all parameters to netlink
     for ease of use for the user.

     As we mentioned in our cover letter: we agree that doing so is
     a good idea, but we hope to convince you (and the other
     maintainers) that the netlink work can come later as a separate
     change which affects the existing parameters we integrate with
     as well as the parameter we are introducing, at the same time.

  2. The proposed mechanism yields a substantial performance and
     efficiency improvement which would be valuable to users. It would
     be unfortunate to block until all of this could be moved to
     netlink, first.

  3. While adding a new sysfs parameter does affect the ABI
     permanently, it doesn't prevent us from making these parameters
     per-NAPI in the future. This series adds strength to the argument
     that these parameters should be per-NAPI because our results show
     the impact these parameters can have on network processing very
     clearly in a range of scenarios.

We appreciate your thoughts on the above as we want to ensure we
are moving in the right direction.
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0ef3eaa23f4b..31867bb2ff65 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1857,6 +1857,7 @@  enum netdev_reg_state {
  *	@gro_flush_timeout:	timeout for GRO layer in NAPI
  *	@napi_defer_hard_irqs:	If not zero, provides a counter that would
  *				allow to avoid NIC hard IRQ, on busy queues.
+ *	@irq_suspend_timeout:	IRQ suspension timeout
  *
  *	@rx_handler:		handler for received packets
  *	@rx_handler_data: 	XXX: need comments on this one
@@ -2060,6 +2061,7 @@  struct net_device {
 	struct netdev_rx_queue	*_rx;
 	unsigned long		gro_flush_timeout;
 	int			napi_defer_hard_irqs;
+	unsigned long		irq_suspend_timeout;
 	unsigned int		gro_max_size;
 	unsigned int		gro_ipv4_max_size;
 	rx_handler_func_t __rcu	*rx_handler;
diff --git a/net/core/dev.c b/net/core/dev.c
index e7260889d4cb..3bf325ec25a3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11945,6 +11945,7 @@  static void __init net_dev_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, irq_suspend_timeout);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
@@ -11956,7 +11957,7 @@  static void __init net_dev_struct_check(void)
 #ifdef CONFIG_NET_XGRESS
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
 #endif
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 112);
 }
 
 /*
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 0e2084ce7b75..fb6f3327310f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -440,6 +440,23 @@  static ssize_t napi_defer_hard_irqs_store(struct device *dev,
 }
 NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
 
+static int change_irq_suspend_timeout(struct net_device *dev, unsigned long val)
+{
+	WRITE_ONCE(dev->irq_suspend_timeout, val);
+	return 0;
+}
+
+static ssize_t irq_suspend_timeout_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	return netdev_store(dev, attr, buf, len, change_irq_suspend_timeout);
+}
+NETDEVICE_SHOW_RW(irq_suspend_timeout, fmt_ulong);
+
 static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
 			     const char *buf, size_t len)
 {
@@ -664,6 +681,7 @@  static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_tx_queue_len.attr,
 	&dev_attr_gro_flush_timeout.attr,
 	&dev_attr_napi_defer_hard_irqs.attr,
+	&dev_attr_irq_suspend_timeout.attr,
 	&dev_attr_phys_port_id.attr,
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,