
[v6,net-next,08/15] net: softnet_data: Make xmit.recursion per task.

Message ID 20240612170303.3896084-9-bigeasy@linutronix.de (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series locking: Introduce nested-BH locking.

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 15193 this patch: 15193
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 15 of 15 maintainers
netdev/build_clang success Errors and warnings before: 2034 this patch: 2034
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 16334 this patch: 16334
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 69 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 22 this patch: 22
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-06-13--18-00 (tests: 647)

Commit Message

Sebastian Andrzej Siewior June 12, 2024, 4:44 p.m. UTC
Softirq is preemptible on PREEMPT_RT. Without a per-CPU lock in
local_bh_disable() there is no guarantee that only one device is
transmitting at a time.
With preemption and multiple senders it is possible that the per-CPU
recursion counter gets incremented by different threads and exceeds
XMIT_RECURSION_LIMIT leading to a false positive recursion alert.

Instead of adding a lock to protect the per-CPU variable, it is simpler
to make the counter per-task. Sending and receiving skbs always happens
in thread context anyway.

Having a lock to protect the per-CPU counter would needlessly block/
serialize two sending threads. It would also require a recursive lock so
that the owner can increment the counter further.

Make the recursion counter a task_struct member on PREEMPT_RT.

Cc: Ben Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/netdevice.h | 11 +++++++++++
 include/linux/sched.h     |  4 +++-
 net/core/dev.h            | 20 ++++++++++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)
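
For context, a minimal sketch of how these helpers are meant to bracket a
transmit; the wrapper function name below is illustrative and not part of
the patch (the real caller is __dev_queue_xmit(), with more locking and
error handling around it):

/* Illustrative only: a virtual device that loops packets back into the
 * stack (e.g. a misconfigured tunnel) is caught after XMIT_RECURSION_LIMIT
 * nested attempts instead of recursing until the stack overflows.
 */
static int xmit_one_guarded(struct sk_buff *skb, struct net_device *dev,
			    struct netdev_queue *txq)
{
	int rc = -ENETDOWN;

	if (dev_xmit_recursion())
		return rc;	/* recursion limit hit: caller drops the skb */

	dev_xmit_recursion_inc();
	skb = dev_hard_start_xmit(skb, dev, txq, &rc);
	dev_xmit_recursion_dec();

	return rc;
}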

Comments

Steven Rostedt June 12, 2024, 5:18 p.m. UTC | #1
On Wed, 12 Jun 2024 18:44:34 +0200
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> Softirq is preemptible on PREEMPT_RT. Without a per-CPU lock in
> local_bh_disable() there is no guarantee that only one device is
> transmitting at a time.
> With preemption and multiple senders it is possible that the per-CPU
> recursion counter gets incremented by different threads and exceeds
> XMIT_RECURSION_LIMIT leading to a false positive recursion alert.
> 
> Instead of adding a lock to protect the per-CPU variable, it is simpler
> to make the counter per-task. Sending and receiving skbs always happens
> in thread context anyway.
> 
> Having a lock to protect the per-CPU counter would needlessly block/
> serialize two sending threads. It would also require a recursive lock so
> that the owner can increment the counter further.
> 
> Make the recursion counter a task_struct member on PREEMPT_RT.

I'm curious what the harm would be in using a per_task counter
instead of per_cpu outside of PREEMPT_RT. That way, we wouldn't need
the #ifdef.

-- Steve


> 
> Cc: Ben Segall <bsegall@google.com>
> Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
> Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Cc: Juri Lelli <juri.lelli@redhat.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Valentin Schneider <vschneid@redhat.com>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  include/linux/netdevice.h | 11 +++++++++++
>  include/linux/sched.h     |  4 +++-
>  net/core/dev.h            | 20 ++++++++++++++++++++
>  3 files changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index d20c6c99eb887..b5ec072ec2430 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -3223,7 +3223,9 @@ struct softnet_data {
>  #endif
>  	/* written and read only by owning cpu: */
>  	struct {
> +#ifndef CONFIG_PREEMPT_RT
>  		u16 recursion;
> +#endif
>  		u8  more;
>  #ifdef CONFIG_NET_EGRESS
>  		u8  skip_txqueue;
> @@ -3256,10 +3258,19 @@ struct softnet_data {
>  
>  DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
>  
> +#ifdef CONFIG_PREEMPT_RT
> +static inline int dev_recursion_level(void)
> +{
> +	return current->net_xmit_recursion;
> +}
> +
> +#else
> +
>  static inline int dev_recursion_level(void)
>  {
>  	return this_cpu_read(softnet_data.xmit.recursion);
>  }
> +#endif
>  
>  void __netif_schedule(struct Qdisc *q);
>  void netif_schedule_queue(struct netdev_queue *txq);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 61591ac6eab6d..a9b0ca72db55f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -975,7 +975,9 @@ struct task_struct {
>  	/* delay due to memory thrashing */
>  	unsigned                        in_thrashing:1;
>  #endif
> -
> +#ifdef CONFIG_PREEMPT_RT
> +	u8				net_xmit_recursion;
> +#endif
>  	unsigned long			atomic_flags; /* Flags requiring atomic access. */
>  
>  	struct restart_block		restart_block;
> diff --git a/net/core/dev.h b/net/core/dev.h
> index b7b518bc2be55..2f96d63053ad0 100644
> --- a/net/core/dev.h
> +++ b/net/core/dev.h
> @@ -150,6 +150,25 @@ struct napi_struct *napi_by_id(unsigned int napi_id);
>  void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
>  
>  #define XMIT_RECURSION_LIMIT	8
> +
> +#ifdef CONFIG_PREEMPT_RT
> +static inline bool dev_xmit_recursion(void)
> +{
> +	return unlikely(current->net_xmit_recursion > XMIT_RECURSION_LIMIT);
> +}
> +
> +static inline void dev_xmit_recursion_inc(void)
> +{
> +	current->net_xmit_recursion++;
> +}
> +
> +static inline void dev_xmit_recursion_dec(void)
> +{
> +	current->net_xmit_recursion--;
> +}
> +
> +#else
> +
>  static inline bool dev_xmit_recursion(void)
>  {
>  	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
> @@ -165,5 +184,6 @@ static inline void dev_xmit_recursion_dec(void)
>  {
>  	__this_cpu_dec(softnet_data.xmit.recursion);
>  }
> +#endif
>  
>  #endif
Sebastian Andrzej Siewior June 14, 2024, 8:27 a.m. UTC | #2
On 2024-06-12 13:18:29 [-0400], Steven Rostedt wrote:
> On Wed, 12 Jun 2024 18:44:34 +0200
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> > Softirq is preemptible on PREEMPT_RT. Without a per-CPU lock in
> > local_bh_disable() there is no guarantee that only one device is
> > transmitting at a time.
> > With preemption and multiple senders it is possible that the per-CPU
> > recursion counter gets incremented by different threads and exceeds
> > XMIT_RECURSION_LIMIT leading to a false positive recursion alert.
> > 
> > Instead of adding a lock to protect the per-CPU variable, it is simpler
> > to make the counter per-task. Sending and receiving skbs always happens
> > in thread context anyway.
> > 
> > Having a lock to protect the per-CPU counter would needlessly block/
> > serialize two sending threads. It would also require a recursive lock so
> > that the owner can increment the counter further.
> > 
> > Make the recursion counter a task_struct member on PREEMPT_RT.
> 
> I'm curious what the harm would be in using a per_task counter
> instead of per_cpu outside of PREEMPT_RT. That way, we wouldn't need
> the #ifdef.

There should be a hole on !RT, too, so we shouldn't gain weight. The
limit is set to 8, so a u8 would be enough. The counter is only accessed
with BH disabled, so it will only be used in one context since it can't
schedule().

I think it should work fine. netdev folks, do you want me to remove that
ifdef and use a per-task counter unconditionally?

> -- Steve

Sebastian
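
For illustration, a minimal sketch of the ifdef-less variant being discussed
here, assuming the counter always lives in task_struct; this is not part of
the posted series:

/* Hypothetical: net_xmit_recursion kept in task_struct unconditionally,
 * so no CONFIG_PREEMPT_RT conditional is needed in the helpers.
 */
static inline bool dev_xmit_recursion(void)
{
	return unlikely(current->net_xmit_recursion > XMIT_RECURSION_LIMIT);
}

static inline void dev_xmit_recursion_inc(void)
{
	current->net_xmit_recursion++;
}

static inline void dev_xmit_recursion_dec(void)
{
	current->net_xmit_recursion--;
}
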
Eric Dumazet June 14, 2024, 8:38 a.m. UTC | #3
On Fri, Jun 14, 2024 at 10:28 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2024-06-12 13:18:29 [-0400], Steven Rostedt wrote:
> > On Wed, 12 Jun 2024 18:44:34 +0200
> > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> >
> > > Softirq is preemptible on PREEMPT_RT. Without a per-CPU lock in
> > > local_bh_disable() there is no guarantee that only one device is
> > > transmitting at a time.
> > > With preemption and multiple senders it is possible that the per-CPU
> > > recursion counter gets incremented by different threads and exceeds
> > > XMIT_RECURSION_LIMIT leading to a false positive recursion alert.
> > >
> > > Instead of adding a lock to protect the per-CPU variable, it is simpler
> > > to make the counter per-task. Sending and receiving skbs always happens
> > > in thread context anyway.
> > >
> > > Having a lock to protect the per-CPU counter would needlessly block/
> > > serialize two sending threads. It would also require a recursive lock so
> > > that the owner can increment the counter further.
> > >
> > > Make the recursion counter a task_struct member on PREEMPT_RT.
> >
> > I'm curious what the harm would be in using a per_task counter
> > instead of per_cpu outside of PREEMPT_RT. That way, we wouldn't need
> > the #ifdef.
>
> There should be a hole on !RT, too, so we shouldn't gain weight. The
> limit is set to 8, so a u8 would be enough. The counter is only accessed
> with BH disabled, so it will only be used in one context since it can't
> schedule().
>
> I think it should work fine. netdev folks, do you want me to remove that
> ifdef and use a per-task counter unconditionally?

It depends if this adds another cache line miss/dirtying or not.

What about other fields from softnet_data.xmit ?
Sebastian Andrzej Siewior June 14, 2024, 9:48 a.m. UTC | #4
On 2024-06-14 10:38:15 [+0200], Eric Dumazet wrote:
> > I think it should work fine. netdev folks, do you want me to remove that
> > ifdef and use a per-task counter unconditionally?
> 
> It depends if this adds another cache line miss/dirtying or not.
> 
> What about other fields from softnet_data.xmit ?

duh. Looking at the `more' member I realise that this needs to move to
task_struct on RT, too. Therefore I would move the whole xmit struct.

The xmit cacheline starts within the previous member (xfrm_backlog) and
ends before the following member starts. So it kind of has its own
cacheline.
With defconfig, if we move it to the front of task struct then we go from

| struct task_struct {
|         struct thread_info         thread_info;          /*     0    24 */
|         unsigned int               __state;              /*    24     4 */
|         unsigned int               saved_state;          /*    28     4 */
|         void *                     stack;                /*    32     8 */
|         refcount_t                 usage;                /*    40     4 */
|         unsigned int               flags;                /*    44     4 */
|         unsigned int               ptrace;               /*    48     4 */
|         int                        on_cpu;               /*    52     4 */
|         struct __call_single_node  wake_entry;           /*    56    16 */
|         /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
|         unsigned int               wakee_flips;          /*    72     4 */
| 
|         /* XXX 4 bytes hole, try to pack */
| 
|         long unsigned int          wakee_flip_decay_ts;  /*    80     8 */

to

| struct task_struct {
|         struct thread_info         thread_info;          /*     0    24 */
|         unsigned int               __state;              /*    24     4 */
|         unsigned int               saved_state;          /*    28     4 */
|         void *                     stack;                /*    32     8 */
|         refcount_t                 usage;                /*    40     4 */
|         unsigned int               flags;                /*    44     4 */
|         unsigned int               ptrace;               /*    48     4 */
|         struct {
|                 u16                recursion;            /*    52     2 */
|                 u8                 more;                 /*    54     1 */
|                 u8                 skip_txqueue;         /*    55     1 */
|         } xmit;                                          /*    52     4 */
|         struct __call_single_node  wake_entry;           /*    56    16 */
|         /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
|         int                        on_cpu;               /*    72     4 */
|         unsigned int               wakee_flips;          /*    76     4 */
|         long unsigned int          wakee_flip_decay_ts;  /*    80     8 */


Adding `xmit' and moving `on_cpu' stuffed a hole, so in the end the
total size of task_struct remained the same.
The cache line should be hot due to the `flags' usage in

| static void handle_softirqs(bool ksirqd)
| {
|          unsigned long old_flags = current->flags;
…
|         current->flags &= ~PF_MEMALLOC;

Then there is a bit of code before net_XX_action() and before either of
the members is used, so I'm not sure whether the cache line is gone again
by then…

Is this what we want or not?

Sebastian
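
A rough sketch of the layout change being discussed, assuming the whole xmit
struct moves into task_struct next to the hot fields; the member name and
exact placement are illustrative, not taken from a posted patch:

struct task_struct {
	/* ... */
	unsigned int			flags;
	unsigned int			ptrace;
	/* Per-task copy of softnet_data.xmit, filling the 4-byte slot
	 * freed by moving on_cpu further down (placement illustrative).
	 */
	struct {
		u16			recursion;
		u8			more;
#ifdef CONFIG_NET_EGRESS
		u8			skip_txqueue;
#endif
	} net_xmit;
	struct __call_single_node	wake_entry;
	int				on_cpu;
	/* ... */
};
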
Paolo Abeni June 14, 2024, 2:08 p.m. UTC | #5
On Fri, 2024-06-14 at 11:48 +0200, Sebastian Andrzej Siewior wrote:
> On 2024-06-14 10:38:15 [+0200], Eric Dumazet wrote:
> > > I think it should work fine. netdev folks, do you want me to remove that
> > > ifdef and use a per-task counter unconditionally?
> > 
> > It depends if this adds another cache line miss/dirtying or not.
> > 
> > What about other fields from softnet_data.xmit ?
> 
> duh. Looking at the `more' member I realise that this needs to move to
> task_struct on RT, too. Therefore I would move the whole xmit struct.
> 
> The xmit cacheline starts within the previous member (xfrm_backlog) and
> ends before the following member starts. So it kind of has its own
> cacheline.
> With defconfig, if we move it to the front of task struct then we go from
> 
> > struct task_struct {
> >         struct thread_info         thread_info;          /*     0    24 */
> >         unsigned int               __state;              /*    24     4 */
> >         unsigned int               saved_state;          /*    28     4 */
> >         void *                     stack;                /*    32     8 */
> >         refcount_t                 usage;                /*    40     4 */
> >         unsigned int               flags;                /*    44     4 */
> >         unsigned int               ptrace;               /*    48     4 */
> >         int                        on_cpu;               /*    52     4 */
> >         struct __call_single_node  wake_entry;           /*    56    16 */
> >         /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
> >         unsigned int               wakee_flips;          /*    72     4 */
> > 
> >         /* XXX 4 bytes hole, try to pack */
> > 
> >         long unsigned int          wakee_flip_decay_ts;  /*    80     8 */
> 
> to
> 
> > struct task_struct {
> >         struct thread_info         thread_info;          /*     0    24 */
> >         unsigned int               __state;              /*    24     4 */
> >         unsigned int               saved_state;          /*    28     4 */
> >         void *                     stack;                /*    32     8 */
> >         refcount_t                 usage;                /*    40     4 */
> >         unsigned int               flags;                /*    44     4 */
> >         unsigned int               ptrace;               /*    48     4 */
> >         struct {
> >                 u16                recursion;            /*    52     2 */
> >                 u8                 more;                 /*    54     1 */
> >                 u8                 skip_txqueue;         /*    55     1 */
> >         } xmit;                                          /*    52     4 */
> >         struct __call_single_node  wake_entry;           /*    56    16 */
> >         /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
> >         int                        on_cpu;               /*    72     4 */
> >         unsigned int               wakee_flips;          /*    76     4 */
> >         long unsigned int          wakee_flip_decay_ts;  /*    80     8 */
> 
> 
> Adding `xmit' and moving `on_cpu' stuffed a hole, so in the end the
> total size of task_struct remained the same.
> The cache line should be hot due to the `flags' usage in
> 
> > static void handle_softirqs(bool ksirqd)
> > {
> >          unsigned long old_flags = current->flags;
> …
> >         current->flags &= ~PF_MEMALLOC;
> 
> Then there is a bit of code before net_XX_action() and before either of
> the members is used, so I'm not sure whether the cache line is gone again
> by then…
> 
> Is this what we want or not?

I personally think (fear, mostly) there is still the potential for some
(performance) regression. I think it would be safer to introduce this
change under a compiler conditional and eventually follow up with a
patch making the code generic.

Should such a later change prove to be problematic, we could revert it
without impacting the series as a whole.

Thanks!

Paolo
Steven Rostedt June 14, 2024, 4:04 p.m. UTC | #6
On Fri, 14 Jun 2024 16:08:42 +0200
Paolo Abeni <pabeni@redhat.com> wrote:

> > Is this what we want or not?  
> 
> I personally think (fear, mostly) there is still the potential for some
> (performance) regression. I think it would be safer to introduce this
> change under a compiler conditional and eventually follow up with a
> patch making the code generic.
> 
> Should such a later change prove to be problematic, we could revert it
> without impacting the series as a whole.

That makes sense to me.

-- Steve
Sebastian Andrzej Siewior June 14, 2024, 4:04 p.m. UTC | #7
On 2024-06-14 16:08:42 [+0200], Paolo Abeni wrote:
> 
> I personally think (fear, mostly) there is still the potential for some
> (performance) regression. I think it would be safer to introduce this
> change under a compiler conditional and eventually follow up with a
> patch making the code generic.
> 
> Should such a later change prove to be problematic, we could revert it
> without impacting the series as a whole.

Sounds reasonable. In that case let me stick with "v6.5" of this patch
(as just posted, due to the `more' member) and then I could introduce an
option for !RT so this can be tested widely.

> Thanks!
> 
> Paolo

Sebastian
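
For illustration, a sketch of how such an option could gate the helper
selection; the config symbol name is purely illustrative and no such knob
has been posted:

/* Hypothetical follow-up: let !RT opt in to the per-task counter via a
 * config knob instead of keying the choice on CONFIG_PREEMPT_RT alone.
 */
#if defined(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_NET_XMIT_RECURSION_IN_TASK)
static inline int dev_recursion_level(void)
{
	return current->net_xmit_recursion;
}
#else
static inline int dev_recursion_level(void)
{
	return this_cpu_read(softnet_data.xmit.recursion);
}
#endif
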

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d20c6c99eb887..b5ec072ec2430 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3223,7 +3223,9 @@  struct softnet_data {
 #endif
 	/* written and read only by owning cpu: */
 	struct {
+#ifndef CONFIG_PREEMPT_RT
 		u16 recursion;
+#endif
 		u8  more;
 #ifdef CONFIG_NET_EGRESS
 		u8  skip_txqueue;
@@ -3256,10 +3258,19 @@  struct softnet_data {
 
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
+#ifdef CONFIG_PREEMPT_RT
+static inline int dev_recursion_level(void)
+{
+	return current->net_xmit_recursion;
+}
+
+#else
+
 static inline int dev_recursion_level(void)
 {
 	return this_cpu_read(softnet_data.xmit.recursion);
 }
+#endif
 
 void __netif_schedule(struct Qdisc *q);
 void netif_schedule_queue(struct netdev_queue *txq);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 61591ac6eab6d..a9b0ca72db55f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -975,7 +975,9 @@  struct task_struct {
 	/* delay due to memory thrashing */
 	unsigned                        in_thrashing:1;
 #endif
-
+#ifdef CONFIG_PREEMPT_RT
+	u8				net_xmit_recursion;
+#endif
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
 	struct restart_block		restart_block;
diff --git a/net/core/dev.h b/net/core/dev.h
index b7b518bc2be55..2f96d63053ad0 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -150,6 +150,25 @@  struct napi_struct *napi_by_id(unsigned int napi_id);
 void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 
 #define XMIT_RECURSION_LIMIT	8
+
+#ifdef CONFIG_PREEMPT_RT
+static inline bool dev_xmit_recursion(void)
+{
+	return unlikely(current->net_xmit_recursion > XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+	current->net_xmit_recursion++;
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+	current->net_xmit_recursion--;
+}
+
+#else
+
 static inline bool dev_xmit_recursion(void)
 {
 	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
@@ -165,5 +184,6 @@  static inline void dev_xmit_recursion_dec(void)
 {
 	__this_cpu_dec(softnet_data.xmit.recursion);
 }
+#endif
 
 #endif