[net] net: Use this_cpu_inc() to increment net->core_stats

Message ID YmFjdOp+R5gVGZ7p@linutronix.de (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series [net] net: Use this_cpu_inc() to increment net->core_stats

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net, async
netdev/fixes_present fail Series targets non-next tree, but doesn't contain any Fixes tags
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 4827 this patch: 4963
netdev/cc_maintainers warning 1 maintainers not CCed: petrm@nvidia.com
netdev/build_clang success Errors and warnings before: 1074 this patch: 1074
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 4974 this patch: 5076
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns; WARNING: line length of 86 exceeds 80 columns; WARNING: line length of 87 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Sebastian Andrzej Siewior April 21, 2022, 2 p.m. UTC
The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes
netdev_core_stats_alloc() to return a per-CPU pointer.
netdev_core_stats_alloc() will allocate memory on its first invocation
which breaks on PREEMPT_RT because it requires non-atomic context for
memory allocation.

This can be avoided by enabling preemption in netdev_core_stats_alloc()
assuming the caller always disables preemption.

It might be better to replace local_inc() with this_cpu_inc() now that
dev_core_stats_##FIELD##_inc() gained a preempt-disable section and does
not rely on already disabled preemption. This results in fewer
instructions on x86-64:
local_inc:
|          incl %gs:__preempt_count(%rip)  # __preempt_count
|          movq    488(%rdi), %rax # _1->core_stats, _22
|          testq   %rax, %rax      # _22
|          je      .L585   #,
|          add %gs:this_cpu_off(%rip), %rax        # this_cpu_off, tcp_ptr__
|  .L586:
|          testq   %rax, %rax      # _27
|          je      .L587   #,
|          incq (%rax)            # _6->a.counter
|  .L587:
|          decl %gs:__preempt_count(%rip)  # __preempt_count

this_cpu_inc(), this patch:
|         movq    488(%rdi), %rax # _1->core_stats, _5
|         testq   %rax, %rax      # _5
|         je      .L591   #,
| .L585:
|         incq %gs:(%rax) # _18->rx_dropped

Use unsigned long as type for the counter. Use this_cpu_inc() to
increment the counter. Use a plain read of the counter.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/netdevice.h | 17 +++++++----------
 net/core/dev.c            | 14 +++++---------
 2 files changed, 12 insertions(+), 19 deletions(-)

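For readers comparing the two APIs, here is a condensed before/after sketch of the increment path (illustrative only; the real code is the dev_core_stats_##FIELD##_inc() macro in the patch below, with rx_dropped standing in for any of the three fields):

    /* Before: local_t needs a stable CPU, so the macro pins the task. */
    struct net_device_core_stats *p;

    preempt_disable();
    p = this_cpu_ptr(dev->core_stats);  /* per-CPU base -> this CPU's slot */
    local_inc(&p->rx_dropped);
    preempt_enable();

    /* After: this_cpu_inc() folds the per-CPU addressing and the increment
     * into a single %gs-relative instruction on x86-64, so the explicit
     * preemption toggling disappears.
     */
    struct net_device_core_stats __percpu *p = dev->core_stats;

    this_cpu_inc(p->rx_dropped);
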
Comments

Eric Dumazet April 21, 2022, 3:32 p.m. UTC | #1
On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes
> netdev_core_stats_alloc() to return a per-CPU pointer.
> netdev_core_stats_alloc() will allocate memory on its first invocation
> which breaks on PREEMPT_RT because it requires non-atomic context for
> memory allocation.

Can you elaborate on this, I am confused ?

You are saying that on PREEMPT_RT, we can not call
alloc_percpu_gfp(XXX, GFP_ATOMIC | __GFP_NOWARN);
under some contexts ?

preemption might be disabled by callers of net->core_stats anyways...


>
> This can be avoided by enabling preemption in netdev_core_stats_alloc()
> assuming the caller always disables preemption.
>
> It might be better to replace local_inc() with this_cpu_inc() now that
> dev_core_stats_##FIELD##_inc() gained a preempt-disable section and does
> not rely on already disabled preemption. This results in fewer
> instructions on x86-64:
> local_inc:
> |          incl %gs:__preempt_count(%rip)  # __preempt_count
> |          movq    488(%rdi), %rax # _1->core_stats, _22
> |          testq   %rax, %rax      # _22
> |          je      .L585   #,
> |          add %gs:this_cpu_off(%rip), %rax        # this_cpu_off, tcp_ptr__
> |  .L586:
> |          testq   %rax, %rax      # _27
> |          je      .L587   #,
> |          incq (%rax)            # _6->a.counter
> |  .L587:
> |          decl %gs:__preempt_count(%rip)  # __preempt_count
>
> this_cpu_inc(), this patch:
> |         movq    488(%rdi), %rax # _1->core_stats, _5
> |         testq   %rax, %rax      # _5
> |         je      .L591   #,
> | .L585:
> |         incq %gs:(%rax) # _18->rx_dropped
>
> Use unsigned long as type for the counter. Use this_cpu_inc() to
> increment the counter. Use a plain read of the counter.
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  include/linux/netdevice.h | 17 +++++++----------
>  net/core/dev.c            | 14 +++++---------
>  2 files changed, 12 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 59e27a2b7bf04..0009112b23767 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -199,10 +199,10 @@ struct net_device_stats {
>   * Try to fit them in a single cache line, for dev_get_stats() sake.
>   */
>  struct net_device_core_stats {
> -       local_t         rx_dropped;
> -       local_t         tx_dropped;
> -       local_t         rx_nohandler;
> -} __aligned(4 * sizeof(local_t));
> +       unsigned long   rx_dropped;
> +       unsigned long   tx_dropped;
> +       unsigned long   rx_nohandler;
> +} __aligned(4 * sizeof(unsigned long));
>
>  #include <linux/cache.h>
>  #include <linux/skbuff.h>
> @@ -3843,7 +3843,7 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
>         return false;
>  }
>
> -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev);
> +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev);
>
>  static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev)
>  {
> @@ -3851,7 +3851,7 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de
>         struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
>
>         if (likely(p))
> -               return this_cpu_ptr(p);
> +               return p;
>
>         return netdev_core_stats_alloc(dev);
>  }
> @@ -3861,12 +3861,9 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)          \
>  {                                                                              \
>         struct net_device_core_stats *p;                                        \
>                                                                                 \
> -       preempt_disable();                                                      \
>         p = dev_core_stats(dev);                                                \
> -                                                                               \
>         if (p)                                                                  \
> -               local_inc(&p->FIELD);                                           \
> -       preempt_enable();                                                       \
> +               this_cpu_inc(p->FIELD);                                         \
>  }
>  DEV_CORE_STATS_INC(rx_dropped)
>  DEV_CORE_STATS_INC(tx_dropped)
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 9ec51c1d77cf4..bf6026158874e 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -10309,7 +10309,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
>  }
>  EXPORT_SYMBOL(netdev_stats_to_stats64);
>
> -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
> +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
>  {
>         struct net_device_core_stats __percpu *p;
>
> @@ -10320,11 +10320,7 @@ struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
>                 free_percpu(p);
>
>         /* This READ_ONCE() pairs with the cmpxchg() above */
> -       p = READ_ONCE(dev->core_stats);
> -       if (!p)
> -               return NULL;
> -
> -       return this_cpu_ptr(p);
> +       return READ_ONCE(dev->core_stats);
>  }
>  EXPORT_SYMBOL(netdev_core_stats_alloc);
>
> @@ -10361,9 +10357,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
>
>                 for_each_possible_cpu(i) {
>                         core_stats = per_cpu_ptr(p, i);
> -                       storage->rx_dropped += local_read(&core_stats->rx_dropped);
> -                       storage->tx_dropped += local_read(&core_stats->tx_dropped);
> -                       storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> +                       storage->rx_dropped += core_stats->rx_dropped;
> +                       storage->tx_dropped += core_stats->tx_dropped;
> +                       storage->rx_nohandler += core_stats->rx_nohandler;
>                 }
>         }
>         return storage;
> --
> 2.35.2
>
Sebastian Andrzej Siewior April 21, 2022, 3:48 p.m. UTC | #2
On 2022-04-21 08:32:30 [-0700], Eric Dumazet wrote:
> On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> >
> > The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes
> > netdev_core_stats_alloc() to return a per-CPU pointer.
> > netdev_core_stats_alloc() will allocate memory on its first invocation
> > which breaks on PREEMPT_RT because it requires non-atomic context for
> > memory allocation.
> 
> Can you elaborate on this, I am confused ?
> 
> You are saying that on PREEMPT_RT, we can not call
> alloc_percpu_gfp(XXX, GFP_ATOMIC | __GFP_NOWARN);
> under some contexts ?

Correct. On PREEMPT_RT you must not explicitly create an atomic context
by
- using preempt_disable()
- acquiring a raw_spinlock_t lock
- using local_irq_disable()

while allocating memory. GFP_ATOMIC won't save you. The internal locks
within mm (kmalloc() and per-CPU memory) are sleeping locks and cannot
be acquired in atomic context.
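
For illustration, the failing sequence looks roughly like this; a condensed sketch of the pre-patch first-increment path, not the literal code in dev_core_stats_##FIELD##_inc() / netdev_core_stats_alloc():

    preempt_disable();                  /* the macro creates atomic context */
    p = READ_ONCE(dev->core_stats);
    if (!p) {
            /* First increment on this device: allocate the counters.
             * On PREEMPT_RT the allocator's internal locks are sleeping
             * locks, so this must not run with preemption disabled;
             * GFP_ATOMIC | __GFP_NOWARN does not change that.
             */
            p = alloc_percpu_gfp(struct net_device_core_stats,
                                 GFP_ATOMIC | __GFP_NOWARN);
    }
    preempt_enable();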

> preemption might be disabled by callers of net->core_stats anyways...

It won't be disabled by
- acquiring a spinlock_t lock
- running in softirq or interrupt handler

I haven't seen any splats (with RT enabled) other than this
preempt_disable() section so far. However, only the first caller
allocates memory, so maybe I'll add a check later on to be sure.

Sebastian
Eric Dumazet April 21, 2022, 4:06 p.m. UTC | #3
On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>

>                 for_each_possible_cpu(i) {
>                         core_stats = per_cpu_ptr(p, i);
> -                       storage->rx_dropped += local_read(&core_stats->rx_dropped);
> -                       storage->tx_dropped += local_read(&core_stats->tx_dropped);
> -                       storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> +                       storage->rx_dropped += core_stats->rx_dropped;
> +                       storage->tx_dropped += core_stats->tx_dropped;
> +                       storage->rx_nohandler += core_stats->rx_nohandler;

I think that one of the reasons for me to use  local_read() was that
it provided what was needed to avoid future syzbot reports.

Perhaps use READ_ONCE() here ?

Yes, we have many similar folding loops that are  simply assuming
compiler won't do stupid things.
Sebastian Andrzej Siewior April 21, 2022, 4:51 p.m. UTC | #4
On 2022-04-21 09:06:05 [-0700], Eric Dumazet wrote:
> On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> >
> 
> >                 for_each_possible_cpu(i) {
> >                         core_stats = per_cpu_ptr(p, i);
> > -                       storage->rx_dropped += local_read(&core_stats->rx_dropped);
> > -                       storage->tx_dropped += local_read(&core_stats->tx_dropped);
> > -                       storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> > +                       storage->rx_dropped += core_stats->rx_dropped;
> > +                       storage->tx_dropped += core_stats->tx_dropped;
> > +                       storage->rx_nohandler += core_stats->rx_nohandler;
> 
> I think that one of the reasons for me to use  local_read() was that
> it provided what was needed to avoid future syzbot reports.

syzbot report due to a plain read of a per-CPU variable which might be
modified?

> Perhaps use READ_ONCE() here ?
> 
> Yes, we have many similar folding loops that are  simply assuming
> compiler won't do stupid things.

I wasn't sure about that and added PeterZ to do some yelling here just
in case. And yes, we have other sites doing exactly that. In
   Documentation/core-api/this_cpu_ops.rst
there is nothing about remote-READ-access (only that there should be no
writes (due to parallel this_cpu_inc() on the local CPU)). I know that a
32bit write can be optimized into two 16bit writes in certain cases but a
read is a read.
PeterZ? :)

Sebastian
Eric Dumazet April 21, 2022, 5:11 p.m. UTC | #5
On Thu, Apr 21, 2022 at 9:51 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2022-04-21 09:06:05 [-0700], Eric Dumazet wrote:
> > On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> > <bigeasy@linutronix.de> wrote:
> > >
> >
> > >                 for_each_possible_cpu(i) {
> > >                         core_stats = per_cpu_ptr(p, i);
> > > -                       storage->rx_dropped += local_read(&core_stats->rx_dropped);
> > > -                       storage->tx_dropped += local_read(&core_stats->tx_dropped);
> > > -                       storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> > > +                       storage->rx_dropped += core_stats->rx_dropped;
> > > +                       storage->tx_dropped += core_stats->tx_dropped;
> > > +                       storage->rx_nohandler += core_stats->rx_nohandler;
> >
> > I think that one of the reasons for me to use  local_read() was that
> > it provided what was needed to avoid future syzbot reports.
>
> syzbot report due to a plain read of a per-CPU variable which might be
> modified?

Yes, this is KCSAN (https://www.kernel.org/doc/html/latest/dev-tools/kcsan.html).

>
> > Perhaps use READ_ONCE() here ?
> >
> > Yes, we have many similar folding loops that are  simply assuming
> > compiler won't do stupid things.
>
> I wasn't sure about that and added PeterZ to do some yelling here just
> in case. And yes, we have other sites doing exactly that. In
>    Documentation/core-api/this_cpu_ops.rst
> there is nothing about remote-READ-access (only that there should be no
> writes (due to parallel this_cpu_inc() on the local CPU)). I know that a
> 32bit write can be optimized into two 16bit writes in certain cases but a
> read is a read.
> PeterZ? :)
>
> Sebastian
Jakub Kicinski April 22, 2022, 7:56 p.m. UTC | #6
On Thu, 21 Apr 2022 16:00:20 +0200 Sebastian Andrzej Siewior wrote:
> @@ -3851,7 +3851,7 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de

I think this needs to return __percpu now?
Double check sparse is happy for v2, pls.

>  	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
>  
>  	if (likely(p))
> -		return this_cpu_ptr(p);
> +		return p;
Peter Zijlstra April 23, 2022, 9:24 a.m. UTC | #7
On Thu, Apr 21, 2022 at 06:51:31PM +0200, Sebastian Andrzej Siewior wrote:
> On 2022-04-21 09:06:05 [-0700], Eric Dumazet wrote:
> > On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> > <bigeasy@linutronix.de> wrote:
> > >
> > 
> > >                 for_each_possible_cpu(i) {
> > >                         core_stats = per_cpu_ptr(p, i);
> > > -                       storage->rx_dropped += local_read(&core_stats->rx_dropped);
> > > -                       storage->tx_dropped += local_read(&core_stats->tx_dropped);
> > > -                       storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> > > +                       storage->rx_dropped += core_stats->rx_dropped;
> > > +                       storage->tx_dropped += core_stats->tx_dropped;
> > > +                       storage->rx_nohandler += core_stats->rx_nohandler;
> > 
> > I think that one of the reasons for me to use  local_read() was that
> > it provided what was needed to avoid future syzbot reports.
> 
> syzbot report due to a plain read of a per-CPU variable which might be
> modified?
> 
> > Perhaps use READ_ONCE() here ?
> > 
> > Yes, we have many similar folding loops that are  simply assuming
> > compiler won't do stupid things.
> 
> I wasn't sure about that and added PeterZ to do some yelling here just
> in case. And yes, we have other sites doing exactly that. In
>    Documentation/core-api/this_cpu_ops.rst
> there is nothing about remote-READ-access (only that there should be no
> writes (due to parallel this_cpu_inc() on the local CPU)). I know that a
> > 32bit write can be optimized into two 16bit writes in certain cases but a
> read is a read.
> PeterZ? :)

Eric is right. READ_ONCE() is 'required' to ensure the compiler doesn't
split the load and KCSAN knows about these things.
Eric Dumazet April 23, 2022, 1:45 p.m. UTC | #8
On Sat, Apr 23, 2022 at 2:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Apr 21, 2022 at 06:51:31PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2022-04-21 09:06:05 [-0700], Eric Dumazet wrote:
> > > On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> > > <bigeasy@linutronix.de> wrote:
> > > >
> > >
> > > >                 for_each_possible_cpu(i) {
> > > >                         core_stats = per_cpu_ptr(p, i);
> > > > -                       storage->rx_dropped += local_read(&core_stats->rx_dropped);
> > > > -                       storage->tx_dropped += local_read(&core_stats->tx_dropped);
> > > > -                       storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> > > > +                       storage->rx_dropped += core_stats->rx_dropped;
> > > > +                       storage->tx_dropped += core_stats->tx_dropped;
> > > > +                       storage->rx_nohandler += core_stats->rx_nohandler;
> > >
> > > I think that one of the reasons for me to use  local_read() was that
> > > it provided what was needed to avoid future syzbot reports.
> >
> > syzbot report due to a plain read of a per-CPU variable which might be
> > modified?
> >
> > > Perhaps use READ_ONCE() here ?
> > >
> > > Yes, we have many similar folding loops that are  simply assuming
> > > compiler won't do stupid things.
> >
> > I wasn't sure about that and added PeterZ to do some yelling here just
> > in case. And yes, we have other sites doing exactly that. In
> >    Documentation/core-api/this_cpu_ops.rst
> > there is nothing about remote-READ-access (only that there should be no
> > writes (due to parallel this_cpu_inc() on the local CPU)). I know that a
> > 32bit write can be optimized into two 16bit writes in certain cases but a
> > read is a read.
> > PeterZ? :)
>
> Eric is right. READ_ONCE() is 'required' to ensure the compiler doesn't
> split the load and KCSAN knows about these things.

More details can be found in https://lwn.net/Articles/793253/

Thanks !
Sebastian Andrzej Siewior April 24, 2022, 8:33 a.m. UTC | #9
On 2022-04-23 11:24:39 [+0200], Peter Zijlstra wrote:
> Eric is right. READ_ONCE() is 'required' to ensure the compiler doesn't
> split the load and KCSAN knows about these things.

So we should update the documentation and make sure this is done
tree-wide for remote per-CPU accesses.
I will update that patch accordingly and add the other thing to my todo
list.

Sebastian
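
Based on this exchange, the folding loop in the next revision would presumably gain READ_ONCE() annotations along these lines (a sketch of the expected v2, not the posted patch):

    for_each_possible_cpu(i) {
            core_stats = per_cpu_ptr(p, i);
            storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
            storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
            storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
    }

READ_ONCE() here forces a single, untorn load of each remote counter, which is also what keeps KCSAN from flagging the plain concurrent access.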

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 59e27a2b7bf04..0009112b23767 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -199,10 +199,10 @@  struct net_device_stats {
  * Try to fit them in a single cache line, for dev_get_stats() sake.
  */
 struct net_device_core_stats {
-	local_t		rx_dropped;
-	local_t		tx_dropped;
-	local_t		rx_nohandler;
-} __aligned(4 * sizeof(local_t));
+	unsigned long	rx_dropped;
+	unsigned long	tx_dropped;
+	unsigned long	rx_nohandler;
+} __aligned(4 * sizeof(unsigned long));
 
 #include <linux/cache.h>
 #include <linux/skbuff.h>
@@ -3843,7 +3843,7 @@  static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
 	return false;
 }
 
-struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev);
+struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev);
 
 static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev)
 {
@@ -3851,7 +3851,7 @@  static inline struct net_device_core_stats *dev_core_stats(struct net_device *de
 	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
 
 	if (likely(p))
-		return this_cpu_ptr(p);
+		return p;
 
 	return netdev_core_stats_alloc(dev);
 }
@@ -3861,12 +3861,9 @@  static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)		\
 {										\
 	struct net_device_core_stats *p;					\
 										\
-	preempt_disable();							\
 	p = dev_core_stats(dev);						\
-										\
 	if (p)									\
-		local_inc(&p->FIELD);						\
-	preempt_enable();							\
+		this_cpu_inc(p->FIELD);						\
 }
 DEV_CORE_STATS_INC(rx_dropped)
 DEV_CORE_STATS_INC(tx_dropped)
diff --git a/net/core/dev.c b/net/core/dev.c
index 9ec51c1d77cf4..bf6026158874e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10309,7 +10309,7 @@  void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 }
 EXPORT_SYMBOL(netdev_stats_to_stats64);
 
-struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
+struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
 {
 	struct net_device_core_stats __percpu *p;
 
@@ -10320,11 +10320,7 @@  struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
 		free_percpu(p);
 
 	/* This READ_ONCE() pairs with the cmpxchg() above */
-	p = READ_ONCE(dev->core_stats);
-	if (!p)
-		return NULL;
-
-	return this_cpu_ptr(p);
+	return READ_ONCE(dev->core_stats);
 }
 EXPORT_SYMBOL(netdev_core_stats_alloc);
 
@@ -10361,9 +10357,9 @@  struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 
 		for_each_possible_cpu(i) {
 			core_stats = per_cpu_ptr(p, i);
-			storage->rx_dropped += local_read(&core_stats->rx_dropped);
-			storage->tx_dropped += local_read(&core_stats->tx_dropped);
-			storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
+			storage->rx_dropped += core_stats->rx_dropped;
+			storage->tx_dropped += core_stats->tx_dropped;
+			storage->rx_nohandler += core_stats->rx_nohandler;
 		}
 	}
 	return storage;