Message ID | YmFjdOp+R5gVGZ7p@linutronix.de (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Series | [net] net: Use this_cpu_inc() to increment net->core_stats |
On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes
> netdev_core_stats_alloc() to return a per-CPU pointer.
> netdev_core_stats_alloc() will allocate memory on its first invocation
> which breaks on PREEMPT_RT because it requires non-atomic context for
> memory allocation.

Can you elaborate on this, I am confused ?

You are saying that on PREEMPT_RT, we can not call
alloc_percpu_gfp(XXX, GFP_ATOMIC | __GFP_NOWARN);
under some contexts ?

preemption might be disabled by callers of net->core_stats anyways...

>
> This can be avoided by enabling preemption in netdev_core_stats_alloc()
> assuming the caller always disables preemption.
>
> It might be better to replace local_inc() with this_cpu_inc() now that
> dev_core_stats_##FIELD##_inc() gained a preempt-disable section and does
> not rely on already disabled preemption. This results in less
> instructions on x86-64:
> local_inc:
> |        incl %gs:__preempt_count(%rip)          # __preempt_count
> |        movq 488(%rdi), %rax                    # _1->core_stats, _22
> |        testq %rax, %rax                        # _22
> |        je .L585                                #,
> |        add %gs:this_cpu_off(%rip), %rax        # this_cpu_off, tcp_ptr__
> | .L586:
> |        testq %rax, %rax                        # _27
> |        je .L587                                #,
> |        incq (%rax)                             # _6->a.counter
> | .L587:
> |        decl %gs:__preempt_count(%rip)          # __preempt_count
>
> this_cpu_inc(), this patch:
> |        movq 488(%rdi), %rax                    # _1->core_stats, _5
> |        testq %rax, %rax                        # _5
> |        je .L591                                #,
> | .L585:
> |        incq %gs:(%rax)                         # _18->rx_dropped
>
> Use unsigned long as type for the counter. Use this_cpu_inc() to
> increment the counter. Use a plain read of the counter.
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  include/linux/netdevice.h | 17 +++++++----------
>  net/core/dev.c            | 14 +++++---------
>  2 files changed, 12 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 59e27a2b7bf04..0009112b23767 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -199,10 +199,10 @@ struct net_device_stats {
>   * Try to fit them in a single cache line, for dev_get_stats() sake.
>   */
>  struct net_device_core_stats {
> -        local_t         rx_dropped;
> -        local_t         tx_dropped;
> -        local_t         rx_nohandler;
> -} __aligned(4 * sizeof(local_t));
> +        unsigned long   rx_dropped;
> +        unsigned long   tx_dropped;
> +        unsigned long   rx_nohandler;
> +} __aligned(4 * sizeof(unsigned long));
>
>  #include <linux/cache.h>
>  #include <linux/skbuff.h>
> @@ -3843,7 +3843,7 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
>          return false;
>  }
>
> -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev);
> +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev);
>
>  static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev)
>  {
> @@ -3851,7 +3851,7 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de
>          struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
>
>          if (likely(p))
> -                return this_cpu_ptr(p);
> +                return p;
>
>          return netdev_core_stats_alloc(dev);
>  }
> @@ -3861,12 +3861,9 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \
>  {                                                               \
>          struct net_device_core_stats *p;                       \
>                                                                  \
> -        preempt_disable();                                      \
>          p = dev_core_stats(dev);                                \
> -                                                                \
>          if (p)                                                  \
> -                local_inc(&p->FIELD);                           \
> -        preempt_enable();                                       \
> +                this_cpu_inc(p->FIELD);                         \
>  }
>  DEV_CORE_STATS_INC(rx_dropped)
>  DEV_CORE_STATS_INC(tx_dropped)
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 9ec51c1d77cf4..bf6026158874e 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -10309,7 +10309,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
>  }
>  EXPORT_SYMBOL(netdev_stats_to_stats64);
>
> -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
> +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
>  {
>          struct net_device_core_stats __percpu *p;
>
> @@ -10320,11 +10320,7 @@ struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
>                  free_percpu(p);
>
>          /* This READ_ONCE() pairs with the cmpxchg() above */
> -        p = READ_ONCE(dev->core_stats);
> -        if (!p)
> -                return NULL;
> -
> -        return this_cpu_ptr(p);
> +        return READ_ONCE(dev->core_stats);
>  }
>  EXPORT_SYMBOL(netdev_core_stats_alloc);
>
> @@ -10361,9 +10357,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
>
>          for_each_possible_cpu(i) {
>                  core_stats = per_cpu_ptr(p, i);
> -                storage->rx_dropped += local_read(&core_stats->rx_dropped);
> -                storage->tx_dropped += local_read(&core_stats->tx_dropped);
> -                storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> +                storage->rx_dropped += core_stats->rx_dropped;
> +                storage->tx_dropped += core_stats->tx_dropped;
> +                storage->rx_nohandler += core_stats->rx_nohandler;
>          }
>  }
>  return storage;
> --
> 2.35.2
>
On 2022-04-21 08:32:30 [-0700], Eric Dumazet wrote:
> On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> >
> > The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes
> > netdev_core_stats_alloc() to return a per-CPU pointer.
> > netdev_core_stats_alloc() will allocate memory on its first invocation
> > which breaks on PREEMPT_RT because it requires non-atomic context for
> > memory allocation.
>
> Can you elaborate on this, I am confused ?
>
> You are saying that on PREEMPT_RT, we can not call
> alloc_percpu_gfp(XXX, GFP_ATOMIC | __GFP_NOWARN);
> under some contexts ?

Correct. On PREEMPT_RT you must not explicitly create an atomic context by
- using preempt_disable()
- acquiring a raw_spinlock_t lock
- using local_irq_disable()
while allocating memory. GFP_ATOMIC won't save you. The internal locks
within mm (kmalloc() and per-CPU memory) are sleeping locks and cannot be
acquired in atomic context.

> preemption might be disabled by callers of net->core_stats anyways...

It won't be disabled by
- acquiring a spinlock_t lock
- running in a softirq or interrupt handler

I haven't seen any splats (with RT enabled) other than this
preempt_disable() section so far. However, only the first caller
allocates memory, so maybe I'll add a check later on to be sure.

Sebastian
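For context, this is roughly the pre-patch shape being questioned here (a sketch assembled from the removed lines in the posted diff, with FIELD expanded to rx_dropped for readability, not a verbatim copy of the tree): the allocation that worries PREEMPT_RT happens inside dev_core_stats() while preemption is already disabled:

static inline void dev_core_stats_rx_dropped_inc(struct net_device *dev)
{
        struct net_device_core_stats *p;

        preempt_disable();              /* explicit atomic context */
        p = dev_core_stats(dev);        /* on the first invocation this ends up in
                                         * netdev_core_stats_alloc(), i.e. in
                                         * alloc_percpu_gfp(GFP_ATOMIC | __GFP_NOWARN),
                                         * whose internal locks sleep on PREEMPT_RT */
        if (p)
                local_inc(&p->rx_dropped);
        preempt_enable();
}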
On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
>          for_each_possible_cpu(i) {
>                  core_stats = per_cpu_ptr(p, i);
> -                storage->rx_dropped += local_read(&core_stats->rx_dropped);
> -                storage->tx_dropped += local_read(&core_stats->tx_dropped);
> -                storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> +                storage->rx_dropped += core_stats->rx_dropped;
> +                storage->tx_dropped += core_stats->tx_dropped;
> +                storage->rx_nohandler += core_stats->rx_nohandler;

I think that one of the reasons for me to use local_read() was that
it provided what was needed to avoid future syzbot reports.

Perhaps use READ_ONCE() here ?

Yes, we have many similar folding loops that are simply assuming
compiler won't do stupid things.
On 2022-04-21 09:06:05 [-0700], Eric Dumazet wrote:
> On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> >
> >          for_each_possible_cpu(i) {
> >                  core_stats = per_cpu_ptr(p, i);
> > -                storage->rx_dropped += local_read(&core_stats->rx_dropped);
> > -                storage->tx_dropped += local_read(&core_stats->tx_dropped);
> > -                storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> > +                storage->rx_dropped += core_stats->rx_dropped;
> > +                storage->tx_dropped += core_stats->tx_dropped;
> > +                storage->rx_nohandler += core_stats->rx_nohandler;
>
> I think that one of the reasons for me to use local_read() was that
> it provided what was needed to avoid future syzbot reports.

A syzbot report due to a plain read of a per-CPU variable which might be
modified?

> Perhaps use READ_ONCE() here ?
>
> Yes, we have many similar folding loops that are simply assuming
> compiler won't do stupid things.

I wasn't sure about that and added PeterZ to do some yelling here just
in case. And yes, we have other sites doing exactly that. In
Documentation/core-api/this_cpu_ops.rst there is nothing about
remote-READ-access (only that there should be no writes, due to parallel
this_cpu_inc() on the local CPU). I know that a 32bit write can be
optimized into two 16bit writes in certain cases but a read is a read.
PeterZ? :)

Sebastian
On Thu, Apr 21, 2022 at 9:51 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2022-04-21 09:06:05 [-0700], Eric Dumazet wrote:
> > On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> > <bigeasy@linutronix.de> wrote:
> > >
> > >          for_each_possible_cpu(i) {
> > >                  core_stats = per_cpu_ptr(p, i);
> > > -                storage->rx_dropped += local_read(&core_stats->rx_dropped);
> > > -                storage->tx_dropped += local_read(&core_stats->tx_dropped);
> > > -                storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> > > +                storage->rx_dropped += core_stats->rx_dropped;
> > > +                storage->tx_dropped += core_stats->tx_dropped;
> > > +                storage->rx_nohandler += core_stats->rx_nohandler;
> >
> > I think that one of the reasons for me to use local_read() was that
> > it provided what was needed to avoid future syzbot reports.
>
> A syzbot report due to a plain read of a per-CPU variable which might be
> modified?

Yes, this is KCSAN ( https://www.kernel.org/doc/html/latest/dev-tools/kcsan.html )

> > Perhaps use READ_ONCE() here ?
> >
> > Yes, we have many similar folding loops that are simply assuming
> > compiler won't do stupid things.
>
> I wasn't sure about that and added PeterZ to do some yelling here just
> in case. And yes, we have other sites doing exactly that. In
> Documentation/core-api/this_cpu_ops.rst there is nothing about
> remote-READ-access (only that there should be no writes, due to parallel
> this_cpu_inc() on the local CPU). I know that a 32bit write can be
> optimized into two 16bit writes in certain cases but a read is a read.
> PeterZ? :)
>
> Sebastian
On Thu, 21 Apr 2022 16:00:20 +0200 Sebastian Andrzej Siewior wrote:
> @@ -3851,7 +3851,7 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de

I think this needs to return __percpu now?
Double check sparse is happy for v2, pls.

>          struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
>
>          if (likely(p))
> -                return this_cpu_ptr(p);
> +                return p;
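A possible v2 shape for the inline helper (an untested sketch, not the posted patch): with the this_cpu_ptr() call gone, dev_core_stats() hands back the per-CPU pointer itself, so the __percpu annotation would have to move to its return type as well to keep sparse happy:

static inline struct net_device_core_stats __percpu *dev_core_stats(struct net_device *dev)
{
        /* pointer published by netdev_core_stats_alloc() */
        struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);

        if (likely(p))
                return p;

        return netdev_core_stats_alloc(dev);
}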
On Thu, Apr 21, 2022 at 06:51:31PM +0200, Sebastian Andrzej Siewior wrote:
> On 2022-04-21 09:06:05 [-0700], Eric Dumazet wrote:
> > On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> > <bigeasy@linutronix.de> wrote:
> > >
> > >          for_each_possible_cpu(i) {
> > >                  core_stats = per_cpu_ptr(p, i);
> > > -                storage->rx_dropped += local_read(&core_stats->rx_dropped);
> > > -                storage->tx_dropped += local_read(&core_stats->tx_dropped);
> > > -                storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> > > +                storage->rx_dropped += core_stats->rx_dropped;
> > > +                storage->tx_dropped += core_stats->tx_dropped;
> > > +                storage->rx_nohandler += core_stats->rx_nohandler;
> >
> > I think that one of the reasons for me to use local_read() was that
> > it provided what was needed to avoid future syzbot reports.
>
> A syzbot report due to a plain read of a per-CPU variable which might be
> modified?
>
> > Perhaps use READ_ONCE() here ?
> >
> > Yes, we have many similar folding loops that are simply assuming
> > compiler won't do stupid things.
>
> I wasn't sure about that and added PeterZ to do some yelling here just
> in case. And yes, we have other sites doing exactly that. In
> Documentation/core-api/this_cpu_ops.rst there is nothing about
> remote-READ-access (only that there should be no writes, due to parallel
> this_cpu_inc() on the local CPU). I know that a 32bit write can be
> optimized into two 16bit writes in certain cases but a read is a read.
> PeterZ? :)

Eric is right. READ_ONCE() is 'required' to ensure the compiler doesn't
split the load and KCSAN knows about these things.
On Sat, Apr 23, 2022 at 2:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Apr 21, 2022 at 06:51:31PM +0200, Sebastian Andrzej Siewior wrote:
> > On 2022-04-21 09:06:05 [-0700], Eric Dumazet wrote:
> > > On Thu, Apr 21, 2022 at 7:00 AM Sebastian Andrzej Siewior
> > > <bigeasy@linutronix.de> wrote:
> > > >
> > > >          for_each_possible_cpu(i) {
> > > >                  core_stats = per_cpu_ptr(p, i);
> > > > -                storage->rx_dropped += local_read(&core_stats->rx_dropped);
> > > > -                storage->tx_dropped += local_read(&core_stats->tx_dropped);
> > > > -                storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
> > > > +                storage->rx_dropped += core_stats->rx_dropped;
> > > > +                storage->tx_dropped += core_stats->tx_dropped;
> > > > +                storage->rx_nohandler += core_stats->rx_nohandler;
> > >
> > > I think that one of the reasons for me to use local_read() was that
> > > it provided what was needed to avoid future syzbot reports.
> >
> > A syzbot report due to a plain read of a per-CPU variable which might be
> > modified?
> >
> > > Perhaps use READ_ONCE() here ?
> > >
> > > Yes, we have many similar folding loops that are simply assuming
> > > compiler won't do stupid things.
> >
> > I wasn't sure about that and added PeterZ to do some yelling here just
> > in case. And yes, we have other sites doing exactly that. In
> > Documentation/core-api/this_cpu_ops.rst there is nothing about
> > remote-READ-access (only that there should be no writes, due to parallel
> > this_cpu_inc() on the local CPU). I know that a 32bit write can be
> > optimized into two 16bit writes in certain cases but a read is a read.
> > PeterZ? :)
>
> Eric is right. READ_ONCE() is 'required' to ensure the compiler doesn't
> split the load and KCSAN knows about these things.

More details can be found in https://lwn.net/Articles/793253/

Thanks !
On 2022-04-23 11:24:39 [+0200], Peter Zijlstra wrote:
> Eric is right. READ_ONCE() is 'required' to ensure the compiler doesn't
> split the load and KCSAN knows about these things.

So we should update the documentation and make sure that is done
tree-wide for remote per-CPU accesses. I will update that patch
accordingly and add the other thing to my todo list.

Sebastian
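Applied to the folding loop in dev_get_stats(), Eric's suggestion would look roughly like this (a sketch of the direction for v2, not the posted patch); READ_ONCE() keeps the compiler from tearing or re-loading counters that other CPUs may be bumping concurrently via this_cpu_inc(), and it documents the intentional data race for KCSAN:

        for_each_possible_cpu(i) {
                core_stats = per_cpu_ptr(p, i);
                storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
                storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
                storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
        }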
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 59e27a2b7bf04..0009112b23767 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -199,10 +199,10 @@ struct net_device_stats {
  * Try to fit them in a single cache line, for dev_get_stats() sake.
  */
 struct net_device_core_stats {
-        local_t         rx_dropped;
-        local_t         tx_dropped;
-        local_t         rx_nohandler;
-} __aligned(4 * sizeof(local_t));
+        unsigned long   rx_dropped;
+        unsigned long   tx_dropped;
+        unsigned long   rx_nohandler;
+} __aligned(4 * sizeof(unsigned long));
 
 #include <linux/cache.h>
 #include <linux/skbuff.h>
@@ -3843,7 +3843,7 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
         return false;
 }
 
-struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev);
+struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev);
 
 static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev)
 {
@@ -3851,7 +3851,7 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de
         struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
 
         if (likely(p))
-                return this_cpu_ptr(p);
+                return p;
 
         return netdev_core_stats_alloc(dev);
 }
@@ -3861,12 +3861,9 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \
 {                                                               \
         struct net_device_core_stats *p;                        \
                                                                 \
-        preempt_disable();                                      \
         p = dev_core_stats(dev);                                \
-                                                                \
         if (p)                                                  \
-                local_inc(&p->FIELD);                           \
-        preempt_enable();                                       \
+                this_cpu_inc(p->FIELD);                         \
 }
 DEV_CORE_STATS_INC(rx_dropped)
 DEV_CORE_STATS_INC(tx_dropped)
diff --git a/net/core/dev.c b/net/core/dev.c
index 9ec51c1d77cf4..bf6026158874e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10309,7 +10309,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 }
 EXPORT_SYMBOL(netdev_stats_to_stats64);
 
-struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
+struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
 {
         struct net_device_core_stats __percpu *p;
 
@@ -10320,11 +10320,7 @@ struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
                 free_percpu(p);
 
         /* This READ_ONCE() pairs with the cmpxchg() above */
-        p = READ_ONCE(dev->core_stats);
-        if (!p)
-                return NULL;
-
-        return this_cpu_ptr(p);
+        return READ_ONCE(dev->core_stats);
 }
 EXPORT_SYMBOL(netdev_core_stats_alloc);
 
@@ -10361,9 +10357,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 
         for_each_possible_cpu(i) {
                 core_stats = per_cpu_ptr(p, i);
-                storage->rx_dropped += local_read(&core_stats->rx_dropped);
-                storage->tx_dropped += local_read(&core_stats->tx_dropped);
-                storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
+                storage->rx_dropped += core_stats->rx_dropped;
+                storage->tx_dropped += core_stats->tx_dropped;
+                storage->rx_nohandler += core_stats->rx_nohandler;
         }
 }
 return storage;
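Only the tail of netdev_core_stats_alloc() is visible in the hunk above; for readers without the tree at hand, the whole allocation path after this patch would look roughly like the following reconstruction (pieced together from the hunk context and the alloc_percpu_gfp() call mentioned in the thread, not a verbatim copy): the per-CPU block is allocated with GFP_ATOMIC, published with cmpxchg(), and freed again if another CPU won the race, with the final READ_ONCE() pairing with that cmpxchg():

struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
{
        struct net_device_core_stats __percpu *p;

        p = alloc_percpu_gfp(struct net_device_core_stats,
                             GFP_ATOMIC | __GFP_NOWARN);
        if (p && cmpxchg(&dev->core_stats, NULL, p))
                free_percpu(p);         /* lost the race; another CPU installed its copy */

        /* This READ_ONCE() pairs with the cmpxchg() above */
        return READ_ONCE(dev->core_stats);
}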
The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes
netdev_core_stats_alloc() to return a per-CPU pointer.
netdev_core_stats_alloc() will allocate memory on its first invocation
which breaks on PREEMPT_RT because it requires non-atomic context for
memory allocation.

This can be avoided by enabling preemption in netdev_core_stats_alloc()
assuming the caller always disables preemption.

It might be better to replace local_inc() with this_cpu_inc() now that
dev_core_stats_##FIELD##_inc() gained a preempt-disable section and does
not rely on already disabled preemption. This results in less
instructions on x86-64:

local_inc:
|        incl %gs:__preempt_count(%rip)          # __preempt_count
|        movq 488(%rdi), %rax                    # _1->core_stats, _22
|        testq %rax, %rax                        # _22
|        je .L585                                #,
|        add %gs:this_cpu_off(%rip), %rax        # this_cpu_off, tcp_ptr__
| .L586:
|        testq %rax, %rax                        # _27
|        je .L587                                #,
|        incq (%rax)                             # _6->a.counter
| .L587:
|        decl %gs:__preempt_count(%rip)          # __preempt_count

this_cpu_inc(), this patch:
|        movq 488(%rdi), %rax                    # _1->core_stats, _5
|        testq %rax, %rax                        # _5
|        je .L591                                #,
| .L585:
|        incq %gs:(%rax)                         # _18->rx_dropped

Use unsigned long as type for the counter. Use this_cpu_inc() to
increment the counter. Use a plain read of the counter.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/netdevice.h | 17 +++++++----------
 net/core/dev.c            | 14 +++++---------
 2 files changed, 12 insertions(+), 19 deletions(-)