Message ID | 20240329154225.349288-7-edumazet@google.com (mailing list archive) |
---|---|
State | Accepted |
Delegated to: | Netdev Maintainers |
Series | net: rps: misc changes |
Hello Eric,

On Fri, Mar 29, 2024 at 11:43 PM Eric Dumazet <edumazet@google.com> wrote:
>
> input_queue_tail_incr_save() increments the sd queue_tail
> and saves it in the flow last_qtail.
>
> Two issues here:
>
> - No lock protects the write on last_qtail; we should use appropriate
>   annotations.
>
> - We can perform this write after releasing the per-cpu backlog lock,
>   to decrease the lock hold duration (moving the cache line miss out
>   of the critical section).
>
> Also move input_queue_head_incr() and the rps helpers to
> include/net/rps.h, adding an rps_ prefix to better reflect their role.
>
> v2: Fixed a build issue (Jakub and kernel build bots)
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
[...]
> +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
> +{
> +#ifdef CONFIG_RPS
> +	WRITE_ONCE(*dest, tail);
> +#endif
> +}

I wonder if we should also call this new helper to WRITE_ONCE
last_qtail in set_rps_cpu()?

Thanks,
Jason
On Sat, Mar 30, 2024 at 3:47 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> Hello Eric,
>
> On Fri, Mar 29, 2024 at 11:43 PM Eric Dumazet <edumazet@google.com> wrote:
[...]
> > +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
> > +{
> > +#ifdef CONFIG_RPS
> > +	WRITE_ONCE(*dest, tail);
> > +#endif
> > +}
>
> I wonder if we should also call this new helper to WRITE_ONCE
> last_qtail in set_rps_cpu()?

Absolutely, I have another patch series to address remaining races
(rflow->cpu, rflow->filter ...)

I chose to make a small one, to ease reviews.
On Sun, Mar 31, 2024 at 12:01 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Sat, Mar 30, 2024 at 3:47 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
[...]
> > I wonder if we should also call this new helper to WRITE_ONCE
> > last_qtail in set_rps_cpu()?
>
> Absolutely, I have another patch series to address remaining races
> (rflow->cpu, rflow->filter ...)
>
> I chose to make a small one, to ease reviews.

Great. Now I have no more questions :)

Thanks,
Jason
On Sun, Mar 31, 2024 at 12:01 AM Eric Dumazet <edumazet@google.com> wrote:
[...]
> Absolutely, I have another patch series to address remaining races
> (rflow->cpu, rflow->filter ...)
>
> I chose to make a small one, to ease reviews.

Hello Eric,

I wonder if you already have a patchset to change those three members
in struct rps_dev_flow? I looked through this part and found it's not
that complicated. So if not, I can do it :)

Thanks,
Jason
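[For context, the one remaining plain store of last_qtail that this
exchange refers to is in set_rps_cpu(). A follow-up converting it to the
new helper might look like the hunk below — an illustrative sketch
against net/core/dev.c as of this series, not an actual posted patch:]

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ ... @@ static struct rps_dev_flow *set_rps_cpu(struct net_device *dev,
-		rflow->last_qtail =
-			per_cpu(softnet_data, next_cpu).input_queue_head;
+		rps_input_queue_tail_save(&rflow->last_qtail,
+					  per_cpu(softnet_data, next_cpu).input_queue_head);

[This would pair the WRITE_ONCE() inside rps_input_queue_tail_save()
with the READ_ONCE(rflow->last_qtail) readers that this patch already
adds in get_rps_cpu() and rps_may_expire_flow().]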
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c31cd2691d32064613836141fbdeeebc831b21f..14f19cc2616452d7e6afbbaa52f8ad3e61a419e9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3249,21 +3249,6 @@ struct softnet_data {
 	call_single_data_t	defer_csd;
 };
 
-static inline void input_queue_head_incr(struct softnet_data *sd)
-{
-#ifdef CONFIG_RPS
-	sd->input_queue_head++;
-#endif
-}
-
-static inline void input_queue_tail_incr_save(struct softnet_data *sd,
-					      unsigned int *qtail)
-{
-#ifdef CONFIG_RPS
-	*qtail = ++sd->input_queue_tail;
-#endif
-}
-
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 static inline int dev_recursion_level(void)
diff --git a/include/net/rps.h b/include/net/rps.h
index 7660243e905b92651a41292e04caf72c5f12f26e..10ca25731c1ef766715fe7ee415ad0b71ec643a8 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -122,4 +122,27 @@ static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
+static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	return ++sd->input_queue_tail;
+#else
+	return 0;
+#endif
+}
+
+static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
+{
+#ifdef CONFIG_RPS
+	WRITE_ONCE(*dest, tail);
+#endif
+}
+
+static inline void rps_input_queue_head_incr(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	sd->input_queue_head++;
+#endif
+}
+
 #endif /* _NET_RPS_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 0a8ccb0451c30a39f8f8b45d26b7e5548b8bfba4..79073bbc9a644049cacf8433310f4641745049e9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4611,7 +4611,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		if (unlikely(tcpu != next_cpu) &&
 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-		      rflow->last_qtail)) >= 0)) {
+		      READ_ONCE(rflow->last_qtail))) >= 0)) {
 			tcpu = next_cpu;
 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 		}
@@ -4666,7 +4666,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 		cpu = READ_ONCE(rflow->cpu);
 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
-			   rflow->last_qtail) <
+			   READ_ONCE(rflow->last_qtail)) <
 		     (int)(10 * flow_table->mask)))
 			expire = false;
 	}
@@ -4801,6 +4801,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	unsigned long flags;
 	unsigned int qlen;
 	int max_backlog;
+	u32 tail;
 
 	reason = SKB_DROP_REASON_DEV_READY;
 	if (!netif_running(skb->dev))
@@ -4825,8 +4826,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 			napi_schedule_rps(sd);
 		}
 		__skb_queue_tail(&sd->input_pkt_queue, skb);
-		input_queue_tail_incr_save(sd, qtail);
+		tail = rps_input_queue_tail_incr(sd);
 		backlog_unlock_irq_restore(sd, &flags);
+
+		/* save the tail outside of the critical section */
+		rps_input_queue_tail_save(qtail, tail);
 		return NET_RX_SUCCESS;
 	}
 
@@ -5904,7 +5908,7 @@ static void flush_backlog(struct work_struct *work)
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
 			dev_kfree_skb_irq(skb);
-			input_queue_head_incr(sd);
+			rps_input_queue_head_incr(sd);
 		}
 	}
 	backlog_unlock_irq_enable(sd);
@@ -5913,7 +5917,7 @@ static void flush_backlog(struct work_struct *work)
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
 			kfree_skb(skb);
-			input_queue_head_incr(sd);
+			rps_input_queue_head_incr(sd);
 		}
 	}
 	local_bh_enable();
@@ -6041,7 +6045,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		rcu_read_lock();
 		__netif_receive_skb(skb);
 		rcu_read_unlock();
-		input_queue_head_incr(sd);
+		rps_input_queue_head_incr(sd);
 		if (++work >= quota)
 			return work;
 
@@ -11455,11 +11459,11 @@ static int dev_cpu_dead(unsigned int oldcpu)
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		rps_input_queue_head_incr(oldsd);
 	}
 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		rps_input_queue_head_incr(oldsd);
 	}
 
 	return 0;
input_queue_tail_incr_save() increments the sd queue_tail
and saves it in the flow last_qtail.

Two issues here:

- No lock protects the write on last_qtail; we should use appropriate
  annotations.

- We can perform this write after releasing the per-cpu backlog lock,
  to decrease the lock hold duration (moving the cache line miss out
  of the critical section).

Also move input_queue_head_incr() and the rps helpers to
include/net/rps.h, adding an rps_ prefix to better reflect their role.

v2: Fixed a build issue (Jakub and kernel build bots)

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/netdevice.h | 15 ---------------
 include/net/rps.h         | 23 +++++++++++++++++++++++
 net/core/dev.c            | 20 ++++++++++++--------
 3 files changed, 35 insertions(+), 23 deletions(-)
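[To see why the deferred store matters, here is a condensed sketch of
the enqueue path after this patch. It is not the literal
enqueue_to_backlog() — the drop/flush logic is elided and the function
name is invented — but the locking pattern is the one the patch
introduces:]

/* Sketch only: the deferred-publish pattern from this patch. */
static int enqueue_sketch(struct softnet_data *sd, struct sk_buff *skb,
			  u32 *qtail)
{
	unsigned long flags;
	u32 tail;

	backlog_lock_irq_save(sd, &flags);
	__skb_queue_tail(&sd->input_pkt_queue, skb);
	/* Cheap: sd is this CPU's cache-hot per-cpu state. */
	tail = rps_input_queue_tail_incr(sd);
	backlog_unlock_irq_restore(sd, &flags);

	/* Likely a cache miss: *qtail points into the shared
	 * rps_dev_flow table, so storing it here keeps the miss out
	 * of the lock hold time. Lockless readers in get_rps_cpu()
	 * and rps_may_expire_flow() pair this WRITE_ONCE() with
	 * READ_ONCE().
	 */
	rps_input_queue_tail_save(qtail, tail);
	return NET_RX_SUCCESS;
}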