Message ID | 20230327175446.98151-3-john.fastabend@gmail.com (mailing list archive)
---|---
State | Superseded
Delegated to: | BPF
Series | bpf sockmap fixes
On Mon, Mar 27, 2023 at 10:54 AM -07, John Fastabend wrote:
> Sk_buffs are fed into sockmap verdict programs either from a strparser
> (when the user might want to decide how framing of the skb is done by
> attaching another parser program) or directly through tcp_read_sock. The
> tcp_read_sock path is the preferred method for performance when the BPF
> logic is a stream parser.
>
> The flow for Cilium's common use case with a stream parser is:
>
>   tcp_read_sock()
>     sk_psock_verdict_recv
>       ret = bpf_prog_run_pin_on_cpu()
>       sk_psock_verdict_apply(sock, skb, ret)
>         // if the system is under memory pressure or the app is slow we
>         // may need to queue the skb. Do this queuing through ingress_skb
>         // and then kick the timer to wake up the handler
>         skb_queue_tail(ingress_skb, skb)
>         schedule_work(work);
>
> The work queue is wired up to sk_psock_backlog(). This will then walk
> the ingress_skb list that holds the sk_buffs that could not be handled
> but should be OK to run at some later point. However, it's possible that
> the workqueue doing this work still hits an error when sending the skb.
> When this happens the skbuff is requeued on a temporary 'state' struct
> kept with the workqueue. This is necessary because it's possible to
> partially send an skbuff before hitting an error, and we need to know
> how and where to restart when the workqueue runs next.
>
> Now for the trouble: we don't rekick the workqueue. This can cause a
> stall where the skbuff we just cached on the state variable might never
> be sent. This happens when it's the last packet in a flow and no further
> packets come along that would kick the workqueue from that side.
>
> To fix this we could do a simple schedule_work(), but while under memory
> pressure it makes sense to back off some instead of retrying repeatedly.
> So instead convert schedule_work to schedule_delayed_work and add backoff
> logic that reschedules from the backlog queue on errors. It's not obvious
> what a good backoff is, so use '1'.
>
> While testing we observed some flakes running the NGINX compliance test
> with sockmap; we attributed these failed tests to this bug and the
> subsequent issue.
>
> Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
> Tested-by: William Findlay <will@isovalent.com>
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  include/linux/skmsg.h |  2 +-
>  net/core/skmsg.c      | 19 ++++++++++++-------
>  net/core/sock_map.c   |  3 ++-
>  3 files changed, 15 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index 84f787416a54..904ff9a32ad6 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -105,7 +105,7 @@ struct sk_psock {
>  	struct proto *sk_proto;
>  	struct mutex work_mutex;
>  	struct sk_psock_work_state work_state;
> -	struct work_struct work;
> +	struct delayed_work work;
>  	struct rcu_work rwork;
>  };
>
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 2b6d9519ff29..96a6a3a74a67 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -481,7 +481,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
>  	}
>  out:
>  	if (psock->work_state.skb && copied > 0)
> -		schedule_work(&psock->work);
> +		schedule_delayed_work(&psock->work, 0);
>  	return copied;
>  }
>  EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
> @@ -639,7 +639,8 @@ static void sk_psock_skb_state(struct sk_psock *psock,
>
>  static void sk_psock_backlog(struct work_struct *work)
>  {
> -	struct sk_psock *psock = container_of(work, struct sk_psock, work);
> +	struct delayed_work *dwork = to_delayed_work(work);
> +	struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
>  	struct sk_psock_work_state *state = &psock->work_state;
>  	struct sk_buff *skb = NULL;
>  	bool ingress;
> @@ -679,6 +680,10 @@ static void sk_psock_backlog(struct work_struct *work)
>  			if (ret == -EAGAIN) {
>  				sk_psock_skb_state(psock, state, skb,
>  						   len, off);
> +
> +				// Delay slightly to prioritize any
> +				// other work that might be here.
> +				schedule_delayed_work(&psock->work, 1);

Do IIUC that this means we can back out changes from commit bec217197b41
("skmsg: Schedule psock work if the cached skb exists on the psock")?

Nit: Comment syntax.

>  				goto end;
>  			}
>  			/* Hard errors break pipe and stop xmit. */
> @@ -733,7 +738,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
>  	INIT_LIST_HEAD(&psock->link);
>  	spin_lock_init(&psock->link_lock);
>
> -	INIT_WORK(&psock->work, sk_psock_backlog);
> +	INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
>  	mutex_init(&psock->work_mutex);
>  	INIT_LIST_HEAD(&psock->ingress_msg);
>  	spin_lock_init(&psock->ingress_lock);
> @@ -822,7 +827,7 @@ static void sk_psock_destroy(struct work_struct *work)
>
>  	sk_psock_done_strp(psock);
>
> -	cancel_work_sync(&psock->work);
> +	cancel_delayed_work_sync(&psock->work);
>  	mutex_destroy(&psock->work_mutex);
>
>  	psock_progs_drop(&psock->progs);
> @@ -937,7 +942,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
>  	}
>
>  	skb_queue_tail(&psock_other->ingress_skb, skb);
> -	schedule_work(&psock_other->work);
> +	schedule_delayed_work(&psock_other->work, 0);
>  	spin_unlock_bh(&psock_other->ingress_lock);
>  	return 0;
>  }
> @@ -1017,7 +1022,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
>  	spin_lock_bh(&psock->ingress_lock);
>  	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
>  		skb_queue_tail(&psock->ingress_skb, skb);
> -		schedule_work(&psock->work);
> +		schedule_delayed_work(&psock->work, 0);
>  		err = 0;
>  	}
>  	spin_unlock_bh(&psock->ingress_lock);
> @@ -1048,7 +1053,7 @@ static void sk_psock_write_space(struct sock *sk)
>  	psock = sk_psock(sk);
>  	if (likely(psock)) {
>  		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
> -			schedule_work(&psock->work);
> +			schedule_delayed_work(&psock->work, 0);
>  		write_space = psock->saved_write_space;
>  	}
>  	rcu_read_unlock();
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index a68a7290a3b2..d38267201892 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -1624,9 +1624,10 @@ void sock_map_close(struct sock *sk, long timeout)
>  	rcu_read_unlock();
>  	sk_psock_stop(psock);
>  	release_sock(sk);
> -	cancel_work_sync(&psock->work);
> +	cancel_delayed_work_sync(&psock->work);
>  	sk_psock_put(sk, psock);
>  }
> +
>  /* Make sure we do not recurse. This is a bug.
>  * Leak the socket instead of crashing on a stack overflow.
>  */
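For readers following along: stripped of the sockmap specifics, the
work_struct to delayed_work conversion reviewed above boils down to the
pattern below. This is a minimal sketch with invented names (my_ctx,
my_worker, try_send are illustrative stand-ins, not code from the patch);
only the workqueue API calls mirror what the patch does.

#include <linux/errno.h>
#include <linux/workqueue.h>

struct my_ctx {
	struct delayed_work work;	/* was: struct work_struct work */
};

/* Stand-in for the real send path: fail once with -EAGAIN to exercise
 * the backoff, then succeed.
 */
static int try_send(struct my_ctx *ctx)
{
	static bool failed_once;

	if (!failed_once) {
		failed_once = true;
		return -EAGAIN;
	}
	return 0;
}

static void my_worker(struct work_struct *work)
{
	/* A delayed_work embeds a work_struct, so the outer object is
	 * recovered in two steps: work_struct -> delayed_work -> my_ctx.
	 */
	struct delayed_work *dwork = to_delayed_work(work);
	struct my_ctx *ctx = container_of(dwork, struct my_ctx, work);

	if (try_send(ctx) == -EAGAIN)
		/* Back off: run again no sooner than one jiffy from now. */
		schedule_delayed_work(&ctx->work, 1);
}

static void my_ctx_setup(struct my_ctx *ctx)
{
	INIT_DELAYED_WORK(&ctx->work, my_worker);

	/* A delay of 0 queues the work immediately, preserving the old
	 * schedule_work() behaviour at every converted call site.
	 */
	schedule_delayed_work(&ctx->work, 0);
}

static void my_ctx_teardown(struct my_ctx *ctx)
{
	/* cancel_delayed_work_sync() replaces cancel_work_sync(). */
	cancel_delayed_work_sync(&ctx->work);
}

The delay argument is in jiffies, so '1' defers the retry by at most one
tick, while passing 0 keeps the previous immediate-queue semantics.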
Jakub Sitnicki wrote:
> On Mon, Mar 27, 2023 at 10:54 AM -07, John Fastabend wrote:
> > Sk_buffs are fed into sockmap verdict programs either from a strparser
> > (when the user might want to decide how framing of the skb is done by
> > attaching another parser program) or directly through tcp_read_sock.

[...]

> > --- a/net/core/skmsg.c
> > +++ b/net/core/skmsg.c
> > @@ -481,7 +481,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
> >  	}
> >  out:
> >  	if (psock->work_state.skb && copied > 0)
> > -		schedule_work(&psock->work);
> > +		schedule_delayed_work(&psock->work, 0);
> >  	return copied;
> >  }
> >  EXPORT_SYMBOL_GPL(sk_msg_recvmsg);

[...]

> > @@ -679,6 +680,10 @@ static void sk_psock_backlog(struct work_struct *work)
> >  			if (ret == -EAGAIN) {
> >  				sk_psock_skb_state(psock, state, skb,
> >  						   len, off);
> > +
> > +				// Delay slightly to prioritize any
> > +				// other work that might be here.
> > +				schedule_delayed_work(&psock->work, 1);
>
> Do IIUC that this means we can back out changes from commit bec217197b41
> ("skmsg: Schedule psock work if the cached skb exists on the psock")?

Yeah, I think so. This is a more direct way to get the same result. I'm
also thinking this check,

	if (psock->work_state.skb && copied > 0)
		schedule_work(&psock->work)

is not correct: copied=0, which could happen on an empty queue, could be
the result of an skb stuck from this EAGAIN error in the backlog.

I think it's OK to revert that patch in a separate patch. And ideally we
could get some way to load up the stack to hit these corner cases without
running long stress tests.

WDYT?

> Nit: Comment syntax.

Yep, happy to fix.
On Tue, Mar 28, 2023 at 02:56 PM -07, John Fastabend wrote:
> Jakub Sitnicki wrote:
>> On Mon, Mar 27, 2023 at 10:54 AM -07, John Fastabend wrote:
>> > Sk_buffs are fed into sockmap verdict programs either from a strparser
>> > (when the user might want to decide how framing of the skb is done by
>> > attaching another parser program) or directly through tcp_read_sock.

[...]

>> > --- a/net/core/skmsg.c
>> > +++ b/net/core/skmsg.c
>> > @@ -481,7 +481,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
>> >  	}
>> >  out:
>> >  	if (psock->work_state.skb && copied > 0)
>> > -		schedule_work(&psock->work);
>> > +		schedule_delayed_work(&psock->work, 0);
>> >  	return copied;
>> >  }
>> >  EXPORT_SYMBOL_GPL(sk_msg_recvmsg);

[...]

>> > @@ -679,6 +680,10 @@ static void sk_psock_backlog(struct work_struct *work)
>> >  			if (ret == -EAGAIN) {
>> >  				sk_psock_skb_state(psock, state, skb,
>> >  						   len, off);
>> > +
>> > +				// Delay slightly to prioritize any
>> > +				// other work that might be here.
>> > +				schedule_delayed_work(&psock->work, 1);
>>
>> Do IIUC that this means we can back out changes from commit bec217197b41
>> ("skmsg: Schedule psock work if the cached skb exists on the psock")?
>
> Yeah, I think so. This is a more direct way to get the same result. I'm
> also thinking this check,
>
> 	if (psock->work_state.skb && copied > 0)
> 		schedule_work(&psock->work)
>
> is not correct: copied=0, which could happen on an empty queue, could be
> the result of an skb stuck from this EAGAIN error in the backlog.

I suspect the 'copied > 0' check is there to handle the 0-length read
scenario. But I think you're right that the empty-queue scenario is not
being handled properly.

> I think it's OK to revert that patch in a separate patch. And ideally we
> could get some way to load up the stack to hit these corner cases without
> running long stress tests.
>
> WDYT?

Yeah, the revert can wait. I was just curious if my thinking was right.
There is plenty of material in this series as is :-)
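As an aside, the empty-queue corner case discussed above can be modeled
in a few lines of userspace C. This is a toy sketch only: queued, parked
and work_pending are invented stand-ins for ingress_skb, work_state.skb
and the pending work item, and the real paths involve separate ingress
queues and locking.

/* Toy userspace model of the stall under discussion -- illustrative
 * only, not kernel code. The "backlog worker" parks an item on EAGAIN,
 * and the "recvmsg" path rekicks the worker only when copied > 0, so a
 * parked item on an otherwise empty queue is never retried.
 */
#include <stdbool.h>
#include <stdio.h>

static int queued;        /* items on the ingress queue */
static bool parked;       /* item stuck in work_state after EAGAIN */
static bool work_pending; /* worker scheduled to run again */

static void backlog_worker(bool send_fails)
{
	if (queued > 0) {
		queued--;
		if (send_fails)
			parked = true; /* requeued on 'state', no rekick */
	}
}

static int do_recvmsg(void)
{
	int copied = queued; /* empty queue => copied == 0 */

	queued = 0;
	if (parked && copied > 0) /* the check questioned above */
		work_pending = true;
	return copied;
}

int main(void)
{
	queued = 1;
	backlog_worker(true); /* last skb hits EAGAIN, gets parked */
	do_recvmsg();         /* queue empty: copied == 0, no rekick */
	printf("parked=%d rescheduled=%d -> stall\n", parked, work_pending);
	return 0;
}

Compiled and run, it prints "parked=1 rescheduled=0 -> stall": the parked
item is stranded until unrelated traffic rekicks the worker, which is the
stall the delayed-work conversion addresses.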
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 84f787416a54..904ff9a32ad6 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -105,7 +105,7 @@ struct sk_psock {
 	struct proto *sk_proto;
 	struct mutex work_mutex;
 	struct sk_psock_work_state work_state;
-	struct work_struct work;
+	struct delayed_work work;
 	struct rcu_work rwork;
 };

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2b6d9519ff29..96a6a3a74a67 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -481,7 +481,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 	}
 out:
 	if (psock->work_state.skb && copied > 0)
-		schedule_work(&psock->work);
+		schedule_delayed_work(&psock->work, 0);
 	return copied;
 }
 EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
@@ -639,7 +639,8 @@ static void sk_psock_skb_state(struct sk_psock *psock,

 static void sk_psock_backlog(struct work_struct *work)
 {
-	struct sk_psock *psock = container_of(work, struct sk_psock, work);
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
 	struct sk_psock_work_state *state = &psock->work_state;
 	struct sk_buff *skb = NULL;
 	bool ingress;
@@ -679,6 +680,10 @@ static void sk_psock_backlog(struct work_struct *work)
 			if (ret == -EAGAIN) {
 				sk_psock_skb_state(psock, state, skb,
 						   len, off);
+
+				// Delay slightly to prioritize any
+				// other work that might be here.
+				schedule_delayed_work(&psock->work, 1);
 				goto end;
 			}
 			/* Hard errors break pipe and stop xmit. */
@@ -733,7 +738,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	INIT_LIST_HEAD(&psock->link);
 	spin_lock_init(&psock->link_lock);

-	INIT_WORK(&psock->work, sk_psock_backlog);
+	INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
 	mutex_init(&psock->work_mutex);
 	INIT_LIST_HEAD(&psock->ingress_msg);
 	spin_lock_init(&psock->ingress_lock);
@@ -822,7 +827,7 @@ static void sk_psock_destroy(struct work_struct *work)

 	sk_psock_done_strp(psock);

-	cancel_work_sync(&psock->work);
+	cancel_delayed_work_sync(&psock->work);
 	mutex_destroy(&psock->work_mutex);

 	psock_progs_drop(&psock->progs);
@@ -937,7 +942,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
 	}

 	skb_queue_tail(&psock_other->ingress_skb, skb);
-	schedule_work(&psock_other->work);
+	schedule_delayed_work(&psock_other->work, 0);
 	spin_unlock_bh(&psock_other->ingress_lock);
 	return 0;
 }
@@ -1017,7 +1022,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 		skb_queue_tail(&psock->ingress_skb, skb);
-		schedule_work(&psock->work);
+		schedule_delayed_work(&psock->work, 0);
 		err = 0;
 	}
 	spin_unlock_bh(&psock->ingress_lock);
@@ -1048,7 +1053,7 @@ static void sk_psock_write_space(struct sock *sk)
 	psock = sk_psock(sk);
 	if (likely(psock)) {
 		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-			schedule_work(&psock->work);
+			schedule_delayed_work(&psock->work, 0);
 		write_space = psock->saved_write_space;
 	}
 	rcu_read_unlock();
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index a68a7290a3b2..d38267201892 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1624,9 +1624,10 @@ void sock_map_close(struct sock *sk, long timeout)
 	rcu_read_unlock();
 	sk_psock_stop(psock);
 	release_sock(sk);
-	cancel_work_sync(&psock->work);
+	cancel_delayed_work_sync(&psock->work);
 	sk_psock_put(sk, psock);
 }
+
 /* Make sure we do not recurse. This is a bug.
  * Leak the socket instead of crashing on a stack overflow.
  */