| Message ID | 20220614171734.1103875-3-eric.dumazet@gmail.com (mailing list archive) |
| --- | --- |
| State | Accepted |
| Delegated to | Netdev Maintainers |
| Series | tcp: final (?) round of mem pressure fixes |
On Tue, Jun 14, 2022 at 1:17 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
>
> Blamed commit only dealt with applications issuing small writes.
>
> Issue here is that we allow to force memory schedule for the sk_buff
> allocation, but we have no guarantee that sendmsg() is able to
> copy some payload in it.
>
> In this patch, I make sure the socket can use up to tcp_wmem[0] bytes.
>
> For example, if we consider tcp_wmem[0] = 4096 (default on x86),
> and initial skb->truesize being 1280, tcp_sendmsg() is able to
> copy up to 2816 bytes under memory pressure.
>
> Before this patch a sendmsg() sending more than 2816 bytes
> would either block forever (if persistent memory pressure),
> or return -EAGAIN.
>
> For bigger MTU networks, it is advised to increase tcp_wmem[0]
> to avoid sending too small packets.
>
> v2: deal with zero copy paths.
>
> Fixes: 8e4d980ac215 ("tcp: fix behavior for epoll edge trigger")
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Acked-by: Soheil Hassas Yeganeh <soheil@google.com>

Very nice find! Thank you!

> ---
>  net/ipv4/tcp.c | 33 +++++++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 4 deletions(-)
>
> [...]
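To make the arithmetic in the commit message concrete, the sketch below is a minimal standalone model of the forced-scheduling budget — it is not the kernel code, and the parameter names (`tcp_wmem0`, `wmem_queued`, `forward_alloc`) are simplified stand-ins for the socket fields. It shows how tcp_wmem[0] = 4096 with an already-queued skb->truesize of 1280 leaves 2816 bytes that sendmsg() may still copy under memory pressure:

```c
#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

/* Simplified model of tcp_wmem_schedule() from the patch: under memory
 * pressure the socket may still charge up to tcp_wmem[0] - sk_wmem_queued
 * bytes, so sendmsg() can always copy at least some payload into the skb
 * it just allocated.
 */
static int wmem_schedule_model(int tcp_wmem0, int wmem_queued,
                               int forward_alloc, int copy)
{
        int left = tcp_wmem0 - wmem_queued;

        if (left > 0)
                forward_alloc += min_int(left, copy); /* forced schedule */
        return min_int(copy, forward_alloc);          /* bytes we may copy */
}

int main(void)
{
        /* tcp_wmem[0] = 4096 (x86 default), 1280 bytes of truesize already
         * queued, nothing pre-reserved, application wants to send 64 KB.
         */
        printf("%d\n", wmem_schedule_model(4096, 1280, 0, 65536)); /* 2816 */
        return 0;
}
```

The real tcp_wmem_schedule() additionally tries a normal sk_wmem_schedule() first and only falls back to charging the remaining budget through sk_forced_mem_schedule(), as the patch at the bottom of this page shows.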
On Tue, Jun 14, 2022 at 10:42 AM Soheil Hassas Yeganeh <soheil@google.com> wrote:
>
> On Tue, Jun 14, 2022 at 1:17 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > From: Eric Dumazet <edumazet@google.com>
> >
> > [...]
> >
> > @@ -986,7 +1003,11 @@ static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
> >                  tcp_mark_push(tp, skb);
> >                  goto new_segment;
> >          }
> > -        if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy))
> > +        if (tcp_downgrade_zcopy_pure(sk, skb))
> > +                return NULL;

Do we need to take care of the call to sk_wmem_schedule() inside
tcp_downgrade_zcopy_pure()?

> > [...]
On Tue, Jun 14, 2022 at 12:10 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Jun 14, 2022, 11:41 AM Wei Wang <weiwan@google.com> wrote:
>>
>> On Tue, Jun 14, 2022 at 10:42 AM Soheil Hassas Yeganeh
>> <soheil@google.com> wrote:
>> >
>> > On Tue, Jun 14, 2022 at 1:17 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > >
>> > > From: Eric Dumazet <edumazet@google.com>
>> > >
>> > > [...]
>> > >
>> > > Fixes: 8e4d980ac215 ("tcp: fix behavior for epoll edge trigger")
>> > > Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Wei Wang <weiwan@google.com>

>> >
>> > Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
>> >
>> > Very nice find! Thank you!
>> >
>> > > [...]
>> > > -        if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy))
>> > > +        if (tcp_downgrade_zcopy_pure(sk, skb))
>> > > +                return NULL;
>>
>> Do we need to take care of the call to sk_wmem_schedule() inside
>> tcp_downgrade_zcopy_pure()?
>
> We can not. Payload has been added already, and will be sent eventually.

Ack. Thanks for the explanation.
On Tue, Jun 14, 2022 at 10:17 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> From: Eric Dumazet <edumazet@google.com>
>
> [...]
>
> Fixes: 8e4d980ac215 ("tcp: fix behavior for epoll edge trigger")
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Shakeel Butt <shakeelb@google.com>
Hi,

On Tue, Jun 14, 2022 at 10:17:34AM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> [...]
>
> v2: deal with zero copy paths.

I think this might have gotten double applied:

https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=849b425cd091e1804af964b771761cfbefbafb43
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=f54755f6a11accb2db5ef17f8f75aad0875aefdc

and now net-next builds are broken:

../../../../../../../../net/ipv4/tcp.c:971:12: error: redefinition of ‘tcp_wmem_schedule’
  971 | static int tcp_wmem_schedule(struct sock *sk, int copy)
      |            ^~~~~~~~~~~~~~~~~
../../../../../../../../net/ipv4/tcp.c:954:12: note: previous definition of ‘tcp_wmem_schedule’ with type ‘int(struct sock *, int)’
  954 | static int tcp_wmem_schedule(struct sock *sk, int copy)
      |            ^~~~~~~~~~~~~~~~~
../../../../../../../../net/ipv4/tcp.c:954:12: warning: ‘tcp_wmem_schedule’ defined but not used [-Wunused-function]
make[5]: *** [/home/wgci/tmp/2813390.12234/tmp.0PMBO65tGf/scripts/Makefile.build:249: net/ipv4/tcp.o] Error 1
make[4]: *** [/home/wgci/tmp/2813390.12234/tmp.0PMBO65tGf/scripts/Makefile.build:466: net/ipv4] Error 2
make[4]: *** Waiting for unfinished jobs....

Jason
On Fri, Jun 17, 2022 at 12:08:14PM +0200, Jason A. Donenfeld wrote:
> Hi,
>
> On Tue, Jun 14, 2022 at 10:17:34AM -0700, Eric Dumazet wrote:
> > [...]
>
> I think this might have gotten double applied:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=849b425cd091e1804af964b771761cfbefbafb43
> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=f54755f6a11accb2db5ef17f8f75aad0875aefdc
>
> and now net-next builds are broken:
>
> [...]

Ah, fixed already:

https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=fd8b330ce1bb79ba00047435cc213b14f886bf1f

> Jason
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 14ebb4ec4a51f3c55501aa53423ce897599e8637..56083c2497f0b695c660256aa43f8a743d481697 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -951,6 +951,23 @@ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
         return 0;
 }
 
+static int tcp_wmem_schedule(struct sock *sk, int copy)
+{
+        int left;
+
+        if (likely(sk_wmem_schedule(sk, copy)))
+                return copy;
+
+        /* We could be in trouble if we have nothing queued.
+         * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
+         * to guarantee some progress.
+         */
+        left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued;
+        if (left > 0)
+                sk_forced_mem_schedule(sk, min(left, copy));
+        return min(copy, sk->sk_forward_alloc);
+}
+
 static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
                                       struct page *page, int offset, size_t *size)
 {
@@ -986,7 +1003,11 @@ static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
                 tcp_mark_push(tp, skb);
                 goto new_segment;
         }
-        if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy))
+        if (tcp_downgrade_zcopy_pure(sk, skb))
+                return NULL;
+
+        copy = tcp_wmem_schedule(sk, copy);
+        if (!copy)
                 return NULL;
 
         if (can_coalesce) {
@@ -1334,8 +1355,11 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 
                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
-                        if (tcp_downgrade_zcopy_pure(sk, skb) ||
-                            !sk_wmem_schedule(sk, copy))
+                        if (tcp_downgrade_zcopy_pure(sk, skb))
+                                goto wait_for_space;
+
+                        copy = tcp_wmem_schedule(sk, copy);
+                        if (!copy)
                                 goto wait_for_space;
 
                         err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
@@ -1362,7 +1386,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
                                 skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
 
                         if (!skb_zcopy_pure(skb)) {
-                                if (!sk_wmem_schedule(sk, copy))
+                                copy = tcp_wmem_schedule(sk, copy);
+                                if (!copy)
                                         goto wait_for_space;
                         }
 
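The userspace-visible symptom described in the commit message is -EAGAIN from a non-blocking send under memory pressure, which applications normally handle with a poll()-and-retry loop like the hedged sketch below. This is illustrative only and assumes `fd` is an already-connected, non-blocking TCP socket; it is not part of the patch. Before this fix such a loop could keep hitting EAGAIN for as long as memory pressure persisted; with tcp_wmem_schedule() in place the socket may queue up to tcp_wmem[0] bytes even under pressure, so the loop keeps making progress.

```c
#include <errno.h>
#include <poll.h>
#include <stddef.h>
#include <sys/socket.h>
#include <sys/types.h>

/* Send all of buf over a non-blocking TCP socket, waiting for POLLOUT
 * whenever the kernel reports EAGAIN (e.g. no send buffer space or
 * memory pressure).  Returns 0 on success, -1 on error with errno set.
 */
static int send_all(int fd, const char *buf, size_t len)
{
        while (len > 0) {
                ssize_t n = send(fd, buf, len, MSG_NOSIGNAL);

                if (n >= 0) {
                        buf += n;
                        len -= (size_t)n;
                        continue;
                }
                if (errno == EINTR)
                        continue;
                if (errno == EAGAIN || errno == EWOULDBLOCK) {
                        /* Wait until the kernel can accept more payload. */
                        struct pollfd pfd = { .fd = fd, .events = POLLOUT };

                        if (poll(&pfd, 1, -1) < 0 && errno != EINTR)
                                return -1;
                        continue;
                }
                return -1; /* hard error, errno set by send() */
        }
        return 0;
}
```

A caller would typically invoke send_all() with a buffer larger than tcp_wmem[0] to exercise exactly the partial-progress path the patch guarantees.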