Message ID | 20230807134547.2782227-2-imagedong@tencent.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net: tcp: support probing OOM | expand |
On Mon, Aug 7, 2023 at 3:47 PM <menglong8.dong@gmail.com> wrote: > > From: Menglong Dong <imagedong@tencent.com> > > For now, skb will be dropped when no memory, which makes client keep > retrans util timeout and it's not friendly to the users. > > In this patch, we reply an ACK with zero-window in this case to update > the snd_wnd of the sender to 0. Therefore, the sender won't timeout the > connection and will probe the zero-window with the retransmits. > > Signed-off-by: Menglong Dong <imagedong@tencent.com> > --- > v2: > - send 0 rwin ACK for the receive queue empty case when necessary > - send the ACK immediately by using the ICSK_ACK_NOW flag > --- > include/net/inet_connection_sock.h | 3 ++- > net/ipv4/tcp_input.c | 14 +++++++++++--- > net/ipv4/tcp_output.c | 14 +++++++++++--- > 3 files changed, 24 insertions(+), 7 deletions(-) > > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h > index c2b15f7e5516..be3c858a2ebb 100644 > --- a/include/net/inet_connection_sock.h > +++ b/include/net/inet_connection_sock.h > @@ -164,7 +164,8 @@ enum inet_csk_ack_state_t { > ICSK_ACK_TIMER = 2, > ICSK_ACK_PUSHED = 4, > ICSK_ACK_PUSHED2 = 8, > - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ > + ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ > + ICSK_ACK_NOMEM = 32, > }; > > void inet_csk_init_xmit_timers(struct sock *sk, > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index 8e96ebe373d7..aae485d0a3b6 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -5059,12 +5059,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) > > /* Ok. In sequence. In window. */ > queue_and_out: > - if (skb_queue_len(&sk->sk_receive_queue) == 0) > - sk_forced_mem_schedule(sk, skb->truesize); > - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { > + if (skb_queue_len(&sk->sk_receive_queue) == 0) { > + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { > + sk_forced_mem_schedule(sk, skb->truesize); I think we want sk->sk_data_ready() here, to let applications drain the queue, regardless of sk->sk_rcvlowat value. > + inet_csk(sk)->icsk_ack.pending |= > + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); > + inet_csk_schedule_ack(sk); > + } > + } else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { > reason = SKB_DROP_REASON_PROTO_MEM; > NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); > sk->sk_data_ready(sk); We also want to keep this sk->sk_data_ready(sk) call. > + inet_csk(sk)->icsk_ack.pending |= > + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); > + inet_csk_schedule_ack(sk); > goto drop; > } This would suggest a different code refactoring, to avoid code duplication. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57c8af1859c16eba5e952a23ea959b628006f9c1..dde6c44f2c1e33dcf60c23b49cd99f270874ca96 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5050,13 +5050,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* Ok. In sequence. In window. */ queue_and_out: - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { - reason = SKB_DROP_REASON_PROTO_MEM; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + /* TODO: maybe ratelimit these WIN 0 ACK ? */ + inet_csk(sk)->icsk_ack.pending |= CSK_ACK_NOMEM | ICSK_ACK_NOW; + inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); - goto drop; + if (skb_queue_len(&sk->sk_receive_queue)) { + reason = SKB_DROP_REASON_PROTO_MEM; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + goto drop; + } + sk_forced_mem_schedule(sk, skb->truesize); } eaten = tcp_queue_rcv(sk, skb, &fragstolen);
On Mon, Aug 7, 2023 at 10:05 PM Eric Dumazet <edumazet@google.com> wrote: > > On Mon, Aug 7, 2023 at 3:47 PM <menglong8.dong@gmail.com> wrote: > > > > From: Menglong Dong <imagedong@tencent.com> > > > > For now, skb will be dropped when no memory, which makes client keep > > retrans util timeout and it's not friendly to the users. > > > > In this patch, we reply an ACK with zero-window in this case to update > > the snd_wnd of the sender to 0. Therefore, the sender won't timeout the > > connection and will probe the zero-window with the retransmits. > > > > Signed-off-by: Menglong Dong <imagedong@tencent.com> > > --- > > v2: > > - send 0 rwin ACK for the receive queue empty case when necessary > > - send the ACK immediately by using the ICSK_ACK_NOW flag > > --- > > include/net/inet_connection_sock.h | 3 ++- > > net/ipv4/tcp_input.c | 14 +++++++++++--- > > net/ipv4/tcp_output.c | 14 +++++++++++--- > > 3 files changed, 24 insertions(+), 7 deletions(-) > > > > diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h > > index c2b15f7e5516..be3c858a2ebb 100644 > > --- a/include/net/inet_connection_sock.h > > +++ b/include/net/inet_connection_sock.h > > @@ -164,7 +164,8 @@ enum inet_csk_ack_state_t { > > ICSK_ACK_TIMER = 2, > > ICSK_ACK_PUSHED = 4, > > ICSK_ACK_PUSHED2 = 8, > > - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ > > + ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ > > + ICSK_ACK_NOMEM = 32, > > }; > > > > void inet_csk_init_xmit_timers(struct sock *sk, > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > > index 8e96ebe373d7..aae485d0a3b6 100644 > > --- a/net/ipv4/tcp_input.c > > +++ b/net/ipv4/tcp_input.c > > @@ -5059,12 +5059,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) > > > > /* Ok. In sequence. In window. */ > > queue_and_out: > > - if (skb_queue_len(&sk->sk_receive_queue) == 0) > > - sk_forced_mem_schedule(sk, skb->truesize); > > - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { > > + if (skb_queue_len(&sk->sk_receive_queue) == 0) { > > + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { > > + sk_forced_mem_schedule(sk, skb->truesize); > > I think we want sk->sk_data_ready() here, to let applications drain the queue, > regardless of sk->sk_rcvlowat value. > > > > + inet_csk(sk)->icsk_ack.pending |= > > + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); > > + inet_csk_schedule_ack(sk); > > + } > > + } else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { > > reason = SKB_DROP_REASON_PROTO_MEM; > > NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); > > sk->sk_data_ready(sk); > > We also want to keep this sk->sk_data_ready(sk) call. > > > + inet_csk(sk)->icsk_ack.pending |= > > + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); > > + inet_csk_schedule_ack(sk); > > goto drop; > > } > > This would suggest a different code refactoring, to avoid code duplication. > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index 57c8af1859c16eba5e952a23ea959b628006f9c1..dde6c44f2c1e33dcf60c23b49cd99f270874ca96 > 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -5050,13 +5050,17 @@ static void tcp_data_queue(struct sock *sk, > struct sk_buff *skb) > > /* Ok. In sequence. In window. */ > queue_and_out: > - if (skb_queue_len(&sk->sk_receive_queue) == 0) > - sk_forced_mem_schedule(sk, skb->truesize); > - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { > - reason = SKB_DROP_REASON_PROTO_MEM; > - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); > + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { > + /* TODO: maybe ratelimit these WIN 0 ACK ? */ > + inet_csk(sk)->icsk_ack.pending |= > CSK_ACK_NOMEM | ICSK_ACK_NOW; > + inet_csk_schedule_ack(sk); > sk->sk_data_ready(sk); > - goto drop; > + if (skb_queue_len(&sk->sk_receive_queue)) { > + reason = SKB_DROP_REASON_PROTO_MEM; > + NET_INC_STATS(sock_net(sk), > LINUX_MIB_TCPRCVQDROP); > + goto drop; > + } > + sk_forced_mem_schedule(sk, skb->truesize); > } > > eaten = tcp_queue_rcv(sk, skb, &fragstolen); Looks much better, thank you!
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c2b15f7e5516..be3c858a2ebb 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -164,7 +164,8 @@ enum inet_csk_ack_state_t { ICSK_ACK_TIMER = 2, ICSK_ACK_PUSHED = 4, ICSK_ACK_PUSHED2 = 8, - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ + ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ + ICSK_ACK_NOMEM = 32, }; void inet_csk_init_xmit_timers(struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8e96ebe373d7..aae485d0a3b6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5059,12 +5059,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* Ok. In sequence. In window. */ queue_and_out: - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + if (skb_queue_len(&sk->sk_receive_queue) == 0) { + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + sk_forced_mem_schedule(sk, skb->truesize); + inet_csk(sk)->icsk_ack.pending |= + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); + inet_csk_schedule_ack(sk); + } + } else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); sk->sk_data_ready(sk); + inet_csk(sk)->icsk_ack.pending |= + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); + inet_csk_schedule_ack(sk); goto drop; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c5412ee77fc8..769a558159ee 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -257,11 +257,19 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 old_win = tp->rcv_wnd; - u32 cur_win = tcp_receive_window(tp); - u32 new_win = __tcp_select_window(sk); struct net *net = sock_net(sk); + u32 old_win = tp->rcv_wnd; + u32 cur_win, new_win; + + /* Make the window 0 if we failed to queue the data because we + * are out of memory. The window is temporary, so we don't store + * it on the socket. + */ + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) + return 0; + cur_win = tcp_receive_window(tp); + new_win = __tcp_select_window(sk); if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else