Message ID | 8dd3343ea83a7a631acfe4940a9dea841b32c52a.1621613936.git.pabeni@redhat.com (mailing list archive)
---|---
State | Superseded, archived
Delegated to | Mat Martineau
Series | [v2,mptcp-next] mptcp: drop tx skb cache
Context | Check | Description |
---|---|---|
matttbe/KVM_Validation__normal | success | Success! ✅ |
matttbe/KVM_Validation__debug | success | Success! ✅ |
matttbe/KVM_Validation__btf__only_bpftest_all_ | success | Success! ✅ |
On Fri, 21 May 2021, Paolo Abeni wrote:

> The mentioned cache was introduced to reduce the number of skb
> allocations in atomic context, but the required complexity is
> excessive.
>
> This change removes the mentioned cache.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> v1 -> v2:
>  - drop unused field skb_tx_cache

There's also a size_goal_cache field in struct mptcp_sock that's now
unused.

I like the simplification! Do the self tests trigger enough memory
pressure to test allocation in that scenario?

The commit that introduced the skb_tx_cache did so to allow removal of
the skb_ext cache. It looks like both caches are now unnecessary because
the skb and skb_ext allocations are handled together in the
__mptcp_{do_,}alloc_tx_skb() functions - so an allocation failure can be
handled without the complexity of the caches. Does that match your
understanding? If so, it would be helpful information for the commit
message.

Thanks -

Mat

> ---
>  net/mptcp/protocol.c | 89 ++------------------------------------------
>  net/mptcp/protocol.h |  1 -
>  2 files changed, 4 insertions(+), 86 deletions(-)
>
> [...]

--
Mat Martineau
Intel
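For context on the combined allocation Mat refers to: the sketch below shows one way the skb and its MPTCP extension can be allocated together so that a failure of either step is reported from a single place. Only the __mptcp_{do_,}alloc_tx_skb() names come from this thread; the function body is a simplified, assumed reconstruction, not code taken from the patch.

/* Simplified sketch: allocate the skb and its MPTCP extension together,
 * so the caller sees a single success/failure point.
 */
static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
{
        struct sk_buff *skb;

        skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
        if (likely(skb)) {
                if (likely(__mptcp_add_ext(skb, gfp))) {
                        /* both allocations succeeded: hand back a ready skb */
                        skb_reserve(skb, MAX_TCP_HEADER);
                        return skb;
                }
                /* extension allocation failed: drop the skb as well */
                __kfree_skb(skb);
        } else {
                mptcp_enter_memory_pressure(sk);
        }
        return NULL;
}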
On Fri, 2021-05-21 at 15:57 -0700, Mat Martineau wrote:
> On Fri, 21 May 2021, Paolo Abeni wrote:
>
> > The mentioned cache was introduced to reduce the number of skb
> > allocations in atomic context, but the required complexity is
> > excessive.
> >
> > This change removes the mentioned cache.
> >
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > v1 -> v2:
> >  - drop unused field skb_tx_cache
>
> There's also a size_goal_cache field in struct mptcp_sock that's now
> unused.

yep, will do in v3 (sorry, I lost track of this review)

> I like the simplification! Do the self tests trigger enough memory
> pressure to test allocation in that scenario?

Will ensure that before posting v3

> The commit that introduced the skb_tx_cache did so to allow removal of
> the skb_ext cache. It looks like both caches are now unnecessary because
> the skb and skb_ext allocations are handled together in the
> __mptcp_{do_,}alloc_tx_skb() functions - so an allocation failure can be
> handled without the complexity of the caches.

The cache's goal was to avoid/reduce skb allocations in atomic context.
That is still true (e.g. with skb_tx_cache we have fewer skb allocations
in atomic context, especially when the transfer is link/pacing/peer
limited). Plain TCP can allocate skbs in atomic context, but that does
not happen often. MPTCP will do that for almost every skb when
link/pacing/peer limited.

I think we can ignore the above, as e.g. high-performance NIC device
drivers always allocate skbs in atomic context.

> Does that match your understanding? If so, it would be helpful
> information for the commit message.

The main trick, imho, is that we still use sk->sk_tx_skb_cache to
prevent the TCP stack from allocating extension-less skbs. That trick
was introduced with the mptcp tx_cache.

Not sure if the above clarifies anything?!

Cheers,

Paolo
On Wed, 2021-05-26 at 13:06 +0200, Paolo Abeni wrote:
> On Fri, 2021-05-21 at 15:57 -0700, Mat Martineau wrote:
> > On Fri, 21 May 2021, Paolo Abeni wrote:
> >
> > > The mentioned cache was introduced to reduce the number of skb
> > > allocations in atomic context, but the required complexity is
> > > excessive.
> > >
> > > This change removes the mentioned cache.
> > >
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > ---
> > > v1 -> v2:
> > >  - drop unused field skb_tx_cache
> >
> > There's also a size_goal_cache field in struct mptcp_sock that's now
> > unused.
>
> yep, will do in v3 (sorry, I lost track of this review)
>
> > I like the simplification! Do the self tests trigger enough memory
> > pressure to test allocation in that scenario?
>
> Will ensure that before posting v3

Well, we have problems under severe memory pressure, but that happens
even with the current export branch. I'll send a patch.

Cheers,

Paolo
On Wed, 26 May 2021, Paolo Abeni wrote:

> On Fri, 2021-05-21 at 15:57 -0700, Mat Martineau wrote:
>> On Fri, 21 May 2021, Paolo Abeni wrote:
>>
>>> The mentioned cache was introduced to reduce the number of skb
>>> allocations in atomic context, but the required complexity is
>>> excessive.
>>>
>>> This change removes the mentioned cache.
>>>
>>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>>> ---
>>> v1 -> v2:
>>>  - drop unused field skb_tx_cache
>>
>> There's also a size_goal_cache field in struct mptcp_sock that's now
>> unused.
>
> yep, will do in v3 (sorry, I lost track of this review)
>
>> I like the simplification! Do the self tests trigger enough memory
>> pressure to test allocation in that scenario?
>
> Will ensure that before posting v3
>
>> The commit that introduced the skb_tx_cache did so to allow removal of
>> the skb_ext cache. It looks like both caches are now unnecessary because
>> the skb and skb_ext allocations are handled together in the
>> __mptcp_{do_,}alloc_tx_skb() functions - so an allocation failure can be
>> handled without the complexity of the caches.
>
> The cache's goal was to avoid/reduce skb allocations in atomic context.
> That is still true (e.g. with skb_tx_cache we have fewer skb allocations
> in atomic context, especially when the transfer is link/pacing/peer
> limited). Plain TCP can allocate skbs in atomic context, but that does
> not happen often. MPTCP will do that for almost every skb when
> link/pacing/peer limited.
>
> I think we can ignore the above, as e.g. high-performance NIC device
> drivers always allocate skbs in atomic context.

I think you have a better sense of the tradeoffs with the allocation
context than I do - from what you describe it doesn't seem like the
extra complexity to avoid atomic allocation has a significant payoff.

>> Does that match your understanding? If so, it would be helpful
>> information for the commit message.
>
> The main trick, imho, is that we still use sk->sk_tx_skb_cache to
> prevent the TCP stack from allocating extension-less skbs. That trick
> was introduced with the mptcp tx_cache.
>
> Not sure if the above clarifies anything?!

Yes - it's helpful background, thanks!

--
Mat Martineau
Intel
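To illustrate the "trick" Paolo describes above: the sketch below shows how a pre-allocated skb could be parked in ssk->sk_tx_skb_cache so the TCP stack reuses it instead of allocating an extension-less skb of its own. The overall control flow matches the context lines visible in the diff below; the remaining details are assumptions, not code confirmed by this thread.

/* Rough sketch of the post-patch helper: make sure the subflow already has
 * an skb carrying the MPTCP extension before TCP builds the next segment.
 */
static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
{
        struct sk_buff *skb;

        /* an skb is already parked on the subflow: ensure it carries the
         * MPTCP extension and let TCP consume it
         */
        if (ssk->sk_tx_skb_cache) {
                skb = ssk->sk_tx_skb_cache;
                if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
                             !__mptcp_add_ext(skb, gfp)))
                        return false;
                return true;
        }

        /* otherwise allocate skb + extension in one shot ... */
        skb = __mptcp_do_alloc_tx_skb(sk, gfp);
        if (!skb)
                return false;

        /* ... and park it where the TCP transmit path will pick it up, so
         * the TCP stack never allocates an extension-less skb on its own
         */
        if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
                ssk->sk_tx_skb_cache = skb;
                return true;
        }
        kfree_skb(skb);
        return false;
}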
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 446acfb85493..1114a914d845 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -903,22 +903,14 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
                df->data_seq + df->data_len == msk->write_seq;
 }
 
-static int mptcp_wmem_with_overhead(struct sock *sk, int size)
+static int mptcp_wmem_with_overhead(int size)
 {
-       struct mptcp_sock *msk = mptcp_sk(sk);
-       int ret, skbs;
-
-       ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
-       skbs = (msk->tx_pending_data + size) / msk->size_goal_cache;
-       if (skbs < msk->skb_tx_cache.qlen)
-               return ret;
-
-       return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER);
+       return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
 }
 
 static void __mptcp_wmem_reserve(struct sock *sk, int size)
 {
-       int amount = mptcp_wmem_with_overhead(sk, size);
+       int amount = mptcp_wmem_with_overhead(size);
        struct mptcp_sock *msk = mptcp_sk(sk);
 
        WARN_ON_ONCE(msk->wmem_reserved);
@@ -1213,49 +1205,8 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
        return NULL;
 }
 
-static bool mptcp_tx_cache_refill(struct sock *sk, int size,
-                                 struct sk_buff_head *skbs, int *total_ts)
-{
-       struct mptcp_sock *msk = mptcp_sk(sk);
-       struct sk_buff *skb;
-       int space_needed;
-
-       if (unlikely(tcp_under_memory_pressure(sk))) {
-               mptcp_mem_reclaim_partial(sk);
-
-               /* under pressure pre-allocate at most a single skb */
-               if (msk->skb_tx_cache.qlen)
-                       return true;
-               space_needed = msk->size_goal_cache;
-       } else {
-               space_needed = msk->tx_pending_data + size -
-                              msk->skb_tx_cache.qlen * msk->size_goal_cache;
-       }
-
-       while (space_needed > 0) {
-               skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation);
-               if (unlikely(!skb)) {
-                       /* under memory pressure, try to pass the caller a
-                        * single skb to allow forward progress
-                        */
-                       while (skbs->qlen > 1) {
-                               skb = __skb_dequeue_tail(skbs);
-                               *total_ts -= skb->truesize;
-                               __kfree_skb(skb);
-                       }
-                       return skbs->qlen > 0;
-               }
-
-               *total_ts += skb->truesize;
-               __skb_queue_tail(skbs, skb);
-               space_needed -= msk->size_goal_cache;
-       }
-       return true;
-}
-
 static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 {
-       struct mptcp_sock *msk = mptcp_sk(sk);
        struct sk_buff *skb;
 
        if (ssk->sk_tx_skb_cache) {
@@ -1266,22 +1217,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
                return true;
        }
 
-       skb = skb_peek(&msk->skb_tx_cache);
-       if (skb) {
-               if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
-                       skb = __skb_dequeue(&msk->skb_tx_cache);
-                       if (WARN_ON_ONCE(!skb))
-                               return false;
-
-                       mptcp_wmem_uncharge(sk, skb->truesize);
-                       ssk->sk_tx_skb_cache = skb;
-                       return true;
-               }
-
-               /* over memory limit, no point to try to allocate a new skb */
-               return false;
-       }
-
        skb = __mptcp_do_alloc_tx_skb(sk, gfp);
        if (!skb)
                return false;
@@ -1297,7 +1232,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
 {
        return !ssk->sk_tx_skb_cache &&
-              !skb_peek(&mptcp_sk(sk)->skb_tx_cache) &&
               tcp_under_memory_pressure(sk);
 }
 
@@ -1340,7 +1274,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
        /* compute send limit */
        info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
        avail_size = info->size_goal;
-       msk->size_goal_cache = info->size_goal;
        skb = tcp_write_queue_tail(ssk);
        if (skb) {
                /* Limit the write to the size available in the
@@ -1689,7 +1622,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        while (msg_data_left(msg)) {
                int total_ts, frag_truesize = 0;
                struct mptcp_data_frag *dfrag;
-               struct sk_buff_head skbs;
                bool dfrag_collapsed;
                size_t psize, offset;
 
@@ -1722,16 +1654,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
                psize = pfrag->size - offset;
                psize = min_t(size_t, psize, msg_data_left(msg));
                total_ts = psize + frag_truesize;
-               __skb_queue_head_init(&skbs);
-               if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts))
-                       goto wait_for_memory;
 
-               if (!mptcp_wmem_alloc(sk, total_ts)) {
-                       __skb_queue_purge(&skbs);
+               if (!mptcp_wmem_alloc(sk, total_ts))
                        goto wait_for_memory;
-               }
 
-               skb_queue_splice_tail(&skbs, &msk->skb_tx_cache);
                if (copy_page_from_iter(dfrag->page, offset, psize,
                                        &msg->msg_iter) != psize) {
                        mptcp_wmem_uncharge(sk, psize + frag_truesize);
@@ -2460,13 +2386,11 @@ static int __mptcp_init_sock(struct sock *sk)
        INIT_LIST_HEAD(&msk->rtx_queue);
        INIT_WORK(&msk->work, mptcp_worker);
        __skb_queue_head_init(&msk->receive_queue);
-       __skb_queue_head_init(&msk->skb_tx_cache);
        msk->out_of_order_queue = RB_ROOT;
        msk->first_pending = NULL;
        msk->wmem_reserved = 0;
        msk->rmem_released = 0;
        msk->tx_pending_data = 0;
-       msk->size_goal_cache = TCP_BASE_MSS;
 
        msk->ack_hint = NULL;
        msk->first = NULL;
@@ -2527,15 +2451,10 @@ static void __mptcp_clear_xmit(struct sock *sk)
 {
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_data_frag *dtmp, *dfrag;
-       struct sk_buff *skb;
 
        WRITE_ONCE(msk->first_pending, NULL);
        list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
                dfrag_clear(sk, dfrag);
-       while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) {
-               sk->sk_forward_alloc += skb->truesize;
-               kfree_skb(skb);
-       }
 }
 
 static void mptcp_cancel_work(struct sock *sk)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 868e878af526..0ad1e1d08667 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -245,7 +245,6 @@ struct mptcp_sock {
        struct sk_buff  *ooo_last_skb;
        struct rb_root  out_of_order_queue;
        struct sk_buff_head receive_queue;
-       struct sk_buff_head skb_tx_cache;       /* this is wmem accounted */
        int             tx_pending_data;
        int             size_goal_cache;
        struct list_head conn_list;
The mentioned cache was introduced to reduce the number of skb
allocations in atomic context, but the required complexity is
excessive.

This change removes the mentioned cache.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v1 -> v2:
 - drop unused field skb_tx_cache
---
 net/mptcp/protocol.c | 89 ++------------------------------------------
 net/mptcp/protocol.h |  1 -
 2 files changed, 4 insertions(+), 86 deletions(-)
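As a worked illustration of the accounting that the patch keeps, the standalone snippet below recomputes the simplified mptcp_wmem_with_overhead() for a 64 KiB send. The struct layout and PAGE_SHIFT value are placeholder assumptions for the example only, not values taken from the patch.

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4 KiB pages */

/* stand-in for struct mptcp_data_frag; the real layout may differ */
struct data_frag_example {
        unsigned long long data_seq;
        void *page;
        struct data_frag_example *next, *prev;
        int data_len, offset, overhead, already_sent;
};

/* mirrors the simplified helper: payload plus per-page frag metadata */
static int wmem_with_overhead(int size)
{
        return size + (int)((sizeof(struct data_frag_example) * size) >> PAGE_SHIFT);
}

int main(void)
{
        int size = 64 * 1024;

        /* with this layout: 65536 + (48 * 65536 >> 12) = 65536 + 768 bytes */
        printf("%d bytes of payload reserve %d bytes\n", size, wmem_with_overhead(size));
        return 0;
}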