Message ID | 20220930014458.1219217-1-eric.dumazet@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] gro: add support of (hw)gro packets to gro stack | expand |
On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote: > From: Coco Li <lixiaoyan@google.com> > > Current GRO stack only supports incoming packets containing > one frame/MSS. > > This patch changes GRO to accept packets that are already GRO. > > HW-GRO (aka RSC for some vendors) is very often limited in presence > of interleaved packets. Linux SW GRO stack can complete the job > and provide larger GRO packets, thus reducing rate of ACK packets > and cpu overhead. > > This also means BIG TCP can be used, even if HW-GRO/RSC was > able to cook ~64 KB GRO packets. > > Co-Developed-by: Eric Dumazet <edumazet@google.com> > Signed-off-by: Eric Dumazet <edumazet@google.com> > Signed-off-by: Coco Li <lixiaoyan@google.com> > --- > net/core/gro.c | 13 +++++++++---- > net/ipv4/tcp_offload.c | 7 ++++++- > 2 files changed, 15 insertions(+), 5 deletions(-) > > diff --git a/net/core/gro.c b/net/core/gro.c > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644 > --- a/net/core/gro.c > +++ b/net/core/gro.c > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > unsigned int gro_max_size; > unsigned int new_truesize; > struct sk_buff *lp; > + int segs; > > /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > gro_max_size = READ_ONCE(p->dev->gro_max_size); > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > return -E2BIG; > } > > + segs = NAPI_GRO_CB(skb)->count; > lp = NAPI_GRO_CB(p)->last; > pinfo = skb_shinfo(lp); > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > lp = p; > > done: > - NAPI_GRO_CB(p)->count++; > + NAPI_GRO_CB(p)->count += segs; > p->data_len += len; > p->truesize += delta_truesize; > p->len += len; > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), > sizeof(u32))); /* Avoid slow unaligned acc */ > *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; > - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); > + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); > NAPI_GRO_CB(skb)->is_atomic = 1; > + NAPI_GRO_CB(skb)->count = max_t(u16, 1, > + skb_shinfo(skb)->gso_segs); > > /* Setup for GRO checksum validation */ > switch (skb->ip_summed) { > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > else > gro_list->count++; > > - NAPI_GRO_CB(skb)->count = 1; > NAPI_GRO_CB(skb)->age = jiffies; > NAPI_GRO_CB(skb)->last = skb; > - skb_shinfo(skb)->gso_size = skb_gro_len(skb); > + if (!skb_is_gso(skb)) > + skb_shinfo(skb)->gso_size = skb_gro_len(skb); > list_add(&skb->list, &gro_list->list); > ret = GRO_HELD; > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) > > skb->encapsulation = 0; > skb_shinfo(skb)->gso_type = 0; > + skb_shinfo(skb)->gso_size = 0; > if (unlikely(skb->slow_gro)) { > skb_orphan(skb); > skb_ext_reset(skb); > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644 > --- a/net/ipv4/tcp_offload.c > +++ b/net/ipv4/tcp_offload.c > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) > > mss = skb_shinfo(p)->gso_size; > > - flush |= (len - 1) >= mss; > + if (skb_is_gso(skb)) { > + flush |= (mss != skb_shinfo(skb)->gso_size); > + flush |= ((skb_gro_len(p) % mss) != 0); If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be true only if 'p' was an HW GRO packet (or at least a gso packet before entering the GRO engine), am I correct? In that case 'p' staged into the GRO hash up to the next packet (skb), just to be flushed. Should the above condition be instead: flush |= ((skb_gro_len(skb) % mss) != 0); ? And possibly use that condition while initializing NAPI_GRO_CB(skb)->flush in dev_gro_receive() ? > + } else { > + flush |= (len - 1) >= mss; > + } > flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); > #ifdef CONFIG_TLS_DEVICE > flush |= p->decrypted ^ skb->decrypted; I could not find a NIC doing HW GRO for UDP, so I guess we don't need something similar in udp_offload, right? Thanks! Paolo
On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote: > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote: > > From: Coco Li <lixiaoyan@google.com> > > > > Current GRO stack only supports incoming packets containing > > one frame/MSS. > > > > This patch changes GRO to accept packets that are already GRO. > > > > HW-GRO (aka RSC for some vendors) is very often limited in presence > > of interleaved packets. Linux SW GRO stack can complete the job > > and provide larger GRO packets, thus reducing rate of ACK packets > > and cpu overhead. > > > > This also means BIG TCP can be used, even if HW-GRO/RSC was > > able to cook ~64 KB GRO packets. > > > > Co-Developed-by: Eric Dumazet <edumazet@google.com> > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > Signed-off-by: Coco Li <lixiaoyan@google.com> > > --- > > net/core/gro.c | 13 +++++++++---- > > net/ipv4/tcp_offload.c | 7 ++++++- > > 2 files changed, 15 insertions(+), 5 deletions(-) > > > > diff --git a/net/core/gro.c b/net/core/gro.c > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644 > > --- a/net/core/gro.c > > +++ b/net/core/gro.c > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > unsigned int gro_max_size; > > unsigned int new_truesize; > > struct sk_buff *lp; > > + int segs; > > > > /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > > gro_max_size = READ_ONCE(p->dev->gro_max_size); > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > return -E2BIG; > > } > > > > + segs = NAPI_GRO_CB(skb)->count; > > lp = NAPI_GRO_CB(p)->last; > > pinfo = skb_shinfo(lp); > > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > lp = p; > > > > done: > > - NAPI_GRO_CB(p)->count++; > > + NAPI_GRO_CB(p)->count += segs; > > p->data_len += len; > > p->truesize += delta_truesize; > > p->len += len; > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), > > sizeof(u32))); /* Avoid slow unaligned acc */ > > *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; > > - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); > > + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); > > NAPI_GRO_CB(skb)->is_atomic = 1; > > + NAPI_GRO_CB(skb)->count = max_t(u16, 1, > > + skb_shinfo(skb)->gso_segs); > > > > /* Setup for GRO checksum validation */ > > switch (skb->ip_summed) { > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > else > > gro_list->count++; > > > > - NAPI_GRO_CB(skb)->count = 1; > > NAPI_GRO_CB(skb)->age = jiffies; > > NAPI_GRO_CB(skb)->last = skb; > > - skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > + if (!skb_is_gso(skb)) > > + skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > list_add(&skb->list, &gro_list->list); > > ret = GRO_HELD; > > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) > > > > skb->encapsulation = 0; > > skb_shinfo(skb)->gso_type = 0; > > + skb_shinfo(skb)->gso_size = 0; > > if (unlikely(skb->slow_gro)) { > > skb_orphan(skb); > > skb_ext_reset(skb); > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644 > > --- a/net/ipv4/tcp_offload.c > > +++ b/net/ipv4/tcp_offload.c > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) > > > > mss = skb_shinfo(p)->gso_size; > > > > - flush |= (len - 1) >= mss; > > + if (skb_is_gso(skb)) { > > + flush |= (mss != skb_shinfo(skb)->gso_size); > > + flush |= ((skb_gro_len(p) % mss) != 0); > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be > true only if 'p' was an HW GRO packet (or at least a gso packet before > entering the GRO engine), am I correct? In that case 'p' staged into > the GRO hash up to the next packet (skb), just to be flushed. > > Should the above condition be instead: > > flush |= ((skb_gro_len(skb) % mss) != 0); Yes, probable typo. > ? > > And possibly use that condition while initializing > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ? Not sure, this would add an extra test in dev_gro_receive() It seems better to leave the test here, because the prior condition needs to stay here. if (skb_is_gso(skb)) { flush |= (mss != skb_shinfo(skb)->gso_size); > > > + } else { > > + flush |= (len - 1) >= mss; > > + } > > flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); > > #ifdef CONFIG_TLS_DEVICE > > flush |= p->decrypted ^ skb->decrypted; > > I could not find a NIC doing HW GRO for UDP, so I guess we don't need > something similar in udp_offload, right? Well, this could be added just in case :)
On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@google.com> wrote: > > On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote: > > > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote: > > > From: Coco Li <lixiaoyan@google.com> > > > > > > Current GRO stack only supports incoming packets containing > > > one frame/MSS. > > > > > > This patch changes GRO to accept packets that are already GRO. > > > > > > HW-GRO (aka RSC for some vendors) is very often limited in presence > > > of interleaved packets. Linux SW GRO stack can complete the job > > > and provide larger GRO packets, thus reducing rate of ACK packets > > > and cpu overhead. > > > > > > This also means BIG TCP can be used, even if HW-GRO/RSC was > > > able to cook ~64 KB GRO packets. > > > > > > Co-Developed-by: Eric Dumazet <edumazet@google.com> > > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > > Signed-off-by: Coco Li <lixiaoyan@google.com> > > > --- > > > net/core/gro.c | 13 +++++++++---- > > > net/ipv4/tcp_offload.c | 7 ++++++- > > > 2 files changed, 15 insertions(+), 5 deletions(-) > > > > > > diff --git a/net/core/gro.c b/net/core/gro.c > > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644 > > > --- a/net/core/gro.c > > > +++ b/net/core/gro.c > > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > unsigned int gro_max_size; > > > unsigned int new_truesize; > > > struct sk_buff *lp; > > > + int segs; > > > > > > /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > > > gro_max_size = READ_ONCE(p->dev->gro_max_size); > > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > return -E2BIG; > > > } > > > > > > + segs = NAPI_GRO_CB(skb)->count; > > > lp = NAPI_GRO_CB(p)->last; > > > pinfo = skb_shinfo(lp); > > > > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > lp = p; > > > > > > done: > > > - NAPI_GRO_CB(p)->count++; > > > + NAPI_GRO_CB(p)->count += segs; > > > p->data_len += len; > > > p->truesize += delta_truesize; > > > p->len += len; > > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > > BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), > > > sizeof(u32))); /* Avoid slow unaligned acc */ > > > *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; > > > - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); > > > + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); > > > NAPI_GRO_CB(skb)->is_atomic = 1; > > > + NAPI_GRO_CB(skb)->count = max_t(u16, 1, > > > + skb_shinfo(skb)->gso_segs); > > > > > > /* Setup for GRO checksum validation */ > > > switch (skb->ip_summed) { > > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > > else > > > gro_list->count++; > > > > > > - NAPI_GRO_CB(skb)->count = 1; > > > NAPI_GRO_CB(skb)->age = jiffies; > > > NAPI_GRO_CB(skb)->last = skb; > > > - skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > > + if (!skb_is_gso(skb)) > > > + skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > > list_add(&skb->list, &gro_list->list); > > > ret = GRO_HELD; > > > > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) > > > > > > skb->encapsulation = 0; > > > skb_shinfo(skb)->gso_type = 0; > > > + skb_shinfo(skb)->gso_size = 0; > > > if (unlikely(skb->slow_gro)) { > > > skb_orphan(skb); > > > skb_ext_reset(skb); > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644 > > > --- a/net/ipv4/tcp_offload.c > > > +++ b/net/ipv4/tcp_offload.c > > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) > > > > > > mss = skb_shinfo(p)->gso_size; > > > > > > - flush |= (len - 1) >= mss; > > > + if (skb_is_gso(skb)) { > > > + flush |= (mss != skb_shinfo(skb)->gso_size); > > > + flush |= ((skb_gro_len(p) % mss) != 0); > > > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be > > true only if 'p' was an HW GRO packet (or at least a gso packet before > > entering the GRO engine), am I correct? In that case 'p' staged into > > the GRO hash up to the next packet (skb), just to be flushed. > > > > Should the above condition be instead: > > > > flush |= ((skb_gro_len(skb) % mss) != 0); > > Yes, probable typo. > > > ? > > > > And possibly use that condition while initializing > > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ? > > Not sure, this would add an extra test in dev_gro_receive() > > It seems better to leave the test here, because the prior condition > needs to stay here. > > if (skb_is_gso(skb)) { > flush |= (mss != skb_shinfo(skb)->gso_size); > Oh well, I think Coco missed the fact that the ((skb_gro_len(skb) % mss) != 0) condition needs to be put after label out_check_final. For example, if MSS==1000, and p has 4 segments, we still want to aggregate skb into p if skb payload is not a multiple of MSS. > > > > > > + } else { > > > + flush |= (len - 1) >= mss; > > > + } > > > flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); > > > #ifdef CONFIG_TLS_DEVICE > > > flush |= p->decrypted ^ skb->decrypted; > > > > I could not find a NIC doing HW GRO for UDP, so I guess we don't need > > something similar in udp_offload, right? > > Well, this could be added just in case :)
On Fri, Sep 30, 2022 at 1:00 PM Eric Dumazet <edumazet@google.com> wrote: > > On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@google.com> wrote: > > > > On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote: > > > > > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote: > > > > From: Coco Li <lixiaoyan@google.com> > > > > > > > > Current GRO stack only supports incoming packets containing > > > > one frame/MSS. > > > > > > > > This patch changes GRO to accept packets that are already GRO. > > > > > > > > HW-GRO (aka RSC for some vendors) is very often limited in presence > > > > of interleaved packets. Linux SW GRO stack can complete the job > > > > and provide larger GRO packets, thus reducing rate of ACK packets > > > > and cpu overhead. > > > > > > > > This also means BIG TCP can be used, even if HW-GRO/RSC was > > > > able to cook ~64 KB GRO packets. > > > > > > > > Co-Developed-by: Eric Dumazet <edumazet@google.com> > > > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > > > Signed-off-by: Coco Li <lixiaoyan@google.com> > > > > --- > > > > net/core/gro.c | 13 +++++++++---- > > > > net/ipv4/tcp_offload.c | 7 ++++++- > > > > 2 files changed, 15 insertions(+), 5 deletions(-) > > > > > > > > diff --git a/net/core/gro.c b/net/core/gro.c > > > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644 > > > > --- a/net/core/gro.c > > > > +++ b/net/core/gro.c > > > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > unsigned int gro_max_size; > > > > unsigned int new_truesize; > > > > struct sk_buff *lp; > > > > + int segs; > > > > > > > > /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > > > > gro_max_size = READ_ONCE(p->dev->gro_max_size); > > > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > return -E2BIG; > > > > } > > > > > > > > + segs = NAPI_GRO_CB(skb)->count; > > > > lp = NAPI_GRO_CB(p)->last; > > > > pinfo = skb_shinfo(lp); > > > > > > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > lp = p; > > > > > > > > done: > > > > - NAPI_GRO_CB(p)->count++; > > > > + NAPI_GRO_CB(p)->count += segs; > > > > p->data_len += len; > > > > p->truesize += delta_truesize; > > > > p->len += len; > > > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > > > BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), > > > > sizeof(u32))); /* Avoid slow unaligned acc */ > > > > *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; > > > > - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); > > > > + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); > > > > NAPI_GRO_CB(skb)->is_atomic = 1; > > > > + NAPI_GRO_CB(skb)->count = max_t(u16, 1, > > > > + skb_shinfo(skb)->gso_segs); > > > > > > > > /* Setup for GRO checksum validation */ > > > > switch (skb->ip_summed) { > > > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > > > else > > > > gro_list->count++; > > > > > > > > - NAPI_GRO_CB(skb)->count = 1; > > > > NAPI_GRO_CB(skb)->age = jiffies; > > > > NAPI_GRO_CB(skb)->last = skb; > > > > - skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > > > + if (!skb_is_gso(skb)) > > > > + skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > > > list_add(&skb->list, &gro_list->list); > > > > ret = GRO_HELD; > > > > > > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) > > > > > > > > skb->encapsulation = 0; > > > > skb_shinfo(skb)->gso_type = 0; > > > > + skb_shinfo(skb)->gso_size = 0; > > > > if (unlikely(skb->slow_gro)) { > > > > skb_orphan(skb); > > > > skb_ext_reset(skb); > > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > > > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644 > > > > --- a/net/ipv4/tcp_offload.c > > > > +++ b/net/ipv4/tcp_offload.c > > > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) > > > > > > > > mss = skb_shinfo(p)->gso_size; > > > > > > > > - flush |= (len - 1) >= mss; > > > > + if (skb_is_gso(skb)) { > > > > + flush |= (mss != skb_shinfo(skb)->gso_size); > > > > + flush |= ((skb_gro_len(p) % mss) != 0); > > > > > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be > > > true only if 'p' was an HW GRO packet (or at least a gso packet before > > > entering the GRO engine), am I correct? In that case 'p' staged into > > > the GRO hash up to the next packet (skb), just to be flushed. > > > > > > Should the above condition be instead: > > > > > > flush |= ((skb_gro_len(skb) % mss) != 0); > > > > Yes, probable typo. > > > > > ? > > > > > > And possibly use that condition while initializing > > > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ? > > > > Not sure, this would add an extra test in dev_gro_receive() > > > > It seems better to leave the test here, because the prior condition > > needs to stay here. > > > > if (skb_is_gso(skb)) { > > flush |= (mss != skb_shinfo(skb)->gso_size); > > > > Oh well, I think Coco missed the fact that the ((skb_gro_len(skb) % mss) != 0) > condition needs to be put after label out_check_final. > > For example, if MSS==1000, and p has 4 segments, we still want to > aggregate skb into p > if skb payload is not a multiple of MSS. > Relative diff would be: diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 0223bbfe9568064b47bc6227d342a4d25c9edfa7..79996b007bd64635aea27e3fddf291abe10ceca1 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -255,26 +255,27 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) mss = skb_shinfo(p)->gso_size; - if (skb_is_gso(skb)) { + if (unlikely(skb_is_gso(skb))) flush |= (mss != skb_shinfo(skb)->gso_size); - flush |= ((skb_gro_len(p) % mss) != 0); - } else { + else flush |= (len - 1) >= mss; - } + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); #ifdef CONFIG_TLS_DEVICE flush |= p->decrypted ^ skb->decrypted; #endif if (flush || skb_gro_receive(p, skb)) { - mss = 1; - goto out_check_final; + flush = 0; + goto check_flags; } tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: - flush = len < mss; + flush = len != NAPI_GRO_CB(skb)->count * mss; + +check_flags: flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN));
On Fri, 2022-09-30 at 13:19 -0700, Eric Dumazet wrote: > On Fri, Sep 30, 2022 at 1:00 PM Eric Dumazet <edumazet@google.com> wrote: > > > > On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@google.com> wrote: > > > > > > On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote: > > > > > > > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote: > > > > > From: Coco Li <lixiaoyan@google.com> > > > > > > > > > > Current GRO stack only supports incoming packets containing > > > > > one frame/MSS. > > > > > > > > > > This patch changes GRO to accept packets that are already GRO. > > > > > > > > > > HW-GRO (aka RSC for some vendors) is very often limited in presence > > > > > of interleaved packets. Linux SW GRO stack can complete the job > > > > > and provide larger GRO packets, thus reducing rate of ACK packets > > > > > and cpu overhead. > > > > > > > > > > This also means BIG TCP can be used, even if HW-GRO/RSC was > > > > > able to cook ~64 KB GRO packets. > > > > > > > > > > Co-Developed-by: Eric Dumazet <edumazet@google.com> > > > > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > > > > Signed-off-by: Coco Li <lixiaoyan@google.com> > > > > > --- > > > > > net/core/gro.c | 13 +++++++++---- > > > > > net/ipv4/tcp_offload.c | 7 ++++++- > > > > > 2 files changed, 15 insertions(+), 5 deletions(-) > > > > > > > > > > diff --git a/net/core/gro.c b/net/core/gro.c > > > > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644 > > > > > --- a/net/core/gro.c > > > > > +++ b/net/core/gro.c > > > > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > > unsigned int gro_max_size; > > > > > unsigned int new_truesize; > > > > > struct sk_buff *lp; > > > > > + int segs; > > > > > > > > > > /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > > > > > gro_max_size = READ_ONCE(p->dev->gro_max_size); > > > > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > > return -E2BIG; > > > > > } > > > > > > > > > > + segs = NAPI_GRO_CB(skb)->count; > > > > > lp = NAPI_GRO_CB(p)->last; > > > > > pinfo = skb_shinfo(lp); > > > > > > > > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > > lp = p; > > > > > > > > > > done: > > > > > - NAPI_GRO_CB(p)->count++; > > > > > + NAPI_GRO_CB(p)->count += segs; > > > > > p->data_len += len; > > > > > p->truesize += delta_truesize; > > > > > p->len += len; > > > > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > > > > BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), > > > > > sizeof(u32))); /* Avoid slow unaligned acc */ > > > > > *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; > > > > > - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); > > > > > + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); > > > > > NAPI_GRO_CB(skb)->is_atomic = 1; > > > > > + NAPI_GRO_CB(skb)->count = max_t(u16, 1, > > > > > + skb_shinfo(skb)->gso_segs); > > > > > > > > > > /* Setup for GRO checksum validation */ > > > > > switch (skb->ip_summed) { > > > > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > > > > else > > > > > gro_list->count++; > > > > > > > > > > - NAPI_GRO_CB(skb)->count = 1; > > > > > NAPI_GRO_CB(skb)->age = jiffies; > > > > > NAPI_GRO_CB(skb)->last = skb; > > > > > - skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > > > > + if (!skb_is_gso(skb)) > > > > > + skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > > > > list_add(&skb->list, &gro_list->list); > > > > > ret = GRO_HELD; > > > > > > > > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) > > > > > > > > > > skb->encapsulation = 0; > > > > > skb_shinfo(skb)->gso_type = 0; > > > > > + skb_shinfo(skb)->gso_size = 0; > > > > > if (unlikely(skb->slow_gro)) { > > > > > skb_orphan(skb); > > > > > skb_ext_reset(skb); > > > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > > > > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644 > > > > > --- a/net/ipv4/tcp_offload.c > > > > > +++ b/net/ipv4/tcp_offload.c > > > > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) > > > > > > > > > > mss = skb_shinfo(p)->gso_size; > > > > > > > > > > - flush |= (len - 1) >= mss; > > > > > + if (skb_is_gso(skb)) { > > > > > + flush |= (mss != skb_shinfo(skb)->gso_size); > > > > > + flush |= ((skb_gro_len(p) % mss) != 0); > > > > > > > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be > > > > true only if 'p' was an HW GRO packet (or at least a gso packet before > > > > entering the GRO engine), am I correct? In that case 'p' staged into > > > > the GRO hash up to the next packet (skb), just to be flushed. > > > > > > > > Should the above condition be instead: > > > > > > > > flush |= ((skb_gro_len(skb) % mss) != 0); > > > > > > Yes, probable typo. > > > > > > > ? > > > > > > > > And possibly use that condition while initializing > > > > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ? > > > > > > Not sure, this would add an extra test in dev_gro_receive() > > > > > > It seems better to leave the test here, because the prior condition > > > needs to stay here. > > > > > > if (skb_is_gso(skb)) { > > > flush |= (mss != skb_shinfo(skb)->gso_size); > > > > > > > Oh well, I think Coco missed the fact that the ((skb_gro_len(skb) % mss) != 0) > > condition needs to be put after label out_check_final. > > > > For example, if MSS==1000, and p has 4 segments, we still want to > > aggregate skb into p > > if skb payload is not a multiple of MSS. > > > > Relative diff would be: > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > index 0223bbfe9568064b47bc6227d342a4d25c9edfa7..79996b007bd64635aea27e3fddf291abe10ceca1 > 100644 > --- a/net/ipv4/tcp_offload.c > +++ b/net/ipv4/tcp_offload.c > @@ -255,26 +255,27 @@ struct sk_buff *tcp_gro_receive(struct list_head > *head, struct sk_buff *skb) > > mss = skb_shinfo(p)->gso_size; > > - if (skb_is_gso(skb)) { > + if (unlikely(skb_is_gso(skb))) > flush |= (mss != skb_shinfo(skb)->gso_size); > - flush |= ((skb_gro_len(p) % mss) != 0); > - } else { > + else > flush |= (len - 1) >= mss; > - } > + > flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); > #ifdef CONFIG_TLS_DEVICE > flush |= p->decrypted ^ skb->decrypted; > #endif > > if (flush || skb_gro_receive(p, skb)) { > - mss = 1; > - goto out_check_final; > + flush = 0; > + goto check_flags; > } > > tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); > > out_check_final: > - flush = len < mss; > + flush = len != NAPI_GRO_CB(skb)->count * mss; Not sure if it's worthy, perhaps mss can be updated under the unlikely(skb_is_gso(skb)) a few lines above: mss *= NAPI_GRO_CB(skb)->count; so that here we can avoid the additional operation for the non gso case? - just: flush = len != mss; In any case LGTM, thanks! Paolo
On Fri, Sep 30, 2022 at 2:02 PM Paolo Abeni <pabeni@redhat.com> wrote: > > On Fri, 2022-09-30 at 13:19 -0700, Eric Dumazet wrote: > > On Fri, Sep 30, 2022 at 1:00 PM Eric Dumazet <edumazet@google.com> wrote: > > > > > > On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@google.com> wrote: > > > > > > > > On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote: > > > > > > > > > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote: > > > > > > From: Coco Li <lixiaoyan@google.com> > > > > > > > > > > > > Current GRO stack only supports incoming packets containing > > > > > > one frame/MSS. > > > > > > > > > > > > This patch changes GRO to accept packets that are already GRO. > > > > > > > > > > > > HW-GRO (aka RSC for some vendors) is very often limited in presence > > > > > > of interleaved packets. Linux SW GRO stack can complete the job > > > > > > and provide larger GRO packets, thus reducing rate of ACK packets > > > > > > and cpu overhead. > > > > > > > > > > > > This also means BIG TCP can be used, even if HW-GRO/RSC was > > > > > > able to cook ~64 KB GRO packets. > > > > > > > > > > > > Co-Developed-by: Eric Dumazet <edumazet@google.com> > > > > > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > > > > > Signed-off-by: Coco Li <lixiaoyan@google.com> > > > > > > --- > > > > > > net/core/gro.c | 13 +++++++++---- > > > > > > net/ipv4/tcp_offload.c | 7 ++++++- > > > > > > 2 files changed, 15 insertions(+), 5 deletions(-) > > > > > > > > > > > > diff --git a/net/core/gro.c b/net/core/gro.c > > > > > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644 > > > > > > --- a/net/core/gro.c > > > > > > +++ b/net/core/gro.c > > > > > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > > > unsigned int gro_max_size; > > > > > > unsigned int new_truesize; > > > > > > struct sk_buff *lp; > > > > > > + int segs; > > > > > > > > > > > > /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > > > > > > gro_max_size = READ_ONCE(p->dev->gro_max_size); > > > > > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > > > return -E2BIG; > > > > > > } > > > > > > > > > > > > + segs = NAPI_GRO_CB(skb)->count; > > > > > > lp = NAPI_GRO_CB(p)->last; > > > > > > pinfo = skb_shinfo(lp); > > > > > > > > > > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > > > > lp = p; > > > > > > > > > > > > done: > > > > > > - NAPI_GRO_CB(p)->count++; > > > > > > + NAPI_GRO_CB(p)->count += segs; > > > > > > p->data_len += len; > > > > > > p->truesize += delta_truesize; > > > > > > p->len += len; > > > > > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > > > > > BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), > > > > > > sizeof(u32))); /* Avoid slow unaligned acc */ > > > > > > *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; > > > > > > - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); > > > > > > + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); > > > > > > NAPI_GRO_CB(skb)->is_atomic = 1; > > > > > > + NAPI_GRO_CB(skb)->count = max_t(u16, 1, > > > > > > + skb_shinfo(skb)->gso_segs); > > > > > > > > > > > > /* Setup for GRO checksum validation */ > > > > > > switch (skb->ip_summed) { > > > > > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff > > > > > > else > > > > > > gro_list->count++; > > > > > > > > > > > > - NAPI_GRO_CB(skb)->count = 1; > > > > > > NAPI_GRO_CB(skb)->age = jiffies; > > > > > > NAPI_GRO_CB(skb)->last = skb; > > > > > > - skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > > > > > + if (!skb_is_gso(skb)) > > > > > > + skb_shinfo(skb)->gso_size = skb_gro_len(skb); > > > > > > list_add(&skb->list, &gro_list->list); > > > > > > ret = GRO_HELD; > > > > > > > > > > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) > > > > > > > > > > > > skb->encapsulation = 0; > > > > > > skb_shinfo(skb)->gso_type = 0; > > > > > > + skb_shinfo(skb)->gso_size = 0; > > > > > > if (unlikely(skb->slow_gro)) { > > > > > > skb_orphan(skb); > > > > > > skb_ext_reset(skb); > > > > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > > > > > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644 > > > > > > --- a/net/ipv4/tcp_offload.c > > > > > > +++ b/net/ipv4/tcp_offload.c > > > > > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) > > > > > > > > > > > > mss = skb_shinfo(p)->gso_size; > > > > > > > > > > > > - flush |= (len - 1) >= mss; > > > > > > + if (skb_is_gso(skb)) { > > > > > > + flush |= (mss != skb_shinfo(skb)->gso_size); > > > > > > + flush |= ((skb_gro_len(p) % mss) != 0); > > > > > > > > > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be > > > > > true only if 'p' was an HW GRO packet (or at least a gso packet before > > > > > entering the GRO engine), am I correct? In that case 'p' staged into > > > > > the GRO hash up to the next packet (skb), just to be flushed. > > > > > > > > > > Should the above condition be instead: > > > > > > > > > > flush |= ((skb_gro_len(skb) % mss) != 0); > > > > > > > > Yes, probable typo. > > > > > > > > > ? > > > > > > > > > > And possibly use that condition while initializing > > > > > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ? > > > > > > > > Not sure, this would add an extra test in dev_gro_receive() > > > > > > > > It seems better to leave the test here, because the prior condition > > > > needs to stay here. > > > > > > > > if (skb_is_gso(skb)) { > > > > flush |= (mss != skb_shinfo(skb)->gso_size); > > > > > > > > > > Oh well, I think Coco missed the fact that the ((skb_gro_len(skb) % mss) != 0) > > > condition needs to be put after label out_check_final. > > > > > > For example, if MSS==1000, and p has 4 segments, we still want to > > > aggregate skb into p > > > if skb payload is not a multiple of MSS. > > > > > > > Relative diff would be: > > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > > index 0223bbfe9568064b47bc6227d342a4d25c9edfa7..79996b007bd64635aea27e3fddf291abe10ceca1 > > 100644 > > --- a/net/ipv4/tcp_offload.c > > +++ b/net/ipv4/tcp_offload.c > > @@ -255,26 +255,27 @@ struct sk_buff *tcp_gro_receive(struct list_head > > *head, struct sk_buff *skb) > > > > mss = skb_shinfo(p)->gso_size; > > > > - if (skb_is_gso(skb)) { > > + if (unlikely(skb_is_gso(skb))) > > flush |= (mss != skb_shinfo(skb)->gso_size); > > - flush |= ((skb_gro_len(p) % mss) != 0); > > - } else { > > + else > > flush |= (len - 1) >= mss; > > - } > > + > > flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); > > #ifdef CONFIG_TLS_DEVICE > > flush |= p->decrypted ^ skb->decrypted; > > #endif > > > > if (flush || skb_gro_receive(p, skb)) { > > - mss = 1; > > - goto out_check_final; > > + flush = 0; > > + goto check_flags; > > } > > > > tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); > > > > out_check_final: > > - flush = len < mss; > > + flush = len != NAPI_GRO_CB(skb)->count * mss; > > Not sure if it's worthy, perhaps mss can be updated under the > unlikely(skb_is_gso(skb)) a few lines above: > > mss *= NAPI_GRO_CB(skb)->count; > > so that here we can avoid the additional operation for the non gso > case? - just: > flush = len != mss; > Yes, but we still need a check even if the above code was not run (because of earlier goto out_check_final) I am thinking of: diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index a844a0d38482d916251f3aca4555c75c9770820c..ba8e6cfb3852fc609afe3022efa10f7a06fb4c12 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -255,7 +255,11 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) mss = skb_shinfo(p)->gso_size; - flush |= (len - 1) >= mss; + if (unlikely(skb_is_gso(skb))) + flush |= (mss != skb_shinfo(skb)->gso_size); + else + flush |= (len - 1) >= mss; + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); #ifdef CONFIG_TLS_DEVICE flush |= p->decrypted ^ skb->decrypted; @@ -269,7 +273,11 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: - flush = len < mss; + if (unlikely(skb_is_gso(skb))) + flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size; + else + flush = len < mss; + flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN));
diff --git a/net/core/gro.c b/net/core/gro.c index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int gro_max_size; unsigned int new_truesize; struct sk_buff *lp; + int segs; /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ gro_max_size = READ_ONCE(p->dev->gro_max_size); @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) return -E2BIG; } + segs = NAPI_GRO_CB(skb)->count; lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) lp = p; done: - NAPI_GRO_CB(p)->count++; + NAPI_GRO_CB(p)->count += segs; p->data_len += len; p->truesize += delta_truesize; p->len += len; @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->count = max_t(u16, 1, + skb_shinfo(skb)->gso_segs); /* Setup for GRO checksum validation */ switch (skb->ip_summed) { @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff else gro_list->count++; - NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; - skb_shinfo(skb)->gso_size = skb_gro_len(skb); + if (!skb_is_gso(skb)) + skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, &gro_list->list); ret = GRO_HELD; @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; + skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) mss = skb_shinfo(p)->gso_size; - flush |= (len - 1) >= mss; + if (skb_is_gso(skb)) { + flush |= (mss != skb_shinfo(skb)->gso_size); + flush |= ((skb_gro_len(p) % mss) != 0); + } else { + flush |= (len - 1) >= mss; + } flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); #ifdef CONFIG_TLS_DEVICE flush |= p->decrypted ^ skb->decrypted;