diff mbox series

[net-next] gro: add support of (hw)gro packets to gro stack

Message ID 20220930014458.1219217-1-eric.dumazet@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next] gro: add support of (hw)gro packets to gro stack | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 6 this patch: 6
netdev/cc_maintainers warning 2 maintainers not CCed: dsahern@kernel.org yoshfuji@linux-ipv6.org
netdev/build_clang success Errors and warnings before: 5 this patch: 5
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6 this patch: 6
netdev/checkpatch warning WARNING: 'Co-developed-by:' is the preferred signature form
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Eric Dumazet Sept. 30, 2022, 1:44 a.m. UTC
From: Coco Li <lixiaoyan@google.com>

Current GRO stack only supports incoming packets containing
one frame/MSS.

This patch changes GRO to accept packets that are already GRO.

HW-GRO (aka RSC for some vendors) is very often limited in presence
of interleaved packets. Linux SW GRO stack can complete the job
and provide larger GRO packets, thus reducing rate of ACK packets
and cpu overhead.

This also means BIG TCP can be used, even if HW-GRO/RSC was
able to cook ~64 KB GRO packets.

Co-Developed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Coco Li <lixiaoyan@google.com>
---
 net/core/gro.c         | 13 +++++++++----
 net/ipv4/tcp_offload.c |  7 ++++++-
 2 files changed, 15 insertions(+), 5 deletions(-)

Comments

Paolo Abeni Sept. 30, 2022, 8:45 a.m. UTC | #1
On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote:
> From: Coco Li <lixiaoyan@google.com>
> 
> Current GRO stack only supports incoming packets containing
> one frame/MSS.
> 
> This patch changes GRO to accept packets that are already GRO.
> 
> HW-GRO (aka RSC for some vendors) is very often limited in presence
> of interleaved packets. Linux SW GRO stack can complete the job
> and provide larger GRO packets, thus reducing rate of ACK packets
> and cpu overhead.
> 
> This also means BIG TCP can be used, even if HW-GRO/RSC was
> able to cook ~64 KB GRO packets.
> 
> Co-Developed-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Coco Li <lixiaoyan@google.com>
> ---
>  net/core/gro.c         | 13 +++++++++----
>  net/ipv4/tcp_offload.c |  7 ++++++-
>  2 files changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/net/core/gro.c b/net/core/gro.c
> index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
>  	unsigned int gro_max_size;
>  	unsigned int new_truesize;
>  	struct sk_buff *lp;
> +	int segs;
>  
>  	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
>  	gro_max_size = READ_ONCE(p->dev->gro_max_size);
> @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
>  			return -E2BIG;
>  	}
>  
> +	segs = NAPI_GRO_CB(skb)->count;
>  	lp = NAPI_GRO_CB(p)->last;
>  	pinfo = skb_shinfo(lp);
>  
> @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
>  	lp = p;
>  
>  done:
> -	NAPI_GRO_CB(p)->count++;
> +	NAPI_GRO_CB(p)->count += segs;
>  	p->data_len += len;
>  	p->truesize += delta_truesize;
>  	p->len += len;
> @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
>  		BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
>  					 sizeof(u32))); /* Avoid slow unaligned acc */
>  		*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
> -		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
> +		NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
>  		NAPI_GRO_CB(skb)->is_atomic = 1;
> +		NAPI_GRO_CB(skb)->count = max_t(u16, 1,
> +						skb_shinfo(skb)->gso_segs);
>  
>  		/* Setup for GRO checksum validation */
>  		switch (skb->ip_summed) {
> @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
>  	else
>  		gro_list->count++;
>  
> -	NAPI_GRO_CB(skb)->count = 1;
>  	NAPI_GRO_CB(skb)->age = jiffies;
>  	NAPI_GRO_CB(skb)->last = skb;
> -	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> +	if (!skb_is_gso(skb))
> +		skb_shinfo(skb)->gso_size = skb_gro_len(skb);
>  	list_add(&skb->list, &gro_list->list);
>  	ret = GRO_HELD;
>  
> @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
>  
>  	skb->encapsulation = 0;
>  	skb_shinfo(skb)->gso_type = 0;
> +	skb_shinfo(skb)->gso_size = 0;
>  	if (unlikely(skb->slow_gro)) {
>  		skb_orphan(skb);
>  		skb_ext_reset(skb);
> diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644
> --- a/net/ipv4/tcp_offload.c
> +++ b/net/ipv4/tcp_offload.c
> @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
>  
>  	mss = skb_shinfo(p)->gso_size;
>  
> -	flush |= (len - 1) >= mss;
> +	if (skb_is_gso(skb)) {
> +		flush |= (mss != skb_shinfo(skb)->gso_size);
> +		flush |= ((skb_gro_len(p) % mss) != 0);

If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be
true only if 'p' was an HW GRO packet (or at least a gso packet before
entering the GRO engine), am I correct? In that case 'p' staged into
the GRO hash up to the next packet (skb), just to be flushed.

Should the above condition be instead:

		flush |= ((skb_gro_len(skb) % mss) != 0);
?

And possibly use that condition while initializing
NAPI_GRO_CB(skb)->flush in dev_gro_receive() ?

> +	} else {
> +		flush |= (len - 1) >= mss;
> +	}
>  	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
>  #ifdef CONFIG_TLS_DEVICE
>  	flush |= p->decrypted ^ skb->decrypted;

I could not find a NIC doing HW GRO for UDP, so I guess we don't need
something similar in udp_offload, right?

Thanks!

Paolo
Eric Dumazet Sept. 30, 2022, 7:53 p.m. UTC | #2
On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote:
> > From: Coco Li <lixiaoyan@google.com>
> >
> > Current GRO stack only supports incoming packets containing
> > one frame/MSS.
> >
> > This patch changes GRO to accept packets that are already GRO.
> >
> > HW-GRO (aka RSC for some vendors) is very often limited in presence
> > of interleaved packets. Linux SW GRO stack can complete the job
> > and provide larger GRO packets, thus reducing rate of ACK packets
> > and cpu overhead.
> >
> > This also means BIG TCP can be used, even if HW-GRO/RSC was
> > able to cook ~64 KB GRO packets.
> >
> > Co-Developed-by: Eric Dumazet <edumazet@google.com>
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > Signed-off-by: Coco Li <lixiaoyan@google.com>
> > ---
> >  net/core/gro.c         | 13 +++++++++----
> >  net/ipv4/tcp_offload.c |  7 ++++++-
> >  2 files changed, 15 insertions(+), 5 deletions(-)
> >
> > diff --git a/net/core/gro.c b/net/core/gro.c
> > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644
> > --- a/net/core/gro.c
> > +++ b/net/core/gro.c
> > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> >       unsigned int gro_max_size;
> >       unsigned int new_truesize;
> >       struct sk_buff *lp;
> > +     int segs;
> >
> >       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> >       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> >                       return -E2BIG;
> >       }
> >
> > +     segs = NAPI_GRO_CB(skb)->count;
> >       lp = NAPI_GRO_CB(p)->last;
> >       pinfo = skb_shinfo(lp);
> >
> > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> >       lp = p;
> >
> >  done:
> > -     NAPI_GRO_CB(p)->count++;
> > +     NAPI_GRO_CB(p)->count += segs;
> >       p->data_len += len;
> >       p->truesize += delta_truesize;
> >       p->len += len;
> > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> >               BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
> >                                        sizeof(u32))); /* Avoid slow unaligned acc */
> >               *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
> > -             NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
> > +             NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
> >               NAPI_GRO_CB(skb)->is_atomic = 1;
> > +             NAPI_GRO_CB(skb)->count = max_t(u16, 1,
> > +                                             skb_shinfo(skb)->gso_segs);
> >
> >               /* Setup for GRO checksum validation */
> >               switch (skb->ip_summed) {
> > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> >       else
> >               gro_list->count++;
> >
> > -     NAPI_GRO_CB(skb)->count = 1;
> >       NAPI_GRO_CB(skb)->age = jiffies;
> >       NAPI_GRO_CB(skb)->last = skb;
> > -     skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > +     if (!skb_is_gso(skb))
> > +             skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> >       list_add(&skb->list, &gro_list->list);
> >       ret = GRO_HELD;
> >
> > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
> >
> >       skb->encapsulation = 0;
> >       skb_shinfo(skb)->gso_type = 0;
> > +     skb_shinfo(skb)->gso_size = 0;
> >       if (unlikely(skb->slow_gro)) {
> >               skb_orphan(skb);
> >               skb_ext_reset(skb);
> > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644
> > --- a/net/ipv4/tcp_offload.c
> > +++ b/net/ipv4/tcp_offload.c
> > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
> >
> >       mss = skb_shinfo(p)->gso_size;
> >
> > -     flush |= (len - 1) >= mss;
> > +     if (skb_is_gso(skb)) {
> > +             flush |= (mss != skb_shinfo(skb)->gso_size);
> > +             flush |= ((skb_gro_len(p) % mss) != 0);
>
> If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be
> true only if 'p' was an HW GRO packet (or at least a gso packet before
> entering the GRO engine), am I correct? In that case 'p' staged into
> the GRO hash up to the next packet (skb), just to be flushed.
>
> Should the above condition be instead:
>
>                 flush |= ((skb_gro_len(skb) % mss) != 0);

Yes, probable typo.

> ?
>
> And possibly use that condition while initializing
> NAPI_GRO_CB(skb)->flush in dev_gro_receive() ?

Not sure, this would add an extra test in dev_gro_receive()

It seems better to leave the test here, because the prior condition
needs to stay here.

if (skb_is_gso(skb)) {
             flush |= (mss != skb_shinfo(skb)->gso_size);


>
> > +     } else {
> > +             flush |= (len - 1) >= mss;
> > +     }
> >       flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
> >  #ifdef CONFIG_TLS_DEVICE
> >       flush |= p->decrypted ^ skb->decrypted;
>
> I could not find a NIC doing HW GRO for UDP, so I guess we don't need
> something similar in udp_offload, right?

Well, this could be added just in case :)
Eric Dumazet Sept. 30, 2022, 8 p.m. UTC | #3
On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> >
> > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote:
> > > From: Coco Li <lixiaoyan@google.com>
> > >
> > > Current GRO stack only supports incoming packets containing
> > > one frame/MSS.
> > >
> > > This patch changes GRO to accept packets that are already GRO.
> > >
> > > HW-GRO (aka RSC for some vendors) is very often limited in presence
> > > of interleaved packets. Linux SW GRO stack can complete the job
> > > and provide larger GRO packets, thus reducing rate of ACK packets
> > > and cpu overhead.
> > >
> > > This also means BIG TCP can be used, even if HW-GRO/RSC was
> > > able to cook ~64 KB GRO packets.
> > >
> > > Co-Developed-by: Eric Dumazet <edumazet@google.com>
> > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > > Signed-off-by: Coco Li <lixiaoyan@google.com>
> > > ---
> > >  net/core/gro.c         | 13 +++++++++----
> > >  net/ipv4/tcp_offload.c |  7 ++++++-
> > >  2 files changed, 15 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/net/core/gro.c b/net/core/gro.c
> > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644
> > > --- a/net/core/gro.c
> > > +++ b/net/core/gro.c
> > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > >       unsigned int gro_max_size;
> > >       unsigned int new_truesize;
> > >       struct sk_buff *lp;
> > > +     int segs;
> > >
> > >       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> > >       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > >                       return -E2BIG;
> > >       }
> > >
> > > +     segs = NAPI_GRO_CB(skb)->count;
> > >       lp = NAPI_GRO_CB(p)->last;
> > >       pinfo = skb_shinfo(lp);
> > >
> > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > >       lp = p;
> > >
> > >  done:
> > > -     NAPI_GRO_CB(p)->count++;
> > > +     NAPI_GRO_CB(p)->count += segs;
> > >       p->data_len += len;
> > >       p->truesize += delta_truesize;
> > >       p->len += len;
> > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > >               BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
> > >                                        sizeof(u32))); /* Avoid slow unaligned acc */
> > >               *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
> > > -             NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
> > > +             NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
> > >               NAPI_GRO_CB(skb)->is_atomic = 1;
> > > +             NAPI_GRO_CB(skb)->count = max_t(u16, 1,
> > > +                                             skb_shinfo(skb)->gso_segs);
> > >
> > >               /* Setup for GRO checksum validation */
> > >               switch (skb->ip_summed) {
> > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > >       else
> > >               gro_list->count++;
> > >
> > > -     NAPI_GRO_CB(skb)->count = 1;
> > >       NAPI_GRO_CB(skb)->age = jiffies;
> > >       NAPI_GRO_CB(skb)->last = skb;
> > > -     skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > +     if (!skb_is_gso(skb))
> > > +             skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > >       list_add(&skb->list, &gro_list->list);
> > >       ret = GRO_HELD;
> > >
> > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
> > >
> > >       skb->encapsulation = 0;
> > >       skb_shinfo(skb)->gso_type = 0;
> > > +     skb_shinfo(skb)->gso_size = 0;
> > >       if (unlikely(skb->slow_gro)) {
> > >               skb_orphan(skb);
> > >               skb_ext_reset(skb);
> > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644
> > > --- a/net/ipv4/tcp_offload.c
> > > +++ b/net/ipv4/tcp_offload.c
> > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
> > >
> > >       mss = skb_shinfo(p)->gso_size;
> > >
> > > -     flush |= (len - 1) >= mss;
> > > +     if (skb_is_gso(skb)) {
> > > +             flush |= (mss != skb_shinfo(skb)->gso_size);
> > > +             flush |= ((skb_gro_len(p) % mss) != 0);
> >
> > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be
> > true only if 'p' was an HW GRO packet (or at least a gso packet before
> > entering the GRO engine), am I correct? In that case 'p' staged into
> > the GRO hash up to the next packet (skb), just to be flushed.
> >
> > Should the above condition be instead:
> >
> >                 flush |= ((skb_gro_len(skb) % mss) != 0);
>
> Yes, probable typo.
>
> > ?
> >
> > And possibly use that condition while initializing
> > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ?
>
> Not sure, this would add an extra test in dev_gro_receive()
>
> It seems better to leave the test here, because the prior condition
> needs to stay here.
>
> if (skb_is_gso(skb)) {
>              flush |= (mss != skb_shinfo(skb)->gso_size);
>

Oh well, I think Coco missed the fact that the  ((skb_gro_len(skb) % mss) != 0)
condition needs to be put after label out_check_final.

For example, if MSS==1000, and p has 4 segments, we still want to
aggregate skb into p
if skb payload is not a multiple of MSS.


>
> >
> > > +     } else {
> > > +             flush |= (len - 1) >= mss;
> > > +     }
> > >       flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
> > >  #ifdef CONFIG_TLS_DEVICE
> > >       flush |= p->decrypted ^ skb->decrypted;
> >
> > I could not find a NIC doing HW GRO for UDP, so I guess we don't need
> > something similar in udp_offload, right?
>
> Well, this could be added just in case :)
Eric Dumazet Sept. 30, 2022, 8:19 p.m. UTC | #4
On Fri, Sep 30, 2022 at 1:00 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > >
> > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote:
> > > > From: Coco Li <lixiaoyan@google.com>
> > > >
> > > > Current GRO stack only supports incoming packets containing
> > > > one frame/MSS.
> > > >
> > > > This patch changes GRO to accept packets that are already GRO.
> > > >
> > > > HW-GRO (aka RSC for some vendors) is very often limited in presence
> > > > of interleaved packets. Linux SW GRO stack can complete the job
> > > > and provide larger GRO packets, thus reducing rate of ACK packets
> > > > and cpu overhead.
> > > >
> > > > This also means BIG TCP can be used, even if HW-GRO/RSC was
> > > > able to cook ~64 KB GRO packets.
> > > >
> > > > Co-Developed-by: Eric Dumazet <edumazet@google.com>
> > > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > > > Signed-off-by: Coco Li <lixiaoyan@google.com>
> > > > ---
> > > >  net/core/gro.c         | 13 +++++++++----
> > > >  net/ipv4/tcp_offload.c |  7 ++++++-
> > > >  2 files changed, 15 insertions(+), 5 deletions(-)
> > > >
> > > > diff --git a/net/core/gro.c b/net/core/gro.c
> > > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644
> > > > --- a/net/core/gro.c
> > > > +++ b/net/core/gro.c
> > > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > >       unsigned int gro_max_size;
> > > >       unsigned int new_truesize;
> > > >       struct sk_buff *lp;
> > > > +     int segs;
> > > >
> > > >       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> > > >       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > >                       return -E2BIG;
> > > >       }
> > > >
> > > > +     segs = NAPI_GRO_CB(skb)->count;
> > > >       lp = NAPI_GRO_CB(p)->last;
> > > >       pinfo = skb_shinfo(lp);
> > > >
> > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > >       lp = p;
> > > >
> > > >  done:
> > > > -     NAPI_GRO_CB(p)->count++;
> > > > +     NAPI_GRO_CB(p)->count += segs;
> > > >       p->data_len += len;
> > > >       p->truesize += delta_truesize;
> > > >       p->len += len;
> > > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > > >               BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
> > > >                                        sizeof(u32))); /* Avoid slow unaligned acc */
> > > >               *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
> > > > -             NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
> > > > +             NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
> > > >               NAPI_GRO_CB(skb)->is_atomic = 1;
> > > > +             NAPI_GRO_CB(skb)->count = max_t(u16, 1,
> > > > +                                             skb_shinfo(skb)->gso_segs);
> > > >
> > > >               /* Setup for GRO checksum validation */
> > > >               switch (skb->ip_summed) {
> > > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > > >       else
> > > >               gro_list->count++;
> > > >
> > > > -     NAPI_GRO_CB(skb)->count = 1;
> > > >       NAPI_GRO_CB(skb)->age = jiffies;
> > > >       NAPI_GRO_CB(skb)->last = skb;
> > > > -     skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > > +     if (!skb_is_gso(skb))
> > > > +             skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > >       list_add(&skb->list, &gro_list->list);
> > > >       ret = GRO_HELD;
> > > >
> > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
> > > >
> > > >       skb->encapsulation = 0;
> > > >       skb_shinfo(skb)->gso_type = 0;
> > > > +     skb_shinfo(skb)->gso_size = 0;
> > > >       if (unlikely(skb->slow_gro)) {
> > > >               skb_orphan(skb);
> > > >               skb_ext_reset(skb);
> > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> > > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644
> > > > --- a/net/ipv4/tcp_offload.c
> > > > +++ b/net/ipv4/tcp_offload.c
> > > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
> > > >
> > > >       mss = skb_shinfo(p)->gso_size;
> > > >
> > > > -     flush |= (len - 1) >= mss;
> > > > +     if (skb_is_gso(skb)) {
> > > > +             flush |= (mss != skb_shinfo(skb)->gso_size);
> > > > +             flush |= ((skb_gro_len(p) % mss) != 0);
> > >
> > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be
> > > true only if 'p' was an HW GRO packet (or at least a gso packet before
> > > entering the GRO engine), am I correct? In that case 'p' staged into
> > > the GRO hash up to the next packet (skb), just to be flushed.
> > >
> > > Should the above condition be instead:
> > >
> > >                 flush |= ((skb_gro_len(skb) % mss) != 0);
> >
> > Yes, probable typo.
> >
> > > ?
> > >
> > > And possibly use that condition while initializing
> > > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ?
> >
> > Not sure, this would add an extra test in dev_gro_receive()
> >
> > It seems better to leave the test here, because the prior condition
> > needs to stay here.
> >
> > if (skb_is_gso(skb)) {
> >              flush |= (mss != skb_shinfo(skb)->gso_size);
> >
>
> Oh well, I think Coco missed the fact that the  ((skb_gro_len(skb) % mss) != 0)
> condition needs to be put after label out_check_final.
>
> For example, if MSS==1000, and p has 4 segments, we still want to
> aggregate skb into p
> if skb payload is not a multiple of MSS.
>

Relative diff would be:

diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 0223bbfe9568064b47bc6227d342a4d25c9edfa7..79996b007bd64635aea27e3fddf291abe10ceca1
100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -255,26 +255,27 @@ struct sk_buff *tcp_gro_receive(struct list_head
*head, struct sk_buff *skb)

        mss = skb_shinfo(p)->gso_size;

-       if (skb_is_gso(skb)) {
+       if (unlikely(skb_is_gso(skb)))
                flush |= (mss != skb_shinfo(skb)->gso_size);
-               flush |= ((skb_gro_len(p) % mss) != 0);
-       } else {
+       else
                flush |= (len - 1) >= mss;
-       }
+
        flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
 #ifdef CONFIG_TLS_DEVICE
        flush |= p->decrypted ^ skb->decrypted;
 #endif

        if (flush || skb_gro_receive(p, skb)) {
-               mss = 1;
-               goto out_check_final;
+               flush = 0;
+               goto check_flags;
        }

        tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

 out_check_final:
-       flush = len < mss;
+       flush = len != NAPI_GRO_CB(skb)->count * mss;
+
+check_flags:
        flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
                                        TCP_FLAG_RST | TCP_FLAG_SYN |
                                        TCP_FLAG_FIN));
Paolo Abeni Sept. 30, 2022, 9:02 p.m. UTC | #5
On Fri, 2022-09-30 at 13:19 -0700, Eric Dumazet wrote:
> On Fri, Sep 30, 2022 at 1:00 PM Eric Dumazet <edumazet@google.com> wrote:
> > 
> > On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@google.com> wrote:
> > > 
> > > On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > > 
> > > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote:
> > > > > From: Coco Li <lixiaoyan@google.com>
> > > > > 
> > > > > Current GRO stack only supports incoming packets containing
> > > > > one frame/MSS.
> > > > > 
> > > > > This patch changes GRO to accept packets that are already GRO.
> > > > > 
> > > > > HW-GRO (aka RSC for some vendors) is very often limited in presence
> > > > > of interleaved packets. Linux SW GRO stack can complete the job
> > > > > and provide larger GRO packets, thus reducing rate of ACK packets
> > > > > and cpu overhead.
> > > > > 
> > > > > This also means BIG TCP can be used, even if HW-GRO/RSC was
> > > > > able to cook ~64 KB GRO packets.
> > > > > 
> > > > > Co-Developed-by: Eric Dumazet <edumazet@google.com>
> > > > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > > > > Signed-off-by: Coco Li <lixiaoyan@google.com>
> > > > > ---
> > > > >  net/core/gro.c         | 13 +++++++++----
> > > > >  net/ipv4/tcp_offload.c |  7 ++++++-
> > > > >  2 files changed, 15 insertions(+), 5 deletions(-)
> > > > > 
> > > > > diff --git a/net/core/gro.c b/net/core/gro.c
> > > > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644
> > > > > --- a/net/core/gro.c
> > > > > +++ b/net/core/gro.c
> > > > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > >       unsigned int gro_max_size;
> > > > >       unsigned int new_truesize;
> > > > >       struct sk_buff *lp;
> > > > > +     int segs;
> > > > > 
> > > > >       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> > > > >       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > > > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > >                       return -E2BIG;
> > > > >       }
> > > > > 
> > > > > +     segs = NAPI_GRO_CB(skb)->count;
> > > > >       lp = NAPI_GRO_CB(p)->last;
> > > > >       pinfo = skb_shinfo(lp);
> > > > > 
> > > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > >       lp = p;
> > > > > 
> > > > >  done:
> > > > > -     NAPI_GRO_CB(p)->count++;
> > > > > +     NAPI_GRO_CB(p)->count += segs;
> > > > >       p->data_len += len;
> > > > >       p->truesize += delta_truesize;
> > > > >       p->len += len;
> > > > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > > > >               BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
> > > > >                                        sizeof(u32))); /* Avoid slow unaligned acc */
> > > > >               *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
> > > > > -             NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
> > > > > +             NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
> > > > >               NAPI_GRO_CB(skb)->is_atomic = 1;
> > > > > +             NAPI_GRO_CB(skb)->count = max_t(u16, 1,
> > > > > +                                             skb_shinfo(skb)->gso_segs);
> > > > > 
> > > > >               /* Setup for GRO checksum validation */
> > > > >               switch (skb->ip_summed) {
> > > > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > > > >       else
> > > > >               gro_list->count++;
> > > > > 
> > > > > -     NAPI_GRO_CB(skb)->count = 1;
> > > > >       NAPI_GRO_CB(skb)->age = jiffies;
> > > > >       NAPI_GRO_CB(skb)->last = skb;
> > > > > -     skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > > > +     if (!skb_is_gso(skb))
> > > > > +             skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > > >       list_add(&skb->list, &gro_list->list);
> > > > >       ret = GRO_HELD;
> > > > > 
> > > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
> > > > > 
> > > > >       skb->encapsulation = 0;
> > > > >       skb_shinfo(skb)->gso_type = 0;
> > > > > +     skb_shinfo(skb)->gso_size = 0;
> > > > >       if (unlikely(skb->slow_gro)) {
> > > > >               skb_orphan(skb);
> > > > >               skb_ext_reset(skb);
> > > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> > > > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644
> > > > > --- a/net/ipv4/tcp_offload.c
> > > > > +++ b/net/ipv4/tcp_offload.c
> > > > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
> > > > > 
> > > > >       mss = skb_shinfo(p)->gso_size;
> > > > > 
> > > > > -     flush |= (len - 1) >= mss;
> > > > > +     if (skb_is_gso(skb)) {
> > > > > +             flush |= (mss != skb_shinfo(skb)->gso_size);
> > > > > +             flush |= ((skb_gro_len(p) % mss) != 0);
> > > > 
> > > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be
> > > > true only if 'p' was an HW GRO packet (or at least a gso packet before
> > > > entering the GRO engine), am I correct? In that case 'p' staged into
> > > > the GRO hash up to the next packet (skb), just to be flushed.
> > > > 
> > > > Should the above condition be instead:
> > > > 
> > > >                 flush |= ((skb_gro_len(skb) % mss) != 0);
> > > 
> > > Yes, probable typo.
> > > 
> > > > ?
> > > > 
> > > > And possibly use that condition while initializing
> > > > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ?
> > > 
> > > Not sure, this would add an extra test in dev_gro_receive()
> > > 
> > > It seems better to leave the test here, because the prior condition
> > > needs to stay here.
> > > 
> > > if (skb_is_gso(skb)) {
> > >              flush |= (mss != skb_shinfo(skb)->gso_size);
> > > 
> > 
> > Oh well, I think Coco missed the fact that the  ((skb_gro_len(skb) % mss) != 0)
> > condition needs to be put after label out_check_final.
> > 
> > For example, if MSS==1000, and p has 4 segments, we still want to
> > aggregate skb into p
> > if skb payload is not a multiple of MSS.
> > 
> 
> Relative diff would be:
> 
> diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> index 0223bbfe9568064b47bc6227d342a4d25c9edfa7..79996b007bd64635aea27e3fddf291abe10ceca1
> 100644
> --- a/net/ipv4/tcp_offload.c
> +++ b/net/ipv4/tcp_offload.c
> @@ -255,26 +255,27 @@ struct sk_buff *tcp_gro_receive(struct list_head
> *head, struct sk_buff *skb)
> 
>         mss = skb_shinfo(p)->gso_size;
> 
> -       if (skb_is_gso(skb)) {
> +       if (unlikely(skb_is_gso(skb)))
>                 flush |= (mss != skb_shinfo(skb)->gso_size);
> -               flush |= ((skb_gro_len(p) % mss) != 0);
> -       } else {
> +       else
>                 flush |= (len - 1) >= mss;
> -       }
> +
>         flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
>  #ifdef CONFIG_TLS_DEVICE
>         flush |= p->decrypted ^ skb->decrypted;
>  #endif
> 
>         if (flush || skb_gro_receive(p, skb)) {
> -               mss = 1;
> -               goto out_check_final;
> +               flush = 0;
> +               goto check_flags;
>         }
> 
>         tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
> 
>  out_check_final:
> -       flush = len < mss;
> +       flush = len != NAPI_GRO_CB(skb)->count * mss;

Not sure if it's worthy, perhaps mss can be updated under the 
unlikely(skb_is_gso(skb)) a few lines above:

	mss *= NAPI_GRO_CB(skb)->count;

so that here we can avoid the additional operation for the non gso
case? - just:
	flush = len != mss;

In any case LGTM, thanks!

Paolo
Eric Dumazet Sept. 30, 2022, 9:36 p.m. UTC | #6
On Fri, Sep 30, 2022 at 2:02 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Fri, 2022-09-30 at 13:19 -0700, Eric Dumazet wrote:
> > On Fri, Sep 30, 2022 at 1:00 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Fri, Sep 30, 2022 at 12:53 PM Eric Dumazet <edumazet@google.com> wrote:
> > > >
> > > > On Fri, Sep 30, 2022 at 1:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > > >
> > > > > On Thu, 2022-09-29 at 18:44 -0700, Eric Dumazet wrote:
> > > > > > From: Coco Li <lixiaoyan@google.com>
> > > > > >
> > > > > > Current GRO stack only supports incoming packets containing
> > > > > > one frame/MSS.
> > > > > >
> > > > > > This patch changes GRO to accept packets that are already GRO.
> > > > > >
> > > > > > HW-GRO (aka RSC for some vendors) is very often limited in presence
> > > > > > of interleaved packets. Linux SW GRO stack can complete the job
> > > > > > and provide larger GRO packets, thus reducing rate of ACK packets
> > > > > > and cpu overhead.
> > > > > >
> > > > > > This also means BIG TCP can be used, even if HW-GRO/RSC was
> > > > > > able to cook ~64 KB GRO packets.
> > > > > >
> > > > > > Co-Developed-by: Eric Dumazet <edumazet@google.com>
> > > > > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > > > > > Signed-off-by: Coco Li <lixiaoyan@google.com>
> > > > > > ---
> > > > > >  net/core/gro.c         | 13 +++++++++----
> > > > > >  net/ipv4/tcp_offload.c |  7 ++++++-
> > > > > >  2 files changed, 15 insertions(+), 5 deletions(-)
> > > > > >
> > > > > > diff --git a/net/core/gro.c b/net/core/gro.c
> > > > > > index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644
> > > > > > --- a/net/core/gro.c
> > > > > > +++ b/net/core/gro.c
> > > > > > @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > > >       unsigned int gro_max_size;
> > > > > >       unsigned int new_truesize;
> > > > > >       struct sk_buff *lp;
> > > > > > +     int segs;
> > > > > >
> > > > > >       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> > > > > >       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > > > > > @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > > >                       return -E2BIG;
> > > > > >       }
> > > > > >
> > > > > > +     segs = NAPI_GRO_CB(skb)->count;
> > > > > >       lp = NAPI_GRO_CB(p)->last;
> > > > > >       pinfo = skb_shinfo(lp);
> > > > > >
> > > > > > @@ -265,7 +267,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > > > >       lp = p;
> > > > > >
> > > > > >  done:
> > > > > > -     NAPI_GRO_CB(p)->count++;
> > > > > > +     NAPI_GRO_CB(p)->count += segs;
> > > > > >       p->data_len += len;
> > > > > >       p->truesize += delta_truesize;
> > > > > >       p->len += len;
> > > > > > @@ -496,8 +498,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > > > > >               BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
> > > > > >                                        sizeof(u32))); /* Avoid slow unaligned acc */
> > > > > >               *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
> > > > > > -             NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
> > > > > > +             NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
> > > > > >               NAPI_GRO_CB(skb)->is_atomic = 1;
> > > > > > +             NAPI_GRO_CB(skb)->count = max_t(u16, 1,
> > > > > > +                                             skb_shinfo(skb)->gso_segs);
> > > > > >
> > > > > >               /* Setup for GRO checksum validation */
> > > > > >               switch (skb->ip_summed) {
> > > > > > @@ -545,10 +549,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > > > > >       else
> > > > > >               gro_list->count++;
> > > > > >
> > > > > > -     NAPI_GRO_CB(skb)->count = 1;
> > > > > >       NAPI_GRO_CB(skb)->age = jiffies;
> > > > > >       NAPI_GRO_CB(skb)->last = skb;
> > > > > > -     skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > > > > +     if (!skb_is_gso(skb))
> > > > > > +             skb_shinfo(skb)->gso_size = skb_gro_len(skb);
> > > > > >       list_add(&skb->list, &gro_list->list);
> > > > > >       ret = GRO_HELD;
> > > > > >
> > > > > > @@ -660,6 +664,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
> > > > > >
> > > > > >       skb->encapsulation = 0;
> > > > > >       skb_shinfo(skb)->gso_type = 0;
> > > > > > +     skb_shinfo(skb)->gso_size = 0;
> > > > > >       if (unlikely(skb->slow_gro)) {
> > > > > >               skb_orphan(skb);
> > > > > >               skb_ext_reset(skb);
> > > > > > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> > > > > > index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644
> > > > > > --- a/net/ipv4/tcp_offload.c
> > > > > > +++ b/net/ipv4/tcp_offload.c
> > > > > > @@ -255,7 +255,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
> > > > > >
> > > > > >       mss = skb_shinfo(p)->gso_size;
> > > > > >
> > > > > > -     flush |= (len - 1) >= mss;
> > > > > > +     if (skb_is_gso(skb)) {
> > > > > > +             flush |= (mss != skb_shinfo(skb)->gso_size);
> > > > > > +             flush |= ((skb_gro_len(p) % mss) != 0);
> > > > >
> > > > > If I read correctly, the '(skb_gro_len(p) % mss) != 0' codition can be
> > > > > true only if 'p' was an HW GRO packet (or at least a gso packet before
> > > > > entering the GRO engine), am I correct? In that case 'p' staged into
> > > > > the GRO hash up to the next packet (skb), just to be flushed.
> > > > >
> > > > > Should the above condition be instead:
> > > > >
> > > > >                 flush |= ((skb_gro_len(skb) % mss) != 0);
> > > >
> > > > Yes, probable typo.
> > > >
> > > > > ?
> > > > >
> > > > > And possibly use that condition while initializing
> > > > > NAPI_GRO_CB(skb)->flush in dev_gro_receive() ?
> > > >
> > > > Not sure, this would add an extra test in dev_gro_receive()
> > > >
> > > > It seems better to leave the test here, because the prior condition
> > > > needs to stay here.
> > > >
> > > > if (skb_is_gso(skb)) {
> > > >              flush |= (mss != skb_shinfo(skb)->gso_size);
> > > >
> > >
> > > Oh well, I think Coco missed the fact that the  ((skb_gro_len(skb) % mss) != 0)
> > > condition needs to be put after label out_check_final.
> > >
> > > For example, if MSS==1000, and p has 4 segments, we still want to
> > > aggregate skb into p
> > > if skb payload is not a multiple of MSS.
> > >
> >
> > Relative diff would be:
> >
> > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
> > index 0223bbfe9568064b47bc6227d342a4d25c9edfa7..79996b007bd64635aea27e3fddf291abe10ceca1
> > 100644
> > --- a/net/ipv4/tcp_offload.c
> > +++ b/net/ipv4/tcp_offload.c
> > @@ -255,26 +255,27 @@ struct sk_buff *tcp_gro_receive(struct list_head
> > *head, struct sk_buff *skb)
> >
> >         mss = skb_shinfo(p)->gso_size;
> >
> > -       if (skb_is_gso(skb)) {
> > +       if (unlikely(skb_is_gso(skb)))
> >                 flush |= (mss != skb_shinfo(skb)->gso_size);
> > -               flush |= ((skb_gro_len(p) % mss) != 0);
> > -       } else {
> > +       else
> >                 flush |= (len - 1) >= mss;
> > -       }
> > +
> >         flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
> >  #ifdef CONFIG_TLS_DEVICE
> >         flush |= p->decrypted ^ skb->decrypted;
> >  #endif
> >
> >         if (flush || skb_gro_receive(p, skb)) {
> > -               mss = 1;
> > -               goto out_check_final;
> > +               flush = 0;
> > +               goto check_flags;
> >         }
> >
> >         tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
> >
> >  out_check_final:
> > -       flush = len < mss;
> > +       flush = len != NAPI_GRO_CB(skb)->count * mss;
>
> Not sure if it's worthy, perhaps mss can be updated under the
> unlikely(skb_is_gso(skb)) a few lines above:
>
>         mss *= NAPI_GRO_CB(skb)->count;
>
> so that here we can avoid the additional operation for the non gso
> case? - just:
>         flush = len != mss;
>

Yes, but we still need a check even if the above code was not run
(because of earlier goto out_check_final)

I am thinking of:

diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a844a0d38482d916251f3aca4555c75c9770820c..ba8e6cfb3852fc609afe3022efa10f7a06fb4c12
100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -255,7 +255,11 @@ struct sk_buff *tcp_gro_receive(struct list_head
*head, struct sk_buff *skb)

        mss = skb_shinfo(p)->gso_size;

-       flush |= (len - 1) >= mss;
+       if (unlikely(skb_is_gso(skb)))
+               flush |= (mss != skb_shinfo(skb)->gso_size);
+       else
+               flush |= (len - 1) >= mss;
+
        flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
 #ifdef CONFIG_TLS_DEVICE
        flush |= p->decrypted ^ skb->decrypted;
@@ -269,7 +273,11 @@ struct sk_buff *tcp_gro_receive(struct list_head
*head, struct sk_buff *skb)
        tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

 out_check_final:
-       flush = len < mss;
+       if (unlikely(skb_is_gso(skb)))
+               flush = len != NAPI_GRO_CB(skb)->count *
skb_shinfo(skb)->gso_size;
+       else
+               flush = len < mss;
+
        flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
                                        TCP_FLAG_RST | TCP_FLAG_SYN |
                                        TCP_FLAG_FIN));
diff mbox series

Patch

diff --git a/net/core/gro.c b/net/core/gro.c
index b4190eb084672fb4f2be8b437eccb4e8507ff63f..d8e159c4bdf553508cd123bee4f5251908ede9fe 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -160,6 +160,7 @@  int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	unsigned int gro_max_size;
 	unsigned int new_truesize;
 	struct sk_buff *lp;
+	int segs;
 
 	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
 	gro_max_size = READ_ONCE(p->dev->gro_max_size);
@@ -175,6 +176,7 @@  int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 			return -E2BIG;
 	}
 
+	segs = NAPI_GRO_CB(skb)->count;
 	lp = NAPI_GRO_CB(p)->last;
 	pinfo = skb_shinfo(lp);
 
@@ -265,7 +267,7 @@  int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	lp = p;
 
 done:
-	NAPI_GRO_CB(p)->count++;
+	NAPI_GRO_CB(p)->count += segs;
 	p->data_len += len;
 	p->truesize += delta_truesize;
 	p->len += len;
@@ -496,8 +498,10 @@  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
 					 sizeof(u32))); /* Avoid slow unaligned acc */
 		*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
-		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
+		NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
 		NAPI_GRO_CB(skb)->is_atomic = 1;
+		NAPI_GRO_CB(skb)->count = max_t(u16, 1,
+						skb_shinfo(skb)->gso_segs);
 
 		/* Setup for GRO checksum validation */
 		switch (skb->ip_summed) {
@@ -545,10 +549,10 @@  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	else
 		gro_list->count++;
 
-	NAPI_GRO_CB(skb)->count = 1;
 	NAPI_GRO_CB(skb)->age = jiffies;
 	NAPI_GRO_CB(skb)->last = skb;
-	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+	if (!skb_is_gso(skb))
+		skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 	list_add(&skb->list, &gro_list->list);
 	ret = GRO_HELD;
 
@@ -660,6 +664,7 @@  static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 
 	skb->encapsulation = 0;
 	skb_shinfo(skb)->gso_type = 0;
+	skb_shinfo(skb)->gso_size = 0;
 	if (unlikely(skb->slow_gro)) {
 		skb_orphan(skb);
 		skb_ext_reset(skb);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a844a0d38482d916251f3aca4555c75c9770820c..0223bbfe9568064b47bc6227d342a4d25c9edfa7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -255,7 +255,12 @@  struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
 
 	mss = skb_shinfo(p)->gso_size;
 
-	flush |= (len - 1) >= mss;
+	if (skb_is_gso(skb)) {
+		flush |= (mss != skb_shinfo(skb)->gso_size);
+		flush |= ((skb_gro_len(p) % mss) != 0);
+	} else {
+		flush |= (len - 1) >= mss;
+	}
 	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
 #ifdef CONFIG_TLS_DEVICE
 	flush |= p->decrypted ^ skb->decrypted;