diff mbox series

[PATCHv3,net-next,10/10] net: add support for ipv4 big tcp

Message ID 798ca80553e73028eeec4be08ba1549d08b2e5fc.1674835106.git.lucien.xin@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series net: support ipv4 big tcp | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7 this patch: 7
netdev/cc_maintainers warning 4 maintainers not CCed: richardbgobert@gmail.com dsahern@kernel.org steffen.klassert@secunet.com martin.lau@kernel.org
netdev/build_clang success Errors and warnings before: 1 this patch: 1
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7 this patch: 7
netdev/checkpatch warning WARNING: line length of 87 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Xin Long Jan. 27, 2023, 3:59 p.m. UTC
Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP.

Firstly, allow sk->sk_gso_max_size to be set to a value greater than
GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size()
for IPv4 TCP sockets.

Then on the TX path, set the IP header tot_len to 0 when skb->len > IP_MAX_MTU
in __ip_local_out() to allow sending BIG TCP packets; this implies
that skb->len is the length of the IPv4 packet. On the RX path, use skb->len
as the length of the IPv4 packet when the IP header tot_len is 0 and
skb->len > IP_MAX_MTU in ip_rcv_core(). As the APIs iph_set_totlen() and
skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only
need to update these APIs.

Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allow
the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In
GRO complete, set the IP header tot_len to 0 when the merged packet size
is greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed
on the RX path.

Note that by checking skb_is_gso_tcp() in the API iph_totlen(), this
implementation makes it safe to use iph->tot_len == 0 to indicate IPv4
BIG TCP packets.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/core/gro.c       | 12 +++++++-----
 net/core/sock.c      |  8 ++++++--
 net/ipv4/af_inet.c   |  7 ++++---
 net/ipv4/ip_input.c  |  2 +-
 net/ipv4/ip_output.c |  2 +-
 5 files changed, 19 insertions(+), 12 deletions(-)

Comments

Eric Dumazet Jan. 27, 2023, 5:41 p.m. UTC | #1
On Fri, Jan 27, 2023 at 5:00 PM Xin Long <lucien.xin@gmail.com> wrote:
>
> Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP.
>
> Firstly, allow sk->sk_gso_max_size to be set to a value greater than
> GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size()
> for IPv4 TCP sockets.
>
> Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU
> in __ip_local_out() to allow to send BIG TCP packets, and this implies
> that skb->len is the length of a IPv4 packet; On RX path, use skb->len
> as the length of the IPv4 packet when the IP header tot_len is 0 and
> skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and
> skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only
> need to update these APIs.
>
> Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allows
> the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In
> GRO complete, set IP header tot_len to 0 when the merged packet size
> greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed
> on RX path.
>
> Note that by checking skb_is_gso_tcp() in API iph_totlen(), it makes
> this implementation safe to use iph->len == 0 indicates IPv4 BIG TCP
> packets.
>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
>  net/core/gro.c       | 12 +++++++-----
>  net/core/sock.c      |  8 ++++++--
>  net/ipv4/af_inet.c   |  7 ++++---
>  net/ipv4/ip_input.c  |  2 +-
>  net/ipv4/ip_output.c |  2 +-
>  5 files changed, 19 insertions(+), 12 deletions(-)
>
> diff --git a/net/core/gro.c b/net/core/gro.c
> index 506f83d715f8..b15f85546bdd 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
>         struct sk_buff *lp;
>         int segs;
>
> -       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> -       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> +       /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
> +       gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
> +                       READ_ONCE(p->dev->gro_max_size) :
> +                               READ_ONCE(p->dev->gro_ipv4_max_size);
>
>         if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
>                 return -E2BIG;
>
>         if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
> -               if (p->protocol != htons(ETH_P_IPV6) ||
> -                   skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
> -                   ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
> +               if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
> +                   (p->protocol == htons(ETH_P_IPV6) &&
> +                    skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
>                     p->encapsulation)
>                         return -E2BIG;
>         }
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 7ba4891460ad..c98f9a4eeff9 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2383,6 +2383,8 @@ static void sk_trim_gso_size(struct sock *sk)
>             !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
>                 return;
>  #endif
> +       if (sk->sk_family == AF_INET && sk_is_tcp(sk))
> +               return;

Or simply

diff --git a/net/core/sock.c b/net/core/sock.c
index 7ba4891460adbd6c13c0ce1dcdd7f23c8c1f0f5d..dcb8fff91fd9a9472267a2cf2fdc98114a7d2b7d
100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2375,14 +2375,9 @@ EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

 static void sk_trim_gso_size(struct sock *sk)
 {
-       if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
+       if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE ||
+           sk_is_tcp(sk))
                return;
-#if IS_ENABLED(CONFIG_IPV6)
-       if (sk->sk_family == AF_INET6 &&
-           sk_is_tcp(sk) &&
-           !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
-               return;
-#endif
        sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
 }



>         sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
>  }
>
> @@ -2403,8 +2405,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
>                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
>                 } else {
>                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
> -                       /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
> -                       sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
> +                       /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
> +                       sk->sk_gso_max_size = sk->sk_family == AF_INET6 ?
> +                                       READ_ONCE(dst->dev->gso_max_size) :
> +                                               READ_ONCE(dst->dev->gso_ipv4_max_size);
>                         sk_trim_gso_size(sk);
>                         sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
>                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 6c0ec2789943..2f992a323b95 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
>         if (unlikely(ip_fast_csum((u8 *)iph, 5)))
>                 goto out;
>
> +       NAPI_GRO_CB(skb)->proto = proto;
>         id = ntohl(*(__be32 *)&iph->id);
>         flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
>         id >>= 16;
> @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
>
>  int inet_gro_complete(struct sk_buff *skb, int nhoff)
>  {
> -       __be16 newlen = htons(skb->len - nhoff);
>         struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
>         const struct net_offload *ops;
> +       __be16 totlen = iph->tot_len;
>         int proto = iph->protocol;
>         int err = -ENOSYS;
>
> @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
>                 skb_set_inner_network_header(skb, nhoff);
>         }
>
> -       csum_replace2(&iph->check, iph->tot_len, newlen);
> -       iph->tot_len = newlen;
> +       iph_set_totlen(iph, skb->len - nhoff);
> +       csum_replace2(&iph->check, totlen, iph->tot_len);
>
>         ops = rcu_dereference(inet_offloads[proto]);
>         if (WARN_ON(!ops || !ops->callbacks.gro_complete))
> diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
> index e880ce77322a..0aa8c49b4e1b 100644
> --- a/net/ipv4/ip_input.c
> +++ b/net/ipv4/ip_input.c
> @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
>         if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
>                 goto csum_error;
>
> -       len = ntohs(iph->tot_len);
> +       len = skb_ip_totlen(skb);

len = iph_totlen(skb, iph);

>         if (skb->len < len) {
>                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
>                 __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 922c87ef1ab5..4e4e308c3230 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
>  {
>         struct iphdr *iph = ip_hdr(skb);
>
> -       iph->tot_len = htons(skb->len);
> +       iph_set_totlen(iph, skb->len);
>         ip_send_check(iph);
>
>         /* if egress device is enslaved to an L3 master device pass the
> --
> 2.31.1
>
Xin Long Jan. 27, 2023, 6:37 p.m. UTC | #2
On Fri, Jan 27, 2023 at 12:41 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Jan 27, 2023 at 5:00 PM Xin Long <lucien.xin@gmail.com> wrote:
> >
> > Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP.
> >
> > Firstly, allow sk->sk_gso_max_size to be set to a value greater than
> > GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size()
> > for IPv4 TCP sockets.
> >
> > Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU
> > in __ip_local_out() to allow to send BIG TCP packets, and this implies
> > that skb->len is the length of a IPv4 packet; On RX path, use skb->len
> > as the length of the IPv4 packet when the IP header tot_len is 0 and
> > skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and
> > skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only
> > need to update these APIs.
> >
> > Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allows
> > the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In
> > GRO complete, set IP header tot_len to 0 when the merged packet size
> > greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed
> > on RX path.
> >
> > Note that by checking skb_is_gso_tcp() in API iph_totlen(), it makes
> > this implementation safe to use iph->len == 0 indicates IPv4 BIG TCP
> > packets.
> >
> > Signed-off-by: Xin Long <lucien.xin@gmail.com>
> > ---
> >  net/core/gro.c       | 12 +++++++-----
> >  net/core/sock.c      |  8 ++++++--
> >  net/ipv4/af_inet.c   |  7 ++++---
> >  net/ipv4/ip_input.c  |  2 +-
> >  net/ipv4/ip_output.c |  2 +-
> >  5 files changed, 19 insertions(+), 12 deletions(-)
> >
> > diff --git a/net/core/gro.c b/net/core/gro.c
> > index 506f83d715f8..b15f85546bdd 100644
> > --- a/net/core/gro.c
> > +++ b/net/core/gro.c
> > @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> >         struct sk_buff *lp;
> >         int segs;
> >
> > -       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> > -       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > +       /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
> > +       gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
> > +                       READ_ONCE(p->dev->gro_max_size) :
> > +                               READ_ONCE(p->dev->gro_ipv4_max_size);
> >
> >         if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
> >                 return -E2BIG;
> >
> >         if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
> > -               if (p->protocol != htons(ETH_P_IPV6) ||
> > -                   skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
> > -                   ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
> > +               if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
> > +                   (p->protocol == htons(ETH_P_IPV6) &&
> > +                    skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
> >                     p->encapsulation)
> >                         return -E2BIG;
> >         }
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 7ba4891460ad..c98f9a4eeff9 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -2383,6 +2383,8 @@ static void sk_trim_gso_size(struct sock *sk)
> >             !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
> >                 return;
> >  #endif
> > +       if (sk->sk_family == AF_INET && sk_is_tcp(sk))
> > +               return;
>
> Or simply
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 7ba4891460adbd6c13c0ce1dcdd7f23c8c1f0f5d..dcb8fff91fd9a9472267a2cf2fdc98114a7d2b7d
> 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2375,14 +2375,9 @@ EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
>
>  static void sk_trim_gso_size(struct sock *sk)
>  {
> -       if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
> +       if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE ||
> +           sk_is_tcp(sk))
>                 return;
> -#if IS_ENABLED(CONFIG_IPV6)
> -       if (sk->sk_family == AF_INET6 &&
> -           sk_is_tcp(sk) &&
> -           !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
> -               return;
> -#endif
>         sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
>  }
There's a difference: an AF_INET6 TCP socket may send IPv4 packets with an
ipv6_addr_v4mapped address. If we don't check ipv6_addr_v4mapped(), IPv4
GSO packets might go with the "gso_max_size" for IPv6.

I think we could use the change you wrote above, but we also need to
use dst->ops->family instead of sk->sk_family in sk_setup_caps():

+                       sk->sk_gso_max_size = dst->ops->family == AF_INET6 ?
+                                       READ_ONCE(dst->dev->gso_max_size) :
+
READ_ONCE(dst->dev->gso_ipv4_max_size);

>
>
>
> >         sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
> >  }
> >
> > @@ -2403,8 +2405,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
> >                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
> >                 } else {
> >                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
> > -                       /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
> > -                       sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
> > +                       /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
> > +                       sk->sk_gso_max_size = sk->sk_family == AF_INET6 ?
> > +                                       READ_ONCE(dst->dev->gso_max_size) :
> > +                                               READ_ONCE(dst->dev->gso_ipv4_max_size);
> >                         sk_trim_gso_size(sk);
> >                         sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
> >                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
> > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> > index 6c0ec2789943..2f992a323b95 100644
> > --- a/net/ipv4/af_inet.c
> > +++ b/net/ipv4/af_inet.c
> > @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
> >         if (unlikely(ip_fast_csum((u8 *)iph, 5)))
> >                 goto out;
> >
> > +       NAPI_GRO_CB(skb)->proto = proto;
> >         id = ntohl(*(__be32 *)&iph->id);
> >         flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
> >         id >>= 16;
> > @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
> >
> >  int inet_gro_complete(struct sk_buff *skb, int nhoff)
> >  {
> > -       __be16 newlen = htons(skb->len - nhoff);
> >         struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
> >         const struct net_offload *ops;
> > +       __be16 totlen = iph->tot_len;
> >         int proto = iph->protocol;
> >         int err = -ENOSYS;
> >
> > @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
> >                 skb_set_inner_network_header(skb, nhoff);
> >         }
> >
> > -       csum_replace2(&iph->check, iph->tot_len, newlen);
> > -       iph->tot_len = newlen;
> > +       iph_set_totlen(iph, skb->len - nhoff);
> > +       csum_replace2(&iph->check, totlen, iph->tot_len);
> >
> >         ops = rcu_dereference(inet_offloads[proto]);
> >         if (WARN_ON(!ops || !ops->callbacks.gro_complete))
> > diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
> > index e880ce77322a..0aa8c49b4e1b 100644
> > --- a/net/ipv4/ip_input.c
> > +++ b/net/ipv4/ip_input.c
> > @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
> >         if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
> >                 goto csum_error;
> >
> > -       len = ntohs(iph->tot_len);
> > +       len = skb_ip_totlen(skb);
>
> len = iph_totlen(skb, iph);
OK, thanks.
Eric Dumazet Jan. 27, 2023, 6:44 p.m. UTC | #3
On Fri, Jan 27, 2023 at 7:37 PM Xin Long <lucien.xin@gmail.com> wrote:
>
> On Fri, Jan 27, 2023 at 12:41 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Fri, Jan 27, 2023 at 5:00 PM Xin Long <lucien.xin@gmail.com> wrote:
> > >
> > > Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP.
> > >
> > > Firstly, allow sk->sk_gso_max_size to be set to a value greater than
> > > GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size()
> > > for IPv4 TCP sockets.
> > >
> > > Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU
> > > in __ip_local_out() to allow to send BIG TCP packets, and this implies
> > > that skb->len is the length of a IPv4 packet; On RX path, use skb->len
> > > as the length of the IPv4 packet when the IP header tot_len is 0 and
> > > skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and
> > > skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only
> > > need to update these APIs.
> > >
> > > Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allows
> > > the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In
> > > GRO complete, set IP header tot_len to 0 when the merged packet size
> > > greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed
> > > on RX path.
> > >
> > > Note that by checking skb_is_gso_tcp() in API iph_totlen(), it makes
> > > this implementation safe to use iph->len == 0 indicates IPv4 BIG TCP
> > > packets.
> > >
> > > Signed-off-by: Xin Long <lucien.xin@gmail.com>
> > > ---
> > >  net/core/gro.c       | 12 +++++++-----
> > >  net/core/sock.c      |  8 ++++++--
> > >  net/ipv4/af_inet.c   |  7 ++++---
> > >  net/ipv4/ip_input.c  |  2 +-
> > >  net/ipv4/ip_output.c |  2 +-
> > >  5 files changed, 19 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/net/core/gro.c b/net/core/gro.c
> > > index 506f83d715f8..b15f85546bdd 100644
> > > --- a/net/core/gro.c
> > > +++ b/net/core/gro.c
> > > @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > >         struct sk_buff *lp;
> > >         int segs;
> > >
> > > -       /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> > > -       gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > > +       /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
> > > +       gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
> > > +                       READ_ONCE(p->dev->gro_max_size) :
> > > +                               READ_ONCE(p->dev->gro_ipv4_max_size);
> > >
> > >         if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
> > >                 return -E2BIG;
> > >
> > >         if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
> > > -               if (p->protocol != htons(ETH_P_IPV6) ||
> > > -                   skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
> > > -                   ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
> > > +               if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
> > > +                   (p->protocol == htons(ETH_P_IPV6) &&
> > > +                    skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
> > >                     p->encapsulation)
> > >                         return -E2BIG;
> > >         }
> > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > index 7ba4891460ad..c98f9a4eeff9 100644
> > > --- a/net/core/sock.c
> > > +++ b/net/core/sock.c
> > > @@ -2383,6 +2383,8 @@ static void sk_trim_gso_size(struct sock *sk)
> > >             !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
> > >                 return;
> > >  #endif
> > > +       if (sk->sk_family == AF_INET && sk_is_tcp(sk))
> > > +               return;
> >
> > Or simply
> >
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 7ba4891460adbd6c13c0ce1dcdd7f23c8c1f0f5d..dcb8fff91fd9a9472267a2cf2fdc98114a7d2b7d
> > 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -2375,14 +2375,9 @@ EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
> >
> >  static void sk_trim_gso_size(struct sock *sk)
> >  {
> > -       if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
> > +       if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE ||
> > +           sk_is_tcp(sk))
> >                 return;
> > -#if IS_ENABLED(CONFIG_IPV6)
> > -       if (sk->sk_family == AF_INET6 &&
> > -           sk_is_tcp(sk) &&
> > -           !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
> > -               return;
> > -#endif
> >         sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
> >  }
> There's a difference,  AF_INET6 TCP socket may send ipv4 packets with
> ipv6_addr_v4mapped, if we don't check ipv6_addr_v4mapped(), IPV4
> GSO packets might go with the "gso_max_size" for IPV6.
>

But the change you wrote in sk_setup_caps() only checked sk_family.


> I think we could use the change you wrote above, but we also need to
> use dst->ops->family instead of sk->sk_family in sk_setup_caps():
>
> +                       sk->sk_gso_max_size = dst->ops->family == AF_INET6 ?
> +                                       READ_ONCE(dst->dev->gso_max_size) :
> +
> READ_ONCE(dst->dev->gso_ipv4_max_size);
>
> >
> >
> >
> > >         sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
> > >  }
> > >
> > > @@ -2403,8 +2405,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
> > >                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
> > >                 } else {
> > >                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
> > > -                       /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
> > > -                       sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
> > > +                       /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
> > > +                       sk->sk_gso_max_size = sk->sk_family == AF_INET6 ?
> > > +                                       READ_ONCE(dst->dev->gso_max_size) :
> > > +                                               READ_ONCE(dst->dev->gso_ipv4_max_size);

Here...

So if you need ipv6_addr_v4mapped() this should be done here anyway.

> > >                         sk_trim_gso_size(sk);
> > >                         sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
> > >                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
> > > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> > > index 6c0ec2789943..2f992a323b95 100644
> > > --- a/net/ipv4/af_inet.c
> > > +++ b/net/ipv4/af_inet.c
> > > @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
> > >         if (unlikely(ip_fast_csum((u8 *)iph, 5)))
> > >                 goto out;
> > >
> > > +       NAPI_GRO_CB(skb)->proto = proto;
> > >         id = ntohl(*(__be32 *)&iph->id);
> > >         flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
> > >         id >>= 16;
> > > @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
> > >
> > >  int inet_gro_complete(struct sk_buff *skb, int nhoff)
> > >  {
> > > -       __be16 newlen = htons(skb->len - nhoff);
> > >         struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
> > >         const struct net_offload *ops;
> > > +       __be16 totlen = iph->tot_len;
> > >         int proto = iph->protocol;
> > >         int err = -ENOSYS;
> > >
> > > @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
> > >                 skb_set_inner_network_header(skb, nhoff);
> > >         }
> > >
> > > -       csum_replace2(&iph->check, iph->tot_len, newlen);
> > > -       iph->tot_len = newlen;
> > > +       iph_set_totlen(iph, skb->len - nhoff);
> > > +       csum_replace2(&iph->check, totlen, iph->tot_len);
> > >
> > >         ops = rcu_dereference(inet_offloads[proto]);
> > >         if (WARN_ON(!ops || !ops->callbacks.gro_complete))
> > > diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
> > > index e880ce77322a..0aa8c49b4e1b 100644
> > > --- a/net/ipv4/ip_input.c
> > > +++ b/net/ipv4/ip_input.c
> > > @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
> > >         if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
> > >                 goto csum_error;
> > >
> > > -       len = ntohs(iph->tot_len);
> > > +       len = skb_ip_totlen(skb);
> >
> > len = iph_totlen(skb, iph);
> OK, thanks.
diff mbox series

Patch

diff --git a/net/core/gro.c b/net/core/gro.c
index 506f83d715f8..b15f85546bdd 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -162,16 +162,18 @@  int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	struct sk_buff *lp;
 	int segs;
 
-	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
-	gro_max_size = READ_ONCE(p->dev->gro_max_size);
+	/* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
+	gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
+			READ_ONCE(p->dev->gro_max_size) :
+				READ_ONCE(p->dev->gro_ipv4_max_size);
 
 	if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
 		return -E2BIG;
 
 	if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
-		if (p->protocol != htons(ETH_P_IPV6) ||
-		    skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
-		    ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
+		if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
+		    (p->protocol == htons(ETH_P_IPV6) &&
+		     skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
 		    p->encapsulation)
 			return -E2BIG;
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index 7ba4891460ad..c98f9a4eeff9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2383,6 +2383,8 @@  static void sk_trim_gso_size(struct sock *sk)
 	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
 		return;
 #endif
+	if (sk->sk_family == AF_INET && sk_is_tcp(sk))
+		return;
 	sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
 }
 
@@ -2403,8 +2405,10 @@  void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 		} else {
 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
-			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
-			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+			/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
+			sk->sk_gso_max_size = sk->sk_family == AF_INET6 ?
+					READ_ONCE(dst->dev->gso_max_size) :
+						READ_ONCE(dst->dev->gso_ipv4_max_size);
 			sk_trim_gso_size(sk);
 			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6c0ec2789943..2f992a323b95 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1485,6 +1485,7 @@  struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
 		goto out;
 
+	NAPI_GRO_CB(skb)->proto = proto;
 	id = ntohl(*(__be32 *)&iph->id);
 	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
 	id >>= 16;
@@ -1618,9 +1619,9 @@  int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 
 int inet_gro_complete(struct sk_buff *skb, int nhoff)
 {
-	__be16 newlen = htons(skb->len - nhoff);
 	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
 	const struct net_offload *ops;
+	__be16 totlen = iph->tot_len;
 	int proto = iph->protocol;
 	int err = -ENOSYS;
 
@@ -1629,8 +1630,8 @@  int inet_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_set_inner_network_header(skb, nhoff);
 	}
 
-	csum_replace2(&iph->check, iph->tot_len, newlen);
-	iph->tot_len = newlen;
+	iph_set_totlen(iph, skb->len - nhoff);
+	csum_replace2(&iph->check, totlen, iph->tot_len);
 
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e880ce77322a..0aa8c49b4e1b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -511,7 +511,7 @@  static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
 		goto csum_error;
 
-	len = ntohs(iph->tot_len);
+	len = skb_ip_totlen(skb);
 	if (skb->len < len) {
 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
 		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 922c87ef1ab5..4e4e308c3230 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -100,7 +100,7 @@  int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct iphdr *iph = ip_hdr(skb);
 
-	iph->tot_len = htons(skb->len);
+	iph_set_totlen(iph, skb->len);
 	ip_send_check(iph);
 
 	/* if egress device is enslaved to an L3 master device pass the