diff mbox series

[v4,net-next,02/14] ipv6: add dev->gso_ipv6_max_size

Message ID 20220310054703.849899-3-eric.dumazet@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series tcp: BIG TCP implementation | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4827 this patch: 4827
netdev/cc_maintainers warning 3 maintainers not CCed: liuhangbin@gmail.com idosch@nvidia.com petrm@nvidia.com
netdev/build_clang success Errors and warnings before: 822 this patch: 822
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4982 this patch: 4982
netdev/checkpatch warning CHECK: Alignment should match open parenthesis WARNING: line length of 83 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0

Commit Message

Eric Dumazet March 10, 2022, 5:46 a.m. UTC
From: Coco Li <lixiaoyan@google.com>

This enable TCP stack to build TSO packets bigger than
64KB if the driver is LSOv2 compatible.

This patch introduces new variable gso_ipv6_max_size
that is modifiable through ip link.

ip link set dev eth0 gso_ipv6_max_size 185000

User input is capped by driver limit (tso_ipv6_max_size)
added in previous patch.

Signed-off-by: Coco Li <lixiaoyan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/netdevice.h          | 12 ++++++++++++
 include/uapi/linux/if_link.h       |  1 +
 net/core/dev.c                     |  1 +
 net/core/rtnetlink.c               | 15 +++++++++++++++
 net/core/sock.c                    |  6 ++++++
 tools/include/uapi/linux/if_link.h |  1 +
 6 files changed, 36 insertions(+)

Comments

Alexander Duyck March 11, 2022, 4:21 p.m. UTC | #1
On Wed, 2022-03-09 at 21:46 -0800, Eric Dumazet wrote:
> From: Coco Li <lixiaoyan@google.com>
> 
> This enable TCP stack to build TSO packets bigger than
> 64KB if the driver is LSOv2 compatible.
> 
> This patch introduces new variable gso_ipv6_max_size
> that is modifiable through ip link.
> 
> ip link set dev eth0 gso_ipv6_max_size 185000
> 
> User input is capped by driver limit (tso_ipv6_max_size)
> added in previous patch.
> 
> Signed-off-by: Coco Li <lixiaoyan@google.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/linux/netdevice.h          | 12 ++++++++++++
>  include/uapi/linux/if_link.h       |  1 +
>  net/core/dev.c                     |  1 +
>  net/core/rtnetlink.c               | 15 +++++++++++++++
>  net/core/sock.c                    |  6 ++++++
>  tools/include/uapi/linux/if_link.h |  1 +
>  6 files changed, 36 insertions(+)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 61db67222c47664c179b6a5d3b6f15fdf8a02bdd..9ed348d8b6f1195514c3b5f85fbe2c45b3fa997f 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1952,6 +1952,7 @@ enum netdev_ml_priv_type {
>   *					registered
>   *	@offload_xstats_l3:	L3 HW stats for this netdevice.
>   *	@tso_ipv6_max_size:	Maximum size of IPv6 TSO packets (driver/NIC limit)
> + *	@gso_ipv6_max_size:	Maximum size of IPv6 GSO packets (user/admin limit)
>   *
>   *	FIXME: cleanup struct net_device such that network protocol info
>   *	moves out.
> @@ -2291,6 +2292,7 @@ struct net_device {
>  	netdevice_tracker	dev_registered_tracker;
>  	struct rtnl_hw_stats64	*offload_xstats_l3;
>  	unsigned int		tso_ipv6_max_size;
> +	unsigned int		gso_ipv6_max_size;
>  };
>  #define to_net_dev(d) container_of(d, struct net_device, dev)
> 

Rather than have this as a device specific value would it be
advantageous to consider making this a namespace specific sysctl value
instead? Something along the lines of:
  net.ipv6.conf.*.max_jumbogram_size

It could also be applied generically to the GSO/GRO as the upper limit
for any frame assembled by the socket or GRO.

The general idea is that might be desirable for admins to be able to
basically just set the maximum size they want to see for IPv6 frames
and if we could combine the GRO/GSO logic into a single sysctl that
could be set on a namespace basis instead of a device basis which would
be more difficult to track down. We already have the per-device limits
in the tso_ipv6_max_size for the outgoing frames so it seems like it
might make sense to make this per network namespace and defaultable
rather than per device and requiring an update for each device
instance.
Eric Dumazet March 15, 2022, 3:57 p.m. UTC | #2
On Fri, Mar 11, 2022 at 8:22 AM Alexander H Duyck
<alexander.duyck@gmail.com> wrote:
>
> On Wed, 2022-03-09 at 21:46 -0800, Eric Dumazet wrote:
> > From: Coco Li <lixiaoyan@google.com>
> >
> > This enable TCP stack to build TSO packets bigger than
> > 64KB if the driver is LSOv2 compatible.
> >
> > This patch introduces new variable gso_ipv6_max_size
> > that is modifiable through ip link.
> >
> > ip link set dev eth0 gso_ipv6_max_size 185000
> >
> > User input is capped by driver limit (tso_ipv6_max_size)
> > added in previous patch.
> >
> > Signed-off-by: Coco Li <lixiaoyan@google.com>
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> >  include/linux/netdevice.h          | 12 ++++++++++++
> >  include/uapi/linux/if_link.h       |  1 +
> >  net/core/dev.c                     |  1 +
> >  net/core/rtnetlink.c               | 15 +++++++++++++++
> >  net/core/sock.c                    |  6 ++++++
> >  tools/include/uapi/linux/if_link.h |  1 +
> >  6 files changed, 36 insertions(+)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 61db67222c47664c179b6a5d3b6f15fdf8a02bdd..9ed348d8b6f1195514c3b5f85fbe2c45b3fa997f 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -1952,6 +1952,7 @@ enum netdev_ml_priv_type {
> >   *                                   registered
> >   *   @offload_xstats_l3:     L3 HW stats for this netdevice.
> >   *   @tso_ipv6_max_size:     Maximum size of IPv6 TSO packets (driver/NIC limit)
> > + *   @gso_ipv6_max_size:     Maximum size of IPv6 GSO packets (user/admin limit)
> >   *
> >   *   FIXME: cleanup struct net_device such that network protocol info
> >   *   moves out.
> > @@ -2291,6 +2292,7 @@ struct net_device {
> >       netdevice_tracker       dev_registered_tracker;
> >       struct rtnl_hw_stats64  *offload_xstats_l3;
> >       unsigned int            tso_ipv6_max_size;
> > +     unsigned int            gso_ipv6_max_size;
> >  };
> >  #define to_net_dev(d) container_of(d, struct net_device, dev)
> >
>
> Rather than have this as a device specific value would it be
> advantageous to consider making this a namespace specific sysctl value
> instead? Something along the lines of:
>   net.ipv6.conf.*.max_jumbogram_size
>
> It could also be applied generically to the GSO/GRO as the upper limit
> for any frame assembled by the socket or GRO.
>
> The general idea is that might be desirable for admins to be able to
> basically just set the maximum size they want to see for IPv6 frames
> and if we could combine the GRO/GSO logic into a single sysctl that
> could be set on a namespace basis instead of a device basis which would
> be more difficult to track down. We already have the per-device limits
> in the tso_ipv6_max_size for the outgoing frames so it seems like it
> might make sense to make this per network namespace and defaultable
> rather than per device and requiring an update for each device
> instance.
>

At least Google found it was easier to have per device controls, in
terms of testing the feature,
and gradually deploying it.

We have hosts with multiple NIC, of different types. We want to be
able to control BIG TCP on a per device basis.
For instance I had a bug in one of the implementation for one (non
upstream) driver, that I could mitigate
by setting a different limit only for this NIC, until the host can
boot with a fixed kernel.

We use ipvlan, with one private net-ns and IPv6 address per job, we
wanted to deploy BIG TCP on a per job basis

I guess that if you want to add a sysctl, automatically overriding the
per device setting, this could be done later ?
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 61db67222c47664c179b6a5d3b6f15fdf8a02bdd..9ed348d8b6f1195514c3b5f85fbe2c45b3fa997f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1952,6 +1952,7 @@  enum netdev_ml_priv_type {
  *					registered
  *	@offload_xstats_l3:	L3 HW stats for this netdevice.
  *	@tso_ipv6_max_size:	Maximum size of IPv6 TSO packets (driver/NIC limit)
+ *	@gso_ipv6_max_size:	Maximum size of IPv6 GSO packets (user/admin limit)
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2291,6 +2292,7 @@  struct net_device {
 	netdevice_tracker	dev_registered_tracker;
 	struct rtnl_hw_stats64	*offload_xstats_l3;
 	unsigned int		tso_ipv6_max_size;
+	unsigned int		gso_ipv6_max_size;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -4874,6 +4876,10 @@  static inline void netif_set_gso_max_size(struct net_device *dev,
 {
 	/* dev->gso_max_size is read locklessly from sk_setup_caps() */
 	WRITE_ONCE(dev->gso_max_size, size);
+
+	/* legacy drivers want to lower gso_max_size, regardless of family. */
+	size = min(size, dev->gso_ipv6_max_size);
+	WRITE_ONCE(dev->gso_ipv6_max_size, size);
 }
 
 static inline void netif_set_gso_max_segs(struct net_device *dev,
@@ -4897,6 +4903,12 @@  static inline void netif_set_tso_ipv6_max_size(struct net_device *dev,
 	dev->tso_ipv6_max_size = size;
 }
 
+static inline void netif_set_gso_ipv6_max_size(struct net_device *dev,
+					       unsigned int size)
+{
+	size = min(size, dev->tso_ipv6_max_size);
+	WRITE_ONCE(dev->gso_ipv6_max_size, size);
+}
 
 static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
 					int pulled_hlen, u16 mac_offset,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index c8af031b692e52690a2760e9d79c9462185e2fc9..048a9c848a3a39596b6c3135553fdfb9a1fe37d2 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -364,6 +364,7 @@  enum {
 	IFLA_PARENT_DEV_BUS_NAME,
 	IFLA_GRO_MAX_SIZE,
 	IFLA_TSO_IPV6_MAX_SIZE,
+	IFLA_GSO_IPV6_MAX_SIZE,
 
 	__IFLA_MAX
 };
diff --git a/net/core/dev.c b/net/core/dev.c
index de28f634c18a65d1948a96db5678d38e9c871b1f..87f8b8cb39a61c8f5a444e3b341a97ba0a4c06d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10468,6 +10468,7 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->gso_max_segs = GSO_MAX_SEGS;
 	dev->gro_max_size = GRO_MAX_SIZE;
 	dev->tso_ipv6_max_size = GSO_MAX_SIZE;
+	dev->gso_ipv6_max_size = GSO_MAX_SIZE;
 
 	dev->upper_level = 1;
 	dev->lower_level = 1;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ab51b18cdb5d46b87d4a11d2f66a68968ba737d6..172de404c595c89e30651a091242a75be8f786b7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1028,6 +1028,7 @@  static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
 	       + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */
 	       + nla_total_size(4) /* IFLA_TSO_IPV6_MAX_SIZE */
+	       + nla_total_size(4) /* IFLA_GSO_IPV6_MAX_SIZE */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
 	       + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -1734,6 +1735,7 @@  static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	    nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
 	    nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) ||
 	    nla_put_u32(skb, IFLA_TSO_IPV6_MAX_SIZE, dev->tso_ipv6_max_size) ||
+	    nla_put_u32(skb, IFLA_GSO_IPV6_MAX_SIZE, dev->gso_ipv6_max_size) ||
 #ifdef CONFIG_RPS
 	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
 #endif
@@ -1888,6 +1890,7 @@  static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PARENT_DEV_NAME]	= { .type = NLA_NUL_STRING },
 	[IFLA_GRO_MAX_SIZE]	= { .type = NLA_U32 },
 	[IFLA_TSO_IPV6_MAX_SIZE]	= { .type = NLA_U32 },
+	[IFLA_GSO_IPV6_MAX_SIZE]	= { .type = NLA_U32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2774,6 +2777,15 @@  static int do_setlink(const struct sk_buff *skb,
 		}
 	}
 
+	if (tb[IFLA_GSO_IPV6_MAX_SIZE]) {
+		u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV6_MAX_SIZE]);
+
+		if (dev->gso_ipv6_max_size ^ max_size) {
+			netif_set_gso_ipv6_max_size(dev, max_size);
+			status |= DO_SETLINK_MODIFIED;
+		}
+	}
+
 	if (tb[IFLA_GSO_MAX_SEGS]) {
 		u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
 
@@ -3249,6 +3261,9 @@  struct net_device *rtnl_create_link(struct net *net, const char *ifname,
 		netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
 	if (tb[IFLA_GRO_MAX_SIZE])
 		netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE]));
+	if (tb[IFLA_GSO_IPV6_MAX_SIZE])
+		netif_set_gso_ipv6_max_size(dev,
+			nla_get_u32(tb[IFLA_GSO_IPV6_MAX_SIZE]));
 
 	return dev;
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index 1180a0cb01104561befa1f96deb71f36efcf12da..e0858e82bc386eb2779a0d6af6063b2078e6ea7b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2279,6 +2279,12 @@  void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
 			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+#if IS_ENABLED(CONFIG_IPV6)
+			if (sk->sk_family == AF_INET6 &&
+			    sk_is_tcp(sk) &&
+			    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+				sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_ipv6_max_size);
+#endif
 			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 441615c39f0a24eeeb6e27b4ca88031bcc234cf8..e40cd575607872d3bff3bc1971df8c6426290562 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -349,6 +349,7 @@  enum {
 	IFLA_PARENT_DEV_BUS_NAME,
 	IFLA_GRO_MAX_SIZE,
 	IFLA_TSO_IPV6_MAX_SIZE,
+	IFLA_GSO_IPV6_MAX_SIZE,
 
 	__IFLA_MAX
 };