Message ID | 20220310054703.849899-3-eric.dumazet@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | tcp: BIG TCP implementation | expand |
On Wed, 2022-03-09 at 21:46 -0800, Eric Dumazet wrote: > From: Coco Li <lixiaoyan@google.com> > > This enable TCP stack to build TSO packets bigger than > 64KB if the driver is LSOv2 compatible. > > This patch introduces new variable gso_ipv6_max_size > that is modifiable through ip link. > > ip link set dev eth0 gso_ipv6_max_size 185000 > > User input is capped by driver limit (tso_ipv6_max_size) > added in previous patch. > > Signed-off-by: Coco Li <lixiaoyan@google.com> > Signed-off-by: Eric Dumazet <edumazet@google.com> > --- > include/linux/netdevice.h | 12 ++++++++++++ > include/uapi/linux/if_link.h | 1 + > net/core/dev.c | 1 + > net/core/rtnetlink.c | 15 +++++++++++++++ > net/core/sock.c | 6 ++++++ > tools/include/uapi/linux/if_link.h | 1 + > 6 files changed, 36 insertions(+) > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 61db67222c47664c179b6a5d3b6f15fdf8a02bdd..9ed348d8b6f1195514c3b5f85fbe2c45b3fa997f 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -1952,6 +1952,7 @@ enum netdev_ml_priv_type { > * registered > * @offload_xstats_l3: L3 HW stats for this netdevice. > * @tso_ipv6_max_size: Maximum size of IPv6 TSO packets (driver/NIC limit) > + * @gso_ipv6_max_size: Maximum size of IPv6 GSO packets (user/admin limit) > * > * FIXME: cleanup struct net_device such that network protocol info > * moves out. > @@ -2291,6 +2292,7 @@ struct net_device { > netdevice_tracker dev_registered_tracker; > struct rtnl_hw_stats64 *offload_xstats_l3; > unsigned int tso_ipv6_max_size; > + unsigned int gso_ipv6_max_size; > }; > #define to_net_dev(d) container_of(d, struct net_device, dev) > Rather than have this as a device specific value would it be advantageous to consider making this a namespace specific sysctl value instead? Something along the lines of: net.ipv6.conf.*.max_jumbogram_size It could also be applied generically to the GSO/GRO as the upper limit for any frame assembled by the socket or GRO. 
The general idea is that it might be desirable for admins to be able to simply set the maximum size they want to see for IPv6 frames. If we could combine the GRO/GSO logic into a single sysctl, that limit could be set on a namespace basis instead of a device basis, which would be more difficult to track down. We already have the per-device limits in the tso_ipv6_max_size for the outgoing frames, so it seems like it might make sense to make this per network namespace and defaultable rather than per device and requiring an update for each device instance.
On Fri, Mar 11, 2022 at 8:22 AM Alexander H Duyck <alexander.duyck@gmail.com> wrote: > > On Wed, 2022-03-09 at 21:46 -0800, Eric Dumazet wrote: > > From: Coco Li <lixiaoyan@google.com> > > > > This enable TCP stack to build TSO packets bigger than > > 64KB if the driver is LSOv2 compatible. > > > > This patch introduces new variable gso_ipv6_max_size > > that is modifiable through ip link. > > > > ip link set dev eth0 gso_ipv6_max_size 185000 > > > > User input is capped by driver limit (tso_ipv6_max_size) > > added in previous patch. > > > > Signed-off-by: Coco Li <lixiaoyan@google.com> > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > --- > > include/linux/netdevice.h | 12 ++++++++++++ > > include/uapi/linux/if_link.h | 1 + > > net/core/dev.c | 1 + > > net/core/rtnetlink.c | 15 +++++++++++++++ > > net/core/sock.c | 6 ++++++ > > tools/include/uapi/linux/if_link.h | 1 + > > 6 files changed, 36 insertions(+) > > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > > index 61db67222c47664c179b6a5d3b6f15fdf8a02bdd..9ed348d8b6f1195514c3b5f85fbe2c45b3fa997f 100644 > > --- a/include/linux/netdevice.h > > +++ b/include/linux/netdevice.h > > @@ -1952,6 +1952,7 @@ enum netdev_ml_priv_type { > > * registered > > * @offload_xstats_l3: L3 HW stats for this netdevice. > > * @tso_ipv6_max_size: Maximum size of IPv6 TSO packets (driver/NIC limit) > > + * @gso_ipv6_max_size: Maximum size of IPv6 GSO packets (user/admin limit) > > * > > * FIXME: cleanup struct net_device such that network protocol info > > * moves out. > > @@ -2291,6 +2292,7 @@ struct net_device { > > netdevice_tracker dev_registered_tracker; > > struct rtnl_hw_stats64 *offload_xstats_l3; > > unsigned int tso_ipv6_max_size; > > + unsigned int gso_ipv6_max_size; > > }; > > #define to_net_dev(d) container_of(d, struct net_device, dev) > > > > Rather than have this as a device specific value would it be > advantageous to consider making this a namespace specific sysctl value > instead? 
Something along the lines of: > net.ipv6.conf.*.max_jumbogram_size > > It could also be applied generically to the GSO/GRO as the upper limit > for any frame assembled by the socket or GRO. > > The general idea is that might be desirable for admins to be able to > basically just set the maximum size they want to see for IPv6 frames > and if we could combine the GRO/GSO logic into a single sysctl that > could be set on a namespace basis instead of a device basis which would > be more difficult to track down. We already have the per-device limits > in the tso_ipv6_max_size for the outgoing frames so it seems like it > might make sense to make this per network namespace and defaultable > rather than per device and requiring an update for each device > instance. > At least Google found it was easier to have per-device controls, in terms of testing the feature and gradually deploying it. We have hosts with multiple NICs of different types, and we want to be able to control BIG TCP on a per-device basis. For instance, I had a bug in one of the implementations for one (non-upstream) driver that I could mitigate by setting a different limit only for this NIC, until the host can boot with a fixed kernel. We use ipvlan, with one private net-ns and IPv6 address per job, and we wanted to deploy BIG TCP on a per-job basis. I guess that if you want to add a sysctl automatically overriding the per-device setting, this could be done later?
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 61db67222c47664c179b6a5d3b6f15fdf8a02bdd..9ed348d8b6f1195514c3b5f85fbe2c45b3fa997f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1952,6 +1952,7 @@ enum netdev_ml_priv_type { * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * @tso_ipv6_max_size: Maximum size of IPv6 TSO packets (driver/NIC limit) + * @gso_ipv6_max_size: Maximum size of IPv6 GSO packets (user/admin limit) * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2291,6 +2292,7 @@ struct net_device { netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; unsigned int tso_ipv6_max_size; + unsigned int gso_ipv6_max_size; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -4874,6 +4876,10 @@ static inline void netif_set_gso_max_size(struct net_device *dev, { /* dev->gso_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_size, size); + + /* legacy drivers want to lower gso_max_size, regardless of family. 
*/ + size = min(size, dev->gso_ipv6_max_size); + WRITE_ONCE(dev->gso_ipv6_max_size, size); } static inline void netif_set_gso_max_segs(struct net_device *dev, @@ -4897,6 +4903,12 @@ static inline void netif_set_tso_ipv6_max_size(struct net_device *dev, dev->tso_ipv6_max_size = size; } +static inline void netif_set_gso_ipv6_max_size(struct net_device *dev, + unsigned int size) +{ + size = min(size, dev->tso_ipv6_max_size); + WRITE_ONCE(dev->gso_ipv6_max_size, size); +} static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index c8af031b692e52690a2760e9d79c9462185e2fc9..048a9c848a3a39596b6c3135553fdfb9a1fe37d2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -364,6 +364,7 @@ enum { IFLA_PARENT_DEV_BUS_NAME, IFLA_GRO_MAX_SIZE, IFLA_TSO_IPV6_MAX_SIZE, + IFLA_GSO_IPV6_MAX_SIZE, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index de28f634c18a65d1948a96db5678d38e9c871b1f..87f8b8cb39a61c8f5a444e3b341a97ba0a4c06d9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10468,6 +10468,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_MAX_SIZE; dev->tso_ipv6_max_size = GSO_MAX_SIZE; + dev->gso_ipv6_max_size = GSO_MAX_SIZE; dev->upper_level = 1; dev->lower_level = 1; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ab51b18cdb5d46b87d4a11d2f66a68968ba737d6..172de404c595c89e30651a091242a75be8f786b7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1028,6 +1028,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_IPV6_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GSO_IPV6_MAX_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + 
nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1734,6 +1735,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || nla_put_u32(skb, IFLA_TSO_IPV6_MAX_SIZE, dev->tso_ipv6_max_size) || + nla_put_u32(skb, IFLA_GSO_IPV6_MAX_SIZE, dev->gso_ipv6_max_size) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif @@ -1888,6 +1890,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_IPV6_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_GSO_IPV6_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2774,6 +2777,15 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GSO_IPV6_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV6_MAX_SIZE]); + + if (dev->gso_ipv6_max_size ^ max_size) { + netif_set_gso_ipv6_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); @@ -3249,6 +3261,9 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); + if (tb[IFLA_GSO_IPV6_MAX_SIZE]) + netif_set_gso_ipv6_max_size(dev, + nla_get_u32(tb[IFLA_GSO_IPV6_MAX_SIZE])); return dev; } diff --git a/net/core/sock.c b/net/core/sock.c index 1180a0cb01104561befa1f96deb71f36efcf12da..e0858e82bc386eb2779a0d6af6063b2078e6ea7b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2279,6 +2279,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = 
READ_ONCE(dst->dev->gso_max_size); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + sk_is_tcp(sk) && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_ipv6_max_size); +#endif sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 441615c39f0a24eeeb6e27b4ca88031bcc234cf8..e40cd575607872d3bff3bc1971df8c6426290562 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -349,6 +349,7 @@ enum { IFLA_PARENT_DEV_BUS_NAME, IFLA_GRO_MAX_SIZE, IFLA_TSO_IPV6_MAX_SIZE, + IFLA_GSO_IPV6_MAX_SIZE, __IFLA_MAX };