diff mbox series

[RFC,net-next,1/9] skb: introduce gro_disabled bit

Message ID b8c183a24285c2ab30c51622f4f9eff8f7a4752f.1718919473.git.yan@cloudflare.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series xdp: allow disable GRO per packet by XDP | expand

Checks

Context Check Description
netdev/series_format warning Series does not have a cover letter
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 895 this patch: 895
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 15 of 15 maintainers
netdev/build_clang success Errors and warnings before: 961 this patch: 961
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5930 this patch: 5930
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 82 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 135 this patch: 136
netdev/source_inline success Was 0 now: 0

Commit Message

Yan Zhai June 20, 2024, 10:19 p.m. UTC
Software GRO is currently controlled by a single switch, i.e.

  ethtool -K dev gro on|off

However, this is not always desired. When GRO is enabled, even if the
kernel cannot GRO certain traffic, it has to run through the GRO receive
handlers with no benefit.

There are also scenarios that turning off GRO is a requirement. For
example, our production environment has a scenario that a TC egress hook
may add multiple encapsulation headers to forwarded skbs for load
balancing and isolation purpose. The encapsulation is implemented via
BPF. But the problem arises then: there is no way to properly offload a
double-encapsulated packet, since skb only has network_header and
inner_network_header to track one layer of encapsulation, but not two.
On the other hand, not all the traffic through this device needs double
encapsulation. But we have to turn off GRO completely for any ingress
device as a result.

Introduce a bit on skb so that GRO engine can be notified to skip GRO on
this skb, rather than having to be 0-or-1 for all traffic.

Signed-off-by: Yan Zhai <yan@cloudflare.com>
---
 include/linux/netdevice.h |  9 +++++++--
 include/linux/skbuff.h    | 10 ++++++++++
 net/Kconfig               | 10 ++++++++++
 net/core/gro.c            |  2 +-
 net/core/gro_cells.c      |  2 +-
 net/core/skbuff.c         |  4 ++++
 6 files changed, 33 insertions(+), 4 deletions(-)

Comments

Alexander Lobakin June 21, 2024, 9:11 a.m. UTC | #1
From: Yan Zhai <yan@cloudflare.com>
Date: Thu, 20 Jun 2024 15:19:10 -0700

> Software GRO is currently controlled by a single switch, i.e.
> 
>   ethtool -K dev gro on|off
> 
> However, this is not always desired. When GRO is enabled, even if the
> kernel cannot GRO certain traffic, it has to run through the GRO receive
> handlers with no benefit.
> 
> There are also scenarios that turning off GRO is a requirement. For
> example, our production environment has a scenario that a TC egress hook
> may add multiple encapsulation headers to forwarded skbs for load
> balancing and isolation purpose. The encapsulation is implemented via
> BPF. But the problem arises then: there is no way to properly offload a
> double-encapsulated packet, since skb only has network_header and
> inner_network_header to track one layer of encapsulation, but not two.

Implement it in the kernel then? :D

> On the other hand, not all the traffic through this device needs double
> encapsulation. But we have to turn off GRO completely for any ingress
> device as a result.
> 
> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> this skb, rather than having to be 0-or-1 for all traffic.
> 
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
>  include/linux/netdevice.h |  9 +++++++--
>  include/linux/skbuff.h    | 10 ++++++++++
>  net/Kconfig               | 10 ++++++++++
>  net/core/gro.c            |  2 +-
>  net/core/gro_cells.c      |  2 +-
>  net/core/skbuff.c         |  4 ++++
>  6 files changed, 33 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index c83b390191d4..2ca0870b1221 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2415,11 +2415,16 @@ struct net_device {
>  	((dev)->devlink_port = (port));				\
>  })
>  
> -static inline bool netif_elide_gro(const struct net_device *dev)
> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>  {
> -	if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> +	if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>  		return true;
> +
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	return skb->gro_disabled;
> +#else
>  	return false;
> +#endif
>  }
>  
>  #define	NETDEV_ALIGN		32
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index f4cda3fbdb75..48b10ece95b5 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1008,6 +1008,9 @@ struct sk_buff {
>  #if IS_ENABLED(CONFIG_IP_SCTP)
>  	__u8			csum_not_inet:1;
>  #endif
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	__u8			gro_disabled:1;
> +#endif
>  
>  #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
>  	__u16			tc_index;	/* traffic control index */
> @@ -1215,6 +1218,13 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
>  #endif
>  }
>  
> +static inline void skb_disable_gro(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	skb->gro_disabled = 1;
> +#endif
> +}
> +
>  /**
>   * skb_unref - decrement the skb's reference count
>   * @skb: buffer
> diff --git a/net/Kconfig b/net/Kconfig
> index 9fe65fa26e48..47d1ee92df15 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -289,6 +289,16 @@ config MAX_SKB_FRAGS
>  	  and in drivers using build_skb().
>  	  If unsure, say 17.
>  
> +config SKB_GRO_CONTROL
> +	bool "allow disable GRO on per-packet basis"
> +	default y
> +	help
> +	  By default GRO can only be enabled or disabled per network device.
> +	  This can be cumbersome for certain scenarios.
> +	  Toggling this option will allow disabling GRO for selected packets,
> +	  e.g. by XDP programs, so that it is more flexibile.
> +	  Extra overhead should be minimal.

I don't think we need a Kconfig option for that. Can't it be
unconditional? Is there any real eye-visible overhead?

> +
>  config RPS
>  	bool "Receive packet steering"
>  	depends on SMP && SYSFS
> diff --git a/net/core/gro.c b/net/core/gro.c
> index b3b43de1a650..46232a0d1983 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -476,7 +476,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
>  	enum gro_result ret;
>  	int same_flow;
>  
> -	if (netif_elide_gro(skb->dev))
> +	if (netif_elide_gro(skb))
>  		goto normal;
>  
>  	gro_list_prepare(&gro_list->list, skb);
> diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
> index ff8e5b64bf6b..1bf15783300f 100644
> --- a/net/core/gro_cells.c
> +++ b/net/core/gro_cells.c
> @@ -20,7 +20,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
>  	if (unlikely(!(dev->flags & IFF_UP)))
>  		goto drop;
>  
> -	if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
> +	if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(skb)) {
>  		res = netif_rx(skb);
>  		goto unlock;
>  	}
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 2315c088e91d..82bd297921c1 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -6030,6 +6030,10 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
>  	ipvs_reset(skb);
>  	skb->mark = 0;
>  	skb_clear_tstamp(skb);
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	/* hand back GRO control to next netns */
> +	skb->gro_disabled = 0;
> +#endif
>  }
>  EXPORT_SYMBOL_GPL(skb_scrub_packet);

Thanks,
Olek
Paolo Abeni June 21, 2024, 9:49 a.m. UTC | #2
On Thu, 2024-06-20 at 15:19 -0700, Yan Zhai wrote:
> Software GRO is currently controlled by a single switch, i.e.
> 
>   ethtool -K dev gro on|off
> 
> However, this is not always desired. When GRO is enabled, even if the
> kernel cannot GRO certain traffic, it has to run through the GRO receive
> handlers with no benefit.
> 
> There are also scenarios that turning off GRO is a requirement. For
> example, our production environment has a scenario that a TC egress hook
> may add multiple encapsulation headers to forwarded skbs for load
> balancing and isolation purpose. The encapsulation is implemented via
> BPF. But the problem arises then: there is no way to properly offload a
> double-encapsulated packet, since skb only has network_header and
> inner_network_header to track one layer of encapsulation, but not two.
> On the other hand, not all the traffic through this device needs double
> encapsulation. But we have to turn off GRO completely for any ingress
> device as a result.
> 
> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> this skb, rather than having to be 0-or-1 for all traffic.
> 
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
>  include/linux/netdevice.h |  9 +++++++--
>  include/linux/skbuff.h    | 10 ++++++++++
>  net/Kconfig               | 10 ++++++++++
>  net/core/gro.c            |  2 +-
>  net/core/gro_cells.c      |  2 +-
>  net/core/skbuff.c         |  4 ++++
>  6 files changed, 33 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index c83b390191d4..2ca0870b1221 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2415,11 +2415,16 @@ struct net_device {
>  	((dev)->devlink_port = (port));				\
>  })
>  
> -static inline bool netif_elide_gro(const struct net_device *dev)
> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>  {
> -	if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> +	if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>  		return true;
> +
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	return skb->gro_disabled;
> +#else
>  	return false;
> +#endif

This will generate OoO if the gro_disabled is flipped in the middle of
a stream.

Assuming the above is fine for your use case (I think it's _not_ in
general), you could get the same result without an additional costly
bit in sk_buff.

Let xdp_frame_fixup_skb_offloading() return a bool - e.g. 'true' when
gro should be avoided - and let the NIC driver call netif_receive_skb()
instead of the gro rx hook for such packet.

All in all the approach implemented in this series does not look worthy
to me.

Thanks,

Paolo
Paolo Abeni June 21, 2024, 9:57 a.m. UTC | #3
On Thu, 2024-06-20 at 15:19 -0700, Yan Zhai wrote:
> Software GRO is currently controlled by a single switch, i.e.
> 
>   ethtool -K dev gro on|off
> 
> However, this is not always desired. When GRO is enabled, even if the
> kernel cannot GRO certain traffic, it has to run through the GRO receive
> handlers with no benefit.
> 
> There are also scenarios that turning off GRO is a requirement. For
> example, our production environment has a scenario that a TC egress hook
> may add multiple encapsulation headers to forwarded skbs for load
> balancing and isolation purpose. The encapsulation is implemented via
> BPF. But the problem arises then: there is no way to properly offload a
> double-encapsulated packet, since skb only has network_header and
> inner_network_header to track one layer of encapsulation, but not two.
> On the other hand, not all the traffic through this device needs double
> encapsulation. But we have to turn off GRO completely for any ingress
> device as a result.

Could you please add more details WRT this last statement? I'm unsure
if I understand your problem. My guess is as follow:

Your device receive some traffic, GRO and forward it, and the multiple
encapsulation can happen on such forwarded traffic (since I can't find
almost none of the above your message is mainly a wild guess).

Assuming I guessed correctly, I think you could solve the problem with
no kernel changes: redirect the to-be-tunneled traffic to some virtual
device and all TX offload on top of it and let the encap happen there.

Cheers,

Paolo
Willem de Bruijn June 21, 2024, 12:15 p.m. UTC | #4
Yan Zhai wrote:
> Software GRO is currently controlled by a single switch, i.e.
> 
>   ethtool -K dev gro on|off
> 
> However, this is not always desired. When GRO is enabled, even if the
> kernel cannot GRO certain traffic, it has to run through the GRO receive
> handlers with no benefit.
> 
> There are also scenarios that turning off GRO is a requirement. For
> example, our production environment has a scenario that a TC egress hook
> may add multiple encapsulation headers to forwarded skbs for load
> balancing and isolation purpose. The encapsulation is implemented via
> BPF. But the problem arises then: there is no way to properly offload a
> double-encapsulated packet, since skb only has network_header and
> inner_network_header to track one layer of encapsulation, but not two.
> On the other hand, not all the traffic through this device needs double
> encapsulation. But we have to turn off GRO completely for any ingress
> device as a result.
> 
> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> this skb, rather than having to be 0-or-1 for all traffic.
> 
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
>  include/linux/netdevice.h |  9 +++++++--
>  include/linux/skbuff.h    | 10 ++++++++++
>  net/Kconfig               | 10 ++++++++++
>  net/core/gro.c            |  2 +-
>  net/core/gro_cells.c      |  2 +-
>  net/core/skbuff.c         |  4 ++++
>  6 files changed, 33 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index c83b390191d4..2ca0870b1221 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2415,11 +2415,16 @@ struct net_device {
>  	((dev)->devlink_port = (port));				\
>  })
>  
> -static inline bool netif_elide_gro(const struct net_device *dev)
> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>  {
> -	if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> +	if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>  		return true;
> +
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	return skb->gro_disabled;
> +#else
>  	return false;
> +#endif

Yet more branches in the hot path.

Compile time configurability does not help, as that will be
enabled by distros.

For a fairly niche use case. Where functionality of GRO already
works. So just a performance for a very rare case at the cost of a
regression in the common case. A small regression perhaps, but death
by a thousand cuts.

>  }
>  
>  #define	NETDEV_ALIGN		32
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index f4cda3fbdb75..48b10ece95b5 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1008,6 +1008,9 @@ struct sk_buff {
>  #if IS_ENABLED(CONFIG_IP_SCTP)
>  	__u8			csum_not_inet:1;
>  #endif
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	__u8			gro_disabled:1;
> +#endif
>  
>  #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
>  	__u16			tc_index;	/* traffic control index */
> @@ -1215,6 +1218,13 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
>  #endif
>  }
>  
> +static inline void skb_disable_gro(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	skb->gro_disabled = 1;
> +#endif
> +}
> +
>  /**
>   * skb_unref - decrement the skb's reference count
>   * @skb: buffer
> diff --git a/net/Kconfig b/net/Kconfig
> index 9fe65fa26e48..47d1ee92df15 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -289,6 +289,16 @@ config MAX_SKB_FRAGS
>  	  and in drivers using build_skb().
>  	  If unsure, say 17.
>  
> +config SKB_GRO_CONTROL
> +	bool "allow disable GRO on per-packet basis"
> +	default y
> +	help
> +	  By default GRO can only be enabled or disabled per network device.
> +	  This can be cumbersome for certain scenarios.
> +	  Toggling this option will allow disabling GRO for selected packets,
> +	  e.g. by XDP programs, so that it is more flexibile.
> +	  Extra overhead should be minimal.
> +
>  config RPS
>  	bool "Receive packet steering"
>  	depends on SMP && SYSFS
> diff --git a/net/core/gro.c b/net/core/gro.c
> index b3b43de1a650..46232a0d1983 100644
> --- a/net/core/gro.c
> +++ b/net/core/gro.c
> @@ -476,7 +476,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
>  	enum gro_result ret;
>  	int same_flow;
>  
> -	if (netif_elide_gro(skb->dev))
> +	if (netif_elide_gro(skb))
>  		goto normal;
>  
>  	gro_list_prepare(&gro_list->list, skb);
> diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
> index ff8e5b64bf6b..1bf15783300f 100644
> --- a/net/core/gro_cells.c
> +++ b/net/core/gro_cells.c
> @@ -20,7 +20,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
>  	if (unlikely(!(dev->flags & IFF_UP)))
>  		goto drop;
>  
> -	if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
> +	if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(skb)) {
>  		res = netif_rx(skb);
>  		goto unlock;
>  	}
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 2315c088e91d..82bd297921c1 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -6030,6 +6030,10 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
>  	ipvs_reset(skb);
>  	skb->mark = 0;
>  	skb_clear_tstamp(skb);
> +#ifdef CONFIG_SKB_GRO_CONTROL
> +	/* hand back GRO control to next netns */
> +	skb->gro_disabled = 0;
> +#endif
>  }
>  EXPORT_SYMBOL_GPL(skb_scrub_packet);
>  
> -- 
> 2.30.2
> 
>
Daniel Borkmann June 21, 2024, 12:47 p.m. UTC | #5
On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> Yan Zhai wrote:
>> Software GRO is currently controlled by a single switch, i.e.
>>
>>    ethtool -K dev gro on|off
>>
>> However, this is not always desired. When GRO is enabled, even if the
>> kernel cannot GRO certain traffic, it has to run through the GRO receive
>> handlers with no benefit.
>>
>> There are also scenarios that turning off GRO is a requirement. For
>> example, our production environment has a scenario that a TC egress hook
>> may add multiple encapsulation headers to forwarded skbs for load
>> balancing and isolation purpose. The encapsulation is implemented via
>> BPF. But the problem arises then: there is no way to properly offload a
>> double-encapsulated packet, since skb only has network_header and
>> inner_network_header to track one layer of encapsulation, but not two.
>> On the other hand, not all the traffic through this device needs double
>> encapsulation. But we have to turn off GRO completely for any ingress
>> device as a result.
>>
>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
>> this skb, rather than having to be 0-or-1 for all traffic.
>>
>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
>> ---
>>   include/linux/netdevice.h |  9 +++++++--
>>   include/linux/skbuff.h    | 10 ++++++++++
>>   net/Kconfig               | 10 ++++++++++
>>   net/core/gro.c            |  2 +-
>>   net/core/gro_cells.c      |  2 +-
>>   net/core/skbuff.c         |  4 ++++
>>   6 files changed, 33 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index c83b390191d4..2ca0870b1221 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -2415,11 +2415,16 @@ struct net_device {
>>   	((dev)->devlink_port = (port));				\
>>   })
>>   
>> -static inline bool netif_elide_gro(const struct net_device *dev)
>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>>   {
>> -	if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
>> +	if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>>   		return true;
>> +
>> +#ifdef CONFIG_SKB_GRO_CONTROL
>> +	return skb->gro_disabled;
>> +#else
>>   	return false;
>> +#endif
> 
> Yet more branches in the hot path.
> 
> Compile time configurability does not help, as that will be
> enabled by distros.
> 
> For a fairly niche use case. Where functionality of GRO already
> works. So just a performance for a very rare case at the cost of a
> regression in the common case. A small regression perhaps, but death
> by a thousand cuts.

Mentioning it here b/c it perhaps fits in this context, longer time ago
there was the idea mentioned to have BPF operating as GRO engine which
might also help to reduce attack surface by only having to handle packets
of interest for the concrete production use case. Perhaps here meta data
buffer could be used to pass a notification from XDP to exit early w/o
aggregation.
Yan Zhai June 21, 2024, 2:29 p.m. UTC | #6
On Fri, Jun 21, 2024 at 4:49 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Thu, 2024-06-20 at 15:19 -0700, Yan Zhai wrote:
> > Software GRO is currently controlled by a single switch, i.e.
> >
> >   ethtool -K dev gro on|off
> >
> > However, this is not always desired. When GRO is enabled, even if the
> > kernel cannot GRO certain traffic, it has to run through the GRO receive
> > handlers with no benefit.
> >
> > There are also scenarios that turning off GRO is a requirement. For
> > example, our production environment has a scenario that a TC egress hook
> > may add multiple encapsulation headers to forwarded skbs for load
> > balancing and isolation purpose. The encapsulation is implemented via
> > BPF. But the problem arises then: there is no way to properly offload a
> > double-encapsulated packet, since skb only has network_header and
> > inner_network_header to track one layer of encapsulation, but not two.
> > On the other hand, not all the traffic through this device needs double
> > encapsulation. But we have to turn off GRO completely for any ingress
> > device as a result.
> >
> > Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> > this skb, rather than having to be 0-or-1 for all traffic.
> >
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > ---
> >  include/linux/netdevice.h |  9 +++++++--
> >  include/linux/skbuff.h    | 10 ++++++++++
> >  net/Kconfig               | 10 ++++++++++
> >  net/core/gro.c            |  2 +-
> >  net/core/gro_cells.c      |  2 +-
> >  net/core/skbuff.c         |  4 ++++
> >  6 files changed, 33 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index c83b390191d4..2ca0870b1221 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -2415,11 +2415,16 @@ struct net_device {
> >       ((dev)->devlink_port = (port));                         \
> >  })
> >
> > -static inline bool netif_elide_gro(const struct net_device *dev)
> > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >  {
> > -     if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > +     if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >               return true;
> > +
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > +     return skb->gro_disabled;
> > +#else
> >       return false;
> > +#endif
>
> This will generate OoO if the gro_disabled is flipped in the middle of
> a stream.
>
> Assuming the above is fine for your use case (I think it's _not_ in
> general), you could get the same result without an additional costly
> bit in sk_buff.

Calling it per-packet control seems inaccurate here, the motivation is
to give users the ability to control per-flow behaviors. OoO is indeed
a consequence if users don't do it correctly.

>
> Let xdp_frame_fixup_skb_offloading() return a bool - e.g. 'true' when
> gro should be avoided - and let the NIC driver call netif_receive_skb()
> instead of the gro rx hook for such packet.
>
For rx on a single device, directly calling netif_receive_skb is
reasonable. For tunnel receivers it is kinda inconsistent IMHO. For
example, we terminate GRE tunnels in a netns, and it is necessary to
disable GRO on both the entering veth device and also the GRE tunnel
to shutdown GRO. That's why I'd hope to use a bit of skb, to be
consistent within the same netns. Let me add a bit more context to
clarify why we think this is necessary in another thread.

best,
Yan

> All in all the approach implemented in this series does not look worthy
> to me.
>
> Thanks,
>
> Paolo
>
Yan Zhai June 21, 2024, 3:17 p.m. UTC | #7
On Fri, Jun 21, 2024 at 4:57 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Thu, 2024-06-20 at 15:19 -0700, Yan Zhai wrote:
> > Software GRO is currently controlled by a single switch, i.e.
> >
> >   ethtool -K dev gro on|off
> >
> > However, this is not always desired. When GRO is enabled, even if the
> > kernel cannot GRO certain traffic, it has to run through the GRO receive
> > handlers with no benefit.
> >
> > There are also scenarios that turning off GRO is a requirement. For
> > example, our production environment has a scenario that a TC egress hook
> > may add multiple encapsulation headers to forwarded skbs for load
> > balancing and isolation purpose. The encapsulation is implemented via
> > BPF. But the problem arises then: there is no way to properly offload a
> > double-encapsulated packet, since skb only has network_header and
> > inner_network_header to track one layer of encapsulation, but not two.
> > On the other hand, not all the traffic through this device needs double
> > encapsulation. But we have to turn off GRO completely for any ingress
> > device as a result.
>
> Could you please add more details WRT this last statement? I'm unsure
> if I understand your problem. My guess is as follow:
>
> Your device receive some traffic, GRO and forward it, and the multiple
> encapsulation can happen on such forwarded traffic (since I can't find
> almost none of the above your message is mainly a wild guess).
>
> Assuming I guessed correctly, I think you could solve the problem with
> no kernel changes: redirect the to-be-tunneled traffic to some virtual
> device and all TX offload on top of it and let the encap happen there.
>
Let's say we have a netns to implement network functions like
DoS/IDS/Load balancing for IP traffic. The netns has a single veth
entrance/exit, and a bunch of ip tunnels, GRE/XFRM, to receive and
tunnel traffic from customer's private sites. Some of such traffic
could be encapsulated to reach services outside of the netns (but on
the same server), for example, customers may also want to use our
CDN/Caching functionality. The complication here is that we might have
to further tunnel traffic to another data center, because the routing
is asymmetric so we can receive client traffic from US but the
response may come back to our EU data center, and in order to do
layer4/layer7 service, we have to make sure those land on the same
server.

It is true that a device like a veth pair or even netkit could allow
the kernel segment GRO packets for us. But this does not sound
actually right in terms of design: if we know already some packet path
should not be GRO-ed, can we enforce this rather than having to
aggregate it then chop it down soon after? For our specific case
though, it also becomes a headache for analytics and customer rules
that rely on ingress device name, we probably need to pair each tunnel
with such a virtual device. There could be hundreds of ipsec tunnels,
and that seems to be a substantial overhead for both data path and
control plane management.

To make this a bit more general, what I'd like to introduce here is:
when we know GRO is either problematic or simply not useful (like to
some UDP traffic), can we have more control toggle to skip it?

thanks
Yan

> Cheers,
>
> Paolo
>
Yan Zhai June 21, 2024, 3:34 p.m. UTC | #8
> > -static inline bool netif_elide_gro(const struct net_device *dev)
> > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >  {
> > -     if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > +     if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >               return true;
> > +
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > +     return skb->gro_disabled;
> > +#else
> >       return false;
> > +#endif
>
> Yet more branches in the hot path.
>
> Compile time configurability does not help, as that will be
> enabled by distros.
>
> For a fairly niche use case. Where functionality of GRO already
> works. So just a performance for a very rare case at the cost of a
> regression in the common case. A small regression perhaps, but death
> by a thousand cuts.
>

I share your concern on operating on this hotpath. Will a
static_branch + sysctl make it less aggressive? Speaking of
performance, I'd hope this can give us more control so we can achieve
the best of two worlds: for TCP and some UDP traffic, we can enable
GRO, while for some other classes that we know GRO does no good or
even harm, let's disable GRO to save more cycles. The key observation
is that developers may already know which traffic is blessed by GRO,
but lack a way to realize it.

best
Yan
Yan Zhai June 21, 2024, 3:40 p.m. UTC | #9
On Fri, Jun 21, 2024 at 4:13 AM Alexander Lobakin
<aleksander.lobakin@intel.com> wrote:
>
> From: Yan Zhai <yan@cloudflare.com>
> Date: Thu, 20 Jun 2024 15:19:10 -0700
>
> > Software GRO is currently controlled by a single switch, i.e.
> >
> >   ethtool -K dev gro on|off
> >
> > However, this is not always desired. When GRO is enabled, even if the
> > kernel cannot GRO certain traffic, it has to run through the GRO receive
> > handlers with no benefit.
> >
> > There are also scenarios that turning off GRO is a requirement. For
> > example, our production environment has a scenario that a TC egress hook
> > may add multiple encapsulation headers to forwarded skbs for load
> > balancing and isolation purpose. The encapsulation is implemented via
> > BPF. But the problem arises then: there is no way to properly offload a
> > double-encapsulated packet, since skb only has network_header and
> > inner_network_header to track one layer of encapsulation, but not two.
>
> Implement it in the kernel then? :D
>
It would be a big commitment that I dare not make :) Out of curiosity,
is it something that devices can handle today?

> > On the other hand, not all the traffic through this device needs double
> > encapsulation. But we have to turn off GRO completely for any ingress
> > device as a result.
> >
> > Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> > this skb, rather than having to be 0-or-1 for all traffic.
> >
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > ---
> >  include/linux/netdevice.h |  9 +++++++--
> >  include/linux/skbuff.h    | 10 ++++++++++
> >  net/Kconfig               | 10 ++++++++++
> >  net/core/gro.c            |  2 +-
> >  net/core/gro_cells.c      |  2 +-
> >  net/core/skbuff.c         |  4 ++++
> >  6 files changed, 33 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index c83b390191d4..2ca0870b1221 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -2415,11 +2415,16 @@ struct net_device {
> >       ((dev)->devlink_port = (port));                         \
> >  })
> >
> > -static inline bool netif_elide_gro(const struct net_device *dev)
> > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >  {
> > -     if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > +     if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >               return true;
> > +
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > +     return skb->gro_disabled;
> > +#else
> >       return false;
> > +#endif
> >  }
> >
> >  #define      NETDEV_ALIGN            32
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index f4cda3fbdb75..48b10ece95b5 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -1008,6 +1008,9 @@ struct sk_buff {
> >  #if IS_ENABLED(CONFIG_IP_SCTP)
> >       __u8                    csum_not_inet:1;
> >  #endif
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > +     __u8                    gro_disabled:1;
> > +#endif
> >
> >  #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
> >       __u16                   tc_index;       /* traffic control index */
> > @@ -1215,6 +1218,13 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
> >  #endif
> >  }
> >
> > +static inline void skb_disable_gro(struct sk_buff *skb)
> > +{
> > +#ifdef CONFIG_SKB_GRO_CONTROL
> > +     skb->gro_disabled = 1;
> > +#endif
> > +}
> > +
> >  /**
> >   * skb_unref - decrement the skb's reference count
> >   * @skb: buffer
> > diff --git a/net/Kconfig b/net/Kconfig
> > index 9fe65fa26e48..47d1ee92df15 100644
> > --- a/net/Kconfig
> > +++ b/net/Kconfig
> > @@ -289,6 +289,16 @@ config MAX_SKB_FRAGS
> >         and in drivers using build_skb().
> >         If unsure, say 17.
> >
> > +config SKB_GRO_CONTROL
> > +     bool "allow disable GRO on per-packet basis"
> > +     default y
> > +     help
> > +       By default GRO can only be enabled or disabled per network device.
> > +       This can be cumbersome for certain scenarios.
> > +       Toggling this option will allow disabling GRO for selected packets,
> > +       e.g. by XDP programs, so that it is more flexibile.
> > +       Extra overhead should be minimal.
>
> I don't think we need a Kconfig option for that. Can't it be
> unconditional? Is there any real eye-visible overhead?

Normally if it is a single branch I would not worry about it. But I
know I am touching a hot potato here so I just want to be cautious :)

best
Yan

>
> Thanks,
> Olek
Yan Zhai June 21, 2024, 4 p.m. UTC | #10
On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> > Yan Zhai wrote:
> >> Software GRO is currently controlled by a single switch, i.e.
> >>
> >>    ethtool -K dev gro on|off
> >>
> >> However, this is not always desired. When GRO is enabled, even if the
> >> kernel cannot GRO certain traffic, it has to run through the GRO receive
> >> handlers with no benefit.
> >>
> >> There are also scenarios that turning off GRO is a requirement. For
> >> example, our production environment has a scenario that a TC egress hook
> >> may add multiple encapsulation headers to forwarded skbs for load
> >> balancing and isolation purpose. The encapsulation is implemented via
> >> BPF. But the problem arises then: there is no way to properly offload a
> >> double-encapsulated packet, since skb only has network_header and
> >> inner_network_header to track one layer of encapsulation, but not two.
> >> On the other hand, not all the traffic through this device needs double
> >> encapsulation. But we have to turn off GRO completely for any ingress
> >> device as a result.
> >>
> >> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> >> this skb, rather than having to be 0-or-1 for all traffic.
> >>
> >> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> >> ---
> >>   include/linux/netdevice.h |  9 +++++++--
> >>   include/linux/skbuff.h    | 10 ++++++++++
> >>   net/Kconfig               | 10 ++++++++++
> >>   net/core/gro.c            |  2 +-
> >>   net/core/gro_cells.c      |  2 +-
> >>   net/core/skbuff.c         |  4 ++++
> >>   6 files changed, 33 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >> index c83b390191d4..2ca0870b1221 100644
> >> --- a/include/linux/netdevice.h
> >> +++ b/include/linux/netdevice.h
> >> @@ -2415,11 +2415,16 @@ struct net_device {
> >>      ((dev)->devlink_port = (port));                         \
> >>   })
> >>
> >> -static inline bool netif_elide_gro(const struct net_device *dev)
> >> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >>   {
> >> -    if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> >> +    if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >>              return true;
> >> +
> >> +#ifdef CONFIG_SKB_GRO_CONTROL
> >> +    return skb->gro_disabled;
> >> +#else
> >>      return false;
> >> +#endif
> >
> > Yet more branches in the hot path.
> >
> > Compile time configurability does not help, as that will be
> > enabled by distros.
> >
> > For a fairly niche use case. Where functionality of GRO already
> > works. So just a performance for a very rare case at the cost of a
> > regression in the common case. A small regression perhaps, but death
> > by a thousand cuts.
>
> Mentioning it here b/c it perhaps fits in this context, longer time ago
> there was the idea mentioned to have BPF operating as GRO engine which
> might also help to reduce attack surface by only having to handle packets
> of interest for the concrete production use case. Perhaps here meta data
> buffer could be used to pass a notification from XDP to exit early w/o
> aggregation.

Metadata is in fact one of our interests as well. We discussed using
metadata instead of a skb bit to carry this information internally.
Since metadata is opaque atm so it seems the only option is to have a
GRO control hook before napi_gro_receive, and let BPF decide
netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
it could indeed be more flexible, but the cons is that it could be
even more slower than taking a bit on skb. I am actually open to
either approach, as long as it gives us more control on when to enable
GRO :)

To extend the discussion a bit, putting GRO aside, I think some common
hook before GRO would be still valuable moving forward: it is a
limited window where the driver code has both access to XDP context
and skb. Today we do not have a good way to transfer HW offloading
info to skbs if XDP redirect-to-cpu or if XDP encap-and-tx for load
balancing purposes. The XDP metadata infrastructure already allows XDP
to read this information with driver supports, so to complete that, a
place to use it (which I introduced as
xdp_buff/frame_fixup_skb_offloading in a later patch) would be
beneficial to pass on things like the flow hash, vlan information,
etc.

best
Yan
Daniel Borkmann June 21, 2024, 4:15 p.m. UTC | #11
On 6/21/24 6:00 PM, Yan Zhai wrote:
> On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
>>> Yan Zhai wrote:
>>>> Software GRO is currently controlled by a single switch, i.e.
>>>>
>>>>     ethtool -K dev gro on|off
>>>>
>>>> However, this is not always desired. When GRO is enabled, even if the
>>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
>>>> handlers with no benefit.
>>>>
>>>> There are also scenarios that turning off GRO is a requirement. For
>>>> example, our production environment has a scenario that a TC egress hook
>>>> may add multiple encapsulation headers to forwarded skbs for load
>>>> balancing and isolation purpose. The encapsulation is implemented via
>>>> BPF. But the problem arises then: there is no way to properly offload a
>>>> double-encapsulated packet, since skb only has network_header and
>>>> inner_network_header to track one layer of encapsulation, but not two.
>>>> On the other hand, not all the traffic through this device needs double
>>>> encapsulation. But we have to turn off GRO completely for any ingress
>>>> device as a result.
>>>>
>>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
>>>> this skb, rather than having to be 0-or-1 for all traffic.
>>>>
>>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
>>>> ---
>>>>    include/linux/netdevice.h |  9 +++++++--
>>>>    include/linux/skbuff.h    | 10 ++++++++++
>>>>    net/Kconfig               | 10 ++++++++++
>>>>    net/core/gro.c            |  2 +-
>>>>    net/core/gro_cells.c      |  2 +-
>>>>    net/core/skbuff.c         |  4 ++++
>>>>    6 files changed, 33 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>> index c83b390191d4..2ca0870b1221 100644
>>>> --- a/include/linux/netdevice.h
>>>> +++ b/include/linux/netdevice.h
>>>> @@ -2415,11 +2415,16 @@ struct net_device {
>>>>       ((dev)->devlink_port = (port));                         \
>>>>    })
>>>>
>>>> -static inline bool netif_elide_gro(const struct net_device *dev)
>>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>>>>    {
>>>> -    if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
>>>> +    if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>>>>               return true;
>>>> +
>>>> +#ifdef CONFIG_SKB_GRO_CONTROL
>>>> +    return skb->gro_disabled;
>>>> +#else
>>>>       return false;
>>>> +#endif
>>>
>>> Yet more branches in the hot path.
>>>
>>> Compile time configurability does not help, as that will be
>>> enabled by distros.
>>>
>>> For a fairly niche use case. Where functionality of GRO already
>>> works. So just a performance for a very rare case at the cost of a
>>> regression in the common case. A small regression perhaps, but death
>>> by a thousand cuts.
>>
>> Mentioning it here b/c it perhaps fits in this context, longer time ago
>> there was the idea mentioned to have BPF operating as GRO engine which
>> might also help to reduce attack surface by only having to handle packets
>> of interest for the concrete production use case. Perhaps here meta data
>> buffer could be used to pass a notification from XDP to exit early w/o
>> aggregation.
> 
> Metadata is in fact one of our interests as well. We discussed using
> metadata instead of a skb bit to carry this information internally.
> Since metadata is opaque atm so it seems the only option is to have a
> GRO control hook before napi_gro_receive, and let BPF decide
> netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
> it could indeed be more flexible, but the cons is that it could be
> even more slower than taking a bit on skb. I am actually open to
> either approach, as long as it gives us more control on when to enable
> GRO :)

Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
counter map in XDP? For packets which should not be GRO-aggregated you
add count++ into the meta data area, and this forces GRO to not aggregate
since meta data that needs to be transported to tc BPF layer mismatches
(and therefore the contract/intent is that tc BPF needs to see the different
meta data passed to it).

Thanks,
Daniel
Yan Zhai June 21, 2024, 5:20 p.m. UTC | #12
On Fri, Jun 21, 2024 at 11:41 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 6/21/24 6:00 PM, Yan Zhai wrote:
> > On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> >>> Yan Zhai wrote:
> >>>> Software GRO is currently controlled by a single switch, i.e.
> >>>>
> >>>>     ethtool -K dev gro on|off
> >>>>
> >>>> However, this is not always desired. When GRO is enabled, even if the
> >>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
> >>>> handlers with no benefit.
> >>>>
> >>>> There are also scenarios that turning off GRO is a requirement. For
> >>>> example, our production environment has a scenario that a TC egress hook
> >>>> may add multiple encapsulation headers to forwarded skbs for load
> >>>> balancing and isolation purpose. The encapsulation is implemented via
> >>>> BPF. But the problem arises then: there is no way to properly offload a
> >>>> double-encapsulated packet, since skb only has network_header and
> >>>> inner_network_header to track one layer of encapsulation, but not two.
> >>>> On the other hand, not all the traffic through this device needs double
> >>>> encapsulation. But we have to turn off GRO completely for any ingress
> >>>> device as a result.
> >>>>
> >>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> >>>> this skb, rather than having to be 0-or-1 for all traffic.
> >>>>
> >>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> >>>> ---
> >>>>    include/linux/netdevice.h |  9 +++++++--
> >>>>    include/linux/skbuff.h    | 10 ++++++++++
> >>>>    net/Kconfig               | 10 ++++++++++
> >>>>    net/core/gro.c            |  2 +-
> >>>>    net/core/gro_cells.c      |  2 +-
> >>>>    net/core/skbuff.c         |  4 ++++
> >>>>    6 files changed, 33 insertions(+), 4 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >>>> index c83b390191d4..2ca0870b1221 100644
> >>>> --- a/include/linux/netdevice.h
> >>>> +++ b/include/linux/netdevice.h
> >>>> @@ -2415,11 +2415,16 @@ struct net_device {
> >>>>       ((dev)->devlink_port = (port));                         \
> >>>>    })
> >>>>
> >>>> -static inline bool netif_elide_gro(const struct net_device *dev)
> >>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >>>>    {
> >>>> -    if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> >>>> +    if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >>>>               return true;
> >>>> +
> >>>> +#ifdef CONFIG_SKB_GRO_CONTROL
> >>>> +    return skb->gro_disabled;
> >>>> +#else
> >>>>       return false;
> >>>> +#endif
> >>>
> >>> Yet more branches in the hot path.
> >>>
> >>> Compile time configurability does not help, as that will be
> >>> enabled by distros.
> >>>
> >>> For a fairly niche use case. Where functionality of GRO already
> >>> works. So just a performance for a very rare case at the cost of a
> >>> regression in the common case. A small regression perhaps, but death
> >>> by a thousand cuts.
> >>
> >> Mentioning it here b/c it perhaps fits in this context, longer time ago
> >> there was the idea mentioned to have BPF operating as GRO engine which
> >> might also help to reduce attack surface by only having to handle packets
> >> of interest for the concrete production use case. Perhaps here meta data
> >> buffer could be used to pass a notification from XDP to exit early w/o
> >> aggregation.
> >
> > Metadata is in fact one of our interests as well. We discussed using
> > metadata instead of a skb bit to carry this information internally.
> > Since metadata is opaque atm so it seems the only option is to have a
> > GRO control hook before napi_gro_receive, and let BPF decide
> > netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
> > it could indeed be more flexible, but the cons is that it could be
> > even more slower than taking a bit on skb. I am actually open to
> > either approach, as long as it gives us more control on when to enable
> > GRO :)
>
> Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
> counter map in XDP? For packets which should not be GRO-aggregated you
> add count++ into the meta data area, and this forces GRO to not aggregate
> since meta data that needs to be transported to tc BPF layer mismatches
> (and therefore the contract/intent is that tc BPF needs to see the different
> meta data passed to it).
>

Very very sorry to resendx2 :( Not sure why my laptop also decided to
switch on html... I removed CCs from the message hopefully it reduces
some noises...

We did this before accidentally (we put a timestamp for debugging
purposes in metadata) and this actually caused about 20% of OoO for
TCP in production: all PSH packets are reordered. GRO does not fire
the packet to the upper layer when a diff in metadata is found for a
non-PSH packet, instead it is queued as a “new flow” on the GRO list
and waits for flushing. When a PSH packet arrives, its semantic is to
flush this packet immediately and thus precedes earlier packets of the
same flow.

The artifact of this behavior can also cause latency increase and hash
degradation since the mismatch does not result in flushing, it results
in extra queuing on the same hash list, until the list is flushed.
It’s another reason we want to disable GRO when we know metadata can
be set differently for tracing purposes (I didn’t mention this though
because it seems distracting).

Thanks
Yan

> Thanks,
> Daniel
Willem de Bruijn June 23, 2024, 8:23 a.m. UTC | #13
Yan Zhai wrote:
> On Fri, Jun 21, 2024 at 11:41 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >
> > On 6/21/24 6:00 PM, Yan Zhai wrote:
> > > On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> > >> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> > >>> Yan Zhai wrote:
> > >>>> Software GRO is currently controlled by a single switch, i.e.
> > >>>>
> > >>>>     ethtool -K dev gro on|off
> > >>>>
> > >>>> However, this is not always desired. When GRO is enabled, even if the
> > >>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
> > >>>> handlers with no benefit.
> > >>>>
> > >>>> There are also scenarios that turning off GRO is a requirement. For
> > >>>> example, our production environment has a scenario that a TC egress hook
> > >>>> may add multiple encapsulation headers to forwarded skbs for load
> > >>>> balancing and isolation purpose. The encapsulation is implemented via
> > >>>> BPF. But the problem arises then: there is no way to properly offload a
> > >>>> double-encapsulated packet, since skb only has network_header and
> > >>>> inner_network_header to track one layer of encapsulation, but not two.
> > >>>> On the other hand, not all the traffic through this device needs double
> > >>>> encapsulation. But we have to turn off GRO completely for any ingress
> > >>>> device as a result.
> > >>>>
> > >>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> > >>>> this skb, rather than having to be 0-or-1 for all traffic.
> > >>>>
> > >>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > >>>> ---
> > >>>>    include/linux/netdevice.h |  9 +++++++--
> > >>>>    include/linux/skbuff.h    | 10 ++++++++++
> > >>>>    net/Kconfig               | 10 ++++++++++
> > >>>>    net/core/gro.c            |  2 +-
> > >>>>    net/core/gro_cells.c      |  2 +-
> > >>>>    net/core/skbuff.c         |  4 ++++
> > >>>>    6 files changed, 33 insertions(+), 4 deletions(-)
> > >>>>
> > >>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > >>>> index c83b390191d4..2ca0870b1221 100644
> > >>>> --- a/include/linux/netdevice.h
> > >>>> +++ b/include/linux/netdevice.h
> > >>>> @@ -2415,11 +2415,16 @@ struct net_device {
> > >>>>       ((dev)->devlink_port = (port));                         \
> > >>>>    })
> > >>>>
> > >>>> -static inline bool netif_elide_gro(const struct net_device *dev)
> > >>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > >>>>    {
> > >>>> -    if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > >>>> +    if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > >>>>               return true;
> > >>>> +
> > >>>> +#ifdef CONFIG_SKB_GRO_CONTROL
> > >>>> +    return skb->gro_disabled;
> > >>>> +#else
> > >>>>       return false;
> > >>>> +#endif
> > >>>
> > >>> Yet more branches in the hot path.
> > >>>
> > >>> Compile time configurability does not help, as that will be
> > >>> enabled by distros.
> > >>>
> > >>> For a fairly niche use case. Where functionality of GRO already
> > >>> works. So just a performance for a very rare case at the cost of a
> > >>> regression in the common case. A small regression perhaps, but death
> > >>> by a thousand cuts.
> > >>
> > >> Mentioning it here b/c it perhaps fits in this context, longer time ago
> > >> there was the idea mentioned to have BPF operating as GRO engine which
> > >> might also help to reduce attack surface by only having to handle packets
> > >> of interest for the concrete production use case. Perhaps here meta data
> > >> buffer could be used to pass a notification from XDP to exit early w/o
> > >> aggregation.
> > >
> > > Metadata is in fact one of our interests as well. We discussed using
> > > metadata instead of a skb bit to carry this information internally.
> > > Since metadata is opaque atm so it seems the only option is to have a
> > > GRO control hook before napi_gro_receive, and let BPF decide
> > > netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
> > > it could indeed be more flexible, but the cons is that it could be
> > > even more slower than taking a bit on skb. I am actually open to
> > > either approach, as long as it gives us more control on when to enable
> > > GRO :)
> >
> > Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
> > counter map in XDP? For packets which should not be GRO-aggregated you
> > add count++ into the meta data area, and this forces GRO to not aggregate
> > since meta data that needs to be transported to tc BPF layer mismatches
> > (and therefore the contract/intent is that tc BPF needs to see the different
> > meta data passed to it).
> >
> 
> We did this before accidentally (we put a timestamp for debugging
> purposes in metadata) and this actually caused about 20% of OoO for
> TCP in production: all PSH packets are reordered. GRO does not fire
> the packet to the upper layer when a diff in metadata is found for a
> non-PSH packet, instead it is queued as a “new flow” on the GRO list
> and waits for flushing. When a PSH packet arrives, its semantic is to
> flush this packet immediately and thus precedes earlier packets of the
> same flow.

Is that a bug in XDP metadata handling for GRO?

Mismatching metadata should not be taken as separate flows, but as a
flush condition.
Willem de Bruijn June 23, 2024, 8:27 a.m. UTC | #14
Yan Zhai wrote:
> > > -static inline bool netif_elide_gro(const struct net_device *dev)
> > > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > >  {
> > > -     if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > > +     if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > >               return true;
> > > +
> > > +#ifdef CONFIG_SKB_GRO_CONTROL
> > > +     return skb->gro_disabled;
> > > +#else
> > >       return false;
> > > +#endif
> >
> > Yet more branches in the hot path.
> >
> > Compile time configurability does not help, as that will be
> > enabled by distros.
> >
> > For a fairly niche use case. Where functionality of GRO already
> > works. So just a performance for a very rare case at the cost of a
> > regression in the common case. A small regression perhaps, but death
> > by a thousand cuts.
> >
> 
> I share your concern on operating on this hotpath. Will a
> static_branch + sysctl make it less aggressive?

That is always a possibility. But we have to use it judiciously,
cannot add a sysctl for every branch.

I'm still of the opinion that Paolo shared that this seems a lot of
complexity for a fairly minor performance optimization for a rare
case.

> Speaking of
> performance, I'd hope this can give us more control so we can achieve
> the best of two worlds: for TCP and some UDP traffic, we can enable
> GRO, while for some other classes that we know GRO does no good or
> even harm, let's disable GRO to save more cycles. The key observation
> is that developers may already know which traffic is blessed by GRO,
> but lack a way to realize it.

Following up also on Daniel's point on using BPF as GRO engine. Even
earlier I tried to add an option to selectively enable GRO protocols
without BPF. Definitely worthwhile to be able to disable GRO handlers
to reduce attack surface to bad input.


> 
> best
> Yan
Daniel Borkmann June 24, 2024, 1:30 p.m. UTC | #15
On 6/23/24 10:23 AM, Willem de Bruijn wrote:
> Yan Zhai wrote:
>> On Fri, Jun 21, 2024 at 11:41 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>> On 6/21/24 6:00 PM, Yan Zhai wrote:
>>>> On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>>> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
>>>>>> Yan Zhai wrote:
>>>>>>> Software GRO is currently controlled by a single switch, i.e.
>>>>>>>
>>>>>>>      ethtool -K dev gro on|off
>>>>>>>
>>>>>>> However, this is not always desired. When GRO is enabled, even if the
>>>>>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
>>>>>>> handlers with no benefit.
>>>>>>>
>>>>>>> There are also scenarios that turning off GRO is a requirement. For
>>>>>>> example, our production environment has a scenario that a TC egress hook
>>>>>>> may add multiple encapsulation headers to forwarded skbs for load
>>>>>>> balancing and isolation purpose. The encapsulation is implemented via
>>>>>>> BPF. But the problem arises then: there is no way to properly offload a
>>>>>>> double-encapsulated packet, since skb only has network_header and
>>>>>>> inner_network_header to track one layer of encapsulation, but not two.
>>>>>>> On the other hand, not all the traffic through this device needs double
>>>>>>> encapsulation. But we have to turn off GRO completely for any ingress
>>>>>>> device as a result.
>>>>>>>
>>>>>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
>>>>>>> this skb, rather than having to be 0-or-1 for all traffic.
>>>>>>>
>>>>>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
>>>>>>> ---
>>>>>>>     include/linux/netdevice.h |  9 +++++++--
>>>>>>>     include/linux/skbuff.h    | 10 ++++++++++
>>>>>>>     net/Kconfig               | 10 ++++++++++
>>>>>>>     net/core/gro.c            |  2 +-
>>>>>>>     net/core/gro_cells.c      |  2 +-
>>>>>>>     net/core/skbuff.c         |  4 ++++
>>>>>>>     6 files changed, 33 insertions(+), 4 deletions(-)
>>>>>>>
>>>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>>>> index c83b390191d4..2ca0870b1221 100644
>>>>>>> --- a/include/linux/netdevice.h
>>>>>>> +++ b/include/linux/netdevice.h
>>>>>>> @@ -2415,11 +2415,16 @@ struct net_device {
>>>>>>>        ((dev)->devlink_port = (port));                         \
>>>>>>>     })
>>>>>>>
>>>>>>> -static inline bool netif_elide_gro(const struct net_device *dev)
>>>>>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
>>>>>>>     {
>>>>>>> -    if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
>>>>>>> +    if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
>>>>>>>                return true;
>>>>>>> +
>>>>>>> +#ifdef CONFIG_SKB_GRO_CONTROL
>>>>>>> +    return skb->gro_disabled;
>>>>>>> +#else
>>>>>>>        return false;
>>>>>>> +#endif
>>>>>>
>>>>>> Yet more branches in the hot path.
>>>>>>
>>>>>> Compile time configurability does not help, as that will be
>>>>>> enabled by distros.
>>>>>>
>>>>>> For a fairly niche use case. Where functionality of GRO already
>>>>>> works. So just a performance for a very rare case at the cost of a
>>>>>> regression in the common case. A small regression perhaps, but death
>>>>>> by a thousand cuts.
>>>>>
>>>>> Mentioning it here b/c it perhaps fits in this context, longer time ago
>>>>> there was the idea mentioned to have BPF operating as GRO engine which
>>>>> might also help to reduce attack surface by only having to handle packets
>>>>> of interest for the concrete production use case. Perhaps here meta data
>>>>> buffer could be used to pass a notification from XDP to exit early w/o
>>>>> aggregation.
>>>>
>>>> Metadata is in fact one of our interests as well. We discussed using
>>>> metadata instead of a skb bit to carry this information internally.
>>>> Since metadata is opaque atm so it seems the only option is to have a
>>>> GRO control hook before napi_gro_receive, and let BPF decide
>>>> netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
>>>> it could indeed be more flexible, but the cons is that it could be
>>>> even more slower than taking a bit on skb. I am actually open to
>>>> either approach, as long as it gives us more control on when to enable
>>>> GRO :)
>>>
>>> Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
>>> counter map in XDP? For packets which should not be GRO-aggregated you
>>> add count++ into the meta data area, and this forces GRO to not aggregate
>>> since meta data that needs to be transported to tc BPF layer mismatches
>>> (and therefore the contract/intent is that tc BPF needs to see the different
>>> meta data passed to it).
>>
>> We did this before accidentally (we put a timestamp for debugging
>> purposes in metadata) and this actually caused about 20% of OoO for
>> TCP in production: all PSH packets are reordered. GRO does not fire
>> the packet to the upper layer when a diff in metadata is found for a
>> non-PSH packet, instead it is queued as a “new flow” on the GRO list
>> and waits for flushing. When a PSH packet arrives, its semantic is to
>> flush this packet immediately and thus precedes earlier packets of the
>> same flow.
> 
> Is that a bug in XDP metadata handling for GRO?
> 
> Mismatching metadata should not be taken as separate flows, but as a
> flush condition.

Definitely a bug as it should flush. If noone is faster I can add it to my
backlog todo to fix it, but might probably take a week before I get to it.

Thanks,
Daniel
Yan Zhai June 24, 2024, 5:49 p.m. UTC | #16
On Mon, Jun 24, 2024 at 8:30 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 6/23/24 10:23 AM, Willem de Bruijn wrote:
> > Yan Zhai wrote:
> >> On Fri, Jun 21, 2024 at 11:41 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>> On 6/21/24 6:00 PM, Yan Zhai wrote:
> >>>> On Fri, Jun 21, 2024 at 8:13 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>>> On 6/21/24 2:15 PM, Willem de Bruijn wrote:
> >>>>>> Yan Zhai wrote:
> >>>>>>> Software GRO is currently controlled by a single switch, i.e.
> >>>>>>>
> >>>>>>>      ethtool -K dev gro on|off
> >>>>>>>
> >>>>>>> However, this is not always desired. When GRO is enabled, even if the
> >>>>>>> kernel cannot GRO certain traffic, it has to run through the GRO receive
> >>>>>>> handlers with no benefit.
> >>>>>>>
> >>>>>>> There are also scenarios that turning off GRO is a requirement. For
> >>>>>>> example, our production environment has a scenario that a TC egress hook
> >>>>>>> may add multiple encapsulation headers to forwarded skbs for load
> >>>>>>> balancing and isolation purpose. The encapsulation is implemented via
> >>>>>>> BPF. But the problem arises then: there is no way to properly offload a
> >>>>>>> double-encapsulated packet, since skb only has network_header and
> >>>>>>> inner_network_header to track one layer of encapsulation, but not two.
> >>>>>>> On the other hand, not all the traffic through this device needs double
> >>>>>>> encapsulation. But we have to turn off GRO completely for any ingress
> >>>>>>> device as a result.
> >>>>>>>
> >>>>>>> Introduce a bit on skb so that GRO engine can be notified to skip GRO on
> >>>>>>> this skb, rather than having to be 0-or-1 for all traffic.
> >>>>>>>
> >>>>>>> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> >>>>>>> ---
> >>>>>>>     include/linux/netdevice.h |  9 +++++++--
> >>>>>>>     include/linux/skbuff.h    | 10 ++++++++++
> >>>>>>>     net/Kconfig               | 10 ++++++++++
> >>>>>>>     net/core/gro.c            |  2 +-
> >>>>>>>     net/core/gro_cells.c      |  2 +-
> >>>>>>>     net/core/skbuff.c         |  4 ++++
> >>>>>>>     6 files changed, 33 insertions(+), 4 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >>>>>>> index c83b390191d4..2ca0870b1221 100644
> >>>>>>> --- a/include/linux/netdevice.h
> >>>>>>> +++ b/include/linux/netdevice.h
> >>>>>>> @@ -2415,11 +2415,16 @@ struct net_device {
> >>>>>>>        ((dev)->devlink_port = (port));                         \
> >>>>>>>     })
> >>>>>>>
> >>>>>>> -static inline bool netif_elide_gro(const struct net_device *dev)
> >>>>>>> +static inline bool netif_elide_gro(const struct sk_buff *skb)
> >>>>>>>     {
> >>>>>>> -    if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> >>>>>>> +    if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> >>>>>>>                return true;
> >>>>>>> +
> >>>>>>> +#ifdef CONFIG_SKB_GRO_CONTROL
> >>>>>>> +    return skb->gro_disabled;
> >>>>>>> +#else
> >>>>>>>        return false;
> >>>>>>> +#endif
> >>>>>>
> >>>>>> Yet more branches in the hot path.
> >>>>>>
> >>>>>> Compile time configurability does not help, as that will be
> >>>>>> enabled by distros.
> >>>>>>
> >>>>>> For a fairly niche use case. Where functionality of GRO already
> >>>>>> works. So just a performance for a very rare case at the cost of a
> >>>>>> regression in the common case. A small regression perhaps, but death
> >>>>>> by a thousand cuts.
> >>>>>
> >>>>> Mentioning it here b/c it perhaps fits in this context, longer time ago
> >>>>> there was the idea mentioned to have BPF operating as GRO engine which
> >>>>> might also help to reduce attack surface by only having to handle packets
> >>>>> of interest for the concrete production use case. Perhaps here meta data
> >>>>> buffer could be used to pass a notification from XDP to exit early w/o
> >>>>> aggregation.
> >>>>
> >>>> Metadata is in fact one of our interests as well. We discussed using
> >>>> metadata instead of a skb bit to carry this information internally.
> >>>> Since metadata is opaque atm so it seems the only option is to have a
> >>>> GRO control hook before napi_gro_receive, and let BPF decide
> >>>> netif_receive_skb or napi_gro_receive (echo what Paolo said). With BPF
> >>>> it could indeed be more flexible, but the cons is that it could be
> >>>> even more slower than taking a bit on skb. I am actually open to
> >>>> either approach, as long as it gives us more control on when to enable
> >>>> GRO :)
> >>>
> >>> Oh wait, one thing that just came to mind.. have you tried u64 per-CPU
> >>> counter map in XDP? For packets which should not be GRO-aggregated you
> >>> add count++ into the meta data area, and this forces GRO to not aggregate
> >>> since meta data that needs to be transported to tc BPF layer mismatches
> >>> (and therefore the contract/intent is that tc BPF needs to see the different
> >>> meta data passed to it).
> >>
> >> We did this before accidentally (we put a timestamp for debugging
> >> purposes in metadata) and this actually caused about 20% of OoO for
> >> TCP in production: all PSH packets are reordered. GRO does not fire
> >> the packet to the upper layer when a diff in metadata is found for a
> >> non-PSH packet, instead it is queued as a “new flow” on the GRO list
> >> and waits for flushing. When a PSH packet arrives, its semantic is to
> >> flush this packet immediately and thus precedes earlier packets of the
> >> same flow.
> >
> > Is that a bug in XDP metadata handling for GRO?
> >
> > Mismatching metadata should not be taken as separate flows, but as a
> > flush condition.
>
> Definitely a bug as it should flush. If noone is faster I can add it to my
> backlog todo to fix it, but might probably take a week before I get to it.
>
In theory we should flush if the same flow has different metadata.
However "same flow" is not finally confirmed until GRO proceeds to the
TCP layer in this case. So to achieve this flush semantic, metadata
should be compared at the leaf protocol handlers instead of initially
inside dev_gro_receive. I am not quite sure if it is worthwhile, or
just allow skipping GRO from XDP, since it's the XDP program who sets
this metadata.

Yan

> Thanks,
> Daniel
Yan Zhai June 24, 2024, 6:17 p.m. UTC | #17
On Sun, Jun 23, 2024 at 3:27 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Yan Zhai wrote:
> > > > -static inline bool netif_elide_gro(const struct net_device *dev)
> > > > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > > >  {
> > > > -     if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > > > +     if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > > >               return true;
> > > > +
> > > > +#ifdef CONFIG_SKB_GRO_CONTROL
> > > > +     return skb->gro_disabled;
> > > > +#else
> > > >       return false;
> > > > +#endif
> > >
> > > Yet more branches in the hot path.
> > >
> > > Compile time configurability does not help, as that will be
> > > enabled by distros.
> > >
> > > For a fairly niche use case. Where functionality of GRO already
> > > works. So just a performance for a very rare case at the cost of a
> > > regression in the common case. A small regression perhaps, but death
> > > by a thousand cuts.
> > >
> >
> > I share your concern on operating on this hotpath. Will a
> > static_branch + sysctl make it less aggressive?
>
> That is always a possibility. But we have to use it judiciously,
> cannot add a sysctl for every branch.
>
> I'm still of the opinion that Paolo shared that this seems a lot of
> complexity for a fairly minor performance optimization for a rare
> case.
>
Actually combining the discussion in this thread, I think it would be
more than the corner cases that we encounter. Let me elaborate below.

> > Speaking of
> > performance, I'd hope this can give us more control so we can achieve
> > the best of two worlds: for TCP and some UDP traffic, we can enable
> > GRO, while for some other classes that we know GRO does no good or
> > even harm, let's disable GRO to save more cycles. The key observation
> > is that developers may already know which traffic is blessed by GRO,
> > but lack a way to realize it.
>
> Following up also on Daniel's point on using BPF as GRO engine. Even
> earlier I tried to add an option to selectively enable GRO protocols
> without BPF. Definitely worthwhile to be able to disable GRO handlers
> to reduce attack surface to bad input.
>
I was probably staring too hard at my own things, which is indeed a
corner case. But reducing the attack surface is indeed a good
motivation for this patch. I checked briefly with our DoS team today,
the DoS scenario will definitely benefit from skipping GRO, for
example on SYN/RST floods. XDP is our main weapon to drop attack
traffic today, but it does not always drop 100% of the floods, and
time by time it does need to fall back to iptables due to the delay of
XDP program assembly or the BPF limitation on analyzing the packet. I
did an ad hoc measurement just now on a mostly idle server, with
~1.3Mpps SYN flood concentrated on one CPU and dropped them early in
raw-PREROUTING. w/ GRO this would consume about 35-41% of the CPU
time, while w/o GRO the time dropped to 9-12%. This seems a pretty
significant breath room under heavy attacks.

But I am not sure I understand "BPF as GRO engine" here, it seems to
me that being able to disable GRO by XDP is already good enough. Any
more motivations to do more complex work here?

best
Yan

>
> >
> > best
> > Yan
>
>
Willem de Bruijn June 30, 2024, 1:40 p.m. UTC | #18
Yan Zhai wrote:
> On Sun, Jun 23, 2024 at 3:27 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Yan Zhai wrote:
> > > > > -static inline bool netif_elide_gro(const struct net_device *dev)
> > > > > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > > > >  {
> > > > > -     if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > > > > +     if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > > > >               return true;
> > > > > +
> > > > > +#ifdef CONFIG_SKB_GRO_CONTROL
> > > > > +     return skb->gro_disabled;
> > > > > +#else
> > > > >       return false;
> > > > > +#endif
> > > >
> > > > Yet more branches in the hot path.
> > > >
> > > > Compile time configurability does not help, as that will be
> > > > enabled by distros.
> > > >
> > > > For a fairly niche use case. Where functionality of GRO already
> > > > works. So just a performance for a very rare case at the cost of a
> > > > regression in the common case. A small regression perhaps, but death
> > > > by a thousand cuts.
> > > >
> > >
> > > I share your concern on operating on this hotpath. Will a
> > > static_branch + sysctl make it less aggressive?
> >
> > That is always a possibility. But we have to use it judiciously,
> > cannot add a sysctl for every branch.
> >
> > I'm still of the opinion that Paolo shared that this seems a lot of
> > complexity for a fairly minor performance optimization for a rare
> > case.
> >
> Actually combining the discussion in this thread, I think it would be
> more than the corner cases that we encounter. Let me elaborate below.
> 
> > > Speaking of
> > > performance, I'd hope this can give us more control so we can achieve
> > > the best of two worlds: for TCP and some UDP traffic, we can enable
> > > GRO, while for some other classes that we know GRO does no good or
> > > even harm, let's disable GRO to save more cycles. The key observation
> > > is that developers may already know which traffic is blessed by GRO,
> > > but lack a way to realize it.
> >
> > Following up also on Daniel's point on using BPF as GRO engine. Even
> > earlier I tried to add an option to selectively enable GRO protocols
> > without BPF. Definitely worthwhile to be able to disable GRO handlers
> > to reduce attack surface to bad input.
> >
> I was probably staring too hard at my own things, which is indeed a
> corner case. But reducing the attack surface is indeed a good
> motivation for this patch. I checked briefly with our DoS team today,
> the DoS scenario will definitely benefit from skipping GRO, for
> example on SYN/RST floods. XDP is our main weapon to drop attack
> traffic today, but it does not always drop 100% of the floods, and
> time by time it does need to fall back to iptables due to the delay of
> XDP program assembly or the BPF limitation on analyzing the packet. I
> did an ad hoc measurement just now on a mostly idle server, with
> ~1.3Mpps SYN flood concentrated on one CPU and dropped them early in
> raw-PREROUTING. w/ GRO this would consume about 35-41% of the CPU
> time, while w/o GRO the time dropped to 9-12%. This seems a pretty
> significant breath room under heavy attacks.

A GRO opt-out might make sense.

A long time ago I sent a patch that configured GRO protocols using
syscalls, selectively (un)registering handlers. The interface was not
very nice, so I did not pursue it further. On the upside, the datapath
did not introduce any extra code. The intent was to reduce attack
surface of packet parsing code.

A few concerns with an XDP based opt-out. It is more work to enable:
requires compiling and load an XDP program. It adds cycles in the
hot path. And I do not entirely understand when an XDP program will be
able to detect that a packet should not enter the GRO engine, but
cannot drop the packet (your netfilter example above).

> But I am not sure I understand "BPF as GRO engine" here, it seems to
> me that being able to disable GRO by XDP is already good enough. Any
> more motivations to do more complex work here?

FWIW, we looked into this a few years ago. Analogous to the BPF flow
dissector: if the BPF program is loaded, use that instead of the C
code path. But we did not arrive at a practical implementation at the
time. Things may have changed, but one issue is how to store and
access the list (or table) of outstanding GRO skbs.

> best
> Yan
> 
> >
> > >
> > > best
> > > Yan
> >
> >
Yan Zhai July 3, 2024, 6:46 p.m. UTC | #19
On Sun, Jun 30, 2024 at 8:40 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Yan Zhai wrote:
> > On Sun, Jun 23, 2024 at 3:27 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Yan Zhai wrote:
> > > > > > -static inline bool netif_elide_gro(const struct net_device *dev)
> > > > > > +static inline bool netif_elide_gro(const struct sk_buff *skb)
> > > > > >  {
> > > > > > -     if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
> > > > > > +     if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
> > > > > >               return true;
> > > > > > +
> > > > > > +#ifdef CONFIG_SKB_GRO_CONTROL
> > > > > > +     return skb->gro_disabled;
> > > > > > +#else
> > > > > >       return false;
> > > > > > +#endif
> > > > >
> > > > > Yet more branches in the hot path.
> > > > >
> > > > > Compile time configurability does not help, as that will be
> > > > > enabled by distros.
> > > > >
> > > > > For a fairly niche use case. Where functionality of GRO already
> > > > > works. So just a performance for a very rare case at the cost of a
> > > > > regression in the common case. A small regression perhaps, but death
> > > > > by a thousand cuts.
> > > > >
> > > >
> > > > I share your concern on operating on this hotpath. Will a
> > > > static_branch + sysctl make it less aggressive?
> > >
> > > That is always a possibility. But we have to use it judiciously,
> > > cannot add a sysctl for every branch.
> > >
> > > I'm still of the opinion that Paolo shared that this seems a lot of
> > > complexity for a fairly minor performance optimization for a rare
> > > case.
> > >
> > Actually combining the discussion in this thread, I think it would be
> > more than the corner cases that we encounter. Let me elaborate below.
> >
> > > > Speaking of
> > > > performance, I'd hope this can give us more control so we can achieve
> > > > the best of two worlds: for TCP and some UDP traffic, we can enable
> > > > GRO, while for some other classes that we know GRO does no good or
> > > > even harm, let's disable GRO to save more cycles. The key observation
> > > > is that developers may already know which traffic is blessed by GRO,
> > > > but lack a way to realize it.
> > >
> > > Following up also on Daniel's point on using BPF as GRO engine. Even
> > > earlier I tried to add an option to selectively enable GRO protocols
> > > without BPF. Definitely worthwhile to be able to disable GRO handlers
> > > to reduce attack surface to bad input.
> > >
> > I was probably staring too hard at my own things, which is indeed a
> > corner case. But reducing the attack surface is indeed a good
> > motivation for this patch. I checked briefly with our DoS team today,
> > the DoS scenario will definitely benefit from skipping GRO, for
> > example on SYN/RST floods. XDP is our main weapon to drop attack
> > traffic today, but it does not always drop 100% of the floods, and
> > time by time it does need to fall back to iptables due to the delay of
> > XDP program assembly or the BPF limitation on analyzing the packet. I
> > did an ad hoc measurement just now on a mostly idle server, with
> > ~1.3Mpps SYN flood concentrated on one CPU and dropped them early in
> > raw-PREROUTING. w/ GRO this would consume about 35-41% of the CPU
> > time, while w/o GRO the time dropped to 9-12%. This seems a pretty
> > significant breath room under heavy attacks.
>
> A GRO opt-out might make sense.
>
> A long time ago I sent a patch that configured GRO protocols using
> syscalls, selectively (un)registering handlers. The interface was not
> very nice, so I did not pursue it further. On the upside, the datapath
> did not introduce any extra code. The intent was to reduce attack
> surface of packet parsing code.
>
> A few concerns with an XDP based opt-out. It is more work to enable:
> requires compiling and load an XDP program. It adds cycles in the
> hot path. And I do not entirely understand when an XDP program will be
> able to detect that a packet should not enter the GRO engine, but
> cannot drop the packet (your netfilter example above).
>
Agree that XDP based approach is just offering for XDP users. But
given the way GRO works on flows today, it feels really hard to
provide an elegant and generic interface.

For DoS scenarios, let me expand it a bit. Packets themselves could be
a good indicator that they should not go through GRO, like fragments,
or with special flags like SYN/RST/PSH. Under an attack, we sometimes
also need conntrack or SYN cookies to help determine if some packets
are legit or not. We have a few kfuncs to lookup conntrack entries in
XDP today, but I am not sure if we can confidently drop them without
completely mirroring full conntrack functionality. Rather, using
conntrack as extra heuristics to mark suspicious packets in XDP, like
TCP packets out of windows, etc, and still leave verdict to iptables
seems a safer thing to do. I did observe a few occurrences in the past
where a substantial amount of SYN flood passed through XDP, with some
clever tricks in faking flow headers. Those were eventually dealt by
SYN cookies, but all of those go through GRO unnecessarily although
they all carry a SYN flag. Would be definitely beneficial to save
every cycle under attacks.

> > But I am not sure I understand "BPF as GRO engine" here, it seems to
> > me that being able to disable GRO by XDP is already good enough. Any
> > more motivations to do more complex work here?
>
> FWIW, we looked into this a few years ago. Analogous to the BPF flow
> dissector: if the BPF program is loaded, use that instead of the C
> code path. But we did not arrive at a practical implementation at the
> time. Things may have changed, but one issue is how to store and
> access the list (or table) of outstanding GRO skbs.
>
I see, thanks for the explanation.

Yan

> > best
> > Yan
> >
> > >
> > > >
> > > > best
> > > > Yan
> > >
> > >
>
>
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c83b390191d4..2ca0870b1221 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2415,11 +2415,16 @@  struct net_device {
 	((dev)->devlink_port = (port));				\
 })
 
-static inline bool netif_elide_gro(const struct net_device *dev)
+static inline bool netif_elide_gro(const struct sk_buff *skb)
 {
-	if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
+	if (!(skb->dev->features & NETIF_F_GRO) || skb->dev->xdp_prog)
 		return true;
+
+#ifdef CONFIG_SKB_GRO_CONTROL
+	return skb->gro_disabled;
+#else
 	return false;
+#endif
 }
 
 #define	NETDEV_ALIGN		32
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f4cda3fbdb75..48b10ece95b5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1008,6 +1008,9 @@  struct sk_buff {
 #if IS_ENABLED(CONFIG_IP_SCTP)
 	__u8			csum_not_inet:1;
 #endif
+#ifdef CONFIG_SKB_GRO_CONTROL
+	__u8			gro_disabled:1;
+#endif
 
 #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
 	__u16			tc_index;	/* traffic control index */
@@ -1215,6 +1218,13 @@  static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
 #endif
 }
 
+static inline void skb_disable_gro(struct sk_buff *skb)
+{
+#ifdef CONFIG_SKB_GRO_CONTROL
+	skb->gro_disabled = 1;
+#endif
+}
+
 /**
  * skb_unref - decrement the skb's reference count
  * @skb: buffer
diff --git a/net/Kconfig b/net/Kconfig
index 9fe65fa26e48..47d1ee92df15 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -289,6 +289,16 @@  config MAX_SKB_FRAGS
 	  and in drivers using build_skb().
 	  If unsure, say 17.
 
+config SKB_GRO_CONTROL
+	bool "allow disable GRO on per-packet basis"
+	default y
+	help
+	  By default GRO can only be enabled or disabled per network device.
+	  This can be cumbersome for certain scenarios.
+	  Toggling this option will allow disabling GRO for selected packets,
+	  e.g. by XDP programs, so that it is more flexibile.
+	  Extra overhead should be minimal.
+
 config RPS
 	bool "Receive packet steering"
 	depends on SMP && SYSFS
diff --git a/net/core/gro.c b/net/core/gro.c
index b3b43de1a650..46232a0d1983 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -476,7 +476,7 @@  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	enum gro_result ret;
 	int same_flow;
 
-	if (netif_elide_gro(skb->dev))
+	if (netif_elide_gro(skb))
 		goto normal;
 
 	gro_list_prepare(&gro_list->list, skb);
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index ff8e5b64bf6b..1bf15783300f 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -20,7 +20,7 @@  int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
 	if (unlikely(!(dev->flags & IFF_UP)))
 		goto drop;
 
-	if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
+	if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(skb)) {
 		res = netif_rx(skb);
 		goto unlock;
 	}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2315c088e91d..82bd297921c1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6030,6 +6030,10 @@  void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 	ipvs_reset(skb);
 	skb->mark = 0;
 	skb_clear_tstamp(skb);
+#ifdef CONFIG_SKB_GRO_CONTROL
+	/* hand back GRO control to next netns */
+	skb->gro_disabled = 0;
+#endif
 }
 EXPORT_SYMBOL_GPL(skb_scrub_packet);