diff mbox series

net: xdp: allow user space to request a smaller packet headroom requirement

Message ID 20220314102210.92329-1-nbd@nbd.name (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series net: xdp: allow user space to request a smaller packet headroom requirement | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next success VM_Test
netdev/tree_selection success Not a local patch, async

Commit Message

Felix Fietkau March 14, 2022, 10:22 a.m. UTC
Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is
rounded up to L1 cache size, it ends up being at least 64 bytes on the most
common platforms.
On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP
adds an extra forced pskb_expand_head call when enabling SKB XDP, which can
be quite expensive.
Many XDP programs need only very little headroom, so it can be beneficial
to have a way to opt-out of the 256 bytes headroom requirement.

Add an extra flag XDP_FLAGS_SMALL_HEADROOM that can be set when attaching
the XDP program, which reduces the minimum headroom to 64 bytes.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 include/linux/netdevice.h          | 1 +
 include/uapi/linux/bpf.h           | 1 +
 include/uapi/linux/if_link.h       | 4 +++-
 net/core/dev.c                     | 9 ++++++++-
 tools/include/uapi/linux/if_link.h | 4 +++-
 5 files changed, 16 insertions(+), 3 deletions(-)

Comments

Jesper D. Brouer March 14, 2022, 8:39 p.m. UTC | #1
(Cc. BPF list and other XDP maintainers)

On 14/03/2022 11.22, Felix Fietkau wrote:
> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is
> rounded up to L1 cache size, it ends up being at least 64 bytes on the most
> common platforms.
> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP
> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can
> be quite expensive.
> Many XDP programs need only very little headroom, so it can be beneficial
> to have a way to opt-out of the 256 bytes headroom requirement.

IMHO 64 bytes is too small.
We are using this area for struct xdp_frame and also for metadata
(XDP-hints).  This will limit us from growing these structures for
the sake of generic-XDP.

I'm fine with reducing this to 192 bytes, as most Intel drivers
have this headroom, and have de facto established that this is
a valid XDP headroom, even for native-XDP.

We could go as small as two cachelines (128 bytes), as if xdp_frame
and metadata grow above a cache-line (64 bytes) each, then we have
done something wrong (performance-wise).


> Add an extra flag XDP_FLAGS_SMALL_HEADROOM that can be set when attaching
> the XDP program, which reduces the minimum headroom to 64 bytes.

I don't like a flags approach.

Multiple disadvantages.
  (a) Harder to use
  (b) Now reading a new cache-line in net_device

> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
>   include/linux/netdevice.h          | 1 +
>   include/uapi/linux/bpf.h           | 1 +
>   include/uapi/linux/if_link.h       | 4 +++-
>   net/core/dev.c                     | 9 ++++++++-
>   tools/include/uapi/linux/if_link.h | 4 +++-
>   5 files changed, 16 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 0d994710b335..f6f270a5e301 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2274,6 +2274,7 @@ struct net_device {
>   	bool			proto_down;
>   	unsigned		wol_enabled:1;
>   	unsigned		threaded:1;
> +	unsigned		xdp_small_headroom:1;
>   

Looks like we need to read this cache-line, in a XDP (generic) fastpath.

>   	struct list_head	net_notifier_list;
>   
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 4eebea830613..7635dfb02313 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5688,6 +5688,7 @@ struct bpf_xdp_sock {
>   };
>   
>   #define XDP_PACKET_HEADROOM 256
> +#define XDP_PACKET_HEADROOM_SMALL 64

Define it 192 instead.

>   
>   /* User return codes for XDP prog type.
>    * A valid XDP program must return one of these defined values. All other
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index e003a0b9b4b2..acb996334910 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -1275,11 +1275,13 @@ enum {
>   #define XDP_FLAGS_DRV_MODE		(1U << 2)
>   #define XDP_FLAGS_HW_MODE		(1U << 3)
>   #define XDP_FLAGS_REPLACE		(1U << 4)
> +#define XDP_FLAGS_SMALL_HEADROOM	(1U << 5)
>   #define XDP_FLAGS_MODES			(XDP_FLAGS_SKB_MODE | \
>   					 XDP_FLAGS_DRV_MODE | \
>   					 XDP_FLAGS_HW_MODE)
>   #define XDP_FLAGS_MASK			(XDP_FLAGS_UPDATE_IF_NOEXIST | \
> -					 XDP_FLAGS_MODES | XDP_FLAGS_REPLACE)
> +					 XDP_FLAGS_MODES | XDP_FLAGS_REPLACE | \
> +					 XDP_FLAGS_SMALL_HEADROOM)
>   
>   /* These are stored into IFLA_XDP_ATTACHED on dump. */
>   enum {
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 8d25ec5b3af7..cb12379b8f11 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4722,6 +4722,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
>   				     struct xdp_buff *xdp,
>   				     struct bpf_prog *xdp_prog)
>   {
> +	int min_headroom = XDP_PACKET_HEADROOM;
>   	u32 act = XDP_DROP;
>   
>   	/* Reinjected packets coming from act_mirred or similar should
> @@ -4730,12 +4731,15 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
>   	if (skb_is_redirected(skb))
>   		return XDP_PASS;
>   
> +	if (skb->dev->xdp_small_headroom)
> +		min_headroom = XDP_PACKET_HEADROOM_SMALL;
> +
>   	/* XDP packets must be linear and must have sufficient headroom
>   	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
>   	 * native XDP provides, thus we need to do it here as well.
>   	 */
>   	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
> -	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
> +	    skb_headroom(skb) < min_headroom) {

Use define XDP_PACKET_HEADROOM_SMALL here directly.

>   		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
>   		int troom = skb->tail + skb->data_len - skb->end;
>   
> @@ -9135,6 +9139,9 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
>   			return err;
>   	}
>   
> +	if (mode == XDP_MODE_SKB)
> +		dev->xdp_small_headroom = !!(flags & XDP_FLAGS_SMALL_HEADROOM);
> +
>   	if (link)
>   		dev_xdp_set_link(dev, mode, link);
>   	else
> diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
> index e1ba2d51b717..0df737a6c489 100644
> --- a/tools/include/uapi/linux/if_link.h
> +++ b/tools/include/uapi/linux/if_link.h
> @@ -1185,11 +1185,13 @@ enum {
>   #define XDP_FLAGS_DRV_MODE		(1U << 2)
>   #define XDP_FLAGS_HW_MODE		(1U << 3)
>   #define XDP_FLAGS_REPLACE		(1U << 4)
> +#define XDP_FLAGS_SMALL_HEADROOM	(1U << 5)
>   #define XDP_FLAGS_MODES			(XDP_FLAGS_SKB_MODE | \
>   					 XDP_FLAGS_DRV_MODE | \
>   					 XDP_FLAGS_HW_MODE)
>   #define XDP_FLAGS_MASK			(XDP_FLAGS_UPDATE_IF_NOEXIST | \
> -					 XDP_FLAGS_MODES | XDP_FLAGS_REPLACE)
> +					 XDP_FLAGS_MODES | XDP_FLAGS_REPLACE | \
> +					 XDP_FLAGS_SMALL_HEADROOM)
>   
>   /* These are stored into IFLA_XDP_ATTACHED on dump. */
>   enum {
> 

--Jesper
Felix Fietkau March 14, 2022, 9:42 p.m. UTC | #2
On 14.03.22 21:39, Jesper D. Brouer wrote:
> (Cc. BPF list and other XDP maintainers)
> 
> On 14/03/2022 11.22, Felix Fietkau wrote:
>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is
>> rounded up to L1 cache size, it ends up being at least 64 bytes on the most
>> common platforms.
>> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP
>> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can
>> be quite expensive.
>> Many XDP programs need only very little headroom, so it can be beneficial
>> to have a way to opt-out of the 256 bytes headroom requirement.
> 
> IMHO 64 bytes is too small.
> We are using this area for struct xdp_frame and also for metadata
> (XDP-hints).  This will limit us from growing this structures for
> the sake of generic-XDP.
> 
> I'm fine with reducting this to 192 bytes, as most Intel drivers
> have this headroom, and have defacto established that this is
> a valid XDP headroom, even for native-XDP.
> 
> We could go a small as two cachelines 128 bytes, as if xdp_frame
> and metadata grows above a cache-line (64 bytes) each, then we have
> done something wrong (performance wise).
Here's some background on why I chose 64 bytes: I'm currently 
implementing a userspace + xdp program to act as a generic fastpath to 
speed up network bridging.
For that I need to support a very diverse set of network drivers 
(including a lot of WLAN drivers).
My XDP program only needs up to 4 bytes extra headroom (for a VLAN header).
I made this headroom reduction opt-in, so that by default generic-XDP 
programs can still rely on 256 bytes headroom.

If we make the small version any bigger than 64 bytes, it limits my 
options to:
1) bump NET_SKB_PAD accordingly at the risk of creating issues with 
buffer management for some affected drivers (IMHO not likely to be 
accepted upstream)
2) create patches for each and every driver that could possibly get used 
on OpenWrt to make my approach viable (there's so many of them, so I 
think that's not really feasible either)
3) stick with non-upstream hacks for dealing with this in OpenWrt
I don't really like any of those options, but I can't think of any other 
solution right now.

If I take the pskb_expand_head hit from the headroom mismatch, the 
result is that my bridge accelerator code actually decreases performance 
instead of making anything better.

- Felix
Toke Høiland-Jørgensen March 14, 2022, 10:16 p.m. UTC | #3
Felix Fietkau <nbd@nbd.name> writes:

> On 14.03.22 21:39, Jesper D. Brouer wrote:
>> (Cc. BPF list and other XDP maintainers)
>> 
>> On 14/03/2022 11.22, Felix Fietkau wrote:
>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is
>>> rounded up to L1 cache size, it ends up being at least 64 bytes on the most
>>> common platforms.
>>> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP
>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can
>>> be quite expensive.
>>> Many XDP programs need only very little headroom, so it can be beneficial
>>> to have a way to opt-out of the 256 bytes headroom requirement.
>> 
>> IMHO 64 bytes is too small.
>> We are using this area for struct xdp_frame and also for metadata
>> (XDP-hints).  This will limit us from growing this structures for
>> the sake of generic-XDP.
>> 
>> I'm fine with reducting this to 192 bytes, as most Intel drivers
>> have this headroom, and have defacto established that this is
>> a valid XDP headroom, even for native-XDP.
>> 
>> We could go a small as two cachelines 128 bytes, as if xdp_frame
>> and metadata grows above a cache-line (64 bytes) each, then we have
>> done something wrong (performance wise).
> Here's some background on why I chose 64 bytes: I'm currently 
> implementing a userspace + xdp program to act as generic fastpath to 
> speed network bridging.

Any reason this can't run in the TC ingress hook instead? Generic XDP is
a bit of an odd duck, and I'm not a huge fan of special-casing it this
way...

-Toke
Daniel Borkmann March 14, 2022, 10:20 p.m. UTC | #4
On 3/14/22 11:16 PM, Toke Høiland-Jørgensen wrote:
> Felix Fietkau <nbd@nbd.name> writes:
>> On 14.03.22 21:39, Jesper D. Brouer wrote:
>>> (Cc. BPF list and other XDP maintainers)
>>> On 14/03/2022 11.22, Felix Fietkau wrote:
>>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is
>>>> rounded up to L1 cache size, it ends up being at least 64 bytes on the most
>>>> common platforms.
>>>> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP
>>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can
>>>> be quite expensive.
>>>> Many XDP programs need only very little headroom, so it can be beneficial
>>>> to have a way to opt-out of the 256 bytes headroom requirement.
>>>
>>> IMHO 64 bytes is too small.
>>> We are using this area for struct xdp_frame and also for metadata
>>> (XDP-hints).  This will limit us from growing this structures for
>>> the sake of generic-XDP.
>>>
>>> I'm fine with reducting this to 192 bytes, as most Intel drivers
>>> have this headroom, and have defacto established that this is
>>> a valid XDP headroom, even for native-XDP.
>>>
>>> We could go a small as two cachelines 128 bytes, as if xdp_frame
>>> and metadata grows above a cache-line (64 bytes) each, then we have
>>> done something wrong (performance wise).
>> Here's some background on why I chose 64 bytes: I'm currently
>> implementing a userspace + xdp program to act as generic fastpath to
>> speed network bridging.
> 
> Any reason this can't run in the TC ingress hook instead? Generic XDP is
> a bit of an odd duck, and I'm not a huge fan of special-casing it this
> way...

+1, would have been fine with generic reduction to just down to 192 bytes
(though not less than that), but 64 is a bit too little. Also curious on
why not tc ingress instead?

Thanks,
Daniel
Felix Fietkau March 14, 2022, 10:43 p.m. UTC | #5
On 14.03.22 23:20, Daniel Borkmann wrote:
> On 3/14/22 11:16 PM, Toke Høiland-Jørgensen wrote:
>> Felix Fietkau <nbd@nbd.name> writes:
>>> On 14.03.22 21:39, Jesper D. Brouer wrote:
>>>> (Cc. BPF list and other XDP maintainers)
>>>> On 14/03/2022 11.22, Felix Fietkau wrote:
>>>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is
>>>>> rounded up to L1 cache size, it ends up being at least 64 bytes on the most
>>>>> common platforms.
>>>>> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP
>>>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can
>>>>> be quite expensive.
>>>>> Many XDP programs need only very little headroom, so it can be beneficial
>>>>> to have a way to opt-out of the 256 bytes headroom requirement.
>>>>
>>>> IMHO 64 bytes is too small.
>>>> We are using this area for struct xdp_frame and also for metadata
>>>> (XDP-hints).  This will limit us from growing this structures for
>>>> the sake of generic-XDP.
>>>>
>>>> I'm fine with reducting this to 192 bytes, as most Intel drivers
>>>> have this headroom, and have defacto established that this is
>>>> a valid XDP headroom, even for native-XDP.
>>>>
>>>> We could go a small as two cachelines 128 bytes, as if xdp_frame
>>>> and metadata grows above a cache-line (64 bytes) each, then we have
>>>> done something wrong (performance wise).
>>> Here's some background on why I chose 64 bytes: I'm currently
>>> implementing a userspace + xdp program to act as generic fastpath to
>>> speed network bridging.
>> 
>> Any reason this can't run in the TC ingress hook instead? Generic XDP is
>> a bit of an odd duck, and I'm not a huge fan of special-casing it this
>> way...
> 
> +1, would have been fine with generic reduction to just down to 192 bytes
> (though not less than that), but 64 is a bit too little. Also curious on
> why not tc ingress instead?
I chose XDP because of bpf_redirect_map, which doesn't seem to be 
available to tc ingress classifier programs.

When I started writing the code, I didn't know that generic XDP 
performance would be bad on pretty much any ethernet/WLAN driver that 
wasn't updated to support it.

- Felix
Jesper Dangaard Brouer March 15, 2022, 6:50 a.m. UTC | #6
On 14/03/2022 23.43, Felix Fietkau wrote:
> 
> On 14.03.22 23:20, Daniel Borkmann wrote:
>> On 3/14/22 11:16 PM, Toke Høiland-Jørgensen wrote:
>>> Felix Fietkau <nbd@nbd.name> writes:
>>>> On 14.03.22 21:39, Jesper D. Brouer wrote:
>>>>> (Cc. BPF list and other XDP maintainers)
>>>>> On 14/03/2022 11.22, Felix Fietkau wrote:
>>>>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. 
>>>>>> Since it is
>>>>>> rounded up to L1 cache size, it ends up being at least 64 bytes on 
>>>>>> the most
>>>>>> common platforms.
>>>>>> On most ethernet drivers, having a guaranteed headroom of 256 
>>>>>> bytes for XDP
>>>>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, 
>>>>>> which can
>>>>>> be quite expensive.
>>>>>> Many XDP programs need only very little headroom, so it can be 
>>>>>> beneficial
>>>>>> to have a way to opt-out of the 256 bytes headroom requirement.
>>>>>
>>>>> IMHO 64 bytes is too small.
>>>>> We are using this area for struct xdp_frame and also for metadata
>>>>> (XDP-hints).  This will limit us from growing this structures for
>>>>> the sake of generic-XDP.
>>>>>
>>>>> I'm fine with reducting this to 192 bytes, as most Intel drivers
>>>>> have this headroom, and have defacto established that this is
>>>>> a valid XDP headroom, even for native-XDP.
>>>>>
>>>>> We could go a small as two cachelines 128 bytes, as if xdp_frame
>>>>> and metadata grows above a cache-line (64 bytes) each, then we have
>>>>> done something wrong (performance wise).
 >>>>
>>>> Here's some background on why I chose 64 bytes: I'm currently
>>>> implementing a userspace + xdp program to act as generic fastpath to
>>>> speed network bridging.

Cc. Lorenzo as you mention you wanted to accelerate Linux bridging.
We can avoid a lot of external FIB sync in userspace, if we
add some BPF-helpers to lookup in bridge FIB table.

>>>
>>> Any reason this can't run in the TC ingress hook instead? Generic XDP is
>>> a bit of an odd duck, and I'm not a huge fan of special-casing it this
>>> way...
>>
>> +1, would have been fine with generic reduction to just down to 192 bytes
>> (though not less than that), but 64 is a bit too little. 

+1

>> Also curious on why not tc ingress instead?
 >
> I chose XDP because of bpf_redirect_map, which doesn't seem to be 
> available to tc ingress classifier programs.

TC have a "normal" bpf_redirect which is slower than a bpf_redirect_map.
The secret to the performance boost from bpf_redirect_map is the hidden
TX bulking layer.  I have experimented with TC bulking, which showed a
30% performance boost on TC redirect to TX with fixed 32 frame bulking.

Notice that newer libbpf makes it easier to attach to the TC hook
without shell'ing out to 'tc' binary. See how to use in this example:
 
https://github.com/xdp-project/bpf-examples/blob/master/tc-policy/tc_txq_policy.c


> When I started writing the code, I didn't know that generic XDP 
> performance would be bad on pretty much any ethernet/WLAN driver that 
> wasn't updated to support it.

Yes, we kind of kept generic-XDP non-optimized, as the real tool for
the job is TC-BPF, given that at this stage the SKB has already been
allocated, and converting it back to an XDP representation is not
going to be faster than TC-BPF.

Maybe we should optimize generic-XDP a bit more, because I'm
buying the argument, that developers want to write one BPF program
that works across device drivers, instead of having to handle
both TC-BPF and XDP-BPF.

Notice that generic-XDP doesn't actually do the bulking step, even
though using bpf_redirect_map. (would be obvious optimization).

If we extend kernel/bpf/devmap.c with bulking for SKBs, then both
TC-BPF and generic-XDP could share that code path, and obtain
the TX bulking performance boost via SKB-list + xmit_more.


--Jesper
Felix Fietkau March 15, 2022, 7:25 a.m. UTC | #7
On 15.03.22 07:50, Jesper Dangaard Brouer wrote:
> On 14/03/2022 23.43, Felix Fietkau wrote:
>> 
>> On 14.03.22 23:20, Daniel Borkmann wrote:
>>> On 3/14/22 11:16 PM, Toke Høiland-Jørgensen wrote:
>>>> Felix Fietkau <nbd@nbd.name> writes:
>>>>> On 14.03.22 21:39, Jesper D. Brouer wrote:
>>>>>> (Cc. BPF list and other XDP maintainers)
>>>>>> On 14/03/2022 11.22, Felix Fietkau wrote:
>>>>>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. 
>>>>>>> Since it is
>>>>>>> rounded up to L1 cache size, it ends up being at least 64 bytes on 
>>>>>>> the most
>>>>>>> common platforms.
>>>>>>> On most ethernet drivers, having a guaranteed headroom of 256 
>>>>>>> bytes for XDP
>>>>>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, 
>>>>>>> which can
>>>>>>> be quite expensive.
>>>>>>> Many XDP programs need only very little headroom, so it can be 
>>>>>>> beneficial
>>>>>>> to have a way to opt-out of the 256 bytes headroom requirement.
>>>>>>
>>>>>> IMHO 64 bytes is too small.
>>>>>> We are using this area for struct xdp_frame and also for metadata
>>>>>> (XDP-hints).  This will limit us from growing this structures for
>>>>>> the sake of generic-XDP.
>>>>>>
>>>>>> I'm fine with reducting this to 192 bytes, as most Intel drivers
>>>>>> have this headroom, and have defacto established that this is
>>>>>> a valid XDP headroom, even for native-XDP.
>>>>>>
>>>>>> We could go a small as two cachelines 128 bytes, as if xdp_frame
>>>>>> and metadata grows above a cache-line (64 bytes) each, then we have
>>>>>> done something wrong (performance wise).
>   >>>>
>>>>> Here's some background on why I chose 64 bytes: I'm currently
>>>>> implementing a userspace + xdp program to act as generic fastpath to
>>>>> speed network bridging.
> 
> Cc. Lorenzo as you mention you wanted to accelerate Linux bridging.
> We can avoid a lot of external FIB sync in userspace, if we
> add some BPF-helpers to lookup in bridge FIB table.
I talked to Lorenzo a lot about my approach already. If we do the FIB 
lookup from BPF and add all the necessary checks to handle the 
forwarding path, I believe the result is going to look a lot like the 
existing bridge code, and would not make it significantly faster.

My approach involves creating a map that caches src-mac + dest-mac + 
vlan "flows", along with the target port + vlan.
User space then takes care of flushing those entries based on FDB, port 
and config changes.
I already prototyped this with an in-kernel implementation directly in 
the bridge code, and it led to a 6-10% CPU usage reduction.
Preliminary tests indicate that with my user space + XDP implementation 
and the headroom modification, I'm getting comparable performance.

>>>> Any reason this can't run in the TC ingress hook instead? Generic XDP is
>>>> a bit of an odd duck, and I'm not a huge fan of special-casing it this
>>>> way...
>>>
>>> +1, would have been fine with generic reduction to just down to 192 bytes
>>> (though not less than that), but 64 is a bit too little. 
> 
> +1
> 
>>> Also curious on why not tc ingress instead?
>   >
>> I chose XDP because of bpf_redirect_map, which doesn't seem to be 
>> available to tc ingress classifier programs.
> 
> TC have a "normal" bpf_redirect which is slower than a bpf_redirect_map.
> The secret to the performance boost from bpf_redirect_map is the hidden
> TX bulking layer.  I have experimented with TC bulking, which showed a
> 30% performance boost on TC redirect to TX with fixed 32 frame bulking.
> 
> Notice that newer libbpf makes it easier to attach to the TC hook
> without shell'ing out to 'tc' binary. See how to use in this example:
>   
> https://github.com/xdp-project/bpf-examples/blob/master/tc-policy/tc_txq_policy.c
> 
> 
>> When I started writing the code, I didn't know that generic XDP 
>> performance would be bad on pretty much any ethernet/WLAN driver that 
>> wasn't updated to support it.
> 
> Yes, we kind of kept generic-XDP non-optimized as the real tool for
> the  job is TC-BPF, given at this stage the SKB have already been
> allocate, and converting it back to a XDP representation is not
> going to be faster that TC-BPF.
> 
> Maybe we should optimize generic-XDP a bit more, because I'm
> buying the argument, that developers want to write one BPF program
> that works across device drivers, instead of having to handle
> both TC-BPF and XDP-BPF.
> 
> Notice that generic-XDP doesn't actually do the bulking step, even
> though using bpf_redirect_map. (would be obvious optimization).
> 
> If we extend kernel/bpf.devmap.c with bulking for SKBs, then both
> TC-BPF and generic-XDP could share that code path, and obtain
> the TX bulking performance boost via SKB-list + xmit_more.
Thanks for the information. I guess I'll rewrite my code to use TC-BPF now.

- Felix
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0d994710b335..f6f270a5e301 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2274,6 +2274,7 @@  struct net_device {
 	bool			proto_down;
 	unsigned		wol_enabled:1;
 	unsigned		threaded:1;
+	unsigned		xdp_small_headroom:1;
 
 	struct list_head	net_notifier_list;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4eebea830613..7635dfb02313 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5688,6 +5688,7 @@  struct bpf_xdp_sock {
 };
 
 #define XDP_PACKET_HEADROOM 256
+#define XDP_PACKET_HEADROOM_SMALL 64
 
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index e003a0b9b4b2..acb996334910 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1275,11 +1275,13 @@  enum {
 #define XDP_FLAGS_DRV_MODE		(1U << 2)
 #define XDP_FLAGS_HW_MODE		(1U << 3)
 #define XDP_FLAGS_REPLACE		(1U << 4)
+#define XDP_FLAGS_SMALL_HEADROOM	(1U << 5)
 #define XDP_FLAGS_MODES			(XDP_FLAGS_SKB_MODE | \
 					 XDP_FLAGS_DRV_MODE | \
 					 XDP_FLAGS_HW_MODE)
 #define XDP_FLAGS_MASK			(XDP_FLAGS_UPDATE_IF_NOEXIST | \
-					 XDP_FLAGS_MODES | XDP_FLAGS_REPLACE)
+					 XDP_FLAGS_MODES | XDP_FLAGS_REPLACE | \
+					 XDP_FLAGS_SMALL_HEADROOM)
 
 /* These are stored into IFLA_XDP_ATTACHED on dump. */
 enum {
diff --git a/net/core/dev.c b/net/core/dev.c
index 8d25ec5b3af7..cb12379b8f11 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4722,6 +4722,7 @@  static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct xdp_buff *xdp,
 				     struct bpf_prog *xdp_prog)
 {
+	int min_headroom = XDP_PACKET_HEADROOM;
 	u32 act = XDP_DROP;
 
 	/* Reinjected packets coming from act_mirred or similar should
@@ -4730,12 +4731,15 @@  static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	if (skb_is_redirected(skb))
 		return XDP_PASS;
 
+	if (skb->dev->xdp_small_headroom)
+		min_headroom = XDP_PACKET_HEADROOM_SMALL;
+
 	/* XDP packets must be linear and must have sufficient headroom
 	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 	 * native XDP provides, thus we need to do it here as well.
 	 */
 	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
-	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+	    skb_headroom(skb) < min_headroom) {
 		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 		int troom = skb->tail + skb->data_len - skb->end;
 
@@ -9135,6 +9139,9 @@  static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 			return err;
 	}
 
+	if (mode == XDP_MODE_SKB)
+		dev->xdp_small_headroom = !!(flags & XDP_FLAGS_SMALL_HEADROOM);
+
 	if (link)
 		dev_xdp_set_link(dev, mode, link);
 	else
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index e1ba2d51b717..0df737a6c489 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -1185,11 +1185,13 @@  enum {
 #define XDP_FLAGS_DRV_MODE		(1U << 2)
 #define XDP_FLAGS_HW_MODE		(1U << 3)
 #define XDP_FLAGS_REPLACE		(1U << 4)
+#define XDP_FLAGS_SMALL_HEADROOM	(1U << 5)
 #define XDP_FLAGS_MODES			(XDP_FLAGS_SKB_MODE | \
 					 XDP_FLAGS_DRV_MODE | \
 					 XDP_FLAGS_HW_MODE)
 #define XDP_FLAGS_MASK			(XDP_FLAGS_UPDATE_IF_NOEXIST | \
-					 XDP_FLAGS_MODES | XDP_FLAGS_REPLACE)
+					 XDP_FLAGS_MODES | XDP_FLAGS_REPLACE | \
+					 XDP_FLAGS_SMALL_HEADROOM)
 
 /* These are stored into IFLA_XDP_ATTACHED on dump. */
 enum {