Message ID | 20220314102210.92329-1-nbd@nbd.name (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | BPF |
Headers | show |
Series | net: xdp: allow user space to request a smaller packet headroom requirement | expand |
Context | Check | Description |
---|---|---|
bpf/vmtest-bpf-next-PR | success | PR summary |
bpf/vmtest-bpf-next | success | VM_Test |
netdev/tree_selection | success | Not a local patch, async |
(Cc. BPF list and other XDP maintainers) On 14/03/2022 11.22, Felix Fietkau wrote: > Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is > rounded up to L1 cache size, it ends up being at least 64 bytes on the most > common platforms. > On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP > adds an extra forced pskb_expand_head call when enabling SKB XDP, which can > be quite expensive. > Many XDP programs need only very little headroom, so it can be beneficial > to have a way to opt-out of the 256 bytes headroom requirement. IMHO 64 bytes is too small. We are using this area for struct xdp_frame and also for metadata (XDP-hints). This will limit us from growing these structures for the sake of generic-XDP. I'm fine with reducing this to 192 bytes, as most Intel drivers have this headroom, and have de facto established that this is a valid XDP headroom, even for native-XDP. We could go as small as two cachelines (128 bytes), as if xdp_frame and metadata grow above a cache-line (64 bytes) each, then we have done something wrong (performance wise). > Add an extra flag XDP_FLAGS_SMALL_HEADROOM that can be set when attaching > the XDP program, which reduces the minimum headroom to 64 bytes. I don't like a flags approach. Multiple disadvantages. 
(a) Harder to use (b) Now reading a new cache-line in net_device > Signed-off-by: Felix Fietkau <nbd@nbd.name> > --- > include/linux/netdevice.h | 1 + > include/uapi/linux/bpf.h | 1 + > include/uapi/linux/if_link.h | 4 +++- > net/core/dev.c | 9 ++++++++- > tools/include/uapi/linux/if_link.h | 4 +++- > 5 files changed, 16 insertions(+), 3 deletions(-) > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 0d994710b335..f6f270a5e301 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -2274,6 +2274,7 @@ struct net_device { > bool proto_down; > unsigned wol_enabled:1; > unsigned threaded:1; > + unsigned xdp_small_headroom:1; > Looks like we need to read this cache-line, in a XDP (generic) fastpath. > struct list_head net_notifier_list; > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 4eebea830613..7635dfb02313 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -5688,6 +5688,7 @@ struct bpf_xdp_sock { > }; > > #define XDP_PACKET_HEADROOM 256 > +#define XDP_PACKET_HEADROOM_SMALL 64 Define it 192 instead. > > /* User return codes for XDP prog type. > * A valid XDP program must return one of these defined values. All other > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h > index e003a0b9b4b2..acb996334910 100644 > --- a/include/uapi/linux/if_link.h > +++ b/include/uapi/linux/if_link.h > @@ -1275,11 +1275,13 @@ enum { > #define XDP_FLAGS_DRV_MODE (1U << 2) > #define XDP_FLAGS_HW_MODE (1U << 3) > #define XDP_FLAGS_REPLACE (1U << 4) > +#define XDP_FLAGS_SMALL_HEADROOM (1U << 5) > #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ > XDP_FLAGS_DRV_MODE | \ > XDP_FLAGS_HW_MODE) > #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ > - XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) > + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE | \ > + XDP_FLAGS_SMALL_HEADROOM) > > /* These are stored into IFLA_XDP_ATTACHED on dump. 
*/ > enum { > diff --git a/net/core/dev.c b/net/core/dev.c > index 8d25ec5b3af7..cb12379b8f11 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -4722,6 +4722,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, > struct xdp_buff *xdp, > struct bpf_prog *xdp_prog) > { > + int min_headroom = XDP_PACKET_HEADROOM; > u32 act = XDP_DROP; > > /* Reinjected packets coming from act_mirred or similar should > @@ -4730,12 +4731,15 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, > if (skb_is_redirected(skb)) > return XDP_PASS; > > + if (skb->dev->xdp_small_headroom) > + min_headroom = XDP_PACKET_HEADROOM_SMALL; > + > /* XDP packets must be linear and must have sufficient headroom > * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also > * native XDP provides, thus we need to do it here as well. > */ > if (skb_cloned(skb) || skb_is_nonlinear(skb) || > - skb_headroom(skb) < XDP_PACKET_HEADROOM) { > + skb_headroom(skb) < min_headroom) { Use define XDP_PACKET_HEADROOM_SMALL here directly. 
> int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); > int troom = skb->tail + skb->data_len - skb->end; > > @@ -9135,6 +9139,9 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack > return err; > } > > + if (mode == XDP_MODE_SKB) > + dev->xdp_small_headroom = !!(flags & XDP_FLAGS_SMALL_HEADROOM); > + > if (link) > dev_xdp_set_link(dev, mode, link); > else > diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h > index e1ba2d51b717..0df737a6c489 100644 > --- a/tools/include/uapi/linux/if_link.h > +++ b/tools/include/uapi/linux/if_link.h > @@ -1185,11 +1185,13 @@ enum { > #define XDP_FLAGS_DRV_MODE (1U << 2) > #define XDP_FLAGS_HW_MODE (1U << 3) > #define XDP_FLAGS_REPLACE (1U << 4) > +#define XDP_FLAGS_SMALL_HEADROOM (1U << 5) > #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ > XDP_FLAGS_DRV_MODE | \ > XDP_FLAGS_HW_MODE) > #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ > - XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) > + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE | \ > + XDP_FLAGS_SMALL_HEADROOM) > > /* These are stored into IFLA_XDP_ATTACHED on dump. */ > enum { > --Jesper
On 14.03.22 21:39, Jesper D. Brouer wrote: > (Cc. BPF list and other XDP maintainers) > > On 14/03/2022 11.22, Felix Fietkau wrote: >> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is >> rounded up to L1 cache size, it ends up being at least 64 bytes on the most >> common platforms. >> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP >> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can >> be quite expensive. >> Many XDP programs need only very little headroom, so it can be beneficial >> to have a way to opt-out of the 256 bytes headroom requirement. > > IMHO 64 bytes is too small. > We are using this area for struct xdp_frame and also for metadata > (XDP-hints). This will limit us from growing this structures for > the sake of generic-XDP. > > I'm fine with reducting this to 192 bytes, as most Intel drivers > have this headroom, and have defacto established that this is > a valid XDP headroom, even for native-XDP. > > We could go a small as two cachelines 128 bytes, as if xdp_frame > and metadata grows above a cache-line (64 bytes) each, then we have > done something wrong (performance wise). Here's some background on why I chose 64 bytes: I'm currently implementing a userspace + xdp program to act as generic fastpath to speed network bridging. For that I need to support a very diverse set of network drivers (including a lot of WLAN drivers). My XDP program only needs up to 4 bytes extra headroom (for a VLAN header). I made this headroom reduction opt-in, so that by default generic-XDP programs can still rely on 256 bytes headroom. 
If we make the small version any bigger than 64 bytes, it limits my options to: 1) bump NET_SKB_PAD accordingly at the risk of creating issues with buffer management for some affected drivers (IMHO not likely to be accepted upstream) 2) create patches for each and every driver that could possibly get used on OpenWrt to make my approach viable (there's so many of them, so I think that's not really feasible either) 3) stick with non-upstream hacks for dealing with this in OpenWrt I don't really like any of those options, but I can't think of any other solution right now. If I take the pskb_expand_head hit from the headroom mismatch, the result is that my bridge accelerator code actually decreases performance instead of making anything better. - Felix
Felix Fietkau <nbd@nbd.name> writes: > On 14.03.22 21:39, Jesper D. Brouer wrote: >> (Cc. BPF list and other XDP maintainers) >> >> On 14/03/2022 11.22, Felix Fietkau wrote: >>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is >>> rounded up to L1 cache size, it ends up being at least 64 bytes on the most >>> common platforms. >>> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP >>> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can >>> be quite expensive. >>> Many XDP programs need only very little headroom, so it can be beneficial >>> to have a way to opt-out of the 256 bytes headroom requirement. >> >> IMHO 64 bytes is too small. >> We are using this area for struct xdp_frame and also for metadata >> (XDP-hints). This will limit us from growing this structures for >> the sake of generic-XDP. >> >> I'm fine with reducting this to 192 bytes, as most Intel drivers >> have this headroom, and have defacto established that this is >> a valid XDP headroom, even for native-XDP. >> >> We could go a small as two cachelines 128 bytes, as if xdp_frame >> and metadata grows above a cache-line (64 bytes) each, then we have >> done something wrong (performance wise). > Here's some background on why I chose 64 bytes: I'm currently > implementing a userspace + xdp program to act as generic fastpath to > speed network bridging. Any reason this can't run in the TC ingress hook instead? Generic XDP is a bit of an odd duck, and I'm not a huge fan of special-casing it this way... -Toke
On 3/14/22 11:16 PM, Toke Høiland-Jørgensen wrote: > Felix Fietkau <nbd@nbd.name> writes: >> On 14.03.22 21:39, Jesper D. Brouer wrote: >>> (Cc. BPF list and other XDP maintainers) >>> On 14/03/2022 11.22, Felix Fietkau wrote: >>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is >>>> rounded up to L1 cache size, it ends up being at least 64 bytes on the most >>>> common platforms. >>>> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP >>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can >>>> be quite expensive. >>>> Many XDP programs need only very little headroom, so it can be beneficial >>>> to have a way to opt-out of the 256 bytes headroom requirement. >>> >>> IMHO 64 bytes is too small. >>> We are using this area for struct xdp_frame and also for metadata >>> (XDP-hints). This will limit us from growing this structures for >>> the sake of generic-XDP. >>> >>> I'm fine with reducting this to 192 bytes, as most Intel drivers >>> have this headroom, and have defacto established that this is >>> a valid XDP headroom, even for native-XDP. >>> >>> We could go a small as two cachelines 128 bytes, as if xdp_frame >>> and metadata grows above a cache-line (64 bytes) each, then we have >>> done something wrong (performance wise). >> Here's some background on why I chose 64 bytes: I'm currently >> implementing a userspace + xdp program to act as generic fastpath to >> speed network bridging. > > Any reason this can't run in the TC ingress hook instead? Generic XDP is > a bit of an odd duck, and I'm not a huge fan of special-casing it this > way... +1, would have been fine with generic reduction to just down to 192 bytes (though not less than that), but 64 is a bit too little. Also curious on why not tc ingress instead? Thanks, Daniel
On 14.03.22 23:20, Daniel Borkmann wrote: > On 3/14/22 11:16 PM, Toke Høiland-Jørgensen wrote: >> Felix Fietkau <nbd@nbd.name> writes: >>> On 14.03.22 21:39, Jesper D. Brouer wrote: >>>> (Cc. BPF list and other XDP maintainers) >>>> On 14/03/2022 11.22, Felix Fietkau wrote: >>>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is >>>>> rounded up to L1 cache size, it ends up being at least 64 bytes on the most >>>>> common platforms. >>>>> On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP >>>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, which can >>>>> be quite expensive. >>>>> Many XDP programs need only very little headroom, so it can be beneficial >>>>> to have a way to opt-out of the 256 bytes headroom requirement. >>>> >>>> IMHO 64 bytes is too small. >>>> We are using this area for struct xdp_frame and also for metadata >>>> (XDP-hints). This will limit us from growing this structures for >>>> the sake of generic-XDP. >>>> >>>> I'm fine with reducting this to 192 bytes, as most Intel drivers >>>> have this headroom, and have defacto established that this is >>>> a valid XDP headroom, even for native-XDP. >>>> >>>> We could go a small as two cachelines 128 bytes, as if xdp_frame >>>> and metadata grows above a cache-line (64 bytes) each, then we have >>>> done something wrong (performance wise). >>> Here's some background on why I chose 64 bytes: I'm currently >>> implementing a userspace + xdp program to act as generic fastpath to >>> speed network bridging. >> >> Any reason this can't run in the TC ingress hook instead? Generic XDP is >> a bit of an odd duck, and I'm not a huge fan of special-casing it this >> way... > > +1, would have been fine with generic reduction to just down to 192 bytes > (though not less than that), but 64 is a bit too little. Also curious on > why not tc ingress instead? 
I chose XDP because of bpf_redirect_map, which doesn't seem to be available to tc ingress classifier programs. When I started writing the code, I didn't know that generic XDP performance would be bad on pretty much any ethernet/WLAN driver that wasn't updated to support it. - Felix
On 14/03/2022 23.43, Felix Fietkau wrote: > > On 14.03.22 23:20, Daniel Borkmann wrote: >> On 3/14/22 11:16 PM, Toke Høiland-Jørgensen wrote: >>> Felix Fietkau <nbd@nbd.name> writes: >>>> On 14.03.22 21:39, Jesper D. Brouer wrote: >>>>> (Cc. BPF list and other XDP maintainers) >>>>> On 14/03/2022 11.22, Felix Fietkau wrote: >>>>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. >>>>>> Since it is >>>>>> rounded up to L1 cache size, it ends up being at least 64 bytes on >>>>>> the most >>>>>> common platforms. >>>>>> On most ethernet drivers, having a guaranteed headroom of 256 >>>>>> bytes for XDP >>>>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, >>>>>> which can >>>>>> be quite expensive. >>>>>> Many XDP programs need only very little headroom, so it can be >>>>>> beneficial >>>>>> to have a way to opt-out of the 256 bytes headroom requirement. >>>>> >>>>> IMHO 64 bytes is too small. >>>>> We are using this area for struct xdp_frame and also for metadata >>>>> (XDP-hints). This will limit us from growing this structures for >>>>> the sake of generic-XDP. >>>>> >>>>> I'm fine with reducting this to 192 bytes, as most Intel drivers >>>>> have this headroom, and have defacto established that this is >>>>> a valid XDP headroom, even for native-XDP. >>>>> >>>>> We could go a small as two cachelines 128 bytes, as if xdp_frame >>>>> and metadata grows above a cache-line (64 bytes) each, then we have >>>>> done something wrong (performance wise). >>>> >>>> Here's some background on why I chose 64 bytes: I'm currently >>>> implementing a userspace + xdp program to act as generic fastpath to >>>> speed network bridging. Cc. Lorenzo as you mention you wanted to accelerate Linux bridging. We can avoid a lot of external FIB sync in userspace, if we add some BPF-helpers to lookup in bridge FIB table. >>> >>> Any reason this can't run in the TC ingress hook instead? 
Generic XDP is >>> a bit of an odd duck, and I'm not a huge fan of special-casing it this >>> way... >> >> +1, would have been fine with generic reduction to just down to 192 bytes >> (though not less than that), but 64 is a bit too little. +1 >> Also curious on why not tc ingress instead? > > I chose XDP because of bpf_redirect_map, which doesn't seem to be > available to tc ingress classifier programs. TC has a "normal" bpf_redirect which is slower than a bpf_redirect_map. The secret to the performance boost from bpf_redirect_map is the hidden TX bulking layer. I have experimented with TC bulking, which showed a 30% performance boost on TC redirect to TX with fixed 32 frame bulking. Notice that newer libbpf makes it easier to attach to the TC hook without shell'ing out to 'tc' binary. See how to use in this example: https://github.com/xdp-project/bpf-examples/blob/master/tc-policy/tc_txq_policy.c > When I started writing the code, I didn't know that generic XDP > performance would be bad on pretty much any ethernet/WLAN driver that > wasn't updated to support it. Yes, we kind of kept generic-XDP non-optimized as the real tool for the job is TC-BPF, given at this stage the SKB has already been allocated, and converting it back to an XDP representation is not going to be faster than TC-BPF. Maybe we should optimize generic-XDP a bit more, because I'm buying the argument, that developers want to write one BPF program that works across device drivers, instead of having to handle both TC-BPF and XDP-BPF. Notice that generic-XDP doesn't actually do the bulking step, even though using bpf_redirect_map. (would be obvious optimization). If we extend kernel/bpf.devmap.c with bulking for SKBs, then both TC-BPF and generic-XDP could share that code path, and obtain the TX bulking performance boost via SKB-list + xmit_more. --Jesper
On 15.03.22 07:50, Jesper Dangaard Brouer wrote: > On 14/03/2022 23.43, Felix Fietkau wrote: >> >> On 14.03.22 23:20, Daniel Borkmann wrote: >>> On 3/14/22 11:16 PM, Toke Høiland-Jørgensen wrote: >>>> Felix Fietkau <nbd@nbd.name> writes: >>>>> On 14.03.22 21:39, Jesper D. Brouer wrote: >>>>>> (Cc. BPF list and other XDP maintainers) >>>>>> On 14/03/2022 11.22, Felix Fietkau wrote: >>>>>>> Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. >>>>>>> Since it is >>>>>>> rounded up to L1 cache size, it ends up being at least 64 bytes on >>>>>>> the most >>>>>>> common platforms. >>>>>>> On most ethernet drivers, having a guaranteed headroom of 256 >>>>>>> bytes for XDP >>>>>>> adds an extra forced pskb_expand_head call when enabling SKB XDP, >>>>>>> which can >>>>>>> be quite expensive. >>>>>>> Many XDP programs need only very little headroom, so it can be >>>>>>> beneficial >>>>>>> to have a way to opt-out of the 256 bytes headroom requirement. >>>>>> >>>>>> IMHO 64 bytes is too small. >>>>>> We are using this area for struct xdp_frame and also for metadata >>>>>> (XDP-hints). This will limit us from growing this structures for >>>>>> the sake of generic-XDP. >>>>>> >>>>>> I'm fine with reducting this to 192 bytes, as most Intel drivers >>>>>> have this headroom, and have defacto established that this is >>>>>> a valid XDP headroom, even for native-XDP. >>>>>> >>>>>> We could go a small as two cachelines 128 bytes, as if xdp_frame >>>>>> and metadata grows above a cache-line (64 bytes) each, then we have >>>>>> done something wrong (performance wise). > >>>> >>>>> Here's some background on why I chose 64 bytes: I'm currently >>>>> implementing a userspace + xdp program to act as generic fastpath to >>>>> speed network bridging. > > Cc. Lorenzo as you mention you wanted to accelerate Linux bridging. > We can avoid a lot of external FIB sync in userspace, if we > add some BPF-helpers to lookup in bridge FIB table. 
I talked to Lorenzo a lot about my approach already. If we do the FIB lookup from BPF and add all the necessary checks to handle the forwarding path, I believe the result is going to look a lot like the existing bridge code, and would not make it significantly faster. My approach involves creating a map that caches src-mac + dest-mac + vlan "flows", along with the target port + vlan. User space then takes care of flushing those entries based on FDB, port and config changes. I already prototyped this with an in-kernel implementation directly in the bridge code, and it led to a 6-10% CPU usage reduction. Preliminary tests indicate that with my user space + XDP implementation and the headroom modification, I'm getting comparable performance. >>>> Any reason this can't run in the TC ingress hook instead? Generic XDP is >>>> a bit of an odd duck, and I'm not a huge fan of special-casing it this >>>> way... >>> >>> +1, would have been fine with generic reduction to just down to 192 bytes >>> (though not less than that), but 64 is a bit too little. > > +1 > >>> Also curious on why not tc ingress instead? > > >> I chose XDP because of bpf_redirect_map, which doesn't seem to be >> available to tc ingress classifier programs. > > TC have a "normal" bpf_redirect which is slower than a bpf_redirect_map. > The secret to the performance boost from bpf_redirect_map is the hidden > TX bulking layer. I have experimented with TC bulking, which showed a > 30% performance boost on TC redirect to TX with fixed 32 frame bulking. > > Notice that newer libbpf makes it easier to attach to the TC hook > without shell'ing out to 'tc' binary. See how to use in this example: > > https://github.com/xdp-project/bpf-examples/blob/master/tc-policy/tc_txq_policy.c > > >> When I started writing the code, I didn't know that generic XDP >> performance would be bad on pretty much any ethernet/WLAN driver that >> wasn't updated to support it. 
> > Yes, we kind of kept generic-XDP non-optimized as the real tool for > the job is TC-BPF, given at this stage the SKB have already been > allocate, and converting it back to a XDP representation is not > going to be faster that TC-BPF. > > Maybe we should optimize generic-XDP a bit more, because I'm > buying the argument, that developers want to write one BPF program > that works across device drivers, instead of having to handle > both TC-BPF and XDP-BPF. > > Notice that generic-XDP doesn't actually do the bulking step, even > though using bpf_redirect_map. (would be obvious optimization). > > If we extend kernel/bpf.devmap.c with bulking for SKBs, then both > TC-BPF and generic-XDP could share that code path, and obtain > the TX bulking performance boost via SKB-list + xmit_more. Thanks for the information. I guess I'll rewrite my code to use TC-BPF now. - Felix
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0d994710b335..f6f270a5e301 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2274,6 +2274,7 @@ struct net_device { bool proto_down; unsigned wol_enabled:1; unsigned threaded:1; + unsigned xdp_small_headroom:1; struct list_head net_notifier_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4eebea830613..7635dfb02313 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5688,6 +5688,7 @@ struct bpf_xdp_sock { }; #define XDP_PACKET_HEADROOM 256 +#define XDP_PACKET_HEADROOM_SMALL 64 /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e003a0b9b4b2..acb996334910 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1275,11 +1275,13 @@ enum { #define XDP_FLAGS_DRV_MODE (1U << 2) #define XDP_FLAGS_HW_MODE (1U << 3) #define XDP_FLAGS_REPLACE (1U << 4) +#define XDP_FLAGS_SMALL_HEADROOM (1U << 5) #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE | \ XDP_FLAGS_HW_MODE) #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ - XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE | \ + XDP_FLAGS_SMALL_HEADROOM) /* These are stored into IFLA_XDP_ATTACHED on dump. 
*/ enum { diff --git a/net/core/dev.c b/net/core/dev.c index 8d25ec5b3af7..cb12379b8f11 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4722,6 +4722,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + int min_headroom = XDP_PACKET_HEADROOM; u32 act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should @@ -4730,12 +4731,15 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_is_redirected(skb)) return XDP_PASS; + if (skb->dev->xdp_small_headroom) + min_headroom = XDP_PACKET_HEADROOM_SMALL; + /* XDP packets must be linear and must have sufficient headroom * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also * native XDP provides, thus we need to do it here as well. */ if (skb_cloned(skb) || skb_is_nonlinear(skb) || - skb_headroom(skb) < XDP_PACKET_HEADROOM) { + skb_headroom(skb) < min_headroom) { int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); int troom = skb->tail + skb->data_len - skb->end; @@ -9135,6 +9139,9 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return err; } + if (mode == XDP_MODE_SKB) + dev->xdp_small_headroom = !!(flags & XDP_FLAGS_SMALL_HEADROOM); + if (link) dev_xdp_set_link(dev, mode, link); else diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index e1ba2d51b717..0df737a6c489 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -1185,11 +1185,13 @@ enum { #define XDP_FLAGS_DRV_MODE (1U << 2) #define XDP_FLAGS_HW_MODE (1U << 3) #define XDP_FLAGS_REPLACE (1U << 4) +#define XDP_FLAGS_SMALL_HEADROOM (1U << 5) #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE | \ XDP_FLAGS_HW_MODE) #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ - XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE | \ + XDP_FLAGS_SMALL_HEADROOM) /* These are stored into IFLA_XDP_ATTACHED on dump. */ enum {
Most ethernet drivers allocate a packet headroom of NET_SKB_PAD. Since it is rounded up to L1 cache size, it ends up being at least 64 bytes on the most common platforms. On most ethernet drivers, having a guaranteed headroom of 256 bytes for XDP adds an extra forced pskb_expand_head call when enabling SKB XDP, which can be quite expensive. Many XDP programs need only very little headroom, so it can be beneficial to have a way to opt-out of the 256 bytes headroom requirement. Add an extra flag XDP_FLAGS_SMALL_HEADROOM that can be set when attaching the XDP program, which reduces the minimum headroom to 64 bytes. Signed-off-by: Felix Fietkau <nbd@nbd.name> --- include/linux/netdevice.h | 1 + include/uapi/linux/bpf.h | 1 + include/uapi/linux/if_link.h | 4 +++- net/core/dev.c | 9 ++++++++- tools/include/uapi/linux/if_link.h | 4 +++- 5 files changed, 16 insertions(+), 3 deletions(-)