[v5,net-next,3/3] xdp: add multi-buff support for xdp running in generic mode

Message ID: e73a75e0d0f81a3b20568675829df4763fa0d389.1702563810.git.lorenzo@kernel.org (mailing list archive)
State: Changes Requested
Delegated to: Netdev Maintainers
Series: add multi-buff support for xdp running in generic mode

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1123 this patch: 1123
netdev/cc_maintainers warning 3 maintainers not CCed: ast@kernel.org daniel@iogearbox.net john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 1143 this patch: 1143
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1150 this patch: 1150
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 199 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Lorenzo Bianconi Dec. 14, 2023, 2:29 p.m. UTC
Similar to native xdp, do not always linearize the skb in the
netif_receive_generic_xdp routine, but create a non-linear xdp_buff to be
processed by the eBPF program. This allows adding multi-buffer support
for xdp running in generic mode.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 net/core/dev.c | 153 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 19 deletions(-)
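
For context, the generic path below only keeps the skb non-linear when the
attached program declares frags support (prog->aux->xdp_has_frags). A minimal
user-side sketch of such a program — assuming libbpf's "xdp.frags" section name
and the bpf_xdp_get_buff_len() / bpf_xdp_load_bytes() helpers, none of which are
part of this patch:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	/* Placing the program in the "xdp.frags" section makes libbpf set
	 * BPF_F_XDP_HAS_FRAGS at load time, which is what the kernel-side
	 * prog->aux->xdp_has_frags checks in this patch reflect.
	 */
	SEC("xdp.frags")
	int xdp_mb_example(struct xdp_md *ctx)
	{
		__u8 eth[14];
		/* Length of the whole buffer, fragments included. */
		__u64 len = bpf_xdp_get_buff_len(ctx);

		/* Frag-safe read: valid even across fragment boundaries. */
		if (bpf_xdp_load_bytes(ctx, 0, eth, sizeof(eth)) < 0)
			return XDP_DROP;

		return len ? XDP_PASS : XDP_DROP;
	}

	char LICENSE[] SEC("license") = "GPL";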

Comments

Jesper Dangaard Brouer Dec. 20, 2023, 4:01 p.m. UTC | #1
On 14/12/2023 15.29, Lorenzo Bianconi wrote:
> Similar to native xdp, do not always linearize the skb in
> netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
> processed by the eBPF program. This allow to add  multi-buffer support
> for xdp running in generic mode.
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>   net/core/dev.c | 153 +++++++++++++++++++++++++++++++++++++++++++------
>   1 file changed, 134 insertions(+), 19 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index d7857de03dba..47164acc3268 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4854,6 +4854,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>   	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
>   	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
>   			 skb_headlen(skb) + mac_len, true);
> +	if (skb_is_nonlinear(skb)) {
> +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> +		xdp_buff_set_frags_flag(xdp);
> +	} else {
> +		xdp_buff_clear_frags_flag(xdp);
> +	}
>   
>   	orig_data_end = xdp->data_end;
>   	orig_data = xdp->data;
> @@ -4883,6 +4889,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>   		skb->len += off; /* positive on grow, negative on shrink */
>   	}
>   
> +	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
> +	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
> +	 */
> +	if (xdp_buff_has_frags(xdp))
> +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> +	else
> +		skb->data_len = 0;
> +
>   	/* check if XDP changed eth hdr such SKB needs update */
>   	eth = (struct ethhdr *)xdp->data;
>   	if ((orig_eth_type != eth->h_proto) ||
> @@ -4916,12 +4930,118 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>   	return act;
>   }
>   
> +static int netif_skb_segment_for_xdp(struct sk_buff **pskb,

This function "...segment_for_xdp" always reallocates the SKB and copies all
bits over.
Should it have been named "skb_realloc_for_xdp"?

I was really hoping we could find a design to avoid doing this realloc.

If the BPF-prog doesn't write into any of the fragments, then we can
avoid this realloc (+copy) dance. We designed XDP multi-buff to have
exactly the same layout+location as the SKB's skb_shared_info, exactly to
avoid having to reallocate.
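
As a rough sketch of the kind of fast-path check that could let generic XDP
reuse the existing skb_shared_info layout and skip the realloc (hypothetical
helper, not part of this patch):

	static bool generic_xdp_can_reuse_skb(const struct sk_buff *skb,
					      const struct bpf_prog *prog)
	{
		/* Headroom must already match what native XDP guarantees. */
		if (skb_headroom(skb) < XDP_PACKET_HEADROOM)
			return false;
		/* A cloned/shared skb cannot be handed to a writable XDP program. */
		if (skb_cloned(skb) || skb_shared(skb))
			return false;
		/* XDP multi-buff has no fraglist equivalent. */
		if (skb_has_frag_list(skb))
			return false;
		/* Frags are only usable if the program declared frags support. */
		if (skb_is_nonlinear(skb) && !prog->aux->xdp_has_frags)
			return false;
		return true;
	}

Only when such a check fails would the realloc+copy fallback be needed.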

More comments inline below...

> +				     struct bpf_prog *prog)
> +{
> +#if IS_ENABLED(CONFIG_PAGE_POOL)
> +	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
> +	u32 size, truesize, len, max_head_size, off;
> +	struct sk_buff *skb = *pskb, *nskb;
> +	int err, i, head_off;
> +	void *data;
> +
> +	/* XDP does not support fraglist so we need to linearize
> +	 * the skb.
> +	 */
> +	if (skb_has_frag_list(skb) || !prog->aux->xdp_has_frags)
> +		return -EOPNOTSUPP;
> +
> +	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - XDP_PACKET_HEADROOM);
> +	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
> +		return -ENOMEM;
> +
> +	size = min_t(u32, skb->len, max_head_size);
> +	truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
> +	data = page_pool_dev_alloc_va(sd->page_pool, &truesize);
> +	if (!data)
> +		return -ENOMEM;
> +
> +	nskb = napi_build_skb(data, truesize);
> +	if (!nskb) {
> +		page_pool_free_va(sd->page_pool, data, true);
> +		return -ENOMEM;
> +	}
> +
> +	skb_reserve(nskb, XDP_PACKET_HEADROOM);
> +	skb_copy_header(nskb, skb);
> +	skb_mark_for_recycle(nskb);
> +
> +	err = skb_copy_bits(skb, 0, nskb->data, size);

This will likely copy part of the "frags" into the SKB "head" area.

Especially for netstack generated TCP packets, this will change the
segmentation layout significantly.  I wonder what (performance) effects
this will have on further handling of these SKBs.



> +	if (err) {
> +		consume_skb(nskb);
> +		return err;
> +	}
> +	skb_put(nskb, size);
> +
> +	head_off = skb_headroom(nskb) - skb_headroom(skb);
> +	skb_headers_offset_update(nskb, head_off);
> +
> +	off = size;
> +	len = skb->len - off;
> +	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
> +		struct page *page;
> +		u32 page_off;
> +
> +		size = min_t(u32, len, PAGE_SIZE);
> +		truesize = size;
> +
> +		page = page_pool_dev_alloc(sd->page_pool, &page_off,
> +					   &truesize);
> +		if (!data) {
> +			consume_skb(nskb);
> +			return -ENOMEM;
> +		}
> +
> +		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
> +		err = skb_copy_bits(skb, off, page_address(page) + page_off,
> +				    size);

I think it is correct, but we can easily end up with the new SKB (nskb)
having a different nskb->nr_frags.


> +		if (err) {
> +			consume_skb(nskb);
> +			return err;
> +		}
> +
> +		len -= size;
> +		off += size;
> +	}
> +
> +	consume_skb(skb);
> +	*pskb = nskb;
> +
> +	return 0;
> +#else
> +	return -EOPNOTSUPP;
> +#endif
> +}
> +
> +static int netif_skb_check_for_xdp(struct sk_buff **pskb,
> +				   struct bpf_prog *prog)
> +{
> +	struct sk_buff *skb = *pskb;
> +	int err, hroom, troom;
> +
> +	if (!netif_skb_segment_for_xdp(pskb, prog))
> +		return 0;

IMHO the code's call logic does not make it easy to add cases where we
can avoid the realloc.  With this patch, it feels like the realloc+copy
code path is the "main" code path for XDP-generic.

Our goal should be to avoid realloc.

My goal for XDP multi-buff was/is that it can co-exist with GSO/GRO
packets.  This patchset is a step in the direction of enabling GRO on
devices with XDP (generic) loaded.  And I was really excited about this,
but the overhead is going to be so massive compared to normal GRO (without
realloc+copy) that XDP end-users are going to be disappointed.


> +
> +	/* In case we have to go down the path and also linearize,
> +	 * then lets do the pskb_expand_head() work just once here.
> +	 */
> +	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
> +	troom = skb->tail + skb->data_len - skb->end;
> +	err = pskb_expand_head(skb,
> +			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> +			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
> +	if (err)
> +		return err;
> +
> +	return skb_linearize(skb);
> +}
> +
>   static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
>   				     struct xdp_buff *xdp,
>   				     struct bpf_prog *xdp_prog)
>   {
>   	struct sk_buff *skb = *pskb;
> -	u32 act = XDP_DROP;
> +	u32 mac_len, act = XDP_DROP;
>   
>   	/* Reinjected packets coming from act_mirred or similar should
>   	 * not get XDP generic processing.
> @@ -4929,41 +5049,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
>   	if (skb_is_redirected(skb))
>   		return XDP_PASS;
>   
> -	/* XDP packets must be linear and must have sufficient headroom
> -	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
> -	 * native XDP provides, thus we need to do it here as well.
> +	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
> +	 * bytes. This is the guarantee that also native XDP provides,
> +	 * thus we need to do it here as well.

Some "native" XDP providers only have 192 bytes as headroom, and XDP code
can handle this not being static (256 bytes).  So, perhaps it is time to
allow XDP generic to only require 192 bytes?
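
A hedged sketch of that relaxation (the 192-byte constant and helper name are
hypothetical, not an existing define):

	/* Hypothetical: mirrors the smaller headroom some native drivers provide. */
	#define GENERIC_XDP_MIN_HEADROOM	192

	static inline bool generic_xdp_headroom_ok(const struct sk_buff *skb)
	{
		/* Accept this instead of hard-requiring XDP_PACKET_HEADROOM (256). */
		return skb_headroom(skb) >= GENERIC_XDP_MIN_HEADROOM;
	}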

>   	 */
> +	mac_len = skb->data - skb_mac_header(skb);
> +	__skb_push(skb, mac_len);
> +
>   	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
>   	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
> -		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
> -		int troom = skb->tail + skb->data_len - skb->end;
> -
> -		/* In case we have to go down the path and also linearize,
> -		 * then lets do the pskb_expand_head() work just once here.
> -		 */
> -		if (pskb_expand_head(skb,
> -				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> -				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
> -			goto do_drop;
> -		if (skb_linearize(skb))
> +		if (netif_skb_check_for_xdp(pskb, xdp_prog))
>   			goto do_drop;
>   	}
>   
> -	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
> +	__skb_pull(*pskb, mac_len);
> +
> +	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
>   	switch (act) {
>   	case XDP_REDIRECT:
>   	case XDP_TX:
>   	case XDP_PASS:
>   		break;
>   	default:
> -		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
> +		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
>   		fallthrough;
>   	case XDP_ABORTED:
> -		trace_xdp_exception(skb->dev, xdp_prog, act);
> +		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
>   		fallthrough;
>   	case XDP_DROP:
>   	do_drop:
> -		kfree_skb(skb);
> +		kfree_skb(*pskb);
>   		break;
>   	}
>
Lorenzo Bianconi Dec. 21, 2023, 8:23 a.m. UTC | #2
> 
> 
> On 14/12/2023 15.29, Lorenzo Bianconi wrote:
> > Similar to native xdp, do not always linearize the skb in
> > netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
> > processed by the eBPF program. This allow to add  multi-buffer support
> > for xdp running in generic mode.
> > 
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >   net/core/dev.c | 153 +++++++++++++++++++++++++++++++++++++++++++------
> >   1 file changed, 134 insertions(+), 19 deletions(-)
> > 
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index d7857de03dba..47164acc3268 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -4854,6 +4854,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
> >   	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
> >   	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
> >   			 skb_headlen(skb) + mac_len, true);
> > +	if (skb_is_nonlinear(skb)) {
> > +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> > +		xdp_buff_set_frags_flag(xdp);
> > +	} else {
> > +		xdp_buff_clear_frags_flag(xdp);
> > +	}
> >   	orig_data_end = xdp->data_end;
> >   	orig_data = xdp->data;
> > @@ -4883,6 +4889,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
> >   		skb->len += off; /* positive on grow, negative on shrink */
> >   	}
> > +	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
> > +	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
> > +	 */
> > +	if (xdp_buff_has_frags(xdp))
> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> > +	else
> > +		skb->data_len = 0;
> > +
> >   	/* check if XDP changed eth hdr such SKB needs update */
> >   	eth = (struct ethhdr *)xdp->data;
> >   	if ((orig_eth_type != eth->h_proto) ||
> > @@ -4916,12 +4930,118 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
> >   	return act;
> >   }
> > +static int netif_skb_segment_for_xdp(struct sk_buff **pskb,
> 
> This function "...segment_for_xdp" always reallocate SKB and copies all
> bits over.
> Should it have been named "skb_realloc_for_xdp" ?

Hi Jesper,

ack, naming is always hard :)

> 
> I was really hopeing we can find a design to avoid doing this realloc.
> 
> If the BPF-prog doesn't write into any of the fragments, then we can
> avoid this realloc (+copy) dance. We designed XDP multi-buff to have
> exactly the same layout+location as SKB in skb_shared_info, exactly to
> avoid having to reallocated.

I 100% agree with you, but we will need a similar copy fallback approach
anyway, right? It is just a matter of understanding whether we should
implement it with page_pool or page_frag_cache (or something different).
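
For reference, a hedged sketch of what the page_frag_cache variant could look
like for allocating the new head (illustrative only; the posted patch uses the
per-CPU page_pool instead, and the helper name is made up):

	static struct sk_buff *xdp_alloc_head_from_frag_cache(u32 size)
	{
		u32 truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
		void *data = netdev_alloc_frag(truesize);
		struct sk_buff *nskb;

		if (!data)
			return NULL;

		nskb = napi_build_skb(data, truesize);
		if (!nskb) {
			skb_free_frag(data);
			return NULL;
		}

		/* No skb_mark_for_recycle(): this memory is not page_pool backed. */
		skb_reserve(nskb, XDP_PACKET_HEADROOM);
		return nskb;
	}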

> 
> More comments inline below...
> 
> > +				     struct bpf_prog *prog)
> > +{
> > +#if IS_ENABLED(CONFIG_PAGE_POOL)
> > +	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
> > +	u32 size, truesize, len, max_head_size, off;
> > +	struct sk_buff *skb = *pskb, *nskb;
> > +	int err, i, head_off;
> > +	void *data;
> > +
> > +	/* XDP does not support fraglist so we need to linearize
> > +	 * the skb.
> > +	 */
> > +	if (skb_has_frag_list(skb) || !prog->aux->xdp_has_frags)
> > +		return -EOPNOTSUPP;
> > +
> > +	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - XDP_PACKET_HEADROOM);
> > +	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
> > +		return -ENOMEM;
> > +
> > +	size = min_t(u32, skb->len, max_head_size);
> > +	truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
> > +	data = page_pool_dev_alloc_va(sd->page_pool, &truesize);
> > +	if (!data)
> > +		return -ENOMEM;
> > +
> > +	nskb = napi_build_skb(data, truesize);
> > +	if (!nskb) {
> > +		page_pool_free_va(sd->page_pool, data, true);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	skb_reserve(nskb, XDP_PACKET_HEADROOM);
> > +	skb_copy_header(nskb, skb);
> > +	skb_mark_for_recycle(nskb);
> > +
> > +	err = skb_copy_bits(skb, 0, nskb->data, size);
> 
> This will likely copy part of the "frags" into the SKB "head" area.
> 
> Especially for netstack generated TCP packets, this will change the
> segmentation layout significantly.  I wonder what (performance) effects
> this will have on further handling of these SKBs.

Do you think it can be a problem? I think in this way we can reduce the
number of allocated pages. Moreover, what about the case when the skb head is
bigger than a single page? Do you think we should use a bigger allocation order?

> 
> 
> 
> > +	if (err) {
> > +		consume_skb(nskb);
> > +		return err;
> > +	}
> > +	skb_put(nskb, size);
> > +
> > +	head_off = skb_headroom(nskb) - skb_headroom(skb);
> > +	skb_headers_offset_update(nskb, head_off);
> > +
> > +	off = size;
> > +	len = skb->len - off;
> > +	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
> > +		struct page *page;
> > +		u32 page_off;
> > +
> > +		size = min_t(u32, len, PAGE_SIZE);
> > +		truesize = size;
> > +
> > +		page = page_pool_dev_alloc(sd->page_pool, &page_off,
> > +					   &truesize);
> > +		if (!data) {
> > +			consume_skb(nskb);
> > +			return -ENOMEM;
> > +		}
> > +
> > +		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
> > +		err = skb_copy_bits(skb, off, page_address(page) + page_off,
> > +				    size);
> 
> I think it is correct, but we can easily endup with the new SKB (nskb)
> having a different nskb->nr_frags.

see above

> 
> 
> > +		if (err) {
> > +			consume_skb(nskb);
> > +			return err;
> > +		}
> > +
> > +		len -= size;
> > +		off += size;
> > +	}
> > +
> > +	consume_skb(skb);
> > +	*pskb = nskb;
> > +
> > +	return 0;
> > +#else
> > +	return -EOPNOTSUPP;
> > +#endif
> > +}
> > +
> > +static int netif_skb_check_for_xdp(struct sk_buff **pskb,
> > +				   struct bpf_prog *prog)
> > +{
> > +	struct sk_buff *skb = *pskb;
> > +	int err, hroom, troom;
> > +
> > +	if (!netif_skb_segment_for_xdp(pskb, prog))
> > +		return 0;
> 
> IMHO the code call logic, does not make it easy to add cases where we
> can avoid the realloc.  With this patch, it feels like the realloc+copy
> code path is the "main" code path for XDP-generic.

ack, I think it would depend on the logic to avoid realloc+copy. We could
re-evaluate it when we have this logic in place. What do you think?

> 
> Our goal should be to avoid realloc.
> 
> My goal for XDP multi-buff was/is that it can co-exist with GSO/GRO
> packets.  This patchset is a step in the direction of enabling GRO on
> devices with XDP (generic) loaded.  And I was really excited about this,
> but the overhead is going to be massive compared to normal GRO (without
> realloc+copy) that XDP end-users are going to be disappointed.
> 
> 
> > +
> > +	/* In case we have to go down the path and also linearize,
> > +	 * then lets do the pskb_expand_head() work just once here.
> > +	 */
> > +	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
> > +	troom = skb->tail + skb->data_len - skb->end;
> > +	err = pskb_expand_head(skb,
> > +			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> > +			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
> > +	if (err)
> > +		return err;
> > +
> > +	return skb_linearize(skb);
> > +}
> > +
> >   static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
> >   				     struct xdp_buff *xdp,
> >   				     struct bpf_prog *xdp_prog)
> >   {
> >   	struct sk_buff *skb = *pskb;
> > -	u32 act = XDP_DROP;
> > +	u32 mac_len, act = XDP_DROP;
> >   	/* Reinjected packets coming from act_mirred or similar should
> >   	 * not get XDP generic processing.
> > @@ -4929,41 +5049,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
> >   	if (skb_is_redirected(skb))
> >   		return XDP_PASS;
> > -	/* XDP packets must be linear and must have sufficient headroom
> > -	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
> > -	 * native XDP provides, thus we need to do it here as well.
> > +	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
> > +	 * bytes. This is the guarantee that also native XDP provides,
> > +	 * thus we need to do it here as well.
> 
> Some "native" XDP provider only have 192 bytes as HEADROOM and XDP code
> can this not being static (256 bytes).  So, perhaps it is time to allow
> XDP generic to only require 192 bytes?

ack, agree. I think this can be added as a separate patch.

Regards,
Lorenzo

> 
> >   	 */
> > +	mac_len = skb->data - skb_mac_header(skb);
> > +	__skb_push(skb, mac_len);
> > +
> >   	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
> >   	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
> > -		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
> > -		int troom = skb->tail + skb->data_len - skb->end;
> > -
> > -		/* In case we have to go down the path and also linearize,
> > -		 * then lets do the pskb_expand_head() work just once here.
> > -		 */
> > -		if (pskb_expand_head(skb,
> > -				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> > -				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
> > -			goto do_drop;
> > -		if (skb_linearize(skb))
> > +		if (netif_skb_check_for_xdp(pskb, xdp_prog))
> >   			goto do_drop;
> >   	}
> > -	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
> > +	__skb_pull(*pskb, mac_len);
> > +
> > +	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
> >   	switch (act) {
> >   	case XDP_REDIRECT:
> >   	case XDP_TX:
> >   	case XDP_PASS:
> >   		break;
> >   	default:
> > -		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
> > +		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
> >   		fallthrough;
> >   	case XDP_ABORTED:
> > -		trace_xdp_exception(skb->dev, xdp_prog, act);
> > +		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
> >   		fallthrough;
> >   	case XDP_DROP:
> >   	do_drop:
> > -		kfree_skb(skb);
> > +		kfree_skb(*pskb);
> >   		break;
> >   	}

Patch

diff --git a/net/core/dev.c b/net/core/dev.c
index d7857de03dba..47164acc3268 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4854,6 +4854,12 @@  u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
 	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
 			 skb_headlen(skb) + mac_len, true);
+	if (skb_is_nonlinear(skb)) {
+		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+		xdp_buff_set_frags_flag(xdp);
+	} else {
+		xdp_buff_clear_frags_flag(xdp);
+	}
 
 	orig_data_end = xdp->data_end;
 	orig_data = xdp->data;
@@ -4883,6 +4889,14 @@  u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 		skb->len += off; /* positive on grow, negative on shrink */
 	}
 
+	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+	 */
+	if (xdp_buff_has_frags(xdp))
+		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+	else
+		skb->data_len = 0;
+
 	/* check if XDP changed eth hdr such SKB needs update */
 	eth = (struct ethhdr *)xdp->data;
 	if ((orig_eth_type != eth->h_proto) ||
@@ -4916,12 +4930,118 @@  u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	return act;
 }
 
+static int netif_skb_segment_for_xdp(struct sk_buff **pskb,
+				     struct bpf_prog *prog)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+	u32 size, truesize, len, max_head_size, off;
+	struct sk_buff *skb = *pskb, *nskb;
+	int err, i, head_off;
+	void *data;
+
+	/* XDP does not support fraglist so we need to linearize
+	 * the skb.
+	 */
+	if (skb_has_frag_list(skb) || !prog->aux->xdp_has_frags)
+		return -EOPNOTSUPP;
+
+	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - XDP_PACKET_HEADROOM);
+	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
+		return -ENOMEM;
+
+	size = min_t(u32, skb->len, max_head_size);
+	truesize = SKB_HEAD_ALIGN(size) + XDP_PACKET_HEADROOM;
+	data = page_pool_dev_alloc_va(sd->page_pool, &truesize);
+	if (!data)
+		return -ENOMEM;
+
+	nskb = napi_build_skb(data, truesize);
+	if (!nskb) {
+		page_pool_free_va(sd->page_pool, data, true);
+		return -ENOMEM;
+	}
+
+	skb_reserve(nskb, XDP_PACKET_HEADROOM);
+	skb_copy_header(nskb, skb);
+	skb_mark_for_recycle(nskb);
+
+	err = skb_copy_bits(skb, 0, nskb->data, size);
+	if (err) {
+		consume_skb(nskb);
+		return err;
+	}
+	skb_put(nskb, size);
+
+	head_off = skb_headroom(nskb) - skb_headroom(skb);
+	skb_headers_offset_update(nskb, head_off);
+
+	off = size;
+	len = skb->len - off;
+	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+		struct page *page;
+		u32 page_off;
+
+		size = min_t(u32, len, PAGE_SIZE);
+		truesize = size;
+
+		page = page_pool_dev_alloc(sd->page_pool, &page_off,
+					   &truesize);
+		if (!data) {
+			consume_skb(nskb);
+			return -ENOMEM;
+		}
+
+		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
+		err = skb_copy_bits(skb, off, page_address(page) + page_off,
+				    size);
+		if (err) {
+			consume_skb(nskb);
+			return err;
+		}
+
+		len -= size;
+		off += size;
+	}
+
+	consume_skb(skb);
+	*pskb = nskb;
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int netif_skb_check_for_xdp(struct sk_buff **pskb,
+				   struct bpf_prog *prog)
+{
+	struct sk_buff *skb = *pskb;
+	int err, hroom, troom;
+
+	if (!netif_skb_segment_for_xdp(pskb, prog))
+		return 0;
+
+	/* In case we have to go down the path and also linearize,
+	 * then lets do the pskb_expand_head() work just once here.
+	 */
+	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+	troom = skb->tail + skb->data_len - skb->end;
+	err = pskb_expand_head(skb,
+			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+	if (err)
+		return err;
+
+	return skb_linearize(skb);
+}
+
 static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 				     struct xdp_buff *xdp,
 				     struct bpf_prog *xdp_prog)
 {
 	struct sk_buff *skb = *pskb;
-	u32 act = XDP_DROP;
+	u32 mac_len, act = XDP_DROP;
 
 	/* Reinjected packets coming from act_mirred or similar should
 	 * not get XDP generic processing.
@@ -4929,41 +5049,36 @@  static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 	if (skb_is_redirected(skb))
 		return XDP_PASS;
 
-	/* XDP packets must be linear and must have sufficient headroom
-	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
-	 * native XDP provides, thus we need to do it here as well.
+	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+	 * bytes. This is the guarantee that also native XDP provides,
+	 * thus we need to do it here as well.
 	 */
+	mac_len = skb->data - skb_mac_header(skb);
+	__skb_push(skb, mac_len);
+
 	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
-		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
-		int troom = skb->tail + skb->data_len - skb->end;
-
-		/* In case we have to go down the path and also linearize,
-		 * then lets do the pskb_expand_head() work just once here.
-		 */
-		if (pskb_expand_head(skb,
-				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
-				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
-			goto do_drop;
-		if (skb_linearize(skb))
+		if (netif_skb_check_for_xdp(pskb, xdp_prog))
 			goto do_drop;
 	}
 
-	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+	__skb_pull(*pskb, mac_len);
+
+	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
 	switch (act) {
 	case XDP_REDIRECT:
 	case XDP_TX:
 	case XDP_PASS:
 		break;
 	default:
-		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
+		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
 		fallthrough;
 	case XDP_ABORTED:
-		trace_xdp_exception(skb->dev, xdp_prog, act);
+		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
 		fallthrough;
 	case XDP_DROP:
 	do_drop:
-		kfree_skb(skb);
+		kfree_skb(*pskb);
 		break;
 	}