diff mbox series

[net-next] xdp: add multi-buff support for xdp running in generic mode

Message ID c928f7c698de070b33d38f230081fd4f993f2567.1701128026.git.lorenzo@kernel.org (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series [net-next] xdp: add multi-buff support for xdp running in generic mode | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/codegen success Generated files up to date
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1123 this patch: 1123
netdev/cc_maintainers warning 3 maintainers not CCed: ast@kernel.org daniel@iogearbox.net john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 1143 this patch: 1143
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1150 this patch: 1150
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 52 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Lorenzo Bianconi Nov. 27, 2023, 11:38 p.m. UTC
Similar to native xdp, do not always linearize the skb in
netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
processed by the eBPF program. This allow to add multi-buffer support
for xdp running in generic mode.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 net/core/dev.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

Comments

Toke Høiland-Jørgensen Nov. 28, 2023, 9:45 a.m. UTC | #1
Lorenzo Bianconi <lorenzo@kernel.org> writes:

> Similar to native xdp, do not always linearize the skb in
> netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
> processed by the eBPF program. This allow to add multi-buffer support
> for xdp running in generic mode.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

With one nit below:

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>

> ---
>  net/core/dev.c | 28 +++++++++++++++++++++++-----
>  1 file changed, 23 insertions(+), 5 deletions(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 3950ced396b5..5a58f3e28657 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4853,6 +4853,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>  	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
>  	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
>  			 skb_headlen(skb) + mac_len, true);
> +	if (skb_is_nonlinear(skb)) {
> +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> +		xdp_buff_set_frags_flag(xdp);
> +	} else {
> +		xdp_buff_clear_frags_flag(xdp);
> +	}
>  
>  	orig_data_end = xdp->data_end;
>  	orig_data = xdp->data;
> @@ -4882,6 +4888,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>  		skb->len += off; /* positive on grow, negative on shrink */
>  	}
>  
> +	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
> +	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
> +	 */
> +	if (xdp_buff_has_frags(xdp))
> +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> +	else
> +		skb->data_len = 0;
> +
>  	/* check if XDP changed eth hdr such SKB needs update */
>  	eth = (struct ethhdr *)xdp->data;
>  	if ((orig_eth_type != eth->h_proto) ||
> @@ -4927,9 +4941,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
>  	if (skb_is_redirected(skb))
>  		return XDP_PASS;
>  
> -	/* XDP packets must be linear and must have sufficient headroom
> -	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
> -	 * native XDP provides, thus we need to do it here as well.
> +	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
> +	 * bytes. This is the guarantee that also native XDP provides,
> +	 * thus we need to do it here as well.
>  	 */
>  	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
>  	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
> @@ -4943,8 +4957,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
>  				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
>  				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
>  			goto do_drop;

There's a comment above the pskb_expand_head() call that isn't quite
part of the context here, that should also be adjusted now that we don't
always linearise:

		/* In case we have to go down the path and also linearize,
		 * then lets do the pskb_expand_head() work just once here.
		 */

Actually, I think maybe just dropping that comment entirely with this
change would make sense?


> -		if (skb_linearize(skb))
> -			goto do_drop;
> +
> +		/* XDP does not support fraglist */
> +		if (skb_has_frag_list(skb) || !xdp_prog->aux->xdp_has_frags) {
> +			if (skb_linearize(skb))
> +				goto do_drop;
> +		}
>  	}
>  
>  	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
> -- 
> 2.43.0
Lorenzo Bianconi Nov. 28, 2023, 12:02 p.m. UTC | #2
> Lorenzo Bianconi <lorenzo@kernel.org> writes:
> 
> > Similar to native xdp, do not always linearize the skb in
> > netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
> > processed by the eBPF program. This allow to add multi-buffer support
> > for xdp running in generic mode.
> >
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> 
> With one nit below:
> 
> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
> 
> > ---
> >  net/core/dev.c | 28 +++++++++++++++++++++++-----
> >  1 file changed, 23 insertions(+), 5 deletions(-)
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 3950ced396b5..5a58f3e28657 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -4853,6 +4853,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
> >  	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
> >  	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
> >  			 skb_headlen(skb) + mac_len, true);
> > +	if (skb_is_nonlinear(skb)) {
> > +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> > +		xdp_buff_set_frags_flag(xdp);
> > +	} else {
> > +		xdp_buff_clear_frags_flag(xdp);
> > +	}
> >  
> >  	orig_data_end = xdp->data_end;
> >  	orig_data = xdp->data;
> > @@ -4882,6 +4888,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
> >  		skb->len += off; /* positive on grow, negative on shrink */
> >  	}
> >  
> > +	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
> > +	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
> > +	 */
> > +	if (xdp_buff_has_frags(xdp))
> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> > +	else
> > +		skb->data_len = 0;
> > +
> >  	/* check if XDP changed eth hdr such SKB needs update */
> >  	eth = (struct ethhdr *)xdp->data;
> >  	if ((orig_eth_type != eth->h_proto) ||
> > @@ -4927,9 +4941,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
> >  	if (skb_is_redirected(skb))
> >  		return XDP_PASS;
> >  
> > -	/* XDP packets must be linear and must have sufficient headroom
> > -	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
> > -	 * native XDP provides, thus we need to do it here as well.
> > +	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
> > +	 * bytes. This is the guarantee that also native XDP provides,
> > +	 * thus we need to do it here as well.
> >  	 */
> >  	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
> >  	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
> > @@ -4943,8 +4957,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
> >  				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
> >  				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
> >  			goto do_drop;
> 
> There's a comment above the pskb_expand_head() call that isn't quite
> part of the context here, that should also be adjusted now that we don't
> always linearise:
> 
> 		/* In case we have to go down the path and also linearize,
> 		 * then lets do the pskb_expand_head() work just once here.
> 		 */
> 
> Actually, I think maybe just dropping that comment entirely with this
> change would make sense?

ack, I will get rid of it. Thx.

Regards,
Lorenzo

> 
> 
> > -		if (skb_linearize(skb))
> > -			goto do_drop;
> > +
> > +		/* XDP does not support fraglist */
> > +		if (skb_has_frag_list(skb) || !xdp_prog->aux->xdp_has_frags) {
> > +			if (skb_linearize(skb))
> > +				goto do_drop;
> > +		}
> >  	}
> >  
> >  	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
> > -- 
> > 2.43.0
>
Lorenzo Bianconi Nov. 28, 2023, 5:29 p.m. UTC | #3
> Similar to native xdp, do not always linearize the skb in
> netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
> processed by the eBPF program. This allow to add multi-buffer support
> for xdp running in generic mode.
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  net/core/dev.c | 28 +++++++++++++++++++++++-----
>  1 file changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 3950ced396b5..5a58f3e28657 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4853,6 +4853,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>  	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
>  	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
>  			 skb_headlen(skb) + mac_len, true);
> +	if (skb_is_nonlinear(skb)) {
> +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> +		xdp_buff_set_frags_flag(xdp);
> +	} else {
> +		xdp_buff_clear_frags_flag(xdp);
> +	}
>  
>  	orig_data_end = xdp->data_end;
>  	orig_data = xdp->data;
> @@ -4882,6 +4888,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
>  		skb->len += off; /* positive on grow, negative on shrink */
>  	}
>  
> +	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
> +	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
> +	 */
> +	if (xdp_buff_has_frags(xdp))
> +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> +	else
> +		skb->data_len = 0;
> +
>  	/* check if XDP changed eth hdr such SKB needs update */
>  	eth = (struct ethhdr *)xdp->data;
>  	if ((orig_eth_type != eth->h_proto) ||
> @@ -4927,9 +4941,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
>  	if (skb_is_redirected(skb))
>  		return XDP_PASS;
>  
> -	/* XDP packets must be linear and must have sufficient headroom
> -	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
> -	 * native XDP provides, thus we need to do it here as well.
> +	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
> +	 * bytes. This is the guarantee that also native XDP provides,
> +	 * thus we need to do it here as well.
>  	 */
>  	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
>  	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
> @@ -4943,8 +4957,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
>  				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
>  				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
>  			goto do_drop;
> -		if (skb_linearize(skb))
> -			goto do_drop;
> +
> +		/* XDP does not support fraglist */
> +		if (skb_has_frag_list(skb) || !xdp_prog->aux->xdp_has_frags) {
> +			if (skb_linearize(skb))
> +				goto do_drop;

@Jakub: iirc we were discussing something similar for veth [0].
Here pskb_expand_head() reallocates skb paged data (skb_shinfo()->frags[])
just if the skb is cloned and if it is zero-copied [1] while in skb_cow_data()
we always reallocate the paged area if skb_shinfo()->nr_frags is set [2].
Since the eBPF program can theoretically modify paged data, I would say we
should do the same we did for veth even here, right?

Regards,
Lorenzo

[0] https://lore.kernel.org/netdev/20220312131806.1c2919ba@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com/
[1] https://elixir.bootlin.com/linux/v6.6.2/source/net/core/skbuff.c#L2113
[2] https://elixir.bootlin.com/linux/v6.6.2/source/net/core/skbuff.c#L5016

> +		}
>  	}
>  
>  	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
> -- 
> 2.43.0
>
Jakub Kicinski Nov. 28, 2023, 6:51 p.m. UTC | #4
On Tue, 28 Nov 2023 18:29:20 +0100 Lorenzo Bianconi wrote:
> @Jakub: iirc we were discussing something similar for veth [0].
> Here pskb_expand_head() reallocates skb paged data (skb_shinfo()->frags[])
> just if the skb is cloned and if it is zero-copied [1] while in skb_cow_data()
> we always reallocate the paged area if skb_shinfo()->nr_frags is set [2].
> Since the eBPF program can theoretically modify paged data, I would say we
> should do the same we did for veth even here, right?

Yes, don't we allow writes to fragments in XDP based on the assumption
that it runs on Rx so that paged data must not be zero copy?
bpf_xdp_store_bytes() doesn't seem to have any checks which would
stop it from writing fragments, as far as I can see.

I don't see how we can ever correctly support this form of mbuf for veth
or generic XDP :(
Lorenzo Bianconi Nov. 28, 2023, 10:27 p.m. UTC | #5
> On Tue, 28 Nov 2023 18:29:20 +0100 Lorenzo Bianconi wrote:
> > @Jakub: iirc we were discussing something similar for veth [0].
> > Here pskb_expand_head() reallocates skb paged data (skb_shinfo()->frags[])
> > just if the skb is cloned and if it is zero-copied [1] while in skb_cow_data()
> > we always reallocate the paged area if skb_shinfo()->nr_frags is set [2].
> > Since the eBPF program can theoretically modify paged data, I would say we
> > should do the same we did for veth even here, right?
> 
> Yes, don't we allow writes to fragments in XDP based on the assumption
> that it runs on Rx so that paged data must not be zero copy?
> bpf_xdp_store_bytes() doesn't seem to have any checks which would
> stop it from writing fragments, as far as I can see.

do you mean in the skb use-case we could write to fragments (without copying
them) if the skb is not cloned and the paged area is not 'zero-copied'?
With respect to this patch it would mean we can rely on pskb_expand_head() to
reallocate the skb and to covert it to a xdp_buff and we do not need to explicitly
reallocate fragments as we currently do for veth in veth_convert_skb_to_xdp_buff() [0].
Is my understanding correct or am I missing something?

Regards,
Lorenzo

[0] https://elixir.bootlin.com/linux/v6.6.2/source/drivers/net/veth.c#L738

> 
> I don't see how we can ever correctly support this form of mbuf for veth
> or generic XDP :(
Jakub Kicinski Nov. 28, 2023, 11:10 p.m. UTC | #6
On Tue, 28 Nov 2023 23:27:29 +0100 Lorenzo Bianconi wrote:
> > Yes, don't we allow writes to fragments in XDP based on the assumption
> > that it runs on Rx so that paged data must not be zero copy?
> > bpf_xdp_store_bytes() doesn't seem to have any checks which would
> > stop it from writing fragments, as far as I can see.  
> 
> do you mean in the skb use-case we could write to fragments (without copying
> them) if the skb is not cloned and the paged area is not 'zero-copied'?

The zero-copy thing is a red herring. If application uses
sendpage/sendfile/splice the frag may be a page cache page
of a file. Or something completely read only.

IIUC you're trying to avoid the copy if the prog is mbuf capable.
So I was saying that can't work for forms of XDP which actually 
deal with skbs. But that wasn't really your question, sorry :)

> With respect to this patch it would mean we can rely on pskb_expand_head() to
> reallocate the skb and to covert it to a xdp_buff and we do not need to explicitly
> reallocate fragments as we currently do for veth in veth_convert_skb_to_xdp_buff() [0].
> Is my understanding correct or am I missing something?

The difference is that pskb_expand_head() will give you a linear skb,
potentially triggering an order 5 allocation. Expensive and likely to
fail under memory pressure.

veth_convert_skb_to_xdp_buff() tries to allocate pages, and keep
the skb fragmented.
Lorenzo Bianconi Nov. 29, 2023, 4:36 p.m. UTC | #7
> On Tue, 28 Nov 2023 23:27:29 +0100 Lorenzo Bianconi wrote:
> > > Yes, don't we allow writes to fragments in XDP based on the assumption
> > > that it runs on Rx so that paged data must not be zero copy?
> > > bpf_xdp_store_bytes() doesn't seem to have any checks which would
> > > stop it from writing fragments, as far as I can see.  
> > 
> > do you mean in the skb use-case we could write to fragments (without copying
> > them) if the skb is not cloned and the paged area is not 'zero-copied'?
> 
> The zero-copy thing is a red herring. If application uses
> sendpage/sendfile/splice the frag may be a page cache page
> of a file. Or something completely read only.

ack, thx for pointing this out. It is clear now :)

> 
> IIUC you're trying to avoid the copy if the prog is mbuf capable.
> So I was saying that can't work for forms of XDP which actually 
> deal with skbs. But that wasn't really your question, sorry :)
> 
> > With respect to this patch it would mean we can rely on pskb_expand_head() to
> > reallocate the skb and to covert it to a xdp_buff and we do not need to explicitly
> > reallocate fragments as we currently do for veth in veth_convert_skb_to_xdp_buff() [0].
> > Is my understanding correct or am I missing something?
> 
> The difference is that pskb_expand_head() will give you a linear skb,
> potentially triggering an order 5 allocation. Expensive and likely to
> fail under memory pressure.

ack

> 
> veth_convert_skb_to_xdp_buff() tries to allocate pages, and keep
> the skb fragmented.

I will rework the patch using this approach.

Regards,
Lorenzo
diff mbox series

Patch

diff --git a/net/core/dev.c b/net/core/dev.c
index 3950ced396b5..5a58f3e28657 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4853,6 +4853,12 @@  u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
 	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
 			 skb_headlen(skb) + mac_len, true);
+	if (skb_is_nonlinear(skb)) {
+		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+		xdp_buff_set_frags_flag(xdp);
+	} else {
+		xdp_buff_clear_frags_flag(xdp);
+	}
 
 	orig_data_end = xdp->data_end;
 	orig_data = xdp->data;
@@ -4882,6 +4888,14 @@  u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 		skb->len += off; /* positive on grow, negative on shrink */
 	}
 
+	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+	 */
+	if (xdp_buff_has_frags(xdp))
+		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+	else
+		skb->data_len = 0;
+
 	/* check if XDP changed eth hdr such SKB needs update */
 	eth = (struct ethhdr *)xdp->data;
 	if ((orig_eth_type != eth->h_proto) ||
@@ -4927,9 +4941,9 @@  static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	if (skb_is_redirected(skb))
 		return XDP_PASS;
 
-	/* XDP packets must be linear and must have sufficient headroom
-	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
-	 * native XDP provides, thus we need to do it here as well.
+	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+	 * bytes. This is the guarantee that also native XDP provides,
+	 * thus we need to do it here as well.
 	 */
 	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
@@ -4943,8 +4957,12 @@  static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 			goto do_drop;
-		if (skb_linearize(skb))
-			goto do_drop;
+
+		/* XDP does not support fraglist */
+		if (skb_has_frag_list(skb) || !xdp_prog->aux->xdp_has_frags) {
+			if (skb_linearize(skb))
+				goto do_drop;
+		}
 	}
 
 	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);