[v4,bpf-next,2/3] veth: rework veth_xdp_rcv_skb in order to accept non-linear skb

Message ID 24703dbc3477a4b3aaf908f6226a566d27969f83.1646755129.git.lorenzo@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series: introduce xdp frags support to veth driver

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2 this patch: 2
netdev/cc_maintainers warning 6 maintainers not CCed: hawk@kernel.org kpsingh@kernel.org john.fastabend@gmail.com kafai@fb.com songliubraving@fb.com yhs@fb.com
netdev/build_clang success Errors and warnings before: 18 this patch: 18
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7 this patch: 7
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 257 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next success VM_Test

Commit Message

Lorenzo Bianconi March 8, 2022, 4:05 p.m. UTC
Introduce the veth_convert_xdp_buff_from_skb routine to convert a
non-linear skb into an xdp buffer. If the received skb is cloned or
shared, veth_convert_xdp_buff_from_skb will copy it into a new skb
composed of order-0 pages for the linear and fragmented areas.
Moreover, veth_convert_xdp_buff_from_skb guarantees we have enough
headroom for xdp.
This is a preliminary patch to allow attaching xdp programs with frags
support to veth devices.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/veth.c | 174 ++++++++++++++++++++++++++++++---------------
 net/core/xdp.c     |   1 +
 2 files changed, 119 insertions(+), 56 deletions(-)
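
In outline, the new conversion path works like this (a simplified sketch
of the patch below, with error handling elided; copy_into_order0_pages()
and head_frame_sz() are placeholders for logic that is open-coded in the
real patch):

static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
					  struct xdp_buff *xdp,
					  struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_shinfo(skb)->nr_frags) {
		/* not writable: copy head and frags into fresh order-0
		 * pages, reserving VETH_XDP_HEADROOM in the new head
		 */
		skb = copy_into_order0_pages(skb);
	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		/* writable, but not enough headroom: expand the head */
		pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC);
	}

	xdp_init_buff(xdp, head_frame_sz(skb), &rq->xdp_rxq);
	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
			 skb_headlen(skb), true);
	if (skb_is_nonlinear(skb)) {
		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
		xdp_buff_set_frags_flag(xdp);
	} else {
		xdp_buff_clear_frags_flag(xdp);
	}

	*pskb = skb;
	return 0;
}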

Comments

Toke Høiland-Jørgensen March 10, 2022, 11:21 a.m. UTC | #1
Lorenzo Bianconi <lorenzo@kernel.org> writes:

> Introduce the veth_convert_xdp_buff_from_skb routine to convert a
> non-linear skb into an xdp buffer. If the received skb is cloned or
> shared, veth_convert_xdp_buff_from_skb will copy it into a new skb
> composed of order-0 pages for the linear and fragmented areas.
> Moreover, veth_convert_xdp_buff_from_skb guarantees we have enough
> headroom for xdp.
> This is a preliminary patch to allow attaching xdp programs with frags
> support to veth devices.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

It's cool that we can do this! A few comments below:

> ---
>  drivers/net/veth.c | 174 ++++++++++++++++++++++++++++++---------------
>  net/core/xdp.c     |   1 +
>  2 files changed, 119 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index b77ce3fdcfe8..47b21b1d2fd9 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -433,21 +433,6 @@ static void veth_set_multicast_list(struct net_device *dev)
>  {
>  }
>  
> -static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
> -				      int buflen)
> -{
> -	struct sk_buff *skb;
> -
> -	skb = build_skb(head, buflen);
> -	if (!skb)
> -		return NULL;
> -
> -	skb_reserve(skb, headroom);
> -	skb_put(skb, len);
> -
> -	return skb;
> -}
> -
>  static int veth_select_rxq(struct net_device *dev)
>  {
>  	return smp_processor_id() % dev->real_num_rx_queues;
> @@ -695,72 +680,143 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
>  	}
>  }
>  
> -static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> -					struct sk_buff *skb,
> -					struct veth_xdp_tx_bq *bq,
> -					struct veth_stats *stats)
> +static void veth_xdp_get(struct xdp_buff *xdp)
>  {
> -	u32 pktlen, headroom, act, metalen, frame_sz;
> -	void *orig_data, *orig_data_end;
> -	struct bpf_prog *xdp_prog;
> -	int mac_len, delta, off;
> -	struct xdp_buff xdp;
> +	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> +	int i;
>  
> -	skb_prepare_for_gro(skb);
> +	get_page(virt_to_page(xdp->data));
> +	if (likely(!xdp_buff_has_frags(xdp)))
> +		return;
>  
> -	rcu_read_lock();
> -	xdp_prog = rcu_dereference(rq->xdp_prog);
> -	if (unlikely(!xdp_prog)) {
> -		rcu_read_unlock();
> -		goto out;
> -	}
> +	for (i = 0; i < sinfo->nr_frags; i++)
> +		__skb_frag_ref(&sinfo->frags[i]);
> +}
>  
> -	mac_len = skb->data - skb_mac_header(skb);
> -	pktlen = skb->len + mac_len;
> -	headroom = skb_headroom(skb) - mac_len;
> +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> +					  struct xdp_buff *xdp,
> +					  struct sk_buff **pskb)
> +{

nit: It's not really "converting" an skb into an xdp_buff, since the
xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?

> +	struct sk_buff *skb = *pskb;
> +	u32 frame_sz;
>  
>  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> +	    skb_shinfo(skb)->nr_frags) {

So this always clones the skb if it has frags? Is that really needed?

Also, there's a lot of memory allocation and copying going on here; have
you measured the performance?

> +		u32 size, len, max_head_size, off;
>  		struct sk_buff *nskb;
> -		int size, head_off;
> -		void *head, *start;
>  		struct page *page;
> +		int i, head_off;
>  
> -		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
> -		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -		if (size > PAGE_SIZE)
> +		/* We need a private copy of the skb and data buffers since
> +		 * the ebpf program can modify it. We segment the original skb
> +		 * into order-0 pages without linearizing it.
> +		 *
> +		 * Make sure we have enough space for linear and paged area
> +		 */
> +		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
> +						  VETH_XDP_HEADROOM);
> +		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
>  			goto drop;
>  
> +		/* Allocate skb head */
>  		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
>  		if (!page)
>  			goto drop;
>  
> -		head = page_address(page);
> -		start = head + VETH_XDP_HEADROOM;
> -		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
> -			page_frag_free(head);
> +		nskb = build_skb(page_address(page), PAGE_SIZE);
> +		if (!nskb) {
> +			put_page(page);
>  			goto drop;
>  		}
>  
> -		nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len,
> -				      skb->len, PAGE_SIZE);
> -		if (!nskb) {
> -			page_frag_free(head);
> +		skb_reserve(nskb, VETH_XDP_HEADROOM);
> +		size = min_t(u32, skb->len, max_head_size);
> +		if (skb_copy_bits(skb, 0, nskb->data, size)) {
> +			consume_skb(nskb);
>  			goto drop;
>  		}
> +		skb_put(nskb, size);
>  
>  		skb_copy_header(nskb, skb);
>  		head_off = skb_headroom(nskb) - skb_headroom(skb);
>  		skb_headers_offset_update(nskb, head_off);
> +
> +		/* Allocate paged area of new skb */
> +		off = size;
> +		len = skb->len - off;
> +
> +		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
> +			page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
> +			if (!page) {
> +				consume_skb(nskb);
> +				goto drop;
> +			}
> +
> +			size = min_t(u32, len, PAGE_SIZE);
> +			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
> +			if (skb_copy_bits(skb, off, page_address(page),
> +					  size)) {
> +				consume_skb(nskb);
> +				goto drop;
> +			}
> +
> +			len -= size;
> +			off += size;
> +		}
> +
>  		consume_skb(skb);
>  		skb = nskb;
> +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> +		goto drop;
>  	}
>  
>  	/* SKB "head" area always have tailroom for skb_shared_info */
>  	frame_sz = skb_end_pointer(skb) - skb->head;
>  	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -	xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq);
> -	xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true);
> +	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
> +	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
> +			 skb_headlen(skb), true);
> +
> +	if (skb_is_nonlinear(skb)) {
> +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> +		xdp_buff_set_frags_flag(xdp);
> +	} else {
> +		xdp_buff_clear_frags_flag(xdp);
> +	}
> +	*pskb = skb;
> +
> +	return 0;
> +drop:
> +	consume_skb(skb);
> +	*pskb = NULL;
> +
> +	return -ENOMEM;
> +}
> +
> +static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> +					struct sk_buff *skb,
> +					struct veth_xdp_tx_bq *bq,
> +					struct veth_stats *stats)
> +{
> +	void *orig_data, *orig_data_end;
> +	struct bpf_prog *xdp_prog;
> +	struct xdp_buff xdp;
> +	u32 act, metalen;
> +	int off;
> +
> +	skb_prepare_for_gro(skb);
> +
> +	rcu_read_lock();
> +	xdp_prog = rcu_dereference(rq->xdp_prog);
> +	if (unlikely(!xdp_prog)) {
> +		rcu_read_unlock();
> +		goto out;
> +	}
> +
> +	__skb_push(skb, skb->data - skb_mac_header(skb));
> +	if (veth_convert_xdp_buff_from_skb(rq, &xdp, &skb))
> +		goto drop;
>  
>  	orig_data = xdp.data;
>  	orig_data_end = xdp.data_end;
> @@ -771,7 +827,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  	case XDP_PASS:
>  		break;
>  	case XDP_TX:
> -		get_page(virt_to_page(xdp.data));
> +		veth_xdp_get(&xdp);
>  		consume_skb(skb);
>  		xdp.rxq->mem = rq->xdp_mem;
>  		if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
> @@ -783,7 +839,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  		rcu_read_unlock();
>  		goto xdp_xmit;
>  	case XDP_REDIRECT:
> -		get_page(virt_to_page(xdp.data));
> +		veth_xdp_get(&xdp);
>  		consume_skb(skb);
>  		xdp.rxq->mem = rq->xdp_mem;
>  		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
> @@ -806,18 +862,24 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  	rcu_read_unlock();
>  
>  	/* check if bpf_xdp_adjust_head was used */
> -	delta = orig_data - xdp.data;
> -	off = mac_len + delta;
> +	off = orig_data - xdp.data;
>  	if (off > 0)
>  		__skb_push(skb, off);
>  	else if (off < 0)
>  		__skb_pull(skb, -off);
> -	skb->mac_header -= delta;
> +
> +	skb_reset_mac_header(skb);
>  
>  	/* check if bpf_xdp_adjust_tail was used */
>  	off = xdp.data_end - orig_data_end;
>  	if (off != 0)
>  		__skb_put(skb, off); /* positive on grow, negative on shrink */
> +
> +	if (xdp_buff_has_frags(&xdp))
> +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> +	else
> +		skb->data_len = 0;

We can remove entire frags using xdp_adjust_tail, right? Will that get
propagated in the right way to the skb frags due to the dual use of
skb_shared_info, or?
Lorenzo Bianconi March 10, 2022, 11:43 a.m. UTC | #2
> Lorenzo Bianconi <lorenzo@kernel.org> writes:
> 
> > Introduce the veth_convert_xdp_buff_from_skb routine to convert a
> > non-linear skb into an xdp buffer. If the received skb is cloned or
> > shared, veth_convert_xdp_buff_from_skb will copy it into a new skb
> > composed of order-0 pages for the linear and fragmented areas.
> > Moreover, veth_convert_xdp_buff_from_skb guarantees we have enough
> > headroom for xdp.
> > This is a preliminary patch to allow attaching xdp programs with frags
> > support to veth devices.
> >
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> 
> It's cool that we can do this! A few comments below:

Hi Toke,

thx for the review :)

[...]

> > +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> > +					  struct xdp_buff *xdp,
> > +					  struct sk_buff **pskb)
> > +{
> 
> nit: It's not really "converting" an skb into an xdp_buff, since the
> xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?

I kept the previous naming convention used for xdp_convert_frame_to_buff()
(my goal would be to move it into xdp.c and reuse this routine for the
generic-xdp use case) but I am fine with veth_init_xdp_buff_from_skb().

> 
> > +	struct sk_buff *skb = *pskb;
> > +	u32 frame_sz;
> >  
> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> > +	    skb_shinfo(skb)->nr_frags) {
> 
> So this always clones the skb if it has frags? Is that really needed?

if we look at skb_cow_data(), the paged area is always considered not writable
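
(hence the new copy condition in the patch also keys on frags:

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_shinfo(skb)->nr_frags) {

i.e. any paged data forces the private copy, the same way skb_cow_data()
would treat it.)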

> 
> Also, there's a lot of memory allocation and copying going on here; have
> you measured the performance?

even in the previous implementation we always reallocate the skb if the
conditions above are met, so I do not expect any difference in the
single-buffer use case, but I will run some performance tests.

> 

[...]

> > +
> > +	if (xdp_buff_has_frags(&xdp))
> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> > +	else
> > +		skb->data_len = 0;
> 
> We can remove entire frags using xdp_adjust_tail, right? Will that get
> propagated in the right way to the skb frags due to the dual use of
> skb_shared_info, or?

bpf_xdp_frags_shrink_tail() can remove entire frags and it will modify metadata
contained in the skb_shared_info (e.g. nr_frags or the frag size of
the given page). We should consider the data_len field in this case. Agree?
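
Concretely, the data_len update at the end of veth_xdp_rcv_skb() (see the
patch below) is what keeps the skb in sync; a sketch of the comment I
would add on top of it:

	/* frag metadata (nr_frags, frag sizes) may have been changed by
	 * the bpf program (e.g. bpf_xdp_frags_shrink_tail()), so resync
	 * skb->data_len with the xdp_buff view of the paged area.
	 */
	if (xdp_buff_has_frags(&xdp))
		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
	else
		skb->data_len = 0;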

Regards,
Lorenzo

>
Toke Høiland-Jørgensen March 10, 2022, 7:06 p.m. UTC | #3
Lorenzo Bianconi <lorenzo.bianconi@redhat.com> writes:

>> Lorenzo Bianconi <lorenzo@kernel.org> writes:
>> 
>> > Introduce the veth_convert_xdp_buff_from_skb routine to convert a
>> > non-linear skb into an xdp buffer. If the received skb is cloned or
>> > shared, veth_convert_xdp_buff_from_skb will copy it into a new skb
>> > composed of order-0 pages for the linear and fragmented areas.
>> > Moreover, veth_convert_xdp_buff_from_skb guarantees we have enough
>> > headroom for xdp.
>> > This is a preliminary patch to allow attaching xdp programs with frags
>> > support to veth devices.
>> >
>> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>> 
>> It's cool that we can do this! A few comments below:
>
> Hi Toke,
>
> thx for the review :)
>
> [...]
>
>> > +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
>> > +					  struct xdp_buff *xdp,
>> > +					  struct sk_buff **pskb)
>> > +{
>> 
>> nit: It's not really "converting" an skb into an xdp_buff, since the
>> xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?
>
> I kept the previous naming convention used for xdp_convert_frame_to_buff()
> (my goal would be to move it into xdp.c and reuse this routine for the
> generic-xdp use case) but I am fine with
> veth_init_xdp_buff_from_skb().

Consistency is probably good, but right now we have functions of the
form 'xdp_convert_X_to_Y()' and 'xdp_update_Y_from_X()'. So to follow
that you'd have either 'veth_update_xdp_buff_from_skb()' or
'veth_convert_skb_to_xdp_buff()' :)
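
(i.e. today include/net/xdp.h has both forms:

	xdp_convert_frame_to_buff(frame, &xdp);
	xdp_update_frame_from_buff(&xdp, frame);

so either name would stay consistent with those.)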

>> > +	struct sk_buff *skb = *pskb;
>> > +	u32 frame_sz;
>> >  
>> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
>> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
>> > +	    skb_shinfo(skb)->nr_frags) {
>> 
>> So this always clones the skb if it has frags? Is that really needed?
>
> if we look at skb_cow_data(), the paged area is always considered not writable

Ah, right, did not know that. Seems a bit odd, but OK.

>> Also, there's a lot of memory allocation and copying going on here; have
>> you measured the performance?
>
> even in the previous implementation we always reallocate the skb if the
> conditions above are met, so I do not expect any difference in the
> single-buffer use case, but I will run some performance tests.

No, I wouldn't expect any difference for the single-buffer case, but I
would also be interested in how big the overhead is of having to copy
the whole jumbo-frame?

BTW, just noticed one other change - before we had:

> -	headroom = skb_headroom(skb) - mac_len;
>  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {


And in your patch that becomes:

> +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> +		goto drop;


So the mac_len subtraction disappeared; that seems wrong?

>> > +
>> > +	if (xdp_buff_has_frags(&xdp))
>> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
>> > +	else
>> > +		skb->data_len = 0;
>> 
>> We can remove entire frags using xdp_adjust_tail, right? Will that get
>> propagated in the right way to the skb frags due to the dual use of
>> skb_shared_info, or?
>
> bpf_xdp_frags_shrink_tail() can remove entire frags and it will modify
> metadata contained in the skb_shared_info (e.g. nr_frags or the frag
> size of the given page). We should consider the data_len field in this
> case. Agree?

Right, that's what I assumed; makes sense. But adding a comment
mentioning this above the update of data_len might be helpful? :)

-Toke
Lorenzo Bianconi March 10, 2022, 7:26 p.m. UTC | #4
> Lorenzo Bianconi <lorenzo.bianconi@redhat.com> writes:
> 
> >> Lorenzo Bianconi <lorenzo@kernel.org> writes:
> >> 
> >> > Introduce the veth_convert_xdp_buff_from_skb routine to convert a
> >> > non-linear skb into an xdp buffer. If the received skb is cloned or
> >> > shared, veth_convert_xdp_buff_from_skb will copy it into a new skb
> >> > composed of order-0 pages for the linear and fragmented areas.
> >> > Moreover, veth_convert_xdp_buff_from_skb guarantees we have enough
> >> > headroom for xdp.
> >> > This is a preliminary patch to allow attaching xdp programs with frags
> >> > support to veth devices.
> >> >
> >> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> >> 
> >> It's cool that we can do this! A few comments below:
> >
> > Hi Toke,
> >
> > thx for the review :)
> >
> > [...]
> >
> >> > +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> >> > +					  struct xdp_buff *xdp,
> >> > +					  struct sk_buff **pskb)
> >> > +{
> >> 
> >> nit: It's not really "converting" an skb into an xdp_buff, since the
> >> xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?
> >
> > I kept the previous naming convention used for xdp_convert_frame_to_buff()
> > (my goal would be to move it into xdp.c and reuse this routine for the
> > generic-xdp use case) but I am fine with
> > veth_init_xdp_buff_from_skb().
> 
> Consistency is probably good, but right now we have functions of the
> form 'xdp_convert_X_to_Y()' and 'xdp_update_Y_from_X()'. So to follow
> that you'd have either 'veth_update_xdp_buff_from_skb()' or
> 'veth_convert_skb_to_xdp_buff()' :)

ack, I am fine with veth_convert_skb_to_xdp_buff()

> 
> >> > +	struct sk_buff *skb = *pskb;
> >> > +	u32 frame_sz;
> >> >  
> >> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> >> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> >> > +	    skb_shinfo(skb)->nr_frags) {
> >> 
> >> So this always clones the skb if it has frags? Is that really needed?
> >
> > if we look at skb_cow_data(), the paged area is always considered not writable
> 
> Ah, right, did not know that. Seems a bit odd, but OK.
> 
> >> Also, there's a lot of memory allocation and copying going on here; have
> >> you measured the performance?
> >
> > even in the previous implementation we always reallocate the skb if the
> > conditions above are met, so I do not expect any difference in the
> > single-buffer use case, but I will run some performance tests.
> 
> No, I wouldn't expect any difference for the single-buffer case, but I
> would also be interested in how big the overhead is of having to copy
> the whole jumbo-frame?

oh ok, I got what you mean. I guess we can compare the tcp throughput for
the legacy skb mode (when no program is attached to the veth pair) and xdp
mode (when we load a simple xdp program that just returns XDP_PASS) when
jumbo frames are enabled. I would expect a performance penalty but let's see.
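
On the program side something like this should do (just a sketch,
assuming libbpf's "xdp.frags" section name for frags-aware programs):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* frags-aware test program: pass every frame up the stack unchanged */
SEC("xdp.frags")
int xdp_pass_prog(struct xdp_md *ctx)
{
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";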

> 
> BTW, just noticed one other change - before we had:
> 
> > -	headroom = skb_headroom(skb) - mac_len;
> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> 
> 
> And in your patch that becomes:
> 
> > +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> > +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> > +		goto drop;
> 
> 
> So the mac_len subtraction disappeared; that seems wrong?

we call __skb_push before running veth_convert_xdp_buff_from_skb() in
veth_xdp_rcv_skb().
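
From the patch:

	__skb_push(skb, skb->data - skb_mac_header(skb));
	if (veth_convert_xdp_buff_from_skb(rq, &xdp, &skb))
		goto drop;

so the mac header is already part of the linear data by the time the
headroom check runs.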

> 
> >> > +
> >> > +	if (xdp_buff_has_frags(&xdp))
> >> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> >> > +	else
> >> > +		skb->data_len = 0;
> >> 
> >> We can remove entire frags using xdp_adjust_tail, right? Will that get
> >> propagated in the right way to the skb frags due to the dual use of
> >> skb_shared_info, or?
> >
> > bpf_xdp_frags_shrink_tail() can remove entire frags and it will modify
> > metadata contained in the skb_shared_info (e.g. nr_frags or the frag
> > size of the given page). We should consider the data_len field in this
> > case. Agree?
> 
> Right, that's what I assumed; makes sense. But adding a comment
> mentioning this above the update of data_len might be helpful? :)

ack, will do.

Regards,
Lorenzo

> 
> -Toke
>
Lorenzo Bianconi March 10, 2022, 11:46 p.m. UTC | #5
> > Lorenzo Bianconi <lorenzo.bianconi@redhat.com> writes:
> > 
> > >> Lorenzo Bianconi <lorenzo@kernel.org> writes:
> > >> 
> > >> > Introduce the veth_convert_xdp_buff_from_skb routine to convert a
> > >> > non-linear skb into an xdp buffer. If the received skb is cloned or
> > >> > shared, veth_convert_xdp_buff_from_skb will copy it into a new skb
> > >> > composed of order-0 pages for the linear and fragmented areas.
> > >> > Moreover, veth_convert_xdp_buff_from_skb guarantees we have enough
> > >> > headroom for xdp.
> > >> > This is a preliminary patch to allow attaching xdp programs with frags
> > >> > support to veth devices.
> > >> >
> > >> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > >> 
> > >> It's cool that we can do this! A few comments below:
> > >
> > > Hi Toke,
> > >
> > > thx for the review :)
> > >
> > > [...]
> > >
> > >> > +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> > >> > +					  struct xdp_buff *xdp,
> > >> > +					  struct sk_buff **pskb)
> > >> > +{
> > >> 
> > >> nit: It's not really "converting" an skb into an xdp_buff, since the
> > >> xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?
> > >
> > > I kept the previous naming convention used for xdp_convert_frame_to_buff()
> > > (my goal would be to move it into xdp.c and reuse this routine for the
> > > generic-xdp use case) but I am fine with
> > > veth_init_xdp_buff_from_skb().
> > 
> > Consistency is probably good, but right now we have functions of the
> > form 'xdp_convert_X_to_Y()' and 'xdp_update_Y_from_X()'. So to follow
> > that you'd have either 'veth_update_xdp_buff_from_skb()' or
> > 'veth_convert_skb_to_xdp_buff()' :)
> 
> ack, I am fine with veth_convert_skb_to_xdp_buff()
> 
> > 
> > >> > +	struct sk_buff *skb = *pskb;
> > >> > +	u32 frame_sz;
> > >> >  
> > >> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> > >> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> > >> > +	    skb_shinfo(skb)->nr_frags) {
> > >> 
> > >> So this always clones the skb if it has frags? Is that really needed?
> > >
> > > if we look at skb_cow_data(), the paged area is always considered not writable
> > 
> > Ah, right, did not know that. Seems a bit odd, but OK.
> > 
> > >> Also, there's a lot of memory allocation and copying going on here; have
> > >> you measured the performance?
> > >
> > > even in the previous implementation we always reallocate the skb if the
> > > conditions above are met, so I do not expect any difference in the
> > > single-buffer use case, but I will run some performance tests.
> > 
> > No, I wouldn't expect any difference for the single-buffer case, but I
> > would also be interested in how big the overhead is of having to copy
> > the whole jumbo-frame?
> 
> oh ok, I got what you mean. I guess we can compare the tcp throughput for
> the legacy skb mode (when no program is attached to the veth pair) and xdp
> mode (when we load a simple xdp program that just returns XDP_PASS) when
> jumbo frames are enabled. I would expect a performance penalty but let's see.

I ran the tests described above and got the following results:

- skb mode mtu 1500B (TSO/GSO off): ~ 16.8 Gbps
- xdp mode mtu 1500B (XDP_PASS):    ~ 9.52 Gbps

- skb mode mtu 32KB (TSO/GSO off): ~ 41 Gbps
- xdp mode mtu 32KB (XDP_PASS):    ~ 25 Gbps

the (expected) performance penalty ratio (due to the copy) is quite constant

Regards,
Lorenzo

> 
> > 
> > BTW, just noticed one other change - before we had:
> > 
> > > -	headroom = skb_headroom(skb) - mac_len;
> > >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> > > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> > 
> > 
> > And in your patch that becomes:
> > 
> > > +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> > > +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> > > +		goto drop;
> > 
> > 
> > So the mac_len subtraction disappeared; that seems wrong?
> 
> we call __skb_push before running veth_convert_xdp_buff_from_skb() in
> veth_xdp_rcv_skb().
> 
> > 
> > >> > +
> > >> > +	if (xdp_buff_has_frags(&xdp))
> > >> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> > >> > +	else
> > >> > +		skb->data_len = 0;
> > >> 
> > >> We can remove entire frags using xdp_adjust_tail, right? Will that get
> > >> propagated in the right way to the skb frags due to the dual use of
> > >> skb_shared_info, or?
> > >
> > > bpf_xdp_frags_shrink_tail() can remove entire frags and it will modify
> > > metadata contained in the skb_shared_info (e.g. nr_frags or the frag
> > > size of the given page). We should consider the data_len field in this
> > > case. Agree?
> > 
> > Right, that's what I assumed; makes sense. But adding a comment
> > mentioning this above the update of data_len might be helpful? :)
> 
> ack, will do.
> 
> Regards,
> Lorenzo
> 
> > 
> > -Toke
> >
Jakub Kicinski March 12, 2022, 9:18 p.m. UTC | #6
On Thu, 10 Mar 2022 20:06:40 +0100 Toke Høiland-Jørgensen wrote:
> >> So this always clones the skb if it has frags? Is that really needed?  
> >
> > if we look at skb_cow_data(), paged area is always considered not writable  
> 
> Ah, right, did not know that. Seems a bit odd, but OK.

Yeah, I think I pointed that out, I may well be wrong.

AFAICT frags which are not writable are not marked in any clear
way. We have SKBFL_SHARED_FRAG which seems pretty close but its
documented as an indication that the frag can be written under our
feet, not that we can't write to it. Subtly different. And (as
documented) it's only used when doing SW csums, as far as I can
grep.

Maybe someone else knows the semantics.
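
For reference, the accessor for that flag is skb_has_shared_frag(); a
sketch of the distinction (the helper is real, the wrapper below is only
for illustration):

static bool frag_may_change_under_us(const struct sk_buff *skb)
{
	/* SKBFL_SHARED_FRAG: the frag data can be written by someone
	 * else while we look at it -- subtly different from "we must
	 * not write to it".
	 */
	return skb_has_shared_frag(skb);
}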

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index b77ce3fdcfe8..47b21b1d2fd9 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -433,21 +433,6 @@  static void veth_set_multicast_list(struct net_device *dev)
 {
 }
 
-static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
-				      int buflen)
-{
-	struct sk_buff *skb;
-
-	skb = build_skb(head, buflen);
-	if (!skb)
-		return NULL;
-
-	skb_reserve(skb, headroom);
-	skb_put(skb, len);
-
-	return skb;
-}
-
 static int veth_select_rxq(struct net_device *dev)
 {
 	return smp_processor_id() % dev->real_num_rx_queues;
@@ -695,72 +680,143 @@  static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
 	}
 }
 
-static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
-					struct sk_buff *skb,
-					struct veth_xdp_tx_bq *bq,
-					struct veth_stats *stats)
+static void veth_xdp_get(struct xdp_buff *xdp)
 {
-	u32 pktlen, headroom, act, metalen, frame_sz;
-	void *orig_data, *orig_data_end;
-	struct bpf_prog *xdp_prog;
-	int mac_len, delta, off;
-	struct xdp_buff xdp;
+	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+	int i;
 
-	skb_prepare_for_gro(skb);
+	get_page(virt_to_page(xdp->data));
+	if (likely(!xdp_buff_has_frags(xdp)))
+		return;
 
-	rcu_read_lock();
-	xdp_prog = rcu_dereference(rq->xdp_prog);
-	if (unlikely(!xdp_prog)) {
-		rcu_read_unlock();
-		goto out;
-	}
+	for (i = 0; i < sinfo->nr_frags; i++)
+		__skb_frag_ref(&sinfo->frags[i]);
+}
 
-	mac_len = skb->data - skb_mac_header(skb);
-	pktlen = skb->len + mac_len;
-	headroom = skb_headroom(skb) - mac_len;
+static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
+					  struct xdp_buff *xdp,
+					  struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	u32 frame_sz;
 
 	if (skb_shared(skb) || skb_head_is_locked(skb) ||
-	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
+	    skb_shinfo(skb)->nr_frags) {
+		u32 size, len, max_head_size, off;
 		struct sk_buff *nskb;
-		int size, head_off;
-		void *head, *start;
 		struct page *page;
+		int i, head_off;
 
-		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
-		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-		if (size > PAGE_SIZE)
+		/* We need a private copy of the skb and data buffers since
+		 * the ebpf program can modify it. We segment the original skb
+		 * into order-0 pages without linearizing it.
+		 *
+		 * Make sure we have enough space for linear and paged area
+		 */
+		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
+						  VETH_XDP_HEADROOM);
+		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
 			goto drop;
 
+		/* Allocate skb head */
 		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
 		if (!page)
 			goto drop;
 
-		head = page_address(page);
-		start = head + VETH_XDP_HEADROOM;
-		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
-			page_frag_free(head);
+		nskb = build_skb(page_address(page), PAGE_SIZE);
+		if (!nskb) {
+			put_page(page);
 			goto drop;
 		}
 
-		nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len,
-				      skb->len, PAGE_SIZE);
-		if (!nskb) {
-			page_frag_free(head);
+		skb_reserve(nskb, VETH_XDP_HEADROOM);
+		size = min_t(u32, skb->len, max_head_size);
+		if (skb_copy_bits(skb, 0, nskb->data, size)) {
+			consume_skb(nskb);
 			goto drop;
 		}
+		skb_put(nskb, size);
 
 		skb_copy_header(nskb, skb);
 		head_off = skb_headroom(nskb) - skb_headroom(skb);
 		skb_headers_offset_update(nskb, head_off);
+
+		/* Allocate paged area of new skb */
+		off = size;
+		len = skb->len - off;
+
+		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+			page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+			if (!page) {
+				consume_skb(nskb);
+				goto drop;
+			}
+
+			size = min_t(u32, len, PAGE_SIZE);
+			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
+			if (skb_copy_bits(skb, off, page_address(page),
+					  size)) {
+				consume_skb(nskb);
+				goto drop;
+			}
+
+			len -= size;
+			off += size;
+		}
+
 		consume_skb(skb);
 		skb = nskb;
+	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
+		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
+		goto drop;
 	}
 
 	/* SKB "head" area always have tailroom for skb_shared_info */
 	frame_sz = skb_end_pointer(skb) - skb->head;
 	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq);
-	xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true);
+	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
+	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
+			 skb_headlen(skb), true);
+
+	if (skb_is_nonlinear(skb)) {
+		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+		xdp_buff_set_frags_flag(xdp);
+	} else {
+		xdp_buff_clear_frags_flag(xdp);
+	}
+	*pskb = skb;
+
+	return 0;
+drop:
+	consume_skb(skb);
+	*pskb = NULL;
+
+	return -ENOMEM;
+}
+
+static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
+					struct sk_buff *skb,
+					struct veth_xdp_tx_bq *bq,
+					struct veth_stats *stats)
+{
+	void *orig_data, *orig_data_end;
+	struct bpf_prog *xdp_prog;
+	struct xdp_buff xdp;
+	u32 act, metalen;
+	int off;
+
+	skb_prepare_for_gro(skb);
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (unlikely(!xdp_prog)) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	__skb_push(skb, skb->data - skb_mac_header(skb));
+	if (veth_convert_xdp_buff_from_skb(rq, &xdp, &skb))
+		goto drop;
 
 	orig_data = xdp.data;
 	orig_data_end = xdp.data_end;
@@ -771,7 +827,7 @@  static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 	case XDP_PASS:
 		break;
 	case XDP_TX:
-		get_page(virt_to_page(xdp.data));
+		veth_xdp_get(&xdp);
 		consume_skb(skb);
 		xdp.rxq->mem = rq->xdp_mem;
 		if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
@@ -783,7 +839,7 @@  static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 		rcu_read_unlock();
 		goto xdp_xmit;
 	case XDP_REDIRECT:
-		get_page(virt_to_page(xdp.data));
+		veth_xdp_get(&xdp);
 		consume_skb(skb);
 		xdp.rxq->mem = rq->xdp_mem;
 		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
@@ -806,18 +862,24 @@  static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 	rcu_read_unlock();
 
 	/* check if bpf_xdp_adjust_head was used */
-	delta = orig_data - xdp.data;
-	off = mac_len + delta;
+	off = orig_data - xdp.data;
 	if (off > 0)
 		__skb_push(skb, off);
 	else if (off < 0)
 		__skb_pull(skb, -off);
-	skb->mac_header -= delta;
+
+	skb_reset_mac_header(skb);
 
 	/* check if bpf_xdp_adjust_tail was used */
 	off = xdp.data_end - orig_data_end;
 	if (off != 0)
 		__skb_put(skb, off); /* positive on grow, negative on shrink */
+
+	if (xdp_buff_has_frags(&xdp))
+		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+	else
+		skb->data_len = 0;
+
 	skb->protocol = eth_type_trans(skb, rq->dev);
 
 	metalen = xdp.data - xdp.data_meta;
@@ -833,7 +895,7 @@  static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 	return NULL;
 err_xdp:
 	rcu_read_unlock();
-	page_frag_free(xdp.data);
+	xdp_return_buff(&xdp);
 xdp_xmit:
 	return NULL;
 }
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 361df312ee7f..b5f2d428d856 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -528,6 +528,7 @@  void xdp_return_buff(struct xdp_buff *xdp)
 out:
 	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
 }
+EXPORT_SYMBOL_GPL(xdp_return_buff);
 
 /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
 void __xdp_release_frame(void *data, struct xdp_mem_info *mem)