[v9,bpf-next,08/14] bpf: add multi-buff support to the bpf_xdp_adjust_tail() API

Message ID: 863f4934d251f44ad85a6be08b3737fac74f9b5a.1623674025.git.lorenzo@kernel.org
State: Changes Requested
Delegated to: BPF
Series: mvneta: introduce XDP multi-buffer support

Checks

Context Check Description
netdev/cover_letter success
netdev/fixes_present success
netdev/patch_count success
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success
netdev/cc_maintainers warning 6 maintainers not CCed: yhs@fb.com kpsingh@kernel.org hawk@kernel.org andrii@kernel.org kafai@fb.com songliubraving@fb.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 5151 this patch: 5151
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 109 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 5216 this patch: 5216
netdev/header_inline success

Commit Message

Lorenzo Bianconi June 14, 2021, 12:49 p.m. UTC
From: Eelco Chaudron <echaudro@redhat.com>

This change adds support for tail growing and shrinking for XDP multi-buff.

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/net/xdp.h |  7 ++++++
 net/core/filter.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 net/core/xdp.c    |  5 ++--
 3 files changed, 72 insertions(+), 2 deletions(-)

Comments

John Fastabend June 22, 2021, 11:37 p.m. UTC | #1
Lorenzo Bianconi wrote:
> From: Eelco Chaudron <echaudro@redhat.com>
> 
> This change adds support for tail growing and shrinking for XDP multi-buff.
> 

It would be nice if the commit message gave us some details on how the
growing/shrinking works in the multi-buff support.

> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/net/xdp.h |  7 ++++++
>  net/core/filter.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
>  net/core/xdp.c    |  5 ++--
>  3 files changed, 72 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 935a6f83115f..3525801c6ed5 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -132,6 +132,11 @@ xdp_get_shared_info_from_buff(struct xdp_buff *xdp)
>  	return (struct skb_shared_info *)xdp_data_hard_end(xdp);
>  }
>  
> +static inline unsigned int xdp_get_frag_tailroom(const skb_frag_t *frag)
> +{
> +	return PAGE_SIZE - skb_frag_size(frag) - skb_frag_off(frag);
> +}
> +
>  struct xdp_frame {
>  	void *data;
>  	u16 len;
> @@ -259,6 +264,8 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
>  	return xdp_frame;
>  }
>  
> +void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
> +		  struct xdp_buff *xdp);
>  void xdp_return_frame(struct xdp_frame *xdpf);
>  void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
>  void xdp_return_buff(struct xdp_buff *xdp);
> diff --git a/net/core/filter.c b/net/core/filter.c
> index caa88955562e..05f574a3d690 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3859,11 +3859,73 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
>  	.arg2_type	= ARG_ANYTHING,
>  };
>  
> +static int bpf_xdp_mb_adjust_tail(struct xdp_buff *xdp, int offset)
> +{
> +	struct skb_shared_info *sinfo;
> +
> +	if (unlikely(!xdp_buff_is_mb(xdp)))
> +		return -EINVAL;
> +
> +	sinfo = xdp_get_shared_info_from_buff(xdp);
> +	if (offset >= 0) {
> +		skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
> +		int size;
> +
> +		if (unlikely(offset > xdp_get_frag_tailroom(frag)))
> +			return -EINVAL;
> +
> +		size = skb_frag_size(frag);
> +		memset(skb_frag_address(frag) + size, 0, offset);
> +		skb_frag_size_set(frag, size + offset);
> +		sinfo->data_len += offset;

Can you add some comment on how this works? So today I call
bpf_xdp_adjust_tail() to add some trailer to my packet.
This looks like it adds tailroom to the last frag? But then
how do I insert my trailer? I don't think we can without the
extra multi-buffer access support, right?

Also data_end will be unchanged, yet it will return 0, so my
current programs will likely be a bit confused by this.

> +	} else {
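
For reference, the single-buffer trailer pattern John describes looks roughly like the sketch below (the trailer value and the 1500-byte bound are illustrative, not from the patch). Note how it depends on data_end moving: with the multi-buff path above, data_end stays put, so the bounds check fails and the trailer cannot be written without extra frag-access helpers.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_append_trailer(struct xdp_md *ctx)
{
	__u32 len = ctx->data_end - ctx->data;	/* length before the grow */
	__u32 trailer = 0xdeadbeef;		/* illustrative trailer value */
	void *data, *data_end;
	__u8 *tail;

	if (len > 1500)		/* bound len so the verifier can track data + len */
		return XDP_PASS;

	if (bpf_xdp_adjust_tail(ctx, sizeof(trailer)))
		return XDP_PASS;	/* no tailroom available */

	/* adjust_tail invalidates packet pointers: reload them */
	data = (void *)(long)ctx->data;
	data_end = (void *)(long)ctx->data_end;

	tail = data + len;
	if ((void *)(tail + sizeof(trailer)) > data_end)
		return XDP_PASS;	/* mb case: data_end did not move */

	__builtin_memcpy(tail, &trailer, sizeof(trailer));
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";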
Eelco Chaudron June 24, 2021, 9:26 a.m. UTC | #2
On 23 Jun 2021, at 1:37, John Fastabend wrote:

> Lorenzo Bianconi wrote:
>> From: Eelco Chaudron <echaudro@redhat.com>
>>
>> This change adds support for tail growing and shrinking for XDP multi-buff.
>>
>
> It would be nice if the commit message gave us some details on how the
> growing/shrinking works in the multi-buff support.

Will add this to the next rev.

[...]
>> +static int bpf_xdp_mb_adjust_tail(struct xdp_buff *xdp, int offset)
>> +{
>> +	struct skb_shared_info *sinfo;
>> +
>> +	if (unlikely(!xdp_buff_is_mb(xdp)))
>> +		return -EINVAL;
>> +
>> +	sinfo = xdp_get_shared_info_from_buff(xdp);
>> +	if (offset >= 0) {
>> +		skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
>> +		int size;
>> +
>> +		if (unlikely(offset > xdp_get_frag_tailroom(frag)))
>> +			return -EINVAL;
>> +
>> +		size = skb_frag_size(frag);
>> +		memset(skb_frag_address(frag) + size, 0, offset);
>> +		skb_frag_size_set(frag, size + offset);
>> +		sinfo->data_len += offset;
>
> Can you add some comment on how this works? So today I call
> bpf_xdp_adjust_tail() to add some trailer to my packet.
> This looks like it adds tailroom to the last frag? But then
> how do I insert my trailer? I don't think we can without the
> extra multi-buffer access support, right?

You are right, we need some kind of multi-buffer access helpers.

> Also data_end will be unchanged, yet it will return 0, so my
> current programs will likely be a bit confused by this.

I guess this is the tricky part: applications need to be multi-buffer aware. If current applications rely on bpf_xdp_adjust_tail(+) to determine the maximum frame length, this approach might not work. In this case, we might need an additional helper to do tail expansion with multi-buffer support.

But then the question arises: how would an mb-unaware application behave in general when an mb packet is supplied? It would definitely not determine the correct packet length.

>> +	} else {
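
To make the length problem concrete, here is a minimal sketch of the mb-unaware pattern being discussed (the 1500-byte limit is illustrative): on a multi-buffer frame, ctx->data_end only bounds the head buffer, so the computed length misses sinfo->data_len and the check silently passes oversized packets.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_enforce_max_len(struct xdp_md *ctx)
{
	__u32 len = ctx->data_end - ctx->data;	/* head buffer only */

	/* Never fires for an mb frame whose head buffer fits in 1500
	 * bytes, even if the full packet is much larger.
	 */
	if (len > 1500)
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";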
John Fastabend June 24, 2021, 2:24 p.m. UTC | #3
Eelco Chaudron wrote:
> 
> 
> On 23 Jun 2021, at 1:37, John Fastabend wrote:
> 
> > Lorenzo Bianconi wrote:
> >> From: Eelco Chaudron <echaudro@redhat.com>
> >>
> >> This change adds support for tail growing and shrinking for XDP multi-buff.
> >>
> >
> > It would be nice if the commit message gave us some details on how the
> > growing/shrinking works in the multi-buff support.
> 
> Will add this to the next rev.
> 
[...]
> >
> > Can you add some comment on how this works? So today I call
> > bpf_xdp_adjust_tail() to add some trailer to my packet.
> > This looks like it adds tailroom to the last frag? But then
> > how do I insert my trailer? I don't think we can without the
> > extra multi-buffer access support, right?
> 
> You are right, we need some kind of multi-buffer access helpers.
> 
> > Also data_end will be unchanged, yet it will return 0, so my
> > current programs will likely be a bit confused by this.
> 
> > I guess this is the tricky part: applications need to be multi-buffer aware. If current applications rely on bpf_xdp_adjust_tail(+) to determine the maximum frame length, this approach might not work. In this case, we might need an additional helper to do tail expansion with multi-buffer support.
> > 
> > But then the question arises: how would an mb-unaware application behave in general when an mb packet is supplied? It would definitely not determine the correct packet length.

Right, that was my conclusion as well. Existing programs might
have subtle side effects if they start running on multi-buffer
drivers as is. I don't have any good ideas, though, on how
to handle this.

> 
> >> +	} else {
>
Zvi Effron June 24, 2021, 3:16 p.m. UTC | #4
On Thu, Jun 24, 2021 at 7:24 AM John Fastabend <john.fastabend@gmail.com> wrote:
>
> Eelco Chaudron wrote:
> >
> >
> > On 23 Jun 2021, at 1:37, John Fastabend wrote:
> >
> > > Lorenzo Bianconi wrote:
> > >> From: Eelco Chaudron <echaudro@redhat.com>
> > >>
> > >> This change adds support for tail growing and shrinking for XDP multi-buff.
> > >>
> > >
> > > It would be nice if the commit message gave us some details on how the
> > > growing/shrinking works in the multi-buff support.
> >
> > Will add this to the next rev.
> >
[...]
> > >
> > > Can you add some comment on how this works? So today I call
> > > bpf_xdp_adjust_tail() to add some trailer to my packet.
> > > This looks like it adds tailroom to the last frag? But then
> > > how do I insert my trailer? I don't think we can without the
> > > extra multi-buffer access support, right?
> >
> > You are right, we need some kind of multi-buffer access helpers.
> >
> > > Also data_end will be unchanged, yet it will return 0, so my
> > > current programs will likely be a bit confused by this.
> >
> > I guess this is the tricky part: applications need to be multi-buffer aware. If current applications rely on bpf_xdp_adjust_tail(+) to determine the maximum frame length, this approach might not work. In this case, we might need an additional helper to do tail expansion with multi-buffer support.
> > 
> > But then the question arises: how would an mb-unaware application behave in general when an mb packet is supplied? It would definitely not determine the correct packet length.
>
> Right, that was my conclusion as well. Existing programs might
> have subtle side effects if they start running on multi-buffer
> drivers as is. I don't have any good ideas, though, on how
> to handle this.
>

Would it be possible to detect multibuffer awareness of a program at load
(or attach) time, perhaps by looking for the use of the new multibuffer
helpers? That might make it possible to reject a non-multibuffer aware
program on multibuffer drivers (or maybe even put the driver into a
non-multibuffer mode at attach time), or at the very least issue a
warning?

> >
> > >> +  } else {
> >
>
>
Lorenzo Bianconi June 29, 2021, 1:19 p.m. UTC | #5
> Eelco Chaudron wrote:
> > 
> > 
> > On 23 Jun 2021, at 1:37, John Fastabend wrote:
> > 
> > > Lorenzo Bianconi wrote:
> > >> From: Eelco Chaudron <echaudro@redhat.com>
> > >>
> > >> This change adds support for tail growing and shrinking for XDP multi-buff.
> > >>
> > >
> > > It would be nice if the commit message gave us some details on how the
> > > growing/shrinking works in the multi-buff support.
[...]
> > I guess this is the tricky part: applications need to be multi-buffer aware. If current applications rely on bpf_xdp_adjust_tail(+) to determine the maximum frame length, this approach might not work. In this case, we might need an additional helper to do tail expansion with multi-buffer support.
> > 
> > But then the question arises: how would an mb-unaware application behave in general when an mb packet is supplied? It would definitely not determine the correct packet length.
> 
> Right, that was my conclusion as well. Existing programs might
> have subtle side effects if they start running on multi-buffer
> drivers as is. I don't have any good ideas, though, on how
> to handle this.

What about checking the program capabilities at load time (e.g. with a
special program type) and disabling the mb feature if the bpf program is
not mb-aware (e.g. forbidding an MTU greater than 1500B in XDP mode)?

Regards,
Lorenzo

> 
> > 
> > >> +	} else {
> > 
> 
>
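
A hypothetical sketch of the MTU guard Lorenzo suggests (nothing here is in the patch: the xdp_mb_aware flag and the helper itself are made up for illustration, and a real driver would check its own program pointer under its own locking):

static int xdp_check_mtu_for_prog(struct net_device *dev, int new_mtu)
{
	struct bpf_prog *prog = rtnl_dereference(dev->xdp_prog);

	/* Refuse an MTU that would require multi-buffer RX while an
	 * mb-unaware XDP program is attached.
	 */
	if (prog && !prog->aux->xdp_mb_aware && new_mtu > ETH_DATA_LEN)
		return -EOPNOTSUPP;

	return 0;
}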
Toke Høiland-Jørgensen June 29, 2021, 1:27 p.m. UTC | #6
Lorenzo Bianconi <lorenzo.bianconi@redhat.com> writes:

>> Eelco Chaudron wrote:
>> > 
>> > 
>> > On 23 Jun 2021, at 1:37, John Fastabend wrote:
>> > 
>> > > Lorenzo Bianconi wrote:
>> > >> From: Eelco Chaudron <echaudro@redhat.com>
>> > >>
>> > >> This change adds support for tail growing and shrinking for XDP multi-buff.
>> > >>
>> > >
>> > > It would be nice if the commit message gave us some details on how the
>> > > growing/shrinking works in the multi-buff support.
> [...]
>> > I guess this is the tricky part: applications need to be multi-buffer aware. If current applications rely on bpf_xdp_adjust_tail(+) to determine the maximum frame length, this approach might not work. In this case, we might need an additional helper to do tail expansion with multi-buffer support.
>> > 
>> > But then the question arises: how would an mb-unaware application behave in general when an mb packet is supplied? It would definitely not determine the correct packet length.
>> 
>> Right, that was my conclusion as well. Existing programs might
>> have subtle side effects if they start running on multi-buffer
>> drivers as is. I don't have any good ideas, though, on how
>> to handle this.
>
> What about checking the program capabilities at load time (e.g. with a
> special program type) and disabling the mb feature if the bpf program is
> not mb-aware (e.g. forbidding an MTU greater than 1500B in XDP mode)?

So what happens when that legacy program runs on a veth and gets an
mb-enabled frame redirected into it? :)

-Toke
Toke Høiland-Jørgensen July 6, 2021, 9:44 p.m. UTC | #7
Changing the subject to address this point specifically:

> Right, that was my conclusion as well. Existing programs might have
> subtle side effects if they start running on multi-buffer drivers as
> is. I don't have any good ideas, though, on how to handle this.

So I had a chat about this with Lorenzo, Eelco and Jesper today, and
promised I'd summarise our discussion to you all, so this is my attempt
at that. Please excuse the long email, I'm just trying to be
comprehensive :)

So first off, a problem description: if an existing XDP program is
exposed to an xdp_buff that is really a multi-buffer frame, it may end
up with subtle and hard-to-debug bugs: if it's parsing the packet it'll
only see part of the payload and not be aware of that fact, and if it's
calculating the packet length, the result will also be wrong (it only
counts the first fragment).

So what to do about this? First of all, to do anything about it, XDP
programs need to be able to declare themselves "multi-buffer aware" (but
see point 1 below). We could try to auto-detect it in the verifier by
which helpers the program is using, but since existing programs could be
perfectly happy to just keep running, it probably needs to be something
the program communicates explicitly. One option is to use the
expected_attach_type to encode this; programs can then declare it in the
source by section name, or the userspace loader can set the type for
existing programs if needed.
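
A hypothetical loader-side sketch of that declaration (the libbpf calls shown exist, but BPF_XDP_MB is not a real attach type; it stands in for whatever expected_attach_type value would be chosen to mean "mb-aware"):

#include <bpf/libbpf.h>

int load_mb_aware(const char *path)
{
	struct bpf_object *obj = bpf_object__open_file(path, NULL);
	struct bpf_program *prog;

	if (!obj)
		return -1;

	/* Declare every program in the object multi-buffer aware before
	 * load, on behalf of a legacy program that doesn't say so itself.
	 */
	bpf_object__for_each_program(prog, obj)
		bpf_program__set_expected_attach_type(prog, BPF_XDP_MB);

	return bpf_object__load(obj);
}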

With this, the kernel will know if a given XDP program is multi-buff
aware and can decide what to do with that information. For this we came
up with basically three options:

1. Do nothing. This would make it up to users / sysadmins to avoid
   anything breaking by manually making sure to not enable multi-buffer
   support while loading any XDP programs that will malfunction if
   presented with an mb frame. This will probably break in interesting
   ways, but it's nice and simple from an implementation PoV. With this
   we don't need the declaration discussed above either.

2. Add a check at runtime and drop the frames if they are mb-enabled and
   the program doesn't understand it. This is relatively simple to
   implement (a minimal sketch of such a check follows this list), but it
   also makes for difficult-to-understand issues (why are my packets
   suddenly being dropped?), and it will incur runtime overhead.

3. Reject loading of programs that are not MB-aware when running in an
   MB-enabled mode. This would make things break in more obvious ways,
   and still allow a userspace loader to declare a program "MB-aware" to
   force it to run if necessary. The problem then becomes: at what level
   should this be blocked?

   Doing this at the driver level is not enough: while a particular
   driver knows if it's running in multi-buff mode, we can't know for
   sure if a particular XDP program is multi-buff aware at attach time:
   it could be tail-calling other programs, or redirecting packets to
   another interface where it will be processed by a non-MB aware
   program.

   So another option is to make it a global toggle: e.g., create a new
   sysctl to enable multi-buffer. If this is set, reject loading any XDP
   program that doesn't support multi-buffer mode, and if it's unset,
   disable multi-buffer mode in all drivers. This will make it explicit
   when the multi-buffer mode is used, and prevent any accidental subtle
   malfunction of existing XDP programs. The drawback is that it's a
   mode switch, so more configuration complexity.
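
For option 2, the runtime guard could look something like this hypothetical sketch (xdp_buff_is_mb() is introduced by this series; the prog->aux->xdp_mb_aware flag is made up here):

static __always_inline u32 xdp_run_prog_mb_safe(const struct bpf_prog *prog,
						struct xdp_buff *xdp)
{
	/* Drop mb frames before an mb-unaware program can misparse them. */
	if (xdp_buff_is_mb(xdp) && !prog->aux->xdp_mb_aware)
		return XDP_DROP;

	return bpf_prog_run_xdp(prog, xdp);
}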

None of these options are ideal, of course, but I hope the above
explanation at least makes sense. If anyone has any better ideas (or can
spot any flaws in the reasoning above) please don't hesitate to let us
know!

-Toke

Patch

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 935a6f83115f..3525801c6ed5 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -132,6 +132,11 @@  xdp_get_shared_info_from_buff(struct xdp_buff *xdp)
 	return (struct skb_shared_info *)xdp_data_hard_end(xdp);
 }
 
+static inline unsigned int xdp_get_frag_tailroom(const skb_frag_t *frag)
+{
+	return PAGE_SIZE - skb_frag_size(frag) - skb_frag_off(frag);
+}
+
 struct xdp_frame {
 	void *data;
 	u16 len;
@@ -259,6 +264,8 @@  struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
 	return xdp_frame;
 }
 
+void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+		  struct xdp_buff *xdp);
 void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
diff --git a/net/core/filter.c b/net/core/filter.c
index caa88955562e..05f574a3d690 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3859,11 +3859,73 @@  static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static int bpf_xdp_mb_adjust_tail(struct xdp_buff *xdp, int offset)
+{
+	struct skb_shared_info *sinfo;
+
+	if (unlikely(!xdp_buff_is_mb(xdp)))
+		return -EINVAL;
+
+	sinfo = xdp_get_shared_info_from_buff(xdp);
+	if (offset >= 0) {
+		skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
+		int size;
+
+		if (unlikely(offset > xdp_get_frag_tailroom(frag)))
+			return -EINVAL;
+
+		size = skb_frag_size(frag);
+		memset(skb_frag_address(frag) + size, 0, offset);
+		skb_frag_size_set(frag, size + offset);
+		sinfo->data_len += offset;
+	} else {
+		int i, n_frags_free = 0, len_free = 0;
+
+		offset = abs(offset);
+		if (unlikely(offset > ((int)(xdp->data_end - xdp->data) +
+				       sinfo->data_len - ETH_HLEN)))
+			return -EINVAL;
+
+		for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
+			skb_frag_t *frag = &sinfo->frags[i];
+			int size = skb_frag_size(frag);
+			int shrink = min_t(int, offset, size);
+
+			len_free += shrink;
+			offset -= shrink;
+
+			if (unlikely(size == shrink)) {
+				struct page *page = skb_frag_page(frag);
+
+				__xdp_return(page_address(page), &xdp->rxq->mem,
+					     false, NULL);
+				n_frags_free++;
+			} else {
+				skb_frag_size_set(frag, size - shrink);
+				break;
+			}
+		}
+		sinfo->nr_frags -= n_frags_free;
+		sinfo->data_len -= len_free;
+
+		if (unlikely(!sinfo->nr_frags))
+			xdp_buff_clear_mb(xdp);
+
+		if (unlikely(offset > 0))
+			xdp->data_end -= offset;
+	}
+
+	return 0;
+}
+
 BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
 {
 	void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
 	void *data_end = xdp->data_end + offset;
 
+	if (unlikely(xdp_buff_is_mb(xdp)))
+		return bpf_xdp_mb_adjust_tail(xdp, offset);
+
 	/* Notice that xdp_data_hard_end have reserved some tailroom */
 	if (unlikely(data_end > data_hard_end))
 		return -EINVAL;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 71bedf6049a1..ffd70d3e9e5d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -338,8 +338,8 @@  EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
  * is used for those calls sites.  Thus, allowing for faster recycling
  * of xdp_frames/pages in those cases.
  */
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
-			 struct xdp_buff *xdp)
+void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+		  struct xdp_buff *xdp)
 {
 	struct xdp_mem_allocator *xa;
 	struct page *page;
@@ -372,6 +372,7 @@  static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		break;
 	}
 }
+EXPORT_SYMBOL_GPL(__xdp_return);
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
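
For completeness, a usage sketch of the shrink path added above (the 256-byte trim is arbitrary, for illustration): a negative offset may cross fragment boundaries, in which case the helper walks the frags from the tail, releases any frag it empties via __xdp_return(), clears the mb flag when no frags remain, and takes any remainder off the linear area via data_end.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_trim_tail(struct xdp_md *ctx)
{
	/* Remove 256 bytes from the end of the frame, wherever they live. */
	if (bpf_xdp_adjust_tail(ctx, -256))
		return XDP_ABORTED;	/* frame too short to trim */

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";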