
[bpf-next,v7,10/17] veth: Support RX XDP metadata

Message ID 20230112003230.3779451-11-sdf@google.com (mailing list archive)
State Superseded
Delegated to: BPF
Series: xdp: hints via kfuncs

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-PR fail merge-conflict
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count fail Series longer than 15 patches (and no cover letter)
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 4 maintainers not CCed: edumazet@google.com davem@davemloft.net pabeni@redhat.com hawk@kernel.org
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 67 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 pending Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16

Commit Message

Stanislav Fomichev Jan. 12, 2023, 12:32 a.m. UTC
The goal is to enable end-to-end testing of the metadata for AF_XDP.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Anatoly Burakov <anatoly.burakov@intel.com>
Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
Cc: Maryam Tahhan <mtahhan@redhat.com>
Cc: xdp-hints@xdp-project.net
Cc: netdev@vger.kernel.org
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 drivers/net/veth.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

Comments

Jesper Dangaard Brouer Jan. 16, 2023, 4:21 p.m. UTC | #1
On 12/01/2023 01.32, Stanislav Fomichev wrote:
> The goal is to enable end-to-end testing of the metadata for AF_XDP.

For me the goal with veth goes beyond *testing*.

This patch ignores the xdp_frame case.  I'm not blocking this patch, but
I'm saying we need to make sure there is a way forward for accessing
XDP-hints when handling redirected xdp_frame's.

I have two use-cases we should cover (as future work).

(#1) We have customers that want to redirect from physical NIC hardware
into containers, and then have the veth XDP-prog (selectively) redirect
into an AF_XDP socket (when matching fastpath packets).  Here they want
(at minimum) access to the XDP hint info on the HW checksum.

(#2) Both veth and cpumap can create SKBs based on xdp_frame's.  Here it
is essential to get the HW checksum and HW hash when creating these SKBs
(else the netstack has to do expensive csum calc and parsing in the
flow-dissector).

> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: David Ahern <dsahern@gmail.com>
> Cc: Martin KaFai Lau <martin.lau@linux.dev>
> Cc: Jakub Kicinski <kuba@kernel.org>
> Cc: Willem de Bruijn <willemb@google.com>
> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>
> Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
> Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
> Cc: Maryam Tahhan <mtahhan@redhat.com>
> Cc: xdp-hints@xdp-project.net
> Cc: netdev@vger.kernel.org
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>   drivers/net/veth.c | 31 +++++++++++++++++++++++++++++++
>   1 file changed, 31 insertions(+)
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 70f50602287a..ba3e05832843 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -118,6 +118,7 @@ static struct {
>   
>   struct veth_xdp_buff {
>   	struct xdp_buff xdp;
> +	struct sk_buff *skb;
>   };
>   
>   static int veth_get_link_ksettings(struct net_device *dev,
> @@ -602,6 +603,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
>   
>   		xdp_convert_frame_to_buff(frame, xdp);
>   		xdp->rxq = &rq->xdp_rxq;
> +		vxbuf.skb = NULL;
>   
>   		act = bpf_prog_run_xdp(xdp_prog, xdp);
>   
> @@ -823,6 +825,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>   	__skb_push(skb, skb->data - skb_mac_header(skb));
>   	if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
>   		goto drop;
> +	vxbuf.skb = skb;
>   
>   	orig_data = xdp->data;
>   	orig_data_end = xdp->data_end;
> @@ -1602,6 +1605,28 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>   	}
>   }
>   
> +static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
> +{
> +	struct veth_xdp_buff *_ctx = (void *)ctx;
> +
> +	if (!_ctx->skb)
> +		return -EOPNOTSUPP;
> +
> +	*timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;

The SKB stores this skb_hwtstamps() in the skb_shared_info memory area.
This memory area is actually also available to xdp_frames.  Thus, we
could store the HW rx_timestamp in the same location for redirected
xdp_frames.  This could make code path sharing possible between the SKB
and xdp_frame paths in veth.

This would also make it fast to "transfer" the HW rx_timestamp when creating
an SKB from an xdp_frame, as the data is already written in the correct place.

Performance-wise the downside is that the skb_shared_info memory area is in
a separate cacheline.  Thus, when no HW rx_timestamp is available, it is
very expensive for a veth XDP bpf-prog to access this, just to get
a zero back.  Having an xdp_frame->flags bit that indicates whether a HW
rx_timestamp has been stored can mitigate this.
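
A rough sketch of how the xdp_frame path could then look in veth
(XDP_FLAGS_HW_RX_TIMESTAMP is a hypothetical flag name, nothing like it
exists today; xdp_get_shared_info_from_buff() is the existing helper
from include/net/xdp.h):

static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
{
	struct veth_xdp_buff *_ctx = (void *)ctx;
	struct skb_shared_info *sinfo;

	/* SKB path: same as the patch above. */
	if (_ctx->skb) {
		*timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
		return 0;
	}

	/* Redirected xdp_frame path: the hypothetical flag bit avoids
	 * touching the cold skb_shared_info cacheline when no HW
	 * rx_timestamp was stored.
	 */
	if (!(_ctx->xdp.flags & XDP_FLAGS_HW_RX_TIMESTAMP))
		return -EOPNOTSUPP;

	sinfo = xdp_get_shared_info_from_buff(&_ctx->xdp);
	*timestamp = sinfo->hwtstamps.hwtstamp;
	return 0;
}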


> +	return 0;
> +}
> +
> +static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
> +{
> +	struct veth_xdp_buff *_ctx = (void *)ctx;
> +
> +	if (!_ctx->skb)
> +		return -EOPNOTSUPP;

For the xdp_frame case, I'm considering simply storing the u32 RX-hash in
struct xdp_frame.  This makes it easy to extract for the xdp_frame-to-SKB
create use-case.

As has been mentioned before, the SKB also requires knowing the RSS
hash-type.  This HW hash-type actually contains a lot of information
that today is lost when reduced to the SKB hash-type.  Due to
standardization from Microsoft, most HW provides info on (L3) IPv4 or
IPv6, and on (L4) TCP or UDP (and often SCTP).  Often the hardware
descriptor also provides info on the header length.  Future work in this
area is exciting, as we can speed up parsing of packets in XDP if we can
get more detailed HW info on the hash "packet-type".
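
For the xdp_frame-to-SKB use-case, a minimal sketch (the hash/hash_type
members and XDP_HASH_TYPE_L4 are hypothetical additions; skb_set_hash()
is the existing helper):

static void xdp_frame_hash_to_skb(const struct xdp_frame *frame,
				  struct sk_buff *skb)
{
	/* 'hash' and 'hash_type' are hypothetical new xdp_frame members. */
	if (!frame->hash)
		return;

	skb_set_hash(skb, frame->hash,
		     (frame->hash_type & XDP_HASH_TYPE_L4) ?
		     PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3);
}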

> +
> +	*hash = skb_get_hash(_ctx->skb);
> +	return 0;
> +}
> +

--Jesper
Stanislav Fomichev Jan. 17, 2023, 8:33 p.m. UTC | #2
On Mon, Jan 16, 2023 at 8:21 AM Jesper Dangaard Brouer
<jbrouer@redhat.com> wrote:
>
>
>
> On 12/01/2023 01.32, Stanislav Fomichev wrote:
> > The goal is to enable end-to-end testing of the metadata for AF_XDP.
>
> For me the goal with veth goes beyond *testing*.
>
> This patch ignores the xdp_frame case.  I'm not blocking this patch, but
> I'm saying we need to make sure there is a way forward for accessing
> XDP-hints when handling redirected xdp_frame's.

Sure, let's work towards getting that other part addressed!

> I have two use-cases we should cover (as future work).
>
> (#1) We have customers that want to redirect from physical NIC hardware
> into containers, and then have the veth XDP-prog (selectively) redirect
> into an AF_XDP socket (when matching fastpath packets).  Here they want
> (at minimum) access to the XDP hint info on the HW checksum.
>
> (#2) Both veth and cpumap can create SKBs based on xdp_frame's.  Here it
> is essential to get the HW checksum and HW hash when creating these SKBs
> (else the netstack has to do expensive csum calc and parsing in the
> flow-dissector).

From my PoV, I'd probably have to look into the TX side first (TX
timestamp) before looking into the xdp->skb path. So if somebody on your
side has cycles, feel free to drive this effort. I'm happy to provide
reviews/comments/etc. I think we've discussed in the past that this
will most likely look like another set of "export" kfuncs?

We can start with extending new
Documentation/networking/xdp-rx-metadata.rst with a high-level design.
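
Roughly, I'd imagine something like the following on the BPF side (the
"export" names are made up, nothing like this exists yet): the XDP prog
on the physical NIC reads a hint via the existing kfuncs and exports it
before redirecting, so the veth/cpumap side can pick it up later.

/* Hypothetical "export" kfuncs, mirroring the read-side ones: */
int bpf_xdp_metadata_export_rx_timestamp(struct xdp_md *ctx, u64 timestamp);
int bpf_xdp_metadata_export_rx_hash(struct xdp_md *ctx, u32 hash);

/* In the NIC-side XDP prog, before redirecting into the veth: */
if (!bpf_xdp_metadata_rx_hash(ctx, &hash))
	bpf_xdp_metadata_export_rx_hash(ctx, hash);
return bpf_redirect(veth_ifindex, 0);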

> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: David Ahern <dsahern@gmail.com>
> > Cc: Martin KaFai Lau <martin.lau@linux.dev>
> > Cc: Jakub Kicinski <kuba@kernel.org>
> > Cc: Willem de Bruijn <willemb@google.com>
> > Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> > Cc: Anatoly Burakov <anatoly.burakov@intel.com>
> > Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
> > Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
> > Cc: Maryam Tahhan <mtahhan@redhat.com>
> > Cc: xdp-hints@xdp-project.net
> > Cc: netdev@vger.kernel.org
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >   drivers/net/veth.c | 31 +++++++++++++++++++++++++++++++
> >   1 file changed, 31 insertions(+)
> >
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> > index 70f50602287a..ba3e05832843 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -118,6 +118,7 @@ static struct {
> >
> >   struct veth_xdp_buff {
> >       struct xdp_buff xdp;
> > +     struct sk_buff *skb;
> >   };
> >
> >   static int veth_get_link_ksettings(struct net_device *dev,
> > @@ -602,6 +603,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
> >
> >               xdp_convert_frame_to_buff(frame, xdp);
> >               xdp->rxq = &rq->xdp_rxq;
> > +             vxbuf.skb = NULL;
> >
> >               act = bpf_prog_run_xdp(xdp_prog, xdp);
> >
> > @@ -823,6 +825,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> >       __skb_push(skb, skb->data - skb_mac_header(skb));
> >       if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
> >               goto drop;
> > +     vxbuf.skb = skb;
> >
> >       orig_data = xdp->data;
> >       orig_data_end = xdp->data_end;
> > @@ -1602,6 +1605,28 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> >       }
> >   }
> >
> > +static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
> > +{
> > +     struct veth_xdp_buff *_ctx = (void *)ctx;
> > +
> > +     if (!_ctx->skb)
> > +             return -EOPNOTSUPP;
> > +
> > +     *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
>
> The SKB stores this skb_hwtstamps() in the skb_shared_info memory area.
> This memory area is actually also available to xdp_frames.  Thus, we
> could store the HW rx_timestamp in the same location for redirected
> xdp_frames.  This could make code path sharing possible between the SKB
> and xdp_frame paths in veth.
>
> This would also make it fast to "transfer" the HW rx_timestamp when creating
> an SKB from an xdp_frame, as the data is already written in the correct place.
>
> Performance-wise the downside is that the skb_shared_info memory area is in
> a separate cacheline.  Thus, when no HW rx_timestamp is available, it is
> very expensive for a veth XDP bpf-prog to access this, just to get
> a zero back.  Having an xdp_frame->flags bit that indicates whether a HW
> rx_timestamp has been stored can mitigate this.

That's one way to do it, although I'm not sure about the cases that
don't use an xdp_frame and use a stack-allocated xdp_buff.

> > +     return 0;
> > +}
> > +
> > +static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
> > +{
> > +     struct veth_xdp_buff *_ctx = (void *)ctx;
> > +
> > +     if (!_ctx->skb)
> > +             return -EOPNOTSUPP;
>
> For the xdp_frame case, I'm considering simply storing the u32 RX-hash in
> struct xdp_frame.  This makes it easy to extract for the xdp_frame-to-SKB
> create use-case.
>
> As has been mentioned before, the SKB also requires knowing the RSS
> hash-type.  This HW hash-type actually contains a lot of information
> that today is lost when reduced to the SKB hash-type.  Due to
> standardization from Microsoft, most HW provides info on (L3) IPv4 or
> IPv6, and on (L4) TCP or UDP (and often SCTP).  Often the hardware
> descriptor also provides info on the header length.  Future work in this
> area is exciting, as we can speed up parsing of packets in XDP if we can
> get more detailed HW info on the hash "packet-type".

Something like the version we've discussed a while back [0]?
Seems workable overall if we remove it from the UAPI? (not everyone
was happy about the UAPI parts IIRC)

0: https://lore.kernel.org/bpf/20221115030210.3159213-7-sdf@google.com/


> > +
> > +     *hash = skb_get_hash(_ctx->skb);
> > +     return 0;
> > +}
> > +
>
> --Jesper
>
Jesper Dangaard Brouer Jan. 18, 2023, 3:57 p.m. UTC | #3
On 17/01/2023 21.33, Stanislav Fomichev wrote:
> On Mon, Jan 16, 2023 at 8:21 AM Jesper Dangaard Brouer
> <jbrouer@redhat.com> wrote:
>>
>>
>> On 12/01/2023 01.32, Stanislav Fomichev wrote:
>>> The goal is to enable end-to-end testing of the metadata for AF_XDP.
>>
>> For me the goal with veth goes beyond *testing*.
>>
>> This patch ignores the xdp_frame case.  I'm not blocking this patch, but
>> I'm saying we need to make sure there is a way forward for accessing
>> XDP-hints when handling redirected xdp_frame's.
> 
> Sure, let's work towards getting that other part addressed!
> 
>> I have two use-cases we should cover (as future work).
>>
>> (#1) We have customers that want to redirect from physical NIC hardware
>> into containers, and then have the veth XDP-prog (selectively) redirect
>> into an AF_XDP socket (when matching fastpath packets).  Here they want
>> (at minimum) access to the XDP hint info on the HW checksum.
>>
>> (#2) Both veth and cpumap can create SKBs based on xdp_frame's.  Here it
>> is essential to get the HW checksum and HW hash when creating these SKBs
>> (else the netstack has to do expensive csum calc and parsing in the
>> flow-dissector).
> 
> From my PoV, I'd probably have to look into the TX side first (TX
> timestamp) before looking into the xdp->skb path. So if somebody on your
> side has cycles, feel free to drive this effort. I'm happy to provide
> reviews/comments/etc. I think we've discussed in the past that this
> will most likely look like another set of "export" kfuncs?
> 

I can use some cycles to look at this and come up with some PoC code.

Yes, I'm thinking of creating 'another set of "export" kfuncs' as you
say. I'm no longer suggesting to add a "store" flag to this patchset's
kfuncs.

The advantage with another set of store/export kfuncs is that these
can live in the core-kernel code (i.e. not in drivers), and hopefully we
can "unroll" these as BPF-prog code (as you did earlier).


> We can start with extending new
> Documentation/networking/xdp-rx-metadata.rst with a high-level design.

Sure, but I'll try to get some code working first.

>>> Cc: John Fastabend <john.fastabend@gmail.com>
>>> Cc: David Ahern <dsahern@gmail.com>
>>> Cc: Martin KaFai Lau <martin.lau@linux.dev>
>>> Cc: Jakub Kicinski <kuba@kernel.org>
>>> Cc: Willem de Bruijn <willemb@google.com>
>>> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
>>> Cc: Anatoly Burakov <anatoly.burakov@intel.com>
>>> Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
>>> Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
>>> Cc: Maryam Tahhan <mtahhan@redhat.com>
>>> Cc: xdp-hints@xdp-project.net
>>> Cc: netdev@vger.kernel.org
>>> Signed-off-by: Stanislav Fomichev <sdf@google.com>
>>> ---
>>>    drivers/net/veth.c | 31 +++++++++++++++++++++++++++++++
>>>    1 file changed, 31 insertions(+)
>>>
>>> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
>>> index 70f50602287a..ba3e05832843 100644
>>> --- a/drivers/net/veth.c
>>> +++ b/drivers/net/veth.c
>>> @@ -118,6 +118,7 @@ static struct {
>>>
>>>    struct veth_xdp_buff {
>>>        struct xdp_buff xdp;
>>> +     struct sk_buff *skb;
>>>    };
>>>
>>>    static int veth_get_link_ksettings(struct net_device *dev,
>>> @@ -602,6 +603,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
>>>
>>>                xdp_convert_frame_to_buff(frame, xdp);
>>>                xdp->rxq = &rq->xdp_rxq;
>>> +             vxbuf.skb = NULL;
>>>
>>>                act = bpf_prog_run_xdp(xdp_prog, xdp);
>>>
>>> @@ -823,6 +825,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>>>        __skb_push(skb, skb->data - skb_mac_header(skb));
>>>        if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
>>>                goto drop;
>>> +     vxbuf.skb = skb;
>>>
>>>        orig_data = xdp->data;
>>>        orig_data_end = xdp->data_end;
>>> @@ -1602,6 +1605,28 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>>>        }
>>>    }
>>>
>>> +static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
>>> +{
>>> +     struct veth_xdp_buff *_ctx = (void *)ctx;
>>> +
>>> +     if (!_ctx->skb)
>>> +             return -EOPNOTSUPP;
>>> +
>>> +     *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
>>
>> The SKB stores this skb_hwtstamps() in the skb_shared_info memory area.
>> This memory area is actually also available to xdp_frames.  Thus, we
>> could store the HW rx_timestamp in the same location for redirected
>> xdp_frames.  This could make code path sharing possible between the SKB
>> and xdp_frame paths in veth.
>>
>> This would also make it fast to "transfer" the HW rx_timestamp when creating
>> an SKB from an xdp_frame, as the data is already written in the correct place.
>>
>> Performance-wise the downside is that the skb_shared_info memory area is in
>> a separate cacheline.  Thus, when no HW rx_timestamp is available, it is
>> very expensive for a veth XDP bpf-prog to access this, just to get
>> a zero back.  Having an xdp_frame->flags bit that indicates whether a HW
>> rx_timestamp has been stored can mitigate this.
> 
> That's one way to do it, although I'm not sure about the cases that
> don't use an xdp_frame and use a stack-allocated xdp_buff.

Above I should have said xdp_buff->flags, to make it clearer that
this doesn't depend on xdp_frame.  The xdp_buff->flags gets copied to
xdp_frame->flags, so I see them as equivalent.

The skb_shared_info memory area is also available to xdp_buff's.
(See the #define xdp_data_hard_end in include/net/xdp.h.)
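
For reference, those helpers look roughly like this in include/net/xdp.h:

#define xdp_data_hard_end(xdp)				\
	((xdp)->data_hard_start + (xdp)->frame_sz -	\
	 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

static inline struct skb_shared_info *
xdp_get_shared_info_from_buff(struct xdp_buff *xdp)
{
	return (struct skb_shared_info *)xdp_data_hard_end(xdp);
}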


>>> +     return 0;
>>> +}
>>> +
>>> +static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
>>> +{
>>> +     struct veth_xdp_buff *_ctx = (void *)ctx;
>>> +
>>> +     if (!_ctx->skb)
>>> +             return -EOPNOTSUPP;
>>
>> For the xdp_frame case, I'm considering simply storing the u32 RX-hash in
>> struct xdp_frame.  This makes it easy to extract for the xdp_frame-to-SKB
>> create use-case.
>>
>> As has been mentioned before, the SKB also requires knowing the RSS
>> hash-type.  This HW hash-type actually contains a lot of information
>> that today is lost when reduced to the SKB hash-type.  Due to
>> standardization from Microsoft, most HW provides info on (L3) IPv4 or
>> IPv6, and on (L4) TCP or UDP (and often SCTP).  Often the hardware
>> descriptor also provides info on the header length.  Future work in this
>> area is exciting, as we can speed up parsing of packets in XDP if we can
>> get more detailed HW info on the hash "packet-type".
> 
> Something like the version we've discussed a while back [0]?
> Seems workable overall if we remove it from the UAPI? (not everyone
> was happy about the UAPI parts IIRC)
> 
> 0: https://lore.kernel.org/bpf/20221115030210.3159213-7-sdf@google.com/

Yes, somewhat similar to [0].
Except that:

(1) Have a more granular design with more kfuncs for individually
exporting hints (like this patchset), giving the BPF programmer more
flexibility (but unrolled as BPF byte-code to avoid func-call overhead).

(2) No UAPI except the kfunc calls, and kernel code can hide where the
hints are stored, e.g. as a member in struct xdp_frame, in the
skb_shared_info memory area, or somehow in the metadata memory area; see
the sketch below.  (Hopefully avoiding too much bikeshedding about the
memory area, as we are free to change this later.)
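
As a sketch of (2), a store kfunc could live entirely in core code and
hide the storage location (the xdp_hash member and XDP_FLAGS_HW_RX_HASH
flag below are hypothetical):

static int xdp_metadata_store_rx_hash(struct xdp_buff *xdp, u32 hash)
{
	/* Where the hint lands is hidden from the BPF prog, so we are
	 * free to move it later (xdp_frame member, skb_shared_info,
	 * or the metadata area).
	 */
	xdp_get_shared_info_from_buff(xdp)->xdp_hash = hash;
	xdp->flags |= XDP_FLAGS_HW_RX_HASH;
	return 0;
}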

--Jesper

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 70f50602287a..ba3e05832843 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -118,6 +118,7 @@  static struct {
 
 struct veth_xdp_buff {
 	struct xdp_buff xdp;
+	struct sk_buff *skb;
 };
 
 static int veth_get_link_ksettings(struct net_device *dev,
@@ -602,6 +603,7 @@  static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
 
 		xdp_convert_frame_to_buff(frame, xdp);
 		xdp->rxq = &rq->xdp_rxq;
+		vxbuf.skb = NULL;
 
 		act = bpf_prog_run_xdp(xdp_prog, xdp);
 
@@ -823,6 +825,7 @@  static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 	__skb_push(skb, skb->data - skb_mac_header(skb));
 	if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
 		goto drop;
+	vxbuf.skb = skb;
 
 	orig_data = xdp->data;
 	orig_data_end = xdp->data_end;
@@ -1602,6 +1605,28 @@  static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+{
+	struct veth_xdp_buff *_ctx = (void *)ctx;
+
+	if (!_ctx->skb)
+		return -EOPNOTSUPP;
+
+	*timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
+	return 0;
+}
+
+static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash)
+{
+	struct veth_xdp_buff *_ctx = (void *)ctx;
+
+	if (!_ctx->skb)
+		return -EOPNOTSUPP;
+
+	*hash = skb_get_hash(_ctx->skb);
+	return 0;
+}
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init            = veth_dev_init,
 	.ndo_open            = veth_open,
@@ -1623,6 +1648,11 @@  static const struct net_device_ops veth_netdev_ops = {
 	.ndo_get_peer_dev	= veth_peer_dev,
 };
 
+static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
+	.xmo_rx_timestamp		= veth_xdp_rx_timestamp,
+	.xmo_rx_hash			= veth_xdp_rx_hash,
+};
+
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
 		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
 		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
@@ -1639,6 +1669,7 @@  static void veth_setup(struct net_device *dev)
 	dev->priv_flags |= IFF_PHONY_HEADROOM;
 
 	dev->netdev_ops = &veth_netdev_ops;
+	dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
 	dev->ethtool_ops = &veth_ethtool_ops;
 	dev->features |= NETIF_F_LLTX;
 	dev->features |= VETH_FEATURES;
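
For completeness, a minimal sketch of a BPF-side consumer of these two
hooks, using the kfuncs added earlier in this series (the program has to
be loaded with prog_ifindex pointing at the veth device so the kfuncs
resolve to the implementations above):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,
					 __u64 *timestamp) __ksym;
extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx,
				    __u32 *hash) __ksym;

SEC("xdp")
int rx_hints(struct xdp_md *ctx)
{
	__u64 ts = 0;
	__u32 hash = 0;

	/* Both return -EOPNOTSUPP on the xdp_frame (redirect) path,
	 * since vxbuf.skb is NULL there.
	 */
	bpf_xdp_metadata_rx_timestamp(ctx, &ts);
	bpf_xdp_metadata_rx_hash(ctx, &hash);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";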