
[RFC,bpf-next,2/5] veth: Support rx timestamp metadata for xdp

Message ID 20221027200019.4106375-3-sdf@google.com (mailing list archive)
State Superseded
Delegated to: BPF
Series xdp: hints via kfuncs

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 pending Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain }}
bpf/vmtest-bpf-next-VM_Test-2 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 160 this patch: 85
netdev/cc_maintainers warning 4 maintainers not CCed: pabeni@redhat.com hawk@kernel.org edumazet@google.com davem@davemloft.net
netdev/build_clang fail Errors and warnings before: 16 this patch: 16
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 84 this patch: 29
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns; WARNING: line length of 87 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Stanislav Fomichev Oct. 27, 2022, 8 p.m. UTC
xskxceiver conveniently sets up veth pairs, so it seems logical
to use veth as an example for some of the metadata handling.

We timestamp the skb right when we "receive" it, store its
pointer in xdp_buff->priv, and generate BPF bytecode to
reach it from the BPF program.

This largely follows the idea of "store some queue context in
the xdp_buff/xdp_frame so the metadata can be reached
from the BPF program".

Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Anatoly Burakov <anatoly.burakov@intel.com>
Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
Cc: Maryam Tahhan <mtahhan@redhat.com>
Cc: xdp-hints@xdp-project.net
Cc: netdev@vger.kernel.org
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 drivers/net/veth.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
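
For context, here is a minimal sketch of how an XDP program might consume
these kfuncs. The kfunc names and __ksym declarations below are assumptions
based on patch 1 of this series, not verified signatures:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Hypothetical kfunc prototypes; the actual names/signatures come from
 * the xdp_metadata kfunc set introduced in patch 1 of the series.
 */
extern bool bpf_xdp_metadata_have_rx_timestamp(struct xdp_md *ctx) __ksym;
extern __u64 bpf_xdp_metadata_rx_timestamp(struct xdp_md *ctx) __ksym;

SEC("xdp")
int rx_ts_example(struct xdp_md *ctx)
{
	__u64 ts;

	/* Unrolled by veth to "return true" (see veth_unroll_kfunc) */
	if (!bpf_xdp_metadata_have_rx_timestamp(ctx))
		return XDP_PASS;

	/* Unrolled to a NULL-checked load of xdp_buff->priv->tstamp */
	ts = bpf_xdp_metadata_rx_timestamp(ctx);
	bpf_printk("rx tstamp: %llu", ts);
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";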

Comments

Jesper Dangaard Brouer Oct. 28, 2022, 8:40 a.m. UTC | #1
On 27/10/2022 22.00, Stanislav Fomichev wrote:
> xskxceiver conveniently sets up veth pairs, so it seems logical
> to use veth as an example for some of the metadata handling.
> 
> We timestamp the skb right when we "receive" it, store its
> pointer in xdp_buff->priv, and generate BPF bytecode to
> reach it from the BPF program.
> 
> This largely follows the idea of "store some queue context in
> the xdp_buff/xdp_frame so the metadata can be reached
> from the BPF program".
> 
> Cc: Martin KaFai Lau <martin.lau@linux.dev>
> Cc: Jakub Kicinski <kuba@kernel.org>
> Cc: Willem de Bruijn <willemb@google.com>
> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>
> Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
> Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
> Cc: Maryam Tahhan <mtahhan@redhat.com>
> Cc: xdp-hints@xdp-project.net
> Cc: netdev@vger.kernel.org
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>   drivers/net/veth.c | 31 +++++++++++++++++++++++++++++++
>   1 file changed, 31 insertions(+)
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 09682ea3354e..35396dd73de0 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -597,6 +597,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
>   
>   		xdp_convert_frame_to_buff(frame, &xdp);
>   		xdp.rxq = &rq->xdp_rxq;
> +		xdp.priv = NULL;

So, why isn't this supported for normal XDP mode?!?
E.g. where veth receives an XDP-redirected xdp_frame.

My main use case (for veth) is to make NIC hardware hints available to
containers.  Thus, creating a flexible fast-path via XDP-redirect
directly into a container's veth device.  (This is e.g. for replacing
the inflexible SR-IOV approach of placing SR-IOV net_devices in the
container with a more cloud-friendly approach.)

How can we extend this approach to handle xdp_frames from different
net_devices?


>   
>   		act = bpf_prog_run_xdp(xdp_prog, &xdp);
>   
> @@ -820,6 +821,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>   
>   	orig_data = xdp.data;
>   	orig_data_end = xdp.data_end;
> +	xdp.priv = skb;
>   

So, enabling SKB based path only.

>   	act = bpf_prog_run_xdp(xdp_prog, &xdp);
>   
> @@ -936,6 +938,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>   			struct sk_buff *skb = ptr;
>   
>   			stats->xdp_bytes += skb->len;
> +			__net_timestamp(skb);
>   			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
>   			if (skb) {
>   				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
> @@ -1595,6 +1598,33 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>   	}
>   }
>   
> +static int veth_unroll_kfunc(struct bpf_prog *prog, struct bpf_insn *insn)
> +{
> +	u32 func_id = insn->imm;
> +
> +	if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_HAVE_RX_TIMESTAMP)) {
> +		/* return true; */
> +		insn[0] = BPF_MOV64_IMM(BPF_REG_0, 1);
> +		return 1;
> +	} else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) {
> +		/* r1 = ((struct xdp_buff *)r1)->priv; [skb] */
> +		insn[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1,
> +				      offsetof(struct xdp_buff, priv));
> +		/* if (r1 == NULL) { */
> +		insn[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1);
> +		/*	return 0; */
> +		insn[2] = BPF_MOV64_IMM(BPF_REG_0, 0);
> +		/* } else { */
> +		/*	return ((struct sk_buff *)r1)->tstamp; */
> +		insn[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
> +				      offsetof(struct sk_buff, tstamp));

Just to be clear, this skb->tstamp is a software timestamp, right?

> +		/* } */
> +		return 4;
> +	}

I'm slightly concerned with driver developers maintaining BPF bytecode
on a per-driver basis, but I can certainly live with this if the BPF
maintainers can.

> +
> +	return 0;
> +}
> +
>   static const struct net_device_ops veth_netdev_ops = {
>   	.ndo_init            = veth_dev_init,
>   	.ndo_open            = veth_open,
> @@ -1614,6 +1644,7 @@ static const struct net_device_ops veth_netdev_ops = {
>   	.ndo_bpf		= veth_xdp,
>   	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
>   	.ndo_get_peer_dev	= veth_peer_dev,
> +	.ndo_unroll_kfunc       = veth_unroll_kfunc,
>   };
>   
>   #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
Stanislav Fomichev Oct. 28, 2022, 6:46 p.m. UTC | #2
On Fri, Oct 28, 2022 at 1:40 AM Jesper Dangaard Brouer
<jbrouer@redhat.com> wrote:
>
>
> On 27/10/2022 22.00, Stanislav Fomichev wrote:
> > xskxceiver conveniently sets up veth pairs, so it seems logical
> > to use veth as an example for some of the metadata handling.
> >
> > We timestamp the skb right when we "receive" it, store its
> > pointer in xdp_buff->priv, and generate BPF bytecode to
> > reach it from the BPF program.
> >
> > This largely follows the idea of "store some queue context in
> > the xdp_buff/xdp_frame so the metadata can be reached
> > from the BPF program".
> >
> > Cc: Martin KaFai Lau <martin.lau@linux.dev>
> > Cc: Jakub Kicinski <kuba@kernel.org>
> > Cc: Willem de Bruijn <willemb@google.com>
> > Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> > Cc: Anatoly Burakov <anatoly.burakov@intel.com>
> > Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
> > Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
> > Cc: Maryam Tahhan <mtahhan@redhat.com>
> > Cc: xdp-hints@xdp-project.net
> > Cc: netdev@vger.kernel.org
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >   drivers/net/veth.c | 31 +++++++++++++++++++++++++++++++
> >   1 file changed, 31 insertions(+)
> >
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> > index 09682ea3354e..35396dd73de0 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -597,6 +597,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
> >
> >               xdp_convert_frame_to_buff(frame, &xdp);
> >               xdp.rxq = &rq->xdp_rxq;
> > +             xdp.priv = NULL;
>
> So, why isn't this supported for normal XDP mode?!?
> E.g. where veth receives an XDP-redirected xdp_frame.

I wanted to have something simple for demonstration purposes
(hence reusing xskxceiver + veth without redirection).
But also see my cover letter:

Cons:
- forwarding has to be handled explicitly; the BPF programs have to
  agree on the metadata layout (IOW, the forwarding program
  has to be aware of the final AF_XDP consumer metadata layout)

> My main use case (for veth) is to make NIC hardware hints available to
> containers.  Thus, creating a flexible fast-path via XDP-redirect
> directly into a container's veth device.  (This is e.g. for replacing
> the inflexible SR-IOV approach of placing SR-IOV net_devices in the
> container with a more cloud-friendly approach.)
>
> How can we extend this approach to handle xdp_frames from different
> net_devices?

So for this case, your forwarding program will have to call a bunch
of kfuncs and assemble the metadata itself.
It can also put some info about the metadata format in there. In theory,
it can even put an external BTF id for the struct that describes the
layout, or it can use some TLV format.
And then the final consumer will have to decide what to do with that
metadata.

Or do you want the xdp->skb conversion to also be transparently handled?
In that case, the last program will have to convert this to some new
xdp_hints_skb layout the kernel can understand. We might need some
extra helpers to signal that, but it seems doable?
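
To make the "agree on a layout" point concrete, here is a hypothetical
header the forwarding program could prepend to the metadata area
(xdp->data_meta). Nothing in this series defines it; it is purely an
illustration of the BTF-id/TLV idea:

/* Hypothetical, not part of this series: written by the forwarding
 * program, parsed by the final AF_XDP consumer.
 */
struct xdp_hints_meta {
	__u32 btf_id;		/* or a TLV 'type' agreed on out of band */
	__u32 len;		/* bytes of payload that follow */
	__u64 rx_timestamp;	/* payload: assembled via the rx kfuncs */
};

The consumer would check btf_id (or the TLV type) before trusting the
payload, which is what lets frames from different net_devices coexist.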

> >
> >               act = bpf_prog_run_xdp(xdp_prog, &xdp);
> >
> > @@ -820,6 +821,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> >
> >       orig_data = xdp.data;
> >       orig_data_end = xdp.data_end;
> > +     xdp.priv = skb;
> >
>
> So, enabling SKB based path only.
>
> >       act = bpf_prog_run_xdp(xdp_prog, &xdp);
> >
> > @@ -936,6 +938,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> >                       struct sk_buff *skb = ptr;
> >
> >                       stats->xdp_bytes += skb->len;
> > +                     __net_timestamp(skb);
> >                       skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
> >                       if (skb) {
> >                               if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
> > @@ -1595,6 +1598,33 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> >       }
> >   }
> >
> > +static int veth_unroll_kfunc(struct bpf_prog *prog, struct bpf_insn *insn)
> > +{
> > +     u32 func_id = insn->imm;
> > +
> > +     if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_HAVE_RX_TIMESTAMP)) {
> > +             /* return true; */
> > +             insn[0] = BPF_MOV64_IMM(BPF_REG_0, 1);
> > +             return 1;
> > +     } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) {
> > +             /* r1 = ((struct xdp_buff *)r1)->priv; [skb] */
> > +             insn[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1,
> > +                                   offsetof(struct xdp_buff, priv));
> > +             /* if (r1 == NULL) { */
> > +             insn[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1);
> > +             /*      return 0; */
> > +             insn[2] = BPF_MOV64_IMM(BPF_REG_0, 0);
> > +             /* } else { */
> > +             /*      return ((struct sk_buff *)r1)->tstamp; */
> > +             insn[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
> > +                                   offsetof(struct sk_buff, tstamp));
>
> Just to be clear, this skb->tstamp is a software timestamp, right?

Yes, it's the software timestamp set by __net_timestamp above. This is
just to showcase how the bpf/af_xdp side will look. The 1st patch and
the last one are the interesting ones; the rest is boring plumbing we
can ignore for now.




> > +             /* } */
> > +             return 4;
> > +     }
>
> I'm slightly concerned with driver developers maintaining BPF bytecode
> on a per-driver basis, but I can certainly live with this if the BPF
> maintainers can.
>
> > +
> > +     return 0;
> > +}
> > +
> >   static const struct net_device_ops veth_netdev_ops = {
> >       .ndo_init            = veth_dev_init,
> >       .ndo_open            = veth_open,
> > @@ -1614,6 +1644,7 @@ static const struct net_device_ops veth_netdev_ops = {
> >       .ndo_bpf                = veth_xdp,
> >       .ndo_xdp_xmit           = veth_ndo_xdp_xmit,
> >       .ndo_get_peer_dev       = veth_peer_dev,
> > +     .ndo_unroll_kfunc       = veth_unroll_kfunc,
> >   };
> >
> >   #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
>

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 09682ea3354e..35396dd73de0 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -597,6 +597,7 @@  static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
 
 		xdp_convert_frame_to_buff(frame, &xdp);
 		xdp.rxq = &rq->xdp_rxq;
+		xdp.priv = NULL;
 
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
@@ -820,6 +821,7 @@  static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 
 	orig_data = xdp.data;
 	orig_data_end = xdp.data_end;
+	xdp.priv = skb;
 
 	act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
@@ -936,6 +938,7 @@  static int veth_xdp_rcv(struct veth_rq *rq, int budget,
 			struct sk_buff *skb = ptr;
 
 			stats->xdp_bytes += skb->len;
+			__net_timestamp(skb);
 			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
 			if (skb) {
 				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
@@ -1595,6 +1598,33 @@  static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static int veth_unroll_kfunc(struct bpf_prog *prog, struct bpf_insn *insn)
+{
+	u32 func_id = insn->imm;
+
+	if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_HAVE_RX_TIMESTAMP)) {
+		/* return true; */
+		insn[0] = BPF_MOV64_IMM(BPF_REG_0, 1);
+		return 1;
+	} else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) {
+		/* r1 = ((struct xdp_buff *)r1)->priv; [skb] */
+		insn[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1,
+				      offsetof(struct xdp_buff, priv));
+		/* if (r1 == NULL) { */
+		insn[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1);
+		/*	return 0; */
+		insn[2] = BPF_MOV64_IMM(BPF_REG_0, 0);
+		/* } else { */
+		/*	return ((struct sk_buff *)r1)->tstamp; */
+		insn[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+				      offsetof(struct sk_buff, tstamp));
+		/* } */
+		return 4;
+	}
+
+	return 0;
+}
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init            = veth_dev_init,
 	.ndo_open            = veth_open,
@@ -1614,6 +1644,7 @@  static const struct net_device_ops veth_netdev_ops = {
 	.ndo_bpf		= veth_xdp,
 	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
 	.ndo_get_peer_dev	= veth_peer_dev,
+	.ndo_unroll_kfunc       = veth_unroll_kfunc,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
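
For readers less fluent in raw BPF instructions, the unrolled
XDP_METADATA_KFUNC_RX_TIMESTAMP sequence in veth_unroll_kfunc() above is
equivalent to the following C (a sketch; the function name is chosen here
for illustration, r1 holds the xdp_buff pointer on entry and r0 carries
the return value):

/* What the four emitted instructions compute, expressed as C. */
static u64 veth_rx_timestamp_unrolled(struct xdp_buff *xdp)
{
	struct sk_buff *skb = xdp->priv;

	if (!skb)
		return 0;	/* no skb attached (e.g. the xdp_frame path) */

	/* Software timestamp set by __net_timestamp() in veth_xdp_rcv() */
	return (u64)skb->tstamp;
}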