
[RFC,bpf-next,v2,11/11] net/mlx5e: Support TX timestamp metadata

Message ID 20230621170244.1283336-12-sdf@google.com (mailing list archive)
State RFC
Delegated to: BPF
Series bpf: Netdev TX metadata

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-next-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-5 fail Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-8 success Logs for veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 13 this patch: 15
netdev/cc_maintainers warning 11 maintainers not CCed: kuba@kernel.org hawk@kernel.org saeedm@nvidia.com tariqt@nvidia.com davem@davemloft.net pabeni@redhat.com leon@kernel.org maxtram95@gmail.com dtatulea@nvidia.com edumazet@google.com linux-rdma@vger.kernel.org
netdev/build_clang fail Errors and warnings before: 13 this patch: 15
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 13 this patch: 15
netdev/checkpatch warning WARNING: line length of 83 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Stanislav Fomichev June 21, 2023, 5:02 p.m. UTC
WIP, not tested, only to show the overall idea.
Non-AF_XDP paths are marked with 'false' for now.

Cc: netdev@vger.kernel.org
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 11 +++
 .../net/ethernet/mellanox/mlx5/core/en/xdp.c  | 96 ++++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/en/xdp.h  |  9 +-
 .../ethernet/mellanox/mlx5/core/en/xsk/tx.c   |  3 +
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 16 ++++
 .../net/ethernet/mellanox/mlx5/core/main.c    | 26 ++++-
 6 files changed, 156 insertions(+), 5 deletions(-)

Comments

Alexei Starovoitov June 22, 2023, 7:57 p.m. UTC | #1
On Wed, Jun 21, 2023 at 10:02:44AM -0700, Stanislav Fomichev wrote:
> WIP, not tested, only to show the overall idea.
> Non-AF_XDP paths are marked with 'false' for now.
> 
> Cc: netdev@vger.kernel.org
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 11 +++
>  .../net/ethernet/mellanox/mlx5/core/en/xdp.c  | 96 ++++++++++++++++++-
>  .../net/ethernet/mellanox/mlx5/core/en/xdp.h  |  9 +-
>  .../ethernet/mellanox/mlx5/core/en/xsk/tx.c   |  3 +
>  .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 16 ++++
>  .../net/ethernet/mellanox/mlx5/core/main.c    | 26 ++++-
>  6 files changed, 156 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> index 879d698b6119..e4509464e0b1 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> @@ -6,6 +6,7 @@
>  
>  #include "en.h"
>  #include <linux/indirect_call_wrapper.h>
> +#include <net/devtx.h>
>  
>  #define MLX5E_TX_WQE_EMPTY_DS_COUNT (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS)
>  
> @@ -506,4 +507,14 @@ static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int
>  
>  	return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
>  }
> +
> +struct mlx5e_devtx_frame {
> +	struct devtx_frame frame;
> +	struct mlx5_cqe64 *cqe; /* tx completion */

cqe is only valid at completion.

> +	struct mlx5e_tx_wqe *wqe; /* tx */

wqe is only valid at submission.

imo that's a very clear sign that this is not a generic datastructure.
The code is trying hard to make 'frame' part of it look common,
but it won't help bpf prog to be 'generic'.
It is still going to be precisely coded for completion vs submission.
Similarly a bpf prog for completion in veth will be different than bpf prog for completion in mlx5.
As I stated earlier this 'generalization' and 'common' datastructure only adds code complexity.
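
To illustrate (hypothetical, not what this patch does): each stage
would carry only the state that is valid there, e.g.:

struct mlx5e_devtx_submit_ctx {
	struct net_device *netdev;
	struct mlx5e_tx_wqe *wqe;	/* always valid at submission */
};

struct mlx5e_devtx_complete_ctx {
	struct net_device *netdev;
	struct mlx5_cqe64 *cqe;		/* always valid at completion */
};

and the submit/complete bpf progs would be written against these two
different layouts anyway.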
Stanislav Fomichev June 22, 2023, 8:13 p.m. UTC | #2
On Thu, Jun 22, 2023 at 12:58 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Jun 21, 2023 at 10:02:44AM -0700, Stanislav Fomichev wrote:
> > WIP, not tested, only to show the overall idea.
> > Non-AF_XDP paths are marked with 'false' for now.
> > [...]
> > +
> > +struct mlx5e_devtx_frame {
> > +     struct devtx_frame frame;
> > +     struct mlx5_cqe64 *cqe; /* tx completion */
>
> cqe is only valid at completion.
>
> > +     struct mlx5e_tx_wqe *wqe; /* tx */
>
> wqe is only valid at submission.
>
> imo that's a very clear sign that this is not a generic datastructure.
> The code is trying hard to make 'frame' part of it look common,
> but it won't help bpf prog to be 'generic'.
> It is still going to be precisely coded for completion vs submission.
> Similarly a bpf prog for completion in veth will be different than bpf prog for completion in mlx5.
> As I stated earlier this 'generalization' and 'common' datastructure only adds code complexity.

The reason I went with this abstract context is to allow the programs
to be attached to the different devices.
For example, the xdp_hw_metadata we currently have is not really tied
down to the particular implementation.
If every hook declaration looks different, it seems impossible to
create portable programs.

The frame part is not really needed, we can probably rename it to ctx
and pass data/frags over the arguments?

struct devtx_ctx {
  struct net_device *netdev;
  /* the devices will be able to create wrappers to stash descriptor pointers */
};
void veth_devtx_submit(struct devtx_ctx *ctx, void *data, u16 len, u8 meta_len, struct skb_shared_info *sinfo);

But striving to have a similar hook declaration seems useful for
program portability's sake?
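
For mlx5 the wrapper could be something like (hypothetical, just to make
the stashing idea concrete):

struct mlx5e_devtx_ctx {
	struct devtx_ctx ctx;		/* common part, embedded first */
	union {
		struct mlx5e_tx_wqe *wqe;	/* submission */
		struct mlx5_cqe64 *cqe;		/* completion */
	};
};

and the driver-side kfunc implementations would get it back via
container_of(ctx, struct mlx5e_devtx_ctx, ctx).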
Alexei Starovoitov June 22, 2023, 9:47 p.m. UTC | #3
On Thu, Jun 22, 2023 at 1:13 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> On Thu, Jun 22, 2023 at 12:58 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Jun 21, 2023 at 10:02:44AM -0700, Stanislav Fomichev wrote:
> > > WIP, not tested, only to show the overall idea.
> > > Non-AF_XDP paths are marked with 'false' for now.
> > > [...]
> > > +
> > > +struct mlx5e_devtx_frame {
> > > +     struct devtx_frame frame;
> > > +     struct mlx5_cqe64 *cqe; /* tx completion */
> >
> > cqe is only valid at completion.
> >
> > > +     struct mlx5e_tx_wqe *wqe; /* tx */
> >
> > wqe is only valid at submission.
> >
> > imo that's a very clear sign that this is not a generic datastructure.
> > The code is trying hard to make 'frame' part of it look common,
> > but it won't help bpf prog to be 'generic'.
> > It is still going to be precisely coded for completion vs submission.
> > Similarly a bpf prog for completion in veth will be different than bpf prog for completion in mlx5.
> > As I stated earlier this 'generalization' and 'common' datastructure only adds code complexity.
>
> The reason I went with this abstract context is to allow the programs
> to be attached to the different devices.
> For example, the xdp_hw_metadata we currently have is not really tied
> down to the particular implementation.
> If every hook declaration looks different, it seems impossible to
> create portable programs.
>
> The frame part is not really needed, we can probably rename it to ctx
> and pass data/frags over the arguments?
>
> struct devtx_ctx {
>   struct net_device *netdev;
>   /* the devices will be able to create wrappers to stash descriptor pointers */
> };
> void veth_devtx_submit(struct devtx_ctx *ctx, void *data, u16 len, u8 meta_len, struct skb_shared_info *sinfo);
>
> But striving to have a similar hook declaration seems useful for
> program portability's sake?

portability across what ?
'timestamp' on veth doesn't have a real use. It's testing only.
Even testing is a bit dubious.
I can see a need for bpf prog to run in the datacenter on mlx, brcm
and whatever other nics, but they will have completely different
hw descriptors. timestamp kfuncs to request/read can be common,
but to read the descriptors bpf prog authors would need to write
different code anyway.
So kernel code going out of its way to present somewhat common devtx_ctx
just doesn't help. It adds code to the kernel, but bpf prog still
has to be tailored for mlx and brcm differently.
Stanislav Fomichev June 22, 2023, 10:13 p.m. UTC | #4
On Thu, Jun 22, 2023 at 2:47 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Jun 22, 2023 at 1:13 PM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > On Thu, Jun 22, 2023 at 12:58 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Wed, Jun 21, 2023 at 10:02:44AM -0700, Stanislav Fomichev wrote:
> > > > WIP, not tested, only to show the overall idea.
> > > > Non-AF_XDP paths are marked with 'false' for now.
> > > >
> > > > Cc: netdev@vger.kernel.org
> > > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > > > ---
> > > >  .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 11 +++
> > > >  .../net/ethernet/mellanox/mlx5/core/en/xdp.c  | 96 ++++++++++++++++++-
> > > >  .../net/ethernet/mellanox/mlx5/core/en/xdp.h  |  9 +-
> > > >  .../ethernet/mellanox/mlx5/core/en/xsk/tx.c   |  3 +
> > > >  .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 16 ++++
> > > >  .../net/ethernet/mellanox/mlx5/core/main.c    | 26 ++++-
> > > >  6 files changed, 156 insertions(+), 5 deletions(-)
> > > >
> > > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > index 879d698b6119..e4509464e0b1 100644
> > > > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > @@ -6,6 +6,7 @@
> > > >
> > > >  #include "en.h"
> > > >  #include <linux/indirect_call_wrapper.h>
> > > > +#include <net/devtx.h>
> > > >
> > > >  #define MLX5E_TX_WQE_EMPTY_DS_COUNT (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS)
> > > >
> > > > @@ -506,4 +507,14 @@ static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int
> > > >
> > > >       return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
> > > >  }
> > > > +
> > > > +struct mlx5e_devtx_frame {
> > > > +     struct devtx_frame frame;
> > > > +     struct mlx5_cqe64 *cqe; /* tx completion */
> > >
> > > cqe is only valid at completion.
> > >
> > > > +     struct mlx5e_tx_wqe *wqe; /* tx */
> > >
> > > wqe is only valid at submission.
> > >
> > > imo that's a very clear sign that this is not a generic datastructure.
> > > The code is trying hard to make 'frame' part of it look common,
> > > but it won't help bpf prog to be 'generic'.
> > > It is still going to be precisely coded for completion vs submission.
> > > Similarly a bpf prog for completion in veth will be different than bpf prog for completion in mlx5.
> > > As I stated earlier this 'generalization' and 'common' datastructure only adds code complexity.
> >
> > The reason I went with this abstract context is to allow the programs
> > to be attached to the different devices.
> > For example, the xdp_hw_metadata we currently have is not really tied
> > down to the particular implementation.
> > If every hook declaration looks different, it seems impossible to
> > create portable programs.
> >
> > The frame part is not really needed, we can probably rename it to ctx
> > and pass data/frags over the arguments?
> >
> > struct devtx_ctx {
> >   struct net_device *netdev;
> >   /* the devices will be able to create wrappers to stash descriptor pointers */
> > };
> > void veth_devtx_submit(struct devtx_ctx *ctx, void *data, u16 len, u8 meta_len, struct skb_shared_info *sinfo);
> >
> > But striving to have a similar hook declaration seems useful for
> > program portability's sake?
>
> portability across what ?
> 'timestamp' on veth doesn't have a real use. It's testing only.
> Even testing is a bit dubious.
> I can see a need for bpf prog to run in the datacenter on mlx, brcm
> and whatever other nics, but they will have completely different
> hw descriptors. timestamp kfuncs to request/read can be common,
> but to read the descriptors bpf prog authors would need to write
> different code anyway.
> So kernel code going out of its way to present somewhat common devtx_ctx
> just doesn't help. It adds code to the kernel, but bpf prog still
> has to be tailored for mlx and brcm differently.

Isn't it the same discussion/arguments we had during the RX series?
We want to provide common sane interfaces/abstractions via kfuncs.
That will make most BPF programs portable from mlx to brcm (for
example) without doing a rewrite.
We're also exposing raw (readonly) descriptors (via that get_ctx
helper) to the users who know what to do with them.
Most users don't know what to do with raw descriptors; the specs are
not public; things can change depending on fw version/etc/etc.
So the progs that touch raw descriptors are not the primary use-case.
(that was the tl;dr for rx part, seems like it applies here?)

Let's maybe discuss that mlx5 example? Are you proposing to do
something along these lines?

void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);

If yes, I'm missing how we define the common kfuncs in this case. The
kfuncs need to have some common context. We're defining them with:
bpf_devtx_<kfunc>(const struct devtx_frame *ctx);
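
i.e. something like this (sketch following that pattern; the names are
illustrative, not final):

/* common prototypes; each driver supplies the actual implementation */
int bpf_devtx_request_timestamp(const struct devtx_frame *ctx);
int bpf_devtx_tx_timestamp(const struct devtx_frame *ctx, __u64 *timestamp);

With per-driver hooks like mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe)
there is no common first argument to hang such kfuncs off of.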
Alexei Starovoitov June 23, 2023, 2:35 a.m. UTC | #5
On Thu, Jun 22, 2023 at 3:13 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> On Thu, Jun 22, 2023 at 2:47 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Thu, Jun 22, 2023 at 1:13 PM Stanislav Fomichev <sdf@google.com> wrote:
> > >
> > > On Thu, Jun 22, 2023 at 12:58 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Wed, Jun 21, 2023 at 10:02:44AM -0700, Stanislav Fomichev wrote:
> > > > > WIP, not tested, only to show the overall idea.
> > > > > Non-AF_XDP paths are marked with 'false' for now.
> > > > >
> > > > > Cc: netdev@vger.kernel.org
> > > > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > > > > ---
> > > > >  .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 11 +++
> > > > >  .../net/ethernet/mellanox/mlx5/core/en/xdp.c  | 96 ++++++++++++++++++-
> > > > >  .../net/ethernet/mellanox/mlx5/core/en/xdp.h  |  9 +-
> > > > >  .../ethernet/mellanox/mlx5/core/en/xsk/tx.c   |  3 +
> > > > >  .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 16 ++++
> > > > >  .../net/ethernet/mellanox/mlx5/core/main.c    | 26 ++++-
> > > > >  6 files changed, 156 insertions(+), 5 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > > index 879d698b6119..e4509464e0b1 100644
> > > > > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > > @@ -6,6 +6,7 @@
> > > > >
> > > > >  #include "en.h"
> > > > >  #include <linux/indirect_call_wrapper.h>
> > > > > +#include <net/devtx.h>
> > > > >
> > > > >  #define MLX5E_TX_WQE_EMPTY_DS_COUNT (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS)
> > > > >
> > > > > @@ -506,4 +507,14 @@ static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int
> > > > >
> > > > >       return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
> > > > >  }
> > > > > +
> > > > > +struct mlx5e_devtx_frame {
> > > > > +     struct devtx_frame frame;
> > > > > +     struct mlx5_cqe64 *cqe; /* tx completion */
> > > >
> > > > cqe is only valid at completion.
> > > >
> > > > > +     struct mlx5e_tx_wqe *wqe; /* tx */
> > > >
> > > > wqe is only valid at submission.
> > > >
> > > > imo that's a very clear sign that this is not a generic datastructure.
> > > > The code is trying hard to make 'frame' part of it look common,
> > > > but it won't help bpf prog to be 'generic'.
> > > > It is still going to be precisely coded for completion vs submission.
> > > > Similarly a bpf prog for completion in veth will be different than bpf prog for completion in mlx5.
> > > > As I stated earlier this 'generalization' and 'common' datastructure only adds code complexity.
> > >
> > > The reason I went with this abstract context is to allow the programs
> > > to be attached to the different devices.
> > > For example, the xdp_hw_metadata we currently have is not really tied
> > > down to the particular implementation.
> > > If every hook declaration looks different, it seems impossible to
> > > create portable programs.
> > >
> > > The frame part is not really needed, we can probably rename it to ctx
> > > and pass data/frags over the arguments?
> > >
> > > struct devtx_ctx {
> > >   struct net_device *netdev;
> > >   /* the devices will be able to create wrappers to stash descriptor pointers */
> > > };
> > > void veth_devtx_submit(struct devtx_ctx *ctx, void *data, u16 len, u8 meta_len, struct skb_shared_info *sinfo);
> > >
> > > But striving to have a similar hook declaration seems useful for
> > > program portability's sake?
> >
> > portability across what ?
> > 'timestamp' on veth doesn't have a real use. It's testing only.
> > Even testing is a bit dubious.
> > I can see a need for bpf prog to run in the datacenter on mlx, brcm
> > and whatever other nics, but they will have completely different
> > hw descriptors. timestamp kfuncs to request/read can be common,
> > but to read the descriptors bpf prog authors would need to write
> > different code anyway.
> > So kernel code going out of its way to present somewhat common devtx_ctx
> > just doesn't help. It adds code to the kernel, but bpf prog still
> > has to be tailored for mlx and brcm differently.
>
> Isn't it the same discussion/arguments we had during the RX series?

Right, but there we already have xdp_md as an abstraction.
Extra kfuncs don't change that.
Here is the whole new 'ctx' being proposed with the assumption that
it will be shared between completion and submission and will be
useful in both.

But there is skb at submission time and no skb at completion.
xdp_frame is there, but it's the last record of what was sent on the wire.
Parsing it with bpf is like examining steps in the sand. They are gone.
Parsing at submission makes sense, not at completion
and the driver has a way to associate wqe with cqe.
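
(e.g. in mlx5 the completion entry carries the wqe counter, so the
driver can already do, roughly:

	u16 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, be16_to_cpu(cqe->wqe_counter));

to get back to the submission-side record; no shared devtx struct is
needed for that association.)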

> We want to provide common sane interfaces/abstractions via kfuncs.
> That will make most BPF programs portable from mlx to brcm (for
> example) without doing a rewrite.
> We're also exposing raw (readonly) descriptors (via that get_ctx
> helper) to the users who know what to do with them.
> Most users don't know what to do with raw descriptors;

Why do you think so?
Who are those users?
I see your proposal and thumbs up from onlookers.
afaict there are zero users for rx side hw hints too.

> the specs are
> not public; things can change depending on fw version/etc/etc.
> So the progs that touch raw descriptors are not the primary use-case.
> (that was the tl;dr for rx part, seems like it applies here?)
>
> Let's maybe discuss that mlx5 example? Are you proposing to do
> something along these lines?
>
> void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
> void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);
>
> If yes, I'm missing how we define the common kfuncs in this case. The
> kfuncs need to have some common context. We're defining them with:
> bpf_devtx_<kfunc>(const struct devtx_frame *ctx);

I'm looking at xdp_metadata and wondering who's using it.
I haven't seen a single bug report.
No bugs means no one is using it. There is zero chance that we managed
to implement it bug-free on the first try.
So new tx side things look like feature creep to me.
rx side is far from proven to be useful for anything.
Yet you want to add new things.
Maryam Tahhan June 23, 2023, 10:16 a.m. UTC | #6
On 23/06/2023 03:35, Alexei Starovoitov wrote:
> Why do you think so?
> Who are those users?
> I see your proposal and thumbs up from onlookers.
> afaict there are zero users for rx side hw hints too.
>
>> the specs are
>> not public; things can change depending on fw version/etc/etc.
>> So the progs that touch raw descriptors are not the primary use-case.
>> (that was the tl;dr for rx part, seems like it applies here?)
>>
>> Let's maybe discuss that mlx5 example? Are you proposing to do
>> something along these lines?
>>
>> void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
>> void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);
>>
>> If yes, I'm missing how we define the common kfuncs in this case. The
>> kfuncs need to have some common context. We're defining them with:
>> bpf_devtx_<kfunc>(const struct devtx_frame *ctx);
> I'm looking at xdp_metadata and wondering who's using it.
> I haven't seen a single bug report.
> No bugs means no one is using it. There is zero chance that we managed
> to implement it bug-free on the first try.
> So new tx side things look like feature creep to me.
> rx side is far from proven to be useful for anything.
> Yet you want to add new things.
>

Hi folks

We in CNDP (https://github.com/CloudNativeDataPlane/cndp) have been 
looking to use xdp_metadata to relay receive side offloads from the NIC 
to our AF_XDP applications. We see this as a key feature that is 
essential for the viability of AF_XDP in the real world. We would love 
to see something adopted for the TX side alongside what's on the RX 
side. We don't want to waste cycles doing everything in software when the 
NIC HW supports many features that we need.

Thank you
Maryam
Alexei Starovoitov June 23, 2023, 4:32 p.m. UTC | #7
On Fri, Jun 23, 2023 at 3:16 AM Maryam Tahhan <mtahhan@redhat.com> wrote:
>
> On 23/06/2023 03:35, Alexei Starovoitov wrote:
> > Why do you think so?
> > Who are those users?
> > I see your proposal and thumbs up from onlookers.
> > afaict there are zero users for rx side hw hints too.
> >
> >> the specs are
> >> not public; things can change depending on fw version/etc/etc.
> >> So the progs that touch raw descriptors are not the primary use-case.
> >> (that was the tl;dr for rx part, seems like it applies here?)
> >>
> >> Let's maybe discuss that mlx5 example? Are you proposing to do
> >> something along these lines?
> >>
> >> void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
> >> void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);
> >>
> >> If yes, I'm missing how we define the common kfuncs in this case. The
> >> kfuncs need to have some common context. We're defining them with:
> >> bpf_devtx_<kfunc>(const struct devtx_frame *ctx);
> > I'm looking at xdp_metadata and wondering who's using it.
> > I haven't seen a single bug report.
> > No bugs means no one is using it. There is zero chance that we managed
> > to implement it bug-free on the first try.
> > So new tx side things look like feature creep to me.
> > rx side is far from proven to be useful for anything.
> > Yet you want to add new things.
> >
>
> Hi folks
>
> We in CNDP (https://github.com/CloudNativeDataPlane/cndp) have been

with TCP stack in user space over af_xdp...

> looking to use xdp_metadata to relay receive side offloads from the NIC
> to our AF_XDP applications. We see this as a key feature that is
> essential for the viability of AF_XDP in the real world. We would love
> to see something adopted for the TX side alongside what's on the RX
> side. We don't want to waste cycles doing everything in software when the
> NIC HW supports many features that we need.

Please specify "many features". If that means HW TSO to accelerate
your TCP in user space, then sorry, but no.
Stanislav Fomichev June 23, 2023, 5:24 p.m. UTC | #8
On Thu, Jun 22, 2023 at 7:36 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Jun 22, 2023 at 3:13 PM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > On Thu, Jun 22, 2023 at 2:47 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Thu, Jun 22, 2023 at 1:13 PM Stanislav Fomichev <sdf@google.com> wrote:
> > > >
> > > > On Thu, Jun 22, 2023 at 12:58 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Wed, Jun 21, 2023 at 10:02:44AM -0700, Stanislav Fomichev wrote:
> > > > > > WIP, not tested, only to show the overall idea.
> > > > > > Non-AF_XDP paths are marked with 'false' for now.
> > > > > >
> > > > > > Cc: netdev@vger.kernel.org
> > > > > > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > > > > > ---
> > > > > >  .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 11 +++
> > > > > >  .../net/ethernet/mellanox/mlx5/core/en/xdp.c  | 96 ++++++++++++++++++-
> > > > > >  .../net/ethernet/mellanox/mlx5/core/en/xdp.h  |  9 +-
> > > > > >  .../ethernet/mellanox/mlx5/core/en/xsk/tx.c   |  3 +
> > > > > >  .../net/ethernet/mellanox/mlx5/core/en_tx.c   | 16 ++++
> > > > > >  .../net/ethernet/mellanox/mlx5/core/main.c    | 26 ++++-
> > > > > >  6 files changed, 156 insertions(+), 5 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > > > index 879d698b6119..e4509464e0b1 100644
> > > > > > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > > > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
> > > > > > @@ -6,6 +6,7 @@
> > > > > >
> > > > > >  #include "en.h"
> > > > > >  #include <linux/indirect_call_wrapper.h>
> > > > > > +#include <net/devtx.h>
> > > > > >
> > > > > >  #define MLX5E_TX_WQE_EMPTY_DS_COUNT (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS)
> > > > > >
> > > > > > @@ -506,4 +507,14 @@ static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int
> > > > > >
> > > > > >       return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
> > > > > >  }
> > > > > > +
> > > > > > +struct mlx5e_devtx_frame {
> > > > > > +     struct devtx_frame frame;
> > > > > > +     struct mlx5_cqe64 *cqe; /* tx completion */
> > > > >
> > > > > cqe is only valid at completion.
> > > > >
> > > > > > +     struct mlx5e_tx_wqe *wqe; /* tx */
> > > > >
> > > > > wqe is only valid at submission.
> > > > >
> > > > > imo that's a very clear sign that this is not a generic datastructure.
> > > > > The code is trying hard to make 'frame' part of it look common,
> > > > > but it won't help bpf prog to be 'generic'.
> > > > > It is still going to be precisely coded for completion vs submission.
> > > > > Similarly a bpf prog for completion in veth will be different than bpf prog for completion in mlx5.
> > > > > As I stated earlier this 'generalization' and 'common' datastructure only adds code complexity.
> > > >
> > > > The reason I went with this abstract context is to allow the programs
> > > > to be attached to the different devices.
> > > > For example, the xdp_hw_metadata we currently have is not really tied
> > > > down to the particular implementation.
> > > > If every hook declaration looks different, it seems impossible to
> > > > create portable programs.
> > > >
> > > > The frame part is not really needed, we can probably rename it to ctx
> > > > and pass data/frags over the arguments?
> > > >
> > > > struct devtx_ctx {
> > > >   struct net_device *netdev;
> > > >   /* the devices will be able to create wrappers to stash descriptor pointers */
> > > > };
> > > > void veth_devtx_submit(struct devtx_ctx *ctx, void *data, u16 len, u8 meta_len, struct skb_shared_info *sinfo);
> > > >
> > > > But striving to have a similar hook declaration seems useful for
> > > > program portability's sake?
> > >
> > > portability across what ?
> > > 'timestamp' on veth doesn't have a real use. It's testing only.
> > > Even testing is a bit dubious.
> > > I can see a need for bpf prog to run in the datacenter on mlx, brcm
> > > and whatever other nics, but they will have completely different
> > > hw descriptors. timestamp kfuncs to request/read can be common,
> > > but to read the descriptors bpf prog authors would need to write
> > > different code anyway.
> > > So kernel code going out of its way to present somewhat common devtx_ctx
> > > just doesn't help. It adds code to the kernel, but bpf prog still
> > > has to be tailored for mlx and brcm differently.
> >
> > Isn't it the same discussion/arguments we had during the RX series?
>
> Right, but there we already have xdp_md as an abstraction.
> Extra kfuncs don't change that.
> Here is the whole new 'ctx' being proposed with the assumption that
> it will be shared between completion and submission and will be
> useful in both.
>
> But there is skb at submission time and no skb at completion.
> xdp_frame is there, but it's the last record of what was sent on the wire.
> Parsing it with bpf is like examining steps in the sand. They are gone.
> Parsing at submission makes sense, not at completion
> and the driver has a way to associate wqe with cqe.

Right, and I'm exposing neither skb nor xdp_md/frame, so we're on
the same page?
Or are you suggesting to further split devtx_frame into two contexts?
One for submit and another for complete?
And not expose the payload at completion time?
Having the payload at completion might still be useful though, at least the header.
In case the users want only to inspect completion based on some marker/flow.
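
Something like this, if we do split it (hypothetical):

struct devtx_submit_ctx {
	struct net_device *netdev;
	void *data;			/* payload */
	u16 len;
	struct skb_shared_info *sinfo;	/* frags */
};

struct devtx_complete_ctx {
	struct net_device *netdev;
	/* drivers wrap this to stash the completion descriptor */
};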

> > We want to provide common sane interfaces/abstractions via kfuncs.
> > That will make most BPF programs portable from mlx to brcm (for
> > example) without doing a rewrite.
> > We're also exposing raw (readonly) descriptors (via that get_ctx
> > helper) to the users who know what to do with them.
> > Most users don't know what to do with raw descriptors;
>
> Why do you think so?
> Who are those users?
> I see your proposal and thumbs up from onlookers.
> afaict there are zero users for rx side hw hints too.

My bias comes from the point of view of our internal use-cases where
we'd like to have rx/tx timestamps in a device-agnostic fashion.
I'm happy to incorporate other requirements as I did with exposing raw
descriptors at rx using the get_ctx helper.
Regarding the usage: for the external ones I'm assuming it will take
time until it all percolates through the distros...

> > the specs are
> > not public; things can change depending on fw version/etc/etc.
> > So the progs that touch raw descriptors are not the primary use-case.
> > (that was the tl;dr for rx part, seems like it applies here?)
> >
> > Let's maybe discuss that mlx5 example? Are you proposing to do
> > something along these lines?
> >
> > void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
> > void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);
> >
> > If yes, I'm missing how we define the common kfuncs in this case. The
> > kfuncs need to have some common context. We're defining them with:
> > bpf_devtx_<kfunc>(const struct devtx_frame *ctx);
>
> I'm looking at xdp_metadata and wondering who's using it.
> I haven't seen a single bug report.
> No bugs means no one is using it. There is zero chance that we managed
> to implement it bug-free on the first try.
> So new tx side things look like feature creep to me.
> rx side is far from proven to be useful for anything.
> Yet you want to add new things.

I've been talking about both tx and rx timestamps right from the
beginning, so it's not really a new feature.
But what's the concern here? IIUC, the whole point of it being
kfunc-based is that we can wipe it all if/when it becomes a dead
weight.

Regarding the users, there is also a bit of a chicken and egg problem:
We have some internal interest in using AF_XDP, but it lacks multibuf
(which is in the review) and the offloads (which I'm trying to move
forward for both rx/tx).
Maryam Tahhan June 23, 2023, 5:47 p.m. UTC | #9
On 23/06/2023 17:32, Alexei Starovoitov wrote:
> On Fri, Jun 23, 2023 at 3:16 AM Maryam Tahhan <mtahhan@redhat.com> wrote:
>> On 23/06/2023 03:35, Alexei Starovoitov wrote:
>>> Why do you think so?
>>> Who are those users?
>>> I see your proposal and thumbs up from onlookers.
>>> afaict there are zero users for rx side hw hints too.
>>>
>>>> the specs are
>>>> not public; things can change depending on fw version/etc/etc.
>>>> So the progs that touch raw descriptors are not the primary use-case.
>>>> (that was the tl;dr for rx part, seems like it applies here?)
>>>>
>>>> Let's maybe discuss that mlx5 example? Are you proposing to do
>>>> something along these lines?
>>>>
>>>> void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
>>>> void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);
>>>>
>>>> If yes, I'm missing how we define the common kfuncs in this case. The
>>>> kfuncs need to have some common context. We're defining them with:
>>>> bpf_devtx_<kfunc>(const struct devtx_frame *ctx);
>>> I'm looking at xdp_metadata and wondering who's using it.
>>> I haven't seen a single bug report.
>>> No bugs means no one is using it. There is zero chance that we managed
>>> to implement it bug-free on the first try.
>>> So new tx side things look like feature creep to me.
>>> rx side is far from proven to be useful for anything.
>>> Yet you want to add new things.
>>>
>> Hi folks
>>
>> We in CNDP (https://github.com/CloudNativeDataPlane/cndp) have been
> with TCP stack in user space over af_xdp...
>
>> looking to use xdp_metadata to relay receive side offloads from the NIC
>> to our AF_XDP applications. We see this as a key feature that is
>> essential for the viability of AF_XDP in the real world. We would love
>> to see something adopted for the TX side alongside what's on the RX
>> side. We don't want to waste cycles doing everything in software when the
>> NIC HW supports many features that we need.
> Please specify "many features". If that means HW TSO to accelerate
> your TCP in user space, then sorry, but no.

Our TCP "stack" does NOT work without the kernel, it's a "lightweight 
data plane", the kernel is the control plane you may remember my 
presentation

at FOSDEM 23 in Brussels [1].


We need things as simple as TX check summing and I'm not sure about TSO 
yet (maybe in time). The Hybrid Networking Stack goal is not to compete

with the Kernel but rather provide a new approach to high performance 
Cloud Native networking which uses the Kernel + XDP and AF_XDP. We would

like to show how high performance networking use cases can use the in 
kernel fast path to achieve the performance they are looking for.


You can find more details about what we are trying to do here [2]


[1] https://fosdem.org/2023/schedule/event/hybrid_netstack/

[2] https://next.redhat.com/2022/12/07/the-hybrid-networking-stack/
Donald Hunter June 23, 2023, 6:57 p.m. UTC | #10
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Thu, Jun 22, 2023 at 3:13 PM Stanislav Fomichev <sdf@google.com> wrote:
>>
>> We want to provide common sane interfaces/abstractions via kfuncs.
>> That will make most BPF programs portable from mlx to brcm (for
>> example) without doing a rewrite.
>> We're also exposing raw (readonly) descriptors (via that get_ctx
>> helper) to the users who know what to do with them.
>> Most users don't know what to do with raw descriptors;
>
> Why do you think so?
> Who are those users?
> I see your proposal and thumbs up from onlookers.
> afaict there are zero users for rx side hw hints too.

We have customers in various sectors that want to use rx hw timestamps.

There are several use cases especially in Telco where they use DPDK
today and want to move to AF_XDP but they need to be able to benefit
from the hw capabilities of the NICs they purchase. Not having access to
hw offloads on rx and tx is a barrier to AF_XDP adoption.

The most notable gaps in AF_XDP are checksum offloads and multi buffer
support.

>> the specs are
>> not public; things can change depending on fw version/etc/etc.
>> So the progs that touch raw descriptors are not the primary use-case.
>> (that was the tl;dr for rx part, seems like it applies here?)
>>
>> Let's maybe discuss that mlx5 example? Are you proposing to do
>> something along these lines?
>>
>> void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
>> void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);
>>
>> If yes, I'm missing how we define the common kfuncs in this case. The
>> kfuncs need to have some common context. We're defining them with:
>> bpf_devtx_<kfunc>(const struct devtx_frame *ctx);
>
> I'm looking at xdp_metadata and wondering who's using it.
> I haven't seen a single bug report.
> No bugs means no one is using it. There is zero chance that we managed
> to implement it bug-free on the first try.

Nobody is using xdp_metadata today, not because they don't want to, but
because there was no consensus for how to use it. We have internal POCs
that use xdp_metadata and are still trying to build the foundations
needed to support it consistently across different hardware. Jesper
Brouer proposed a way to describe xdp_metadata with BTF and it was
rejected. The current plan to use kfuncs for xdp hints is the most
recent attempt to find a solution.

> So new tx side things look like feature creep to me.
> rx side is far from proven to be useful for anything.
> Yet you want to add new things.

We have telcos and large enterprises that either use DPDK or use
proprietary solutions for getting traffic to user space. They want to
move to AF_XDP but without at least RX and TX checksum offloads they are
paying a CPU tax for using AF_XDP. One customer is also waiting for
multi-buffer support to land before they can adopt AF_XDP.

So, no it's not feature creep, it's a set of required features to reach
minimum viable product to be able to move out of a lab and replace
legacy in production.
John Fastabend June 24, 2023, 12:25 a.m. UTC | #11
Donald Hunter wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Thu, Jun 22, 2023 at 3:13 PM Stanislav Fomichev <sdf@google.com> wrote:
> >>
> >> We want to provide common sane interfaces/abstractions via kfuncs.
> >> That will make most BPF programs portable from mlx to brcm (for
> >> example) without doing a rewrite.
> >> We're also exposing raw (readonly) descriptors (via that get_ctx
> >> helper) to the users who know what to do with them.
> >> Most users don't know what to do with raw descriptors;
> >
> > Why do you think so?
> > Who are those users?
> > I see your proposal and thumbs up from onlookers.
> > afaict there are zero users for rx side hw hints too.
> 
> We have customers in various sectors that want to use rx hw timestamps.
> 
> There are several use cases especially in Telco where they use DPDK
> today and want to move to AF_XDP but they need to be able to benefit
> from the hw capabilities of the NICs they purchase. Not having access to
> hw offloads on rx and tx is a barrier to AF_XDP adoption.
> 
> The most notable gaps in AF_XDP are checksum offloads and multi buffer
> support.
> 
> >> the specs are
> >> not public; things can change depending on fw version/etc/etc.
> >> So the progs that touch raw descriptors are not the primary use-case.
> >> (that was the tl;dr for rx part, seems like it applies here?)
> >>
> >> Let's maybe discuss that mlx5 example? Are you proposing to do
> >> something along these lines?
> >>
> >> void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
> >> void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);
> >>
> >> If yes, I'm missing how we define the common kfuncs in this case. The
> >> kfuncs need to have some common context. We're defining them with:
> >> bpf_devtx_<kfunc>(const struct devtx_frame *ctx);
> >
> > I'm looking at xdp_metadata and wondering who's using it.
> > I haven't seen a single bug report.
> > No bugs means no one is using it. There is zero chance that we managed
> > to implement it bug-free on the first try.
> 
> Nobody is using xdp_metadata today, not because they don't want to, but
> because there was no consensus for how to use it. We have internal POCs
> that use xdp_metadata and are still trying to build the foundations
> needed to support it consistently across different hardware. Jesper
> Brouer proposed a way to describe xdp_metadata with BTF and it was
> rejected. The current plan to use kfuncs for xdp hints is the most
> recent attempt to find a solution.

The hold-up on my side is getting it in an LTS kernel so we can get it
deployed in real environments. Although my plan is to just cast the
ctx to a kernel ctx and read out the data we need.

> 
> > > So new tx side things look like feature creep to me.
> > rx side is far from proven to be useful for anything.
> > Yet you want to add new things.

From my side, if we just had a hook there and could cast the ctx to
something BTF-type-safe so we can simply read through the descriptor,
I think that would be sufficient for many use cases. Writing into the
descriptor might take more thought; a writeable BTF flag?
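
Roughly this on the read side (sketch; assumes the mlx5e_devtx_frame
layout from this patch, a container_of define as in the selftests, and
a hypothetical attach point name):

SEC("fentry/mlx5e_devtx_complete")
int BPF_PROG(tx_complete, const struct devtx_frame *frame)
{
	struct mlx5e_devtx_frame *f;
	__u64 ts;

	/* cast the generic ctx to the driver's kernel ctx */
	f = container_of(frame, struct mlx5e_devtx_frame, frame);
	/* mlx5_cqe64 carries the raw completion timestamp */
	ts = bpf_be64_to_cpu(BPF_CORE_READ(f, cqe, timestamp));
	bpf_printk("tx completion ts: %llu", ts);
	return 0;
}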

> 
> We have telcos and large enterprises that either use DPDK or use
> proprietary solutions for getting traffic to user space. They want to
> move to AF_XDP but without at least RX and TX checksum offloads they are
> paying a CPU tax for using AF_XDP. One customer is also waiting for
> multi-buffer support to land before they can adopt AF_XDP.
> 
> So, no it's not feature creep, it's a set of required features to reach
> minimum viable product to be able to move out of a lab and replace
> legacy in production.
Alexei Starovoitov June 24, 2023, 2:52 a.m. UTC | #12
On Fri, Jun 23, 2023 at 5:25 PM John Fastabend <john.fastabend@gmail.com> wrote:
>
> Donald Hunter wrote:
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >
> > > On Thu, Jun 22, 2023 at 3:13 PM Stanislav Fomichev <sdf@google.com> wrote:
> > >>
> > >> We want to provide common sane interfaces/abstractions via kfuncs.
> > >> That will make most BPF programs portable from mlx to brcm (for
> > >> example) without doing a rewrite.
> > >> We're also exposing raw (readonly) descriptors (via that get_ctx
> > >> helper) to the users who know what to do with them.
> > >> Most users don't know what to do with raw descriptors;
> > >
> > > Why do you think so?
> > > Who are those users?
> > > I see your proposal and thumbs up from onlookers.
> > > afaict there are zero users for rx side hw hints too.
> >
> > We have customers in various sectors that want to use rx hw timestamps.
> >
> > There are several use cases especially in Telco where they use DPDK
> > today and want to move to AF_XDP but they need to be able to benefit
> > from the hw capabilities of the NICs they purchase. Not having access to
> > hw offloads on rx and tx is a barrier to AF_XDP adoption.
> >
> > The most notable gaps in AF_XDP are checksum offloads and multi buffer
> > support.
> >
> > >> the specs are
> > >> not public; things can change depending on fw version/etc/etc.
> > >> So the progs that touch raw descriptors are not the primary use-case.
> > >> (that was the tl;dr for rx part, seems like it applies here?)
> > >>
> > >> Let's maybe discuss that mlx5 example? Are you proposing to do
> > >> something along these lines?
> > >>
> > >> void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe);
> > >> void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe);
> > >>
> > >> If yes, I'm missing how we define the common kfuncs in this case. The
> > >> kfuncs need to have some common context. We're defining them with:
> > >> bpf_devtx_<kfunc>(const struct devtx_frame *ctx);
> > >
> > > I'm looking at xdp_metadata and wondering who's using it.
> > > I haven't seen a single bug report.
> > > No bugs means no one is using it. There is zero chance that we managed
> > > to implement it bug-free on the first try.
> >
> > Nobody is using xdp_metadata today, not because they don't want to, but
> > because there was no consensus for how to use it. We have internal POCs
> > that use xdp_metadata and are still trying to build the foundations
> > needed to support it consistently across different hardware. Jesper
> > Brouer proposed a way to describe xdp_metadata with BTF and it was
> > rejected. The current plan to use kfuncs for xdp hints is the most
> > recent attempt to find a solution.
>
> The hold-up on my side is getting it in an LTS kernel so we can get it
> deployed in real environments. Although my plan is to just cast the
> ctx to a kernel ctx and read out the data we need.

+1

> >
> > > So new tx side things look like feature creep to me.
> > > rx side is far from proven to be useful for anything.
> > > Yet you want to add new things.
>
> From my side, if we just had a hook there and could cast the ctx to
> something BTF-type-safe so we can simply read through the descriptor,
> I think that would be sufficient for many use cases. Writing into the
> descriptor might take more thought; a writeable BTF flag?

That's pretty much what I'm suggesting.
Add two driver specific __weak nop hook points where necessary
with a few driver specific kfuncs.
Don't build generic infra when it's too early to generalize.

It would mean that bpf progs will be driver specific,
but when something novel like this is being proposed it's better
to start with minimal code change to core kernel (ideally none)
and when common things are found then generalize.
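
Concretely, for mlx5 that would be just (sketch):

__weak noinline void mlx5e_devtx_submit(struct mlx5e_tx_wqe *wqe)
{
	/* empty nop; fentry progs attach here */
}

__weak noinline void mlx5e_devtx_complete(struct mlx5_cqe64 *cqe)
{
}

plus a couple of mlx5-only kfuncs to request/read the timestamp.
No common struct and no core kernel changes.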

Sounds like Stanislav's use case is timestamps in TX
while Donald's are checksums on RX, TX. These use cases are too different.
To make HW TX checksum computation driven by AF_XDP
a lot more needs to be done than what Stan is proposing for timestamps.
Jakub Kicinski June 24, 2023, 9:38 p.m. UTC | #13
On Fri, 23 Jun 2023 19:52:03 -0700 Alexei Starovoitov wrote:
> That's pretty much what I'm suggesting.
> Add two driver specific __weak nop hook points where necessary
> with a few driver specific kfuncs.
> Don't build generic infra when it's too early to generalize.
> 
> It would mean that bpf progs will be driver specific,
> but when something novel like this is being proposed it's better
> to start with minimal code change to core kernel (ideally none)
> and when common things are found then generalize.
> 
> Sounds like Stanislav's use case is timestamps in TX
> while Donald's are checksums on RX, TX. These use cases are too different.
> To make HW TX checksum computation driven by AF_XDP
> a lot more needs to be done than what Stan is proposing for timestamps.

I'd think HW TX csum is actually simpler than dealing with time,
will you change your mind if Stan posts Tx csum within a few days? :)

The set of offloads is barely changing, the lack of clarity 
on what is needed seems overstated. IMHO AF_XDP is getting no use
today, because everything remotely complex was stripped out of 
the implementation to get it merged. Aren't we hand waving the
complexity away simply because we don't want to deal with it?

These are the features today's devices support (rx/tx is a mirror):
 - L4 csum
 - segmentation
 - time reporting

Some may also support:
 - forwarding md tagging
 - Tx launch time
 - no fcs
Legacy / irrelevant:
 - VLAN insertion
Stanislav Fomichev June 25, 2023, 1:12 a.m. UTC | #14
On 06/24, Jakub Kicinski wrote:
> On Fri, 23 Jun 2023 19:52:03 -0700 Alexei Starovoitov wrote:
> > That's pretty much what I'm suggesting.
> > Add two driver specific __weak nop hook points where necessary
> > with a few driver specific kfuncs.
> > Don't build generic infra when it's too early to generalize.
> > 
> > It would mean that bpf progs will be driver specific,
> > but when something novel like this is being proposed it's better
> > to start with minimal code change to core kernel (ideally none)
> > and when common things are found then generalize.
> > 
> > Sounds like Stanislav's use case is timestamps in TX
> > while Donald's are checksums on RX, TX. These use cases are too different.
> > To make HW TX checksum computation driven by AF_XDP
> > a lot more needs to be done than what Stan is proposing for timestamps.
> 
> I'd think HW TX csum is actually simpler than dealing with time,
> will you change your mind if Stan posts Tx csum within a few days? :)
> 
> The set of offloads is barely changing, the lack of clarity 
> on what is needed seems overstated. IMHO AF_XDP is getting no use
> today, because everything remotely complex was stripped out of 
> the implementation to get it merged. Aren't we hand waving the
> complexity away simply because we don't want to deal with it?
> 
> These are the features today's devices support (rx/tx is a mirror):
>  - L4 csum
>  - segmentation
>  - time reporting
> 
> Some may also support:
>  - forwarding md tagging
>  - Tx launch time
>  - no fcs
> Legacy / irrelevant:
>  - VLAN insertion

Right, the goal of the series is to lay out the foundation to support
AF_XDP offloads. I'm starting with tx timestamp because that's more
pressing. But, as I mentioned in another thread, we do have other
users that want to adopt AF_XDP, but due to missing tx offloads, they
aren't able to.

IMHO, with pre-tx/post-tx hooks, it's pretty easy to go from TX
timestamp to TX checksum offload; we don't need a lot:
- define another generic kfunc bpf_request_tx_csum(from, to)
- drivers implement it
- af_xdp users call this kfunc from devtx hook
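
Something along these lines (hypothetical sketch; default body below,
overridable per driver the same way the timestamp kfuncs are):

/* ask the device to checksum bytes [from, to) of the frame;
 * exact semantics are made up here for illustration
 */
__bpf_kfunc int bpf_request_tx_csum(const struct devtx_frame *ctx,
				    u16 from, u16 to)
{
	return -EOPNOTSUPP;	/* default when the driver doesn't implement it */
}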

We seem to be arguing over start-with-my-specific-narrow-use-case vs
start-with-generic implementation, so maybe time for the office hours?
I can try to present some cohesive plan of how we start with the framework
plus tx-timestamp and expand with tx-checksum/etc. There is a lot of
commonality in these offloads, so I'm probably not communicating it
properly..
Stanislav Fomichev June 26, 2023, 9:36 p.m. UTC | #15
On 06/24, Stanislav Fomichev wrote:
> On 06/24, Jakub Kicinski wrote:
> > On Fri, 23 Jun 2023 19:52:03 -0700 Alexei Starovoitov wrote:
> > [...]
> 
> Right, the goal of the series is to lay out the foundation to support
> AF_XDP offloads. I'm starting with tx timestamp because that's more
> pressing. But, as I mentioned in another thread, we do have other
> users that want to adopt AF_XDP, but due to missing tx offloads, they
> aren't able to.
> 
> IMHO, with pre-tx/post-tx hooks, it's pretty easy to go from TX
> timestamp to TX checksum offload; we don't need a lot:
> - define another generic kfunc bpf_request_tx_csum(from, to)
> - drivers implement it
> - af_xdp users call this kfunc from devtx hook
> 
> We seem to be arguing over start-with-my-specific-narrow-use-case vs
> start-with-generic implementation, so maybe time for the office hours?
> I can try to present some cohesive plan of how we start with the framework
> plus tx-timestamp and expand with tx-checksum/etc. There is a lot of
> commonality in these offloads, so I'm probably not communicating it
> properly..

Or, maybe a better suggestion: let me try to implement TX checksum
kfunc in the v3 (to show how to build on top of this series).
Having code is better than doing slides :-D
Alexei Starovoitov June 26, 2023, 10:37 p.m. UTC | #16
On Mon, Jun 26, 2023 at 2:36 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> > >
> > > I'd think HW TX csum is actually simpler than dealing with time,
> > > will you change your mind if Stan posts Tx csum within a few days? :)

Absolutely :) Happy to change my mind.

> > [...]
>
> Or, maybe a better suggestion: let me try to implement TX checksum
> kfunc in the v3 (to show how to build on top this series).
> Having code is better than doing slides :-D

That would certainly help :)
What I got out of your lsfmmbpf talk is that timestamp is your
main and only use case. tx checksum for af_xdp is the other use case,
but it's not yours, so you sort-of use it as an extra justification
for timestamp. Hence my negative reaction to 'generality'.
I think we'll have better results in designing an api
when we look at these two use cases independently
and implement them in patches solving specifically timestamp
with normal skb traffic and tx checksum for af_xdp as two independent apis.
If it looks like we can extract a common framework out of them, great.
But trying to generalize before truly addressing both cases
is likely to cripple both apis.

It doesn't have to be only two use cases.
I completely agree with Kuba that:
 - L4 csum
 - segmentation
 - time reporting
are universal HW NIC features and we need to have an api
that exposes these features in programmable way to bpf progs in the kernel
and through af_xdp to user space.
I mainly suggest addressing them one by one and looking
for common code bits and api similarities later.
Stanislav Fomichev June 26, 2023, 11:29 p.m. UTC | #17
On Mon, Jun 26, 2023 at 3:37 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> [...]
>
> That would certainly help :)
> What I got out of your lsfmmbpf talk is that timestamp is your
> main and only use case. tx checksum for af_xdp is the other use case,
> but it's not yours, so you sort-of use it as an extra justification
> for timestamp. Hence my negative reaction to 'generality'.
> I think we'll have better results in designing an api
> when we look at these two use cases independently.
> And implement them in patches solving specifically timestamp
> with normal skb traffic and tx checksum for af_xdp as two independent apis.
> If it looks like we can extract a common framework out of them. Great.
> But trying to generalize before truly addressing both cases
> is likely to cripple both apis.

I need timestamps for the af_xdp case and I don't really care about skb :-(
I brought skb into the picture mostly to cover John's cases.
So maybe let's drop the skb case for now and focus on af_xdp?
skb is convenient testing-wise though (with veth), but maybe I can
somehow carve-out af_xdp skbs only out of it..

Regarding timestamp vs checksum: timestamp is more pressing, but I do
have people around that want to use af_xdp but need multibuf + tx
offloads, so I was hoping to at least have a path for more tx offloads
after we're done with tx timestamp "offload"..

> It doesn't have to be only two use cases.
> I completely agree with Kuba that:
>  - L4 csum
>  - segmentation
>  - time reporting
> are universal HW NIC features and we need to have an api
> that exposes these features in programmable way to bpf progs in the kernel
> and through af_xdp to user space.
> I mainly suggest addressing them one by one and look
> for common code bits and api similarities later.

Ack, let me see if I can fit tx csum into the picture. I still feel
like we need these dev-bound tracing programs if we want to trigger
kfuncs safely, but maybe we can simplify further..
Toke Høiland-Jørgensen June 27, 2023, 1:35 p.m. UTC | #18
Stanislav Fomichev <sdf@google.com> writes:

> Ack, let me see if I can fit tx csum into the picture. I still feel
> like we need these dev-bound tracing programs if we want to trigger
> kfuncs safely, but maybe we can simplify further..

FWIW, I absolutely think we should go with the "attach to ifindex + dev
bound" model instead of the "attach to driver kfunc and check ifindex in
BPF" one. The latter may be fine for BPF kernel devs, but it's a terrible
API for a dataplane, which IMO is what we're building here...

-Toke
John Fastabend June 27, 2023, 9:43 p.m. UTC | #19
Stanislav Fomichev wrote:
> [...]
> 
> I need timestamps for the af_xdp case and I don't really care about skb :-(
> I brought skb into the picture mostly to cover John's cases.
> So maybe let's drop the skb case for now and focus on af_xdp?
> skb is convenient testing-wise though (with veth), but maybe I can
> somehow carve-out af_xdp skbs only out of it..

I'm ok if you drop my use case, but I read the above and I seem to have a
slightly different opinion/approach in mind.

What I think would be the most straight-forward and most flexible thing
is to create a <drvname>_devtx_submit_skb(<drvname>descriptor, sk_buff)
and <drvname>_devtx_submit_xdp(<drvname>descriptor, xdp_frame) and then
corresponding calls for <drvname>_devtx_complete_{skb|xdp}(). Then you
don't spend any cycles building the metadata thing or have to even
worry about read kfuncs. The BPF program has read access to any
fields it needs. And with the skb/xdp pointer we have the context
that created the descriptor and can generate meaningful metrics.
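
To make that concrete, a rough sketch of a program attached to such a
hook; mlx5e_devtx_submit_xdp() and its argument list are assumptions,
but struct mlx5e_tx_wqe / mlx5_wqe_ctrl_seg are the real mlx5 TX
descriptor layout such a program would read directly via BTF:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_core_read.h>

SEC("fentry/mlx5e_devtx_submit_xdp")
int BPF_PROG(tx_submit, struct mlx5e_tx_wqe *wqe, struct xdp_frame *frame)
{
	/* Read straight out of the HW descriptor: no metadata layer and
	 * no read kfuncs, just BTF-described struct access.
	 */
	__u32 ctrl = bpf_ntohl(BPF_CORE_READ(wqe, ctrl.opmod_idx_opcode));

	/* The low byte carries the WQE opcode (e.g. MLX5_OPCODE_SEND). */
	bpf_printk("tx submit: opcode 0x%x len %u", ctrl & 0xff, frame->len);
	return 0;
}

char _license[] SEC("license") = "GPL";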

I'm clearly sacrificing usability for the general user that
doesn't know about drivers, hardware and so on, in exchange for performance,
flexibility and simplicity of implementation. In general I'm OK with
this. I have trouble picturing the dev who is coding at
this layer but can't read kernel driver code or doesn't at least have a good
understanding of the hardware. We are deep in the optimization and
performance world once we get to putting hooks in the driver; we should
expect a certain amount of understanding before we let folks just plant
hooks here. It's going to be very easy to cause all sorts of damage
even if we go to the other extreme and make a unified interface and
push the complexity onto kernel devs to maintain. I really believe
folks writing AF_XDP code (for DPDK or otherwise) have a really good
handle on networking, hardware, and drivers. I also expect that
AF_XDP users will mostly be abstracted away from AF_XDP internals
by DPDK and other libs or applications. My $.02 is these will be
primarily middle-box type applications built for special-purpose l2/l3/l4
firewalling, DDoS, etc. and telco protocols. Rant off.

But I can admit <drvname>_devtx_submit_xdp(<drvname>descriptor, ...)
is a bit raw. For one, it's going to require an object file per
driver/hardware and, maybe worse, multiple objects per driver/hw to
deal with hw descriptor changes with features. My $.02 is we could
solve this with better attach-time linking. Right now you have to know at
compile time what you are going to attach to and how to parse
the descriptor. If we had attach-time linking we could dynamically
link to the hardware-specific code at link time. And then it's up
to us here or others who really understand the hardware to write
an ice_read_ts, mlx_read_ts, etc., but that can all just be normal BPF code.

Also I hand waved through a step where at attach time we have
some way to say link the thing that is associated with the
driver I'm about to attach to. As a first step a loader could
do this.

It's maybe more core work and less driver wrangling then, and
it means kfuncs become just blocks of BPF that anyone can
write. The big win in my mind is I don't need to know now
what I'll want tomorrow, because I should already have access. Also we push
the complexity and maintenance out of the driver/kernel and into
BPF libs and users. Then we don't have to touch BPF core just
to add new things.

Last bit that complicates things is I need a way to also write
allowed values into the descriptor. We don't have anything that
can do this now. So maybe kfuncs for the write tstamp flags and
friends?

Anyways, my $.02.



> 
> Regarding timestamp vs checksum: timestamp is more pressing, but I do

One request would be to also include a driver that doesn't have
always-on timestamps, so some writer is needed. For CSUM enabling,
I'm interested to see what the signature looks like. On the
skb side we use the skb a fair amount to build the checksum,
it seems, so I guess AF_XDP needs to push down the csum_offset?
In the SKB case it's less interesting I think because the
stack is already handling it.

> have people around that want to use af_xdp but need multibuf + tx
> offloads, so I was hoping to at least have a path for more tx offloads
> after we're done with tx timestamp "offload"..
> 
> > [...]
> 
> Ack, let me see if I can fit tx csum into the picture. I still feel
> like we need these dev-bound tracing programs if we want to trigger
> kfuncs safely, but maybe we can simplify further..

It's not clear to me how you get to a dev-specific attach here
without complicating the hot path more. I think we need to
really be careful not to make the hot path more complex. Will
follow along for sure to see what gets created.

Thanks,
John
Stanislav Fomichev June 27, 2023, 10:56 p.m. UTC | #20
On 06/27, John Fastabend wrote:
> Stanislav Fomichev wrote:
> > [...]
> 
> [...]
> 
> Last bit that complicates things is I need a way to also write
> allowed values into the descriptor. We don't have anything that
> can do this now. So maybe kfuncs for the write tstamp flags and
> friends?

And in this case, the kfuncs would be non-generic and look something
like the following?

  bpf_devtx_<drvname>_request_timestamp(<drvname>descriptor, xdp_frame)

I feel like this can work, yeah. The interface is raw, but maybe
you are both right in assuming that different nics will
expose different capabilities and we shouldn't try to pretend
there is some commonality. I'll try to explore that idea more with
the tx-csum..

Worst case, userspace can do:

int bpf_devtx_request_timestamp(some-generic-prog-abstraction-to-pass-ctx)
{
#ifdef DEVICE_MLX5
  return mlx5_request_timestamp(...);
#elif DEVICE_VETH
  return veth_request_timestamp(...);
#else
  ...
#endif
}

One thing we should probably spend more time on in this case is documenting
it all. Or maybe having some DEVTX_XXX macros for those kfunc definitions
and hooks to make them discoverable.

But yeah, I see the appeal. The only ugly part with this all is that
my xdp_hw_metadata would not be portable at all :-/ But it might be
a good place/reason to actually figure out how to do it :-)

> > Regarding timestamp vs checksum: timestamp is more pressing, but I do
> 
> One request would be to also include a driver that doesn't have
> always on timestamps so some writer is needed. CSUM enabling
> I'm interested to see what the signature looks like? On the
> skb side we use the skb a fair amount to build the checksum
> it seems so I guess AF_XDP needs to push down the csum_offset?
> In the SKB case its less interesting I think becuase the
> stack is already handling it.

I need access to your lab :-p

Regarding the signature, csum_start + csum_offset maybe? As we have in
skb?

Although, from a quick grepping, I see some of the drivers have only
a fixed set of tx checksum configurations they support:

switch (skb->csum_offset) {
case offsetof(struct tcphdr, check):
	tx_desc->flags |= DO_TCP_IP_TX_CSUM_AT_FIXED_OFFSET;
	break;
default:
	/* sw fallback */
}

So maybe that's another argument in favor of not doing a generic
layer and just exposing whatever the HW has in a non-portable way...
(otoh, still accepting csum_offset+start and doing that switch inside
is probably an ok common interface)
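
For illustration, a driver-side sketch of that common interface; the
xyz_* names and flags are made up, only the switch mirrors the pattern
quoted above:

struct xyz_tx_desc {
	u32 flags;			/* hypothetical HW descriptor */
};

#define XYZ_TX_CSUM_TCP_FIXED_OFFSET	BIT(0)
#define XYZ_TX_CSUM_UDP_FIXED_OFFSET	BIT(1)

static int xyz_devtx_request_l4_csum(struct xyz_tx_desc *desc,
				     u16 csum_start, u16 csum_offset)
{
	switch (csum_offset) {
	case offsetof(struct tcphdr, check):
		desc->flags |= XYZ_TX_CSUM_TCP_FIXED_OFFSET;
		break;
	case offsetof(struct udphdr, check):
		desc->flags |= XYZ_TX_CSUM_UDP_FIXED_OFFSET;
		break;
	default:
		/* HW can't checksum at this offset; tell the caller so
		 * it can fall back to a software checksum.
		 */
		return -EOPNOTSUPP;
	}
	return 0;
}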

> > [...]
> > 
> > Ack, let me see if I can fit tx csum into the picture. I still feel
> > like we need these dev-bound tracing programs if we want to trigger
> > kfuncs safely, but maybe we can simplify further..
> 
> Its not clear to me how you get to a dev specific attach here
> without complicating the hot path more. I think we need to
> really be careful to not make hotpath more complex. Will
> follow along for sure to see what gets created.

Agreed. I've yet to test it with some real HW (still in the process of
trying to get back my lab configuration which was changed recently) :-(
John Fastabend June 27, 2023, 11:33 p.m. UTC | #21
Stanislav Fomichev wrote:
> [...]
> 
> And in this case, the kfuncs would be non-generic and look something
> like the following?
> 
>   bpf_devtx_<drvname>_request_timestamp(<drivname>descriptor, xdp_frame)

Yeah, for writing into the descriptor I couldn't think up anything more
clever. Anyway, we will want to JIT that into insns if we are touching
every pkt.

> 
> I feel like this can work, yeah. The interface is raw, but maybe
> you are both right in assuming that different nics will
> expose different capabilities and we shouldn't try to pretend
> there is some commonality. I'll try to explore that idea more with
> the tx-csum..

I think it should be handled at the next level up, with BPF libraries
and at the BPF community level, by building abstractions on top of BPF
instead of trying to bury them where they feel less natural
to me. A project like Tetragon, DPDK, ... would abstract these behind
their APIs, and users writing an AF_XDP widget on top of these would
probably never need to know about it. Certainly we won't have end
users of Tetragon care about the driver they are attaching to.

> 
> Worst case, userspace can do:
> 
> int bpf_devtx_request_timestamp(some-generic-prog-abstraction-to-pass-ctx)
> {
> #ifdef DEVICE_MLX5
>   return mlx5_request_timestamp(...);
> #elif DEVICE_VETH
>   return veth_request-timestamp(...);
> #else
>   ...
> #endif
> }

Yeah I think so, and then carry a couple of different object files
around for the environment. We do this already for some things.
It's not ideal but it works. I think a good end goal would be

 int bpf_devtx_request_timestamp(...)
 {
	set_ts = dlsym(dl_handle, "request_timestamp");
	return set_ts(...);
 }

Then we could at attach time take that dlsym and rewrite it.

> 
> One thing we should probably spend more time in this case is documenting
> it all. Or maybe having some DEVTX_XXX macros for those kfunc definitions
> and hooks to make them discoverable.

More docs would be great for sure.

> 
> But yeah, I see the appeal. The only ugly part with this all is that
> my xdp_hw_metadata would not be portable at all :-/ But it might be
> a good place/reason to actually figure out how to do it :-)

Agree you lose portability at the low-level BPF, but we could
get it back with BPF libs. I think the basic tradeoff here
is I want to keep the interface as raw as possible and push the
details into the BPF program/loader. So you lose the low-level
portability, but I think we get a lot for it and can get it
back if the BPF community builds the libs and tooling to solve
the problem at the next layer up. My thinking is the kernel dev
intuition is to solve it in the kernel, but we take on a lot
of complexity to do this when we could push it out to userspace
where it's easier to manage versioning and the complexity.

> 
> > > Regarding timestamp vs checksum: timestamp is more pressing, but I do
> > 
> > One request would be to also include a driver that doesn't have
> > always on timestamps so some writer is needed. CSUM enabling
> > I'm interested to see what the signature looks like? On the
> > skb side we use the skb a fair amount to build the checksum
> > it seems so I guess AF_XDP needs to push down the csum_offset?
> > In the SKB case its less interesting I think becuase the
> > stack is already handling it.
> 
> I need access to your lab :-p

I can likely try to at least prototype for some nic.

> 
> Regarding the signature, csum_start + csum_offset maybe? As we have in
> skb?
> 
> Although, from a quick grepping, I see some of the drivers have only
> a fixed set of tx checksum configurations they support:
> 
> switch (skb->csum_offset) {
> case offsetof(struct tcphdr, check):
> 	tx_desc->flags |= DO_TCP_IP_TX_CSUM_AT_FIXED_OFFSET;
> 	break;
> default:
> 	/* sw fallback */
> }

Yeah, because they might not want to, or know how to, do other protocols.

> 
> So maybe that's another argument in favor of not doing a generic
> layer and just expose whatever HW has in a non-portable way...
> (otoh, still accepting csum_offset+start + doing that switch inside
> is probably an ok common interface)

That is what I was thinking.

> 
> > > have people around that want to use af_xdp but need multibuf + tx
> > > offloads, so I was hoping to at least have a path for more tx offloads
> > > after we're done with tx timestamp "offload"..
> > > 
> > > > It doesn't have to be only two use cases.
> > > > I completely agree with Kuba that:
> > > >  - L4 csum
> > > >  - segmentation
> > > >  - time reporting
> > > > are universal HW NIC features and we need to have an api
> > > > that exposes these features in programmable way to bpf progs in the kernel
> > > > and through af_xdp to user space.
> > > > I mainly suggest addressing them one by one and look
> > > > for common code bits and api similarities later.
> > > 
> > > Ack, let me see if I can fit tx csum into the picture. I still feel
> > > like we need these dev-bound tracing programs if we want to trigger
> > > kfuncs safely, but maybe we can simplify further..
> > 
> > Its not clear to me how you get to a dev specific attach here
> > without complicating the hot path more. I think we need to
> > really be careful to not make hotpath more complex. Will
> > follow along for sure to see what gets created.
> 
> Agreed. I've yet to test it with some real HW (still in the process of
> trying to get back my lab configuration which was changed recently) :-(

:( Another question would be: is there a use case for netdev-specific
programs? That might drive the need to attach to a
specific netdev. I can't think of a use case off-hand; I guess
Toke might have something in mind based on his reply earlier.
Alexei Starovoitov June 27, 2023, 11:50 p.m. UTC | #22
On Tue, Jun 27, 2023 at 4:33 PM John Fastabend <john.fastabend@gmail.com> wrote:
>
> Yeah I think so and then carry a couple different object files
> for the environment around. We do this already for some things.
> Its not ideal but it works. I think a good end goal would be
>
>  int bpf_devtx_request_timestamp(...)
>  {
>         set_ts = dlsym( dl_handle, request-timestamp);
>         return set_ts(...)
>  }
>
> Then we could at attach time take that dlsym and rewrite it.

Sounds like we need polymorphic kfuncs.
Same kfunc name called by bpf prog, but implementation would
change depending on {attach_btf_id, prog_type, etc}.
The existing bpf_xdp_metadata_rx_hash is almost that.
Except it's driver specific.
We should probably generalize this mechanism.
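
Roughly, following the rx_hash precedent, the generalized shape could be
a per-driver ops table with the generic kfunc call site rewritten at
attach time; everything below is illustrative naming, not an existing
kernel API:

/* Hypothetical per-driver implementations: */
static int mlx5e_devtx_request_timestamp(const struct devtx_frame *ctx);
static int mlx5e_devtx_request_l4_csum(const struct devtx_frame *ctx,
				       u16 csum_start, u16 csum_offset);

/* One table per driver behind the generic kfunc names; the verifier
 * would pick the implementation based on {attach_btf_id, prog_type}
 * at load/attach time.
 */
struct devtx_ops {
	int (*request_timestamp)(const struct devtx_frame *ctx);
	int (*request_l4_csum)(const struct devtx_frame *ctx,
			       u16 csum_start, u16 csum_offset);
};

static const struct devtx_ops mlx5e_devtx_ops = {
	.request_timestamp	= mlx5e_devtx_request_timestamp,
	.request_l4_csum	= mlx5e_devtx_request_l4_csum,
};
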
Jakub Kicinski June 28, 2023, 6:52 p.m. UTC | #23
On Tue, 27 Jun 2023 14:43:57 -0700 John Fastabend wrote:
> What I think would be the most straight-forward thing and most flexible
> is to create a <drvname>_devtx_submit_skb(<drivname>descriptor, sk_buff)
> and <drvname>_devtx_submit_xdp(<drvname>descriptor, xdp_frame) and then
> corresponding calls for <drvname>_devtx_complete_{skb|xdp}() Then you
> don't spend any cycles building the metadata thing or have to even
> worry about read kfuncs. The BPF program has read access to any
> fields they need. And with the skb, xdp pointer we have the context
> that created the descriptor and generate meaningful metrics.

Sorry but this is not going to happen without my nack. DPDK was a much
cleaner bifurcation point than trying to write datapath drivers in BPF.
Users having to learn how to render descriptors for all the NICs
and queue formats out there is not reasonable. Isovalent hired
a lot of former driver developers so you may feel like it's a good
idea, as a middleware provider. But for the rest of us the matrix
of HW x queue format x people writing BPF is too large. If we can
write some poor man's DPDK / common BPF driver library to be selected
at linking time - we can as well provide a generic interface in
the kernel itself. Again, we never merged explicit DPDK support, 
your idea is strictly worse.
Toke Høiland-Jørgensen June 29, 2023, 11:43 a.m. UTC | #24
Jakub Kicinski <kuba@kernel.org> writes:

> On Tue, 27 Jun 2023 14:43:57 -0700 John Fastabend wrote:
>> [...]
>
> Sorry but this is not going to happen without my nack. DPDK was a much
> cleaner bifurcation point than trying to write datapath drivers in BPF.
> Users having to learn how to render descriptors for all the NICs
> and queue formats out there is not reasonable. Isovalent hired
> a lot of former driver developers so you may feel like it's a good
> idea, as a middleware provider. But for the rest of us the matrix
> of HW x queue format x people writing BPF is too large. If we can
> write some poor man's DPDK / common BPF driver library to be selected
> at linking time - we can as well provide a generic interface in
> the kernel itself. Again, we never merged explicit DPDK support, 
> your idea is strictly worse.

I agree: we're writing an operating system kernel here. The *whole
point* of an operating system is to provide an abstraction over
different types of hardware and provide a common API so users don't have
to deal with the hardware details.

I feel like there's some tension between "BPF as a dataplane API" and
"BPF as a kernel extension language" here, especially as the BPF
subsystem has grown more features in the latter direction. In my mind,
XDP is still very much a dataplane API; in fact that's one of the main
selling points wrt DPDK: you can get high performance networking but
still take advantage of the kernel drivers and other abstractions that
the kernel provides. If you're going for raw performance and the ability
to twiddle every tiny detail of the hardware, DPDK fills that niche
quite nicely (and also shows us the pains of going that route).

-Toke
Stanislav Fomichev June 30, 2023, 6:54 p.m. UTC | #25
On Thu, Jun 29, 2023 at 4:43 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> [...]

Since the thread has been quiet for a day, here is how I'm planning to proceed:
- remove most of the devtx_frame context (but still keep it for
stashing descriptor pointers and having a common kfunc api)
- keep common kfunc interface for common abstractions
- separate skb/xdp hooks - this is probably a good idea anyway to not
mix them up (we are focusing mostly on xdp here)
- continue using raw fentry for now, let's reconsider later, depending
on where we end up with generic apis vs non-generic ones
- add tx checksum to show how this tx-dev-bound framework can be
extended (and show similarities between the timestamp and checksum)

Iow, I'll largely keep the same approach but will try to expose raw
skb/xdp_frame + add tx-csum. Let's reconvene once I send out v3. Thank
you all for the valuable feedback!
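
A sketch of where that could land; the struct layout and hook names
below are guesses at the v3 shape, not posted code:

/* Slimmed-down context, kept mostly so drivers have a place to stash
 * their descriptor pointers for the kfuncs to use:
 */
struct devtx_ctx {
	struct net_device *netdev;
	/* driver-specific descriptor pointers follow */
};

/* Separate hooks per frame type, still plain fentry attach points: */
void devtx_xdp_submit(const struct devtx_ctx *ctx, struct xdp_frame *frame);
void devtx_xdp_complete(const struct devtx_ctx *ctx, struct xdp_frame *frame);
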
John Fastabend July 1, 2023, 12:52 a.m. UTC | #26
Toke Høiland-Jørgensen wrote:
> Jakub Kicinski <kuba@kernel.org> writes:
> 
> > On Tue, 27 Jun 2023 14:43:57 -0700 John Fastabend wrote:
> >> [...]
> >
> > Sorry but this is not going to happen without my nack. DPDK was a much
> > cleaner bifurcation point than trying to write datapath drivers in BPF.
> > Users having to learn how to render descriptors for all the NICs
> > and queue formats out there is not reasonable. Isovalent hired

I would expect BPF/driver experts would write the libraries for the
datapath API that the network/switch developer is going to use. I would
even put the BPF programs in kernel and ship them with the release
if that helps.

We have different visions on who the BPF user is that writes XDP
programs I think.

> > a lot of former driver developers so you may feel like it's a good
> > idea, as a middleware provider. But for the rest of us the matrix
> > of HW x queue format x people writing BPF is too large. If we can

It's nice though that we have good coverage for XDP, so the matrix
is big. Even with kfuncs we need someone to write support.
My thought is it's just a question of whether they write it in BPF
or in C code as a reader kfunc. I suspect for these advanced features
it's only a subset, at least upfront. Either way, BPF or C, you are
stuck finding someone to write that code.

> > write some poor man's DPDK / common BPF driver library to be selected
> > at linking time - we can as well provide a generic interface in
> > the kernel itself. Again, we never merged explicit DPDK support, 
> > your idea is strictly worse.
> 
> I agree: we're writing an operating system kernel here. The *whole
> point* of an operating system is to provide an abstraction over
> different types of hardware and provide a common API so users don't have
> to deal with the hardware details.

And just to be clear, what we sacrifice then is forwards/backwards
portability. If it's a kernel kfunc, we need to add a kfunc for
every field we want to read, and it will only be available from then on.
Further, it will need some general agreement that it's useful for
it to be added. A hardware vendor won't be able to add some arbitrary
field and get access to it. So we lose this by doing kfuncs.

It's pushing complexity into the kernel, maintained in the kernel,
when we could push the complexity into BPF and maintain it as user
space code and BPF code. It's a choice to make, I think.

Also, abstraction can cost cycles. Here we have to prepare the
structure and call the kfunc. The kfunc can be inlined if folks
do the work. It may be a small cost, but it's not free.

> 
> I feel like there's some tension between "BPF as a dataplane API" and
> "BPF as a kernel extension language" here, especially as the BPF

Agree. I'm obviously not maximizing for ease of use of the dataplane
API as BPF. IMO though, even with the kfunc abstraction, it's niche work
writing low-level datapath code that requires exposing a user
API higher up the stack. With a DSL (P4, ...) for example you could
abstract away the complexity and then compile down into these
details. Or, if you like tables, an OpenFlow-style table interface
would provide a table API.

> subsystem has grown more features in the latter direction. In my mind,
> XDP is still very much a dataplane API; in fact that's one of the main
> selling points wrt DPDK: you can get high performance networking but
> still take advantage of the kernel drivers and other abstractions that

I think we agree on the goal a fast datapath for the nic.

> the kernel provides. If you're going for raw performance and the ability
> to twiddle every tiny detail of the hardware, DPDK fills that niche
> quite nicely (and also shows us the pains of going that route).

Summary on my side: with raw descriptor reads we minimize kernel
complexity, we don't need to know in advance what we'll
want to read in the future, and we need folks who understand
the hardware regardless of whether the code lives in BPF
or C. C certainly helps with picking what kfunc to use,
but we also have BTF, which solves this struct/offset problem
for non-networking use cases already.

Jakub Kicinski July 1, 2023, 3:11 a.m. UTC | #27
On Fri, 30 Jun 2023 17:52:05 -0700 John Fastabend wrote:
> [...]
> 
> I would expect BPF/driver experts would write the libraries for the
> datapath API that the network/switch developer is going to use. I would
> even put the BPF programs in kernel and ship them with the release
> if that helps.
> 
> We have different visions on who the BPF user is that writes XDP
> programs I think.

Yes, crucially. What I've seen talking to engineers working on TC/XDP
BPF at Meta (and I may not be dealing with experts, Martin would have
a broader view) is that they don't understand basics like s/g or
details of checksums.

I don't think it is reasonable to call you, Maxim, Nik and co. "users".
We're risking building a system so complex that normal people will _need_
an overlay on top to make it work.

> > > a lot of former driver developers so you may feel like it's a good
> > > idea, as a middleware provider. But for the rest of us the matrix
> > > of HW x queue format x people writing BPF is too large. If we can  
> 
> Its nice though that we have good coverage for XDP so the matrix
> is big. Even with kfuncs though we need someone to write support.
> My thought is its just a question of if they write it in BPF
> or in C code as a reader kfunc. I suspect for these advanced features
> its only a subset at least upfront. Either way BPF or C you are
> stuck finding someone to write that code.

Right, but the kernel is a central point where it can be written, reviewed,
cross-optimized and stored.

> > [...]
> 
> And just to be clear what we sacrifice then is forwards/backwards
> portability.

Forward compatibility is also the favorite word of HW vendors when 
they create proprietary interfaces. I think it's incorrect to call
cutting functionality out of a project forward compatibility.
If functionality is moved, the surface of compatibility is different.

> If its a kernel kfunc we need to add a kfunc for
> every field we want to read and it will only be available then.
> Further, it will need some general agreement that its useful for
> it to be added. A hardware vendor wont be able to add some arbitrary
> field and get access to it. So we lose this by doing kfuncs.

We both know how easy it is to come up with useful HW, so I'm guessing
this is rhetorical.

> Its pushing complexity into the kernel that we maintain in kernel
> when we could push the complexity into BPF and maintain as user
> space code and BPF codes. Its a choice to make I think.

Right, and I believe having the code in the kernel, appropriately
integrated with the drivers, is beneficial. The main argument against
it is that in certain environments kernels are old. But that's a very
destructive argument.
John Fastabend July 3, 2023, 6:30 p.m. UTC | #28
Jakub Kicinski wrote:
> On Fri, 30 Jun 2023 17:52:05 -0700 John Fastabend wrote:
> > [...]
> > 
> > We have different visions on who the BPF user is that writes XDP
> > programs I think.
> 
> Yes, crucially. What I've seen talking to engineers working on TC/XDP
> BPF at Meta (and I may not be dealing with experts, Martin would have
> a broader view) is that they don't understand basics like s/g or
> details of checksums.

Interesting data point. But these same engineers will want to get
access to the checksum, yet don't understand it? Seems if you're
going to start reading/writing descriptors, even through kfuncs,
we need to get some docs/notes on how to use them correctly.
We certainly won't put guardrails on the reads/writes, for performance
reasons.

> 
> I don't think it is reasonable to call you, Maxim, Nik and co. "users".
> We're risking building system so complex normal people will _need_ an
> overlay on top to make it work.

I consider us users. We write networking CNI and observability/sec
tooling on top of BPF. Most of what we create is driven by customer
environments and performance. Maybe not typical users I guess, but
also Meta users are not typical and have their own set of constraints
and insights.

> 
> > > > a lot of former driver developers so you may feel like it's a good
> > > > idea, as a middleware provider. But for the rest of us the matrix
> > > > of HW x queue format x people writing BPF is too large. If we can  
> > 
> > It's nice, though, that we have good coverage for XDP, so the matrix
> > is big. Even with kfuncs, though, we need someone to write support.
> > My thought is it's just a question of whether they write it in BPF
> > or in C code as a reader kfunc. I suspect for these advanced features
> > it's only a subset, at least upfront. Either way, BPF or C, you are
> > stuck finding someone to write that code.
> 
> Right, but the kernel is a central point where it can be written,
> reviewed, cross-optimized and stored.

We can check BPF code into the kernel.

> 
> > > > write some poor man's DPDK / common BPF driver library to be selected
> > > > at linking time - we can as well provide a generic interface in
> > > > the kernel itself. Again, we never merged explicit DPDK support, 
> > > > your idea is strictly worse.  
> > > 
> > > I agree: we're writing an operating system kernel here. The *whole
> > > point* of an operating system is to provide an abstraction over
> > > different types of hardware and provide a common API so users don't have
> > > to deal with the hardware details.  
> > 
> > And just to be clear what we sacrifice then is forwards/backwards
> > portability.
> 
> Forward compatibility is also the favorite word of HW vendors when
> they create proprietary interfaces. I think it's incorrect to call
> cutting functionality out of a project forward compatibility.
> If functionality is moved, the surface of compatibility is different.

Sure, a bit of an abuse of terminology.

> 
> > If it's a kernel kfunc, we need to add a kfunc for
> > every field we want to read, and it will only be available then.
> > Further, it will need some general agreement that it's useful for
> > it to be added. A hardware vendor won't be able to add some arbitrary
> > field and get access to it. So we lose this by doing kfuncs.
> 
> We both know how easy it is to come up with useful HW, so I'm guessing
> this is rhetorical.

It wasn't rhetorical, but I agree: we've been chasing this for years,
and outside of environments where you own a very large data center
and sell VMs, or in niche spaces, it's been hard to come up with a
really good general use case.

> 
> > It's pushing complexity into the kernel that we maintain in the kernel
> > when we could push the complexity into BPF and maintain it as
> > user-space code and BPF code. It's a choice to make, I think.
> 
> Right, and I believe having the code in the kernel, appropriately
> integrated with the drivers is beneficial. The main argument against 
> it is that in certain environments kernels are old. But that's a very
> destructive argument.

My main concern here is that we forget some kfunc that we need and then
we are stuck. We don't have the luxury of upgrading kernels easily.
It doesn't need to be an either/or discussion: if we have a ctx()
call, we can drop into BTF over the descriptor and use kfuncs for
the most common things. The other option is to simply write a kfunc
for every field I see that could potentially have some use, even
if I don't fully understand it at the moment.
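
To make the raw-access side concrete, here is a rough sketch of a BPF
program reading the descriptor via CO-RE, with no per-field kfuncs. It is
purely illustrative: it assumes the fentry attach point and the
mlx5e_devtx_frame layout from the patch below, and that the driver types
are visible through (module) BTF.

/* Hypothetical sketch: raw TX descriptor access via BTF/CO-RE.
 * Relies on devtx_frame being the first member of mlx5e_devtx_frame;
 * CO-RE relocations absorb per-kernel layout differences.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

SEC("fentry/mlx5e_devtx_complete")
int BPF_PROG(tx_complete_raw, const struct devtx_frame *frame)
{
	/* The driver-specific wrapper is one cast away. */
	const struct mlx5e_devtx_frame *mlx5_frame = (const void *)frame;
	struct mlx5_cqe64 *cqe;
	__u64 raw_ts = 0;

	cqe = BPF_CORE_READ(mlx5_frame, cqe);
	if (cqe)
		/* Raw big-endian device timestamp; real code would
		 * byte-swap and convert to nanoseconds.
		 */
		raw_ts = BPF_CORE_READ(cqe, timestamp);

	bpf_printk("raw cqe timestamp: %llu", raw_ts);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";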

I suspect I am less concerned about raw access because we already
have BTF infra built up around our network observability/sec
solution, so we already handle per-kernel differences, and a
descriptor just looks like another BTF object we want to read. And we
know what devices and types we are attaching to, so we don't have
to ask whether this is a mlx or Intel or other device.

Also, as a more practical concern, how do we manage NIC-specific
things? Have NIC-specific kfuncs? Per-descriptor tx_flags and
status flags. Other things we need are a pointer to the skb and access
to the descriptor ring so we can pull stats off the ring. I'm
not arguing it can't be done with kfuncs, but if we go the kfunc
route, be prepared for a long list of kfuncs, and driver-specific
ones.
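
For example, a per-vendor status reader mirroring the timestamp op in
the patch below might look like this; the xmo_cp_status name and the
choice of the raw CQE opcode as a "status" are made up, just to show the
shape of what each vendor would have to add.

/* Hypothetical sketch of a driver-specific reader, patterned after
 * mlx5e_devtx_cp_timestamp() from the patch below. The op name and
 * the use of the CQE opcode as a "status" are assumptions.
 */
static int mlx5e_devtx_cp_status(const struct devtx_frame *_ctx, u32 *status)
{
	const struct mlx5e_devtx_frame *ctx = (void *)_ctx;

	if (unlikely(!ctx->cqe))
		return -ENODATA;

	/* Expose the raw completion opcode; every vendor needs its own
	 * variant of this, which is the "long list" concern above.
	 */
	*status = get_cqe_opcode(ctx->cqe);
	return 0;
}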

.John
Jakub Kicinski July 3, 2023, 7:33 p.m. UTC | #29
On Mon, 03 Jul 2023 11:30:44 -0700 John Fastabend wrote:
> Jakub Kicinski wrote:
> > On Fri, 30 Jun 2023 17:52:05 -0700 John Fastabend wrote:  
> > > I would expect BPF/driver experts would write the libraries for the
> > > datapath API that the network/switch developer is going to use. I would
> > > even put the BPF programs in kernel and ship them with the release
> > > if that helps.
> > > 
> > > We have different visions of who the BPF user is that writes XDP
> > > programs, I think.
> > 
> > Yes, crucially. What I've seen talking to engineers working on TC/XDP
> > BPF at Meta (and I may not be dealing with experts, Martin would have
> > a broader view) is that they don't understand basics like s/g or
> > details of checksums.  
> 
> Interesting data point. But these same engineers will want to get
> access to the checksum, but don't understand it? It seems that if you're
> going to start reading/writing descriptors, even through kfuncs,
> we need some docs/notes on how to use them correctly.
> We certainly won't put guardrails on the reads/writes, for performance
> reasons.

Dunno about checksum, but it's definitely the same kind of person
that'd want access to timestamps.

> > I don't think it is reasonable to call you, Maxim, Nik and co. "users".
> > We're risking building a system so complex that normal people will
> > _need_ an overlay on top to make it work.
> 
> I consider us users. We write networking CNI and observability/sec
> tooling on top of BPF. Most of what we create is driven by customer
> environments and performance. Maybe not typical users, I guess, but
> Meta users are also not typical and have their own set of constraints
> and insights.

One thing Meta certainly does (and I think is a large part of the
success of BPF) is delegating the development of applications away from
the core kernel team. Meta is different from a smaller company in that
it _has_ a kernel team, but the "network application" teams, I suspect,
are fairly typical.

> > > It's pushing complexity into the kernel that we maintain in the kernel
> > > when we could push the complexity into BPF and maintain it as
> > > user-space code and BPF code. It's a choice to make, I think.
> > 
> > Right, and I believe having the code in the kernel, appropriately
> > integrated with the drivers is beneficial. The main argument against 
> > it is that in certain environments kernels are old. But that's a very
> > destructive argument.  
> 
> My main concern here is that we forget some kfunc that we need and then
> we are stuck. We don't have the luxury of upgrading kernels easily.
> It doesn't need to be an either/or discussion: if we have a ctx()
> call, we can drop into BTF over the descriptor and use kfuncs for
> the most common things. The other option is to simply write a kfunc
> for every field I see that could potentially have some use, even
> if I don't fully understand it at the moment.
> 
> I suspect I am less concerned about raw access because we already
> have BTF infra built up around our network observability/sec
> solution, so we already handle per-kernel differences, and a
> descriptor just looks like another BTF object we want to read. And we
> know what devices and types we are attaching to, so we don't have
> to ask whether this is a mlx or Intel or other device.
> 
> Also, as a more practical concern, how do we manage NIC-specific
> things?

What are the NIC-specific things?

> Have NIC-specific kfuncs? Per-descriptor tx_flags and
> status flags. Other things we need are a pointer to the skb and access
> to the descriptor ring so we can pull stats off the ring. I'm
> not arguing it can't be done with kfuncs, but if we go the kfunc
> route, be prepared for a long list of kfuncs, and driver-specific
> ones.

IDK why you say that; I gave the base list of offloads in an earlier
email.

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index 879d698b6119..e4509464e0b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -6,6 +6,7 @@ 
 
 #include "en.h"
 #include <linux/indirect_call_wrapper.h>
+#include <net/devtx.h>
 
 #define MLX5E_TX_WQE_EMPTY_DS_COUNT (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS)
 
@@ -506,4 +507,14 @@  static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int
 
 	return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
 }
+
+struct mlx5e_devtx_frame {
+	struct devtx_frame frame;
+	struct mlx5_cqe64 *cqe; /* tx completion */
+	struct mlx5e_tx_wqe *wqe; /* tx */
+};
+
+void mlx5e_devtx_submit(struct devtx_frame *ctx);
+void mlx5e_devtx_complete(struct devtx_frame *ctx);
+
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index f0e6095809fa..0cb0f0799cbc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -255,9 +255,30 @@  static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
 	return 0;
 }
 
+static int mlx5e_devtx_sb_request_timestamp(const struct devtx_frame *ctx)
+{
+	/* Nothing to do here, CQE always has a timestamp. */
+	return 0;
+}
+
+static int mlx5e_devtx_cp_timestamp(const struct devtx_frame *_ctx, u64 *timestamp)
+{
+	const struct mlx5e_devtx_frame *ctx = (void *)_ctx;
+	u64 ts;
+
+	if (unlikely(!ctx->cqe))
+		return -ENODATA;
+
+	ts = get_cqe_ts(ctx->cqe);
+	*timestamp = mlx5_real_time_cyc2time(NULL, ts);
+	return 0;
+}
+
 const struct xdp_metadata_ops mlx5e_xdp_metadata_ops = {
 	.xmo_rx_timestamp		= mlx5e_xdp_rx_timestamp,
 	.xmo_rx_hash			= mlx5e_xdp_rx_hash,
+	.xmo_sb_request_timestamp	= mlx5e_devtx_sb_request_timestamp,
+	.xmo_cp_timestamp		= mlx5e_devtx_cp_timestamp,
 };
 
 /* returns true if packet was consumed by xdp */
@@ -453,6 +474,23 @@  mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptx
 
 	mlx5e_xdp_mpwqe_add_dseg(sq, p, stats);
 
+	if (devtx_enabled()) {
+		struct mlx5e_xmit_data_frags *xdptxdf =
+			container_of(xdptxd, struct mlx5e_xmit_data_frags, xd);
+
+		struct mlx5e_devtx_frame ctx = {
+			.frame = {
+				.data = p->data,
+				.len = p->len,
+				.meta_len = sq->xsk_pool->tx_metadata_len,
+				.sinfo = xdptxd->has_frags ? xdptxdf->sinfo : NULL,
+				.netdev = sq->cq.netdev,
+			},
+			.wqe = sq->mpwqe.wqe,
+		};
+		mlx5e_devtx_submit(&ctx.frame);
+	}
+
 	if (unlikely(mlx5e_xdp_mpwqe_is_full(session, sq->max_sq_mpw_wqebbs)))
 		mlx5e_xdp_mpwqe_complete(sq);
 
@@ -560,6 +598,20 @@  mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
 		dseg++;
 	}
 
+	if (devtx_enabled()) {
+		struct mlx5e_devtx_frame ctx = {
+			.frame = {
+				.data = xdptxd->data,
+				.len = xdptxd->len,
+				.meta_len = sq->xsk_pool->tx_metadata_len,
+				.sinfo = xdptxd->has_frags ? xdptxdf->sinfo : NULL,
+				.netdev = sq->cq.netdev,
+			},
+			.wqe = wqe,
+		};
+		mlx5e_devtx_submit(&ctx.frame);
+	}
+
 	cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND);
 
 	if (test_bit(MLX5E_SQ_STATE_XDP_MULTIBUF, &sq->state)) {
@@ -607,7 +659,8 @@  mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
 static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
 				  struct mlx5e_xdp_wqe_info *wi,
 				  u32 *xsk_frames,
-				  struct xdp_frame_bulk *bq)
+				  struct xdp_frame_bulk *bq,
+				  struct mlx5_cqe64 *cqe)
 {
 	struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo;
 	u16 i;
@@ -626,6 +679,14 @@  static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
 			xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo);
 			dma_addr = xdpi.frame.dma_addr;
 
+			if (false && devtx_enabled()) {
+				struct mlx5e_devtx_frame ctx;
+
+				devtx_frame_from_xdp(&ctx.frame, xdpf, sq->cq.netdev);
+				ctx.cqe = cqe;
+				mlx5e_devtx_complete(&ctx.frame);
+			}
+
 			dma_unmap_single(sq->pdev, dma_addr,
 					 xdpf->len, DMA_TO_DEVICE);
 			if (xdp_frame_has_frags(xdpf)) {
@@ -659,6 +720,20 @@  static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
 				xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo);
 				page = xdpi.page.page;
 
+				if (false && devtx_enabled()) {
+					struct mlx5e_devtx_frame ctx = {
+						.frame = {
+							.data = page,
+							.len = PAGE_SIZE,
+							.meta_len = sq->xsk_pool->tx_metadata_len,
+							.netdev = sq->cq.netdev,
+						},
+						.cqe = cqe,
+					};
+
+					mlx5e_devtx_complete(&ctx.frame);
+				}
+
 				/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
 				 * as we know this is a page_pool page.
 				 */
@@ -670,6 +745,21 @@  static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
 		}
 		case MLX5E_XDP_XMIT_MODE_XSK:
 			/* AF_XDP send */
+
+			if (devtx_enabled()) {
+				struct mlx5e_devtx_frame ctx = {
+					.frame = {
+						.data = xdpi.frame.xsk_head,
+						.len = xdpi.page.xsk_head_len,
+						.meta_len = sq->xsk_pool->tx_metadata_len,
+						.netdev = sq->cq.netdev,
+					},
+					.cqe = cqe,
+				};
+
+				mlx5e_devtx_complete(&ctx.frame);
+			}
+
 			(*xsk_frames)++;
 			break;
 		default:
@@ -720,7 +810,7 @@  bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
 
 			sqcc += wi->num_wqebbs;
 
-			mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq);
+			mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq, cqe);
 		} while (!last_wqe);
 
 		if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) {
@@ -767,7 +857,7 @@  void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq)
 
 		sq->cc += wi->num_wqebbs;
 
-		mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq);
+		mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq, NULL);
 	}
 
 	xdp_flush_frame_bulk(&bq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index 9e8e6184f9e4..860638e1209b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -50,6 +50,11 @@  struct mlx5e_xdp_buff {
 	struct mlx5e_rq *rq;
 };
 
+struct mlx5e_xdp_md {
+	struct xdp_md md;
+	struct mlx5_cqe64 *cqe;
+};
+
 /* XDP packets can be transmitted in different ways. On completion, we need to
  * distinguish between them to clean up things in a proper way.
  */
@@ -82,18 +87,20 @@  enum mlx5e_xdp_xmit_mode {
  *    num, page_1, page_2, ... , page_num.
  *
  * MLX5E_XDP_XMIT_MODE_XSK:
- *    none.
+ *    frame.xsk_head + page.xsk_head_len for header portion only.
  */
 union mlx5e_xdp_info {
 	enum mlx5e_xdp_xmit_mode mode;
 	union {
 		struct xdp_frame *xdpf;
 		dma_addr_t dma_addr;
+		void *xsk_head;
 	} frame;
 	union {
 		struct mlx5e_rq *rq;
 		u8 num;
 		struct page *page;
+		u32 xsk_head_len;
 	} page;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index 597f319d4770..1b97d6f6a9ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -96,6 +96,9 @@  bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
 
 		xsk_buff_raw_dma_sync_for_device(pool, xdptxd.dma_addr, xdptxd.len);
 
+		xdpi.frame.xsk_head = xdptxd.data;
+		xdpi.page.xsk_head_len = xdptxd.len;
+
 		ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
 				      mlx5e_xmit_xdp_frame, sq, &xdptxd,
 				      check_result);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index c7eb6b238c2b..f8d3e210408a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -758,6 +758,14 @@  static void mlx5e_tx_wi_consume_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_t
 	for (i = 0; i < wi->num_fifo_pkts; i++) {
 		struct sk_buff *skb = mlx5e_skb_fifo_pop(&sq->db.skb_fifo);
 
+		if (false && devtx_enabled()) {
+			struct mlx5e_devtx_frame ctx = {};
+
+			devtx_frame_from_skb(&ctx.frame, skb, sq->cq.netdev);
+			ctx.cqe = cqe;
+			mlx5e_devtx_complete(&ctx.frame);
+		}
+
 		mlx5e_consume_skb(sq, skb, cqe, napi_budget);
 	}
 }
@@ -826,6 +834,14 @@  bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 			sqcc += wi->num_wqebbs;
 
 			if (likely(wi->skb)) {
+				if (false && devtx_enabled()) {
+					struct mlx5e_devtx_frame ctx = {};
+
+					devtx_frame_from_skb(&ctx.frame, wi->skb, cq->netdev);
+					ctx.cqe = cqe;
+					mlx5e_devtx_complete(&ctx.frame);
+				}
+
 				mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc);
 				mlx5e_consume_skb(sq, wi->skb, cqe, napi_budget);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index a7eb65cd0bdd..7160389a5bc6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -48,6 +48,7 @@ 
 #include <linux/mlx5/vport.h>
 #include <linux/version.h>
 #include <net/devlink.h>
+#include <net/devtx.h>
 #include "mlx5_core.h"
 #include "thermal.h"
 #include "lib/eq.h"
@@ -73,6 +74,7 @@ 
 #include "sf/dev/dev.h"
 #include "sf/sf.h"
 #include "mlx5_irq.h"
+#include "en/xdp.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
@@ -2132,6 +2134,19 @@  static void mlx5_core_verify_params(void)
 	}
 }
 
+__weak noinline void mlx5e_devtx_submit(struct devtx_frame *ctx)
+{
+}
+
+__weak noinline void mlx5e_devtx_complete(struct devtx_frame *ctx)
+{
+}
+
+BTF_SET8_START(mlx5e_devtx_hook_ids)
+BTF_ID_FLAGS(func, mlx5e_devtx_submit)
+BTF_ID_FLAGS(func, mlx5e_devtx_complete)
+BTF_SET8_END(mlx5e_devtx_hook_ids)
+
 static int __init mlx5_init(void)
 {
 	int err;
@@ -2144,9 +2159,15 @@  static int __init mlx5_init(void)
 	mlx5_core_verify_params();
 	mlx5_register_debugfs();
 
+	err = devtx_hooks_register(&mlx5e_devtx_hook_ids, &mlx5e_xdp_metadata_ops);
+	if (err) {
+		pr_warn("failed to register devtx hooks: %d", err);
+		goto err_debug;
+	}
+
 	err = mlx5e_init();
 	if (err)
-		goto err_debug;
+		goto err_devtx;
 
 	err = mlx5_sf_driver_register();
 	if (err)
@@ -2162,6 +2183,8 @@  static int __init mlx5_init(void)
 	mlx5_sf_driver_unregister();
 err_sf:
 	mlx5e_cleanup();
+err_devtx:
+	devtx_hooks_unregister(&mlx5e_devtx_hook_ids);
 err_debug:
 	mlx5_unregister_debugfs();
 	return err;
@@ -2169,6 +2192,7 @@  static int __init mlx5_init(void)
 
 static void __exit mlx5_cleanup(void)
 {
+	devtx_hooks_unregister(&mlx5e_devtx_hook_ids);
 	pci_unregister_driver(&mlx5_core_driver);
 	mlx5_sf_driver_unregister();
 	mlx5e_cleanup();
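
For reference, a minimal sketch of the BPF side that would pair with
these hooks; the devtx kfunc names are extrapolated from the xmo_* ops
in this patch, and the attach interface is assumed to be fentry on the
__weak driver hooks, so both may differ in the final series.

/* Minimal, untested consumer sketch for the TX hooks above. The
 * kfunc declarations mirror the xmo_sb_request_timestamp/
 * xmo_cp_timestamp signatures from this patch.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern int devtx_sb_request_timestamp(const struct devtx_frame *ctx) __ksym;
extern int devtx_cp_timestamp(const struct devtx_frame *ctx,
			      __u64 *timestamp) __ksym;

SEC("fentry/mlx5e_devtx_submit")
int BPF_PROG(tx_submit, const struct devtx_frame *frame)
{
	/* Ask the driver to deliver a completion timestamp. */
	devtx_sb_request_timestamp(frame);
	return 0;
}

SEC("fentry/mlx5e_devtx_complete")
int BPF_PROG(tx_complete, const struct devtx_frame *frame)
{
	__u64 ts = 0;

	/* Returns 0 on success, like mlx5e_devtx_cp_timestamp() above. */
	if (!devtx_cp_timestamp(frame, &ts))
		bpf_printk("tx completion timestamp: %llu", ts);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";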