diff mbox series

[bpf-next,v6,11/18] ice: put XDP meta sources assignment under a static key condition

Message ID 20231012170524.21085-12-larysa.zaremba@intel.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series XDP metadata via kfuncs for ice + VLAN hint | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format fail Series longer than 15 patches (and no cover letter)
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1340 this patch: 1340
netdev/cc_maintainers warning 6 maintainers not CCed: intel-wired-lan@lists.osuosl.org pabeni@redhat.com edumazet@google.com jesse.brandeburg@intel.com anthony.l.nguyen@intel.com davem@davemloft.net
netdev/build_clang success Errors and warnings before: 1363 this patch: 1363
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1364 this patch: 1364
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 61 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 fail Logs for veristat

Commit Message

Larysa Zaremba Oct. 12, 2023, 5:05 p.m. UTC
Usage of XDP hints requires putting additional information after the
xdp_buff. In the basic case, only the descriptor has to be copied on a
per-packet basis, because the xdp_buff permanently resides right before
the per-ring metadata (cached time and VLAN protocol ID).

However, in ZC mode, xdp_buffs come from a pool, so the memory after such
a buffer does not contain any reliable information and everything has to
be copied, hurting performance.

Introduce a static key to enable meta sources assignment only when the
attached XDP program is device-bound.

This patch eliminates a 6% performance drop in ZC mode, which was a result
of adding XDP hints to the driver.

Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h      |  1 +
 drivers/net/ethernet/intel/ice/ice_main.c | 14 ++++++++++++++
 drivers/net/ethernet/intel/ice/ice_txrx.c |  3 ++-
 drivers/net/ethernet/intel/ice/ice_xsk.c  |  3 +++
 4 files changed, 20 insertions(+), 1 deletion(-)
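
For orientation, a simplified sketch of the layout the commit message
describes (based on the structures this series touches, not the exact driver
definitions): in the regular Rx path the xdp_buff is embedded in the ring
right before the packet context, so only the descriptor pointer has to be set
per packet, while ZC buffers come from the pool with no such context behind
them.

struct ice_pkt_ctx {
	const union ice_32b_rx_flex_desc *eop_desc;	/* set per packet */
	u64 cached_phctime;				/* per-ring */
	__be16 vlan_proto;				/* per-ring */
};

struct ice_xdp_buff {
	struct xdp_buff xdp_buff;
	struct ice_pkt_ctx pkt_ctx;	/* copied wholesale per packet in ZC mode */
};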

Comments

Fijalkowski, Maciej Oct. 20, 2023, 4:32 p.m. UTC | #1
On Thu, Oct 12, 2023 at 07:05:17PM +0200, Larysa Zaremba wrote:
> Usage of XDP hints requires putting additional information after the
> xdp_buff. In basic case, only the descriptor has to be copied on a
> per-packet basis, because xdp_buff permanently resides before per-ring
> metadata (cached time and VLAN protocol ID).
> 
> However, in ZC mode, xdp_buffs come from a pool, so memory after such
> buffer does not contain any reliable information, so everything has to be
> copied, damaging the performance.
> 
> Introduce a static key to enable meta sources assignment only when attached
> XDP program is device-bound.
> 
> This patch eliminates a 6% performance drop in ZC mode, which was a result
> of addition of XDP hints to the driver.
> 
> Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> ---
>  drivers/net/ethernet/intel/ice/ice.h      |  1 +
>  drivers/net/ethernet/intel/ice/ice_main.c | 14 ++++++++++++++
>  drivers/net/ethernet/intel/ice/ice_txrx.c |  3 ++-
>  drivers/net/ethernet/intel/ice/ice_xsk.c  |  3 +++
>  4 files changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> index 3d0f15f8b2b8..76d22be878a4 100644
> --- a/drivers/net/ethernet/intel/ice/ice.h
> +++ b/drivers/net/ethernet/intel/ice/ice.h
> @@ -210,6 +210,7 @@ enum ice_feature {
>  };
>  
>  DECLARE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> +DECLARE_STATIC_KEY_FALSE(ice_xdp_meta_key);
>  
>  struct ice_channel {
>  	struct list_head list;
> diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> index 47e8920e1727..ee0df86d34b7 100644
> --- a/drivers/net/ethernet/intel/ice/ice_main.c
> +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> @@ -48,6 +48,9 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
>  DEFINE_STATIC_KEY_FALSE(ice_xdp_locking_key);
>  EXPORT_SYMBOL(ice_xdp_locking_key);
>  
> +DEFINE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> +EXPORT_SYMBOL(ice_xdp_meta_key);
> +
>  /**
>   * ice_hw_to_dev - Get device pointer from the hardware structure
>   * @hw: pointer to the device HW structure
> @@ -2634,6 +2637,11 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
>  	return -ENOMEM;
>  }
>  
> +static bool ice_xdp_prog_has_meta(struct bpf_prog *prog)
> +{
> +	return prog && prog->aux->dev_bound;
> +}
> +
>  /**
>   * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
>   * @vsi: VSI to set the bpf prog on
> @@ -2644,10 +2652,16 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
>  	struct bpf_prog *old_prog;
>  	int i;
>  
> +	if (ice_xdp_prog_has_meta(prog))
> +		static_branch_inc(&ice_xdp_meta_key);

I thought a boolean key would be enough, but inc/dec should serve properly,
for example for prog hotswap cases.

> +
>  	old_prog = xchg(&vsi->xdp_prog, prog);
>  	ice_for_each_rxq(vsi, i)
>  		WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog);
>  
> +	if (ice_xdp_prog_has_meta(old_prog))
> +		static_branch_dec(&ice_xdp_meta_key);
> +
>  	if (old_prog)
>  		bpf_prog_put(old_prog);
>  }
> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
> index 4fd7614f243d..19fc182d1f4c 100644
> --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> @@ -572,7 +572,8 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
>  	if (!xdp_prog)
>  		goto exit;
>  
> -	ice_xdp_meta_set_desc(xdp, eop_desc);
> +	if (static_branch_unlikely(&ice_xdp_meta_key))

My only concern is that we might now be hurting the hints path in a minor
way, no?

> +		ice_xdp_meta_set_desc(xdp, eop_desc);
>  
>  	act = bpf_prog_run_xdp(xdp_prog, xdp);
>  	switch (act) {
> diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> index 39775bb6cec1..f92d7d33fde6 100644
> --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> @@ -773,6 +773,9 @@ static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
>  				   union ice_32b_rx_flex_desc *eop_desc,
>  				   struct ice_rx_ring *rx_ring)
>  {
> +	if (!static_branch_unlikely(&ice_xdp_meta_key))
> +		return;

wouldn't it be better to pull it out and avoid calling
ice_prepare_pkt_ctx_zc() unnecessarily?
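
Something along these lines at the call site, roughly (just sketching the
suggestion, not tested):

	/* in ice_run_xdp_zc(), instead of checking inside the helper */
	if (static_branch_unlikely(&ice_xdp_meta_key))
		ice_prepare_pkt_ctx_zc(xdp, rx_desc, rx_ring);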

> +
>  	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
>  	((struct ice_xdp_buff *)xdp)->pkt_ctx = rx_ring->pkt_ctx;
>  	ice_xdp_meta_set_desc(xdp, eop_desc);
> -- 
> 2.41.0
>
Larysa Zaremba Oct. 23, 2023, 9:35 a.m. UTC | #2
On Fri, Oct 20, 2023 at 06:32:13PM +0200, Maciej Fijalkowski wrote:
> On Thu, Oct 12, 2023 at 07:05:17PM +0200, Larysa Zaremba wrote:
> > Usage of XDP hints requires putting additional information after the
> > xdp_buff. In basic case, only the descriptor has to be copied on a
> > per-packet basis, because xdp_buff permanently resides before per-ring
> > metadata (cached time and VLAN protocol ID).
> > 
> > However, in ZC mode, xdp_buffs come from a pool, so memory after such
> > buffer does not contain any reliable information, so everything has to be
> > copied, damaging the performance.
> > 
> > Introduce a static key to enable meta sources assignment only when attached
> > XDP program is device-bound.
> > 
> > This patch eliminates a 6% performance drop in ZC mode, which was a result
> > of addition of XDP hints to the driver.
> > 
> > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > ---
> >  drivers/net/ethernet/intel/ice/ice.h      |  1 +
> >  drivers/net/ethernet/intel/ice/ice_main.c | 14 ++++++++++++++
> >  drivers/net/ethernet/intel/ice/ice_txrx.c |  3 ++-
> >  drivers/net/ethernet/intel/ice/ice_xsk.c  |  3 +++
> >  4 files changed, 20 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> > index 3d0f15f8b2b8..76d22be878a4 100644
> > --- a/drivers/net/ethernet/intel/ice/ice.h
> > +++ b/drivers/net/ethernet/intel/ice/ice.h
> > @@ -210,6 +210,7 @@ enum ice_feature {
> >  };
> >  
> >  DECLARE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > +DECLARE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> >  
> >  struct ice_channel {
> >  	struct list_head list;
> > diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> > index 47e8920e1727..ee0df86d34b7 100644
> > --- a/drivers/net/ethernet/intel/ice/ice_main.c
> > +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> > @@ -48,6 +48,9 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
> >  DEFINE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> >  EXPORT_SYMBOL(ice_xdp_locking_key);
> >  
> > +DEFINE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > +EXPORT_SYMBOL(ice_xdp_meta_key);
> > +
> >  /**
> >   * ice_hw_to_dev - Get device pointer from the hardware structure
> >   * @hw: pointer to the device HW structure
> > @@ -2634,6 +2637,11 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
> >  	return -ENOMEM;
> >  }
> >  
> > +static bool ice_xdp_prog_has_meta(struct bpf_prog *prog)
> > +{
> > +	return prog && prog->aux->dev_bound;
> > +}
> > +
> >  /**
> >   * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
> >   * @vsi: VSI to set the bpf prog on
> > @@ -2644,10 +2652,16 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
> >  	struct bpf_prog *old_prog;
> >  	int i;
> >  
> > +	if (ice_xdp_prog_has_meta(prog))
> > +		static_branch_inc(&ice_xdp_meta_key);
> 
> i thought boolean key would be enough but inc/dec should serve properly
> for example prog hotswap cases.
>

My thought process on using counting instead of a boolean was: there can be 
several PFs that use the same driver, therefore we need to keep track of how 
many of them use hints. And yes, this also looks better for hot-swapping, 
because the conditions become more straightforward (we do not need to compare 
the old and new programs).
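
To illustrate (hypothetical sequence, names made up, not driver code), the
key then simply acts as a refcount across VSIs, and the inc-before-dec
ordering in ice_vsi_assign_bpf_prog() keeps it enabled across a hot-swap:

	ice_vsi_assign_bpf_prog(vsi0, hints_prog);	/* 0 -> 1, branch enabled  */
	ice_vsi_assign_bpf_prog(vsi1, plain_prog);	/* stays 1                 */
	ice_vsi_assign_bpf_prog(vsi0, new_hints_prog);	/* inc to 2, dec to 1      */
	ice_vsi_assign_bpf_prog(vsi0, NULL);		/* 1 -> 0, branch disabled */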

> > +
> >  	old_prog = xchg(&vsi->xdp_prog, prog);
> >  	ice_for_each_rxq(vsi, i)
> >  		WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog);
> >  
> > +	if (ice_xdp_prog_has_meta(old_prog))
> > +		static_branch_dec(&ice_xdp_meta_key);
> > +
> >  	if (old_prog)
> >  		bpf_prog_put(old_prog);
> >  }
> > diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > index 4fd7614f243d..19fc182d1f4c 100644
> > --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> > +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > @@ -572,7 +572,8 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
> >  	if (!xdp_prog)
> >  		goto exit;
> >  
> > -	ice_xdp_meta_set_desc(xdp, eop_desc);
> > +	if (static_branch_unlikely(&ice_xdp_meta_key))
> 
> My only concern is that we might be hurting in a minor way hints path now,
> no?

I thought "unlikely" refers to the default state the code is compiled with, 
and that after the static key is incremented this should be patched to 
"likely". Isn't this how static keys work?

> 
> > +		ice_xdp_meta_set_desc(xdp, eop_desc);
> >  
> >  	act = bpf_prog_run_xdp(xdp_prog, xdp);
> >  	switch (act) {
> > diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > index 39775bb6cec1..f92d7d33fde6 100644
> > --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> > +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > @@ -773,6 +773,9 @@ static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
> >  				   union ice_32b_rx_flex_desc *eop_desc,
> >  				   struct ice_rx_ring *rx_ring)
> >  {
> > +	if (!static_branch_unlikely(&ice_xdp_meta_key))
> > +		return;
> 
> wouldn't it be better to pull it out and avoid calling
> ice_prepare_pkt_ctx_zc() unnecessarily?
> 
> > +
> >  	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
> >  	((struct ice_xdp_buff *)xdp)->pkt_ctx = rx_ring->pkt_ctx;
> >  	ice_xdp_meta_set_desc(xdp, eop_desc);
> > -- 
> > 2.41.0
> >
Fijalkowski, Maciej Oct. 28, 2023, 7:55 p.m. UTC | #3
On Mon, Oct 23, 2023 at 11:35:46AM +0200, Larysa Zaremba wrote:
> On Fri, Oct 20, 2023 at 06:32:13PM +0200, Maciej Fijalkowski wrote:
> > On Thu, Oct 12, 2023 at 07:05:17PM +0200, Larysa Zaremba wrote:
> > > Usage of XDP hints requires putting additional information after the
> > > xdp_buff. In basic case, only the descriptor has to be copied on a
> > > per-packet basis, because xdp_buff permanently resides before per-ring
> > > metadata (cached time and VLAN protocol ID).
> > > 
> > > However, in ZC mode, xdp_buffs come from a pool, so memory after such
> > > buffer does not contain any reliable information, so everything has to be
> > > copied, damaging the performance.
> > > 
> > > Introduce a static key to enable meta sources assignment only when attached
> > > XDP program is device-bound.
> > > 
> > > This patch eliminates a 6% performance drop in ZC mode, which was a result
> > > of addition of XDP hints to the driver.
> > > 
> > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > ---
> > >  drivers/net/ethernet/intel/ice/ice.h      |  1 +
> > >  drivers/net/ethernet/intel/ice/ice_main.c | 14 ++++++++++++++
> > >  drivers/net/ethernet/intel/ice/ice_txrx.c |  3 ++-
> > >  drivers/net/ethernet/intel/ice/ice_xsk.c  |  3 +++
> > >  4 files changed, 20 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> > > index 3d0f15f8b2b8..76d22be878a4 100644
> > > --- a/drivers/net/ethernet/intel/ice/ice.h
> > > +++ b/drivers/net/ethernet/intel/ice/ice.h
> > > @@ -210,6 +210,7 @@ enum ice_feature {
> > >  };
> > >  
> > >  DECLARE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > +DECLARE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > >  
> > >  struct ice_channel {
> > >  	struct list_head list;
> > > diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> > > index 47e8920e1727..ee0df86d34b7 100644
> > > --- a/drivers/net/ethernet/intel/ice/ice_main.c
> > > +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> > > @@ -48,6 +48,9 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
> > >  DEFINE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > >  EXPORT_SYMBOL(ice_xdp_locking_key);
> > >  
> > > +DEFINE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > +EXPORT_SYMBOL(ice_xdp_meta_key);
> > > +
> > >  /**
> > >   * ice_hw_to_dev - Get device pointer from the hardware structure
> > >   * @hw: pointer to the device HW structure
> > > @@ -2634,6 +2637,11 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
> > >  	return -ENOMEM;
> > >  }
> > >  
> > > +static bool ice_xdp_prog_has_meta(struct bpf_prog *prog)
> > > +{
> > > +	return prog && prog->aux->dev_bound;
> > > +}
> > > +
> > >  /**
> > >   * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
> > >   * @vsi: VSI to set the bpf prog on
> > > @@ -2644,10 +2652,16 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
> > >  	struct bpf_prog *old_prog;
> > >  	int i;
> > >  
> > > +	if (ice_xdp_prog_has_meta(prog))
> > > +		static_branch_inc(&ice_xdp_meta_key);
> > 
> > i thought boolean key would be enough but inc/dec should serve properly
> > for example prog hotswap cases.
> >
> 
> My thought process on using counting instead of boolean was: there can be 
> several PFs that use the same driver, so therefore we need to keep track of how 
> many od them use hints. 

Very good point. This implies that if PF0 has hints-enabled prog loaded,
PF1 with non-hints prog will "suffer" from it.

Sorry for such long delays in responses, but I was having a hard time
making up my mind about it. In the end I have come to some conclusions.
I know the timing for sending this response is not ideal, but I need to
get this off my chest and bring the discussion back to life :)

IMHO having static keys to eliminate ZC overhead does not scale. I assume
every other driver would have to follow that.

The XSK pool allows us to avoid initializing various things for each packet.
Instead, taking xdp_rxq_info as an example, each xdp_buff from the pool has
xdp_rxq_info assigned at init time. With this in mind, we should have some
mechanism to set hints-specific things in xdp_buff_xsk::cb at init time
as well. Such a mechanism should not require us to expose the driver's private
xdp_buff hints containers (such as ice_pkt_ctx) to the XSK pool.

Right now you moved phctime down to ice_pkt_ctx, and to me that's the main
reason we have to copy ice_pkt_ctx to each xdp_buff on ZC. What if we kept
cached_phctime at its original offset in the ring, and ice_pkt_ctx got a
pointer to it?

This would allow us to init the pointer in each xdp_buff from the XSK pool at
init time. I have come up with a way to program that via so-called XSK
meta descriptors. Each desc would carry the data to write onto cb, the offset
within cb and the number of bytes to write/copy.

I'll share the diff below, but note that I didn't measure by how much the
performance is degraded. My icelake machine, where I used to measure
performance-sensitive code, got broken. For now we can't escape initing
eop_desc for each xdp_buff, but I moved it to the alloc side, as we mangle
descs there anyway.

I think mlx5 could benefit from that approach as well with initing the rq
ptr at init time.

Diff does mostly these things:
- move cached_phctime to old place in ice_rx_ring and add ptr to that in
  ice_pkt_ctx
- introduce xsk_pool_set_meta()
- use it from ice side.

I consider this a discussion trigger rather than ready code. Any
feedback will be appreciated.

---------------------------------8<---------------------------------

diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
index 7fa43827a3f0..c192e84bee55 100644
--- a/drivers/net/ethernet/intel/ice/ice_base.c
+++ b/drivers/net/ethernet/intel/ice/ice_base.c
@@ -519,6 +519,23 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
 	return 0;
 }
 
+static void ice_xsk_pool_set_meta(struct ice_rx_ring *ring)
+{
+	struct xsk_meta_desc desc = {};
+
+	desc.val = (uintptr_t)&ring->cached_phctime;
+	desc.off = offsetof(struct ice_pkt_ctx, cached_phctime);
+	desc.bytes = sizeof_field(struct ice_pkt_ctx, cached_phctime);
+	xsk_pool_set_meta(ring->xsk_pool, &desc);
+
+	memset(&desc, 0, sizeof(struct xsk_meta_desc));
+
+	desc.val = ring->pkt_ctx.vlan_proto;
+	desc.off = offsetof(struct ice_pkt_ctx, vlan_proto);
+	desc.bytes = sizeof_field(struct ice_pkt_ctx, vlan_proto);
+	xsk_pool_set_meta(ring->xsk_pool, &desc);
+}
+
 /**
  * ice_vsi_cfg_rxq - Configure an Rx queue
  * @ring: the ring being configured
@@ -553,6 +570,7 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
 			if (err)
 				return err;
 			xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq);
+			ice_xsk_pool_set_meta(ring);
 
 			dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
 				 ring->q_index);
@@ -575,6 +593,7 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
 
 	xdp_init_buff(&ring->xdp, ice_rx_pg_size(ring) / 2, &ring->xdp_rxq);
 	ring->xdp.data = NULL;
+	ring->pkt_ctx.cached_phctime = &ring->cached_phctime;
 	err = ice_setup_rx_ctx(ring);
 	if (err) {
 		dev_err(dev, "ice_setup_rx_ctx failed for RxQ %d, err %d\n",
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index cf5c91ada94c..d3cb08e66dcb 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -2846,7 +2846,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring,
 		/* clone ring and setup updated count */
 		rx_rings[i] = *vsi->rx_rings[i];
 		rx_rings[i].count = new_rx_cnt;
-		rx_rings[i].pkt_ctx.cached_phctime = pf->ptp.cached_phc_time;
+		rx_rings[i].cached_phctime = pf->ptp.cached_phc_time;
 		rx_rings[i].desc = NULL;
 		rx_rings[i].rx_buf = NULL;
 		/* this is to allow wr32 to have something to write to
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 2fc97eafd1f6..1f45f0c3963d 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -1456,7 +1456,7 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi)
 		ring->netdev = vsi->netdev;
 		ring->dev = dev;
 		ring->count = vsi->num_rx_desc;
-		ring->pkt_ctx.cached_phctime = pf->ptp.cached_phc_time;
+		ring->cached_phctime = pf->ptp.cached_phc_time;
 		WRITE_ONCE(vsi->rx_rings[i], ring);
 	}
 
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c
index f6444890f0ef..e2fa979830cd 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp.c
@@ -955,8 +955,7 @@ static int ice_ptp_update_cached_phctime(struct ice_pf *pf)
 		ice_for_each_rxq(vsi, j) {
 			if (!vsi->rx_rings[j])
 				continue;
-			WRITE_ONCE(vsi->rx_rings[j]->pkt_ctx.cached_phctime,
-				   systime);
+			WRITE_ONCE(vsi->rx_rings[j]->cached_phctime, systime);
 		}
 	}
 	clear_bit(ICE_CFG_BUSY, pf->state);
@@ -2119,7 +2118,7 @@ u64 ice_ptp_get_rx_hwts(const union ice_32b_rx_flex_desc *rx_desc,
 	if (!(rx_desc->wb.time_stamp_low & ICE_PTP_TS_VALID))
 		return 0;
 
-	cached_time = READ_ONCE(pkt_ctx->cached_phctime);
+	cached_time = READ_ONCE(*pkt_ctx->cached_phctime);
 
 	/* Do not report a timestamp if we don't have a cached PHC time */
 	if (!cached_time)
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index 41e0b14e6643..94594cc0d3ee 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -259,7 +259,7 @@ enum ice_rx_dtype {
 
 struct ice_pkt_ctx {
 	const union ice_32b_rx_flex_desc *eop_desc;
-	u64 cached_phctime;
+	u64 *cached_phctime;
 	__be16 vlan_proto;
 };
 
@@ -356,6 +356,7 @@ struct ice_rx_ring {
 	struct ice_tx_ring *xdp_ring;
 	struct xsk_buff_pool *xsk_pool;
 	dma_addr_t dma;			/* physical address of ring */
+	u64 cached_phctime;
 	u16 rx_buf_len;
 	u8 dcb_tc;			/* Traffic class of ring */
 	u8 ptp_rx;
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 49a64bfdd1f6..6fa7a86152d0 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -431,9 +431,18 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid)
 	return ret;
 }
 
+static struct ice_xdp_buff *xsk_buff_to_ice_ctx(struct xdp_buff *xdp)
+{
+	/* xdp_buff pointer used by ZC code path is alloc as xdp_buff_xsk. The
+	 * ice_xdp_buff shares its layout with xdp_buff_xsk and private
+	 * ice_xdp_buff fields fall into xdp_buff_xsk->cb
+	 */
+       return (struct ice_xdp_buff *)xdp;
+}
+
 /**
  * ice_fill_rx_descs - pick buffers from XSK buffer pool and use it
- * @pool: XSK Buffer pool to pull the buffers from
+ * @rx_ring: rx ring
  * @xdp: SW ring of xdp_buff that will hold the buffers
  * @rx_desc: Pointer to Rx descriptors that will be filled
  * @count: The number of buffers to allocate
@@ -445,18 +454,21 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid)
  *
  * Returns the amount of allocated Rx descriptors
  */
-static u16 ice_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
+static u16 ice_fill_rx_descs(struct ice_rx_ring *rx_ring, struct xdp_buff **xdp,
 			     union ice_32b_rx_flex_desc *rx_desc, u16 count)
 {
+	struct ice_xdp_buff *ctx;
 	dma_addr_t dma;
 	u16 buffs;
 	int i;
 
-	buffs = xsk_buff_alloc_batch(pool, xdp, count);
+	buffs = xsk_buff_alloc_batch(rx_ring->xsk_pool, xdp, count);
 	for (i = 0; i < buffs; i++) {
 		dma = xsk_buff_xdp_get_dma(*xdp);
 		rx_desc->read.pkt_addr = cpu_to_le64(dma);
 		rx_desc->wb.status_error0 = 0;
+		ctx = xsk_buff_to_ice_ctx(*xdp);
+		ctx->pkt_ctx.eop_desc = rx_desc;
 
 		rx_desc++;
 		xdp++;
@@ -488,8 +500,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
 	xdp = ice_xdp_buf(rx_ring, ntu);
 
 	if (ntu + count >= rx_ring->count) {
-		nb_buffs_extra = ice_fill_rx_descs(rx_ring->xsk_pool, xdp,
-						   rx_desc,
+		nb_buffs_extra = ice_fill_rx_descs(rx_ring, xdp, rx_desc,
 						   rx_ring->count - ntu);
 		if (nb_buffs_extra != rx_ring->count - ntu) {
 			ntu += nb_buffs_extra;
@@ -502,7 +513,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
 		ice_release_rx_desc(rx_ring, 0);
 	}
 
-	nb_buffs = ice_fill_rx_descs(rx_ring->xsk_pool, xdp, rx_desc, count);
+	nb_buffs = ice_fill_rx_descs(rx_ring, xdp, rx_desc, count);
 
 	ntu += nb_buffs;
 	if (ntu == rx_ring->count)
@@ -746,32 +757,6 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp,
 	return ICE_XDP_CONSUMED;
 }
 
-/**
- * ice_prepare_pkt_ctx_zc - Prepare packet context for XDP hints
- * @xdp: xdp_buff used as input to the XDP program
- * @eop_desc: End of packet descriptor
- * @rx_ring: Rx ring with packet context
- *
- * In regular XDP, xdp_buff is placed inside the ring structure,
- * just before the packet context, so the latter can be accessed
- * with xdp_buff address only at all times, but in ZC mode,
- * xdp_buffs come from the pool, so we need to reinitialize
- * context for every packet.
- *
- * We can safely convert xdp_buff_xsk to ice_xdp_buff,
- * because there are XSK_PRIV_MAX bytes reserved in xdp_buff_xsk
- * right after xdp_buff, for our private use.
- * XSK_CHECK_PRIV_TYPE() ensures we do not go above the limit.
- */
-static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
-				   union ice_32b_rx_flex_desc *eop_desc,
-				   struct ice_rx_ring *rx_ring)
-{
-	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
-	((struct ice_xdp_buff *)xdp)->pkt_ctx = rx_ring->pkt_ctx;
-	ice_xdp_meta_set_desc(xdp, eop_desc);
-}
-
 /**
  * ice_run_xdp_zc - Executes an XDP program in zero-copy path
  * @rx_ring: Rx ring
@@ -784,13 +769,11 @@ static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
  */
 static int
 ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
-	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
-	       union ice_32b_rx_flex_desc *rx_desc)
+	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
 {
 	int err, result = ICE_XDP_PASS;
 	u32 act;
 
-	ice_prepare_pkt_ctx_zc(xdp, rx_desc, rx_ring);
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 
 	if (likely(act == XDP_REDIRECT)) {
@@ -930,8 +913,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
 		if (ice_is_non_eop(rx_ring, rx_desc))
 			continue;
 
-		xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring,
-					 rx_desc);
+		xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring);
 		if (likely(xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))) {
 			xdp_xmit |= xdp_res;
 		} else if (xdp_res == ICE_XDP_EXIT) {
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 1f6fc8c7a84c..91fa74a14841 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -14,6 +14,13 @@
 
 #ifdef CONFIG_XDP_SOCKETS
 
+struct xsk_meta_desc {
+	u64 val;
+	u8 off;
+	u8 bytes;
+};
+
+
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max);
@@ -47,6 +54,12 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
 	xp_set_rxq_info(pool, rxq);
 }
 
+static inline void xsk_pool_set_meta(struct xsk_buff_pool *pool,
+				     struct xsk_meta_desc *desc)
+{
+	xp_set_meta(pool, desc);
+}
+
 static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool)
 {
 #ifdef CONFIG_NET_RX_BUSY_POLL
@@ -250,6 +263,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
 {
 }
 
+static inline void xsk_pool_set_meta(struct xsk_buff_pool *pool,
+				     struct xsk_meta_desc *desc)
+{
+}
+
 static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool)
 {
 	return 0;
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index b0bdff26fc88..354b1c702a82 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -12,6 +12,7 @@
 
 struct xsk_buff_pool;
 struct xdp_rxq_info;
+struct xsk_meta_desc;
 struct xsk_queue;
 struct xdp_desc;
 struct xdp_umem;
@@ -132,6 +133,7 @@ static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_p
 
 /* AF_XDP ZC drivers, via xdp_sock_buff.h */
 void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq);
+void xp_set_meta(struct xsk_buff_pool *pool, struct xsk_meta_desc *desc);
 int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 	       unsigned long attrs, struct page **pages, u32 nr_pages);
 void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 49cb9f9a09be..632fdc247862 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -123,6 +123,18 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
 }
 EXPORT_SYMBOL(xp_set_rxq_info);
 
+void xp_set_meta(struct xsk_buff_pool *pool, struct xsk_meta_desc *desc)
+{
+	u32 i;
+
+	for (i = 0; i < pool->heads_cnt; i++) {
+		struct xdp_buff_xsk *xskb = &pool->heads[i];
+
+		memcpy(xskb->cb + desc->off, desc->buf, desc->bytes);
+	}
+}
+EXPORT_SYMBOL(xp_set_meta);
+
 static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
 {
 	struct netdev_bpf bpf;

--------------------------------->8---------------------------------

> And yes, this also looks better for hot-swapping, 
> because conditions become more straightforward (we do not need to compare old 
> and new programs).
> 
> > > +
> > >  	old_prog = xchg(&vsi->xdp_prog, prog);
> > >  	ice_for_each_rxq(vsi, i)
> > >  		WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog);
> > >  
> > > +	if (ice_xdp_prog_has_meta(old_prog))
> > > +		static_branch_dec(&ice_xdp_meta_key);
> > > +
> > >  	if (old_prog)
> > >  		bpf_prog_put(old_prog);
> > >  }
> > > diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > > index 4fd7614f243d..19fc182d1f4c 100644
> > > --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> > > +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > > @@ -572,7 +572,8 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
> > >  	if (!xdp_prog)
> > >  		goto exit;
> > >  
> > > -	ice_xdp_meta_set_desc(xdp, eop_desc);
> > > +	if (static_branch_unlikely(&ice_xdp_meta_key))
> > 
> > My only concern is that we might be hurting in a minor way hints path now,
> > no?
> 
> I have thought "unlikely" refers to the default state the code is compiled with 
> and after static key incrementation this should be patched to "likely". Isn't 
> this how static keys work?

I was only referring to the fact that it ends with a compiler hint:
#define unlikely_notrace(x)	__builtin_expect(!!(x), 0)

see include/linux/jump_label.h
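
For completeness, a simplified view of the construct (sketch, mirroring the
patch): with a DEFINE_STATIC_KEY_FALSE key the branch site is emitted as a
nop and live-patched into a jump by static_branch_inc(), so the disabled
case has no runtime conditional; the _unlikely flavour only tells the
compiler to lay the enabled body out of line, it is not something that gets
flipped to "likely" when the key is enabled.

	if (static_branch_unlikely(&ice_xdp_meta_key))
		ice_xdp_meta_set_desc(xdp, eop_desc);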

> 
> > 
> > > +		ice_xdp_meta_set_desc(xdp, eop_desc);
> > >  
> > >  	act = bpf_prog_run_xdp(xdp_prog, xdp);
> > >  	switch (act) {
> > > diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > index 39775bb6cec1..f92d7d33fde6 100644
> > > --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > @@ -773,6 +773,9 @@ static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
> > >  				   union ice_32b_rx_flex_desc *eop_desc,
> > >  				   struct ice_rx_ring *rx_ring)
> > >  {
> > > +	if (!static_branch_unlikely(&ice_xdp_meta_key))
> > > +		return;
> > 
> > wouldn't it be better to pull it out and avoid calling
> > ice_prepare_pkt_ctx_zc() unnecessarily?
> > 
> > > +
> > >  	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
> > >  	((struct ice_xdp_buff *)xdp)->pkt_ctx = rx_ring->pkt_ctx;
> > >  	ice_xdp_meta_set_desc(xdp, eop_desc);
> > > -- 
> > > 2.41.0
> > >
Larysa Zaremba Oct. 31, 2023, 2:22 p.m. UTC | #4
On Sat, Oct 28, 2023 at 09:55:52PM +0200, Maciej Fijalkowski wrote:
> On Mon, Oct 23, 2023 at 11:35:46AM +0200, Larysa Zaremba wrote:
> > On Fri, Oct 20, 2023 at 06:32:13PM +0200, Maciej Fijalkowski wrote:
> > > On Thu, Oct 12, 2023 at 07:05:17PM +0200, Larysa Zaremba wrote:
> > > > Usage of XDP hints requires putting additional information after the
> > > > xdp_buff. In basic case, only the descriptor has to be copied on a
> > > > per-packet basis, because xdp_buff permanently resides before per-ring
> > > > metadata (cached time and VLAN protocol ID).
> > > > 
> > > > However, in ZC mode, xdp_buffs come from a pool, so memory after such
> > > > buffer does not contain any reliable information, so everything has to be
> > > > copied, damaging the performance.
> > > > 
> > > > Introduce a static key to enable meta sources assignment only when attached
> > > > XDP program is device-bound.
> > > > 
> > > > This patch eliminates a 6% performance drop in ZC mode, which was a result
> > > > of addition of XDP hints to the driver.
> > > > 
> > > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > > ---
> > > >  drivers/net/ethernet/intel/ice/ice.h      |  1 +
> > > >  drivers/net/ethernet/intel/ice/ice_main.c | 14 ++++++++++++++
> > > >  drivers/net/ethernet/intel/ice/ice_txrx.c |  3 ++-
> > > >  drivers/net/ethernet/intel/ice/ice_xsk.c  |  3 +++
> > > >  4 files changed, 20 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> > > > index 3d0f15f8b2b8..76d22be878a4 100644
> > > > --- a/drivers/net/ethernet/intel/ice/ice.h
> > > > +++ b/drivers/net/ethernet/intel/ice/ice.h
> > > > @@ -210,6 +210,7 @@ enum ice_feature {
> > > >  };
> > > >  
> > > >  DECLARE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > > +DECLARE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > >  
> > > >  struct ice_channel {
> > > >  	struct list_head list;
> > > > diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> > > > index 47e8920e1727..ee0df86d34b7 100644
> > > > --- a/drivers/net/ethernet/intel/ice/ice_main.c
> > > > +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> > > > @@ -48,6 +48,9 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
> > > >  DEFINE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > >  EXPORT_SYMBOL(ice_xdp_locking_key);
> > > >  
> > > > +DEFINE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > > +EXPORT_SYMBOL(ice_xdp_meta_key);
> > > > +
> > > >  /**
> > > >   * ice_hw_to_dev - Get device pointer from the hardware structure
> > > >   * @hw: pointer to the device HW structure
> > > > @@ -2634,6 +2637,11 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
> > > >  	return -ENOMEM;
> > > >  }
> > > >  
> > > > +static bool ice_xdp_prog_has_meta(struct bpf_prog *prog)
> > > > +{
> > > > +	return prog && prog->aux->dev_bound;
> > > > +}
> > > > +
> > > >  /**
> > > >   * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
> > > >   * @vsi: VSI to set the bpf prog on
> > > > @@ -2644,10 +2652,16 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
> > > >  	struct bpf_prog *old_prog;
> > > >  	int i;
> > > >  
> > > > +	if (ice_xdp_prog_has_meta(prog))
> > > > +		static_branch_inc(&ice_xdp_meta_key);
> > > 
> > > i thought boolean key would be enough but inc/dec should serve properly
> > > for example prog hotswap cases.
> > >
> > 
> > My thought process on using counting instead of boolean was: there can be 
> > several PFs that use the same driver, so therefore we need to keep track of how 
> > many od them use hints. 
> 
> Very good point. This implies that if PF0 has hints-enabled prog loaded,
> PF1 with non-hints prog will "suffer" from it.
> 
> Sorry for such a long delays in responses but I was having a hard time
> making up my mind about it. In the end I have come up to some conclusions.
> I know the timing for sending this response is not ideal, but I need to
> get this off my chest and bring discussion back to life:)
> 
> IMHO having static keys to eliminate ZC overhead does not scale. I assume
> every other driver would have to follow that.
> 
> XSK pool allows us to avoid initializing various things per each packet.
> Instead, taking xdp_rxq_info as an example, each xdp_buff from pool has
> xdp_rxq_info assigned at init time. With this in mind, we should have some
> mechanism to set hints-specific things in xdp_buff_xsk::cb, at init time
> as well. Such mechanism should not require us to expose driver's private
> xdp_buff hints containers (such as ice_pkt_ctx) to XSK pool.
> 
> Right now you moved phctime down to ice_pkt_ctx and to me that's the main
> reason we have to copy ice_pkt_ctx to each xdp_buff on ZC. What if we keep
> the cached_phctime at original offset in ring but ice_pkt_ctx would get a
> pointer to that?
> 
> This would allow us to init the pointer in each xdp_buff from XSK pool at
> init time. I have come up with a way to program that via so called XSK
> meta descriptors. Each desc would have data to write onto cb, offset
> within cb and amount of bytes to write/copy.
> 
> I'll share the diff below but note that I didn't measure how much lower
> the performance is degraded. My icelake machine where I used to measure
> performance-sensitive code got broke. For now we can't escape initing
> eop_desc per each xdp_buff, but I moved it to alloc side, as we mangle
> descs there anyway.
> 
> I think mlx5 could benefit from that approach as well with initing the rq
> ptr at init time.
> 
> Diff does mostly these things:
> - move cached_phctime to old place in ice_rx_ring and add ptr to that in
>   ice_pkt_ctx
> - introduce xsk_pool_set_meta()
> - use it from ice side.
> 

Thank you for the code! I will probably send v7 with such changes. Are you
OK with the patch containing the core changes going out with you as the author?

But also, I see a minor problem with that: switching the VLAN protocol does not
trigger buffer allocation, so we have to point to that too. This probably means
moving cached time back and finding 16 extra bits in CL3. A single pointer to
{cached time, vlan_proto} would then be copied to the area after xdp_buff.
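
A rough sketch of that arrangement (illustrative only, exact field placement
still to be worked out):

	/* per-ring block the driver can update without touching pool buffers */
	struct ice_pkt_ctx {
		u64 cached_phctime;
		__be16 vlan_proto;
	};

	/* what sits right after the xdp_buff (xdp_buff_xsk::cb in ZC),
	 * initialized once at ring/pool setup
	 */
	struct ice_xdp_buff {
		struct xdp_buff xdp_buff;
		const union ice_32b_rx_flex_desc *eop_desc;	/* still per packet */
		const struct ice_pkt_ctx *pkt_ctx;	/* single pointer to the ring's block */
	};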

> I consider this as a discussion trigger rather than ready code. Any
> feedback will be appreciated.
> 
> ---------------------------------8<---------------------------------
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
> index 7fa43827a3f0..c192e84bee55 100644
> --- a/drivers/net/ethernet/intel/ice/ice_base.c
> +++ b/drivers/net/ethernet/intel/ice/ice_base.c
> @@ -519,6 +519,23 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
>  	return 0;
>  }
>  
> +static void ice_xsk_pool_set_meta(struct ice_rx_ring *ring)
> +{
> +	struct xsk_meta_desc desc = {};
> +
> +	desc.val = (uintptr_t)&ring->cached_phctime;
> +	desc.off = offsetof(struct ice_pkt_ctx, cached_phctime);
> +	desc.bytes = sizeof_field(struct ice_pkt_ctx, cached_phctime);
> +	xsk_pool_set_meta(ring->xsk_pool, &desc);
> +
> +	memset(&desc, 0, sizeof(struct xsk_meta_desc));
> +
> +	desc.val = ring->pkt_ctx.vlan_proto;
> +	desc.off = offsetof(struct ice_pkt_ctx, vlan_proto);
> +	desc.bytes = sizeof_field(struct ice_pkt_ctx, vlan_proto);
> +	xsk_pool_set_meta(ring->xsk_pool, &desc);
> +}
> +
>  /**
>   * ice_vsi_cfg_rxq - Configure an Rx queue
>   * @ring: the ring being configured
> @@ -553,6 +570,7 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
>  			if (err)
>  				return err;
>  			xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq);
> +			ice_xsk_pool_set_meta(ring);
>  
>  			dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
>  				 ring->q_index);
> @@ -575,6 +593,7 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
>  
>  	xdp_init_buff(&ring->xdp, ice_rx_pg_size(ring) / 2, &ring->xdp_rxq);
>  	ring->xdp.data = NULL;
> +	ring->pkt_ctx.cached_phctime = &ring->cached_phctime;
>  	err = ice_setup_rx_ctx(ring);
>  	if (err) {
>  		dev_err(dev, "ice_setup_rx_ctx failed for RxQ %d, err %d\n",
> diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
> index cf5c91ada94c..d3cb08e66dcb 100644
> --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
> +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
> @@ -2846,7 +2846,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring,
>  		/* clone ring and setup updated count */
>  		rx_rings[i] = *vsi->rx_rings[i];
>  		rx_rings[i].count = new_rx_cnt;
> -		rx_rings[i].pkt_ctx.cached_phctime = pf->ptp.cached_phc_time;
> +		rx_rings[i].cached_phctime = pf->ptp.cached_phc_time;
>  		rx_rings[i].desc = NULL;
>  		rx_rings[i].rx_buf = NULL;
>  		/* this is to allow wr32 to have something to write to
> diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
> index 2fc97eafd1f6..1f45f0c3963d 100644
> --- a/drivers/net/ethernet/intel/ice/ice_lib.c
> +++ b/drivers/net/ethernet/intel/ice/ice_lib.c
> @@ -1456,7 +1456,7 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi)
>  		ring->netdev = vsi->netdev;
>  		ring->dev = dev;
>  		ring->count = vsi->num_rx_desc;
> -		ring->pkt_ctx.cached_phctime = pf->ptp.cached_phc_time;
> +		ring->cached_phctime = pf->ptp.cached_phc_time;
>  		WRITE_ONCE(vsi->rx_rings[i], ring);
>  	}
>  
> diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c
> index f6444890f0ef..e2fa979830cd 100644
> --- a/drivers/net/ethernet/intel/ice/ice_ptp.c
> +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c
> @@ -955,8 +955,7 @@ static int ice_ptp_update_cached_phctime(struct ice_pf *pf)
>  		ice_for_each_rxq(vsi, j) {
>  			if (!vsi->rx_rings[j])
>  				continue;
> -			WRITE_ONCE(vsi->rx_rings[j]->pkt_ctx.cached_phctime,
> -				   systime);
> +			WRITE_ONCE(vsi->rx_rings[j]->cached_phctime, systime);
>  		}
>  	}
>  	clear_bit(ICE_CFG_BUSY, pf->state);
> @@ -2119,7 +2118,7 @@ u64 ice_ptp_get_rx_hwts(const union ice_32b_rx_flex_desc *rx_desc,
>  	if (!(rx_desc->wb.time_stamp_low & ICE_PTP_TS_VALID))
>  		return 0;
>  
> -	cached_time = READ_ONCE(pkt_ctx->cached_phctime);
> +	cached_time = READ_ONCE(*pkt_ctx->cached_phctime);
>  
>  	/* Do not report a timestamp if we don't have a cached PHC time */
>  	if (!cached_time)
> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
> index 41e0b14e6643..94594cc0d3ee 100644
> --- a/drivers/net/ethernet/intel/ice/ice_txrx.h
> +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
> @@ -259,7 +259,7 @@ enum ice_rx_dtype {
>  
>  struct ice_pkt_ctx {
>  	const union ice_32b_rx_flex_desc *eop_desc;
> -	u64 cached_phctime;
> +	u64 *cached_phctime;
>  	__be16 vlan_proto;
>  };
>  
> @@ -356,6 +356,7 @@ struct ice_rx_ring {
>  	struct ice_tx_ring *xdp_ring;
>  	struct xsk_buff_pool *xsk_pool;
>  	dma_addr_t dma;			/* physical address of ring */
> +	u64 cached_phctime;
>  	u16 rx_buf_len;
>  	u8 dcb_tc;			/* Traffic class of ring */
>  	u8 ptp_rx;
> diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> index 49a64bfdd1f6..6fa7a86152d0 100644
> --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> @@ -431,9 +431,18 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid)
>  	return ret;
>  }
>  
> +static struct ice_xdp_buff *xsk_buff_to_ice_ctx(struct xdp_buff *xdp)
> +{
> +	/* xdp_buff pointer used by ZC code path is alloc as xdp_buff_xsk. The
> +	 * ice_xdp_buff shares its layout with xdp_buff_xsk and private
> +	 * ice_xdp_buff fields fall into xdp_buff_xsk->cb
> +	 */
> +       return (struct ice_xdp_buff *)xdp;
> +}
> +
>  /**
>   * ice_fill_rx_descs - pick buffers from XSK buffer pool and use it
> - * @pool: XSK Buffer pool to pull the buffers from
> + * @rx_ring: rx ring
>   * @xdp: SW ring of xdp_buff that will hold the buffers
>   * @rx_desc: Pointer to Rx descriptors that will be filled
>   * @count: The number of buffers to allocate
> @@ -445,18 +454,21 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid)
>   *
>   * Returns the amount of allocated Rx descriptors
>   */
> -static u16 ice_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
> +static u16 ice_fill_rx_descs(struct ice_rx_ring *rx_ring, struct xdp_buff **xdp,
>  			     union ice_32b_rx_flex_desc *rx_desc, u16 count)
>  {
> +	struct ice_xdp_buff *ctx;
>  	dma_addr_t dma;
>  	u16 buffs;
>  	int i;
>  
> -	buffs = xsk_buff_alloc_batch(pool, xdp, count);
> +	buffs = xsk_buff_alloc_batch(rx_ring->xsk_pool, xdp, count);
>  	for (i = 0; i < buffs; i++) {
>  		dma = xsk_buff_xdp_get_dma(*xdp);
>  		rx_desc->read.pkt_addr = cpu_to_le64(dma);
>  		rx_desc->wb.status_error0 = 0;
> +		ctx = xsk_buff_to_ice_ctx(*xdp);
> +		ctx->pkt_ctx.eop_desc = rx_desc;
>  
>  		rx_desc++;
>  		xdp++;
> @@ -488,8 +500,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
>  	xdp = ice_xdp_buf(rx_ring, ntu);
>  
>  	if (ntu + count >= rx_ring->count) {
> -		nb_buffs_extra = ice_fill_rx_descs(rx_ring->xsk_pool, xdp,
> -						   rx_desc,
> +		nb_buffs_extra = ice_fill_rx_descs(rx_ring, xdp, rx_desc,
>  						   rx_ring->count - ntu);
>  		if (nb_buffs_extra != rx_ring->count - ntu) {
>  			ntu += nb_buffs_extra;
> @@ -502,7 +513,7 @@ static bool __ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
>  		ice_release_rx_desc(rx_ring, 0);
>  	}
>  
> -	nb_buffs = ice_fill_rx_descs(rx_ring->xsk_pool, xdp, rx_desc, count);
> +	nb_buffs = ice_fill_rx_descs(rx_ring, xdp, rx_desc, count);
>  
>  	ntu += nb_buffs;
>  	if (ntu == rx_ring->count)
> @@ -746,32 +757,6 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp,
>  	return ICE_XDP_CONSUMED;
>  }
>  
> -/**
> - * ice_prepare_pkt_ctx_zc - Prepare packet context for XDP hints
> - * @xdp: xdp_buff used as input to the XDP program
> - * @eop_desc: End of packet descriptor
> - * @rx_ring: Rx ring with packet context
> - *
> - * In regular XDP, xdp_buff is placed inside the ring structure,
> - * just before the packet context, so the latter can be accessed
> - * with xdp_buff address only at all times, but in ZC mode,
> - * xdp_buffs come from the pool, so we need to reinitialize
> - * context for every packet.
> - *
> - * We can safely convert xdp_buff_xsk to ice_xdp_buff,
> - * because there are XSK_PRIV_MAX bytes reserved in xdp_buff_xsk
> - * right after xdp_buff, for our private use.
> - * XSK_CHECK_PRIV_TYPE() ensures we do not go above the limit.
> - */
> -static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
> -				   union ice_32b_rx_flex_desc *eop_desc,
> -				   struct ice_rx_ring *rx_ring)
> -{
> -	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
> -	((struct ice_xdp_buff *)xdp)->pkt_ctx = rx_ring->pkt_ctx;
> -	ice_xdp_meta_set_desc(xdp, eop_desc);
> -}
> -
>  /**
>   * ice_run_xdp_zc - Executes an XDP program in zero-copy path
>   * @rx_ring: Rx ring
> @@ -784,13 +769,11 @@ static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
>   */
>  static int
>  ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
> -	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
> -	       union ice_32b_rx_flex_desc *rx_desc)
> +	       struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
>  {
>  	int err, result = ICE_XDP_PASS;
>  	u32 act;
>  
> -	ice_prepare_pkt_ctx_zc(xdp, rx_desc, rx_ring);
>  	act = bpf_prog_run_xdp(xdp_prog, xdp);
>  
>  	if (likely(act == XDP_REDIRECT)) {
> @@ -930,8 +913,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
>  		if (ice_is_non_eop(rx_ring, rx_desc))
>  			continue;
>  
> -		xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring,
> -					 rx_desc);
> +		xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring);
>  		if (likely(xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))) {
>  			xdp_xmit |= xdp_res;
>  		} else if (xdp_res == ICE_XDP_EXIT) {
> diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> index 1f6fc8c7a84c..91fa74a14841 100644
> --- a/include/net/xdp_sock_drv.h
> +++ b/include/net/xdp_sock_drv.h
> @@ -14,6 +14,13 @@
>  
>  #ifdef CONFIG_XDP_SOCKETS
>  
> +struct xsk_meta_desc {
> +	u64 val;
> +	u8 off;
> +	u8 bytes;
> +};
> +
> +
>  void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
>  bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
>  u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max);
> @@ -47,6 +54,12 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
>  	xp_set_rxq_info(pool, rxq);
>  }
>  
> +static inline void xsk_pool_set_meta(struct xsk_buff_pool *pool,
> +				     struct xsk_meta_desc *desc)
> +{
> +	xp_set_meta(pool, desc);
> +}
> +
>  static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool)
>  {
>  #ifdef CONFIG_NET_RX_BUSY_POLL
> @@ -250,6 +263,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
>  {
>  }
>  
> +static inline void xsk_pool_set_meta(struct xsk_buff_pool *pool,
> +				     struct xsk_meta_desc *desc)
> +{
> +}
> +
>  static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool)
>  {
>  	return 0;
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index b0bdff26fc88..354b1c702a82 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -12,6 +12,7 @@
>  
>  struct xsk_buff_pool;
>  struct xdp_rxq_info;
> +struct xsk_meta_desc;
>  struct xsk_queue;
>  struct xdp_desc;
>  struct xdp_umem;
> @@ -132,6 +133,7 @@ static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_p
>  
>  /* AF_XDP ZC drivers, via xdp_sock_buff.h */
>  void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq);
> +void xp_set_meta(struct xsk_buff_pool *pool, struct xsk_meta_desc *desc);
>  int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
>  	       unsigned long attrs, struct page **pages, u32 nr_pages);
>  void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs);
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index 49cb9f9a09be..632fdc247862 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -123,6 +123,18 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
>  }
>  EXPORT_SYMBOL(xp_set_rxq_info);
>  
> +void xp_set_meta(struct xsk_buff_pool *pool, struct xsk_meta_desc *desc)
> +{
> +	u32 i;
> +
> +	for (i = 0; i < pool->heads_cnt; i++) {
> +		struct xdp_buff_xsk *xskb = &pool->heads[i];
> +
> +		memcpy(xskb->cb + desc->off, desc->buf, desc->bytes);
> +	}
> +}
> +EXPORT_SYMBOL(xp_set_meta);
> +
>  static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
>  {
>  	struct netdev_bpf bpf;
> 
> --------------------------------->8---------------------------------
> 
> > And yes, this also looks better for hot-swapping, 
> > because conditions become more straightforward (we do not need to compare old 
> > and new programs).
> > 
> > > > +
> > > >  	old_prog = xchg(&vsi->xdp_prog, prog);
> > > >  	ice_for_each_rxq(vsi, i)
> > > >  		WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog);
> > > >  
> > > > +	if (ice_xdp_prog_has_meta(old_prog))
> > > > +		static_branch_dec(&ice_xdp_meta_key);
> > > > +
> > > >  	if (old_prog)
> > > >  		bpf_prog_put(old_prog);
> > > >  }
> > > > diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > > > index 4fd7614f243d..19fc182d1f4c 100644
> > > > --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> > > > +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> > > > @@ -572,7 +572,8 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
> > > >  	if (!xdp_prog)
> > > >  		goto exit;
> > > >  
> > > > -	ice_xdp_meta_set_desc(xdp, eop_desc);
> > > > +	if (static_branch_unlikely(&ice_xdp_meta_key))
> > > 
> > > My only concern is that we might be hurting in a minor way hints path now,
> > > no?
> > 
> > I have thought "unlikely" refers to the default state the code is compiled with 
> > and after static key incrementation this should be patched to "likely". Isn't 
> > this how static keys work?
> 
> I was only referring to that it ends with compiler hint:
> #define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
> 
> see include/linux/jump_label.h
> 
> > 
> > > 
> > > > +		ice_xdp_meta_set_desc(xdp, eop_desc);
> > > >  
> > > >  	act = bpf_prog_run_xdp(xdp_prog, xdp);
> > > >  	switch (act) {
> > > > diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > > index 39775bb6cec1..f92d7d33fde6 100644
> > > > --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > > +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > > @@ -773,6 +773,9 @@ static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
> > > >  				   union ice_32b_rx_flex_desc *eop_desc,
> > > >  				   struct ice_rx_ring *rx_ring)
> > > >  {
> > > > +	if (!static_branch_unlikely(&ice_xdp_meta_key))
> > > > +		return;
> > > 
> > > wouldn't it be better to pull it out and avoid calling
> > > ice_prepare_pkt_ctx_zc() unnecessarily?
> > > 
> > > > +
> > > >  	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
> > > >  	((struct ice_xdp_buff *)xdp)->pkt_ctx = rx_ring->pkt_ctx;
> > > >  	ice_xdp_meta_set_desc(xdp, eop_desc);
> > > > -- 
> > > > 2.41.0
> > > >
Larysa Zaremba Oct. 31, 2023, 5:32 p.m. UTC | #5
On Sat, Oct 28, 2023 at 09:55:52PM +0200, Maciej Fijalkowski wrote:
> On Mon, Oct 23, 2023 at 11:35:46AM +0200, Larysa Zaremba wrote:
> > On Fri, Oct 20, 2023 at 06:32:13PM +0200, Maciej Fijalkowski wrote:
> > > On Thu, Oct 12, 2023 at 07:05:17PM +0200, Larysa Zaremba wrote:
> > > > Usage of XDP hints requires putting additional information after the
> > > > xdp_buff. In the basic case, only the descriptor has to be copied on a
> > > > per-packet basis, because xdp_buff permanently resides before per-ring
> > > > metadata (cached time and VLAN protocol ID).
> > > >
> > > > However, in ZC mode, xdp_buffs come from a pool, so the memory after such a
> > > > buffer does not contain any reliable information, and everything has to be
> > > > copied, which degrades performance.
> > > >
> > > > Introduce a static key to enable meta sources assignment only when the
> > > > attached XDP program is device-bound.
> > > >
> > > > This patch eliminates a 6% performance drop in ZC mode, which was a result
> > > > of the addition of XDP hints to the driver.
> > > > 
> > > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > > ---
> > > >  drivers/net/ethernet/intel/ice/ice.h      |  1 +
> > > >  drivers/net/ethernet/intel/ice/ice_main.c | 14 ++++++++++++++
> > > >  drivers/net/ethernet/intel/ice/ice_txrx.c |  3 ++-
> > > >  drivers/net/ethernet/intel/ice/ice_xsk.c  |  3 +++
> > > >  4 files changed, 20 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> > > > index 3d0f15f8b2b8..76d22be878a4 100644
> > > > --- a/drivers/net/ethernet/intel/ice/ice.h
> > > > +++ b/drivers/net/ethernet/intel/ice/ice.h
> > > > @@ -210,6 +210,7 @@ enum ice_feature {
> > > >  };
> > > >  
> > > >  DECLARE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > > +DECLARE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > >  
> > > >  struct ice_channel {
> > > >  	struct list_head list;
> > > > diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> > > > index 47e8920e1727..ee0df86d34b7 100644
> > > > --- a/drivers/net/ethernet/intel/ice/ice_main.c
> > > > +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> > > > @@ -48,6 +48,9 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
> > > >  DEFINE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > >  EXPORT_SYMBOL(ice_xdp_locking_key);
> > > >  
> > > > +DEFINE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > > +EXPORT_SYMBOL(ice_xdp_meta_key);
> > > > +
> > > >  /**
> > > >   * ice_hw_to_dev - Get device pointer from the hardware structure
> > > >   * @hw: pointer to the device HW structure
> > > > @@ -2634,6 +2637,11 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
> > > >  	return -ENOMEM;
> > > >  }
> > > >  
> > > > +static bool ice_xdp_prog_has_meta(struct bpf_prog *prog)
> > > > +{
> > > > +	return prog && prog->aux->dev_bound;
> > > > +}
> > > > +
> > > >  /**
> > > >   * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
> > > >   * @vsi: VSI to set the bpf prog on
> > > > @@ -2644,10 +2652,16 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
> > > >  	struct bpf_prog *old_prog;
> > > >  	int i;
> > > >  
> > > > +	if (ice_xdp_prog_has_meta(prog))
> > > > +		static_branch_inc(&ice_xdp_meta_key);
> > > 
> > > I thought a boolean key would be enough, but inc/dec should properly serve
> > > e.g. prog hotswap cases.
> > >
> > 
> > My thought process on using counting instead of boolean was: there can be 
> > several PFs that use the same driver, so we need to keep track of how
> > many of them use hints.
> 
> Very good point. This implies that if PF0 has hints-enabled prog loaded,
> PF1 with non-hints prog will "suffer" from it.
> 
> Sorry for such long delays in responses, but I was having a hard time
> making up my mind about it. In the end I have come to some conclusions.
> I know the timing for sending this response is not ideal, but I need to
> get this off my chest and bring discussion back to life:)
> 
> IMHO having static keys to eliminate ZC overhead does not scale. I assume
> every other driver would have to follow that.
> 
> XSK pool allows us to avoid initializing various things per each packet.
> Instead, taking xdp_rxq_info as an example, each xdp_buff from pool has
> xdp_rxq_info assigned at init time. With this in mind, we should have some
> mechanism to set hints-specific things in xdp_buff_xsk::cb, at init time
> as well. Such mechanism should not require us to expose driver's private
> xdp_buff hints containers (such as ice_pkt_ctx) to XSK pool.
> 
> Right now you moved phctime down to ice_pkt_ctx and to me that's the main
> reason we have to copy ice_pkt_ctx to each xdp_buff on ZC. What if we keep
> the cached_phctime at original offset in ring but ice_pkt_ctx would get a
> pointer to that?
> 
> This would allow us to init the pointer in each xdp_buff from XSK pool at
> init time. I have come up with a way to program that via so called XSK
> meta descriptors. Each desc would have data to write onto cb, offset
> within cb and amount of bytes to write/copy.
> 
> I'll share the diff below, but note that I didn't measure how much the
> performance degrades. My icelake machine where I used to measure
> performance-sensitive code broke. For now we can't escape initing
> eop_desc per each xdp_buff, but I moved it to alloc side, as we mangle
> descs there anyway.
> 
> I think mlx5 could benefit from that approach as well with initing the rq
> ptr at init time.
> 
> Diff does mostly these things:
> - move cached_phctime to old place in ice_rx_ring and add ptr to that in
>   ice_pkt_ctx
> - introduce xsk_pool_set_meta()
> - use it from ice side.
> 
> I consider this as a discussion trigger rather than ready code. Any
> feedback will be appreciated.
> 
> ---------------------------------8<---------------------------------
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
> index 7fa43827a3f0..c192e84bee55 100644
> --- a/drivers/net/ethernet/intel/ice/ice_base.c
> +++ b/drivers/net/ethernet/intel/ice/ice_base.c
> @@ -519,6 +519,23 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
>  	return 0;
>  }
>  

[...]

> > > 
> > > My only concern is that we might be hurting the hints path in a minor way
> > > now, no?
> > 
> > I thought "unlikely" refers to the default state the code is compiled with,
> > and after the static key is incremented this should be patched to "likely". Isn't
> > this how static keys work?
> 
> I was only referring to the fact that it ends with a compiler hint:
> #define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
> 
> see include/linux/jump_label.h
> 

You are right, I have misunderstood the concept a little bit.
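
For anyone else reading along, a minimal sketch of the mechanism as I now
understand it (the key name and callers below are made up for illustration;
only DEFINE_STATIC_KEY_FALSE(), static_branch_unlikely() and
static_branch_inc()/static_branch_dec() are the real APIs from
<linux/jump_label.h>):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_meta_key);

static unsigned long example_meta_hits;

static void example_prog_attach(bool wants_meta)
{
	/* runtime-patches every branch site that tests this key */
	if (wants_meta)
		static_branch_inc(&example_meta_key);
}

static void example_prog_detach(bool had_meta)
{
	if (had_meta)
		static_branch_dec(&example_meta_key);
}

static void example_hot_path(void)
{
	/*
	 * Compiled as a straight-line no-op while the key is disabled;
	 * enabling the key patches in a jump to the out-of-line block.
	 * The "unlikely" only biases code layout towards the disabled
	 * case; it is not re-patched to "likely" once the key is enabled.
	 */
	if (static_branch_unlikely(&example_meta_key))
		example_meta_hits++;	/* hints-specific work would go here */
}

So with several PFs, inc/dec also means the key only drops back to disabled
once the last hints-enabled program is detached.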

> > 
> > > 
> > > > +		ice_xdp_meta_set_desc(xdp, eop_desc);
> > > >  
> > > >  	act = bpf_prog_run_xdp(xdp_prog, xdp);
> > > >  	switch (act) {
> > > > diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > > index 39775bb6cec1..f92d7d33fde6 100644
> > > > --- a/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > > +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
> > > > @@ -773,6 +773,9 @@ static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
> > > >  				   union ice_32b_rx_flex_desc *eop_desc,
> > > >  				   struct ice_rx_ring *rx_ring)
> > > >  {
> > > > +	if (!static_branch_unlikely(&ice_xdp_meta_key))
> > > > +		return;
> > > 
> > > wouldn't it be better to pull it out and avoid calling
> > > ice_prepare_pkt_ctx_zc() unnecessarily?
> > > 
> > > > +
> > > >  	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
> > > >  	((struct ice_xdp_buff *)xdp)->pkt_ctx = rx_ring->pkt_ctx;
> > > >  	ice_xdp_meta_set_desc(xdp, eop_desc);
> > > > -- 
> > > > 2.41.0
> > > >
Fijalkowski, Maciej Nov. 2, 2023, 1:23 p.m. UTC | #6
On Tue, Oct 31, 2023 at 03:22:31PM +0100, Larysa Zaremba wrote:
> On Sat, Oct 28, 2023 at 09:55:52PM +0200, Maciej Fijalkowski wrote:
> > On Mon, Oct 23, 2023 at 11:35:46AM +0200, Larysa Zaremba wrote:
> > > On Fri, Oct 20, 2023 at 06:32:13PM +0200, Maciej Fijalkowski wrote:
> > > > On Thu, Oct 12, 2023 at 07:05:17PM +0200, Larysa Zaremba wrote:
> > > > > Usage of XDP hints requires putting additional information after the
> > > > > xdp_buff. In the basic case, only the descriptor has to be copied on a
> > > > > per-packet basis, because xdp_buff permanently resides before per-ring
> > > > > metadata (cached time and VLAN protocol ID).
> > > > >
> > > > > However, in ZC mode, xdp_buffs come from a pool, so the memory after such a
> > > > > buffer does not contain any reliable information, and everything has to be
> > > > > copied, which degrades performance.
> > > > >
> > > > > Introduce a static key to enable meta sources assignment only when the
> > > > > attached XDP program is device-bound.
> > > > >
> > > > > This patch eliminates a 6% performance drop in ZC mode, which was a result
> > > > > of the addition of XDP hints to the driver.
> > > > > 
> > > > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > > > ---
> > > > >  drivers/net/ethernet/intel/ice/ice.h      |  1 +
> > > > >  drivers/net/ethernet/intel/ice/ice_main.c | 14 ++++++++++++++
> > > > >  drivers/net/ethernet/intel/ice/ice_txrx.c |  3 ++-
> > > > >  drivers/net/ethernet/intel/ice/ice_xsk.c  |  3 +++
> > > > >  4 files changed, 20 insertions(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> > > > > index 3d0f15f8b2b8..76d22be878a4 100644
> > > > > --- a/drivers/net/ethernet/intel/ice/ice.h
> > > > > +++ b/drivers/net/ethernet/intel/ice/ice.h
> > > > > @@ -210,6 +210,7 @@ enum ice_feature {
> > > > >  };
> > > > >  
> > > > >  DECLARE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > > > +DECLARE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > > >  
> > > > >  struct ice_channel {
> > > > >  	struct list_head list;
> > > > > diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> > > > > index 47e8920e1727..ee0df86d34b7 100644
> > > > > --- a/drivers/net/ethernet/intel/ice/ice_main.c
> > > > > +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> > > > > @@ -48,6 +48,9 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
> > > > >  DEFINE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > > >  EXPORT_SYMBOL(ice_xdp_locking_key);
> > > > >  
> > > > > +DEFINE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > > > +EXPORT_SYMBOL(ice_xdp_meta_key);
> > > > > +
> > > > >  /**
> > > > >   * ice_hw_to_dev - Get device pointer from the hardware structure
> > > > >   * @hw: pointer to the device HW structure
> > > > > @@ -2634,6 +2637,11 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
> > > > >  	return -ENOMEM;
> > > > >  }
> > > > >  
> > > > > +static bool ice_xdp_prog_has_meta(struct bpf_prog *prog)
> > > > > +{
> > > > > +	return prog && prog->aux->dev_bound;
> > > > > +}
> > > > > +
> > > > >  /**
> > > > >   * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
> > > > >   * @vsi: VSI to set the bpf prog on
> > > > > @@ -2644,10 +2652,16 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
> > > > >  	struct bpf_prog *old_prog;
> > > > >  	int i;
> > > > >  
> > > > > +	if (ice_xdp_prog_has_meta(prog))
> > > > > +		static_branch_inc(&ice_xdp_meta_key);
> > > > 
> > > > I thought a boolean key would be enough, but inc/dec should properly serve
> > > > e.g. prog hotswap cases.
> > > >
> > > 
> > > My thought process on using counting instead of boolean was: there can be 
> > > several PFs that use the same driver, so we need to keep track of how
> > > many of them use hints.
> > 
> > Very good point. This implies that if PF0 has hints-enabled prog loaded,
> > PF1 with non-hints prog will "suffer" from it.
> > 
> > Sorry for such long delays in responses, but I was having a hard time
> > making up my mind about it. In the end I have come to some conclusions.
> > I know the timing for sending this response is not ideal, but I need to
> > get this off my chest and bring discussion back to life:)
> > 
> > IMHO having static keys to eliminate ZC overhead does not scale. I assume
> > every other driver would have to follow that.
> > 
> > XSK pool allows us to avoid initializing various things per each packet.
> > Instead, taking xdp_rxq_info as an example, each xdp_buff from pool has
> > xdp_rxq_info assigned at init time. With this in mind, we should have some
> > mechanism to set hints-specific things in xdp_buff_xsk::cb, at init time
> > as well. Such mechanism should not require us to expose driver's private
> > xdp_buff hints containers (such as ice_pkt_ctx) to XSK pool.
> > 
> > Right now you moved phctime down to ice_pkt_ctx and to me that's the main
> > reason we have to copy ice_pkt_ctx to each xdp_buff on ZC. What if we keep
> > the cached_phctime at original offset in ring but ice_pkt_ctx would get a
> > pointer to that?
> > 
> > This would allow us to init the pointer in each xdp_buff from XSK pool at
> > init time. I have come up with a way to program that via so called XSK
> > meta descriptors. Each desc would have data to write onto cb, offset
> > within cb and amount of bytes to write/copy.
> > 
> > I'll share the diff below, but note that I didn't measure how much the
> > performance degrades. My icelake machine where I used to measure
> > performance-sensitive code broke. For now we can't escape initing
> > eop_desc per each xdp_buff, but I moved it to alloc side, as we mangle
> > descs there anyway.
> > 
> > I think mlx5 could benefit from that approach as well with initing the rq
> > ptr at init time.
> > 
> > Diff does mostly these things:
> > - move cached_phctime to old place in ice_rx_ring and add ptr to that in
> >   ice_pkt_ctx
> > - introduce xsk_pool_set_meta()
> > - use it from ice side.
> > 
> 
> Thank you for the code! I will probably send v7 with such changes. Are you OK
> if the patch with core changes goes out with you as the author?

Yes or I can produce a patch and share, up to you.

> 
> But also, I see a minor problem in that switching the VLAN protocol does not
> trigger buffer allocation, so we have to point to that too; this probably means
> moving cached time back and finding 16 extra bits in CL3. A single pointer to
> {cached time, vlan_proto} would be copied to sit after the xdp_buff.

It's not that it has to trigger buffer allocation; we could stop the
interface if a pool is present and update the VLAN proto on the pool's
xdp_buffs (from a quick glance I don't see that we're stopping the iface
for setting VLAN features), but that sounds like more of a hassle to do...

So yeah maybe let's just have a ptr in ice_pkt_ctx as well.
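
To make that concrete, here is a rough sketch of the kind of layout I mean
(the struct and field names are purely illustrative, not the actual ice
definitions): ice_pkt_ctx would keep pointers back into the ring instead of
copies, so each ZC xdp_buff can be wired to them once at pool init time.

#include <linux/types.h>

/* illustrative only -- not the real ice_pkt_ctx */
struct example_pkt_ctx {
	/* points at the ring's cached PHC timestamp, updated by the driver */
	const u64 *cached_phctime;
	/* points at the ring's current VLAN protocol (ethertype) */
	const __be16 *vlan_proto;
};

The hot path would then only dereference these pointers when a hint kfunc
actually asks for the value, instead of copying them into every xdp_buff.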

[...]
Larysa Zaremba Nov. 2, 2023, 1:48 p.m. UTC | #7
On Thu, Nov 02, 2023 at 02:23:02PM +0100, Maciej Fijalkowski wrote:
> On Tue, Oct 31, 2023 at 03:22:31PM +0100, Larysa Zaremba wrote:
> > On Sat, Oct 28, 2023 at 09:55:52PM +0200, Maciej Fijalkowski wrote:
> > > On Mon, Oct 23, 2023 at 11:35:46AM +0200, Larysa Zaremba wrote:
> > > > On Fri, Oct 20, 2023 at 06:32:13PM +0200, Maciej Fijalkowski wrote:
> > > > > On Thu, Oct 12, 2023 at 07:05:17PM +0200, Larysa Zaremba wrote:
> > > > > > Usage of XDP hints requires putting additional information after the
> > > > > > xdp_buff. In the basic case, only the descriptor has to be copied on a
> > > > > > per-packet basis, because xdp_buff permanently resides before per-ring
> > > > > > metadata (cached time and VLAN protocol ID).
> > > > > >
> > > > > > However, in ZC mode, xdp_buffs come from a pool, so the memory after such a
> > > > > > buffer does not contain any reliable information, and everything has to be
> > > > > > copied, which degrades performance.
> > > > > >
> > > > > > Introduce a static key to enable meta sources assignment only when the
> > > > > > attached XDP program is device-bound.
> > > > > >
> > > > > > This patch eliminates a 6% performance drop in ZC mode, which was a result
> > > > > > of the addition of XDP hints to the driver.
> > > > > > 
> > > > > > Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
> > > > > > ---
> > > > > >  drivers/net/ethernet/intel/ice/ice.h      |  1 +
> > > > > >  drivers/net/ethernet/intel/ice/ice_main.c | 14 ++++++++++++++
> > > > > >  drivers/net/ethernet/intel/ice/ice_txrx.c |  3 ++-
> > > > > >  drivers/net/ethernet/intel/ice/ice_xsk.c  |  3 +++
> > > > > >  4 files changed, 20 insertions(+), 1 deletion(-)
> > > > > > 
> > > > > > diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> > > > > > index 3d0f15f8b2b8..76d22be878a4 100644
> > > > > > --- a/drivers/net/ethernet/intel/ice/ice.h
> > > > > > +++ b/drivers/net/ethernet/intel/ice/ice.h
> > > > > > @@ -210,6 +210,7 @@ enum ice_feature {
> > > > > >  };
> > > > > >  
> > > > > >  DECLARE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > > > > +DECLARE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > > > >  
> > > > > >  struct ice_channel {
> > > > > >  	struct list_head list;
> > > > > > diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> > > > > > index 47e8920e1727..ee0df86d34b7 100644
> > > > > > --- a/drivers/net/ethernet/intel/ice/ice_main.c
> > > > > > +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> > > > > > @@ -48,6 +48,9 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
> > > > > >  DEFINE_STATIC_KEY_FALSE(ice_xdp_locking_key);
> > > > > >  EXPORT_SYMBOL(ice_xdp_locking_key);
> > > > > >  
> > > > > > +DEFINE_STATIC_KEY_FALSE(ice_xdp_meta_key);
> > > > > > +EXPORT_SYMBOL(ice_xdp_meta_key);
> > > > > > +
> > > > > >  /**
> > > > > >   * ice_hw_to_dev - Get device pointer from the hardware structure
> > > > > >   * @hw: pointer to the device HW structure
> > > > > > @@ -2634,6 +2637,11 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
> > > > > >  	return -ENOMEM;
> > > > > >  }
> > > > > >  
> > > > > > +static bool ice_xdp_prog_has_meta(struct bpf_prog *prog)
> > > > > > +{
> > > > > > +	return prog && prog->aux->dev_bound;
> > > > > > +}
> > > > > > +
> > > > > >  /**
> > > > > >   * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
> > > > > >   * @vsi: VSI to set the bpf prog on
> > > > > > @@ -2644,10 +2652,16 @@ static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
> > > > > >  	struct bpf_prog *old_prog;
> > > > > >  	int i;
> > > > > >  
> > > > > > +	if (ice_xdp_prog_has_meta(prog))
> > > > > > +		static_branch_inc(&ice_xdp_meta_key);
> > > > > 
> > > > > I thought a boolean key would be enough, but inc/dec should properly serve
> > > > > e.g. prog hotswap cases.
> > > > >
> > > > 
> > > > My thought process on using counting instead of boolean was: there can be 
> > > > several PFs that use the same driver, so we need to keep track of how
> > > > many of them use hints.
> > > 
> > > Very good point. This implies that if PF0 has hints-enabled prog loaded,
> > > PF1 with non-hints prog will "suffer" from it.
> > > 
> > > Sorry for such long delays in responses, but I was having a hard time
> > > making up my mind about it. In the end I have come to some conclusions.
> > > I know the timing for sending this response is not ideal, but I need to
> > > get this off my chest and bring discussion back to life:)
> > > 
> > > IMHO having static keys to eliminate ZC overhead does not scale. I assume
> > > every other driver would have to follow that.
> > > 
> > > XSK pool allows us to avoid initializing various things per each packet.
> > > Instead, taking xdp_rxq_info as an example, each xdp_buff from pool has
> > > xdp_rxq_info assigned at init time. With this in mind, we should have some
> > > mechanism to set hints-specific things in xdp_buff_xsk::cb, at init time
> > > as well. Such mechanism should not require us to expose driver's private
> > > xdp_buff hints containers (such as ice_pkt_ctx) to XSK pool.
> > > 
> > > Right now you moved phctime down to ice_pkt_ctx and to me that's the main
> > > reason we have to copy ice_pkt_ctx to each xdp_buff on ZC. What if we keep
> > > the cached_phctime at original offset in ring but ice_pkt_ctx would get a
> > > pointer to that?
> > > 
> > > This would allow us to init the pointer in each xdp_buff from XSK pool at
> > > init time. I have come up with a way to program that via so called XSK
> > > meta descriptors. Each desc would have data to write onto cb, offset
> > > within cb and amount of bytes to write/copy.
> > > 
> > > I'll share the diff below, but note that I didn't measure how much the
> > > performance degrades. My icelake machine where I used to measure
> > > performance-sensitive code broke. For now we can't escape initing
> > > eop_desc per each xdp_buff, but I moved it to alloc side, as we mangle
> > > descs there anyway.
> > > 
> > > I think mlx5 could benefit from that approach as well with initing the rq
> > > ptr at init time.
> > > 
> > > Diff does mostly these things:
> > > - move cached_phctime to old place in ice_rx_ring and add ptr to that in
> > >   ice_pkt_ctx
> > > - introduce xsk_pool_set_meta()
> > > - use it from ice side.
> > > 
> > 
> > Thank you for the code! I will probably send v7 with such changes. Are you OK
> > if the patch with core changes goes out with you as the author?
> 
> Yes or I can produce a patch and share, up to you.
>

I have already started; your diff does not compile, so I took some creative
liberty. Will send you patches for verification this week.
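
For reference while I rework this: my reading of the "XSK meta descriptor"
idea, inferred purely from the xp_set_meta() snippet quoted earlier in the
thread (memcpy(xskb->cb + desc->off, desc->buf, desc->bytes)), would be
something like the placeholder below; the struct and field names are mine,
not the final API.

#include <linux/types.h>

/* placeholder sketch, not the final uapi/kapi */
struct example_xsk_meta_desc {
	void *buf;	/* driver data to preprogram into the cb area */
	u8 off;		/* offset within xdp_buff_xsk::cb to write at */
	u8 bytes;	/* number of bytes to copy */
};

Each buffer in the pool would get these bytes copied into its cb once at
setup time, so the hot path only has to set what really changes per packet
(e.g. the EOP descriptor pointer).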
 
> > 
> > But also, I see a minor problem in that switching the VLAN protocol does not
> > trigger buffer allocation, so we have to point to that too; this probably means
> > moving cached time back and finding 16 extra bits in CL3. A single pointer to
> > {cached time, vlan_proto} would be copied to sit after the xdp_buff.
> 
> It's not that it has to trigger buffer allocation; we could stop the
> interface if a pool is present and update the VLAN proto on the pool's
> xdp_buffs (from a quick glance I don't see that we're stopping the iface
> for setting VLAN features), but that sounds like more of a hassle to do...
> 
> So yeah maybe let's just have a ptr in ice_pkt_ctx as well.
> 
> [...]

Patch

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 3d0f15f8b2b8..76d22be878a4 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -210,6 +210,7 @@  enum ice_feature {
 };
 
 DECLARE_STATIC_KEY_FALSE(ice_xdp_locking_key);
+DECLARE_STATIC_KEY_FALSE(ice_xdp_meta_key);
 
 struct ice_channel {
 	struct list_head list;
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 47e8920e1727..ee0df86d34b7 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -48,6 +48,9 @@  MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
 DEFINE_STATIC_KEY_FALSE(ice_xdp_locking_key);
 EXPORT_SYMBOL(ice_xdp_locking_key);
 
+DEFINE_STATIC_KEY_FALSE(ice_xdp_meta_key);
+EXPORT_SYMBOL(ice_xdp_meta_key);
+
 /**
  * ice_hw_to_dev - Get device pointer from the hardware structure
  * @hw: pointer to the device HW structure
@@ -2634,6 +2637,11 @@  static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi)
 	return -ENOMEM;
 }
 
+static bool ice_xdp_prog_has_meta(struct bpf_prog *prog)
+{
+	return prog && prog->aux->dev_bound;
+}
+
 /**
  * ice_vsi_assign_bpf_prog - set or clear bpf prog pointer on VSI
  * @vsi: VSI to set the bpf prog on
@@ -2644,10 +2652,16 @@  static void ice_vsi_assign_bpf_prog(struct ice_vsi *vsi, struct bpf_prog *prog)
 	struct bpf_prog *old_prog;
 	int i;
 
+	if (ice_xdp_prog_has_meta(prog))
+		static_branch_inc(&ice_xdp_meta_key);
+
 	old_prog = xchg(&vsi->xdp_prog, prog);
 	ice_for_each_rxq(vsi, i)
 		WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog);
 
+	if (ice_xdp_prog_has_meta(old_prog))
+		static_branch_dec(&ice_xdp_meta_key);
+
 	if (old_prog)
 		bpf_prog_put(old_prog);
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 4fd7614f243d..19fc182d1f4c 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -572,7 +572,8 @@  ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
 	if (!xdp_prog)
 		goto exit;
 
-	ice_xdp_meta_set_desc(xdp, eop_desc);
+	if (static_branch_unlikely(&ice_xdp_meta_key))
+		ice_xdp_meta_set_desc(xdp, eop_desc);
 
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 39775bb6cec1..f92d7d33fde6 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -773,6 +773,9 @@  static void ice_prepare_pkt_ctx_zc(struct xdp_buff *xdp,
 				   union ice_32b_rx_flex_desc *eop_desc,
 				   struct ice_rx_ring *rx_ring)
 {
+	if (!static_branch_unlikely(&ice_xdp_meta_key))
+		return;
+
 	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
 	((struct ice_xdp_buff *)xdp)->pkt_ctx = rx_ring->pkt_ctx;
 	ice_xdp_meta_set_desc(xdp, eop_desc);