[v5,bpf,02/11] xsk: make xsk_buff_pool responsible for clearing xdp_buff::flags

Message ID 20240122221610.556746-3-maciej.fijalkowski@intel.com (mailing list archive)
State Superseded
Delegated to: BPF
Series net: bpf_xdp_adjust_tail() and Intel mbuf fixes

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1079 this patch: 1079
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 0 of 0 maintainers
netdev/build_clang success Errors and warnings before: 1096 this patch: 1096
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 1097 this patch: 1097
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 35 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-PR success PR summary

Commit Message

Fijalkowski, Maciej Jan. 22, 2024, 10:16 p.m. UTC
XDP multi-buffer support introduced the XDP_FLAGS_HAS_FRAGS flag, which
drivers use to notify the data path whether an xdp_buff contains
fragments. The data path looks this flag up on the first buffer, the
one that occupies the linear part of the xdp_buff, so drivers only
modify it there. This is sufficient for SKB and XDP_DRV modes, where
the xdp_buff is usually allocated on the stack or resides within the
struct representing the driver's queue, and fragments are carried via
skb_frag_t structs. IOW, we are dealing with only one xdp_buff.
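
For reference, the helpers that read and set this flag live in
include/net/xdp.h; a paraphrased sketch of them is below (trimmed for
brevity, so treat it as illustrative rather than the exact source):

	static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp)
	{
		return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS);
	}

	static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp)
	{
		xdp->flags |= XDP_FLAGS_HAS_FRAGS;
	}

	static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
	{
		xdp->flags &= ~XDP_FLAGS_HAS_FRAGS;
	}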

ZC mode, though, relies on a list of xdp_buff structs carried via
xsk_buff_pool::xskb_list, so the ZC data path has to make sure that
fragments do *not* have XDP_FLAGS_HAS_FRAGS set. Otherwise,
xsk_buff_free() could misbehave if executed against an xdp_buff that
carries a frag with the XDP_FLAGS_HAS_FRAGS flag set. Such a scenario
can take place when the supplied XDP program uses
bpf_xdp_adjust_tail() with a negative offset big enough to release the
tail fragment from the multi-buffer frame.
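
As a minimal illustration, a frags-aware XDP program along the lines
of the sketch below would exercise that path (the program name is made
up for this example, it is not part of the series):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	/* Shrink every frame by 512 bytes. On a multi-buffer frame a
	 * shrink this large can release the tail fragment, which in ZC
	 * mode ends up being freed via xsk_buff_free().
	 */
	SEC("xdp.frags")
	int xdp_shrink_tail(struct xdp_md *ctx)
	{
		if (bpf_xdp_adjust_tail(ctx, -512))
			return XDP_DROP;

		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";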

Calling xsk_buff_free() on a tail fragment that has XDP_FLAGS_HAS_FRAGS
set would release all the nodes from xskb_list that were produced by
the driver before XDP program execution, which is not what is intended
- only the tail fragment should be deleted from xskb_list and then put
onto xsk_buff_pool::free_list. Such a multi-buffer frame will never
make it up to user space, so from the AF_XDP application's POV there is
no traffic running; however, since free_list keeps getting new nodes,
the driver is still able to feed the HW Rx queue with recycled buffers.
The bottom line is that instead of traffic being redirected to user
space, it is continuously dropped.
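
The mechanism is easiest to see in xsk_buff_free() itself; here is a
simplified sketch of its logic (paraphrased from
include/net/xdp_sock_drv.h, with unrelated details omitted):

	static inline void xsk_buff_free(struct xdp_buff *xdp)
	{
		struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
		struct xdp_buff_xsk *pos, *tmp;

		/* If XDP_FLAGS_HAS_FRAGS leaked onto a fragment, this
		 * branch is taken for the fragment as well and the whole
		 * xskb_list is drained, instead of just the one node
		 * that should have been released.
		 */
		if (likely(!xdp_buff_has_frags(xdp)))
			goto out;

		list_for_each_entry_safe(pos, tmp, &xskb->pool->xskb_list,
					 xskb_list_node) {
			list_del(&pos->xskb_list_node);
			xp_free(pos);
		}
	out:
		xp_free(xskb);
	}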

To fix this, clear the mentioned flag on the xsk_buff_pool side at
allocation time, which is what should have been done right from the
start of XSK multi-buffer support.

Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c | 1 -
 drivers/net/ethernet/intel/ice/ice_xsk.c   | 1 -
 net/xdp/xsk_buff_pool.c                    | 3 +++
 3 files changed, 3 insertions(+), 2 deletions(-)

Comments

Magnus Karlsson Jan. 24, 2024, 8:20 a.m. UTC | #1
On Mon, 22 Jan 2024 at 23:16, Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
> [...]
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index 28711cc44ced..dc5659da6728 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -555,6 +555,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
>
>         xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
>         xskb->xdp.data_meta = xskb->xdp.data;
> +       xskb->xdp.flags = 0;
>
>         if (pool->dma_need_sync) {
>                 dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
> @@ -601,6 +602,7 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd
>                 }
>
>                 *xdp = &xskb->xdp;
> +               xskb->xdp.flags = 0;

Thanks for catching this. I am thinking we should have an if-statement
here and only do this when multi-buffer is enabled. The reason that we
have two different paths for aligned mode and unaligned mode here is
that we do not have to touch the xdp_buff at all at allocation time in
aligned mode, which provides a nice speed-up. So let us only do this
when necessary. What do you think? Same goes for the line in
xp_alloc_reused().
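
In code, that suggestion would amount to something like the line below,
where pool->mb is a hypothetical stand-in for however the pool would
know that multi-buffer is enabled; no such field is named in this
thread:

	if (pool->mb) /* hypothetical multi-buffer indicator */
		xskb->xdp.flags = 0;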

>                 xdp++;
>         }
>
> @@ -621,6 +623,7 @@ static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3
>                 list_del_init(&xskb->free_list_node);
>
>                 *xdp = &xskb->xdp;
> +               xskb->xdp.flags = 0;
>                 xdp++;
>         }
>         pool->free_list_cnt -= nb_entries;
Fijalkowski, Maciej Jan. 24, 2024, 11:42 a.m. UTC | #2
On Wed, Jan 24, 2024 at 09:20:26AM +0100, Magnus Karlsson wrote:
> On Mon, 22 Jan 2024 at 23:16, Maciej Fijalkowski
> <maciej.fijalkowski@intel.com> wrote:
> > [...]
> > @@ -601,6 +602,7 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd
> >                 }
> >
> >                 *xdp = &xskb->xdp;
> > +               xskb->xdp.flags = 0;
> 
> Thanks for catching this. I am thinking we should have an if-statement
> here and only do this when multi-buffer is enabled. The reason that we
> have two different paths for aligned mode and unaligned mode here is
> that we do not have to touch the xdp_buff at all at allocation time in
> aligned mode, which provides a nice speed-up. So let us only do this
> when necessary. What do you think? Same goes for the line in
> xp_alloc_reused().
> 

Good point. How about keeping flags = 0 in xp_alloc() and adding it to
xsk_buff_set_size()? We do touch the xdp_buff there, and these two paths
cover the batched and non-batched APIs. I do agree that doing it in
xp_alloc_new_from_fq() and in xp_alloc_reused() is not really handy.
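
A sketch of what that could look like, assuming the current shape of
xsk_buff_set_size() in include/net/xdp_sock_drv.h (paraphrased; the
actual next revision may differ):

	static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
	{
		xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
		xdp->data_meta = xdp->data;
		xdp->data_end = xdp->data + size;
		xdp->flags = 0; /* clear here so the batched alloc paths need not touch it */
	}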

Magnus Karlsson Jan. 24, 2024, 11:49 a.m. UTC | #3
On Wed, 24 Jan 2024 at 12:42, Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Wed, Jan 24, 2024 at 09:20:26AM +0100, Magnus Karlsson wrote:
> > On Mon, 22 Jan 2024 at 23:16, Maciej Fijalkowski
> > <maciej.fijalkowski@intel.com> wrote:
> > > [...]
> > > @@ -601,6 +602,7 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd
> > >                 }
> > >
> > >                 *xdp = &xskb->xdp;
> > > +               xskb->xdp.flags = 0;
> >
> > Thanks for catching this. I am thinking we should have an if-statement
> > here and only do this when multi-buffer is enabled. The reason that we
> > have two different paths for aligned mode and unaligned mode here is
> > that we do not have to touch the xdp_buff at all at allocation time in
> > aligned mode, which provides a nice speed-up. So let us only do this
> > when necessary. What do you think? Same goes for the line in
> > xp_alloc_reused().
> >
>
> Good point. How about keeping flags = 0 in xp_alloc() and adding it to
> xsk_buff_set_size()? We do touch the xdp_buff there, and these two paths
> cover the batched and non-batched APIs. I do agree that doing it in
> xp_alloc_new_from_fq() and in xp_alloc_reused() is not really handy.

That is an even better idea. Go for it.

Patch

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index e99fa854d17f..fede0bb3e047 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -499,7 +499,6 @@  int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 		xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog);
 		i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets,
 					  &rx_bytes, xdp_res, &failure);
-		first->flags = 0;
 		next_to_clean = next_to_process;
 		if (failure)
 			break;
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 5d1ae8e4058a..d9073a618ad6 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -895,7 +895,6 @@  int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
 
 		if (!first) {
 			first = xdp;
-			xdp_buff_clear_frags_flag(first);
 		} else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) {
 			break;
 		}
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 28711cc44ced..dc5659da6728 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -555,6 +555,7 @@  struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
 
 	xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
 	xskb->xdp.data_meta = xskb->xdp.data;
+	xskb->xdp.flags = 0;
 
 	if (pool->dma_need_sync) {
 		dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
@@ -601,6 +602,7 @@  static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd
 		}
 
 		*xdp = &xskb->xdp;
+		xskb->xdp.flags = 0;
 		xdp++;
 	}
 
@@ -621,6 +623,7 @@  static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3
 		list_del_init(&xskb->free_list_node);
 
 		*xdp = &xskb->xdp;
+		xskb->xdp.flags = 0;
 		xdp++;
 	}
 	pool->free_list_cnt -= nb_entries;