diff mbox series

[bpf-next,v2,5/5] i40e: use batched xsk Tx interfaces to increase performance

Message ID 1605006094-31097-6-git-send-email-magnus.karlsson@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series xsk: i40e: Tx performance improvements | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 11 this patch: 11
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch fail Link
netdev/build_allmodconfig_warn success Errors and warnings before: 7 this patch: 7
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Magnus Karlsson Nov. 10, 2020, 11:01 a.m. UTC
From: Magnus Karlsson <magnus.karlsson@intel.com>

Use the new batched xsk interfaces for the Tx path in the i40e driver
to improve performance. On my machine, this yields a throughput
increase of 4% for the l2fwd sample app in xdpsock. If we instead just
look at the Tx part, this patch set increases throughput by more than
20% for Tx.

Note that I had to explicitly unroll the inner loop to get to
this performance level, by using a pragma. It is honored by both clang
and gcc and should be ignored by versions that do not support
it. Using the -funroll-loops compiler command line switch on the
source file resulted in loop unrolling at a higher level, which
led to a performance decrease instead of an increase.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |  11 +++
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |   1 +
 drivers/net/ethernet/intel/i40e/i40e_xsk.c  | 127 ++++++++++++++++++++--------
 3 files changed, 104 insertions(+), 35 deletions(-)

Comments

kernel test robot Nov. 11, 2020, 1:37 a.m. UTC | #1
Hi Magnus,

I love your patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: powerpc64-randconfig-r025-20201110 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 4d81c8adb6ed9840257f6cb6b93f60856d422a15)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install powerpc64 cross compiling tool for clang build
        # apt-get install binutils-powerpc64-linux-gnu
        # https://github.com/0day-ci/linux/commit/b016bbeac6692a93e61b28efa430d64645032b5e
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
        git checkout b016bbeac6692a93e61b28efa430d64645032b5e
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc64 

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/net/ethernet/intel/i40e/i40e_xsk.c:417:13: warning: unknown pragma ignored [-Wunknown-pragmas]
   #pragma GCC unroll 4
               ^
   1 warning generated.

vim +417 drivers/net/ethernet/intel/i40e/i40e_xsk.c

   408	
   409	static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
   410					unsigned int *total_bytes)
   411	{
   412		u16 ntu = xdp_ring->next_to_use;
   413		struct i40e_tx_desc *tx_desc;
   414		dma_addr_t dma;
   415		u32 i;
   416	
 > 417	#pragma GCC unroll 4
   418		for (i = 0; i < PKTS_PER_BATCH; i++) {
   419			dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
   420			xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
   421	
   422			tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
   423			tx_desc->buffer_addr = cpu_to_le64(dma);
   424			tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
   425								  I40E_TX_DESC_CMD_EOP,
   426								  0, desc[i].len, 0);
   427	
   428			*total_bytes += desc[i].len;
   429		}
   430	
   431		xdp_ring->next_to_use = ntu;
   432	}
   433	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Magnus Karlsson Nov. 11, 2020, 11:57 a.m. UTC | #2
On Wed, Nov 11, 2020 at 2:38 AM kernel test robot <lkp@intel.com> wrote:
>
> Hi Magnus,
>
> I love your patch! Perhaps something to improve:
>
> [auto build test WARNING on bpf-next/master]
>
> url:    https://github.com/0day-ci/linux/commits/Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> config: powerpc64-randconfig-r025-20201110 (attached as .config)
> compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 4d81c8adb6ed9840257f6cb6b93f60856d422a15)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # install powerpc64 cross compiling tool for clang build
>         # apt-get install binutils-powerpc64-linux-gnu
>         # https://github.com/0day-ci/linux/commit/b016bbeac6692a93e61b28efa430d64645032b5e
>         git remote add linux-review https://github.com/0day-ci/linux
>         git fetch --no-tags linux-review Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
>         git checkout b016bbeac6692a93e61b28efa430d64645032b5e
>         # save the attached .config to linux build tree
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc64
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
>
> All warnings (new ones prefixed by >>):
>
> >> drivers/net/ethernet/intel/i40e/i40e_xsk.c:417:13: warning: unknown pragma ignored [-Wunknown-pragmas]
>    #pragma GCC unroll 4
>                ^
>    1 warning generated.

And I was hoping that unknown pragmas would be ignored, but that will
obviously not be the case with -Wunknown-pragmas added. The unrolling
of this inner loop where the code spends most of its time gives me
nearly 1 Mpps extra in performance which is substantial, so I would
like to get this unrolled in some way, but without the warning. Need
some advice please. Here are some options that come to mind:

#1: Suppress unknown pragma warnings in this file only by adding
CFLAGS_i40e_xsk.o += -Wno-unknown-pragmas (or whatever that option
might be) in the Makefile

#2: Force the compiler to loop-unroll the loop with for example a
switch statement with four cases that all fall through. This will make
the code less readable.

#3: Manually loop-unroll the loop. This will make the code even less
readable than #2.

I prefer #1 as I like to keep the code readable, but you might have
other better suggestions on how to tackle this.

Thanks: Magnus

> vim +417 drivers/net/ethernet/intel/i40e/i40e_xsk.c
>
>    408
>    409  static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
>    410                                  unsigned int *total_bytes)
>    411  {
>    412          u16 ntu = xdp_ring->next_to_use;
>    413          struct i40e_tx_desc *tx_desc;
>    414          dma_addr_t dma;
>    415          u32 i;
>    416
>  > 417  #pragma GCC unroll 4
>    418          for (i = 0; i < PKTS_PER_BATCH; i++) {
>    419                  dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
>    420                  xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
>    421
>    422                  tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
>    423                  tx_desc->buffer_addr = cpu_to_le64(dma);
>    424                  tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
>    425                                                            I40E_TX_DESC_CMD_EOP,
>    426                                                            0, desc[i].len, 0);
>    427
>    428                  *total_bytes += desc[i].len;
>    429          }
>    430
>    431          xdp_ring->next_to_use = ntu;
>    432  }
>    433
>
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Nick Desaulniers Nov. 11, 2020, 7:16 p.m. UTC | #3
On Wed, Nov 11, 2020 at 3:57 AM Magnus Karlsson
<magnus.karlsson@gmail.com> wrote:
>
> On Wed, Nov 11, 2020 at 2:38 AM kernel test robot <lkp@intel.com> wrote:
> >
> > Hi Magnus,
> >
> > I love your patch! Perhaps something to improve:
> >
> > [auto build test WARNING on bpf-next/master]
> >
> > url:    https://github.com/0day-ci/linux/commits/Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
> > base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> > config: powerpc64-randconfig-r025-20201110 (attached as .config)
> > compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 4d81c8adb6ed9840257f6cb6b93f60856d422a15)

^ Note: clang

> > reproduce (this is a W=1 build):
> >         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
> >         chmod +x ~/bin/make.cross
> >         # install powerpc64 cross compiling tool for clang build
> >         # apt-get install binutils-powerpc64-linux-gnu
> >         # https://github.com/0day-ci/linux/commit/b016bbeac6692a93e61b28efa430d64645032b5e
> >         git remote add linux-review https://github.com/0day-ci/linux
> >         git fetch --no-tags linux-review Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
> >         git checkout b016bbeac6692a93e61b28efa430d64645032b5e
> >         # save the attached .config to linux build tree
> >         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc64
> >
> > If you fix the issue, kindly add following tag as appropriate
> > Reported-by: kernel test robot <lkp@intel.com>
> >
> > All warnings (new ones prefixed by >>):
> >
> > >> drivers/net/ethernet/intel/i40e/i40e_xsk.c:417:13: warning: unknown pragma ignored [-Wunknown-pragmas]
> >    #pragma GCC unroll 4
> >                ^
> >    1 warning generated.
>
> And I was hoping that unknown pragmas would be ignored, but that will
> obviously not be the case with -Wunknown-pragmas added. The unrolling
> of this inner loop where the code spends most of its time gives me
> nearly 1 Mpps extra in performance which is substantial, so I would
> like to get this unrolled in some way, but without the warning. Need
> some advice please. Here are some options that comes in mind:
>
> #1: Suppress unknown pragma warnings in this file only by adding
> CFLAGS_i40e_xsk.o += -Wno-unknown-pragmas (or whatever that option
> might be) in the Makefile
>
> #2: Force the compiler to loop-unroll the loop with for example a
> switch statement with four cases that all fall through. This will make
> the code less readable.
>
> #3: Manually loop-unroll the loop. This will make the code even less
> readable than #2.

#4 support both compilers.  Note Clang's syntax is slightly different
here; it doesn't accept GCC specific pragmas, and uses a slightly
different form:
https://clang.llvm.org/docs/LanguageExtensions.html#loop-unrolling .
If you wrap that in a macro based on `#ifdef __clang__`, that should
do the trick.

>
> I prefer #1 as I like to keep the code readable, but you might have
> other better suggestions on how to tackle this.
>
> Thanks: Magnus
>
> > vim +417 drivers/net/ethernet/intel/i40e/i40e_xsk.c
> >
> >    408
> >    409  static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
> >    410                                  unsigned int *total_bytes)
> >    411  {
> >    412          u16 ntu = xdp_ring->next_to_use;
> >    413          struct i40e_tx_desc *tx_desc;
> >    414          dma_addr_t dma;
> >    415          u32 i;
> >    416
> >  > 417  #pragma GCC unroll 4
> >    418          for (i = 0; i < PKTS_PER_BATCH; i++) {
> >    419                  dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
> >    420                  xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
> >    421
> >    422                  tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
> >    423                  tx_desc->buffer_addr = cpu_to_le64(dma);
> >    424                  tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
> >    425                                                            I40E_TX_DESC_CMD_EOP,
> >    426                                                            0, desc[i].len, 0);
> >    427
> >    428                  *total_bytes += desc[i].len;
> >    429          }
> >    430
> >    431          xdp_ring->next_to_use = ntu;
> >    432  }
> >    433
> >
> > ---
> > 0-DAY CI Kernel Test Service, Intel Corporation
> > https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
>
> --
> You received this message because you are subscribed to the Google Groups "Clang Built Linux" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to clang-built-linux+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/clang-built-linux/CAJ8uoz2aDjLPtcTgZ_pO-%3DS9TgXm3c57rN8TTPXdqT7HOOKrhA%40mail.gmail.com.
Magnus Karlsson Nov. 12, 2020, 7:45 a.m. UTC | #4
On Wed, Nov 11, 2020 at 8:16 PM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> On Wed, Nov 11, 2020 at 3:57 AM Magnus Karlsson
> <magnus.karlsson@gmail.com> wrote:
> >
> > On Wed, Nov 11, 2020 at 2:38 AM kernel test robot <lkp@intel.com> wrote:
> > >
> > > Hi Magnus,
> > >
> > > I love your patch! Perhaps something to improve:
> > >
> > > [auto build test WARNING on bpf-next/master]
> > >
> > > url:    https://github.com/0day-ci/linux/commits/Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
> > > base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> > > config: powerpc64-randconfig-r025-20201110 (attached as .config)
> > > compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 4d81c8adb6ed9840257f6cb6b93f60856d422a15)
>
> ^ Note: clang
>
> > > reproduce (this is a W=1 build):
> > >         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
> > >         chmod +x ~/bin/make.cross
> > >         # install powerpc64 cross compiling tool for clang build
> > >         # apt-get install binutils-powerpc64-linux-gnu
> > >         # https://github.com/0day-ci/linux/commit/b016bbeac6692a93e61b28efa430d64645032b5e
> > >         git remote add linux-review https://github.com/0day-ci/linux
> > >         git fetch --no-tags linux-review Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
> > >         git checkout b016bbeac6692a93e61b28efa430d64645032b5e
> > >         # save the attached .config to linux build tree
> > >         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc64
> > >
> > > If you fix the issue, kindly add following tag as appropriate
> > > Reported-by: kernel test robot <lkp@intel.com>
> > >
> > > All warnings (new ones prefixed by >>):
> > >
> > > >> drivers/net/ethernet/intel/i40e/i40e_xsk.c:417:13: warning: unknown pragma ignored [-Wunknown-pragmas]
> > >    #pragma GCC unroll 4
> > >                ^
> > >    1 warning generated.
> >
> > And I was hoping that unknown pragmas would be ignored, but that will
> > obviously not be the case with -Wunknown-pragmas added. The unrolling
> > of this inner loop where the code spends most of its time gives me
> > nearly 1 Mpps extra in performance which is substantial, so I would
> > like to get this unrolled in some way, but without the warning. Need
> > some advice please. Here are some options that comes in mind:
> >
> > #1: Suppress unknown pragma warnings in this file only by adding
> > CFLAGS_i40e_xsk.o += -Wno-unknown-pragmas (or whatever that option
> > might be) in the Makefile
> >
> > #2: Force the compiler to loop-unroll the loop with for example a
> > switch statement with four cases that all fall through. This will make
> > the code less readable.
> >
> > #3: Manually loop-unroll the loop. This will make the code even less
> > readable than #2.
>
> #4 support both compilers.  Note Clang's syntax is slightly different
> here; it doesn't accept GCC specific pragmas, and uses a slightly
> different form:
> https://clang.llvm.org/docs/LanguageExtensions.html#loop-unrolling .
> If you wrap that in a macro based on `#ifdef __clang__`, that should
> do the trick.

Yes, that did the trick. Tried it out with the compiler explorer at
https://godbolt.org/ and it compiles nicely even for clang-powerpc64.
Will spin a v3.

Thank you: Magnus

> >
> > I prefer #1 as I like to keep the code readable, but you might have
> > other better suggestions on how to tackle this.
> >
> > Thanks: Magnus
> >
> > > vim +417 drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > >
> > >    408
> > >    409  static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
> > >    410                                  unsigned int *total_bytes)
> > >    411  {
> > >    412          u16 ntu = xdp_ring->next_to_use;
> > >    413          struct i40e_tx_desc *tx_desc;
> > >    414          dma_addr_t dma;
> > >    415          u32 i;
> > >    416
> > >  > 417  #pragma GCC unroll 4
> > >    418          for (i = 0; i < PKTS_PER_BATCH; i++) {
> > >    419                  dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
> > >    420                  xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
> > >    421
> > >    422                  tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
> > >    423                  tx_desc->buffer_addr = cpu_to_le64(dma);
> > >    424                  tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
> > >    425                                                            I40E_TX_DESC_CMD_EOP,
> > >    426                                                            0, desc[i].len, 0);
> > >    427
> > >    428                  *total_bytes += desc[i].len;
> > >    429          }
> > >    430
> > >    431          xdp_ring->next_to_use = ntu;
> > >    432  }
> > >    433
> > >
> > > ---
> > > 0-DAY CI Kernel Test Service, Intel Corporation
> > > https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
> >
> > --
> > You received this message because you are subscribed to the Google Groups "Clang Built Linux" group.
> > To unsubscribe from this group and stop receiving emails from it, send an email to clang-built-linux+unsubscribe@googlegroups.com.
> > To view this discussion on the web visit https://groups.google.com/d/msgid/clang-built-linux/CAJ8uoz2aDjLPtcTgZ_pO-%3DS9TgXm3c57rN8TTPXdqT7HOOKrhA%40mail.gmail.com.
>
>
>
> --
> Thanks,
> ~Nick Desaulniers
Nick Desaulniers Nov. 12, 2020, 7:39 p.m. UTC | #5
On Wed, Nov 11, 2020 at 11:45 PM Magnus Karlsson
<magnus.karlsson@gmail.com> wrote:
>
> On Wed, Nov 11, 2020 at 8:16 PM Nick Desaulniers
> <ndesaulniers@google.com> wrote:
> >
> > On Wed, Nov 11, 2020 at 3:57 AM Magnus Karlsson
> > <magnus.karlsson@gmail.com> wrote:
> > >
> > > On Wed, Nov 11, 2020 at 2:38 AM kernel test robot <lkp@intel.com> wrote:
> > > >
> > > > Hi Magnus,
> > > >
> > > > I love your patch! Perhaps something to improve:
> > > >
> > > > [auto build test WARNING on bpf-next/master]
> > > >
> > > > url:    https://github.com/0day-ci/linux/commits/Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
> > > > base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> > > > config: powerpc64-randconfig-r025-20201110 (attached as .config)
> > > > compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 4d81c8adb6ed9840257f6cb6b93f60856d422a15)
> >
> > ^ Note: clang
> >
> > > > reproduce (this is a W=1 build):
> > > >         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
> > > >         chmod +x ~/bin/make.cross
> > > >         # install powerpc64 cross compiling tool for clang build
> > > >         # apt-get install binutils-powerpc64-linux-gnu
> > > >         # https://github.com/0day-ci/linux/commit/b016bbeac6692a93e61b28efa430d64645032b5e
> > > >         git remote add linux-review https://github.com/0day-ci/linux
> > > >         git fetch --no-tags linux-review Magnus-Karlsson/xsk-i40e-Tx-performance-improvements/20201110-190310
> > > >         git checkout b016bbeac6692a93e61b28efa430d64645032b5e
> > > >         # save the attached .config to linux build tree
> > > >         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc64
> > > >
> > > > If you fix the issue, kindly add following tag as appropriate
> > > > Reported-by: kernel test robot <lkp@intel.com>
> > > >
> > > > All warnings (new ones prefixed by >>):
> > > >
> > > > >> drivers/net/ethernet/intel/i40e/i40e_xsk.c:417:13: warning: unknown pragma ignored [-Wunknown-pragmas]
> > > >    #pragma GCC unroll 4
> > > >                ^
> > > >    1 warning generated.
> > >
> > > And I was hoping that unknown pragmas would be ignored, but that will
> > > obviously not be the case with -Wunknown-pragmas added. The unrolling
> > > of this inner loop where the code spends most of its time gives me
> > > nearly 1 Mpps extra in performance which is substantial, so I would
> > > like to get this unrolled in some way, but without the warning. Need
> > > some advice please. Here are some options that comes in mind:
> > >
> > > #1: Suppress unknown pragma warnings in this file only by adding
> > > CFLAGS_i40e_xsk.o += -Wno-unknown-pragmas (or whatever that option
> > > might be) in the Makefile
> > >
> > > #2: Force the compiler to loop-unroll the loop with for example a
> > > switch statement with four cases that all fall through. This will make
> > > the code less readable.
> > >
> > > #3: Manually loop-unroll the loop. This will make the code even less
> > > readable than #2.
> >
> > #4 support both compilers.  Note Clang's syntax is slightly different
> > here; it doesn't accept GCC specific pragmas, and uses a slightly
> > different form:
> > https://clang.llvm.org/docs/LanguageExtensions.html#loop-unrolling .
> > If you wrap that in a macro based on `#ifdef __clang__`, that should
> > do the trick.
>
> Yes, that did the trick. Tried it out with the compiler explorer at
> https://godbolt.org/ and it compiles nicely even for clang-powerpc64.
> Will spin a v3.
>
> Thank you: Magnus

Great job Magnus, I appreciate it!

>
> > >
> > > I prefer #1 as I like to keep the code readable, but you might have
> > > other better suggestions on how to tackle this.
> > >
> > > Thanks: Magnus
> > >
> > > > vim +417 drivers/net/ethernet/intel/i40e/i40e_xsk.c
> > > >
> > > >    408
> > > >    409  static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
> > > >    410                                  unsigned int *total_bytes)
> > > >    411  {
> > > >    412          u16 ntu = xdp_ring->next_to_use;
> > > >    413          struct i40e_tx_desc *tx_desc;
> > > >    414          dma_addr_t dma;
> > > >    415          u32 i;
> > > >    416
> > > >  > 417  #pragma GCC unroll 4
> > > >    418          for (i = 0; i < PKTS_PER_BATCH; i++) {
> > > >    419                  dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
> > > >    420                  xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
> > > >    421
> > > >    422                  tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
> > > >    423                  tx_desc->buffer_addr = cpu_to_le64(dma);
> > > >    424                  tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
> > > >    425                                                            I40E_TX_DESC_CMD_EOP,
> > > >    426                                                            0, desc[i].len, 0);
> > > >    427
> > > >    428                  *total_bytes += desc[i].len;
> > > >    429          }
> > > >    430
> > > >    431          xdp_ring->next_to_use = ntu;
> > > >    432  }
> > > >    433
> > > >
> > > > ---
> > > > 0-DAY CI Kernel Test Service, Intel Corporation
> > > > https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
> > >
> > > --
> > > You received this message because you are subscribed to the Google Groups "Clang Built Linux" group.
> > > To unsubscribe from this group and stop receiving emails from it, send an email to clang-built-linux+unsubscribe@googlegroups.com.
> > > To view this discussion on the web visit https://groups.google.com/d/msgid/clang-built-linux/CAJ8uoz2aDjLPtcTgZ_pO-%3DS9TgXm3c57rN8TTPXdqT7HOOKrhA%40mail.gmail.com.
> >
> >
> >
> > --
> > Thanks,
> > ~Nick Desaulniers
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index d43ce13..c21548c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -676,6 +676,8 @@  void i40e_free_tx_resources(struct i40e_ring *tx_ring)
 	i40e_clean_tx_ring(tx_ring);
 	kfree(tx_ring->tx_bi);
 	tx_ring->tx_bi = NULL;
+	kfree(tx_ring->xsk_descs);
+	tx_ring->xsk_descs = NULL;
 
 	if (tx_ring->desc) {
 		dma_free_coherent(tx_ring->dev, tx_ring->size,
@@ -1277,6 +1279,13 @@  int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	if (!tx_ring->tx_bi)
 		goto err;
 
+	if (ring_is_xdp(tx_ring)) {
+		tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs),
+					     GFP_KERNEL);
+		if (!tx_ring->xsk_descs)
+			goto err;
+	}
+
 	u64_stats_init(&tx_ring->syncp);
 
 	/* round up to nearest 4K */
@@ -1300,6 +1309,8 @@  int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	return 0;
 
 err:
+	kfree(tx_ring->xsk_descs);
+	tx_ring->xsk_descs = NULL;
 	kfree(tx_ring->tx_bi);
 	tx_ring->tx_bi = NULL;
 	return -ENOMEM;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 2feed92..5f531b1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -389,6 +389,7 @@  struct i40e_ring {
 	struct i40e_channel *ch;
 	struct xdp_rxq_info xdp_rxq;
 	struct xsk_buff_pool *xsk_pool;
+	struct xdp_desc *xsk_descs;      /* For storing descriptors in the AF_XDP ZC path */
 } ____cacheline_internodealigned_in_smp;
 
 static inline bool ring_uses_build_skb(struct i40e_ring *ring)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 61aa1fc..a271a02 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -381,6 +381,78 @@  int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 	return failure ? budget : (int)total_rx_packets;
 }
 
+static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+			  unsigned int *total_bytes)
+{
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+
+	dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
+	xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);
+
+	tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
+	tx_desc->buffer_addr = cpu_to_le64(dma);
+	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
+						  0, desc->len, 0);
+
+	*total_bytes += desc->len;
+}
+
+/* This value should match the pragma below. Why 4? It is strictly
+ * empirical. It seems to be a good compromise between the advantage
+ * of having simultaneous outstanding reads to the DMA array that can
+ * hide each other's latency and the disadvantage of having a larger
+ * code path.
+ */
+#define PKTS_PER_BATCH 4
+
+static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+				unsigned int *total_bytes)
+{
+	u16 ntu = xdp_ring->next_to_use;
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+	u32 i;
+
+#pragma GCC unroll 4
+	for (i = 0; i < PKTS_PER_BATCH; i++) {
+		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
+		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
+
+		tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
+		tx_desc->buffer_addr = cpu_to_le64(dma);
+		tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
+							  I40E_TX_DESC_CMD_EOP,
+							  0, desc[i].len, 0);
+
+		*total_bytes += desc[i].len;
+	}
+
+	xdp_ring->next_to_use = ntu;
+}
+
+static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
+				 unsigned int *total_bytes)
+{
+	u32 batched, leftover, i;
+
+	batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
+	leftover = nb_pkts & (PKTS_PER_BATCH - 1);
+	for (i = 0; i < batched; i += PKTS_PER_BATCH)
+		i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
+	for (i = batched; i < batched + leftover; i++)
+		i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
+}
+
+static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
+{
+	u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1;
+	struct i40e_tx_desc *tx_desc;
+
+	tx_desc = I40E_TX_DESC(xdp_ring, ntu);
+	tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
+}
+
 /**
  * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
  * @xdp_ring: XDP Tx ring
@@ -390,45 +462,30 @@  int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
  **/
 static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
 {
-	unsigned int sent_frames = 0, total_bytes = 0;
-	struct i40e_tx_desc *tx_desc = NULL;
-	struct xdp_desc desc;
-	dma_addr_t dma;
-
-	while (budget-- > 0) {
-		if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
-			break;
-
-		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
-		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
-						 desc.len);
-
-		tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
-		tx_desc->buffer_addr = cpu_to_le64(dma);
-		tx_desc->cmd_type_offset_bsz =
-			build_ctob(I40E_TX_DESC_CMD_ICRC
-				   | I40E_TX_DESC_CMD_EOP,
-				   0, desc.len, 0);
-
-		sent_frames++;
-		total_bytes += desc.len;
-
-		xdp_ring->next_to_use++;
-		if (xdp_ring->next_to_use == xdp_ring->count)
-			xdp_ring->next_to_use = 0;
+	struct xdp_desc *descs = xdp_ring->xsk_descs;
+	u32 nb_pkts, nb_processed = 0;
+	unsigned int total_bytes = 0;
+
+	nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
+	if (!nb_pkts)
+		return false;
+
+	if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
+		nb_processed = xdp_ring->count - xdp_ring->next_to_use;
+		i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
+		xdp_ring->next_to_use = 0;
 	}
 
-	if (tx_desc) {
-		/* Request an interrupt for the last frame and bump tail ptr. */
-		tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
-						 I40E_TXD_QW1_CMD_SHIFT);
-		i40e_xdp_ring_update_tail(xdp_ring);
+	i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
+			     &total_bytes);
 
-		xsk_tx_release(xdp_ring->xsk_pool);
-		i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
-	}
+	/* Request an interrupt for the last frame and bump tail ptr. */
+	i40e_set_rs_bit(xdp_ring);
+	i40e_xdp_ring_update_tail(xdp_ring);
+
+	i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes);
 
-	return !!budget;
+	return true;
 }
 
 /**