diff mbox series

[net-next,2/4] i40e: use generic unrolled_count() macro

Message ID 20250206182630.3914318-3-aleksander.lobakin@intel.com (mailing list archive)
State Accepted
Delegated to: Netdev Maintainers
Headers show
Series xsk: the lost bits from Chapter III | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 6 maintainers not CCed: anthony.l.nguyen@intel.com ndesaulniers@google.com morbo@google.com intel-wired-lan@lists.osuosl.org llvm@lists.linux.dev justinstitt@google.com
netdev/build_clang success Errors and warnings before: 4 this patch: 4
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 38 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 12 this patch: 12
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2025-02-07--21-00 (tests: 890)

Commit Message

Alexander Lobakin Feb. 6, 2025, 6:26 p.m. UTC
i40e, as well as ice, has a custom loop unrolling macro for unrolling
Tx descriptors filling on XSk xmit.
Replace i40e defs with generic unrolled_count(), which is also more
convenient as it allows passing defines as its argument, not hardcoded
values, while the loop declaration will still be a usual for-loop.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.h | 10 +---------
 drivers/net/ethernet/intel/i40e/i40e_xsk.c |  4 +++-
 2 files changed, 4 insertions(+), 10 deletions(-)

Comments

Maciej Fijalkowski Feb. 10, 2025, 12:30 p.m. UTC | #1
On Thu, Feb 06, 2025 at 07:26:27PM +0100, Alexander Lobakin wrote:
> i40e, as well as ice, has a custom loop unrolling macro for unrolling
> Tx descriptors filling on XSk xmit.
> Replace i40e defs with generic unrolled_count(), which is also more
> convenient as it allows passing defines as its argument, not hardcoded
> values, while the loop declaration will still be a usual for-loop.
> 
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>

Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>

> ---
>  drivers/net/ethernet/intel/i40e/i40e_xsk.h | 10 +---------
>  drivers/net/ethernet/intel/i40e/i40e_xsk.c |  4 +++-
>  2 files changed, 4 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
> index ef156fad52f2..dd16351a7af8 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
> @@ -6,7 +6,7 @@
>  
>  #include <linux/types.h>
>  
> -/* This value should match the pragma in the loop_unrolled_for
> +/* This value should match the pragma in the unrolled_count()
>   * macro. Why 4? It is strictly empirical. It seems to be a good
>   * compromise between the advantage of having simultaneous outstanding
>   * reads to the DMA array that can hide each others latency and the
> @@ -14,14 +14,6 @@
>   */
>  #define PKTS_PER_BATCH 4
>  
> -#ifdef __clang__
> -#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
> -#elif __GNUC__ >= 8
> -#define loop_unrolled_for _Pragma("GCC unroll 4") for
> -#else
> -#define loop_unrolled_for for
> -#endif
> -
>  struct i40e_ring;
>  struct i40e_vsi;
>  struct net_device;
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> index e28f1905a4a0..9f47388eaba5 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> @@ -2,6 +2,7 @@
>  /* Copyright(c) 2018 Intel Corporation. */
>  
>  #include <linux/bpf_trace.h>
> +#include <linux/unroll.h>
>  #include <net/xdp_sock_drv.h>
>  #include "i40e_txrx_common.h"
>  #include "i40e_xsk.h"
> @@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des
>  	dma_addr_t dma;
>  	u32 i;
>  
> -	loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
> +	unrolled_count(PKTS_PER_BATCH)
> +	for (i = 0; i < PKTS_PER_BATCH; i++) {
>  		u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]);
>  
>  		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
> -- 
> 2.48.1
>
David Laight Feb. 10, 2025, 10:28 p.m. UTC | #2
On Thu,  6 Feb 2025 19:26:27 +0100
Alexander Lobakin <aleksander.lobakin@intel.com> wrote:

> i40e, as well as ice, has a custom loop unrolling macro for unrolling
> Tx descriptors filling on XSk xmit.
> Replace i40e defs with generic unrolled_count(), which is also more
> convenient as it allows passing defines as its argument, not hardcoded
> values, while the loop declaration will still be a usual for-loop.
..
>  #define PKTS_PER_BATCH 4
>  
> -#ifdef __clang__
> -#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
> -#elif __GNUC__ >= 8
> -#define loop_unrolled_for _Pragma("GCC unroll 4") for
> -#else
> -#define loop_unrolled_for for
> -#endif
...
> @@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des
>  	dma_addr_t dma;
>  	u32 i;
>  
> -	loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
> +	unrolled_count(PKTS_PER_BATCH)
> +	for (i = 0; i < PKTS_PER_BATCH; i++) {
>  		u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]);
>  
>  		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);

The rest of that code is:


		tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
		tx_desc->buffer_addr = cpu_to_le64(dma);
		tx_desc->cmd_type_offset_bsz = build_ctob(cmd, 0, desc[i].len, 0);

		*total_bytes += desc[i].len;
	}

	xdp_ring->next_to_use = ntu;
}

static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
				 unsigned int *total_bytes)
{
	u32 batched, leftover, i;

	batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
	leftover = nb_pkts & (PKTS_PER_BATCH - 1);
	for (i = 0; i < batched; i += PKTS_PER_BATCH)
		i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
	for (i = batched; i < batched + leftover; i++)
		i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
}

If it isn't a silly question why all the faffing with unrolling?
It isn't as though the loop body is trivial - it contains real function calls.
Unrolling loops is so 1980s - unless you are trying to get the absolute
max performance from a very short loop and need to unroll once (maybe twice)
to get enough spare instruction execution slots to run the loop control
code in parallel with the body.

In this case it looks like the 'batched' loop contains an inlined copy of
the function called for the remainder.
I can't see anything else.
You'd probably gain more by getting rid of the 'int *total bytes' and using
the function return value - that is what it is fot.

	David
diff mbox series

Patch

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
index ef156fad52f2..dd16351a7af8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
@@ -6,7 +6,7 @@ 
 
 #include <linux/types.h>
 
-/* This value should match the pragma in the loop_unrolled_for
+/* This value should match the pragma in the unrolled_count()
  * macro. Why 4? It is strictly empirical. It seems to be a good
  * compromise between the advantage of having simultaneous outstanding
  * reads to the DMA array that can hide each others latency and the
@@ -14,14 +14,6 @@ 
  */
 #define PKTS_PER_BATCH 4
 
-#ifdef __clang__
-#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
-#elif __GNUC__ >= 8
-#define loop_unrolled_for _Pragma("GCC unroll 4") for
-#else
-#define loop_unrolled_for for
-#endif
-
 struct i40e_ring;
 struct i40e_vsi;
 struct net_device;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index e28f1905a4a0..9f47388eaba5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -2,6 +2,7 @@ 
 /* Copyright(c) 2018 Intel Corporation. */
 
 #include <linux/bpf_trace.h>
+#include <linux/unroll.h>
 #include <net/xdp_sock_drv.h>
 #include "i40e_txrx_common.h"
 #include "i40e_xsk.h"
@@ -529,7 +530,8 @@  static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des
 	dma_addr_t dma;
 	u32 i;
 
-	loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
+	unrolled_count(PKTS_PER_BATCH)
+	for (i = 0; i < PKTS_PER_BATCH; i++) {
 		u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]);
 
 		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);