Message ID | 20250206182630.3914318-3-aleksander.lobakin@intel.com (mailing list archive) |
---|---|
State | Accepted |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | xsk: the lost bits from Chapter III | expand |
On Thu, Feb 06, 2025 at 07:26:27PM +0100, Alexander Lobakin wrote: > i40e, as well as ice, has a custom loop unrolling macro for unrolling > Tx descriptors filling on XSk xmit. > Replace i40e defs with generic unrolled_count(), which is also more > convenient as it allows passing defines as its argument, not hardcoded > values, while the loop declaration will still be a usual for-loop. > > Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com> Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> > --- > drivers/net/ethernet/intel/i40e/i40e_xsk.h | 10 +--------- > drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 +++- > 2 files changed, 4 insertions(+), 10 deletions(-) > > diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h > index ef156fad52f2..dd16351a7af8 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h > +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h > @@ -6,7 +6,7 @@ > > #include <linux/types.h> > > -/* This value should match the pragma in the loop_unrolled_for > +/* This value should match the pragma in the unrolled_count() > * macro. Why 4? It is strictly empirical. It seems to be a good > * compromise between the advantage of having simultaneous outstanding > * reads to the DMA array that can hide each others latency and the > @@ -14,14 +14,6 @@ > */ > #define PKTS_PER_BATCH 4 > > -#ifdef __clang__ > -#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for > -#elif __GNUC__ >= 8 > -#define loop_unrolled_for _Pragma("GCC unroll 4") for > -#else > -#define loop_unrolled_for for > -#endif > - > struct i40e_ring; > struct i40e_vsi; > struct net_device; > diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c > index e28f1905a4a0..9f47388eaba5 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c > @@ -2,6 +2,7 @@ > /* Copyright(c) 2018 Intel Corporation. 
*/ > > #include <linux/bpf_trace.h> > +#include <linux/unroll.h> > #include <net/xdp_sock_drv.h> > #include "i40e_txrx_common.h" > #include "i40e_xsk.h" > @@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des > dma_addr_t dma; > u32 i; > > - loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { > + unrolled_count(PKTS_PER_BATCH) > + for (i = 0; i < PKTS_PER_BATCH; i++) { > u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]); > > dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr); > -- > 2.48.1 >
On Thu, 6 Feb 2025 19:26:27 +0100 Alexander Lobakin <aleksander.lobakin@intel.com> wrote: > i40e, as well as ice, has a custom loop unrolling macro for unrolling > Tx descriptors filling on XSk xmit. > Replace i40e defs with generic unrolled_count(), which is also more > convenient as it allows passing defines as its argument, not hardcoded > values, while the loop declaration will still be a usual for-loop. .. > #define PKTS_PER_BATCH 4 > > -#ifdef __clang__ > -#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for > -#elif __GNUC__ >= 8 > -#define loop_unrolled_for _Pragma("GCC unroll 4") for > -#else > -#define loop_unrolled_for for > -#endif ... > @@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des > dma_addr_t dma; > u32 i; > > - loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { > + unrolled_count(PKTS_PER_BATCH) > + for (i = 0; i < PKTS_PER_BATCH; i++) { > u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]); > > dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr); The rest of that code is: tx_desc = I40E_TX_DESC(xdp_ring, ntu++); tx_desc->buffer_addr = cpu_to_le64(dma); tx_desc->cmd_type_offset_bsz = build_ctob(cmd, 0, desc[i].len, 0); *total_bytes += desc[i].len; } xdp_ring->next_to_use = ntu; } static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts, unsigned int *total_bytes) { u32 batched, leftover, i; batched = nb_pkts & ~(PKTS_PER_BATCH - 1); leftover = nb_pkts & (PKTS_PER_BATCH - 1); for (i = 0; i < batched; i += PKTS_PER_BATCH) i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes); for (i = batched; i < batched + leftover; i++) i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes); } If it isn't a silly question why all the faffing with unrolling? It isn't as though the loop body is trivial - it contains real function calls. 
Unrolling loops is so 1980s - unless you are trying to get the absolute max performance from a very short loop and need to unroll once (maybe twice) to get enough spare instruction execution slots to run the loop control code in parallel with the body. In this case it looks like the 'batched' loop contains an inlined copy of the function called for the remainder. I can't see anything else. You'd probably gain more by getting rid of the 'int *total_bytes' and using the function return value - that is what it is for. David
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index ef156fad52f2..dd16351a7af8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -6,7 +6,7 @@ #include <linux/types.h> -/* This value should match the pragma in the loop_unrolled_for +/* This value should match the pragma in the unrolled_count() * macro. Why 4? It is strictly empirical. It seems to be a good * compromise between the advantage of having simultaneous outstanding * reads to the DMA array that can hide each others latency and the @@ -14,14 +14,6 @@ */ #define PKTS_PER_BATCH 4 -#ifdef __clang__ -#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for -#elif __GNUC__ >= 8 -#define loop_unrolled_for _Pragma("GCC unroll 4") for -#else -#define loop_unrolled_for for -#endif - struct i40e_ring; struct i40e_vsi; struct net_device; diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index e28f1905a4a0..9f47388eaba5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,6 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include <linux/bpf_trace.h> +#include <linux/unroll.h> #include <net/xdp_sock_drv.h> #include "i40e_txrx_common.h" #include "i40e_xsk.h" @@ -529,7 +530,8 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des dma_addr_t dma; u32 i; - loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { + unrolled_count(PKTS_PER_BATCH) + for (i = 0; i < PKTS_PER_BATCH; i++) { u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]); dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
i40e, as well as ice, has a custom loop unrolling macro for unrolling Tx descriptors filling on XSk xmit. Replace i40e defs with generic unrolled_count(), which is also more convenient as it allows passing defines as its argument, not hardcoded values, while the loop declaration will still be a usual for-loop. Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com> --- drivers/net/ethernet/intel/i40e/i40e_xsk.h | 10 +--------- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 +++- 2 files changed, 4 insertions(+), 10 deletions(-)