[net-next,4/4] igb: add AF_XDP zero-copy Tx support

Message ID 20240808183556.386397-5-anthony.l.nguyen@intel.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series igb: Add support for AF_XDP zero-copy

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 29 this patch: 29
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang success Errors and warnings before: 29 this patch: 29
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 29 this patch: 29
netdev/checkpatch warning WARNING: line length of 87 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 38 this patch: 38
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-08-09--09-00 (tests: 705)

Commit Message

Tony Nguyen Aug. 8, 2024, 6:35 p.m. UTC
From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>

Add support for AF_XDP zero-copy transmit path.

A new TX buffer type IGB_TYPE_XSK is introduced to indicate that the Tx
frame was allocated from the xsk buff pool, so igb_clean_tx_ring and
igb_clean_tx_irq can clean the buffers correctly based on type.

igb_xmit_zc performs the actual packet transmit when AF_XDP zero-copy is
enabled. We share the TX ring between slow path, XDP and AF_XDP
zero-copy, so we use the netdev queue lock to ensure mutual exclusion.

Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
[Kurt: Set olinfo_status in igb_xmit_zc() so that frames are transmitted]
Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Tested-by: Chandan Kumar Rout <chandanx.rout@intel.com> (A Contingent Worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h      |  2 +
 drivers/net/ethernet/intel/igb/igb_main.c | 56 +++++++++++++++++++----
 drivers/net/ethernet/intel/igb/igb_xsk.c  | 53 +++++++++++++++++++++
 3 files changed, 102 insertions(+), 9 deletions(-)

Comments

Fijalkowski, Maciej Aug. 10, 2024, 2:10 p.m. UTC | #1
On Thu, Aug 08, 2024 at 11:35:54AM -0700, Tony Nguyen wrote:
> From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> 
> Add support for AF_XDP zero-copy transmit path.
> 
> A new TX buffer type IGB_TYPE_XSK is introduced to indicate that the Tx
> frame was allocated from the xsk buff pool, so igb_clean_tx_ring and
> igb_clean_tx_irq can clean the buffers correctly based on type.
> 
> igb_xmit_zc performs the actual packet transmit when AF_XDP zero-copy is
> enabled. We share the TX ring between slow path, XDP and AF_XDP
> zero-copy, so we use the netdev queue lock to ensure mutual exclusion.
> 
> Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> [Kurt: Set olinfo_status in igb_xmit_zc() so that frames are transmitted]
> Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
> Tested-by: Chandan Kumar Rout <chandanx.rout@intel.com> (A Contingent Worker at Intel)
> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
> ---
>  drivers/net/ethernet/intel/igb/igb.h      |  2 +
>  drivers/net/ethernet/intel/igb/igb_main.c | 56 +++++++++++++++++++----
>  drivers/net/ethernet/intel/igb/igb_xsk.c  | 53 +++++++++++++++++++++
>  3 files changed, 102 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
> index 4983a6ec718e..9ee18ac1ba47 100644
> --- a/drivers/net/ethernet/intel/igb/igb.h
> +++ b/drivers/net/ethernet/intel/igb/igb.h
> @@ -257,6 +257,7 @@ enum igb_tx_flags {
>  enum igb_tx_buf_type {
>  	IGB_TYPE_SKB = 0,
>  	IGB_TYPE_XDP,
> +	IGB_TYPE_XSK
>  };
>  
>  /* wrapper around a pointer to a socket buffer,
> @@ -836,6 +837,7 @@ int igb_xsk_pool_setup(struct igb_adapter *adapter,
>  bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count);
>  void igb_clean_rx_ring_zc(struct igb_ring *rx_ring);
>  int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget);
> +bool igb_xmit_zc(struct igb_ring *tx_ring);
>  int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
>  
>  #endif /* _IGB_H_ */
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index 0b779b2ca9ea..1ebd67981978 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -2996,6 +2996,9 @@ static int igb_xdp_xmit(struct net_device *dev, int n,
>  	if (unlikely(!tx_ring))
>  		return -ENXIO;
>  
> +	if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)))
> +		return -ENXIO;
> +
>  	nq = txring_txq(tx_ring);
>  	__netif_tx_lock(nq, cpu);
>  
> @@ -4917,15 +4920,20 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring)
>  {
>  	u16 i = tx_ring->next_to_clean;
>  	struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
> +	u32 xsk_frames = 0;
>  
>  	while (i != tx_ring->next_to_use) {
>  		union e1000_adv_tx_desc *eop_desc, *tx_desc;
>  
>  		/* Free all the Tx ring sk_buffs or xdp frames */
> -		if (tx_buffer->type == IGB_TYPE_SKB)
> +		if (tx_buffer->type == IGB_TYPE_SKB) {
>  			dev_kfree_skb_any(tx_buffer->skb);
> -		else
> +		} else if (tx_buffer->type == IGB_TYPE_XDP) {
>  			xdp_return_frame(tx_buffer->xdpf);
> +		} else if (tx_buffer->type == IGB_TYPE_XSK) {
> +			xsk_frames++;
> +			goto skip_for_xsk;
> +		}
>  
>  		/* unmap skb header data */
>  		dma_unmap_single(tx_ring->dev,
> @@ -4956,6 +4964,7 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring)
>  					       DMA_TO_DEVICE);
>  		}
>  
> +skip_for_xsk:
>  		tx_buffer->next_to_watch = NULL;
>  
>  		/* move us one more past the eop_desc for start of next pkt */
> @@ -4970,6 +4979,9 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring)
>  	/* reset BQL for queue */
>  	netdev_tx_reset_queue(txring_txq(tx_ring));
>  
> +	if (tx_ring->xsk_pool && xsk_frames)
> +		xsk_tx_completed(tx_ring->xsk_pool, xsk_frames);
> +
>  	/* reset next_to_use and next_to_clean */
>  	tx_ring->next_to_use = 0;
>  	tx_ring->next_to_clean = 0;
> @@ -6503,6 +6515,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
>  		return NETDEV_TX_BUSY;
>  	}
>  
> +	if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)))
> +		return NETDEV_TX_BUSY;
> +
>  	/* record the location of the first descriptor for this packet */
>  	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
>  	first->type = IGB_TYPE_SKB;
> @@ -8263,13 +8278,17 @@ static int igb_poll(struct napi_struct *napi, int budget)
>   **/
>  static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
>  {
> -	struct igb_adapter *adapter = q_vector->adapter;
> -	struct igb_ring *tx_ring = q_vector->tx.ring;
> -	struct igb_tx_buffer *tx_buffer;
> -	union e1000_adv_tx_desc *tx_desc;
>  	unsigned int total_bytes = 0, total_packets = 0;
> +	struct igb_adapter *adapter = q_vector->adapter;
>  	unsigned int budget = q_vector->tx.work_limit;
> +	struct igb_ring *tx_ring = q_vector->tx.ring;
>  	unsigned int i = tx_ring->next_to_clean;
> +	union e1000_adv_tx_desc *tx_desc;
> +	struct igb_tx_buffer *tx_buffer;
> +	int cpu = smp_processor_id();
> +	bool xsk_xmit_done = true;
> +	struct netdev_queue *nq;
> +	u32 xsk_frames = 0;
>  
>  	if (test_bit(__IGB_DOWN, &adapter->state))
>  		return true;
> @@ -8300,10 +8319,14 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
>  		total_packets += tx_buffer->gso_segs;
>  
>  		/* free the skb */
> -		if (tx_buffer->type == IGB_TYPE_SKB)
> +		if (tx_buffer->type == IGB_TYPE_SKB) {
>  			napi_consume_skb(tx_buffer->skb, napi_budget);
> -		else
> +		} else if (tx_buffer->type == IGB_TYPE_XDP) {
>  			xdp_return_frame(tx_buffer->xdpf);
> +		} else if (tx_buffer->type == IGB_TYPE_XSK) {
> +			xsk_frames++;
> +			goto skip_for_xsk;
> +		}
>  
>  		/* unmap skb header data */
>  		dma_unmap_single(tx_ring->dev,
> @@ -8335,6 +8358,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
>  			}
>  		}
>  
> +skip_for_xsk:
>  		/* move us one more past the eop_desc for start of next pkt */
>  		tx_buffer++;
>  		tx_desc++;
> @@ -8363,6 +8387,20 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
>  	q_vector->tx.total_bytes += total_bytes;
>  	q_vector->tx.total_packets += total_packets;
>  
> +	if (tx_ring->xsk_pool) {

READ_ONCE()
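
That is, read the pool pointer once and operate on the local copy, since
igb_xsk_pool_setup() can swap it at runtime. A minimal sketch of the
suggested form (not the actual patch, just the same code reshaped):

	struct xsk_buff_pool *xsk_pool = READ_ONCE(tx_ring->xsk_pool);

	if (xsk_pool) {
		if (xsk_frames)
			xsk_tx_completed(xsk_pool, xsk_frames);
		if (xsk_uses_need_wakeup(xsk_pool))
			xsk_set_tx_need_wakeup(xsk_pool);

		nq = txring_txq(tx_ring);
		__netif_tx_lock(nq, cpu);
		/* Avoid transmit queue timeout since we share it with the slow path */
		txq_trans_cond_update(nq);
		xsk_xmit_done = igb_xmit_zc(tx_ring);
		__netif_tx_unlock(nq);
	}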

> +		if (xsk_frames)
> +			xsk_tx_completed(tx_ring->xsk_pool, xsk_frames);
> +		if (xsk_uses_need_wakeup(tx_ring->xsk_pool))
> +			xsk_set_tx_need_wakeup(tx_ring->xsk_pool);
> +
> +		nq = txring_txq(tx_ring);
> +		__netif_tx_lock(nq, cpu);
> +		/* Avoid transmit queue timeout since we share it with the slow path */
> +		txq_trans_cond_update(nq);
> +		xsk_xmit_done = igb_xmit_zc(tx_ring);
> +		__netif_tx_unlock(nq);
> +	}
> +
>  	if (test_bit(IGB_RING_FLAG_TX_DETECT_HANG, &tx_ring->flags)) {
>  		struct e1000_hw *hw = &adapter->hw;
>  
> @@ -8425,7 +8463,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
>  		}
>  	}
>  
> -	return !!budget;
> +	return !!budget && xsk_xmit_done;
>  }
>  
>  /**
> diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c
> index 66cdc30e9b6e..4e530e1eb3c0 100644
> --- a/drivers/net/ethernet/intel/igb/igb_xsk.c
> +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
> @@ -431,6 +431,59 @@ int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget)
>  	return failure ? budget : (int)total_packets;
>  }
>  
> +bool igb_xmit_zc(struct igb_ring *tx_ring)
> +{
> +	unsigned int budget = igb_desc_unused(tx_ring);
> +	struct xsk_buff_pool *pool = tx_ring->xsk_pool;
> +	u32 cmd_type, olinfo_status, nb_pkts, i = 0;
> +	struct xdp_desc *descs = pool->tx_descs;
> +	union e1000_adv_tx_desc *tx_desc = NULL;
> +	struct igb_tx_buffer *tx_buffer_info;
> +	unsigned int total_bytes = 0;
> +	dma_addr_t dma;

check IGB_RING_FLAG_TX_DISABLED?
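
i.e. bail out early, like igb_xdp_xmit() and igb_xmit_frame_ring() already
do in this patch. A minimal sketch:

	if (test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))
		return true;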

> +
> +	nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
> +	if (!nb_pkts)
> +		return true;
> +
> +	while (nb_pkts-- > 0) {
> +		dma = xsk_buff_raw_get_dma(pool, descs[i].addr);
> +		xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len);
> +
> +		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
> +		tx_buffer_info->bytecount = descs[i].len;
> +		tx_buffer_info->type = IGB_TYPE_XSK;
> +		tx_buffer_info->xdpf = NULL;
> +		tx_buffer_info->gso_segs = 1;
> +		tx_buffer_info->time_stamp = jiffies;
> +
> +		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
> +		tx_desc->read.buffer_addr = cpu_to_le64(dma);
> +
> +		/* put descriptor type bits */
> +		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
> +			   E1000_ADVTXD_DCMD_IFCS;
> +		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
> +
> +		cmd_type |= descs[i].len | IGB_TXD_DCMD;

This is also sub-optimal as you are setting RS bit on each Tx descriptor,
which will in turn raise a lot of irqs. See how ice sets RS bit only on
last desc from a batch and then, on cleaning side, how it finds a
descriptor that is supposed to have DD bit written by HW.
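
Roughly, keep EOP on every frame but set RS only on the last one of the
batch, e.g. (untested sketch, assuming the usual E1000_ADVTXD_DCMD_EOP/_RS
defines; the cleaning side would then look for DD on that one descriptor):

		cmd_type |= descs[i].len | E1000_ADVTXD_DCMD_EOP;
		if (!nb_pkts)	/* last frame of this batch */
			cmd_type |= E1000_ADVTXD_DCMD_RS;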

> +		tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
> +		tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
> +
> +		total_bytes += descs[i].len;
> +
> +		i++;
> +		tx_ring->next_to_use++;
> +		tx_buffer_info->next_to_watch = tx_desc;
> +		if (tx_ring->next_to_use == tx_ring->count)
> +			tx_ring->next_to_use = 0;
> +	}
> +
> +	netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes);
> +	igb_xdp_ring_update_tail(tx_ring);
> +
> +	return nb_pkts < budget;
> +}
> +
>  int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
>  {
>  	struct igb_adapter *adapter = netdev_priv(dev);
> -- 
> 2.42.0
>
Kurt Kanzenbach Aug. 14, 2024, 8:36 a.m. UTC | #2
On Sat Aug 10 2024, Maciej Fijalkowski wrote:
>> +	nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
>> +	if (!nb_pkts)
>> +		return true;
>> +
>> +	while (nb_pkts-- > 0) {
>> +		dma = xsk_buff_raw_get_dma(pool, descs[i].addr);
>> +		xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len);
>> +
>> +		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
>> +		tx_buffer_info->bytecount = descs[i].len;
>> +		tx_buffer_info->type = IGB_TYPE_XSK;
>> +		tx_buffer_info->xdpf = NULL;
>> +		tx_buffer_info->gso_segs = 1;
>> +		tx_buffer_info->time_stamp = jiffies;
>> +
>> +		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
>> +		tx_desc->read.buffer_addr = cpu_to_le64(dma);
>> +
>> +		/* put descriptor type bits */
>> +		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
>> +			   E1000_ADVTXD_DCMD_IFCS;
>> +		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
>> +
>> +		cmd_type |= descs[i].len | IGB_TXD_DCMD;
>
> This is also sub-optimal as you are setting RS bit on each Tx descriptor,
> which will in turn raise a lot of irqs. See how ice sets RS bit only on
> last desc from a batch and then, on cleaning side, how it finds a
> descriptor that is supposed to have DD bit written by HW.

I see your point. That requires changes to the cleaning side. However,
igb_clean_tx_irq() is shared between normal and zero-copy path.

The amount of irqs can be also controlled by irq coalescing or even
using busy polling. So I'd rather keep this implementation as simple as
it is now.

Thanks,
Kurt
Fijalkowski, Maciej Aug. 14, 2024, 8:55 a.m. UTC | #3
On Wed, Aug 14, 2024 at 10:36:32AM +0200, Kurt Kanzenbach wrote:
> On Sat Aug 10 2024, Maciej Fijalkowski wrote:
> >> +	nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
> >> +	if (!nb_pkts)
> >> +		return true;
> >> +
> >> +	while (nb_pkts-- > 0) {
> >> +		dma = xsk_buff_raw_get_dma(pool, descs[i].addr);
> >> +		xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len);
> >> +
> >> +		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
> >> +		tx_buffer_info->bytecount = descs[i].len;
> >> +		tx_buffer_info->type = IGB_TYPE_XSK;
> >> +		tx_buffer_info->xdpf = NULL;
> >> +		tx_buffer_info->gso_segs = 1;
> >> +		tx_buffer_info->time_stamp = jiffies;
> >> +
> >> +		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
> >> +		tx_desc->read.buffer_addr = cpu_to_le64(dma);
> >> +
> >> +		/* put descriptor type bits */
> >> +		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
> >> +			   E1000_ADVTXD_DCMD_IFCS;
> >> +		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
> >> +
> >> +		cmd_type |= descs[i].len | IGB_TXD_DCMD;
> >
> > This is also sub-optimal as you are setting RS bit on each Tx descriptor,
> > which will in turn raise a lot of irqs. See how ice sets RS bit only on
> > last desc from a batch and then, on cleaning side, how it finds a
> > descriptor that is supposed to have DD bit written by HW.
> 
> I see your point. That requires changes to the cleaning side. However,
> igb_clean_tx_irq() is shared between normal and zero-copy path.

Ok if that's too much of a hassle then let's leave it as-is. I can address
that in some nearby future.

> 
> The amount of irqs can be also controlled by irq coalescing or even
> using busy polling. So I'd rather keep this implementation as simple as
> it is now.

That has nothing to do with what I was describing.

> 
> Thanks,
> Kurt
Kurt Kanzenbach Aug. 14, 2024, 9:12 a.m. UTC | #4
On Wed Aug 14 2024, Maciej Fijalkowski wrote:
> On Wed, Aug 14, 2024 at 10:36:32AM +0200, Kurt Kanzenbach wrote:
>> On Sat Aug 10 2024, Maciej Fijalkowski wrote:
>> >> +	nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
>> >> +	if (!nb_pkts)
>> >> +		return true;
>> >> +
>> >> +	while (nb_pkts-- > 0) {
>> >> +		dma = xsk_buff_raw_get_dma(pool, descs[i].addr);
>> >> +		xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len);
>> >> +
>> >> +		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
>> >> +		tx_buffer_info->bytecount = descs[i].len;
>> >> +		tx_buffer_info->type = IGB_TYPE_XSK;
>> >> +		tx_buffer_info->xdpf = NULL;
>> >> +		tx_buffer_info->gso_segs = 1;
>> >> +		tx_buffer_info->time_stamp = jiffies;
>> >> +
>> >> +		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
>> >> +		tx_desc->read.buffer_addr = cpu_to_le64(dma);
>> >> +
>> >> +		/* put descriptor type bits */
>> >> +		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
>> >> +			   E1000_ADVTXD_DCMD_IFCS;
>> >> +		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
>> >> +
>> >> +		cmd_type |= descs[i].len | IGB_TXD_DCMD;
>> >
>> > This is also sub-optimal as you are setting RS bit on each Tx descriptor,
>> > which will in turn raise a lot of irqs. See how ice sets RS bit only on
>> > last desc from a batch and then, on cleaning side, how it finds a
>> > descriptor that is supposed to have DD bit written by HW.
>> 
>> I see your point. That requires changes to the cleaning side. However,
>> igb_clean_tx_irq() is shared between normal and zero-copy path.
>
> Ok if that's too much of a hassle then let's leave it as-is. I can address
> that in some nearby future.

How would you do that, by adding a dedicated igb_clean_tx_irq_zc()
function? Or is there a more simple way?

BTW: This needs to be addressed in igc too.

>
>> 
>> The amount of irqs can be also controlled by irq coalescing or even
>> using busy polling. So I'd rather keep this implementation as simple as
>> it is now.
>
> That has nothing to do with what I was describing.

Ok, maybe I misunderstood your suggestion. It seemed to me that adding
the RS bit to the last frame of the burst will reduce the amount of
raised irqs.

Thanks,
Kurt
Fijalkowski, Maciej Aug. 14, 2024, 10:26 a.m. UTC | #5
On Wed, Aug 14, 2024 at 11:12:30AM +0200, Kurt Kanzenbach wrote:
> On Wed Aug 14 2024, Maciej Fijalkowski wrote:
> > On Wed, Aug 14, 2024 at 10:36:32AM +0200, Kurt Kanzenbach wrote:
> >> On Sat Aug 10 2024, Maciej Fijalkowski wrote:
> >> >> +	nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
> >> >> +	if (!nb_pkts)
> >> >> +		return true;
> >> >> +
> >> >> +	while (nb_pkts-- > 0) {
> >> >> +		dma = xsk_buff_raw_get_dma(pool, descs[i].addr);
> >> >> +		xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len);
> >> >> +
> >> >> +		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
> >> >> +		tx_buffer_info->bytecount = descs[i].len;
> >> >> +		tx_buffer_info->type = IGB_TYPE_XSK;
> >> >> +		tx_buffer_info->xdpf = NULL;
> >> >> +		tx_buffer_info->gso_segs = 1;
> >> >> +		tx_buffer_info->time_stamp = jiffies;
> >> >> +
> >> >> +		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
> >> >> +		tx_desc->read.buffer_addr = cpu_to_le64(dma);
> >> >> +
> >> >> +		/* put descriptor type bits */
> >> >> +		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
> >> >> +			   E1000_ADVTXD_DCMD_IFCS;
> >> >> +		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
> >> >> +
> >> >> +		cmd_type |= descs[i].len | IGB_TXD_DCMD;
> >> >
> >> > This is also sub-optimal as you are setting RS bit on each Tx descriptor,
> >> > which will in turn raise a lot of irqs. See how ice sets RS bit only on
> >> > last desc from a batch and then, on cleaning side, how it finds a
> >> > descriptor that is supposed to have DD bit written by HW.
> >> 
> >> I see your point. That requires changes to the cleaning side. However,
> >> igb_clean_tx_irq() is shared between normal and zero-copy path.
> >
> > Ok if that's too much of a hassle then let's leave it as-is. I can address
> > that in some nearby future.
> 
> How would you do that, by adding a dedicated igb_clean_tx_irq_zc()
> function? Or is there a more simple way?

Yes that would be my first approach.
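
A rough idea of what the core of such an igb_clean_tx_irq_zc() could look
like, borrowing the ice approach (hypothetical sketch; the ring field
tracking the RS position and the completed-frame count don't exist yet):

	tx_desc = IGB_TX_DESC(tx_ring, tx_ring->xsk_last_rs);	/* xsk_last_rs: hypothetical */
	if (!(tx_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD)))
		return;	/* HW has not written back the batch yet */

	/* everything up to and including xsk_last_rs is complete */
	xsk_tx_completed(tx_ring->xsk_pool, completed_frames);	/* completed_frames: hypothetical */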

> 
> BTW: This needs to be addressed in igc too.

Argh!

> 
> >
> >> 
> >> The amount of irqs can be also controlled by irq coalescing or even
> >> using busy polling. So I'd rather keep this implementation as simple as
> >> it is now.
> >
> > That has nothing to do with what I was describing.
> 
> Ok, maybe I misunderstood your suggestion. It seemed to me that adding
> the RS bit to the last frame of the burst will reduce the amount of
> raised irqs.

You got it right, but I don't think it's related to any outer settings.
The main case here is that by doing what I proposed you get much less PCIe
traffic which in turn yields better performance.

> 
> Thanks,
> Kurt
Kurt Kanzenbach Aug. 14, 2024, 12:51 p.m. UTC | #6
On Wed Aug 14 2024, Maciej Fijalkowski wrote:
>> >> The amount of irqs can be also controlled by irq coalescing or even
>> >> using busy polling. So I'd rather keep this implementation as simple as
>> >> it is now.
>> >
>> > That has nothing to do with what I was describing.
>> 
>> Ok, maybe I misunderstood your suggestion. It seemed to me that adding
>> the RS bit to the last frame of the burst will reduce the amount of
>> raised irqs.
>
> You got it right, but I don't think it's related to any outer settings.
> The main case here is that by doing what I proposed you get much less PCIe
> traffic which in turn yields better performance.

I see, makes sense. Then, let's address this in another patchset also
for igc.

Thanks,
Kurt

Patch

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 4983a6ec718e..9ee18ac1ba47 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -257,6 +257,7 @@  enum igb_tx_flags {
 enum igb_tx_buf_type {
 	IGB_TYPE_SKB = 0,
 	IGB_TYPE_XDP,
+	IGB_TYPE_XSK
 };
 
 /* wrapper around a pointer to a socket buffer,
@@ -836,6 +837,7 @@  int igb_xsk_pool_setup(struct igb_adapter *adapter,
 bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count);
 void igb_clean_rx_ring_zc(struct igb_ring *rx_ring);
 int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget);
+bool igb_xmit_zc(struct igb_ring *tx_ring);
 int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
 
 #endif /* _IGB_H_ */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 0b779b2ca9ea..1ebd67981978 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2996,6 +2996,9 @@  static int igb_xdp_xmit(struct net_device *dev, int n,
 	if (unlikely(!tx_ring))
 		return -ENXIO;
 
+	if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)))
+		return -ENXIO;
+
 	nq = txring_txq(tx_ring);
 	__netif_tx_lock(nq, cpu);
 
@@ -4917,15 +4920,20 @@  void igb_clean_tx_ring(struct igb_ring *tx_ring)
 {
 	u16 i = tx_ring->next_to_clean;
 	struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
+	u32 xsk_frames = 0;
 
 	while (i != tx_ring->next_to_use) {
 		union e1000_adv_tx_desc *eop_desc, *tx_desc;
 
 		/* Free all the Tx ring sk_buffs or xdp frames */
-		if (tx_buffer->type == IGB_TYPE_SKB)
+		if (tx_buffer->type == IGB_TYPE_SKB) {
 			dev_kfree_skb_any(tx_buffer->skb);
-		else
+		} else if (tx_buffer->type == IGB_TYPE_XDP) {
 			xdp_return_frame(tx_buffer->xdpf);
+		} else if (tx_buffer->type == IGB_TYPE_XSK) {
+			xsk_frames++;
+			goto skip_for_xsk;
+		}
 
 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,
@@ -4956,6 +4964,7 @@  void igb_clean_tx_ring(struct igb_ring *tx_ring)
 					       DMA_TO_DEVICE);
 		}
 
+skip_for_xsk:
 		tx_buffer->next_to_watch = NULL;
 
 		/* move us one more past the eop_desc for start of next pkt */
@@ -4970,6 +4979,9 @@  void igb_clean_tx_ring(struct igb_ring *tx_ring)
 	/* reset BQL for queue */
 	netdev_tx_reset_queue(txring_txq(tx_ring));
 
+	if (tx_ring->xsk_pool && xsk_frames)
+		xsk_tx_completed(tx_ring->xsk_pool, xsk_frames);
+
 	/* reset next_to_use and next_to_clean */
 	tx_ring->next_to_use = 0;
 	tx_ring->next_to_clean = 0;
@@ -6503,6 +6515,9 @@  netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
 		return NETDEV_TX_BUSY;
 	}
 
+	if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)))
+		return NETDEV_TX_BUSY;
+
 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
 	first->type = IGB_TYPE_SKB;
@@ -8263,13 +8278,17 @@  static int igb_poll(struct napi_struct *napi, int budget)
  **/
 static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 {
-	struct igb_adapter *adapter = q_vector->adapter;
-	struct igb_ring *tx_ring = q_vector->tx.ring;
-	struct igb_tx_buffer *tx_buffer;
-	union e1000_adv_tx_desc *tx_desc;
 	unsigned int total_bytes = 0, total_packets = 0;
+	struct igb_adapter *adapter = q_vector->adapter;
 	unsigned int budget = q_vector->tx.work_limit;
+	struct igb_ring *tx_ring = q_vector->tx.ring;
 	unsigned int i = tx_ring->next_to_clean;
+	union e1000_adv_tx_desc *tx_desc;
+	struct igb_tx_buffer *tx_buffer;
+	int cpu = smp_processor_id();
+	bool xsk_xmit_done = true;
+	struct netdev_queue *nq;
+	u32 xsk_frames = 0;
 
 	if (test_bit(__IGB_DOWN, &adapter->state))
 		return true;
@@ -8300,10 +8319,14 @@  static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 		total_packets += tx_buffer->gso_segs;
 
 		/* free the skb */
-		if (tx_buffer->type == IGB_TYPE_SKB)
+		if (tx_buffer->type == IGB_TYPE_SKB) {
 			napi_consume_skb(tx_buffer->skb, napi_budget);
-		else
+		} else if (tx_buffer->type == IGB_TYPE_XDP) {
 			xdp_return_frame(tx_buffer->xdpf);
+		} else if (tx_buffer->type == IGB_TYPE_XSK) {
+			xsk_frames++;
+			goto skip_for_xsk;
+		}
 
 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,
@@ -8335,6 +8358,7 @@  static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 			}
 		}
 
+skip_for_xsk:
 		/* move us one more past the eop_desc for start of next pkt */
 		tx_buffer++;
 		tx_desc++;
@@ -8363,6 +8387,20 @@  static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 	q_vector->tx.total_bytes += total_bytes;
 	q_vector->tx.total_packets += total_packets;
 
+	if (tx_ring->xsk_pool) {
+		if (xsk_frames)
+			xsk_tx_completed(tx_ring->xsk_pool, xsk_frames);
+		if (xsk_uses_need_wakeup(tx_ring->xsk_pool))
+			xsk_set_tx_need_wakeup(tx_ring->xsk_pool);
+
+		nq = txring_txq(tx_ring);
+		__netif_tx_lock(nq, cpu);
+		/* Avoid transmit queue timeout since we share it with the slow path */
+		txq_trans_cond_update(nq);
+		xsk_xmit_done = igb_xmit_zc(tx_ring);
+		__netif_tx_unlock(nq);
+	}
+
 	if (test_bit(IGB_RING_FLAG_TX_DETECT_HANG, &tx_ring->flags)) {
 		struct e1000_hw *hw = &adapter->hw;
 
@@ -8425,7 +8463,7 @@  static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
 		}
 	}
 
-	return !!budget;
+	return !!budget && xsk_xmit_done;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c
index 66cdc30e9b6e..4e530e1eb3c0 100644
--- a/drivers/net/ethernet/intel/igb/igb_xsk.c
+++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
@@ -431,6 +431,59 @@  int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget)
 	return failure ? budget : (int)total_packets;
 }
 
+bool igb_xmit_zc(struct igb_ring *tx_ring)
+{
+	unsigned int budget = igb_desc_unused(tx_ring);
+	struct xsk_buff_pool *pool = tx_ring->xsk_pool;
+	u32 cmd_type, olinfo_status, nb_pkts, i = 0;
+	struct xdp_desc *descs = pool->tx_descs;
+	union e1000_adv_tx_desc *tx_desc = NULL;
+	struct igb_tx_buffer *tx_buffer_info;
+	unsigned int total_bytes = 0;
+	dma_addr_t dma;
+
+	nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
+	if (!nb_pkts)
+		return true;
+
+	while (nb_pkts-- > 0) {
+		dma = xsk_buff_raw_get_dma(pool, descs[i].addr);
+		xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len);
+
+		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+		tx_buffer_info->bytecount = descs[i].len;
+		tx_buffer_info->type = IGB_TYPE_XSK;
+		tx_buffer_info->xdpf = NULL;
+		tx_buffer_info->gso_segs = 1;
+		tx_buffer_info->time_stamp = jiffies;
+
+		tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
+		tx_desc->read.buffer_addr = cpu_to_le64(dma);
+
+		/* put descriptor type bits */
+		cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
+			   E1000_ADVTXD_DCMD_IFCS;
+		olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
+
+		cmd_type |= descs[i].len | IGB_TXD_DCMD;
+		tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
+		tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
+
+		total_bytes += descs[i].len;
+
+		i++;
+		tx_ring->next_to_use++;
+		tx_buffer_info->next_to_watch = tx_desc;
+		if (tx_ring->next_to_use == tx_ring->count)
+			tx_ring->next_to_use = 0;
+	}
+
+	netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes);
+	igb_xdp_ring_update_tail(tx_ring);
+
+	return nb_pkts < budget;
+}
+
 int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 {
 	struct igb_adapter *adapter = netdev_priv(dev);