Message ID | 20240808183556.386397-5-anthony.l.nguyen@intel.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | igb: Add support for AF_XDP zero-copy | expand |
On Thu, Aug 08, 2024 at 11:35:54AM -0700, Tony Nguyen wrote: > From: Sriram Yagnaraman <sriram.yagnaraman@est.tech> > > Add support for AF_XDP zero-copy transmit path. > > A new TX buffer type IGB_TYPE_XSK is introduced to indicate that the Tx > frame was allocated from the xsk buff pool, so igb_clean_tx_ring and > igb_clean_tx_irq can clean the buffers correctly based on type. > > igb_xmit_zc performs the actual packet transmit when AF_XDP zero-copy is > enabled. We share the TX ring between slow path, XDP and AF_XDP > zero-copy, so we use the netdev queue lock to ensure mutual exclusion. > > Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech> > [Kurt: Set olinfo_status in igb_xmit_zc() so that frames are transmitted] > Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de> > Tested-by: Chandan Kumar Rout <chandanx.rout@intel.com> (A Contingent Worker at Intel) > Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com> > --- > drivers/net/ethernet/intel/igb/igb.h | 2 + > drivers/net/ethernet/intel/igb/igb_main.c | 56 +++++++++++++++++++---- > drivers/net/ethernet/intel/igb/igb_xsk.c | 53 +++++++++++++++++++++ > 3 files changed, 102 insertions(+), 9 deletions(-) > > diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h > index 4983a6ec718e..9ee18ac1ba47 100644 > --- a/drivers/net/ethernet/intel/igb/igb.h > +++ b/drivers/net/ethernet/intel/igb/igb.h > @@ -257,6 +257,7 @@ enum igb_tx_flags { > enum igb_tx_buf_type { > IGB_TYPE_SKB = 0, > IGB_TYPE_XDP, > + IGB_TYPE_XSK > }; > > /* wrapper around a pointer to a socket buffer, > @@ -836,6 +837,7 @@ int igb_xsk_pool_setup(struct igb_adapter *adapter, > bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count); > void igb_clean_rx_ring_zc(struct igb_ring *rx_ring); > int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget); > +bool igb_xmit_zc(struct igb_ring *tx_ring); > int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags); > > #endif /* _IGB_H_ 
*/ > diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c > index 0b779b2ca9ea..1ebd67981978 100644 > --- a/drivers/net/ethernet/intel/igb/igb_main.c > +++ b/drivers/net/ethernet/intel/igb/igb_main.c > @@ -2996,6 +2996,9 @@ static int igb_xdp_xmit(struct net_device *dev, int n, > if (unlikely(!tx_ring)) > return -ENXIO; > > + if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))) > + return -ENXIO; > + > nq = txring_txq(tx_ring); > __netif_tx_lock(nq, cpu); > > @@ -4917,15 +4920,20 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring) > { > u16 i = tx_ring->next_to_clean; > struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i]; > + u32 xsk_frames = 0; > > while (i != tx_ring->next_to_use) { > union e1000_adv_tx_desc *eop_desc, *tx_desc; > > /* Free all the Tx ring sk_buffs or xdp frames */ > - if (tx_buffer->type == IGB_TYPE_SKB) > + if (tx_buffer->type == IGB_TYPE_SKB) { > dev_kfree_skb_any(tx_buffer->skb); > - else > + } else if (tx_buffer->type == IGB_TYPE_XDP) { > xdp_return_frame(tx_buffer->xdpf); > + } else if (tx_buffer->type == IGB_TYPE_XSK) { > + xsk_frames++; > + goto skip_for_xsk; > + } > > /* unmap skb header data */ > dma_unmap_single(tx_ring->dev, > @@ -4956,6 +4964,7 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring) > DMA_TO_DEVICE); > } > > +skip_for_xsk: > tx_buffer->next_to_watch = NULL; > > /* move us one more past the eop_desc for start of next pkt */ > @@ -4970,6 +4979,9 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring) > /* reset BQL for queue */ > netdev_tx_reset_queue(txring_txq(tx_ring)); > > + if (tx_ring->xsk_pool && xsk_frames) > + xsk_tx_completed(tx_ring->xsk_pool, xsk_frames); > + > /* reset next_to_use and next_to_clean */ > tx_ring->next_to_use = 0; > tx_ring->next_to_clean = 0; > @@ -6503,6 +6515,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, > return NETDEV_TX_BUSY; > } > > + if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))) > + 
return NETDEV_TX_BUSY; > + > /* record the location of the first descriptor for this packet */ > first = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; > first->type = IGB_TYPE_SKB; > @@ -8263,13 +8278,17 @@ static int igb_poll(struct napi_struct *napi, int budget) > **/ > static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) > { > - struct igb_adapter *adapter = q_vector->adapter; > - struct igb_ring *tx_ring = q_vector->tx.ring; > - struct igb_tx_buffer *tx_buffer; > - union e1000_adv_tx_desc *tx_desc; > unsigned int total_bytes = 0, total_packets = 0; > + struct igb_adapter *adapter = q_vector->adapter; > unsigned int budget = q_vector->tx.work_limit; > + struct igb_ring *tx_ring = q_vector->tx.ring; > unsigned int i = tx_ring->next_to_clean; > + union e1000_adv_tx_desc *tx_desc; > + struct igb_tx_buffer *tx_buffer; > + int cpu = smp_processor_id(); > + bool xsk_xmit_done = true; > + struct netdev_queue *nq; > + u32 xsk_frames = 0; > > if (test_bit(__IGB_DOWN, &adapter->state)) > return true; > @@ -8300,10 +8319,14 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) > total_packets += tx_buffer->gso_segs; > > /* free the skb */ > - if (tx_buffer->type == IGB_TYPE_SKB) > + if (tx_buffer->type == IGB_TYPE_SKB) { > napi_consume_skb(tx_buffer->skb, napi_budget); > - else > + } else if (tx_buffer->type == IGB_TYPE_XDP) { > xdp_return_frame(tx_buffer->xdpf); > + } else if (tx_buffer->type == IGB_TYPE_XSK) { > + xsk_frames++; > + goto skip_for_xsk; > + } > > /* unmap skb header data */ > dma_unmap_single(tx_ring->dev, > @@ -8335,6 +8358,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) > } > } > > +skip_for_xsk: > /* move us one more past the eop_desc for start of next pkt */ > tx_buffer++; > tx_desc++; > @@ -8363,6 +8387,20 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) > q_vector->tx.total_bytes += total_bytes; > q_vector->tx.total_packets += 
total_packets; > > + if (tx_ring->xsk_pool) { [reviewer's inline note: use READ_ONCE() when reading tx_ring->xsk_pool] > + if (xsk_frames) > + xsk_tx_completed(tx_ring->xsk_pool, xsk_frames); > + if (xsk_uses_need_wakeup(tx_ring->xsk_pool)) > + xsk_set_tx_need_wakeup(tx_ring->xsk_pool); > + > + nq = txring_txq(tx_ring); > + __netif_tx_lock(nq, cpu); > + /* Avoid transmit queue timeout since we share it with the slow path */ > + txq_trans_cond_update(nq); > + xsk_xmit_done = igb_xmit_zc(tx_ring); > + __netif_tx_unlock(nq); > + } > + > if (test_bit(IGB_RING_FLAG_TX_DETECT_HANG, &tx_ring->flags)) { > struct e1000_hw *hw = &adapter->hw; > > @@ -8425,7 +8463,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) > } > } > > - return !!budget; > + return !!budget && xsk_xmit_done; > } > > /** > diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c > index 66cdc30e9b6e..4e530e1eb3c0 100644 > --- a/drivers/net/ethernet/intel/igb/igb_xsk.c > +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c > @@ -431,6 +431,59 @@ int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget) > return failure ? budget : (int)total_packets; > } > > +bool igb_xmit_zc(struct igb_ring *tx_ring) > +{ > + unsigned int budget = igb_desc_unused(tx_ring); > + struct xsk_buff_pool *pool = tx_ring->xsk_pool; > + u32 cmd_type, olinfo_status, nb_pkts, i = 0; > + struct xdp_desc *descs = pool->tx_descs; > + union e1000_adv_tx_desc *tx_desc = NULL; > + struct igb_tx_buffer *tx_buffer_info; > + unsigned int total_bytes = 0; > + dma_addr_t dma; [reviewer's inline note: should this check IGB_RING_FLAG_TX_DISABLED?] 
> + > + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); > + if (!nb_pkts) > + return true; > + > + while (nb_pkts-- > 0) { > + dma = xsk_buff_raw_get_dma(pool, descs[i].addr); > + xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len); > + > + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; > + tx_buffer_info->bytecount = descs[i].len; > + tx_buffer_info->type = IGB_TYPE_XSK; > + tx_buffer_info->xdpf = NULL; > + tx_buffer_info->gso_segs = 1; > + tx_buffer_info->time_stamp = jiffies; > + > + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use); > + tx_desc->read.buffer_addr = cpu_to_le64(dma); > + > + /* put descriptor type bits */ > + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT | > + E1000_ADVTXD_DCMD_IFCS; > + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT; > + > + cmd_type |= descs[i].len | IGB_TXD_DCMD; [reviewer's inline note:] This is also sub-optimal as you are setting RS bit on each Tx descriptor, which will in turn raise a lot of irqs. See how ice sets RS bit only on last desc from a batch and then, on cleaning side, how it finds a descriptor that is supposed to have DD bit written by HW. > + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); > + tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status); > + > + total_bytes += descs[i].len; > + > + i++; > + tx_ring->next_to_use++; > + tx_buffer_info->next_to_watch = tx_desc; > + if (tx_ring->next_to_use == tx_ring->count) > + tx_ring->next_to_use = 0; > + } > + > + netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes); > + igb_xdp_ring_update_tail(tx_ring); > + > + return nb_pkts < budget; > +} > + > int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) > { > struct igb_adapter *adapter = netdev_priv(dev); > -- > 2.42.0 >
On Sat Aug 10 2024, Maciej Fijalkowski wrote: >> + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); >> + if (!nb_pkts) >> + return true; >> + >> + while (nb_pkts-- > 0) { >> + dma = xsk_buff_raw_get_dma(pool, descs[i].addr); >> + xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len); >> + >> + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; >> + tx_buffer_info->bytecount = descs[i].len; >> + tx_buffer_info->type = IGB_TYPE_XSK; >> + tx_buffer_info->xdpf = NULL; >> + tx_buffer_info->gso_segs = 1; >> + tx_buffer_info->time_stamp = jiffies; >> + >> + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use); >> + tx_desc->read.buffer_addr = cpu_to_le64(dma); >> + >> + /* put descriptor type bits */ >> + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT | >> + E1000_ADVTXD_DCMD_IFCS; >> + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT; >> + >> + cmd_type |= descs[i].len | IGB_TXD_DCMD; > > This is also sub-optimal as you are setting RS bit on each Tx descriptor, > which will in turn raise a lot of irqs. See how ice sets RS bit only on > last desc from a batch and then, on cleaning side, how it finds a > descriptor that is supposed to have DD bit written by HW. I see your point. That requires changes to the cleaning side. However, igb_clean_tx_irq() is shared between normal and zero-copy path. The amount of irqs can be also controlled by irq coalescing or even using busy polling. So I'd rather keep this implementation as simple as it is now. Thanks, Kurt
On Wed, Aug 14, 2024 at 10:36:32AM +0200, Kurt Kanzenbach wrote: > On Sat Aug 10 2024, Maciej Fijalkowski wrote: > >> + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); > >> + if (!nb_pkts) > >> + return true; > >> + > >> + while (nb_pkts-- > 0) { > >> + dma = xsk_buff_raw_get_dma(pool, descs[i].addr); > >> + xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len); > >> + > >> + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; > >> + tx_buffer_info->bytecount = descs[i].len; > >> + tx_buffer_info->type = IGB_TYPE_XSK; > >> + tx_buffer_info->xdpf = NULL; > >> + tx_buffer_info->gso_segs = 1; > >> + tx_buffer_info->time_stamp = jiffies; > >> + > >> + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use); > >> + tx_desc->read.buffer_addr = cpu_to_le64(dma); > >> + > >> + /* put descriptor type bits */ > >> + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT | > >> + E1000_ADVTXD_DCMD_IFCS; > >> + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT; > >> + > >> + cmd_type |= descs[i].len | IGB_TXD_DCMD; > > > > This is also sub-optimal as you are setting RS bit on each Tx descriptor, > > which will in turn raise a lot of irqs. See how ice sets RS bit only on > > last desc from a batch and then, on cleaning side, how it finds a > > descriptor that is supposed to have DD bit written by HW. > > I see your point. That requires changes to the cleaning side. However, > igb_clean_tx_irq() is shared between normal and zero-copy path. Ok if that's too much of a hassle then let's leave it as-is. I can address that in some nearby future. > > The amount of irqs can be also controlled by irq coalescing or even > using busy polling. So I'd rather keep this implementation as simple as > it is now. That has nothing to do with what I was describing. > > Thanks, > Kurt
On Wed Aug 14 2024, Maciej Fijalkowski wrote: > On Wed, Aug 14, 2024 at 10:36:32AM +0200, Kurt Kanzenbach wrote: >> On Sat Aug 10 2024, Maciej Fijalkowski wrote: >> >> + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); >> >> + if (!nb_pkts) >> >> + return true; >> >> + >> >> + while (nb_pkts-- > 0) { >> >> + dma = xsk_buff_raw_get_dma(pool, descs[i].addr); >> >> + xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len); >> >> + >> >> + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; >> >> + tx_buffer_info->bytecount = descs[i].len; >> >> + tx_buffer_info->type = IGB_TYPE_XSK; >> >> + tx_buffer_info->xdpf = NULL; >> >> + tx_buffer_info->gso_segs = 1; >> >> + tx_buffer_info->time_stamp = jiffies; >> >> + >> >> + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use); >> >> + tx_desc->read.buffer_addr = cpu_to_le64(dma); >> >> + >> >> + /* put descriptor type bits */ >> >> + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT | >> >> + E1000_ADVTXD_DCMD_IFCS; >> >> + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT; >> >> + >> >> + cmd_type |= descs[i].len | IGB_TXD_DCMD; >> > >> > This is also sub-optimal as you are setting RS bit on each Tx descriptor, >> > which will in turn raise a lot of irqs. See how ice sets RS bit only on >> > last desc from a batch and then, on cleaning side, how it finds a >> > descriptor that is supposed to have DD bit written by HW. >> >> I see your point. That requires changes to the cleaning side. However, >> igb_clean_tx_irq() is shared between normal and zero-copy path. > > Ok if that's too much of a hassle then let's leave it as-is. I can address > that in some nearby future. How would you do that, by adding a dedicated igb_clean_tx_irq_zc() function? Or is there a more simple way? BTW: This needs to be addressed in igc too. > >> >> The amount of irqs can be also controlled by irq coalescing or even >> using busy polling. So I'd rather keep this implementation as simple as >> it is now. 
> > That has nothing to do with what I was describing. Ok, maybe I misunderstood your suggestion. It seemed to me that adding the RS bit to the last frame of the burst will reduce the amount of raised irqs. Thanks, Kurt
On Wed, Aug 14, 2024 at 11:12:30AM +0200, Kurt Kanzenbach wrote: > On Wed Aug 14 2024, Maciej Fijalkowski wrote: > > On Wed, Aug 14, 2024 at 10:36:32AM +0200, Kurt Kanzenbach wrote: > >> On Sat Aug 10 2024, Maciej Fijalkowski wrote: > >> >> + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); > >> >> + if (!nb_pkts) > >> >> + return true; > >> >> + > >> >> + while (nb_pkts-- > 0) { > >> >> + dma = xsk_buff_raw_get_dma(pool, descs[i].addr); > >> >> + xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len); > >> >> + > >> >> + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; > >> >> + tx_buffer_info->bytecount = descs[i].len; > >> >> + tx_buffer_info->type = IGB_TYPE_XSK; > >> >> + tx_buffer_info->xdpf = NULL; > >> >> + tx_buffer_info->gso_segs = 1; > >> >> + tx_buffer_info->time_stamp = jiffies; > >> >> + > >> >> + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use); > >> >> + tx_desc->read.buffer_addr = cpu_to_le64(dma); > >> >> + > >> >> + /* put descriptor type bits */ > >> >> + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT | > >> >> + E1000_ADVTXD_DCMD_IFCS; > >> >> + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT; > >> >> + > >> >> + cmd_type |= descs[i].len | IGB_TXD_DCMD; > >> > > >> > This is also sub-optimal as you are setting RS bit on each Tx descriptor, > >> > which will in turn raise a lot of irqs. See how ice sets RS bit only on > >> > last desc from a batch and then, on cleaning side, how it finds a > >> > descriptor that is supposed to have DD bit written by HW. > >> > >> I see your point. That requires changes to the cleaning side. However, > >> igb_clean_tx_irq() is shared between normal and zero-copy path. > > > > Ok if that's too much of a hassle then let's leave it as-is. I can address > > that in some nearby future. > > How would you do that, by adding a dedicated igb_clean_tx_irq_zc() > function? Or is there a more simple way? Yes that would be my first approach. 
> > BTW: This needs to be addressed in igc too. Argh! > > > > >> > >> The amount of irqs can be also controlled by irq coalescing or even > >> using busy polling. So I'd rather keep this implementation as simple as > >> it is now. > > > > That has nothing to do with what I was describing. > > Ok, maybe I misunderstood your suggestion. It seemed to me that adding > the RS bit to the last frame of the burst will reduce the amount of > raised irqs. You got it right, but I don't think it's related to any outer settings. The main case here is that by doing what I proposed you get much less PCIe traffic which in turn yields better performance. > > Thanks, > Kurt
On Wed Aug 14 2024, Maciej Fijalkowski wrote: >> >> The amount of irqs can be also controlled by irq coalescing or even >> >> using busy polling. So I'd rather keep this implementation as simple as >> >> it is now. >> > >> > That has nothing to do with what I was describing. >> >> Ok, maybe I misunderstood your suggestion. It seemed to me that adding >> the RS bit to the last frame of the burst will reduce the amount of >> raised irqs. > > You got it right, but I don't think it's related to any outer settings. > The main case here is that by doing what I proposed you get much less PCIe > traffic which in turn yields better performance. I see, makes sense. Then, let's address this in another patchset also for igc. Thanks, Kurt
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 4983a6ec718e..9ee18ac1ba47 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -257,6 +257,7 @@ enum igb_tx_flags { enum igb_tx_buf_type { IGB_TYPE_SKB = 0, IGB_TYPE_XDP, + IGB_TYPE_XSK }; /* wrapper around a pointer to a socket buffer, @@ -836,6 +837,7 @@ int igb_xsk_pool_setup(struct igb_adapter *adapter, bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count); void igb_clean_rx_ring_zc(struct igb_ring *rx_ring); int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget); +bool igb_xmit_zc(struct igb_ring *tx_ring); int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags); #endif /* _IGB_H_ */ diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 0b779b2ca9ea..1ebd67981978 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2996,6 +2996,9 @@ static int igb_xdp_xmit(struct net_device *dev, int n, if (unlikely(!tx_ring)) return -ENXIO; + if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))) + return -ENXIO; + nq = txring_txq(tx_ring); __netif_tx_lock(nq, cpu); @@ -4917,15 +4920,20 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring) { u16 i = tx_ring->next_to_clean; struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i]; + u32 xsk_frames = 0; while (i != tx_ring->next_to_use) { union e1000_adv_tx_desc *eop_desc, *tx_desc; /* Free all the Tx ring sk_buffs or xdp frames */ - if (tx_buffer->type == IGB_TYPE_SKB) + if (tx_buffer->type == IGB_TYPE_SKB) { dev_kfree_skb_any(tx_buffer->skb); - else + } else if (tx_buffer->type == IGB_TYPE_XDP) { xdp_return_frame(tx_buffer->xdpf); + } else if (tx_buffer->type == IGB_TYPE_XSK) { + xsk_frames++; + goto skip_for_xsk; + } /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -4956,6 +4964,7 @@ void igb_clean_tx_ring(struct 
igb_ring *tx_ring) DMA_TO_DEVICE); } +skip_for_xsk: tx_buffer->next_to_watch = NULL; /* move us one more past the eop_desc for start of next pkt */ @@ -4970,6 +4979,9 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring) /* reset BQL for queue */ netdev_tx_reset_queue(txring_txq(tx_ring)); + if (tx_ring->xsk_pool && xsk_frames) + xsk_tx_completed(tx_ring->xsk_pool, xsk_frames); + /* reset next_to_use and next_to_clean */ tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; @@ -6503,6 +6515,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, return NETDEV_TX_BUSY; } + if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags))) + return NETDEV_TX_BUSY; + /* record the location of the first descriptor for this packet */ first = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; first->type = IGB_TYPE_SKB; @@ -8263,13 +8278,17 @@ static int igb_poll(struct napi_struct *napi, int budget) **/ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) { - struct igb_adapter *adapter = q_vector->adapter; - struct igb_ring *tx_ring = q_vector->tx.ring; - struct igb_tx_buffer *tx_buffer; - union e1000_adv_tx_desc *tx_desc; unsigned int total_bytes = 0, total_packets = 0; + struct igb_adapter *adapter = q_vector->adapter; unsigned int budget = q_vector->tx.work_limit; + struct igb_ring *tx_ring = q_vector->tx.ring; unsigned int i = tx_ring->next_to_clean; + union e1000_adv_tx_desc *tx_desc; + struct igb_tx_buffer *tx_buffer; + int cpu = smp_processor_id(); + bool xsk_xmit_done = true; + struct netdev_queue *nq; + u32 xsk_frames = 0; if (test_bit(__IGB_DOWN, &adapter->state)) return true; @@ -8300,10 +8319,14 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) total_packets += tx_buffer->gso_segs; /* free the skb */ - if (tx_buffer->type == IGB_TYPE_SKB) + if (tx_buffer->type == IGB_TYPE_SKB) { napi_consume_skb(tx_buffer->skb, napi_budget); - else + } else if (tx_buffer->type == IGB_TYPE_XDP) { 
xdp_return_frame(tx_buffer->xdpf); + } else if (tx_buffer->type == IGB_TYPE_XSK) { + xsk_frames++; + goto skip_for_xsk; + } /* unmap skb header data */ dma_unmap_single(tx_ring->dev, @@ -8335,6 +8358,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) } } +skip_for_xsk: /* move us one more past the eop_desc for start of next pkt */ tx_buffer++; tx_desc++; @@ -8363,6 +8387,20 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) q_vector->tx.total_bytes += total_bytes; q_vector->tx.total_packets += total_packets; + if (tx_ring->xsk_pool) { + if (xsk_frames) + xsk_tx_completed(tx_ring->xsk_pool, xsk_frames); + if (xsk_uses_need_wakeup(tx_ring->xsk_pool)) + xsk_set_tx_need_wakeup(tx_ring->xsk_pool); + + nq = txring_txq(tx_ring); + __netif_tx_lock(nq, cpu); + /* Avoid transmit queue timeout since we share it with the slow path */ + txq_trans_cond_update(nq); + xsk_xmit_done = igb_xmit_zc(tx_ring); + __netif_tx_unlock(nq); + } + if (test_bit(IGB_RING_FLAG_TX_DETECT_HANG, &tx_ring->flags)) { struct e1000_hw *hw = &adapter->hw; @@ -8425,7 +8463,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) } } - return !!budget; + return !!budget && xsk_xmit_done; } /** diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c index 66cdc30e9b6e..4e530e1eb3c0 100644 --- a/drivers/net/ethernet/intel/igb/igb_xsk.c +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c @@ -431,6 +431,59 @@ int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget) return failure ? 
budget : (int)total_packets; } +bool igb_xmit_zc(struct igb_ring *tx_ring) +{ + unsigned int budget = igb_desc_unused(tx_ring); + struct xsk_buff_pool *pool = tx_ring->xsk_pool; + u32 cmd_type, olinfo_status, nb_pkts, i = 0; + struct xdp_desc *descs = pool->tx_descs; + union e1000_adv_tx_desc *tx_desc = NULL; + struct igb_tx_buffer *tx_buffer_info; + unsigned int total_bytes = 0; + dma_addr_t dma; + + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); + if (!nb_pkts) + return true; + + while (nb_pkts-- > 0) { + dma = xsk_buff_raw_get_dma(pool, descs[i].addr); + xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len); + + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; + tx_buffer_info->bytecount = descs[i].len; + tx_buffer_info->type = IGB_TYPE_XSK; + tx_buffer_info->xdpf = NULL; + tx_buffer_info->gso_segs = 1; + tx_buffer_info->time_stamp = jiffies; + + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use); + tx_desc->read.buffer_addr = cpu_to_le64(dma); + + /* put descriptor type bits */ + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_IFCS; + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT; + + cmd_type |= descs[i].len | IGB_TXD_DCMD; + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); + tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status); + + total_bytes += descs[i].len; + + i++; + tx_ring->next_to_use++; + tx_buffer_info->next_to_watch = tx_desc; + if (tx_ring->next_to_use == tx_ring->count) + tx_ring->next_to_use = 0; + } + + netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes); + igb_xdp_ring_update_tail(tx_ring); + + return nb_pkts < budget; +} + int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) { struct igb_adapter *adapter = netdev_priv(dev);