diff mbox series

[net-next] net: axienet: Use NAPI for TX completion path

Message ID 20220429222835.3641895-1-robert.hancock@calian.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next] net: axienet: Use NAPI for TX completion path | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2 this patch: 2
netdev/cc_maintainers success CCed 8 of 8 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2 this patch: 2
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline success Was 0 now: 0

Commit Message

Robert Hancock April 29, 2022, 10:28 p.m. UTC
This driver was using the TX IRQ handler to perform all TX completion
tasks. Under heavy TX network load, this can cause significant irqs-off
latencies (found to be in the hundreds of microseconds using ftrace).
This can cause other issues, such as overrunning serial UART FIFOs when
using high baud rates with limited UART FIFO sizes.

Switch to using the NAPI poll handler to perform the TX completion work
to get this out of hard IRQ context and avoid the IRQ latency impact.

Signed-off-by: Robert Hancock <robert.hancock@calian.com>
---
 drivers/net/ethernet/xilinx/xilinx_axienet.h  |  2 +
 .../net/ethernet/xilinx/xilinx_axienet_main.c | 56 ++++++++++++-------
 2 files changed, 37 insertions(+), 21 deletions(-)

Comments

Radhey Shyam Pandey May 2, 2022, 7:30 p.m. UTC | #1
> -----Original Message-----
> From: Robert Hancock <robert.hancock@calian.com>
> Sent: Saturday, April 30, 2022 3:59 AM
> To: netdev@vger.kernel.org
> Cc: Radhey Shyam Pandey <radheys@xilinx.com>; davem@davemloft.net;
> edumazet@google.com; kuba@kernel.org; pabeni@redhat.com; Michal Simek
> <michals@xilinx.com>; linux-arm-kernel@lists.infradead.org; Robert Hancock
> <robert.hancock@calian.com>
> Subject: [PATCH net-next] net: axienet: Use NAPI for TX completion path
> 
> This driver was using the TX IRQ handler to perform all TX completion
> tasks. Under heavy TX network load, this can cause significant irqs-off
> latencies (found to be in the hundreds of microseconds using ftrace).
> This can cause other issues, such as overrunning serial UART FIFOs when
> using high baud rates with limited UART FIFO sizes.
> 
> Switch to using the NAPI poll handler to perform the TX completion work
> to get this out of hard IRQ context and avoid the IRQ latency impact.

Thanks for the patch. I assume for simulating heavy network load we
are using netperf/iperf. Do we have some details on the benchmark
before and after adding TX NAPI? I want to see the impact on
throughput.

> 
> Signed-off-by: Robert Hancock <robert.hancock@calian.com>
> ---
>  drivers/net/ethernet/xilinx/xilinx_axienet.h  |  2 +
>  .../net/ethernet/xilinx/xilinx_axienet_main.c | 56 ++++++++++++-------
>  2 files changed, 37 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h
> index d5c1e5c4a508..6e58d034fe90 100644
> --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h
> +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h
> @@ -397,6 +397,7 @@ struct axidma_bd {
>   * @regs:	Base address for the axienet_local device address space
>   * @dma_regs:	Base address for the axidma device address space
>   * @rx_dma_cr:  Nominal content of RX DMA control register
> + * @tx_dma_cr:  Nominal content of TX DMA control register
>   * @dma_err_task: Work structure to process Axi DMA errors
>   * @tx_irq:	Axidma TX IRQ number
>   * @rx_irq:	Axidma RX IRQ number
> @@ -454,6 +455,7 @@ struct axienet_local {
>  	void __iomem *dma_regs;
> 
>  	u32 rx_dma_cr;
> +	u32 tx_dma_cr;
> 
>  	struct work_struct dma_err_task;
> 
> diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
> index d6fc3f7acdf0..a52e616275e4 100644
> --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
> +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
> @@ -254,8 +254,6 @@ static u32 axienet_usec_to_timer(struct axienet_local *lp, u32 coalesce_usec)
>   */
>  static void axienet_dma_start(struct axienet_local *lp)
>  {
> -	u32 tx_cr;
> -
>  	/* Start updating the Rx channel control register */
>  	lp->rx_dma_cr = (lp->coalesce_count_rx << XAXIDMA_COALESCE_SHIFT) |
>  			XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
> @@ -269,16 +267,16 @@ static void axienet_dma_start(struct axienet_local *lp)
>  	axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr);
> 
>  	/* Start updating the Tx channel control register */
> -	tx_cr = (lp->coalesce_count_tx << XAXIDMA_COALESCE_SHIFT) |
> -		XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
> +	lp->tx_dma_cr = (lp->coalesce_count_tx << XAXIDMA_COALESCE_SHIFT) |
> +			XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
>  	/* Only set interrupt delay timer if not generating an interrupt on
>  	 * the first TX packet. Otherwise leave at 0 to disable delay interrupt.
>  	 */
>  	if (lp->coalesce_count_tx > 1)
> -		tx_cr |= (axienet_usec_to_timer(lp, lp->coalesce_usec_tx)
> -				<< XAXIDMA_DELAY_SHIFT) |
> -			 XAXIDMA_IRQ_DELAY_MASK;
> -	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, tx_cr);
> +		lp->tx_dma_cr |= (axienet_usec_to_timer(lp, lp->coalesce_usec_tx)
> +					<< XAXIDMA_DELAY_SHIFT) |
> +				 XAXIDMA_IRQ_DELAY_MASK;
> +	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
> 
>  	/* Populate the tail pointer and bring the Rx Axi DMA engine out of
>  	 * halted state. This will make the Rx side ready for reception.
> @@ -294,8 +292,8 @@ static void axienet_dma_start(struct axienet_local *lp)
>  	 * tail pointer register that the Tx channel will start transmitting.
>  	 */
>  	axienet_dma_out_addr(lp, XAXIDMA_TX_CDESC_OFFSET, lp->tx_bd_p);
> -	tx_cr |= XAXIDMA_CR_RUNSTOP_MASK;
> -	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, tx_cr);
> +	lp->tx_dma_cr |= XAXIDMA_CR_RUNSTOP_MASK;
> +	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
>  }
> 
>  /**
> @@ -671,13 +669,14 @@ static int axienet_device_reset(struct net_device *ndev)
>   * @nr_bds:	Number of descriptors to clean up, can be -1 if unknown.
>   * @sizep:	Pointer to a u32 filled with the total sum of all bytes
>   * 		in all cleaned-up descriptors. Ignored if NULL.
> + * @budget:	NAPI budget (use 0 when not called from NAPI poll)
>   *
>   * Would either be called after a successful transmit operation, or after
>   * there was an error when setting up the chain.
>   * Returns the number of descriptors handled.
>   */
>  static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
> -				 int nr_bds, u32 *sizep)
> +				 int nr_bds, u32 *sizep, int budget)
>  {
>  	struct axienet_local *lp = netdev_priv(ndev);
>  	struct axidma_bd *cur_p;
> @@ -707,7 +706,7 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
>  				 DMA_TO_DEVICE);
> 
>  		if (cur_p->skb && (status & XAXIDMA_BD_STS_COMPLETE_MASK))
> -			dev_consume_skb_irq(cur_p->skb);
> +			napi_consume_skb(cur_p->skb, budget);
> 
>  		cur_p->app0 = 0;
>  		cur_p->app1 = 0;
> @@ -756,20 +755,24 @@ static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
>   * axienet_start_xmit_done - Invoked once a transmit is completed by the
>   * Axi DMA Tx channel.
>   * @ndev:	Pointer to the net_device structure
> + * @budget:	NAPI budget
>   *
> - * This function is invoked from the Axi DMA Tx isr to notify the completion
> + * This function is invoked from the NAPI processing to notify the completion
>   * of transmit operation. It clears fields in the corresponding Tx BDs and
>   * unmaps the corresponding buffer so that CPU can regain ownership of the
>   * buffer. It finally invokes "netif_wake_queue" to restart transmission if
>   * required.
>   */
> -static void axienet_start_xmit_done(struct net_device *ndev)
> +static void axienet_start_xmit_done(struct net_device *ndev, int budget)
>  {
>  	struct axienet_local *lp = netdev_priv(ndev);
>  	u32 packets = 0;
>  	u32 size = 0;
> 
> -	packets = axienet_free_tx_chain(ndev, lp->tx_bd_ci, -1, &size);
> +	packets = axienet_free_tx_chain(ndev, lp->tx_bd_ci, -1, &size, budget);
> +
> +	if (!packets)
> +		return;
> 
>  	lp->tx_bd_ci += packets;
>  	if (lp->tx_bd_ci >= lp->tx_bd_num)
> @@ -865,7 +868,7 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>  				netdev_err(ndev, "TX DMA mapping error\n");
>  			ndev->stats.tx_dropped++;
>  			axienet_free_tx_chain(ndev, orig_tail_ptr, ii + 1,
> -					      NULL);
> +					      NULL, 0);
>  			lp->tx_bd_tail = orig_tail_ptr;
> 
>  			return NETDEV_TX_OK;
> @@ -899,9 +902,9 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>  }
> 
>  /**
> - * axienet_poll - Triggered by RX ISR to complete the received BD processing.
> + * axienet_poll - Triggered by RX/TX ISR to complete the BD processing.
>   * @napi:	Pointer to NAPI structure.
> - * @budget:	Max number of packets to process.
> + * @budget:	Max number of RX packets to process.
>   *
>   * Return: Number of RX packets processed.
>   */
> @@ -916,6 +919,8 @@ static int axienet_poll(struct napi_struct *napi, int budget)
>  	struct sk_buff *skb, *new_skb;
>  	struct axienet_local *lp = container_of(napi, struct axienet_local, napi);
> 
> +	axienet_start_xmit_done(lp->ndev, budget);
> +
>  	cur_p = &lp->rx_bd_v[lp->rx_bd_ci];
> 
>  	while (packets < budget && (cur_p->status & XAXIDMA_BD_STS_COMPLETE_MASK)) {
> @@ -1001,11 +1006,12 @@ static int axienet_poll(struct napi_struct *napi, int budget)
>  		axienet_dma_out_addr(lp, XAXIDMA_RX_TDESC_OFFSET, tail_p);
> 
>  	if (packets < budget && napi_complete_done(napi, packets)) {
> -		/* Re-enable RX completion interrupts. This should
> -		 * cause an immediate interrupt if any RX packets are
> +		/* Re-enable RX/TX completion interrupts. This should
> +		 * cause an immediate interrupt if any RX/TX packets are
>  		 * already pending.
>  		 */
>  		axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr);
> +		axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
>  	}
>  	return packets;
>  }
> @@ -1040,7 +1046,15 @@ static irqreturn_t axienet_tx_irq(int irq, void *_ndev)
>  			   (lp->tx_bd_v[lp->tx_bd_ci]).phys);
>  		schedule_work(&lp->dma_err_task);
>  	} else {
> -		axienet_start_xmit_done(lp->ndev);
> +		/* Disable further TX completion interrupts and schedule
> +		 * NAPI to handle the completions.
> +		 */
> +		u32 cr = lp->tx_dma_cr;
> +
> +		cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK);
> +		axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr);
> +
> +		napi_schedule(&lp->napi);
>  	}
> 
>  	return IRQ_HANDLED;
> --
> 2.31.1
Jakub Kicinski May 5, 2022, 2:20 a.m. UTC | #2
On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:
> > This driver was using the TX IRQ handler to perform all TX completion
> > tasks. Under heavy TX network load, this can cause significant irqs-off
> > latencies (found to be in the hundreds of microseconds using ftrace).
> > This can cause other issues, such as overrunning serial UART FIFOs when
> > using high baud rates with limited UART FIFO sizes.
> > 
> > Switch to using the NAPI poll handler to perform the TX completion work
> > to get this out of hard IRQ context and avoid the IRQ latency impact.  
> 
> Thanks for the patch. I assume for simulating heavy network load we
> are using netperf/iperf. Do we have some details on the benchmark
> before and after adding TX NAPI? I want to see the impact on
> throughput.

Seems like a reasonable ask, let's get the patch reposted 
with the numbers in the commit message.
Robert Hancock May 5, 2022, 5:33 p.m. UTC | #3
On Wed, 2022-05-04 at 19:20 -0700, Jakub Kicinski wrote:
> On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:
> > > This driver was using the TX IRQ handler to perform all TX completion
> > > tasks. Under heavy TX network load, this can cause significant irqs-off
> > > latencies (found to be in the hundreds of microseconds using ftrace).
> > > This can cause other issues, such as overrunning serial UART FIFOs when
> > > using high baud rates with limited UART FIFO sizes.
> > > 
> > > Switch to using the NAPI poll handler to perform the TX completion work
> > > to get this out of hard IRQ context and avoid the IRQ latency impact.  
> > 
> > Thanks for the patch. I assume for simulating heavy network load we
> > are using netperf/iperf. Do we have some details on the benchmark
> > before and after adding TX NAPI? I want to see the impact on
> > throughput.
> 
> Seems like a reasonable ask, let's get the patch reposted 
> with the numbers in the commit message.

Didn't mean to ignore that request, looks like I didn't get Radhey's email
directly, odd.

I did a test with iperf3 from the board (Xilinx MPSoC ZU9EG platform) connected
to a Linux PC via a switch at 1G link speed. With TX NAPI in place I saw about
942 Mbps for TX rate, with the previous code I saw 941 Mbps. RX speed was also
unchanged at 941 Mbps. So no real significant change either way. I can spin
another version of the patch that includes these numbers.
Jakub Kicinski May 5, 2022, 6:08 p.m. UTC | #4
On Thu, 5 May 2022 17:33:39 +0000 Robert Hancock wrote:
> On Wed, 2022-05-04 at 19:20 -0700, Jakub Kicinski wrote:
> > On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:  
> > > Thanks for the patch. I assume for simulating heavy network load we
> > > are using netperf/iperf. Do we have some details on the benchmark
> > > before and after adding TX NAPI? I want to see the impact on
> > > throughput.  
> > 
> > Seems like a reasonable ask, let's get the patch reposted 
> > with the numbers in the commit message.  
> 
> Didn't mean to ignore that request, looks like I didn't get Radhey's email
> directly, odd.
> 
> I did a test with iperf3 from the board (Xilinx MPSoC ZU9EG platform) connected
> to a Linux PC via a switch at 1G link speed. With TX NAPI in place I saw about
> 942 Mbps for TX rate, with the previous code I saw 941 Mbps. RX speed was also
> unchanged at 941 Mbps. So no real significant change either way. I can spin
> another version of the patch that includes these numbers.

Sounds like line rate, is there a difference in CPU utilization?
Robert Hancock May 5, 2022, 6:56 p.m. UTC | #5
On Thu, 2022-05-05 at 11:08 -0700, Jakub Kicinski wrote:
> On Thu, 5 May 2022 17:33:39 +0000 Robert Hancock wrote:
> > On Wed, 2022-05-04 at 19:20 -0700, Jakub Kicinski wrote:
> > > On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:  
> > > > Thanks for the patch. I assume for simulating heavy network load we
> > > > are using netperf/iperf. Do we have some details on the benchmark
> > > > before and after adding TX NAPI? I want to see the impact on
> > > > throughput.  
> > > 
> > > Seems like a reasonable ask, let's get the patch reposted 
> > > with the numbers in the commit message.  
> > 
> > Didn't mean to ignore that request, looks like I didn't get Radhey's email
> > directly, odd.
> > 
> > I did a test with iperf3 from the board (Xilinx MPSoC ZU9EG platform)
> > connected
> > to a Linux PC via a switch at 1G link speed. With TX NAPI in place I saw
> > about
> > 942 Mbps for TX rate, with the previous code I saw 941 Mbps. RX speed was
> > also
> > unchanged at 941 Mbps. So no real significant change either way. I can spin
> > another version of the patch that includes these numbers.
> 
> Sounds like line rate, is there a difference in CPU utilization?

Some measurements on that from the TX load case - in both cases the RX and TX
IRQs ended up being split across CPU0 and CPU3 due to irqbalance:

Before:

CPU0 (RX): 1% hard IRQ, 13% soft IRQ
CPU3 (TX): 12% hard IRQ, 30% soft IRQ

After:

CPU0 (RX): <1% hard IRQ, 29% soft IRQ
CPU3 (TX): <1% hard IRQ, 21% soft IRQ

The hard IRQ time is definitely lower, and the total CPU usage is lower as well
(56% down to 50%). It's interesting that so much of the CPU load ended up on
the CPU with the RX IRQ though, presumably because the RX and TX IRQs are
triggering the same NAPI poll operation. Since they're separate IRQs that can
be on separate CPUs, it might be a win to use separate NAPI poll structures
for RX and TX so that both CPUs aren't trying to hit the same rings (TX and
RX)?
Robert Hancock May 5, 2022, 8:15 p.m. UTC | #6
On Thu, 2022-05-05 at 12:56 -0600, Robert Hancock wrote:
> On Thu, 2022-05-05 at 11:08 -0700, Jakub Kicinski wrote:
> > On Thu, 5 May 2022 17:33:39 +0000 Robert Hancock wrote:
> > > On Wed, 2022-05-04 at 19:20 -0700, Jakub Kicinski wrote:
> > > > On Mon, 2 May 2022 19:30:51 +0000 Radhey Shyam Pandey wrote:  
> > > > > Thanks for the patch. I assume for simulating heavy network load we
> > > > > are using netperf/iperf. Do we have some details on the benchmark
> > > > > before and after adding TX NAPI? I want to see the impact on
> > > > > throughput.  
> > > > 
> > > > Seems like a reasonable ask, let's get the patch reposted 
> > > > with the numbers in the commit message.  
> > > 
> > > Didn't mean to ignore that request, looks like I didn't get Radhey's
> > > email
> > > directly, odd.
> > > 
> > > I did a test with iperf3 from the board (Xilinx MPSoC ZU9EG platform)
> > > connected
> > > to a Linux PC via a switch at 1G link speed. With TX NAPI in place I saw
> > > about
> > > 942 Mbps for TX rate, with the previous code I saw 941 Mbps. RX speed was
> > > also
> > > unchanged at 941 Mbps. So no real significant change either way. I can
> > > spin
> > > another version of the patch that includes these numbers.
> > 
> > Sounds like line rate, is there a difference in CPU utilization?
> 
> Some measurements on that from the TX load case - in both cases the RX and TX
> IRQs ended up being split across CPU0 and CPU3 due to irqbalance:
> 
> Before:
> 
> CPU0 (RX): 1% hard IRQ, 13% soft IRQ
> CPU3 (TX): 12% hard IRQ, 30% soft IRQ
> 
> After:
> 
> CPU0 (RX): <1% hard IRQ, 29% soft IRQ
> CPU3 (TX): <1% hard IRQ, 21% soft IRQ
> 
> The hard IRQ time is definitely lower, and the total CPU usage is lower as
> well
> (56% down to 50%). It's interesting that so much of the CPU load ended up on
> the CPU with the RX IRQ though, presumably because the RX and TX IRQs are
> triggering the same NAPI poll operation. Since they're separate IRQs that can
> be on separate CPUs, it might be a win to use separate NAPI poll structures
> for RX and TX so that both CPUs aren't trying to hit the same rings (TX and
> RX)?

Indeed, it appears that separate RX and TX NAPI polling lowers the CPU usage
overall by a few percent as well as keeping the TX work on the same CPU as the
TX IRQ. I'll submit a v3 with these changes and will include the softirq
numbers in the commit text.

>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h
index d5c1e5c4a508..6e58d034fe90 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet.h
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h
@@ -397,6 +397,7 @@  struct axidma_bd {
  * @regs:	Base address for the axienet_local device address space
  * @dma_regs:	Base address for the axidma device address space
  * @rx_dma_cr:  Nominal content of RX DMA control register
+ * @tx_dma_cr:  Nominal content of TX DMA control register
  * @dma_err_task: Work structure to process Axi DMA errors
  * @tx_irq:	Axidma TX IRQ number
  * @rx_irq:	Axidma RX IRQ number
@@ -454,6 +455,7 @@  struct axienet_local {
 	void __iomem *dma_regs;
 
 	u32 rx_dma_cr;
+	u32 tx_dma_cr;
 
 	struct work_struct dma_err_task;
 
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index d6fc3f7acdf0..a52e616275e4 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -254,8 +254,6 @@  static u32 axienet_usec_to_timer(struct axienet_local *lp, u32 coalesce_usec)
  */
 static void axienet_dma_start(struct axienet_local *lp)
 {
-	u32 tx_cr;
-
 	/* Start updating the Rx channel control register */
 	lp->rx_dma_cr = (lp->coalesce_count_rx << XAXIDMA_COALESCE_SHIFT) |
 			XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
@@ -269,16 +267,16 @@  static void axienet_dma_start(struct axienet_local *lp)
 	axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr);
 
 	/* Start updating the Tx channel control register */
-	tx_cr = (lp->coalesce_count_tx << XAXIDMA_COALESCE_SHIFT) |
-		XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
+	lp->tx_dma_cr = (lp->coalesce_count_tx << XAXIDMA_COALESCE_SHIFT) |
+			XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_ERROR_MASK;
 	/* Only set interrupt delay timer if not generating an interrupt on
 	 * the first TX packet. Otherwise leave at 0 to disable delay interrupt.
 	 */
 	if (lp->coalesce_count_tx > 1)
-		tx_cr |= (axienet_usec_to_timer(lp, lp->coalesce_usec_tx)
-				<< XAXIDMA_DELAY_SHIFT) |
-			 XAXIDMA_IRQ_DELAY_MASK;
-	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, tx_cr);
+		lp->tx_dma_cr |= (axienet_usec_to_timer(lp, lp->coalesce_usec_tx)
+					<< XAXIDMA_DELAY_SHIFT) |
+				 XAXIDMA_IRQ_DELAY_MASK;
+	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
 
 	/* Populate the tail pointer and bring the Rx Axi DMA engine out of
 	 * halted state. This will make the Rx side ready for reception.
@@ -294,8 +292,8 @@  static void axienet_dma_start(struct axienet_local *lp)
 	 * tail pointer register that the Tx channel will start transmitting.
 	 */
 	axienet_dma_out_addr(lp, XAXIDMA_TX_CDESC_OFFSET, lp->tx_bd_p);
-	tx_cr |= XAXIDMA_CR_RUNSTOP_MASK;
-	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, tx_cr);
+	lp->tx_dma_cr |= XAXIDMA_CR_RUNSTOP_MASK;
+	axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
 }
 
 /**
@@ -671,13 +669,14 @@  static int axienet_device_reset(struct net_device *ndev)
  * @nr_bds:	Number of descriptors to clean up, can be -1 if unknown.
  * @sizep:	Pointer to a u32 filled with the total sum of all bytes
  * 		in all cleaned-up descriptors. Ignored if NULL.
+ * @budget:	NAPI budget (use 0 when not called from NAPI poll)
  *
  * Would either be called after a successful transmit operation, or after
  * there was an error when setting up the chain.
  * Returns the number of descriptors handled.
  */
 static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
-				 int nr_bds, u32 *sizep)
+				 int nr_bds, u32 *sizep, int budget)
 {
 	struct axienet_local *lp = netdev_priv(ndev);
 	struct axidma_bd *cur_p;
@@ -707,7 +706,7 @@  static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
 				 DMA_TO_DEVICE);
 
 		if (cur_p->skb && (status & XAXIDMA_BD_STS_COMPLETE_MASK))
-			dev_consume_skb_irq(cur_p->skb);
+			napi_consume_skb(cur_p->skb, budget);
 
 		cur_p->app0 = 0;
 		cur_p->app1 = 0;
@@ -756,20 +755,24 @@  static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
  * axienet_start_xmit_done - Invoked once a transmit is completed by the
  * Axi DMA Tx channel.
  * @ndev:	Pointer to the net_device structure
+ * @budget:	NAPI budget
  *
- * This function is invoked from the Axi DMA Tx isr to notify the completion
+ * This function is invoked from the NAPI processing to notify the completion
  * of transmit operation. It clears fields in the corresponding Tx BDs and
  * unmaps the corresponding buffer so that CPU can regain ownership of the
  * buffer. It finally invokes "netif_wake_queue" to restart transmission if
  * required.
  */
-static void axienet_start_xmit_done(struct net_device *ndev)
+static void axienet_start_xmit_done(struct net_device *ndev, int budget)
 {
 	struct axienet_local *lp = netdev_priv(ndev);
 	u32 packets = 0;
 	u32 size = 0;
 
-	packets = axienet_free_tx_chain(ndev, lp->tx_bd_ci, -1, &size);
+	packets = axienet_free_tx_chain(ndev, lp->tx_bd_ci, -1, &size, budget);
+
+	if (!packets)
+		return;
 
 	lp->tx_bd_ci += packets;
 	if (lp->tx_bd_ci >= lp->tx_bd_num)
@@ -865,7 +868,7 @@  axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 				netdev_err(ndev, "TX DMA mapping error\n");
 			ndev->stats.tx_dropped++;
 			axienet_free_tx_chain(ndev, orig_tail_ptr, ii + 1,
-					      NULL);
+					      NULL, 0);
 			lp->tx_bd_tail = orig_tail_ptr;
 
 			return NETDEV_TX_OK;
@@ -899,9 +902,9 @@  axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 }
 
 /**
- * axienet_poll - Triggered by RX ISR to complete the received BD processing.
+ * axienet_poll - Triggered by RX/TX ISR to complete the BD processing.
  * @napi:	Pointer to NAPI structure.
- * @budget:	Max number of packets to process.
+ * @budget:	Max number of RX packets to process.
  *
  * Return: Number of RX packets processed.
  */
@@ -916,6 +919,8 @@  static int axienet_poll(struct napi_struct *napi, int budget)
 	struct sk_buff *skb, *new_skb;
 	struct axienet_local *lp = container_of(napi, struct axienet_local, napi);
 
+	axienet_start_xmit_done(lp->ndev, budget);
+
 	cur_p = &lp->rx_bd_v[lp->rx_bd_ci];
 
 	while (packets < budget && (cur_p->status & XAXIDMA_BD_STS_COMPLETE_MASK)) {
@@ -1001,11 +1006,12 @@  static int axienet_poll(struct napi_struct *napi, int budget)
 		axienet_dma_out_addr(lp, XAXIDMA_RX_TDESC_OFFSET, tail_p);
 
 	if (packets < budget && napi_complete_done(napi, packets)) {
-		/* Re-enable RX completion interrupts. This should
-		 * cause an immediate interrupt if any RX packets are
+		/* Re-enable RX/TX completion interrupts. This should
+		 * cause an immediate interrupt if any RX/TX packets are
 		 * already pending.
 		 */
 		axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, lp->rx_dma_cr);
+		axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, lp->tx_dma_cr);
 	}
 	return packets;
 }
@@ -1040,7 +1046,15 @@  static irqreturn_t axienet_tx_irq(int irq, void *_ndev)
 			   (lp->tx_bd_v[lp->tx_bd_ci]).phys);
 		schedule_work(&lp->dma_err_task);
 	} else {
-		axienet_start_xmit_done(lp->ndev);
+		/* Disable further TX completion interrupts and schedule
+		 * NAPI to handle the completions.
+		 */
+		u32 cr = lp->tx_dma_cr;
+
+		cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK);
+		axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr);
+
+		napi_schedule(&lp->napi);
 	}
 
 	return IRQ_HANDLED;