diff mbox series

[for-next,1/4] dmaengine: ti: k3-udma: Use ktime/usleep_range based TX completion check

Message ID 20200127132111.20464-2-peter.ujfalusi@ti.com (mailing list archive)
State Changes Requested
Headers show
Series dmaengine: ti: k3-udma: Updates for next | expand

Commit Message

Peter Ujfalusi Jan. 27, 2020, 1:21 p.m. UTC
From: Vignesh Raghavendra <vigneshr@ti.com>

In some cases (McSPI for example) the jiffie and delayed_work based
workaround can cause big throughput drop.

Switch to use ktime/usleep_range based implementation to be able
to sustain speed for PDMA based peripherals.

Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
---
 drivers/dma/ti/k3-udma.c | 80 ++++++++++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 27 deletions(-)

Comments

Vinod Koul Jan. 28, 2020, 11:48 a.m. UTC | #1
On 27-01-20, 15:21, Peter Ujfalusi wrote:
> From: Vignesh Raghavendra <vigneshr@ti.com>
> 
> In some cases (McSPI for example) the jiffie and delayed_work based
> workaround can cause big throughput drop.
> 
> Switch to use ktime/usleep_range based implementation to be able
> to sustain speed for PDMA based peripherals.
> 
> Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
> Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
> ---
>  drivers/dma/ti/k3-udma.c | 80 ++++++++++++++++++++++++++--------------
>  1 file changed, 53 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
> index ea79c2df28e0..fb59c869a6a7 100644
> --- a/drivers/dma/ti/k3-udma.c
> +++ b/drivers/dma/ti/k3-udma.c
> @@ -5,6 +5,7 @@
>   */
>  
>  #include <linux/kernel.h>
> +#include <linux/delay.h>
>  #include <linux/dmaengine.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/dmapool.h>
> @@ -169,7 +170,7 @@ enum udma_chan_state {
>  
>  struct udma_tx_drain {
>  	struct delayed_work work;
> -	unsigned long jiffie;
> +	ktime_t tstamp;
>  	u32 residue;
>  };
>  
> @@ -946,9 +947,10 @@ static bool udma_is_desc_really_done(struct udma_chan *uc, struct udma_desc *d)
>  	peer_bcnt = udma_tchanrt_read(uc->tchan, UDMA_TCHAN_RT_PEER_BCNT_REG);
>  	bcnt = udma_tchanrt_read(uc->tchan, UDMA_TCHAN_RT_BCNT_REG);
>  
> +	/* Transfer is incomplete, store current residue and time stamp */
>  	if (peer_bcnt < bcnt) {
>  		uc->tx_drain.residue = bcnt - peer_bcnt;
> -		uc->tx_drain.jiffie = jiffies;
> +		uc->tx_drain.tstamp = ktime_get();

Any reason why ktime_get() is better than jiffies..?

>  		return false;
>  	}
>  
> @@ -961,35 +963,59 @@ static void udma_check_tx_completion(struct work_struct *work)
>  					    tx_drain.work.work);
>  	bool desc_done = true;
>  	u32 residue_diff;
> -	unsigned long jiffie_diff, delay;
> +	ktime_t time_diff;
> +	unsigned long delay;
> +
> +	while (1) {
> +		if (uc->desc) {
> +			/* Get previous residue and time stamp */
> +			residue_diff = uc->tx_drain.residue;
> +			time_diff = uc->tx_drain.tstamp;
> +			/*
> +			 * Get current residue and time stamp or see if
> +			 * transfer is complete
> +			 */
> +			desc_done = udma_is_desc_really_done(uc, uc->desc);
> +		}
>  
> -	if (uc->desc) {
> -		residue_diff = uc->tx_drain.residue;
> -		jiffie_diff = uc->tx_drain.jiffie;
> -		desc_done = udma_is_desc_really_done(uc, uc->desc);
> -	}
> -
> -	if (!desc_done) {
> -		jiffie_diff = uc->tx_drain.jiffie - jiffie_diff;
> -		residue_diff -= uc->tx_drain.residue;
> -		if (residue_diff) {
> -			/* Try to guess when we should check next time */
> -			residue_diff /= jiffie_diff;
> -			delay = uc->tx_drain.residue / residue_diff / 3;
> -			if (jiffies_to_msecs(delay) < 5)
> -				delay = 0;
> -		} else {
> -			/* No progress, check again in 1 second  */
> -			delay = HZ;
> +		if (!desc_done) {
> +			/*
> +			 * Find the time delta and residue delta w.r.t
> +			 * previous poll
> +			 */
> +			time_diff = ktime_sub(uc->tx_drain.tstamp,
> +					      time_diff) + 1;
> +			residue_diff -= uc->tx_drain.residue;
> +			if (residue_diff) {
> +				/*
> +				 * Try to guess when we should check
> +				 * next time by calculating rate at
> +				 * which data is being drained at the
> +				 * peer device
> +				 */
> +				delay = (time_diff / residue_diff) *
> +					uc->tx_drain.residue;
> +			} else {
> +				/* No progress, check again in 1 second  */
> +				schedule_delayed_work(&uc->tx_drain.work, HZ);
> +				break;
> +			}
> +
> +			usleep_range(ktime_to_us(delay),
> +				     ktime_to_us(delay) + 10);
> +			continue;
>  		}
>  
> -		schedule_delayed_work(&uc->tx_drain.work, delay);
> -	} else if (uc->desc) {
> -		struct udma_desc *d = uc->desc;
> +		if (uc->desc) {
> +			struct udma_desc *d = uc->desc;
> +
> +			uc->bcnt += d->residue;
> +			udma_start(uc);
> +			vchan_cookie_complete(&d->vd);
> +			break;
> +		}
>  
> -		uc->bcnt += d->residue;
> -		udma_start(uc);
> -		vchan_cookie_complete(&d->vd);
> +		break;
>  	}
>  }
>  
> -- 
> Peter
> 
> Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki.
> Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki
Vignesh Raghavendra Jan. 28, 2020, 12:05 p.m. UTC | #2
Hi Vinod,

On 1/28/2020 5:18 PM, Vinod Koul wrote:
> On 27-01-20, 15:21, Peter Ujfalusi wrote:
>> From: Vignesh Raghavendra <vigneshr@ti.com>
>>
>> In some cases (McSPI for example) the jiffie and delayed_work based
>> workaround can cause big throughput drop.
>>
>> Switch to use ktime/usleep_range based implementation to be able
>> to sustain speed for PDMA based peripherals.
>>
>> Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
>> Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
>> ---
>>  drivers/dma/ti/k3-udma.c | 80 ++++++++++++++++++++++++++--------------
>>  1 file changed, 53 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
>> index ea79c2df28e0..fb59c869a6a7 100644
>> --- a/drivers/dma/ti/k3-udma.c
>> +++ b/drivers/dma/ti/k3-udma.c
>> @@ -5,6 +5,7 @@
>>   */
>>  
>>  #include <linux/kernel.h>
>> +#include <linux/delay.h>
>>  #include <linux/dmaengine.h>
>>  #include <linux/dma-mapping.h>
>>  #include <linux/dmapool.h>
>> @@ -169,7 +170,7 @@ enum udma_chan_state {
>>  
>>  struct udma_tx_drain {
>>  	struct delayed_work work;
>> -	unsigned long jiffie;
>> +	ktime_t tstamp;
>>  	u32 residue;
>>  };
>>  
>> @@ -946,9 +947,10 @@ static bool udma_is_desc_really_done(struct udma_chan *uc, struct udma_desc *d)
>>  	peer_bcnt = udma_tchanrt_read(uc->tchan, UDMA_TCHAN_RT_PEER_BCNT_REG);
>>  	bcnt = udma_tchanrt_read(uc->tchan, UDMA_TCHAN_RT_BCNT_REG);
>>  
>> +	/* Transfer is incomplete, store current residue and time stamp */
>>  	if (peer_bcnt < bcnt) {
>>  		uc->tx_drain.residue = bcnt - peer_bcnt;
>> -		uc->tx_drain.jiffie = jiffies;
>> +		uc->tx_drain.tstamp = ktime_get();
> 
> Any reason why ktime_get() is better than jiffies..?

Resolution of jiffies is 4ms. ktime_t is has better resolution (upto ns
scale). With jiffies, I observed that code was either always polling DMA
progress counters (which affects HW data transfer speed) or sleeping too
long, both causing performance loss. Switching to ktime_t provides
better prediction of how long transfer takes to complete.

Regards
Vignesh
Vinod Koul Jan. 28, 2020, 12:44 p.m. UTC | #3
On 28-01-20, 17:35, Vignesh Raghavendra wrote:

> >> +	/* Transfer is incomplete, store current residue and time stamp */
> >>  	if (peer_bcnt < bcnt) {
> >>  		uc->tx_drain.residue = bcnt - peer_bcnt;
> >> -		uc->tx_drain.jiffie = jiffies;
> >> +		uc->tx_drain.tstamp = ktime_get();
> > 
> > Any reason why ktime_get() is better than jiffies..?
> 
> Resolution of jiffies is 4ms. ktime_t is has better resolution (upto ns
> scale). With jiffies, I observed that code was either always polling DMA
> progress counters (which affects HW data transfer speed) or sleeping too
> long, both causing performance loss. Switching to ktime_t provides
> better prediction of how long transfer takes to complete.

Thanks for explanation, i think it is good info to add in changelog.
Peter Ujfalusi Feb. 11, 2020, 10:13 a.m. UTC | #4
On 28/01/2020 14.44, Vinod Koul wrote:
> On 28-01-20, 17:35, Vignesh Raghavendra wrote:
> 
>>>> +	/* Transfer is incomplete, store current residue and time stamp */
>>>>  	if (peer_bcnt < bcnt) {
>>>>  		uc->tx_drain.residue = bcnt - peer_bcnt;
>>>> -		uc->tx_drain.jiffie = jiffies;
>>>> +		uc->tx_drain.tstamp = ktime_get();
>>>
>>> Any reason why ktime_get() is better than jiffies..?
>>
>> Resolution of jiffies is 4ms. ktime_t is has better resolution (upto ns
>> scale). With jiffies, I observed that code was either always polling DMA
>> progress counters (which affects HW data transfer speed) or sleeping too
>> long, both causing performance loss. Switching to ktime_t provides
>> better prediction of how long transfer takes to complete.
> 
> Thanks for explanation, i think it is good info to add in changelog.

It turns out that this patch causes lockup with UART stress testing.
The strange thing is that we have identical patch in production with
4.19 without issues.

I'll send two series for UDMA update as we have found a way to induce a
kernel crash with experimental UART patches.
One with patches as must bug fixes for 5.6 and another one with lower
priority fixes.

- Péter

Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki.
Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki
diff mbox series

Patch

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index ea79c2df28e0..fb59c869a6a7 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -5,6 +5,7 @@ 
  */
 
 #include <linux/kernel.h>
+#include <linux/delay.h>
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmapool.h>
@@ -169,7 +170,7 @@  enum udma_chan_state {
 
 struct udma_tx_drain {
 	struct delayed_work work;
-	unsigned long jiffie;
+	ktime_t tstamp;
 	u32 residue;
 };
 
@@ -946,9 +947,10 @@  static bool udma_is_desc_really_done(struct udma_chan *uc, struct udma_desc *d)
 	peer_bcnt = udma_tchanrt_read(uc->tchan, UDMA_TCHAN_RT_PEER_BCNT_REG);
 	bcnt = udma_tchanrt_read(uc->tchan, UDMA_TCHAN_RT_BCNT_REG);
 
+	/* Transfer is incomplete, store current residue and time stamp */
 	if (peer_bcnt < bcnt) {
 		uc->tx_drain.residue = bcnt - peer_bcnt;
-		uc->tx_drain.jiffie = jiffies;
+		uc->tx_drain.tstamp = ktime_get();
 		return false;
 	}
 
@@ -961,35 +963,59 @@  static void udma_check_tx_completion(struct work_struct *work)
 					    tx_drain.work.work);
 	bool desc_done = true;
 	u32 residue_diff;
-	unsigned long jiffie_diff, delay;
+	ktime_t time_diff;
+	unsigned long delay;
+
+	while (1) {
+		if (uc->desc) {
+			/* Get previous residue and time stamp */
+			residue_diff = uc->tx_drain.residue;
+			time_diff = uc->tx_drain.tstamp;
+			/*
+			 * Get current residue and time stamp or see if
+			 * transfer is complete
+			 */
+			desc_done = udma_is_desc_really_done(uc, uc->desc);
+		}
 
-	if (uc->desc) {
-		residue_diff = uc->tx_drain.residue;
-		jiffie_diff = uc->tx_drain.jiffie;
-		desc_done = udma_is_desc_really_done(uc, uc->desc);
-	}
-
-	if (!desc_done) {
-		jiffie_diff = uc->tx_drain.jiffie - jiffie_diff;
-		residue_diff -= uc->tx_drain.residue;
-		if (residue_diff) {
-			/* Try to guess when we should check next time */
-			residue_diff /= jiffie_diff;
-			delay = uc->tx_drain.residue / residue_diff / 3;
-			if (jiffies_to_msecs(delay) < 5)
-				delay = 0;
-		} else {
-			/* No progress, check again in 1 second  */
-			delay = HZ;
+		if (!desc_done) {
+			/*
+			 * Find the time delta and residue delta w.r.t
+			 * previous poll
+			 */
+			time_diff = ktime_sub(uc->tx_drain.tstamp,
+					      time_diff) + 1;
+			residue_diff -= uc->tx_drain.residue;
+			if (residue_diff) {
+				/*
+				 * Try to guess when we should check
+				 * next time by calculating rate at
+				 * which data is being drained at the
+				 * peer device
+				 */
+				delay = (time_diff / residue_diff) *
+					uc->tx_drain.residue;
+			} else {
+				/* No progress, check again in 1 second  */
+				schedule_delayed_work(&uc->tx_drain.work, HZ);
+				break;
+			}
+
+			usleep_range(ktime_to_us(delay),
+				     ktime_to_us(delay) + 10);
+			continue;
 		}
 
-		schedule_delayed_work(&uc->tx_drain.work, delay);
-	} else if (uc->desc) {
-		struct udma_desc *d = uc->desc;
+		if (uc->desc) {
+			struct udma_desc *d = uc->desc;
+
+			uc->bcnt += d->residue;
+			udma_start(uc);
+			vchan_cookie_complete(&d->vd);
+			break;
+		}
 
-		uc->bcnt += d->residue;
-		udma_start(uc);
-		vchan_cookie_complete(&d->vd);
+		break;
 	}
 }