diff mbox series

ALSA: pcm: fix wait_time calculations

Message ID 20230405201219.2197774-1-oswald.buddenhagen@gmx.de (mailing list archive)
State New, archived
Headers show
Series ALSA: pcm: fix wait_time calculations | expand

Commit Message

Oswald Buddenhagen April 5, 2023, 8:12 p.m. UTC
... in wait_for_avail() and snd_pcm_drain().

t was calculated in seconds, so it would be pretty much always zero, to
be subsequently de-facto ignored due to being max(t, 10)'d. And then it
(i.e., 10) would be treated as secs, which doesn't seem right.

However, fixing it to properly calculate msecs would potentially cause
timeouts when using twice the period size for the default timeout (which
seems reasonable to me), so instead use the buffer size plus 10 percent
to be on the safe side ... but that still seems insufficient, presumably
because the hardware typically needs a moment to fire up. To compensate
for this, we up the minimal timeout to 100ms, which is still two orders
of magnitude less than the bogus minimum.

substream->wait_time was also misinterpreted as jiffies, despite being
documented as being in msecs. Only the soc/sof driver sets it - to 500,
which looks very much like msecs were intended.

Speaking of which, shouldn't snd_pcm_drain() also use substream->
wait_time?

As a drive-by, make the debug messages on timeout less confusing.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
---
 sound/core/pcm_lib.c    | 11 +++++------
 sound/core/pcm_native.c |  8 ++++----
 2 files changed, 9 insertions(+), 10 deletions(-)

Comments

Takashi Iwai April 6, 2023, 3:01 p.m. UTC | #1
On Wed, 05 Apr 2023 22:12:19 +0200,
Oswald Buddenhagen wrote:
> 
> ... in wait_for_avail() and snd_pcm_drain().
> 
> t was calculated in seconds, so it would be pretty much always zero, to
> be subsequently de-facto ignored due to being max(t, 10)'d. And then it
> (i.e., 10) would be treated as secs, which doesn't seem right.
> 
> However, fixing it to properly calculate msecs would potentially cause
> timeouts when using twice the period size for the default timeout (which
> seems reasonable to me), so instead use the buffer size plus 10 percent
> to be on the safe side ... but that still seems insufficient, presumably
> because the hardware typically needs a moment to fire up. To compensate
> for this, we up the minimal timeout to 100ms, which is still two orders
> of magnitude less than the bogus minimum.
> 
> substream->wait_time was also misinterpreted as jiffies, despite being
> documented as being in msecs. Only the soc/sof driver sets it - to 500,
> which looks very much like msecs were intended.
> 
> Speaking of which, shouldn't snd_pcm_drain() also use substream->
> wait_time?

Yes, and unifying the code might make more sense.

> As a drive-by, make the debug messages on timeout less confusing.
> 
> Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>

I applied this patch as is now to for-next branch.


thanks,

Takashi
Jon Hunter June 12, 2023, 12:18 p.m. UTC | #2
Hi Oswald,

On 05/04/2023 21:12, Oswald Buddenhagen wrote:
> ... in wait_for_avail() and snd_pcm_drain().
> 
> t was calculated in seconds, so it would be pretty much always zero, to
> be subsequently de-facto ignored due to being max(t, 10)'d. And then it
> (i.e., 10) would be treated as secs, which doesn't seem right.
> 
> However, fixing it to properly calculate msecs would potentially cause
> timeouts when using twice the period size for the default timeout (which
> seems reasonable to me), so instead use the buffer size plus 10 percent
> to be on the safe side ... but that still seems insufficient, presumably
> because the hardware typically needs a moment to fire up. To compensate
> for this, we up the minimal timeout to 100ms, which is still two orders
> of magnitude less than the bogus minimum.
> 
> substream->wait_time was also misinterpreted as jiffies, despite being
> documented as being in msecs. Only the soc/sof driver sets it - to 500,
> which looks very much like msecs were intended.
> 
> Speaking of which, shouldn't snd_pcm_drain() also use substream->
> wait_time?
> 
> As a drive-by, make the debug messages on timeout less confusing.
> 
> Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
> ---
>   sound/core/pcm_lib.c    | 11 +++++------
>   sound/core/pcm_native.c |  8 ++++----
>   2 files changed, 9 insertions(+), 10 deletions(-)
> 
> diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
> index 02fd65993e7e..af1eb136feb0 100644
> --- a/sound/core/pcm_lib.c
> +++ b/sound/core/pcm_lib.c
> @@ -1878,15 +1878,14 @@ static int wait_for_avail(struct snd_pcm_substream *substream,
>   		if (substream->wait_time) {
>   			wait_time = substream->wait_time;
>   		} else {
> -			wait_time = 10;
> +			wait_time = 100;
>   
>   			if (runtime->rate) {
> -				long t = runtime->period_size * 2 /
> -					 runtime->rate;
> +				long t = runtime->buffer_size * 1100 / runtime->rate;
>   				wait_time = max(t, wait_time);
>   			}
> -			wait_time = msecs_to_jiffies(wait_time * 1000);
>   		}
> +		wait_time = msecs_to_jiffies(wait_time);
>   	}
>   
>   	for (;;) {
> @@ -1934,8 +1933,8 @@ static int wait_for_avail(struct snd_pcm_substream *substream,
>   		}
>   		if (!tout) {
>   			pcm_dbg(substream->pcm,
> -				"%s write error (DMA or IRQ trouble?)\n",
> -				is_playback ? "playback" : "capture");
> +				"%s timeout (DMA or IRQ trouble?)\n",
> +				is_playback ? "playback write" : "capture read");
>   			err = -EIO;
>   			break;
>   		}
> diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
> index 331380c2438b..94185267a7b9 100644
> --- a/sound/core/pcm_native.c
> +++ b/sound/core/pcm_native.c
> @@ -2159,12 +2159,12 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream,
>   		if (runtime->no_period_wakeup)
>   			tout = MAX_SCHEDULE_TIMEOUT;
>   		else {
> -			tout = 10;
> +			tout = 100;
>   			if (runtime->rate) {
> -				long t = runtime->period_size * 2 / runtime->rate;
> +				long t = runtime->buffer_size * 1100 / runtime->rate;
>   				tout = max(t, tout);
>   			}
> -			tout = msecs_to_jiffies(tout * 1000);
> +			tout = msecs_to_jiffies(tout);
>   		}
>   		tout = schedule_timeout(tout);
>   
> @@ -2187,7 +2187,7 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream,
>   				result = -ESTRPIPE;
>   			else {
>   				dev_dbg(substream->pcm->card->dev,
> -					"playback drain error (DMA or IRQ trouble?)\n");
> +					"playback drain timeout (DMA or IRQ trouble?)\n");
>   				snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP);
>   				result = -EIO;
>   			}


Sorry for not catching this sooner, but I have just noticed that one of 
our audio tests for Tegra is failing on v6.4-rc and bisect is pointing 
to this commit. Reverting this on top of the current mainline fixes it.

Sameer, any thoughts on why this change would break the volume control 
test for Tegra194? After this patch is applied the test fails on the 2nd 
test for MVC1 ...

[ 1314 ms] Starting mvc_run...
[ 3616 ms] Starting mvc_check_configs...
[ 3921 ms] Starting mvc_check_sha1sum...
[ 4379 ms] Test with gain of 40 for Linear [ 4404 ms] curve type with 32 
bits 2 channels using MVC1 ... PASSED!
[ 4437 ms] Starting mvc_run...
[ 6720 ms] Starting mvc_check_configs...
[ 7029 ms] Starting mvc_check_sha1sum...
[ 7463 ms] Test with gain of 40 for Linear [ 7490 ms] curve type with 32 
bits 2 channels using MVC2 ... PASSED!
[ 7541 ms] Starting mvc_run...
[ 9899 ms] Starting mvc_check_configs...
[ 10255 ms] Starting mvc_check_sha1sum...
[ 10384 ms] Error: Sha1sum check FAILED!
[ 10590 ms] Error: Incorrect output gain dB against [ 10613 ms] expected 
gain -122dB for channel 1
[ 10701 ms] Test with gain of -119 for Linear [ 10730 ms] curve type 
with 32 bits 2 channels using MVC1 ... FAILED!

Jon
Jon Hunter June 12, 2023, 1:16 p.m. UTC | #3
On 12/06/2023 13:18, Jon Hunter wrote:
> Hi Oswald,
> 
> On 05/04/2023 21:12, Oswald Buddenhagen wrote:
>> ... in wait_for_avail() and snd_pcm_drain().
>>
>> t was calculated in seconds, so it would be pretty much always zero, to
>> be subsequently de-facto ignored due to being max(t, 10)'d. And then it
>> (i.e., 10) would be treated as secs, which doesn't seem right.
>>
>> However, fixing it to properly calculate msecs would potentially cause
>> timeouts when using twice the period size for the default timeout (which
>> seems reasonable to me), so instead use the buffer size plus 10 percent
>> to be on the safe side ... but that still seems insufficient, presumably
>> because the hardware typically needs a moment to fire up. To compensate
>> for this, we up the minimal timeout to 100ms, which is still two orders
>> of magnitude less than the bogus minimum.
>>
>> substream->wait_time was also misinterpreted as jiffies, despite being
>> documented as being in msecs. Only the soc/sof driver sets it - to 500,
>> which looks very much like msecs were intended.
>>
>> Speaking of which, shouldn't snd_pcm_drain() also use substream->
>> wait_time?
>>
>> As a drive-by, make the debug messages on timeout less confusing.
>>
>> Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
>> ---
>>   sound/core/pcm_lib.c    | 11 +++++------
>>   sound/core/pcm_native.c |  8 ++++----
>>   2 files changed, 9 insertions(+), 10 deletions(-)
>>
>> diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
>> index 02fd65993e7e..af1eb136feb0 100644
>> --- a/sound/core/pcm_lib.c
>> +++ b/sound/core/pcm_lib.c
>> @@ -1878,15 +1878,14 @@ static int wait_for_avail(struct 
>> snd_pcm_substream *substream,
>>           if (substream->wait_time) {
>>               wait_time = substream->wait_time;
>>           } else {
>> -            wait_time = 10;
>> +            wait_time = 100;
>>               if (runtime->rate) {
>> -                long t = runtime->period_size * 2 /
>> -                     runtime->rate;
>> +                long t = runtime->buffer_size * 1100 / runtime->rate;
>>                   wait_time = max(t, wait_time);
>>               }
>> -            wait_time = msecs_to_jiffies(wait_time * 1000);
>>           }
>> +        wait_time = msecs_to_jiffies(wait_time);
>>       }
>>       for (;;) {
>> @@ -1934,8 +1933,8 @@ static int wait_for_avail(struct 
>> snd_pcm_substream *substream,
>>           }
>>           if (!tout) {
>>               pcm_dbg(substream->pcm,
>> -                "%s write error (DMA or IRQ trouble?)\n",
>> -                is_playback ? "playback" : "capture");
>> +                "%s timeout (DMA or IRQ trouble?)\n",
>> +                is_playback ? "playback write" : "capture read");
>>               err = -EIO;
>>               break;
>>           }
>> diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
>> index 331380c2438b..94185267a7b9 100644
>> --- a/sound/core/pcm_native.c
>> +++ b/sound/core/pcm_native.c
>> @@ -2159,12 +2159,12 @@ static int snd_pcm_drain(struct 
>> snd_pcm_substream *substream,
>>           if (runtime->no_period_wakeup)
>>               tout = MAX_SCHEDULE_TIMEOUT;
>>           else {
>> -            tout = 10;
>> +            tout = 100;
>>               if (runtime->rate) {
>> -                long t = runtime->period_size * 2 / runtime->rate;
>> +                long t = runtime->buffer_size * 1100 / runtime->rate;
>>                   tout = max(t, tout);
>>               }
>> -            tout = msecs_to_jiffies(tout * 1000);
>> +            tout = msecs_to_jiffies(tout);
>>           }
>>           tout = schedule_timeout(tout);
>> @@ -2187,7 +2187,7 @@ static int snd_pcm_drain(struct 
>> snd_pcm_substream *substream,
>>                   result = -ESTRPIPE;
>>               else {
>>                   dev_dbg(substream->pcm->card->dev,
>> -                    "playback drain error (DMA or IRQ trouble?)\n");
>> +                    "playback drain timeout (DMA or IRQ trouble?)\n");
>>                   snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP);
>>                   result = -EIO;
>>               }
> 
> 
> Sorry for not catching this sooner, but I have just noticed that one of 
> our audio tests for Tegra is failing on v6.4-rc and bisect is pointing 
> to this commit. Reverting this on top of the current mainline fixes it.
> 
> Sameer, any thoughts on why this change would break the volume control 
> test for Tegra194? After this patch is applied the test fails on the 2nd 
> test for MVC1 ...
> 
> [ 1314 ms] Starting mvc_run...
> [ 3616 ms] Starting mvc_check_configs...
> [ 3921 ms] Starting mvc_check_sha1sum...
> [ 4379 ms] Test with gain of 40 for Linear [ 4404 ms] curve type with 32 
> bits 2 channels using MVC1 ... PASSED!
> [ 4437 ms] Starting mvc_run...
> [ 6720 ms] Starting mvc_check_configs...
> [ 7029 ms] Starting mvc_check_sha1sum...
> [ 7463 ms] Test with gain of 40 for Linear [ 7490 ms] curve type with 32 
> bits 2 channels using MVC2 ... PASSED!
> [ 7541 ms] Starting mvc_run...
> [ 9899 ms] Starting mvc_check_configs...
> [ 10255 ms] Starting mvc_check_sha1sum...
> [ 10384 ms] Error: Sha1sum check FAILED!
> [ 10590 ms] Error: Incorrect output gain dB against [ 10613 ms] expected 
> gain -122dB for channel 1
> [ 10701 ms] Test with gain of -119 for Linear [ 10730 ms] curve type 
> with 32 bits 2 channels using MVC1 ... FAILED!


If I enable the debug prints, I do see the following messages ...

  tegra-audio-graph-card sound: capture read timeout (DMA or IRQ trouble?)
  tegra-audio-graph-card sound: capture read timeout (DMA or IRQ trouble?)
  tegra-audio-graph-card sound: capture read timeout (DMA or IRQ trouble?)
  tegra-audio-graph-card sound: capture read timeout (DMA or IRQ trouble?)


Jon
Oswald Buddenhagen June 12, 2023, 2:28 p.m. UTC | #4
On Mon, Jun 12, 2023 at 02:16:15PM +0100, Jon Hunter wrote:
>On 12/06/2023 13:18, Jon Hunter wrote:
>> On 05/04/2023 21:12, Oswald Buddenhagen wrote:
>>> ... in wait_for_avail() and snd_pcm_drain().
>> 
>> Sorry for not catching this sooner, but I have just noticed that one of 
>> our audio tests for Tegra is failing on v6.4-rc and bisect is pointing 
>> to this commit. Reverting this on top of the current mainline fixes it.
>> 
>If I enable the debug prints, I do see the following messages ...
>
>  tegra-audio-graph-card sound: capture read timeout (DMA or IRQ trouble?)
>
yes, this is the kind of fallout one would expect from this change, as 
it significantly shortened the effective timeout under most 
circumstances.

first check that there isn't a genuine underlying bug, that is, that the 
unusually slow timings match expectations.

if everything looks right, then properly codify the timeout in the 
driver by setting substream->wait_time as required.
the lazy approach of more or less restoring the previous status quo 
would be setting it to 10000 in the `open` callback.
fwiw, soc/sof sets it to 500, which may actually be a bad idea (it's 
short enough that a very long period time would exceed it, if such is 
permitted). and it's not obvious why it does that.

regards
Jon Hunter June 12, 2023, 4:03 p.m. UTC | #5
On 12/06/2023 15:28, Oswald Buddenhagen wrote:
> On Mon, Jun 12, 2023 at 02:16:15PM +0100, Jon Hunter wrote:
>> On 12/06/2023 13:18, Jon Hunter wrote:
>>> On 05/04/2023 21:12, Oswald Buddenhagen wrote:
>>>> ... in wait_for_avail() and snd_pcm_drain().
>>>
>>> Sorry for not catching this sooner, but I have just noticed that one 
>>> of our audio tests for Tegra is failing on v6.4-rc and bisect is 
>>> pointing to this commit. Reverting this on top of the current 
>>> mainline fixes it.
>>>
>> If I enable the debug prints, I do see the following messages ...
>>
>>  tegra-audio-graph-card sound: capture read timeout (DMA or IRQ trouble?)
>>
> yes, this is the kind of fallout one would expect from this change, as 
> it significantly shortened the effective timeout under most circumstances.
> 
> first check that there isn't a genuine underlying bug, that is, that the 
> unusually slow timings match expectations.
> 
> if everything looks right, then properly codify the timeout in the 
> driver by setting substream->wait_time as required.
> the lazy approach of more or less restoring the previous status quo 
> would be setting it to 10000 in the `open` callback.
> fwiw, soc/sof sets it to 500, which may actually be a bad idea (it's 
> short enough that a very long period time would exceed it, if such is 
> permitted). and it's not obvious why it does that.


Yes I can see now that reducing the timeout is causing the problem here. 
I can try a few settings to see what works.

Thanks
Jon
diff mbox series

Patch

diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index 02fd65993e7e..af1eb136feb0 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -1878,15 +1878,14 @@  static int wait_for_avail(struct snd_pcm_substream *substream,
 		if (substream->wait_time) {
 			wait_time = substream->wait_time;
 		} else {
-			wait_time = 10;
+			wait_time = 100;
 
 			if (runtime->rate) {
-				long t = runtime->period_size * 2 /
-					 runtime->rate;
+				long t = runtime->buffer_size * 1100 / runtime->rate;
 				wait_time = max(t, wait_time);
 			}
-			wait_time = msecs_to_jiffies(wait_time * 1000);
 		}
+		wait_time = msecs_to_jiffies(wait_time);
 	}
 
 	for (;;) {
@@ -1934,8 +1933,8 @@  static int wait_for_avail(struct snd_pcm_substream *substream,
 		}
 		if (!tout) {
 			pcm_dbg(substream->pcm,
-				"%s write error (DMA or IRQ trouble?)\n",
-				is_playback ? "playback" : "capture");
+				"%s timeout (DMA or IRQ trouble?)\n",
+				is_playback ? "playback write" : "capture read");
 			err = -EIO;
 			break;
 		}
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 331380c2438b..94185267a7b9 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2159,12 +2159,12 @@  static int snd_pcm_drain(struct snd_pcm_substream *substream,
 		if (runtime->no_period_wakeup)
 			tout = MAX_SCHEDULE_TIMEOUT;
 		else {
-			tout = 10;
+			tout = 100;
 			if (runtime->rate) {
-				long t = runtime->period_size * 2 / runtime->rate;
+				long t = runtime->buffer_size * 1100 / runtime->rate;
 				tout = max(t, tout);
 			}
-			tout = msecs_to_jiffies(tout * 1000);
+			tout = msecs_to_jiffies(tout);
 		}
 		tout = schedule_timeout(tout);
 
@@ -2187,7 +2187,7 @@  static int snd_pcm_drain(struct snd_pcm_substream *substream,
 				result = -ESTRPIPE;
 			else {
 				dev_dbg(substream->pcm->card->dev,
-					"playback drain error (DMA or IRQ trouble?)\n");
+					"playback drain timeout (DMA or IRQ trouble?)\n");
 				snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP);
 				result = -EIO;
 			}