diff mbox series

ASoC: dpcm: fix race condition to dpcm links in dpcm_be_dai_trigger

Message ID 002f01d7b4f5$c030f4a0$4092dde0$@samsung.com (mailing list archive)
State New, archived
Headers show
Series ASoC: dpcm: fix race condition to dpcm links in dpcm_be_dai_trigger | expand

Commit Message

Gyeongtaek Lee Sept. 29, 2021, 5:49 a.m. UTC
If routing change and underrun stop is run at the same time,
data abort can be occurred by the following sequence.

CPU0: Processing underrun 	CPU1: Processing routing change
dpcm_be_dai_trigger():		dpcm_be_disconnect():

for_each_dpcm_be(fe, stream, dpcm) {

				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
				list_del(&dpcm->list_be);
				list_del(&dpcm->list_fe);
				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
				kfree(dpcm);

struct snd_soc_pcm_runtime *be = dpcm->be; <-- Accessing freed memory

To prevent this situation, dpcm_lock is needed during accessing
the lists for dpcm links.

Signed-off-by: Gyeongtaek Lee <gt82.lee@samsung.com>
Cc: stable@vger.kernel.org
---
 sound/soc/soc-pcm.c | 53 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 3 deletions(-)


base-commit: 4ac6d90867a4de2e12117e755dbd76e08d88697f

Comments

Pierre-Louis Bossart Sept. 29, 2021, 2:11 p.m. UTC | #1
On 9/29/21 12:49 AM, Gyeongtaek Lee wrote:
> If routing change and underrun stop is run at the same time,
> data abort can be occurred by the following sequence.
> 
> CPU0: Processing underrun 	CPU1: Processing routing change
> dpcm_be_dai_trigger():		dpcm_be_disconnect():
> 
> for_each_dpcm_be(fe, stream, dpcm) {
> 
> 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
> 				list_del(&dpcm->list_be);
> 				list_del(&dpcm->list_fe);
> 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
> 				kfree(dpcm);
> 
> struct snd_soc_pcm_runtime *be = dpcm->be; <-- Accessing freed memory
> 
> To prevent this situation, dpcm_lock is needed during accessing
> the lists for dpcm links.

Isn't there still a possible inconsistency here introduced by the
duplication of the BE list?

You protect the list creation, but before you use it in
dpcm_be_dai_trigger(), there's a time window where the function could be
pre-empted and a disconnect event might have happened. As a result you
could trigger a BE that's no longer connected.

What you identified as a race is likely valid, but how to fix it isn't
clear to me - the DPCM code isn't self-explanatory at all with its use
in various places of the dpcm_lock spinlock, the pcm mutex, the card mutex.

Ideally we would need to find a way to prevent changes in connections
while we are doing the triggers, but triggers can take a bit of time if
they involve any sort of communication over a bus. I really wonder if
this dpcm_lock should be a mutex and if the model for DPCM really
involves interrupt contexts as the irqsave/irqrestore mentions hint at.

> Signed-off-by: Gyeongtaek Lee <gt82.lee@samsung.com>
> Cc: stable@vger.kernel.org
> ---
>  sound/soc/soc-pcm.c | 53 ++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 50 insertions(+), 3 deletions(-)
> 
> diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
> index 48f71bb81a2f..df2cd4c0dabe 100644
> --- a/sound/soc/soc-pcm.c
> +++ b/sound/soc/soc-pcm.c
> @@ -1993,17 +1993,63 @@ static int dpcm_fe_dai_hw_params(struct snd_pcm_substream *substream,
>  	return ret;
>  }
>  
> +struct dpcm_be_list {
> +	unsigned int num;
> +	struct snd_soc_pcm_runtime *be[];
> +};
> +
> +static int dpcm_create_be_list(struct snd_soc_pcm_runtime *fe, int stream,
> +		struct dpcm_be_list **be_list)
> +{
> +	struct snd_soc_dpcm *dpcm;
> +	struct dpcm_be_list *be;
> +	int size = 0;
> +	int ret = 0;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&fe->card->dpcm_lock, flags);
> +
> +	for_each_dpcm_be(fe, stream, dpcm)
> +		size++;
> +
> +	be = kzalloc(struct_size(be, be, size), GFP_ATOMIC);
> +	if (!be) {
> +		ret = -ENOMEM;
> +	} else {
> +		unsigned int i = 0;
> +
> +		for_each_dpcm_be(fe, stream, dpcm)
> +			be->be[i++] = dpcm->be;
> +
> +		*be_list = be;
> +	}
> +
> +	spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
> +
> +	return ret;
> +}
> +
> +static void dpcm_free_be_list(struct dpcm_be_list *be_list)
> +{
> +	kfree(be_list);
> +}
> +
>  int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
>  			       int cmd)
>  {
>  	struct snd_soc_pcm_runtime *be;
> -	struct snd_soc_dpcm *dpcm;
> +	struct dpcm_be_list *be_list;
>  	int ret = 0;
> +	int i;
>  
> -	for_each_dpcm_be(fe, stream, dpcm) {
> +	ret = dpcm_create_be_list(fe, stream, &be_list);
> +	if (ret < 0)
> +		return ret;
> +
> +	for(i = 0; i < be_list->num; i++) {
>  		struct snd_pcm_substream *be_substream;
>  
> -		be = dpcm->be;
> +		be = be_list->be[i];
>  		be_substream = snd_soc_dpcm_get_substream(be, stream);
>  
>  		/* is this op for this BE ? */
> @@ -2092,6 +2138,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
>  	if (ret < 0)
>  		dev_err(fe->dev, "ASoC: %s() failed at %s (%d)\n",
>  			__func__, be->dai_link->name, ret);
> +	dpcm_free_be_list(be_list);
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(dpcm_be_dai_trigger);
> 
> base-commit: 4ac6d90867a4de2e12117e755dbd76e08d88697f
>
Pierre-Louis Bossart Sept. 29, 2021, 9:01 p.m. UTC | #2
>> If routing change and underrun stop is run at the same time,
>> data abort can be occurred by the following sequence.
>>
>> CPU0: Processing underrun 	CPU1: Processing routing change
>> dpcm_be_dai_trigger():		dpcm_be_disconnect():
>>
>> for_each_dpcm_be(fe, stream, dpcm) {
>>
>> 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
>> 				list_del(&dpcm->list_be);
>> 				list_del(&dpcm->list_fe);
>> 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
>> 				kfree(dpcm);
>>
>> struct snd_soc_pcm_runtime *be = dpcm->be; <-- Accessing freed memory
>>
>> To prevent this situation, dpcm_lock is needed during accessing
>> the lists for dpcm links.
> 
> Isn't there still a possible inconsistency here introduced by the
> duplication of the BE list?
> 
> You protect the list creation, but before you use it in
> dpcm_be_dai_trigger(), there's a time window where the function could be
> pre-empted and a disconnect event might have happened. As a result you
> could trigger a BE that's no longer connected.
> 
> What you identified as a race is likely valid, but how to fix it isn't
> clear to me - the DPCM code isn't self-explanatory at all with its use
> in various places of the dpcm_lock spinlock, the pcm mutex, the card mutex.
> 
> Ideally we would need to find a way to prevent changes in connections
> while we are doing the triggers, but triggers can take a bit of time if
> they involve any sort of communication over a bus. I really wonder if
> this dpcm_lock should be a mutex and if the model for DPCM really
> involves interrupt contexts as the irqsave/irqrestore mentions hint at.

To follow-up on this, I started experimenting with a replacement of the
'dpcm_lock' spinlock with a 'dpcm_mutex', see
https://github.com/thesofproject/linux/pull/3186

If we combine both of our results, the 'right' solution might be to take
this mutex before every use of for_each_dpcm_be(), and unlock it at the
end of the loop, which additional changes to avoid re-taking the same
mutex in helper functions.

there's still a part in DPCM that I can't figure out, there is an
elaborate trick with an explicit comment

	/* if FE's runtime_update is already set, we're in race;
	 * process this trigger later at exit
	 */

Which looks like a missing mutex somewhere, or an overkill solution that
might never be needed.
Gyeongtaek Lee Sept. 30, 2021, 3:48 a.m. UTC | #3
>>> If routing change and underrun stop is run at the same time,
>>> data abort can be occurred by the following sequence.
>>>
>>> CPU0: Processing underrun 	CPU1: Processing routing change
>>> dpcm_be_dai_trigger():		dpcm_be_disconnect():
>>>
>>> for_each_dpcm_be(fe, stream, dpcm) {
>>>
>>> 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
>>> 				list_del(&dpcm->list_be);
>>> 				list_del(&dpcm->list_fe);
>>> 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
>>> 				kfree(dpcm);
>>>
>>> struct snd_soc_pcm_runtime *be = dpcm->be; <-- Accessing freed memory
>>>
>>> To prevent this situation, dpcm_lock is needed during accessing
>>> the lists for dpcm links.
>> 
>> Isn't there still a possible inconsistency here introduced by the
>> duplication of the BE list?
>> 
>> You protect the list creation, but before you use it in
>> dpcm_be_dai_trigger(), there's a time window where the function could be
>> pre-empted and a disconnect event might have happened. As a result you
>> could trigger a BE that's no longer connected.
>> 
>> What you identified as a race is likely valid, but how to fix it isn't
>> clear to me - the DPCM code isn't self-explanatory at all with its use
>> in various places of the dpcm_lock spinlock, the pcm mutex, the card mutex.
>> 
>> Ideally we would need to find a way to prevent changes in connections
>> while we are doing the triggers, but triggers can take a bit of time if
>> they involve any sort of communication over a bus. I really wonder if
>> this dpcm_lock should be a mutex and if the model for DPCM really
>> involves interrupt contexts as the irqsave/irqrestore mentions hint at.
>
>To follow-up on this, I started experimenting with a replacement of the
>'dpcm_lock' spinlock with a 'dpcm_mutex', see
>https://protect2.fireeye.com/v1/url?k=bdfd74d3-e2664dcc-bdfcff9c-000babdfecba-6f3671279e770f0b&q=1&e=7fdf074e-2aa1-44f0-bd52-58f2d26c9bfb&u=https%3A%2F%2Fgithub.com%2Fthesofproject%2Flinux%2Fpull%2F3186
>
>If we combine both of our results, the 'right' solution might be to take
>this mutex before every use of for_each_dpcm_be(), and unlock it at the
>end of the loop, which additional changes to avoid re-taking the same
>mutex in helper functions.
>
>there's still a part in DPCM that I can't figure out, there is an
>elaborate trick with an explicit comment
>
>	/* if FE's runtime_update is already set, we're in race;
>	 * process this trigger later at exit
>	 */
>
>Which looks like a missing mutex somewhere, or an overkill solution that
>might never be needed.
>
You are right.
This patch can't resolve inconsistency problem completely.
I thought that even part of the problem can be resolved by this patch and
it could help some other developers and me also.
And I also thought that invalid trigger on disconnected BE DAI can be protected
by the state check in the trigger function like the below.

int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
			       int cmd)
{
	struct snd_soc_dpcm *dpcm;
	int ret = 0;

	for_each_dpcm_be(fe, stream, dpcm) {
.......
		switch (cmd) {
		case SNDRV_PCM_TRIGGER_START:
			/* Following if statement protect invalid control. */
			if ((be->dpcm[stream].state != SND_SOC_DPCM_STATE_PREPARE) &&
			    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_STOP) &&
			    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED))
				continue;

			ret = dpcm_do_trigger(dpcm, be_substream, cmd);

I really appreciate that there is a project about this problem already.
But if the project needs more time to be merged into the mainline,
I think that this patch can be used until the project is merged.
If you don't mind, would you reconsider this patch one more time?

Thank you,
Gyeongtaek Lee.
diff mbox series

Patch

diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index 48f71bb81a2f..df2cd4c0dabe 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -1993,17 +1993,63 @@  static int dpcm_fe_dai_hw_params(struct snd_pcm_substream *substream,
 	return ret;
 }
 
+struct dpcm_be_list {
+	unsigned int num;
+	struct snd_soc_pcm_runtime *be[];
+};
+
+static int dpcm_create_be_list(struct snd_soc_pcm_runtime *fe, int stream,
+		struct dpcm_be_list **be_list)
+{
+	struct snd_soc_dpcm *dpcm;
+	struct dpcm_be_list *be;
+	int size = 0;
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&fe->card->dpcm_lock, flags);
+
+	for_each_dpcm_be(fe, stream, dpcm)
+		size++;
+
+	be = kzalloc(struct_size(be, be, size), GFP_ATOMIC);
+	if (!be) {
+		ret = -ENOMEM;
+	} else {
+		unsigned int i = 0;
+
+		for_each_dpcm_be(fe, stream, dpcm)
+			be->be[i++] = dpcm->be;
+
+		*be_list = be;
+	}
+
+	spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+
+	return ret;
+}
+
+static void dpcm_free_be_list(struct dpcm_be_list *be_list)
+{
+	kfree(be_list);
+}
+
 int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 			       int cmd)
 {
 	struct snd_soc_pcm_runtime *be;
-	struct snd_soc_dpcm *dpcm;
+	struct dpcm_be_list *be_list;
 	int ret = 0;
+	int i;
 
-	for_each_dpcm_be(fe, stream, dpcm) {
+	ret = dpcm_create_be_list(fe, stream, &be_list);
+	if (ret < 0)
+		return ret;
+
+	for(i = 0; i < be_list->num; i++) {
 		struct snd_pcm_substream *be_substream;
 
-		be = dpcm->be;
+		be = be_list->be[i];
 		be_substream = snd_soc_dpcm_get_substream(be, stream);
 
 		/* is this op for this BE ? */
@@ -2092,6 +2138,7 @@  int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 	if (ret < 0)
 		dev_err(fe->dev, "ASoC: %s() failed at %s (%d)\n",
 			__func__, be->dai_link->name, ret);
+	dpcm_free_be_list(be_list);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(dpcm_be_dai_trigger);