diff mbox series

[06/14] ASoC: Intel: avs: Coredump and recovery flow

Message ID 20220426172346.3508411-7-cezary.rojewski@intel.com (mailing list archive)
State Superseded
Headers show
Series ASoC: Intel: avs: Driver core and PCM operations | expand

Commit Message

Cezary Rojewski April 26, 2022, 5:23 p.m. UTC
In rare occassions, under stress conditions or hardware malfunction, DSP
firmware may fail. Software is notified about such situation with
EXCEPTION_CAUGHT notification. IPC timeout is also counted as critical
device failure. More often than not, driver can recover from such
situations by performing full reset: killing and restarting ADSP.

Signed-off-by: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com>
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
---
 sound/soc/intel/Kconfig        |  1 +
 sound/soc/intel/avs/avs.h      |  4 ++
 sound/soc/intel/avs/ipc.c      | 95 +++++++++++++++++++++++++++++++++-
 sound/soc/intel/avs/messages.h |  5 ++
 4 files changed, 103 insertions(+), 2 deletions(-)

Comments

Pierre-Louis Bossart April 26, 2022, 9:53 p.m. UTC | #1
On 4/26/22 12:23, Cezary Rojewski wrote:
> In rare occassions, under stress conditions or hardware malfunction, DSP

occasions

> firmware may fail. Software is notified about such situation with
> EXCEPTION_CAUGHT notification. IPC timeout is also counted as critical
> device failure. More often than not, driver can recover from such
> situations by performing full reset: killing and restarting ADSP.
> 
> Signed-off-by: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com>
> Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
> ---
>  sound/soc/intel/Kconfig        |  1 +
>  sound/soc/intel/avs/avs.h      |  4 ++
>  sound/soc/intel/avs/ipc.c      | 95 +++++++++++++++++++++++++++++++++-
>  sound/soc/intel/avs/messages.h |  5 ++
>  4 files changed, 103 insertions(+), 2 deletions(-)
> 
> diff --git a/sound/soc/intel/Kconfig b/sound/soc/intel/Kconfig
> index c364ddf22267..05ad6bdecfc5 100644
> --- a/sound/soc/intel/Kconfig
> +++ b/sound/soc/intel/Kconfig
> @@ -218,6 +218,7 @@ config SND_SOC_INTEL_AVS
>  	select SND_HDA_EXT_CORE
>  	select SND_HDA_DSP_LOADER
>  	select SND_INTEL_NHLT
> +	select WANT_DEV_COREDUMP
>  	help
>  	  Enable support for Intel(R) cAVS 1.5 platforms with DSP
>  	  capabilities. This includes Skylake, Kabylake, Amberlake and
> diff --git a/sound/soc/intel/avs/avs.h b/sound/soc/intel/avs/avs.h
> index e628f78d1864..02c2aa1bcd5c 100644
> --- a/sound/soc/intel/avs/avs.h
> +++ b/sound/soc/intel/avs/avs.h
> @@ -42,6 +42,7 @@ struct avs_dsp_ops {
>  	int (* const load_basefw)(struct avs_dev *, struct firmware *);
>  	int (* const load_lib)(struct avs_dev *, struct firmware *, u32);
>  	int (* const transfer_mods)(struct avs_dev *, bool, struct avs_module_entry *, u32);
> +	int (* const coredump)(struct avs_dev *, union avs_notify_msg *);
>  };
>  
>  #define avs_dsp_op(adev, op, ...) \
> @@ -164,12 +165,15 @@ struct avs_ipc {
>  	struct avs_ipc_msg rx;
>  	u32 default_timeout_ms;
>  	bool ready;
> +	bool recovering;
>  
>  	bool rx_completed;
>  	spinlock_t rx_lock;
>  	struct mutex msg_mutex;
>  	struct completion done_completion;
>  	struct completion busy_completion;
> +
> +	struct work_struct recovery_work;
>  };
>  
>  #define AVS_EIPC	EREMOTEIO
> diff --git a/sound/soc/intel/avs/ipc.c b/sound/soc/intel/avs/ipc.c
> index 68aaf01edbf2..84cb411c82fa 100644
> --- a/sound/soc/intel/avs/ipc.c
> +++ b/sound/soc/intel/avs/ipc.c
> @@ -14,6 +14,87 @@
>  
>  #define AVS_IPC_TIMEOUT_MS	300
>  
> +static void avs_dsp_recovery(struct avs_dev *adev)
> +{
> +	struct avs_soc_component *acomp;
> +	unsigned int core_mask;
> +	int ret;
> +
> +	if (adev->ipc->recovering)
> +		return;
> +	adev->ipc->recovering = true;

don't you need some sort of lock to test/clear this flag?

> +
> +	mutex_lock(&adev->comp_list_mutex);
> +	/* disconnect all running streams */
> +	list_for_each_entry(acomp, &adev->comp_list, node) {
> +		struct snd_soc_pcm_runtime *rtd;
> +		struct snd_soc_card *card;
> +
> +		card = acomp->base.card;
> +		if (!card)
> +			continue;
> +
> +		for_each_card_rtds(card, rtd) {
> +			struct snd_pcm *pcm;
> +			int dir;
> +
> +			pcm = rtd->pcm;
> +			if (!pcm || rtd->dai_link->no_pcm)
> +				continue;
> +
> +			for_each_pcm_streams(dir) {
> +				struct snd_pcm_substream *substream;
> +
> +				substream = pcm->streams[dir].substream;
> +				if (!substream || !substream->runtime)
> +					continue;
> +
> +				snd_pcm_stop(substream, SNDRV_PCM_STATE_DISCONNECTED);
> +			}
> +		}
> +	}
> +	mutex_unlock(&adev->comp_list_mutex);
> +
> +	/* forcibly shutdown all cores */
> +	core_mask = GENMASK(adev->hw_cfg.dsp_cores - 1, 0);
> +	avs_dsp_core_disable(adev, core_mask);
> +
> +	/* attempt dsp reboot */
> +	ret = avs_dsp_boot_firmware(adev, true);
> +	if (ret < 0)
> +		dev_err(adev->dev, "dsp reboot failed: %d\n", ret);
> +
> +	pm_runtime_mark_last_busy(adev->dev);
> +	pm_runtime_enable(adev->dev);
> +	pm_request_autosuspend(adev->dev);

there are zero users of this routine in the entire sound/ tree, can you clarify why this is needed or what you are trying to do?

> +
> +	adev->ipc->recovering = false;
> +}
Cezary Rojewski May 1, 2022, 3:32 p.m. UTC | #2
On 2022-04-26 11:53 PM, Pierre-Louis Bossart wrote:
> On 4/26/22 12:23, Cezary Rojewski wrote:
>> In rare occassions, under stress conditions or hardware malfunction, DSP
> 
> occasions


Ack.

>> firmware may fail. Software is notified about such situation with
>> EXCEPTION_CAUGHT notification. IPC timeout is also counted as critical
>> device failure. More often than not, driver can recover from such
>> situations by performing full reset: killing and restarting ADSP.
>>
>> Signed-off-by: Amadeusz Sławiński <amadeuszx.slawinski@linux.intel.com>
>> Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
>> ---
>>   sound/soc/intel/Kconfig        |  1 +
>>   sound/soc/intel/avs/avs.h      |  4 ++
>>   sound/soc/intel/avs/ipc.c      | 95 +++++++++++++++++++++++++++++++++-
>>   sound/soc/intel/avs/messages.h |  5 ++
>>   4 files changed, 103 insertions(+), 2 deletions(-)
>>
>> diff --git a/sound/soc/intel/Kconfig b/sound/soc/intel/Kconfig
>> index c364ddf22267..05ad6bdecfc5 100644
>> --- a/sound/soc/intel/Kconfig
>> +++ b/sound/soc/intel/Kconfig
>> @@ -218,6 +218,7 @@ config SND_SOC_INTEL_AVS
>>   	select SND_HDA_EXT_CORE
>>   	select SND_HDA_DSP_LOADER
>>   	select SND_INTEL_NHLT
>> +	select WANT_DEV_COREDUMP
>>   	help
>>   	  Enable support for Intel(R) cAVS 1.5 platforms with DSP
>>   	  capabilities. This includes Skylake, Kabylake, Amberlake and
>> diff --git a/sound/soc/intel/avs/avs.h b/sound/soc/intel/avs/avs.h
>> index e628f78d1864..02c2aa1bcd5c 100644
>> --- a/sound/soc/intel/avs/avs.h
>> +++ b/sound/soc/intel/avs/avs.h
>> @@ -42,6 +42,7 @@ struct avs_dsp_ops {
>>   	int (* const load_basefw)(struct avs_dev *, struct firmware *);
>>   	int (* const load_lib)(struct avs_dev *, struct firmware *, u32);
>>   	int (* const transfer_mods)(struct avs_dev *, bool, struct avs_module_entry *, u32);
>> +	int (* const coredump)(struct avs_dev *, union avs_notify_msg *);
>>   };
>>   
>>   #define avs_dsp_op(adev, op, ...) \
>> @@ -164,12 +165,15 @@ struct avs_ipc {
>>   	struct avs_ipc_msg rx;
>>   	u32 default_timeout_ms;
>>   	bool ready;
>> +	bool recovering;
>>   
>>   	bool rx_completed;
>>   	spinlock_t rx_lock;
>>   	struct mutex msg_mutex;
>>   	struct completion done_completion;
>>   	struct completion busy_completion;
>> +
>> +	struct work_struct recovery_work;
>>   };
>>   
>>   #define AVS_EIPC	EREMOTEIO
>> diff --git a/sound/soc/intel/avs/ipc.c b/sound/soc/intel/avs/ipc.c
>> index 68aaf01edbf2..84cb411c82fa 100644
>> --- a/sound/soc/intel/avs/ipc.c
>> +++ b/sound/soc/intel/avs/ipc.c
>> @@ -14,6 +14,87 @@
>>   
>>   #define AVS_IPC_TIMEOUT_MS	300
>>   
>> +static void avs_dsp_recovery(struct avs_dev *adev)
>> +{
>> +	struct avs_soc_component *acomp;
>> +	unsigned int core_mask;
>> +	int ret;
>> +
>> +	if (adev->ipc->recovering)
>> +		return;
>> +	adev->ipc->recovering = true;
> 
> don't you need some sort of lock to test/clear this flag?

Our stress tests do not confirm this. I'll not ignore this warning 
though, will recheck with my team next week.

>> +
>> +	mutex_lock(&adev->comp_list_mutex);
>> +	/* disconnect all running streams */
>> +	list_for_each_entry(acomp, &adev->comp_list, node) {
>> +		struct snd_soc_pcm_runtime *rtd;
>> +		struct snd_soc_card *card;
>> +
>> +		card = acomp->base.card;
>> +		if (!card)
>> +			continue;
>> +
>> +		for_each_card_rtds(card, rtd) {
>> +			struct snd_pcm *pcm;
>> +			int dir;
>> +
>> +			pcm = rtd->pcm;
>> +			if (!pcm || rtd->dai_link->no_pcm)
>> +				continue;
>> +
>> +			for_each_pcm_streams(dir) {
>> +				struct snd_pcm_substream *substream;
>> +
>> +				substream = pcm->streams[dir].substream;
>> +				if (!substream || !substream->runtime)
>> +					continue;
>> +
>> +				snd_pcm_stop(substream, SNDRV_PCM_STATE_DISCONNECTED);
>> +			}
>> +		}
>> +	}
>> +	mutex_unlock(&adev->comp_list_mutex);
>> +
>> +	/* forcibly shutdown all cores */
>> +	core_mask = GENMASK(adev->hw_cfg.dsp_cores - 1, 0);
>> +	avs_dsp_core_disable(adev, core_mask);
>> +
>> +	/* attempt dsp reboot */
>> +	ret = avs_dsp_boot_firmware(adev, true);
>> +	if (ret < 0)
>> +		dev_err(adev->dev, "dsp reboot failed: %d\n", ret);
>> +
>> +	pm_runtime_mark_last_busy(adev->dev);
>> +	pm_runtime_enable(adev->dev);
>> +	pm_request_autosuspend(adev->dev);
> 
> there are zero users of this routine in the entire sound/ tree, can you clarify why this is needed or what you are trying to do?


Unsure which routine you question here. I'll assume it's 
pm_request_autosuspend().

pm_request_audiosuspend() is being used to queue suspend once recovery 
completes. Recovery takes time and during that time all communication 
attempts with DSP will yield -EPERM. PM is also blocked for the device 
with pm_runtime_disable(), performed before scheduling the recovery 
work. Once recovery completes we do not just unblock the PM as that 
would cause immediate suspend. Instead, we "refresh" the *last busy* 
status and queue the suspend operation.

>> +
>> +	adev->ipc->recovering = false;
>> +}
Pierre-Louis Bossart May 2, 2022, 1:53 p.m. UTC | #3
>>> +
>>> +    /* forcibly shutdown all cores */
>>> +    core_mask = GENMASK(adev->hw_cfg.dsp_cores - 1, 0);
>>> +    avs_dsp_core_disable(adev, core_mask);
>>> +
>>> +    /* attempt dsp reboot */
>>> +    ret = avs_dsp_boot_firmware(adev, true);
>>> +    if (ret < 0)
>>> +        dev_err(adev->dev, "dsp reboot failed: %d\n", ret);
>>> +
>>> +    pm_runtime_mark_last_busy(adev->dev);
>>> +    pm_runtime_enable(adev->dev);
>>> +    pm_request_autosuspend(adev->dev);
>>
>> there are zero users of this routine in the entire sound/ tree, can you clarify why this is needed or what you are trying to do?
> 
> 
> Unsure which routine you question here. I'll assume it's pm_request_autosuspend().
> 
> pm_request_audiosuspend() is being used to queue suspend once recovery completes. Recovery takes time and during that time all communication attempts with DSP will yield -EPERM. PM is also blocked for the device with pm_runtime_disable(), performed before scheduling the recovery work. Once recovery completes we do not just unblock the PM as that would cause immediate suspend. Instead, we "refresh" the *last busy* status and queue the suspend operation.

But since you already have autosuspend enabled, why would you need to explicitly queue a suspend operation? What happens if that last call is omitted, is there actually a functional difference?

Not objecting if that's required, but since no one else used it so far I wonder if we missed something or if this is overkill.
diff mbox series

Patch

diff --git a/sound/soc/intel/Kconfig b/sound/soc/intel/Kconfig
index c364ddf22267..05ad6bdecfc5 100644
--- a/sound/soc/intel/Kconfig
+++ b/sound/soc/intel/Kconfig
@@ -218,6 +218,7 @@  config SND_SOC_INTEL_AVS
 	select SND_HDA_EXT_CORE
 	select SND_HDA_DSP_LOADER
 	select SND_INTEL_NHLT
+	select WANT_DEV_COREDUMP
 	help
 	  Enable support for Intel(R) cAVS 1.5 platforms with DSP
 	  capabilities. This includes Skylake, Kabylake, Amberlake and
diff --git a/sound/soc/intel/avs/avs.h b/sound/soc/intel/avs/avs.h
index e628f78d1864..02c2aa1bcd5c 100644
--- a/sound/soc/intel/avs/avs.h
+++ b/sound/soc/intel/avs/avs.h
@@ -42,6 +42,7 @@  struct avs_dsp_ops {
 	int (* const load_basefw)(struct avs_dev *, struct firmware *);
 	int (* const load_lib)(struct avs_dev *, struct firmware *, u32);
 	int (* const transfer_mods)(struct avs_dev *, bool, struct avs_module_entry *, u32);
+	int (* const coredump)(struct avs_dev *, union avs_notify_msg *);
 };
 
 #define avs_dsp_op(adev, op, ...) \
@@ -164,12 +165,15 @@  struct avs_ipc {
 	struct avs_ipc_msg rx;
 	u32 default_timeout_ms;
 	bool ready;
+	bool recovering;
 
 	bool rx_completed;
 	spinlock_t rx_lock;
 	struct mutex msg_mutex;
 	struct completion done_completion;
 	struct completion busy_completion;
+
+	struct work_struct recovery_work;
 };
 
 #define AVS_EIPC	EREMOTEIO
diff --git a/sound/soc/intel/avs/ipc.c b/sound/soc/intel/avs/ipc.c
index 68aaf01edbf2..84cb411c82fa 100644
--- a/sound/soc/intel/avs/ipc.c
+++ b/sound/soc/intel/avs/ipc.c
@@ -14,6 +14,87 @@ 
 
 #define AVS_IPC_TIMEOUT_MS	300
 
+static void avs_dsp_recovery(struct avs_dev *adev)
+{
+	struct avs_soc_component *acomp;
+	unsigned int core_mask;
+	int ret;
+
+	if (adev->ipc->recovering)
+		return;
+	adev->ipc->recovering = true;
+
+	mutex_lock(&adev->comp_list_mutex);
+	/* disconnect all running streams */
+	list_for_each_entry(acomp, &adev->comp_list, node) {
+		struct snd_soc_pcm_runtime *rtd;
+		struct snd_soc_card *card;
+
+		card = acomp->base.card;
+		if (!card)
+			continue;
+
+		for_each_card_rtds(card, rtd) {
+			struct snd_pcm *pcm;
+			int dir;
+
+			pcm = rtd->pcm;
+			if (!pcm || rtd->dai_link->no_pcm)
+				continue;
+
+			for_each_pcm_streams(dir) {
+				struct snd_pcm_substream *substream;
+
+				substream = pcm->streams[dir].substream;
+				if (!substream || !substream->runtime)
+					continue;
+
+				snd_pcm_stop(substream, SNDRV_PCM_STATE_DISCONNECTED);
+			}
+		}
+	}
+	mutex_unlock(&adev->comp_list_mutex);
+
+	/* forcibly shutdown all cores */
+	core_mask = GENMASK(adev->hw_cfg.dsp_cores - 1, 0);
+	avs_dsp_core_disable(adev, core_mask);
+
+	/* attempt dsp reboot */
+	ret = avs_dsp_boot_firmware(adev, true);
+	if (ret < 0)
+		dev_err(adev->dev, "dsp reboot failed: %d\n", ret);
+
+	pm_runtime_mark_last_busy(adev->dev);
+	pm_runtime_enable(adev->dev);
+	pm_request_autosuspend(adev->dev);
+
+	adev->ipc->recovering = false;
+}
+
+static void avs_dsp_recovery_work(struct work_struct *work)
+{
+	struct avs_ipc *ipc = container_of(work, struct avs_ipc, recovery_work);
+
+	avs_dsp_recovery(to_avs_dev(ipc->dev));
+}
+
+static void avs_dsp_exception_caught(struct avs_dev *adev, union avs_notify_msg *msg)
+{
+	if (adev->ipc->recovering)
+		return;
+
+	dev_crit(adev->dev, "communication severed, rebooting dsp..\n");
+
+	/* Re-enabled on recovery completion. */
+	avs_ipc_block(adev->ipc);
+	pm_runtime_disable(adev->dev);
+
+	/* Process received notification. */
+	avs_dsp_op(adev, coredump, msg);
+
+	schedule_work(&adev->ipc->recovery_work);
+}
+
 static void avs_dsp_receive_rx(struct avs_dev *adev, u64 header)
 {
 	struct avs_ipc *ipc = adev->ipc;
@@ -57,6 +138,9 @@  static void avs_dsp_process_notification(struct avs_dev *adev, u64 header)
 		data_size = sizeof(struct avs_notify_res_data);
 		break;
 
+	case AVS_NOTIFY_EXCEPTION_CAUGHT:
+		break;
+
 	case AVS_NOTIFY_MODULE_EVENT:
 		/* To know the total payload size, header needs to be read first. */
 		memcpy_fromio(&mod_data, avs_uplink_addr(adev), sizeof(mod_data));
@@ -84,6 +168,10 @@  static void avs_dsp_process_notification(struct avs_dev *adev, u64 header)
 		complete(&adev->fw_ready);
 		break;
 
+	case AVS_NOTIFY_EXCEPTION_CAUGHT:
+		avs_dsp_exception_caught(adev, &msg);
+		break;
+
 	default:
 		break;
 	}
@@ -278,9 +366,10 @@  static int avs_dsp_do_send_msg(struct avs_dev *adev, struct avs_ipc_msg *request
 	ret = avs_ipc_wait_busy_completion(ipc, timeout);
 	if (ret) {
 		if (ret == -ETIMEDOUT) {
-			dev_crit(adev->dev, "communication severed: %d, rebooting dsp..\n", ret);
+			union avs_notify_msg msg = AVS_NOTIFICATION(EXCEPTION_CAUGHT);
 
-			avs_ipc_block(ipc);
+			/* Same treatment as on exception, just stack_dump=0. */
+			avs_dsp_exception_caught(adev, &msg);
 		}
 		goto exit;
 	}
@@ -368,6 +457,7 @@  int avs_ipc_init(struct avs_ipc *ipc, struct device *dev)
 	ipc->dev = dev;
 	ipc->ready = false;
 	ipc->default_timeout_ms = AVS_IPC_TIMEOUT_MS;
+	INIT_WORK(&ipc->recovery_work, avs_dsp_recovery_work);
 	init_completion(&ipc->done_completion);
 	init_completion(&ipc->busy_completion);
 	spin_lock_init(&ipc->rx_lock);
@@ -379,4 +469,5 @@  int avs_ipc_init(struct avs_ipc *ipc, struct device *dev)
 void avs_ipc_block(struct avs_ipc *ipc)
 {
 	ipc->ready = false;
+	cancel_work_sync(&ipc->recovery_work);
 }
diff --git a/sound/soc/intel/avs/messages.h b/sound/soc/intel/avs/messages.h
index 0395dd7150eb..94875a153124 100644
--- a/sound/soc/intel/avs/messages.h
+++ b/sound/soc/intel/avs/messages.h
@@ -187,6 +187,7 @@  enum avs_notify_msg_type {
 	AVS_NOTIFY_PHRASE_DETECTED = 4,
 	AVS_NOTIFY_RESOURCE_EVENT = 5,
 	AVS_NOTIFY_FW_READY = 8,
+	AVS_NOTIFY_EXCEPTION_CAUGHT = 10,
 	AVS_NOTIFY_MODULE_EVENT = 12,
 };
 
@@ -205,6 +206,10 @@  union avs_notify_msg {
 		};
 		union {
 			u32 val;
+			struct {
+				u32 core_id:2;
+				u32 stack_dump_size:16;
+			} coredump;
 		} ext;
 	};
 } __packed;