diff mbox series

[8/8] scsi: ufs: Fix deadlock between power management and error handler

Message ID 20220923201138.2113123-9-bvanassche@acm.org (mailing list archive)
State Superseded
Headers show
Series Fix a deadlock in the UFS driver | expand

Commit Message

Bart Van Assche Sept. 23, 2022, 8:11 p.m. UTC
The following deadlock has been observed on multiple test setups:
* ufshcd_wl_suspend() is waiting for blk_execute_rq() to complete while it
  holds host_sem.
* ufshcd_eh_host_reset_handler() invokes ufshcd_err_handler() and the
  latter function tries to obtain host_sem.
This is a deadlock because blk_execute_rq() can't execute SCSI commands
while the host is in the SHOST_RECOVERY state and because the error
handler cannot make progress either.

Fix this deadlock by calling the UFS error handler directly during system
suspend instead of relying on the error handling mechanism in the SCSI
core.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 drivers/ufs/core/ufshcd.c | 43 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

Comments

kernel test robot Sept. 24, 2022, 3:06 p.m. UTC | #1
Hi Bart,

I love your patch! Yet something to improve:

[auto build test ERROR on mkp-scsi/for-next]
[cannot apply to jejb-scsi/for-next linus/master v6.0-rc6 next-20220923]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Bart-Van-Assche/Fix-a-deadlock-in-the-UFS-driver/20220924-041421
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git for-next
config: m68k-allmodconfig
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/ef035c21b5217f24bd5260255a6592644895ff2e
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Bart-Van-Assche/Fix-a-deadlock-in-the-UFS-driver/20220924-041421
        git checkout ef035c21b5217f24bd5260255a6592644895ff2e
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "system_transition_mutex" [drivers/ufs/core/ufshcd-core.ko] undefined!
Adrian Hunter Sept. 27, 2022, 5:06 p.m. UTC | #2
On 23/09/22 23:11, Bart Van Assche wrote:
> The following deadlock has been observed on multiple test setups:
> * ufshcd_wl_suspend() is waiting for blk_execute_rq() to complete while it
>   holds host_sem.
> * ufshcd_eh_host_reset_handler() invokes ufshcd_err_handler() and the
>   latter function tries to obtain host_sem.
> This is a deadlock because blk_execute_rq() can't execute SCSI commands
> while the host is in the SHOST_RECOVERY state and because the error
> handler cannot make progress either.
> 
> Fix this deadlock by calling the UFS error handler directly during system
> suspend instead of relying on the error handling mechanism in the SCSI
> core.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  drivers/ufs/core/ufshcd.c | 43 ++++++++++++++++++++++++++++++++++++---
>  1 file changed, 40 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
> index abeb120b12eb..996641fc1176 100644
> --- a/drivers/ufs/core/ufshcd.c
> +++ b/drivers/ufs/core/ufshcd.c
> @@ -6405,9 +6405,19 @@ static void ufshcd_err_handler(struct work_struct *work)
>  {
>  	struct ufs_hba *hba = container_of(work, struct ufs_hba, eh_work);
>  
> -	down(&hba->host_sem);
> -	ufshcd_recover_link(hba);
> -	up(&hba->host_sem);
> +	/*
> +	 * Serialize suspend/resume and error handling because a deadlock
> +	 * occurs if the error handler runs concurrently with
> +	 * ufshcd_set_dev_pwr_mode().
> +	 */
> +	if (mutex_trylock(&system_transition_mutex)) {

This is effectively disabling the UFS driver's error handler work.
It would be better to add the ability to do that explicitly within
the UFS driver and avoid using system_transition_mutex.

> +		down(&hba->host_sem);
> +		ufshcd_recover_link(hba);
> +		up(&hba->host_sem);
> +		mutex_unlock(&system_transition_mutex);
> +	} else {
> +		pr_info("%s: suspend or resume is ongoing\n", __func__);
> +	}
>  }
>  
>  /**
> @@ -8298,6 +8308,25 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie)
>  	}
>  }
>  
> +static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
> +{
> +	struct ufs_hba *hba = shost_priv(scmd->device->host);
> +
> +	if (!hba->system_suspending) {

Is a PM notifier needed - couldn't hba->system_suspending
have been set in ufshcd_wl_suspend() ?

Doesn't resume have the same issue ?

> +		/* Apply the SCSI core error handling strategy. */
> +		return SCSI_EH_NOT_HANDLED;
> +	}
> +
> +	/*
> +	 * Fail START STOP UNIT commands without activating the SCSI error
> +	 * handler to prevent a deadlock between ufshcd_set_dev_pwr_mode() and
> +	 * ufshcd_err_handler().
> +	 */
> +	set_host_byte(scmd, DID_TIME_OUT);
> +	scsi_done(scmd);
> +	return SCSI_EH_DONE;
> +}
> +
>  static const struct attribute_group *ufshcd_driver_groups[] = {
>  	&ufs_sysfs_unit_descriptor_group,
>  	&ufs_sysfs_lun_attributes_group,
> @@ -8332,6 +8361,7 @@ static struct scsi_host_template ufshcd_driver_template = {
>  	.eh_abort_handler	= ufshcd_abort,
>  	.eh_device_reset_handler = ufshcd_eh_device_reset_handler,
>  	.eh_host_reset_handler   = ufshcd_eh_host_reset_handler,
> +	.eh_timed_out		= ufshcd_eh_timed_out,
>  	.this_id		= -1,
>  	.sg_tablesize		= SG_ALL,
>  	.cmd_per_lun		= UFSHCD_CMD_PER_LUN,
> @@ -8791,6 +8821,13 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba,
>  			break;
>  		if (host_byte(ret) == 0 && scsi_status_is_good(ret))
>  			break;
> +		/*
> +		 * Calling the error handler directly when suspending or
> +		 * resuming the system since the callers of this function hold
> +		 * hba->host_sem in that case.

Runtime PM doesn't hold host_sem

> +		 */
> +		if (host_byte(ret) != 0 && hba->system_suspending)
> +			ufshcd_recover_link(hba);
>  	}
>  	if (ret) {
>  		sdev_printk(KERN_WARNING, sdp,
Bart Van Assche Sept. 27, 2022, 5:54 p.m. UTC | #3
On 9/27/22 10:06, Adrian Hunter wrote:
>> +	/*
>> +	 * Serialize suspend/resume and error handling because a deadlock
>> +	 * occurs if the error handler runs concurrently with
>> +	 * ufshcd_set_dev_pwr_mode().
>> +	 */
>> +	if (mutex_trylock(&system_transition_mutex)) {
> 
> This is effectively disabling the UFS driver's error handler work.
> It would be better to add the ability to do that explicitly within
> the UFS driver and avoid using system_transition_mutex.

I will modify this patch such that system_transition_mutex is not used.

>> +static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
>> +{
>> +	struct ufs_hba *hba = shost_priv(scmd->device->host);
>> +
>> +	if (!hba->system_suspending) {
> 
> Is a PM notifier needed - couldn't hba->system_suspending
> have been set in ufshcd_wl_suspend() ?

I will look into this.

> Doesn't resume have the same issue ?

The member variable "system_suspending" covers both suspending and 
resuming. Do you perhaps want me to rename it into 
system_suspending_or_resuming?

>> +		/*
>> +		 * Calling the error handler directly when suspending or
>> +		 * resuming the system since the callers of this function hold
>> +		 * hba->host_sem in that case.
> 
> Runtime PM doesn't hold host_sem

I will look into whether the deadlock avoidance mechanism needs to be 
extended to runtime PM.

Thanks,

Bart.
diff mbox series

Patch

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index abeb120b12eb..996641fc1176 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -6405,9 +6405,19 @@  static void ufshcd_err_handler(struct work_struct *work)
 {
 	struct ufs_hba *hba = container_of(work, struct ufs_hba, eh_work);
 
-	down(&hba->host_sem);
-	ufshcd_recover_link(hba);
-	up(&hba->host_sem);
+	/*
+	 * Serialize suspend/resume and error handling because a deadlock
+	 * occurs if the error handler runs concurrently with
+	 * ufshcd_set_dev_pwr_mode().
+	 */
+	if (mutex_trylock(&system_transition_mutex)) {
+		down(&hba->host_sem);
+		ufshcd_recover_link(hba);
+		up(&hba->host_sem);
+		mutex_unlock(&system_transition_mutex);
+	} else {
+		pr_info("%s: suspend or resume is ongoing\n", __func__);
+	}
 }
 
 /**
@@ -8298,6 +8308,25 @@  static void ufshcd_async_scan(void *data, async_cookie_t cookie)
 	}
 }
 
+static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd)
+{
+	struct ufs_hba *hba = shost_priv(scmd->device->host);
+
+	if (!hba->system_suspending) {
+		/* Apply the SCSI core error handling strategy. */
+		return SCSI_EH_NOT_HANDLED;
+	}
+
+	/*
+	 * Fail START STOP UNIT commands without activating the SCSI error
+	 * handler to prevent a deadlock between ufshcd_set_dev_pwr_mode() and
+	 * ufshcd_err_handler().
+	 */
+	set_host_byte(scmd, DID_TIME_OUT);
+	scsi_done(scmd);
+	return SCSI_EH_DONE;
+}
+
 static const struct attribute_group *ufshcd_driver_groups[] = {
 	&ufs_sysfs_unit_descriptor_group,
 	&ufs_sysfs_lun_attributes_group,
@@ -8332,6 +8361,7 @@  static struct scsi_host_template ufshcd_driver_template = {
 	.eh_abort_handler	= ufshcd_abort,
 	.eh_device_reset_handler = ufshcd_eh_device_reset_handler,
 	.eh_host_reset_handler   = ufshcd_eh_host_reset_handler,
+	.eh_timed_out		= ufshcd_eh_timed_out,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
 	.cmd_per_lun		= UFSHCD_CMD_PER_LUN,
@@ -8791,6 +8821,13 @@  static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba,
 			break;
 		if (host_byte(ret) == 0 && scsi_status_is_good(ret))
 			break;
+		/*
+		 * Calling the error handler directly when suspending or
+		 * resuming the system since the callers of this function hold
+		 * hba->host_sem in that case.
+		 */
+		if (host_byte(ret) != 0 && hba->system_suspending)
+			ufshcd_recover_link(hba);
 	}
 	if (ret) {
 		sdev_printk(KERN_WARNING, sdp,