diff mbox series

ufs: fix deadlock for the case of pm shutdown during suspend transition to resume

Message ID 20250311084257.8989-1-qiwu.chen@transsion.com (mailing list archive)
State New
Headers show
Series ufs: fix deadlock for the case of pm shutdown during suspend transition to resume | expand

Commit Message

qiwu.chen March 11, 2025, 8:42 a.m. UTC
There is a deadlock when triggers pm shutdown during suspend-to-idle state transition
to resume. Here are the issue reproduce steps:
1. System enter suspend-to-idle transition and execute suspend callbacks for given devices
in dpm_suspend(). The suspend callback of ufshcd_wl device - ufshcd_wl_suspend() will down
hba->host_sem which is supposed to up in ufshcd_wl_resume(). Here is main call trace:
enter_state
    suspend_devices_and_enter
        dpm_suspend_start
	    dpm_suspend
		device_suspend
		    ufshcd_wl_suspend
			down(&hba->host_sem) //hold host_sem

2. Someone triggers shutdown due to low battery during suspend transition.
The shutdown flow will hold device lock and execute shutdown callback for given devices
in device_shutdown(). The shutdown callback of ufshcd_wl device - ufshcd_wl_shutdown()
will hold ufshcd_wl's device lock and blocked to down hba->host_sem unfortunately.
Here is the blocked trace of shutdown thread:
__switch_to+0x174/0x338
__schedule+0x608/0x9f0
schedule+0x7c/0xe8
schedule_timeout+0x44/0x1c8
__down_common+0xbc/0x238
__down+0x18/0x28
down+0x50/0x54
ufshcd_wl_shutdown+0x28/0xb4   //blocked to down host_sem
device_shutdown+0x1a0/0x254    //get device lock
kernel_power_off+0x3c/0xf0
power_misc_routine_thread+0x814/0x970
kthread+0x104/0x1d4
ret_from_fork+0x10/0x20

3. Meanwhile, the suspend-to-idle transition is aborted by a wakeup source and go to handle
resume works, the resume work will be blocked in holding ufshcd_wl device lock forever.
Here is the blocked trace of resume work:
__switch_to+0x174/0x338
__schedule+0x608/0x9f0
schedule+0x7c/0xe8
schedule_preempt_disabled+0x24/0x40
__mutex_lock+0x408/0xdac
__mutex_lock_slowpath+0x14/0x24
mutex_lock+0x40/0xec
__device_resume+0x50/0x360    //blocked in holding device lock, deadlock!
async_resume+0x24/0x3c
async_run_entry_fn+0x44/0x118
process_one_work+0x1e4/0x43c
worker_thread+0x25c/0x430
kthread+0x104/0x1d4
ret_from_fork+0x10/0x20

Fix the deadlock issue by using atomic operation instead of sleep lock for shutting_down state check.

Signed-off-by: qiwu.chen <qiwu.chen@transsion.com>
---
 drivers/ufs/core/ufshcd-priv.h |  2 +-
 drivers/ufs/core/ufshcd.c      | 21 +++++++++------------
 include/ufs/ufshcd.h           |  2 +-
 3 files changed, 11 insertions(+), 14 deletions(-)

Comments

Peter Wang (王信友) March 11, 2025, 12:32 p.m. UTC | #1
On Tue, 2025-03-11 at 16:42 +0800, qiwu.chen wrote:
> There is a deadlock when triggers pm shutdown during suspend-to-idle
> state transition
> to resume. Here are the issue reproduce steps:
> 
> 

Hi Qiwu,

It's incorrect to directly call kernel_power_off due to low battery. 
This is encountered during ufs resume, indicating that there are 
still IO operations to be performed. Before kernel_power_off, 
it should be ensured that the file system has been unmounted and 
no more IO will send to UFS. Otherwise, if IO continues to send to 
UFS after UFS shutdown, unexpected errors will also occur."

Thanks.
Peter
diff mbox series

Patch

diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index 786f20ef2238..76d323de42f9 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -8,7 +8,7 @@ 
 
 static inline bool ufshcd_is_user_access_allowed(struct ufs_hba *hba)
 {
-	return !hba->shutting_down;
+	return !atomic_read(&hba->shutting_down);
 }
 
 void ufshcd_schedule_eh_work(struct ufs_hba *hba);
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 3094f3c89e82..b0f5152e5b04 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -1729,12 +1729,10 @@  static ssize_t ufshcd_clkscale_enable_store(struct device *dev,
 	if (kstrtou32(buf, 0, &value))
 		return -EINVAL;
 
-	down(&hba->host_sem);
-	if (!ufshcd_is_user_access_allowed(hba)) {
-		err = -EBUSY;
-		goto out;
-	}
+	if (!ufshcd_is_user_access_allowed(hba))
+		return -EBUSY;
 
+	down(&hba->host_sem);
 	value = !!value;
 	if (value == hba->clk_scaling.is_enabled)
 		goto out;
@@ -6416,8 +6414,7 @@  static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
 
 static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
 {
-	return (!hba->is_powered || hba->shutting_down ||
-		!hba->ufs_device_wlun ||
+	return (!hba->is_powered || !hba->ufs_device_wlun ||
 		hba->ufshcd_state == UFSHCD_STATE_ERROR ||
 		(!(hba->saved_err || hba->saved_uic_err || hba->force_reset ||
 		   ufshcd_is_link_broken(hba))));
@@ -6541,10 +6538,13 @@  static void ufshcd_err_handler(struct work_struct *work)
 	dev_info(hba->dev,
 		 "%s started; HBA state %s; powered %d; shutting down %d; saved_err = %d; saved_uic_err = %d; force_reset = %d%s\n",
 		 __func__, ufshcd_state_name[hba->ufshcd_state],
-		 hba->is_powered, hba->shutting_down, hba->saved_err,
+		 hba->is_powered, atomic_read(&hba->shutting_down), hba->saved_err,
 		 hba->saved_uic_err, hba->force_reset,
 		 ufshcd_is_link_broken(hba) ? "; link is broken" : "");
 
+	if (!ufshcd_is_user_access_allowed(hba))
+		return;
+
 	down(&hba->host_sem);
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	if (ufshcd_err_handling_should_stop(hba)) {
@@ -10194,10 +10194,7 @@  static void ufshcd_wl_shutdown(struct device *dev)
 	struct scsi_device *sdev = to_scsi_device(dev);
 	struct ufs_hba *hba = shost_priv(sdev->host);
 
-	down(&hba->host_sem);
-	hba->shutting_down = true;
-	up(&hba->host_sem);
-
+	atomic_set(&hba->shutting_down, 1);
 	/* Turn on everything while shutting down */
 	ufshcd_rpm_get_sync(hba);
 	scsi_device_quiesce(sdev);
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 74e5b9960c54..db38141278c7 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1034,7 +1034,7 @@  struct ufs_hba {
 	u16 ee_usr_mask;
 	struct mutex ee_ctrl_mutex;
 	bool is_powered;
-	bool shutting_down;
+	atomic_t shutting_down;
 	struct semaphore host_sem;
 
 	/* Work Queues */