diff mbox series

[v4,1/8] scsi: ufs: Flush exception event before suspend

Message ID 1579764349-15578-2-git-send-email-cang@codeaurora.org (mailing list archive)
State Superseded
Headers show
Series UFS driver general fixes bundle 4 | expand

Commit Message

Can Guo Jan. 23, 2020, 7:25 a.m. UTC
From: Sayali Lokhande <sayalil@codeaurora.org>

Exception event can be raised by the device when system
suspend is in progress. This will result in unclocked
register access in exception event handler as clocks will
be turned off during suspend. This change makes sure to flush
exception event handler work in suspend before disabling
clocks to avoid unclocked register access issue.

Signed-off-by: Sayali Lokhande <sayalil@codeaurora.org>
Signed-off-by: Asutosh Das <asutoshd@codeaurora.org>
Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

Comments

Bean Huo Jan. 23, 2020, 11:09 p.m. UTC | #1
Hi, Can

>  			/* TODO: handle Reject UPIU Response */ @@ -5215,7
> +5222,14 @@ static void ufshcd_exception_event_handler(struct work_struct
> *work)
> 
>  out:
>  	scsi_unblock_requests(hba->host);
> -	pm_runtime_put_sync(hba->dev);
> +	/*
> +	 * pm_runtime_get_noresume is called while scheduling
> +	 * eeh_work to avoid suspend racing with exception work.
> +	 * Hence decrement usage counter using pm_runtime_put_noidle
> +	 * to allow suspend on completion of exception event handler.
> +	 */
> +	pm_runtime_put_noidle(hba->dev);
> +	pm_runtime_put(hba->dev);
>  	return;
>  }
> 
Rebased this patch.
Thanks, 
//Bean
Bart Van Assche Jan. 26, 2020, 3:29 a.m. UTC | #2
On 2020-01-22 23:25, Can Guo wrote:
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 1201578..c2de29f 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -4760,8 +4760,15 @@ static void ufshcd_slave_destroy(struct scsi_device *sdev)
>  			 * UFS device needs urgent BKOPs.
>  			 */
>  			if (!hba->pm_op_in_progress &&
> -			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr))
> -				schedule_work(&hba->eeh_work);
> +			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr)) {
> +				/*
> +				 * Prevent suspend once eeh_work is scheduled
> +				 * to avoid deadlock between ufshcd_suspend
> +				 * and exception event handler.
> +				 */
> +				if (schedule_work(&hba->eeh_work))
> +					pm_runtime_get_noresume(hba->dev);
> +			}

Please combine the two logical tests with "&&" instead of nesting two
if-statements inside each other.

>  			break;
>  		case UPIU_TRANSACTION_REJECT_UPIU:
>  			/* TODO: handle Reject UPIU Response */
> @@ -5215,7 +5222,14 @@ static void ufshcd_exception_event_handler(struct work_struct *work)
>  
>  out:
>  	scsi_unblock_requests(hba->host);
> -	pm_runtime_put_sync(hba->dev);
> +	/*
> +	 * pm_runtime_get_noresume is called while scheduling
> +	 * eeh_work to avoid suspend racing with exception work.
> +	 * Hence decrement usage counter using pm_runtime_put_noidle
> +	 * to allow suspend on completion of exception event handler.
> +	 */
> +	pm_runtime_put_noidle(hba->dev);
> +	pm_runtime_put(hba->dev);
>  	return;
>  }
>  
> @@ -7901,6 +7915,7 @@ static int ufshcd_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  			goto enable_gating;
>  	}
>  
> +	flush_work(&hba->eeh_work);
>  	ret = ufshcd_link_state_transition(hba, req_link_state, 1);
>  	if (ret)
>  		goto set_dev_active;

I think this patch introduces a new race condition, namely the following:
- ufshcd_slave_destroy() tests pm_op_in_progress and reads the value
  zero from that variable.
- ufshcd_suspend() sets hba->pm_op_in_progress to one.
- ufshcd_slave_destroy() calls schedule_work().

How about fixing this race condition by calling
pm_runtime_get_noresume() before checking pm_op_in_progress and by
reallowing resume if no work is scheduled?

Thanks,

Bart.
Can Guo Feb. 3, 2020, 6:23 a.m. UTC | #3
On 2020-01-26 11:29, Bart Van Assche wrote:
> On 2020-01-22 23:25, Can Guo wrote:
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index 1201578..c2de29f 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -4760,8 +4760,15 @@ static void ufshcd_slave_destroy(struct 
>> scsi_device *sdev)
>>  			 * UFS device needs urgent BKOPs.
>>  			 */
>>  			if (!hba->pm_op_in_progress &&
>> -			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr))
>> -				schedule_work(&hba->eeh_work);
>> +			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr)) {
>> +				/*
>> +				 * Prevent suspend once eeh_work is scheduled
>> +				 * to avoid deadlock between ufshcd_suspend
>> +				 * and exception event handler.
>> +				 */
>> +				if (schedule_work(&hba->eeh_work))
>> +					pm_runtime_get_noresume(hba->dev);
>> +			}
> 
> Please combine the two logical tests with "&&" instead of nesting two
> if-statements inside each other.
> 
>>  			break;
>>  		case UPIU_TRANSACTION_REJECT_UPIU:
>>  			/* TODO: handle Reject UPIU Response */
>> @@ -5215,7 +5222,14 @@ static void 
>> ufshcd_exception_event_handler(struct work_struct *work)
>> 
>>  out:
>>  	scsi_unblock_requests(hba->host);
>> -	pm_runtime_put_sync(hba->dev);
>> +	/*
>> +	 * pm_runtime_get_noresume is called while scheduling
>> +	 * eeh_work to avoid suspend racing with exception work.
>> +	 * Hence decrement usage counter using pm_runtime_put_noidle
>> +	 * to allow suspend on completion of exception event handler.
>> +	 */
>> +	pm_runtime_put_noidle(hba->dev);
>> +	pm_runtime_put(hba->dev);
>>  	return;
>>  }
>> 
>> @@ -7901,6 +7915,7 @@ static int ufshcd_suspend(struct ufs_hba *hba, 
>> enum ufs_pm_op pm_op)
>>  			goto enable_gating;
>>  	}
>> 
>> +	flush_work(&hba->eeh_work);
>>  	ret = ufshcd_link_state_transition(hba, req_link_state, 1);
>>  	if (ret)
>>  		goto set_dev_active;
> 
> I think this patch introduces a new race condition, namely the 
> following:
> - ufshcd_slave_destroy() tests pm_op_in_progress and reads the value
>   zero from that variable.
> - ufshcd_suspend() sets hba->pm_op_in_progress to one.
> - ufshcd_slave_destroy() calls schedule_work().
> 
> How about fixing this race condition by calling
> pm_runtime_get_noresume() before checking pm_op_in_progress and by
> reallowing resume if no work is scheduled?
> 
> Thanks,
> 
> Bart.

Hi Bart,

If you apply this patch, you will find the change is not in
ufshcd_slave_destroy(), but in ufshcd_transfer_rsp_status().
So the racing you mentioned above does not exist.

Thanks,

Can Guo.
Bart Van Assche Feb. 4, 2020, 3:12 a.m. UTC | #4
On 2020-02-02 22:23, Can Guo wrote:
> On 2020-01-26 11:29, Bart Van Assche wrote:
>> On 2020-01-22 23:25, Can Guo wrote:
>>>              break;
>>>          case UPIU_TRANSACTION_REJECT_UPIU:
>>>              /* TODO: handle Reject UPIU Response */
>>> @@ -5215,7 +5222,14 @@ static void
>>> ufshcd_exception_event_handler(struct work_struct *work)
>>>
>>>  out:
>>>      scsi_unblock_requests(hba->host);
>>> -    pm_runtime_put_sync(hba->dev);
>>> +    /*
>>> +     * pm_runtime_get_noresume is called while scheduling
>>> +     * eeh_work to avoid suspend racing with exception work.
>>> +     * Hence decrement usage counter using pm_runtime_put_noidle
>>> +     * to allow suspend on completion of exception event handler.
>>> +     */
>>> +    pm_runtime_put_noidle(hba->dev);
>>> +    pm_runtime_put(hba->dev);
>>>      return;
>>>  }
>>>
>>> @@ -7901,6 +7915,7 @@ static int ufshcd_suspend(struct ufs_hba *hba,
>>> enum ufs_pm_op pm_op)
>>>              goto enable_gating;
>>>      }
>>>
>>> +    flush_work(&hba->eeh_work);
>>>      ret = ufshcd_link_state_transition(hba, req_link_state, 1);
>>>      if (ret)
>>>          goto set_dev_active;
>>
>> I think this patch introduces a new race condition, namely the following:
>> - ufshcd_slave_destroy() tests pm_op_in_progress and reads the value
>>   zero from that variable.
>> - ufshcd_suspend() sets hba->pm_op_in_progress to one.
>> - ufshcd_slave_destroy() calls schedule_work().
>>
>> How about fixing this race condition by calling
>> pm_runtime_get_noresume() before checking pm_op_in_progress and by
>> reallowing resume if no work is scheduled?
> 
> If you apply this patch, you will find the change is not in
> ufshcd_slave_destroy(), but in ufshcd_transfer_rsp_status().
> So the racing you mentioned above does not exist.

Hi Can,

Apparently I got a function name wrong. Can the following race condition
happen:
- ufshcd_transfer_rsp_status() tests pm_op_in_progress and reads the
  value zero from that variable.
- ufshcd_suspend() sets hba->pm_op_in_progress to one.
- ufshcd_suspend() calls flush_work(&hba->eeh_work).
- ufshcd_transfer_rsp_status() calls schedule_work(&hba->eeh_work).

Thanks,

Bart.
Can Guo Feb. 4, 2020, 6:28 a.m. UTC | #5
On 2020-02-04 11:12, Bart Van Assche wrote:
> On 2020-02-02 22:23, Can Guo wrote:
>> On 2020-01-26 11:29, Bart Van Assche wrote:
>>> On 2020-01-22 23:25, Can Guo wrote:
>>>>              break;
>>>>          case UPIU_TRANSACTION_REJECT_UPIU:
>>>>              /* TODO: handle Reject UPIU Response */
>>>> @@ -5215,7 +5222,14 @@ static void
>>>> ufshcd_exception_event_handler(struct work_struct *work)
>>>> 
>>>>  out:
>>>>      scsi_unblock_requests(hba->host);
>>>> -    pm_runtime_put_sync(hba->dev);
>>>> +    /*
>>>> +     * pm_runtime_get_noresume is called while scheduling
>>>> +     * eeh_work to avoid suspend racing with exception work.
>>>> +     * Hence decrement usage counter using pm_runtime_put_noidle
>>>> +     * to allow suspend on completion of exception event handler.
>>>> +     */
>>>> +    pm_runtime_put_noidle(hba->dev);
>>>> +    pm_runtime_put(hba->dev);
>>>>      return;
>>>>  }
>>>> 
>>>> @@ -7901,6 +7915,7 @@ static int ufshcd_suspend(struct ufs_hba *hba,
>>>> enum ufs_pm_op pm_op)
>>>>              goto enable_gating;
>>>>      }
>>>> 
>>>> +    flush_work(&hba->eeh_work);
>>>>      ret = ufshcd_link_state_transition(hba, req_link_state, 1);
>>>>      if (ret)
>>>>          goto set_dev_active;
>>> 
>>> I think this patch introduces a new race condition, namely the 
>>> following:
>>> - ufshcd_slave_destroy() tests pm_op_in_progress and reads the value
>>>   zero from that variable.
>>> - ufshcd_suspend() sets hba->pm_op_in_progress to one.
>>> - ufshcd_slave_destroy() calls schedule_work().
>>> 
>>> How about fixing this race condition by calling
>>> pm_runtime_get_noresume() before checking pm_op_in_progress and by
>>> reallowing resume if no work is scheduled?
>> 
>> If you apply this patch, you will find the change is not in
>> ufshcd_slave_destroy(), but in ufshcd_transfer_rsp_status().
>> So the racing you mentioned above does not exist.
> 
> Hi Can,
> 
> Apparently I got a function name wrong. Can the following race 
> condition
> happen:
> - ufshcd_transfer_rsp_status() tests pm_op_in_progress and reads the
>   value zero from that variable.
> - ufshcd_suspend() sets hba->pm_op_in_progress to one.
> - ufshcd_suspend() calls flush_work(&hba->eeh_work).
> - ufshcd_transfer_rsp_status() calls schedule_work(&hba->eeh_work).
> 
> Thanks,
> 
> Bart.

Hi Bart,

The sequence you mentioned is not possible.

In normal cases, before ufshcd_transfer_rsp_status() returns,
ufshcd_suspend() would not be called (unless you intentionally call
ufshcd_suspend() to screw it). Because ufshcd_transfer_rsp_status() is
called from __ufshcd_transfer_req_compl(), which is being used by either
UFS IRQ handler or err handler. Meanwhile, in 
__ufshcd_transfer_req_compl(),
scsi_done() is called only after ufshcd_transfer_rsp_status() returns. 
When
we are here, it means UFS driver is still handling requests/tasks, so 
suspend
would not kick start at this moment, either runtime suspend or system 
suspend.

And this is why below lines work, calling pm_runtime_get_noresume() 
within
ufshcd_transfer_rsp_status() can prevent runtime suspend from happening
after we leave ufshcd_transfer_rsp_status().

+                if (schedule_work(&hba->eeh_work))
+                    pm_runtime_get_noresume(hba->dev);

Thanks,

Can Guo.
diff mbox series

Patch

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 1201578..c2de29f 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -4760,8 +4760,15 @@  static void ufshcd_slave_destroy(struct scsi_device *sdev)
 			 * UFS device needs urgent BKOPs.
 			 */
 			if (!hba->pm_op_in_progress &&
-			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr))
-				schedule_work(&hba->eeh_work);
+			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr)) {
+				/*
+				 * Prevent suspend once eeh_work is scheduled
+				 * to avoid deadlock between ufshcd_suspend
+				 * and exception event handler.
+				 */
+				if (schedule_work(&hba->eeh_work))
+					pm_runtime_get_noresume(hba->dev);
+			}
 			break;
 		case UPIU_TRANSACTION_REJECT_UPIU:
 			/* TODO: handle Reject UPIU Response */
@@ -5215,7 +5222,14 @@  static void ufshcd_exception_event_handler(struct work_struct *work)
 
 out:
 	scsi_unblock_requests(hba->host);
-	pm_runtime_put_sync(hba->dev);
+	/*
+	 * pm_runtime_get_noresume is called while scheduling
+	 * eeh_work to avoid suspend racing with exception work.
+	 * Hence decrement usage counter using pm_runtime_put_noidle
+	 * to allow suspend on completion of exception event handler.
+	 */
+	pm_runtime_put_noidle(hba->dev);
+	pm_runtime_put(hba->dev);
 	return;
 }
 
@@ -7901,6 +7915,7 @@  static int ufshcd_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 			goto enable_gating;
 	}
 
+	flush_work(&hba->eeh_work);
 	ret = ufshcd_link_state_transition(hba, req_link_state, 1);
 	if (ret)
 		goto set_dev_active;