diff mbox series

[v2,18/20] scsi: ufs: Optimize the command queueing code

Message ID 20211119195743.2817-19-bvanassche@acm.org (mailing list archive)
State Superseded
Headers show
Series UFS patches for kernel v5.17 | expand

Commit Message

Bart Van Assche Nov. 19, 2021, 7:57 p.m. UTC
Remove the clock scaling lock from ufshcd_queuecommand() since it is a
performance bottleneck. Freeze request queues instead of polling the
doorbell registers to wait until pending commands have completed.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 drivers/scsi/ufs/ufshcd.c | 124 +++++++++++++-------------------------
 drivers/scsi/ufs/ufshcd.h |   1 +
 2 files changed, 44 insertions(+), 81 deletions(-)

Comments

Asutosh Das (asd) Nov. 22, 2021, 5:46 p.m. UTC | #1
On 11/19/2021 11:57 AM, Bart Van Assche wrote:
> Remove the clock scaling lock from ufshcd_queuecommand() since it is a
> performance bottleneck. Freeze request queues instead of polling the
> doorbell registers to wait until pending commands have completed.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>   drivers/scsi/ufs/ufshcd.c | 124 +++++++++++++-------------------------
>   drivers/scsi/ufs/ufshcd.h |   1 +
>   2 files changed, 44 insertions(+), 81 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index a6d3f71c6b00..9cf4a22f1950 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -1070,65 +1070,6 @@ static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba,
>   	return false;
>   }
>   
[...]
>   /**
>    * ufshcd_scale_gear - scale up/down UFS gear
>    * @hba: per adapter instance
> @@ -1176,37 +1117,63 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)
>   
>   static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
>   {
> -	#define DOORBELL_CLR_TOUT_US		(1000 * 1000) /* 1 sec */
> -	int ret = 0;
> +	struct scsi_device *sdev;
> +
>   	/*
> -	 * make sure that there are no outstanding requests when
> -	 * clock scaling is in progress
> +	 * Make sure that no commands are in progress while the clock frequency
> +	 * is being modified.
> +	 *
> +	 * Since ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd() lock
> +	 * the clk_scaling_lock before calling blk_get_request(), lock
> +	 * clk_scaling_lock before freezing the request queues to prevent lock
> +	 * inversion.
>   	 */
> -	ufshcd_scsi_block_requests(hba);
>   	down_write(&hba->clk_scaling_lock);
> -
> -	if (!hba->clk_scaling.is_allowed ||
> -	    ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {
> -		ret = -EBUSY;
> -		up_write(&hba->clk_scaling_lock);
> -		ufshcd_scsi_unblock_requests(hba);
> -		goto out;
> -	}
> -
> +	if (!hba->clk_scaling.is_allowed)
> +		goto busy;
> +	blk_freeze_queue_start(hba->tmf_queue);
> +	blk_freeze_queue_start(hba->cmd_queue);
> +	shost_for_each_device(sdev, hba->host)
> +		blk_freeze_queue_start(sdev->request_queue);
This would still issue the requests present in the queue before freezing,
and that's a concern.
> +	/*
> +	 * Calling synchronize_rcu_expedited() reduces the time required to
> +	 * freeze request queues from milliseconds to microseconds.
> +	 */
> +	synchronize_rcu_expedited();
> +	shost_for_each_device(sdev, hba->host)
> +		if (blk_mq_freeze_queue_wait_timeout(sdev->request_queue, HZ)
> +		    <= 0)
> +			goto unfreeze;
> +	if (blk_mq_freeze_queue_wait_timeout(hba->cmd_queue, HZ) <= 0 ||
> +	    blk_mq_freeze_queue_wait_timeout(hba->tmf_queue, HZ / 10) <= 0)
> +		goto unfreeze;
>   	/* let's not get into low power until clock scaling is completed */
>   	ufshcd_hold(hba, false);
> +	return 0;
>   
> -out:
> -	return ret;
> +unfreeze:
> +	shost_for_each_device(sdev, hba->host)
> +		blk_mq_unfreeze_queue(sdev->request_queue);
> +	blk_mq_unfreeze_queue(hba->cmd_queue);
> +	blk_mq_unfreeze_queue(hba->tmf_queue);
> +
> +busy:
> +	up_write(&hba->clk_scaling_lock);
> +	return -EBUSY;
>   }
>   
>   static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
>   {
> +	struct scsi_device *sdev;
> +
> +	shost_for_each_device(sdev, hba->host)
> +		blk_mq_unfreeze_queue(sdev->request_queue);
> +	blk_mq_unfreeze_queue(hba->cmd_queue);
> +	blk_mq_unfreeze_queue(hba->tmf_queue);
>   	if (writelock)
>   		up_write(&hba->clk_scaling_lock);
>   	else
>   		up_read(&hba->clk_scaling_lock);
> -	ufshcd_scsi_unblock_requests(hba);
>   	ufshcd_release(hba);
>   }
>   
> @@ -2699,9 +2666,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>   
>   	WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);
>   
> -	if (!down_read_trylock(&hba->clk_scaling_lock))
> -		return SCSI_MLQUEUE_HOST_BUSY;
> -
>   	/*
>   	 * Allows the UFS error handler to wait for prior ufshcd_queuecommand()
>   	 * calls.
> @@ -2790,8 +2754,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>   out:
>   	rcu_read_unlock();
>   
> -	up_read(&hba->clk_scaling_lock);
> -
>   	if (ufs_trigger_eh()) {
>   		unsigned long flags;
>   
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index e9bc07c69a80..7ec463c97d64 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -778,6 +778,7 @@ struct ufs_hba_monitor {
>    * @clk_list_head: UFS host controller clocks list node head
>    * @pwr_info: holds current power mode
>    * @max_pwr_info: keeps the device max valid pwm
> + * @clk_scaling_lock: used to serialize device commands and clock scaling
>    * @desc_size: descriptor sizes reported by device
>    * @urgent_bkops_lvl: keeps track of urgent bkops level for device
>    * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for
>
Bart Van Assche Nov. 22, 2021, 6:13 p.m. UTC | #2
On 11/22/21 9:46 AM, Asutosh Das (asd) wrote:
> On 11/19/2021 11:57 AM, Bart Van Assche wrote:
>> +    blk_freeze_queue_start(hba->tmf_queue);
>> +    blk_freeze_queue_start(hba->cmd_queue);
>> +    shost_for_each_device(sdev, hba->host)
>> +        blk_freeze_queue_start(sdev->request_queue);
>
> This would still issue the requests present in the queue before freezing 
> and that's a concern.

Isn't that exactly what the existing code is doing since the existing 
code waits until both doorbell registers are zero? See also 
ufshcd_wait_for_doorbell_clr().

Thanks,

Bart.
Asutosh Das (asd) Nov. 22, 2021, 11:02 p.m. UTC | #3
On 11/22/2021 10:13 AM, Bart Van Assche wrote:
> On 11/22/21 9:46 AM, Asutosh Das (asd) wrote:
>> On 11/19/2021 11:57 AM, Bart Van Assche wrote:
>>> +    blk_freeze_queue_start(hba->tmf_queue);
>>> +    blk_freeze_queue_start(hba->cmd_queue);
>>> +    shost_for_each_device(sdev, hba->host)
>>> +        blk_freeze_queue_start(sdev->request_queue);
>>
>> This would still issue the requests present in the queue before 
>> freezing and that's a concern.
> 
> Isn't that exactly what the existing code is doing since the existing 
> code waits until both doorbell registers are zero? See also 
> ufshcd_wait_for_doorbell_clr().
> 
> Thanks,
> 
> Bart.
The current code waits for the already-issued requests to complete. It 
doesn't issue the yet-to-be-issued requests. Wouldn't freezing the queue 
issue the pending requests in the context of scaling_{up/down}?
If yes, I don't think the current code is doing that.

-asd
Bart Van Assche Nov. 22, 2021, 11:48 p.m. UTC | #4
On 11/22/21 3:02 PM, Asutosh Das (asd) wrote:
> Current code waits for the already issued requests to complete. It 
> doesn't issue the yet-to-be issued requests. Wouldn't freezing the queue 
> issue the requests in the context of scaling_{up/down}?
> If yes, I don't think the current code is doing that.

Hi Asutosh,

How about the patch below that preserves most of the existing code for
preparing for clock scaling?

Thanks,

Bart.


Subject: [PATCH] scsi: ufs: Optimize the command queueing code

Remove the clock scaling lock from ufshcd_queuecommand() since it is a
performance bottleneck. Instead, use synchronize_rcu_expedited() to wait
for ongoing ufshcd_queuecommand() calls.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  drivers/scsi/ufs/ufshcd.c | 12 +++++++-----
  drivers/scsi/ufs/ufshcd.h |  1 +
  2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 5d214456bf82..1d929c28efaf 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -1196,6 +1196,13 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
  	/* let's not get into low power until clock scaling is completed */
  	ufshcd_hold(hba, false);

+	/*
+	 * Wait for ongoing ufshcd_queuecommand() calls. Calling
+	 * synchronize_rcu_expedited() instead of synchronize_rcu() reduces the
+	 * waiting time from milliseconds to microseconds.
+	 */
+	synchronize_rcu_expedited();
+
  out:
  	return ret;
  }
@@ -2699,9 +2706,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)

  	WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);

-	if (!down_read_trylock(&hba->clk_scaling_lock))
-		return SCSI_MLQUEUE_HOST_BUSY;
-
  	/*
  	 * Allows the UFS error handler to wait for prior ufshcd_queuecommand()
  	 * calls.
@@ -2790,8 +2794,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
  out:
  	rcu_read_unlock();

-	up_read(&hba->clk_scaling_lock);
-
  	if (ufs_trigger_eh()) {
  		unsigned long flags;

diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index c13ae56fbff8..695bede14dac 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -777,6 +777,7 @@ struct ufs_hba_monitor {
   * @clk_list_head: UFS host controller clocks list node head
   * @pwr_info: holds current power mode
   * @max_pwr_info: keeps the device max valid pwm
+ * @clk_scaling_lock: used to serialize device commands and clock scaling
   * @desc_size: descriptor sizes reported by device
   * @urgent_bkops_lvl: keeps track of urgent bkops level for device
   * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for
Asutosh Das (asd) Nov. 23, 2021, 6:24 p.m. UTC | #5
On 11/22/2021 3:48 PM, Bart Van Assche wrote:
> On 11/22/21 3:02 PM, Asutosh Das (asd) wrote:
>> Current code waits for the already issued requests to complete. It 
>> doesn't issue the yet-to-be issued requests. Wouldn't freezing the 
>> queue issue the requests in the context of scaling_{up/down}?
>> If yes, I don't think the current code is doing that.
> 
> Hi Asutosh,
> 
> How about the patch below that preserves most of the existing code for
> preparing for clock scaling?
> 
> Thanks,
> 
> Bart.
> 
Hi Bart,
This looks good to me. Please push a change and I can test it out.

-asd

> 
> Subject: [PATCH] scsi: ufs: Optimize the command queueing code
> 
> Remove the clock scaling lock from ufshcd_queuecommand() since it is a
> performance bottleneck. Instead, use synchronize_rcu_expedited() to wait
> for ongoing ufshcd_queuecommand() calls.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>   drivers/scsi/ufs/ufshcd.c | 12 +++++++-----
>   drivers/scsi/ufs/ufshcd.h |  1 +
>   2 files changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 5d214456bf82..1d929c28efaf 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -1196,6 +1196,13 @@ static int ufshcd_clock_scaling_prepare(struct 
> ufs_hba *hba)
>       /* let's not get into low power until clock scaling is completed */
>       ufshcd_hold(hba, false);
> 
> +    /*
> +     * Wait for ongoing ufshcd_queuecommand() calls. Calling
> +     * synchronize_rcu_expedited() instead of synchronize_rcu() reduces 
> the
> +     * waiting time from milliseconds to microseconds.
> +     */
> +    synchronize_rcu_expedited();
> +
>   out:
>       return ret;
>   }
> @@ -2699,9 +2706,6 @@ static int ufshcd_queuecommand(struct Scsi_Host 
> *host, struct scsi_cmnd *cmd)
> 
>       WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);
> 
> -    if (!down_read_trylock(&hba->clk_scaling_lock))
> -        return SCSI_MLQUEUE_HOST_BUSY;
> -
>       /*
>        * Allows the UFS error handler to wait for prior 
> ufshcd_queuecommand()
>        * calls.
> @@ -2790,8 +2794,6 @@ static int ufshcd_queuecommand(struct Scsi_Host 
> *host, struct scsi_cmnd *cmd)
>   out:
>       rcu_read_unlock();
> 
> -    up_read(&hba->clk_scaling_lock);
> -
>       if (ufs_trigger_eh()) {
>           unsigned long flags;
> 
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index c13ae56fbff8..695bede14dac 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -777,6 +777,7 @@ struct ufs_hba_monitor {
>    * @clk_list_head: UFS host controller clocks list node head
>    * @pwr_info: holds current power mode
>    * @max_pwr_info: keeps the device max valid pwm
> + * @clk_scaling_lock: used to serialize device commands and clock scaling
>    * @desc_size: descriptor sizes reported by device
>    * @urgent_bkops_lvl: keeps track of urgent bkops level for device
>    * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level 
> for
Bart Van Assche Dec. 1, 2021, 6:33 p.m. UTC | #6
On 11/23/21 10:24 AM, Asutosh Das (asd) wrote:
> This looks good to me. Please push a change and I can test it out.

Thanks for having taken a look. v3 of this patch series is available at
https://github.com/bvanassche/linux/tree/ufs-for-next.

Bart.
diff mbox series

Patch

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index a6d3f71c6b00..9cf4a22f1950 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -1070,65 +1070,6 @@  static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba,
 	return false;
 }
 
-static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba,
-					u64 wait_timeout_us)
-{
-	unsigned long flags;
-	int ret = 0;
-	u32 tm_doorbell;
-	u32 tr_doorbell;
-	bool timeout = false, do_last_check = false;
-	ktime_t start;
-
-	ufshcd_hold(hba, false);
-	spin_lock_irqsave(hba->host->host_lock, flags);
-	/*
-	 * Wait for all the outstanding tasks/transfer requests.
-	 * Verify by checking the doorbell registers are clear.
-	 */
-	start = ktime_get();
-	do {
-		if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) {
-			ret = -EBUSY;
-			goto out;
-		}
-
-		tm_doorbell = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL);
-		tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
-		if (!tm_doorbell && !tr_doorbell) {
-			timeout = false;
-			break;
-		} else if (do_last_check) {
-			break;
-		}
-
-		spin_unlock_irqrestore(hba->host->host_lock, flags);
-		schedule();
-		if (ktime_to_us(ktime_sub(ktime_get(), start)) >
-		    wait_timeout_us) {
-			timeout = true;
-			/*
-			 * We might have scheduled out for long time so make
-			 * sure to check if doorbells are cleared by this time
-			 * or not.
-			 */
-			do_last_check = true;
-		}
-		spin_lock_irqsave(hba->host->host_lock, flags);
-	} while (tm_doorbell || tr_doorbell);
-
-	if (timeout) {
-		dev_err(hba->dev,
-			"%s: timedout waiting for doorbell to clear (tm=0x%x, tr=0x%x)\n",
-			__func__, tm_doorbell, tr_doorbell);
-		ret = -EBUSY;
-	}
-out:
-	spin_unlock_irqrestore(hba->host->host_lock, flags);
-	ufshcd_release(hba);
-	return ret;
-}
-
 /**
  * ufshcd_scale_gear - scale up/down UFS gear
  * @hba: per adapter instance
@@ -1176,37 +1117,63 @@  static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)
 
 static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
 {
-	#define DOORBELL_CLR_TOUT_US		(1000 * 1000) /* 1 sec */
-	int ret = 0;
+	struct scsi_device *sdev;
+
 	/*
-	 * make sure that there are no outstanding requests when
-	 * clock scaling is in progress
+	 * Make sure that no commands are in progress while the clock frequency
+	 * is being modified.
+	 *
+	 * Since ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd() lock
+	 * the clk_scaling_lock before calling blk_get_request(), lock
+	 * clk_scaling_lock before freezing the request queues to prevent lock
+	 * inversion.
 	 */
-	ufshcd_scsi_block_requests(hba);
 	down_write(&hba->clk_scaling_lock);
-
-	if (!hba->clk_scaling.is_allowed ||
-	    ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {
-		ret = -EBUSY;
-		up_write(&hba->clk_scaling_lock);
-		ufshcd_scsi_unblock_requests(hba);
-		goto out;
-	}
-
+	if (!hba->clk_scaling.is_allowed)
+		goto busy;
+	blk_freeze_queue_start(hba->tmf_queue);
+	blk_freeze_queue_start(hba->cmd_queue);
+	shost_for_each_device(sdev, hba->host)
+		blk_freeze_queue_start(sdev->request_queue);
+	/*
+	 * Calling synchronize_rcu_expedited() reduces the time required to
+	 * freeze request queues from milliseconds to microseconds.
+	 */
+	synchronize_rcu_expedited();
+	shost_for_each_device(sdev, hba->host)
+		if (blk_mq_freeze_queue_wait_timeout(sdev->request_queue, HZ)
+		    <= 0)
+			goto unfreeze;
+	if (blk_mq_freeze_queue_wait_timeout(hba->cmd_queue, HZ) <= 0 ||
+	    blk_mq_freeze_queue_wait_timeout(hba->tmf_queue, HZ / 10) <= 0)
+		goto unfreeze;
 	/* let's not get into low power until clock scaling is completed */
 	ufshcd_hold(hba, false);
+	return 0;
 
-out:
-	return ret;
+unfreeze:
+	shost_for_each_device(sdev, hba->host)
+		blk_mq_unfreeze_queue(sdev->request_queue);
+	blk_mq_unfreeze_queue(hba->cmd_queue);
+	blk_mq_unfreeze_queue(hba->tmf_queue);
+
+busy:
+	up_write(&hba->clk_scaling_lock);
+	return -EBUSY;
 }
 
 static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
 {
+	struct scsi_device *sdev;
+
+	shost_for_each_device(sdev, hba->host)
+		blk_mq_unfreeze_queue(sdev->request_queue);
+	blk_mq_unfreeze_queue(hba->cmd_queue);
+	blk_mq_unfreeze_queue(hba->tmf_queue);
 	if (writelock)
 		up_write(&hba->clk_scaling_lock);
 	else
 		up_read(&hba->clk_scaling_lock);
-	ufshcd_scsi_unblock_requests(hba);
 	ufshcd_release(hba);
 }
 
@@ -2699,9 +2666,6 @@  static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 
 	WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);
 
-	if (!down_read_trylock(&hba->clk_scaling_lock))
-		return SCSI_MLQUEUE_HOST_BUSY;
-
 	/*
 	 * Allows the UFS error handler to wait for prior ufshcd_queuecommand()
 	 * calls.
@@ -2790,8 +2754,6 @@  static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 out:
 	rcu_read_unlock();
 
-	up_read(&hba->clk_scaling_lock);
-
 	if (ufs_trigger_eh()) {
 		unsigned long flags;
 
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index e9bc07c69a80..7ec463c97d64 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -778,6 +778,7 @@  struct ufs_hba_monitor {
  * @clk_list_head: UFS host controller clocks list node head
  * @pwr_info: holds current power mode
  * @max_pwr_info: keeps the device max valid pwm
+ * @clk_scaling_lock: used to serialize device commands and clock scaling
  * @desc_size: descriptor sizes reported by device
  * @urgent_bkops_lvl: keeps track of urgent bkops level for device
  * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for