[V3] scsi: core: only re-run queue in scsi_end_request() if device queue is busy

Message ID 20191121032634.32650-1-ming.lei@redhat.com (mailing list archive)
State Deferred
Series [V3] scsi: core: only re-run queue in scsi_end_request() if device queue is busy

Commit Message

Ming Lei Nov. 21, 2019, 3:26 a.m. UTC
Currently the request queue is run unconditionally in scsi_end_request()
as long as both the target queue and the host queue are ready. The
request queue only needs to be re-run after this device queue has been
busy, because the purpose of the re-run is to restart this LUN.

Recently Long Li reported that the cost of running the queue can be
very heavy at high queue depth. Improve this situation by only running
the request queue when this LUN is busy.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Cc: Long Li <longli@microsoft.com>
Reported-by: Long Li <longli@microsoft.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
V3:
	- add one smp_mb() in scsi_mq_get_budget() and comment

V2:
	- commit log change, no code change
	- add reported-by tag


 drivers/scsi/scsi_lib.c    | 43 ++++++++++++++++++++++++++++++++++++--
 include/scsi/scsi_device.h |  1 +
 2 files changed, 42 insertions(+), 2 deletions(-)
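
To make the intent of the change concrete, below is a minimal, self-contained
userspace C sketch of the budget/restart handshake (an illustration only, not
kernel code; the sim_* names are hypothetical stand-ins for
scsi_mq_get_budget() and scsi_end_request()):

/* Minimal model of the budget/restart handshake (illustration only). */
#include <stdbool.h>
#include <stdio.h>

struct sim_sdev {
	int device_busy;	/* models sdev->device_busy */
	int queue_depth;	/* models sdev->queue_depth */
	unsigned int restart;	/* models the new sdev->restart flag */
};

/* Models scsi_mq_get_budget(): take one budget slot if one is free. */
static bool sim_get_budget(struct sim_sdev *sdev)
{
	if (sdev->device_busy < sdev->queue_depth) {
		sdev->device_busy++;
		sdev->restart = 0;	/* WRITE_ONCE(sdev->restart, 0) */
		return true;
	}
	/* LUN is busy: ask a future completion to re-run the queue */
	sdev->restart = 1;		/* WRITE_ONCE(sdev->restart, 1) */
	return false;
}

/* Models scsi_end_request(): re-run the queue only if a dispatch failed. */
static void sim_end_request(struct sim_sdev *sdev)
{
	sdev->device_busy--;		/* scsi_device_unbusy() */
	if (sdev->restart)		/* READ_ONCE(sdev->restart) */
		printf("re-run hw queues\n");
	else
		printf("no re-run needed\n");
}

int main(void)
{
	struct sim_sdev sdev = { .device_busy = 0, .queue_depth = 1 };

	sim_get_budget(&sdev);	/* first request takes the only budget slot */
	sim_get_budget(&sdev);	/* second request fails and sets .restart */
	sim_end_request(&sdev);	/* its completion sees .restart, re-runs the queue */
	return 0;
}

In the sketch, as in the patch, the completion path only re-runs the queue
when an earlier failed budget grab has set .restart, mirroring the new
"else if (READ_ONCE(sdev->restart))" test in scsi_end_request().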

Comments

Ming Lei Dec. 20, 2019, 11:37 p.m. UTC | #1
On Thu, Nov 21, 2019 at 11:26:34AM +0800, Ming Lei wrote:
> Currently the request queue is run unconditionally in scsi_end_request()
> as long as both the target queue and the host queue are ready. The
> request queue only needs to be re-run after this device queue has been
> busy, because the purpose of the re-run is to restart this LUN.
> 
> Recently Long Li reported that the cost of running the queue can be
> very heavy at high queue depth. Improve this situation by only running
> the request queue when this LUN is busy.
> 
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Ewan D. Milne <emilne@redhat.com>
> Cc: Kashyap Desai <kashyap.desai@broadcom.com>
> Cc: Hannes Reinecke <hare@suse.de>
> Cc: Bart Van Assche <bvanassche@acm.org>
> Cc: Damien Le Moal <damien.lemoal@wdc.com>
> Cc: Long Li <longli@microsoft.com>
> Reported-by: Long Li <longli@microsoft.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> V3:
> 	- add one smp_mb() in scsi_mq_get_budget() and comment
> 
> V2:
> 	- commit log change, no code change
> 	- add reported-by tag
> 
> 
>  drivers/scsi/scsi_lib.c    | 43 ++++++++++++++++++++++++++++++++++++--
>  include/scsi/scsi_device.h |  1 +
>  2 files changed, 42 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 379533ce8661..d3d237a09a78 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -607,12 +607,17 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
>  	 */
>  	percpu_ref_get(&q->q_usage_counter);
>  
> +	/*
> +	 * One smp_mb() is implied by either rq->end_io or
> +	 * blk_mq_free_request for ordering writing .device_busy in
> +	 * scsi_device_unbusy() and reading sdev->restart.
> +	 */
>  	__blk_mq_end_request(req, error);
>  
>  	if (scsi_target(sdev)->single_lun ||
>  	    !list_empty(&sdev->host->starved_list))
>  		kblockd_schedule_work(&sdev->requeue_work);
> -	else
> +	else if (READ_ONCE(sdev->restart))
>  		blk_mq_run_hw_queues(q, true);
>  
>  	percpu_ref_put(&q->q_usage_counter);
> @@ -1632,8 +1637,42 @@ static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
>  	struct request_queue *q = hctx->queue;
>  	struct scsi_device *sdev = q->queuedata;
>  
> -	if (scsi_dev_queue_ready(q, sdev))
> +	if (scsi_dev_queue_ready(q, sdev)) {
> +		WRITE_ONCE(sdev->restart, 0);
>  		return true;
> +	}
> +
> +	/*
> +	 * If all in-flight requests originating from this LUN are completed
> +	 * before setting .restart, sdev->device_busy will be observed as
> +	 * zero, then blk_mq_delay_run_hw_queue() will dispatch this request
> +	 * soon. Otherwise, completion of one of these requests will observe
> +	 * the .restart flag, and the request queue will be run for handling
> +	 * this request, see scsi_end_request().
> +	 *
> +	 * However, the .restart flag may be cleared by another dispatch code
> +	 * path after one in-flight request is completed, then:
> +	 *
> +	 * 1) if this request is dispatched one by one from the scheduler queue
> +	 * or sw queue, it will also be handled by that dispatch path, given
> +	 * that the request still stays in the scheduler/sw queue when the
> +	 * .get_budget() callback is called.
> +	 *
> +	 * 2) if this request is dispatched from hctx->dispatch or
> +	 * blk_mq_flush_busy_ctxs(), this request will be put into hctx->dispatch
> +	 * list soon, and blk-mq will be responsible for covering it, see
> +	 * blk_mq_dispatch_rq_list().
> +	 */
> +	WRITE_ONCE(sdev->restart, 1);
> +
> +	/*
> +	 * Order writing .restart and reading .device_busy, and make sure
> +	 * .restart is visible to scsi_end_request(). Its pair is implied by
> +	 * __blk_mq_end_request() in scsi_end_request() for ordering
> +	 * writing .device_busy in scsi_device_unbusy() and reading .restart.
> +	 *
> +	 */
> +	smp_mb();
>  
>  	if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev))
>  		blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
> diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
> index 202f4d6a4342..9d8ca662ae86 100644
> --- a/include/scsi/scsi_device.h
> +++ b/include/scsi/scsi_device.h
> @@ -109,6 +109,7 @@ struct scsi_device {
>  	atomic_t device_busy;		/* commands actually active on LLDD */
>  	atomic_t device_blocked;	/* Device returned QUEUE_FULL. */
>  
> +	unsigned int restart;
>  	spinlock_t list_lock;
>  	struct list_head cmd_list;	/* queue of in use SCSI Command structures */
>  	struct list_head starved_entry;
> -- 
> 2.20.1
> 

Ping...

Thanks,
Ming
Long Li Dec. 20, 2019, 11:44 p.m. UTC | #2
>Subject: Re: [PATCH V3] scsi: core: only re-run queue in scsi_end_request() if device queue is busy
>
>On Thu, Nov 21, 2019 at 11:26:34AM +0800, Ming Lei wrote:

[ quoted patch snipped ]

>Ping...

This patch passed stress tests on a Linux VM running in Azure.

Tested-by: Long Li <longli@microsoft.com>

>
>Thanks,
>Ming

Patch

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 379533ce8661..d3d237a09a78 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -607,12 +607,17 @@  static bool scsi_end_request(struct request *req, blk_status_t error,
 	 */
 	percpu_ref_get(&q->q_usage_counter);
 
+	/*
+	 * One smp_mb() is implied by either rq->end_io or
+	 * blk_mq_free_request for ordering writing .device_busy in
+	 * scsi_device_unbusy() and reading sdev->restart.
+	 */
 	__blk_mq_end_request(req, error);
 
 	if (scsi_target(sdev)->single_lun ||
 	    !list_empty(&sdev->host->starved_list))
 		kblockd_schedule_work(&sdev->requeue_work);
-	else
+	else if (READ_ONCE(sdev->restart))
 		blk_mq_run_hw_queues(q, true);
 
 	percpu_ref_put(&q->q_usage_counter);
@@ -1632,8 +1637,42 @@  static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct scsi_device *sdev = q->queuedata;
 
-	if (scsi_dev_queue_ready(q, sdev))
+	if (scsi_dev_queue_ready(q, sdev)) {
+		WRITE_ONCE(sdev->restart, 0);
 		return true;
+	}
+
+	/*
+	 * If all in-flight requests originating from this LUN are completed
+	 * before setting .restart, sdev->device_busy will be observed as
+	 * zero, then blk_mq_delay_run_hw_queue() will dispatch this request
+	 * soon. Otherwise, completion of one of these requests will observe
+	 * the .restart flag, and the request queue will be run for handling
+	 * this request, see scsi_end_request().
+	 *
+	 * However, the .restart flag may be cleared by another dispatch code
+	 * path after one in-flight request is completed, then:
+	 *
+	 * 1) if this request is dispatched one by one from the scheduler queue
+	 * or sw queue, it will also be handled by that dispatch path, given
+	 * that the request still stays in the scheduler/sw queue when the
+	 * .get_budget() callback is called.
+	 *
+	 * 2) if this request is dispatched from hctx->dispatch or
+	 * blk_mq_flush_busy_ctxs(), this request will be put into hctx->dispatch
+	 * list soon, and blk-mq will be responsible for covering it, see
+	 * blk_mq_dispatch_rq_list().
+	 */
+	WRITE_ONCE(sdev->restart, 1);
+
+	/*
+	 * Order writing .restart and reading .device_busy, and make sure
+	 * .restart is visible to scsi_end_request(). Its pair is implied by
+	 * __blk_mq_end_request() in scsi_end_request() for ordering
+	 * writing .device_busy in scsi_device_unbusy() and reading .restart.
+	 *
+	 */
+	smp_mb();
 
 	if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev))
 		blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 202f4d6a4342..9d8ca662ae86 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -109,6 +109,7 @@  struct scsi_device {
 	atomic_t device_busy;		/* commands actually active on LLDD */
 	atomic_t device_blocked;	/* Device returned QUEUE_FULL. */
 
+	unsigned int restart;
 	spinlock_t list_lock;
 	struct list_head cmd_list;	/* queue of in use SCSI Command structures */
 	struct list_head starved_entry;
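
A note on the barrier pairing described in the comments above: the budget
path stores .restart and then reads .device_busy, while the completion path
decrements .device_busy and then reads .restart; with a full barrier on each
side, at least one of the two paths is guaranteed to observe the other, so a
request that failed to get budget is always picked up. The following
standalone C11 sketch (an illustration under that assumption, not kernel
code) uses atomic_thread_fence() where the kernel relies on smp_mb() and the
barrier implied by __blk_mq_end_request(); build with "cc -pthread".

/* Standalone model of the .restart / .device_busy ordering (not kernel code). */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int device_busy = 1;	/* models sdev->device_busy; one request in flight */
static atomic_uint restart_flag;	/* models the new sdev->restart flag */
static atomic_int reruns;		/* counts how many paths decided to run the queue */

/* Models the failing scsi_mq_get_budget() path. */
static void *budget_path(void *arg)
{
	(void)arg;
	/* WRITE_ONCE(sdev->restart, 1) */
	atomic_store_explicit(&restart_flag, 1, memory_order_relaxed);
	/* the smp_mb() added by this patch */
	atomic_thread_fence(memory_order_seq_cst);
	/* atomic_read(&sdev->device_busy) == 0 -> blk_mq_delay_run_hw_queue() */
	if (atomic_load_explicit(&device_busy, memory_order_relaxed) == 0)
		atomic_fetch_add(&reruns, 1);
	return NULL;
}

/* Models scsi_end_request() completing the in-flight request. */
static void *completion_path(void *arg)
{
	(void)arg;
	/* scsi_device_unbusy() drops .device_busy */
	atomic_fetch_sub_explicit(&device_busy, 1, memory_order_relaxed);
	/* barrier implied by __blk_mq_end_request() */
	atomic_thread_fence(memory_order_seq_cst);
	/* READ_ONCE(sdev->restart) -> blk_mq_run_hw_queues() */
	if (atomic_load_explicit(&restart_flag, memory_order_relaxed))
		atomic_fetch_add(&reruns, 1);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, budget_path, NULL);
	pthread_create(&b, NULL, completion_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With both fences in place at least one path must re-run the queue. */
	assert(atomic_load(&reruns) >= 1);
	printf("queue re-run decided %d time(s)\n", atomic_load(&reruns));
	return 0;
}

Removing either fence turns this into the classic store-buffering pattern in
which both loads can observe stale values, neither path re-runs the queue,
and the request is stranded, which is exactly the situation the smp_mb()
added in scsi_mq_get_budget() guards against.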