[04/14] blk-mq-sched: improve dispatching from sw queue

Message ID 20170731165111.11536-6-ming.lei@redhat.com (mailing list archive)
State New, archived

Commit Message

Ming Lei July 31, 2017, 4:51 p.m. UTC
SCSI devices use a host-wide tagset, and the shared
driver tag space is often quite big. Meanwhile there
is also a per-LUN queue depth (.cmd_per_lun), which
is often small.

So lots of requests may stay in the sw queue, and we
always flush all of those belonging to the same hw
queue and dispatch them all to the driver; unfortunately
this easily makes the queue busy because of the small
per-LUN queue depth. Once these requests are flushed
out, they have to stay in hctx->dispatch, no bio can
be merged into them any more, and sequential IO
performance is hurt.

This patch improves dispatching from the sw queue when
a per-request-queue queue depth is set, by taking
requests one by one from the sw queue, just like an
IO scheduler does.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

Comments

Bart Van Assche July 31, 2017, 11:34 p.m. UTC | #1
On Tue, 2017-08-01 at 00:51 +0800, Ming Lei wrote:
> SCSI devices use a host-wide tagset, and the shared
> driver tag space is often quite big. Meanwhile there
> is also a per-LUN queue depth (.cmd_per_lun), which
> is often small.
> 
> So lots of requests may stay in the sw queue, and we
> always flush all of those belonging to the same hw
> queue and dispatch them all to the driver; unfortunately
> this easily makes the queue busy because of the small
> per-LUN queue depth. Once these requests are flushed
> out, they have to stay in hctx->dispatch, no bio can
> be merged into them any more, and sequential IO
> performance is hurt.
> 
> This patch improves dispatching from the sw queue when
> a per-request-queue queue depth is set, by taking
> requests one by one from the sw queue, just like an
> IO scheduler does.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-sched.c | 25 +++++++++++++++----------
>  1 file changed, 15 insertions(+), 10 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 47a25333a136..3510c01cb17b 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -96,6 +96,9 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
>  	bool can_go = true;
>  	LIST_HEAD(rq_list);
> +	struct request *(*dispatch_fn)(struct blk_mq_hw_ctx *) =
> +		has_sched_dispatch ? e->type->ops.mq.dispatch_request :
> +			blk_mq_dispatch_rq_from_ctxs;
>  
>  	/* RCU or SRCU read lock is needed before checking quiesced flag */
>  	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> @@ -126,26 +129,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  	if (!list_empty(&rq_list)) {
>  		blk_mq_sched_mark_restart_hctx(hctx);
>  		can_go = blk_mq_dispatch_rq_list(q, &rq_list);
> -	} else if (!has_sched_dispatch) {
> +	} else if (!has_sched_dispatch && !q->queue_depth) {
>  		blk_mq_flush_busy_ctxs(hctx, &rq_list);
>  		blk_mq_dispatch_rq_list(q, &rq_list);
> +		can_go = false;
>  	}
>  
> +	if (!can_go)
> +		return;
> +
>  	/*
>  	 * We want to dispatch from the scheduler if we had no work left
>  	 * on the dispatch list, OR if we did have work but weren't able
>  	 * to make progress.
>  	 */
> -	if (can_go && has_sched_dispatch) {
> -		do {
> -			struct request *rq;
> +	do {
> +		struct request *rq;
>  
> -			rq = e->type->ops.mq.dispatch_request(hctx);
> -			if (!rq)
> -				break;
> -			list_add(&rq->queuelist, &rq_list);
> -		} while (blk_mq_dispatch_rq_list(q, &rq_list));
> -	}
> +		rq = dispatch_fn(hctx);
> +		if (!rq)
> +			break;
> +		list_add(&rq->queuelist, &rq_list);
> +	} while (blk_mq_dispatch_rq_list(q, &rq_list));
>  }

Hello Ming,

Although I like the idea behind this patch, I'm afraid that this patch will
cause a performance regression for high-performance SCSI LLD drivers, e.g.
ib_srp. Have you considered reworking this patch as follows:
* Remove the code under "else if (!has_sched_dispatch && !q->queue_depth) {".
* Modify the blk_mq_dispatch_rq_list() code such that it dispatches up to
  cmd_per_lun - (number of requests in progress) requests at once (rough
  sketch below).
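
A minimal sketch of that second suggestion, purely illustrative: the helper
below is hypothetical (nothing like it exists in the tree), and how the
in-flight count is obtained cheaply is exactly the open question discussed
further down the thread.

#include <linux/kernel.h>
#include <linux/blkdev.h>

/*
 * Hypothetical helper: how many requests a single dispatch run may still
 * pull for this queue, given an estimate of the requests already issued
 * to the LLD for this LUN.
 */
static unsigned int blk_mq_dispatch_budget(struct request_queue *q,
					   unsigned int inflight)
{
	/* No per-LUN queue depth configured (e.g. NVMe): no cap. */
	if (!q->queue_depth)
		return UINT_MAX;

	if (inflight >= q->queue_depth)
		return 0;

	return q->queue_depth - inflight;
}

blk_mq_sched_dispatch_requests() could then stop pulling requests from the
sw queues once this many have been moved in a single run.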

Thanks,

Bart.
Ming Lei Aug. 1, 2017, 10:17 a.m. UTC | #2
On Mon, Jul 31, 2017 at 11:34:35PM +0000, Bart Van Assche wrote:
> On Tue, 2017-08-01 at 00:51 +0800, Ming Lei wrote:
> > SCSI devices use a host-wide tagset, and the shared
> > driver tag space is often quite big. Meanwhile there
> > is also a per-LUN queue depth (.cmd_per_lun), which
> > is often small.
> > 
> > So lots of requests may stay in the sw queue, and we
> > always flush all of those belonging to the same hw
> > queue and dispatch them all to the driver; unfortunately
> > this easily makes the queue busy because of the small
> > per-LUN queue depth. Once these requests are flushed
> > out, they have to stay in hctx->dispatch, no bio can
> > be merged into them any more, and sequential IO
> > performance is hurt.
> > 
> > This patch improves dispatching from the sw queue when
> > a per-request-queue queue depth is set, by taking
> > requests one by one from the sw queue, just like an
> > IO scheduler does.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq-sched.c | 25 +++++++++++++++----------
> >  1 file changed, 15 insertions(+), 10 deletions(-)
> > 
> > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > index 47a25333a136..3510c01cb17b 100644
> > --- a/block/blk-mq-sched.c
> > +++ b/block/blk-mq-sched.c
> > @@ -96,6 +96,9 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> >  	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
> >  	bool can_go = true;
> >  	LIST_HEAD(rq_list);
> > +	struct request *(*dispatch_fn)(struct blk_mq_hw_ctx *) =
> > +		has_sched_dispatch ? e->type->ops.mq.dispatch_request :
> > +			blk_mq_dispatch_rq_from_ctxs;
> >  
> >  	/* RCU or SRCU read lock is needed before checking quiesced flag */
> >  	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > @@ -126,26 +129,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> >  	if (!list_empty(&rq_list)) {
> >  		blk_mq_sched_mark_restart_hctx(hctx);
> >  		can_go = blk_mq_dispatch_rq_list(q, &rq_list);
> > -	} else if (!has_sched_dispatch) {
> > +	} else if (!has_sched_dispatch && !q->queue_depth) {
> >  		blk_mq_flush_busy_ctxs(hctx, &rq_list);
> >  		blk_mq_dispatch_rq_list(q, &rq_list);
> > +		can_go = false;
> >  	}
> >  
> > +	if (!can_go)
> > +		return;
> > +
> >  	/*
> >  	 * We want to dispatch from the scheduler if we had no work left
> >  	 * on the dispatch list, OR if we did have work but weren't able
> >  	 * to make progress.
> >  	 */
> > -	if (can_go && has_sched_dispatch) {
> > -		do {
> > -			struct request *rq;
> > +	do {
> > +		struct request *rq;
> >  
> > -			rq = e->type->ops.mq.dispatch_request(hctx);
> > -			if (!rq)
> > -				break;
> > -			list_add(&rq->queuelist, &rq_list);
> > -		} while (blk_mq_dispatch_rq_list(q, &rq_list));
> > -	}
> > +		rq = dispatch_fn(hctx);
> > +		if (!rq)
> > +			break;
> > +		list_add(&rq->queuelist, &rq_list);
> > +	} while (blk_mq_dispatch_rq_list(q, &rq_list));
> >  }
> 
> Hello Ming,
> 
> Although I like the idea behind this patch, I'm afraid that this patch will
> cause a performance regression for high-performance SCSI LLD drivers, e.g.
> ib_srp. Have you considered reworking this patch as follows:
> * Remove the code under "else if (!has_sched_dispatch && !q->queue_depth) {".

This would affect devices such as NVMe, where BUSY is basically never
triggered, so it is better not to do this.

> * Modify the blk_mq_dispatch_rq_list() code such that it dispatches up to
>   cmd_per_lun - (number of requests in progress) requests at once.

How can we get the accurate 'number of requests in progress' efficiently?

And we have already done it this way for the blk-mq scheduler path, so it
shouldn't be a problem.

From my test data of mq-deadline on lpfc, the performance is good;
please see the cover letter.


Thanks,
Ming
Ming Lei Aug. 1, 2017, 10:50 a.m. UTC | #3
On Tue, Aug 01, 2017 at 06:17:18PM +0800, Ming Lei wrote:
> On Mon, Jul 31, 2017 at 11:34:35PM +0000, Bart Van Assche wrote:
> > On Tue, 2017-08-01 at 00:51 +0800, Ming Lei wrote:
> > > SCSI devices use a host-wide tagset, and the shared
> > > driver tag space is often quite big. Meanwhile there
> > > is also a per-LUN queue depth (.cmd_per_lun), which
> > > is often small.
> > > 
> > > So lots of requests may stay in the sw queue, and we
> > > always flush all of those belonging to the same hw
> > > queue and dispatch them all to the driver; unfortunately
> > > this easily makes the queue busy because of the small
> > > per-LUN queue depth. Once these requests are flushed
> > > out, they have to stay in hctx->dispatch, no bio can
> > > be merged into them any more, and sequential IO
> > > performance is hurt.
> > > 
> > > This patch improves dispatching from the sw queue when
> > > a per-request-queue queue depth is set, by taking
> > > requests one by one from the sw queue, just like an
> > > IO scheduler does.
> > > 
> > > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > > ---
> > >  block/blk-mq-sched.c | 25 +++++++++++++++----------
> > >  1 file changed, 15 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > index 47a25333a136..3510c01cb17b 100644
> > > --- a/block/blk-mq-sched.c
> > > +++ b/block/blk-mq-sched.c
> > > @@ -96,6 +96,9 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > >  	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
> > >  	bool can_go = true;
> > >  	LIST_HEAD(rq_list);
> > > +	struct request *(*dispatch_fn)(struct blk_mq_hw_ctx *) =
> > > +		has_sched_dispatch ? e->type->ops.mq.dispatch_request :
> > > +			blk_mq_dispatch_rq_from_ctxs;
> > >  
> > >  	/* RCU or SRCU read lock is needed before checking quiesced flag */
> > >  	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > @@ -126,26 +129,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > >  	if (!list_empty(&rq_list)) {
> > >  		blk_mq_sched_mark_restart_hctx(hctx);
> > >  		can_go = blk_mq_dispatch_rq_list(q, &rq_list);
> > > -	} else if (!has_sched_dispatch) {
> > > +	} else if (!has_sched_dispatch && !q->queue_depth) {
> > >  		blk_mq_flush_busy_ctxs(hctx, &rq_list);
> > >  		blk_mq_dispatch_rq_list(q, &rq_list);
> > > +		can_go = false;
> > >  	}
> > >  
> > > +	if (!can_go)
> > > +		return;
> > > +
> > >  	/*
> > >  	 * We want to dispatch from the scheduler if we had no work left
> > >  	 * on the dispatch list, OR if we did have work but weren't able
> > >  	 * to make progress.
> > >  	 */
> > > -	if (can_go && has_sched_dispatch) {
> > > -		do {
> > > -			struct request *rq;
> > > +	do {
> > > +		struct request *rq;
> > >  
> > > -			rq = e->type->ops.mq.dispatch_request(hctx);
> > > -			if (!rq)
> > > -				break;
> > > -			list_add(&rq->queuelist, &rq_list);
> > > -		} while (blk_mq_dispatch_rq_list(q, &rq_list));
> > > -	}
> > > +		rq = dispatch_fn(hctx);
> > > +		if (!rq)
> > > +			break;
> > > +		list_add(&rq->queuelist, &rq_list);
> > > +	} while (blk_mq_dispatch_rq_list(q, &rq_list));
> > >  }
> > 
> > Hello Ming,
> > 
> > Although I like the idea behind this patch, I'm afraid that this patch will
> > cause a performance regression for high-performance SCSI LLD drivers, e.g.
> > ib_srp. Have you considered reworking this patch as follows:
> > * Remove the code under "else if (!has_sched_dispatch && !q->queue_depth) {".
> 
> This would affect devices such as NVMe, where BUSY is basically never
> triggered, so it is better not to do this.
> 
> > * Modify the blk_mq_dispatch_rq_list() code such that it dispatches up to
> >   cmd_per_lun - (number of requests in progress) requests at once.
> 
> How can we get the accurate 'number of requests in progress' efficiently?
> 
> And we have already done it this way for the blk-mq scheduler path, so it
> shouldn't be a problem.
> 
> From my test data of mq-deadline on lpfc, the performance is good;
> please see the cover letter.

Forgot to mention: ctx->list is a per-cpu list and the lock is a per-cpu
lock, so changing to this approach shouldn't be a performance issue.
Bart Van Assche Aug. 1, 2017, 3:11 p.m. UTC | #4
On Tue, 2017-08-01 at 18:50 +0800, Ming Lei wrote:
> On Tue, Aug 01, 2017 at 06:17:18PM +0800, Ming Lei wrote:
> > How can we get the accurate 'number of requests in progress' efficiently?

Hello Ming,

How about counting the number of bits that have been set in the tag set?
I am aware that these bits can be set and/or cleared concurrently with the
dispatch code but that count is probably a good starting point.
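
As a rough sketch only, counting the busy tags for a hw queue could look like
this (the helper name is made up, it reuses the internal blk-mq-tag.h
definitions, the count is inherently racy, and for a shared host-wide tagset
it covers all LUNs, not just one):

#include <linux/sbitmap.h>
#include "blk-mq.h"
#include "blk-mq-tag.h"

/* Sketch: estimate in-flight requests from the bits set in the tag maps. */
static unsigned int blk_mq_inflight_estimate(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;

	return sbitmap_weight(&tags->bitmap_tags.sb) +
	       sbitmap_weight(&tags->breserved_tags.sb);
}

Such an estimate could feed a budget like the one sketched earlier in the
thread.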

> > From my test data of mq-deadline on lpfc, the performance is good;
> > please see the cover letter.
> 
> Forgot to mention: ctx->list is a per-cpu list and the lock is a per-cpu
> lock, so changing to this approach shouldn't be a performance issue.

Sorry but I don't consider this reply sufficient. The latency of IB HCAs
is significantly lower than that of any FC hardware I have run performance
measurements on myself. The fact that this patch series improves performance
for lpfc does not guarantee that there won't be a performance regression for
ib_srp, ib_iser or any other low-latency initiator driver for which
q->queue_depth != 0.

Additionally, patch 03/14 most likely introduces a fairness problem. Shouldn't
blk_mq_dispatch_rq_from_ctxs() dequeue requests from the per-CPU queues in a
round-robin fashion instead of always starting at the first per-CPU queue in
hctx->ctx_map?

Thanks,

Bart.
Ming Lei Aug. 2, 2017, 3:31 a.m. UTC | #5
On Tue, Aug 01, 2017 at 03:11:42PM +0000, Bart Van Assche wrote:
> On Tue, 2017-08-01 at 18:50 +0800, Ming Lei wrote:
> > On Tue, Aug 01, 2017 at 06:17:18PM +0800, Ming Lei wrote:
> > > How can we get the accurate 'number of requests in progress' efficiently?
> 
> Hello Ming,
> 
> How about counting the number of bits that have been set in the tag set?
> I am aware that these bits can be set and/or cleared concurrently with the
> dispatch code but that count is probably a good starting point.

It has to be an atomic_t, which is too heavy for us; please see the report:

	http://marc.info/?t=149868448400003&r=1&w=2

Both Jens and I want to kill hd_struct.in_flight, but there still seems to be
no good way.

> 
> > From my test data of mq-deadline on lpfc, the performance is good;
> > please see the cover letter.
> 
> Forgot to mention: ctx->list is a per-cpu list and the lock is a per-cpu
> lock, so changing to this approach shouldn't be a performance issue.
> 
> Sorry but I don't consider this reply sufficient. The latency of IB HCAs
> is significantly lower than that of any FC hardware I have run performance
> measurements on myself. The fact that this patch series improves performance
> for lpfc does not guarantee that there won't be a performance regression for
> ib_srp, ib_iser or any other low-latency initiator driver for which
> q->queue_depth != 0.

If the IB HCA has lower latency than FC, there should be fewer chances
to trigger BUSY; otherwise IB should have the same sequential I/O
performance issue, I guess.

The ctx list is a per-cpu list, so in theory there shouldn't be issues with
this change, and the only change for IB is the following:

v4.13-rc3:

	blk_mq_flush_busy_ctxs(hctx, &rq_list);
	blk_mq_dispatch_rq_list(q, &rq_list);

v4.13-rc3 patched:

	do {
		struct request *rq;

		/* pick up one request from one ctx list */
		rq = blk_mq_dispatch_rq_from_ctxs(hctx);
		if (!rq)
			break;
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list));

I doubt that the change would be observable in an actual test.

In any case, we will provide test data on an IB HCA with V2, and Laurence
will help me run the test on IB.

> 
> Additionally, patch 03/14 most likely introduces a fairness problem. Shouldn't
> blk_mq_dispatch_rq_from_ctxs() dequeue requests from the per-CPU queues in a
> round-robin fashion instead of always starting at the first per-CPU queue in
> hctx->ctx_map?

That is a good question; it looks like round-robin would be fairer. I will
change to it in V2, and the implementation should be simple (rough sketch
below).
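
A rough sketch of one way to do it (illustrative only; 'dispatch_from' would
be a new cursor field in struct blk_mq_hw_ctx, and the actual V2
implementation may end up different):

static struct request *
blk_mq_dispatch_rq_from_ctxs_rr(struct blk_mq_hw_ctx *hctx)
{
	struct request *rq = NULL;
	unsigned int i;

	for (i = 0; i < hctx->nr_ctx; i++) {
		unsigned int idx = (hctx->dispatch_from + i) % hctx->nr_ctx;
		struct blk_mq_ctx *ctx = hctx->ctxs[idx];

		spin_lock(&ctx->lock);
		if (!list_empty(&ctx->rq_list)) {
			rq = list_first_entry(&ctx->rq_list, struct request,
					      queuelist);
			list_del_init(&rq->queuelist);
			/* keep hctx->ctx_map in sync when this ctx drains */
			if (list_empty(&ctx->rq_list))
				sbitmap_clear_bit(&hctx->ctx_map,
						  ctx->index_hw);
		}
		spin_unlock(&ctx->lock);

		if (rq) {
			/* resume from the next ctx on the following call */
			hctx->dispatch_from = (idx + 1) % hctx->nr_ctx;
			break;
		}
	}

	return rq;
}

The only difference from always starting at the first ctx is the saved
cursor, so the cost should be the same as the current one-by-one dequeue.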
Bart Van Assche Aug. 3, 2017, 1:35 a.m. UTC | #6
On Wed, 2017-08-02 at 11:31 +0800, Ming Lei wrote:
> On Tue, Aug 01, 2017 at 03:11:42PM +0000, Bart Van Assche wrote:
> > On Tue, 2017-08-01 at 18:50 +0800, Ming Lei wrote:
> > > On Tue, Aug 01, 2017 at 06:17:18PM +0800, Ming Lei wrote:
> > > > How can we get the accurate 'number of requests in progress' efficiently?
> > 
> > Hello Ming,
> > 
> > How about counting the number of bits that have been set in the tag set?
> > I am aware that these bits can be set and/or cleared concurrently with the
> > dispatch code but that count is probably a good starting point.
> 
> It has to be an atomic_t, which is too heavy for us; please see the report:
> 
> 	http://marc.info/?t=149868448400003&r=1&w=2
> 
> Both Jens and I want to kill hd_struct.in_flight, but there still seems to be
> no good way.

Hello Ming,

Sorry but I disagree that a new atomic variable should be added to keep track
of the number of busy requests. Counting the number of bits that are set in
the tag set should be good enough in this context.

Thanks,

Bart.
Ming Lei Aug. 3, 2017, 3:13 a.m. UTC | #7
On Thu, Aug 03, 2017 at 01:35:29AM +0000, Bart Van Assche wrote:
> On Wed, 2017-08-02 at 11:31 +0800, Ming Lei wrote:
> > On Tue, Aug 01, 2017 at 03:11:42PM +0000, Bart Van Assche wrote:
> > > On Tue, 2017-08-01 at 18:50 +0800, Ming Lei wrote:
> > > > On Tue, Aug 01, 2017 at 06:17:18PM +0800, Ming Lei wrote:
> > > > > How can we get the accurate 'number of requests in progress' efficiently?
> > > 
> > > Hello Ming,
> > > 
> > > How about counting the number of bits that have been set in the tag set?
> > > I am aware that these bits can be set and/or cleared concurrently with the
> > > dispatch code but that count is probably a good starting point.
> > 
> > It has to be an atomic_t, which is too heavy for us; please see the report:
> > 
> > 	http://marc.info/?t=149868448400003&r=1&w=2
> > 
> > Both Jens and I want to kill hd_struct.in_flight, but there still seems to
> > be no good way.
> 
> Hello Ming,
> 
> Sorry but I disagree that a new atomic variable should be added to keep track
> of the number of busy requests. Counting the number of bits that are set in
> the tag set should be good enough in this context.

That won't work because the tag set is host wide and shared by all LUNs.
Bart Van Assche Aug. 3, 2017, 5:33 p.m. UTC | #8
On Thu, 2017-08-03 at 11:13 +0800, Ming Lei wrote:
> On Thu, Aug 03, 2017 at 01:35:29AM +0000, Bart Van Assche wrote:
> > On Wed, 2017-08-02 at 11:31 +0800, Ming Lei wrote:
> > > On Tue, Aug 01, 2017 at 03:11:42PM +0000, Bart Van Assche wrote:
> > > > On Tue, 2017-08-01 at 18:50 +0800, Ming Lei wrote:
> > > > > On Tue, Aug 01, 2017 at 06:17:18PM +0800, Ming Lei wrote:
> > > > > > How can we get the accurate 'number of requests in progress' efficiently?
> > > > 
> > > > Hello Ming,
> > > > 
> > > > How about counting the number of bits that have been set in the tag set?
> > > > I am aware that these bits can be set and/or cleared concurrently with the
> > > > dispatch code but that count is probably a good starting point.
> > > 
> > > It has to be an atomic_t, which is too heavy for us; please see the report:
> > > 
> > > 	http://marc.info/?t=149868448400003&r=1&w=2
> > > 
> > > Both Jens and I want to kill hd_struct.in_flight, but there still seems to
> > > be no good way.
> > 
> > Hello Ming,
> > 
> > Sorry but I disagree that a new atomic variable should be added to keep track
> > of the number of busy requests. Counting the number of bits that are set in
> > the tag set should be good enough in this context.
> 
> That won't work because the tag set is host wide and shared by all LUNs.

Hello Ming,

Are you aware that the SCSI core already keeps track of the number of busy requests
per LUN? See also the device_busy member of struct scsi_device. How about giving the
block layer core access in some way to that counter?
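
One possible shape for that, purely hypothetical (the callback does not
exist; it is only meant to illustrate the kind of hook being suggested):

#include <linux/blk-mq.h>
#include <scsi/scsi_device.h>

/*
 * Hypothetical: if blk_mq_ops grew an optional callback such as
 *	unsigned int (*in_flight)(struct blk_mq_hw_ctx *hctx);
 * the SCSI core could implement it by exposing sdev->device_busy:
 */
static unsigned int scsi_mq_in_flight(struct blk_mq_hw_ctx *hctx)
{
	struct scsi_device *sdev = hctx->queue->queuedata;

	return atomic_read(&sdev->device_busy);
}

Whether a lockless read of that counter is accurate enough is a separate
question.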

Bart.
Christoph Hellwig Aug. 5, 2017, 8:40 a.m. UTC | #9
On Thu, Aug 03, 2017 at 05:33:13PM +0000, Bart Van Assche wrote:
> Are you aware that the SCSI core already keeps track of the number of busy requests
> per LUN? See also the device_busy member of struct scsi_device. How about giving the
> block layer core access in some way to that counter?

I'd love to move it to blk-mq in a scalable way eventually, as
we'll run into the same problems with NVMe systems with multiple
namespaces.
Ming Lei Aug. 5, 2017, 1:40 p.m. UTC | #10
On Thu, Aug 03, 2017 at 05:33:13PM +0000, Bart Van Assche wrote:
> On Thu, 2017-08-03 at 11:13 +0800, Ming Lei wrote:
> > On Thu, Aug 03, 2017 at 01:35:29AM +0000, Bart Van Assche wrote:
> > > On Wed, 2017-08-02 at 11:31 +0800, Ming Lei wrote:
> > > > On Tue, Aug 01, 2017 at 03:11:42PM +0000, Bart Van Assche wrote:
> > > > > On Tue, 2017-08-01 at 18:50 +0800, Ming Lei wrote:
> > > > > > On Tue, Aug 01, 2017 at 06:17:18PM +0800, Ming Lei wrote:
> > > > > > > How can we get the accurate 'number of requests in progress' efficiently?
> > > > > 
> > > > > Hello Ming,
> > > > > 
> > > > > How about counting the number of bits that have been set in the tag set?
> > > > > I am aware that these bits can be set and/or cleared concurrently with the
> > > > > dispatch code but that count is probably a good starting point.
> > > > 
> > > > It has to be an atomic_t, which is too heavy for us; please see the report:
> > > > 
> > > > 	http://marc.info/?t=149868448400003&r=1&w=2
> > > > 
> > > > Both Jens and I want to kill hd_struct.in_flight, but there still seems
> > > > to be no good way.
> > > 
> > > Hello Ming,
> > > 
> > > Sorry but I disagree that a new atomic variable should be added to keep track
> > > of the number of busy requests. Counting the number of bits that are set in
> > > the tag set should be good enough in this context.
> > 
> > That won't work because the tag set is host wide and shared by all LUNs.
> 
> Hello Ming,
> 
> Are you aware that the SCSI core already keeps track of the number of busy requests
> per LUN? See also the device_busy member of struct scsi_device. How about giving the
> block layer core access in some way to that counter?

Yes, I know that.

Last time I mentioned to Christoph that this counter could be used for
implementing runtime PM, to avoid introducing a new counter to
account for pending I/O.

But for this purpose (estimating how many requests to dequeue from hctxs),
it isn't a good idea:

1) Strictly speaking, an atomic counter isn't enough and a lock
is needed, because we need to make sure that the counter can't
change while we are dequeuing requests, so exporting the counter
to the block layer won't work.

2) Even if you only use it as an estimate, without a lock, it still
isn't good, because for some SCSI devices q->queue_depth is very
small: both qla2xxx's and lpfc's .cmd_per_lun is 3. So it can be very
inaccurate, since it is normal to dequeue requests from all hctxs at
the same time.

Also, I have posted V2 today. From the test results on SRP it looks
good to dequeue one request at a time, so I suggest that we follow
the mq scheduler's way of dequeuing requests (picking up one at a
time) for blk-mq 'none' in this patchset. We may consider improving
it in the future if there is a better and more mature idea.


Thanks,
Ming

Patch

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 47a25333a136..3510c01cb17b 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -96,6 +96,9 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
 	bool can_go = true;
 	LIST_HEAD(rq_list);
+	struct request *(*dispatch_fn)(struct blk_mq_hw_ctx *) =
+		has_sched_dispatch ? e->type->ops.mq.dispatch_request :
+			blk_mq_dispatch_rq_from_ctxs;
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
 	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -126,26 +129,28 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
 		can_go = blk_mq_dispatch_rq_list(q, &rq_list);
-	} else if (!has_sched_dispatch) {
+	} else if (!has_sched_dispatch && !q->queue_depth) {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list);
+		can_go = false;
 	}
 
+	if (!can_go)
+		return;
+
 	/*
 	 * We want to dispatch from the scheduler if we had no work left
 	 * on the dispatch list, OR if we did have work but weren't able
 	 * to make progress.
 	 */
-	if (can_go && has_sched_dispatch) {
-		do {
-			struct request *rq;
+	do {
+		struct request *rq;
 
-			rq = e->type->ops.mq.dispatch_request(hctx);
-			if (!rq)
-				break;
-			list_add(&rq->queuelist, &rq_list);
-		} while (blk_mq_dispatch_rq_list(q, &rq_list));
-	}
+		rq = dispatch_fn(hctx);
+		if (!rq)
+			break;
+		list_add(&rq->queuelist, &rq_list);
+	} while (blk_mq_dispatch_rq_list(q, &rq_list));
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,