
[V6,4/5] blk-mq-sched: improve dispatching from sw queue

Message ID 20171009112424.30524-5-ming.lei@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Mike Snitzer

Commit Message

Ming Lei Oct. 9, 2017, 11:24 a.m. UTC
SCSI devices use a host-wide tagset, and the shared driver tag space is
often quite big. Meanwhile there is also a per-LUN queue depth
(.cmd_per_lun), which is often small; for example, on both lpfc and
qla2xxx, .cmd_per_lun is just 3.

So lots of requests may stay in the sw queue, and we always flush all of
those belonging to the same hw queue and dispatch them to the driver at
once; unfortunately that easily makes the queue busy because of the small
.cmd_per_lun. Once these requests are flushed out, they have to stay in
hctx->dispatch, no bio merge can happen against them any more, and
sequential IO performance is hurt a lot.

This patch introduces blk_mq_dequeue_from_ctx() for dequeuing requests
from the sw queue so that we can dispatch them in the scheduler's way,
and thereby avoid dequeuing too many requests from the sw queue while
->dispatch isn't flushed completely.

This patch improves dispatching from the sw queue when there is a
per-request-queue queue depth, by taking requests one by one from the
sw queue, just like an IO scheduler does.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 block/blk-mq.c         | 39 +++++++++++++++++++++++++++++++++++++++
 block/blk-mq.h         |  2 ++
 include/linux/blk-mq.h |  2 ++
 4 files changed, 91 insertions(+), 2 deletions(-)

Comments

Omar Sandoval Oct. 10, 2017, 6:23 p.m. UTC | #1
On Mon, Oct 09, 2017 at 07:24:23PM +0800, Ming Lei wrote:
> SCSI devices use host-wide tagset, and the shared driver tag space is
> often quite big. Meantime there is also queue depth for each lun(
> .cmd_per_lun), which is often small, for example, on both lpfc and
> qla2xxx, .cmd_per_lun is just 3.
> 
> So lots of requests may stay in sw queue, and we always flush all
> belonging to same hw queue and dispatch them all to driver, unfortunately
> it is easy to cause queue busy because of the small .cmd_per_lun.
> Once these requests are flushed out, they have to stay in hctx->dispatch,
> and no bio merge can participate into these requests, and sequential IO
> performance is hurt a lot.
> 
> This patch introduces blk_mq_dequeue_from_ctx for dequeuing request from
> sw queue so that we can dispatch them in scheduler's way, then we can
> avoid to dequeue too many requests from sw queue when ->dispatch isn't
> flushed completely.
> 
> This patch improves dispatching from sw queue when there is per-request-queue
> queue depth by taking request one by one from sw queue, just like the way
> of IO scheduler.

This still didn't address Jens' concern about using q->queue_depth as
the heuristic for whether to do the full sw queue flush or one-by-one
dispatch. The EWMA approach is a bit too complex for now; can you please
try the heuristic of whether the driver ever returned BLK_STS_RESOURCE?
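
(As a rough illustration of the heuristic being asked for here: remember
whether dispatch has ever seen BLK_STS_RESOURCE and use that, rather than
q->queue_depth, to choose the dispatch style. The hctx field and helper
names below are assumptions for illustration only, not part of the patch.)

/*
 * Sketch only: a hypothetical per-hctx "seen busy" hint.  The driver
 * returning BLK_STS_RESOURCE once marks the hctx, and the dispatch
 * path then prefers one-by-one dequeue over a full sw queue flush.
 */
static void blk_mq_note_dispatch_result(struct blk_mq_hw_ctx *hctx,
					blk_status_t ret)
{
	if (ret == BLK_STS_RESOURCE)
		WRITE_ONCE(hctx->seen_busy, true);
}

static bool blk_mq_prefer_ctx_dispatch(struct blk_mq_hw_ctx *hctx)
{
	return READ_ONCE(hctx->seen_busy);
}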

> Reviewed-by: Omar Sandoval <osandov@fb.com>
> Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-sched.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  block/blk-mq.c         | 39 +++++++++++++++++++++++++++++++++++++++
>  block/blk-mq.h         |  2 ++
>  include/linux/blk-mq.h |  2 ++
>  4 files changed, 91 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index be29ba849408..14b354f617e5 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -104,6 +104,39 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>  	} while (blk_mq_dispatch_rq_list(q, &rq_list));
>  }
>  
> +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> +					  struct blk_mq_ctx *ctx)
> +{
> +	unsigned idx = ctx->index_hw;
> +
> +	if (++idx == hctx->nr_ctx)
> +		idx = 0;
> +
> +	return hctx->ctxs[idx];
> +}
> +
> +static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct request_queue *q = hctx->queue;
> +	LIST_HEAD(rq_list);
> +	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> +
> +	do {
> +		struct request *rq;
> +
> +		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
> +		if (!rq)
> +			break;
> +		list_add(&rq->queuelist, &rq_list);
> +
> +		/* round robin for fair dispatch */
> +		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> +
> +	} while (blk_mq_dispatch_rq_list(q, &rq_list));
> +
> +	WRITE_ONCE(hctx->dispatch_from, ctx);
> +}
> +
>  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct request_queue *q = hctx->queue;
> @@ -143,10 +176,23 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  	 */
>  	if (!list_empty(&rq_list)) {
>  		blk_mq_sched_mark_restart_hctx(hctx);
> -		if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
> -			blk_mq_do_dispatch_sched(hctx);
> +		if (blk_mq_dispatch_rq_list(q, &rq_list)) {
> +			if (has_sched_dispatch)
> +				blk_mq_do_dispatch_sched(hctx);
> +			else
> +				blk_mq_do_dispatch_ctx(hctx);
> +		}
>  	} else if (has_sched_dispatch) {
>  		blk_mq_do_dispatch_sched(hctx);
> +	} else if (q->queue_depth) {
> +		/*
> +		 * If there is per-request_queue depth, we dequeue
> +		 * request one by one from sw queue for avoiding to mess
> +		 * up I/O merge when dispatch runs out of resource, which
> +		 * can be triggered easily when there is per-request_queue
> +		 * queue depth or .cmd_per_lun, such as SCSI device.
> +		 */
> +		blk_mq_do_dispatch_ctx(hctx);
>  	} else {
>  		blk_mq_flush_busy_ctxs(hctx, &rq_list);
>  		blk_mq_dispatch_rq_list(q, &rq_list);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 076cbab9c3e0..394cb75d66fa 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -911,6 +911,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
>  
> +struct dispatch_rq_data {
> +	struct blk_mq_hw_ctx *hctx;
> +	struct request *rq;
> +};
> +
> +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
> +		void *data)
> +{
> +	struct dispatch_rq_data *dispatch_data = data;
> +	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
> +	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
> +
> +	spin_lock(&ctx->lock);
> +	if (unlikely(!list_empty(&ctx->rq_list))) {
> +		dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
> +		list_del_init(&dispatch_data->rq->queuelist);
> +		if (list_empty(&ctx->rq_list))
> +			sbitmap_clear_bit(sb, bitnr);
> +	}
> +	spin_unlock(&ctx->lock);
> +
> +	return !dispatch_data->rq;
> +}
> +
> +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
> +					struct blk_mq_ctx *start)
> +{
> +	unsigned off = start ? start->index_hw : 0;
> +	struct dispatch_rq_data data = {
> +		.hctx = hctx,
> +		.rq   = NULL,
> +	};
> +
> +	__sbitmap_for_each_set(&hctx->ctx_map, off,
> +			       dispatch_rq_from_ctx, &data);
> +
> +	return data.rq;
> +}
> +
>  static inline unsigned int queued_to_index(unsigned int queued)
>  {
>  	if (!queued)
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index ef15b3414da5..231cfb0d973b 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
>  bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
>  bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
>  				bool wait);
> +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
> +					struct blk_mq_ctx *start);
>  
>  /*
>   * Internal helpers for allocating/freeing the request map
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 50c6485cb04f..7b7a366a97f3 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx {
>  
>  	struct sbitmap		ctx_map;
>  
> +	struct blk_mq_ctx	*dispatch_from;
> +
>  	struct blk_mq_ctx	**ctxs;
>  	unsigned int		nr_ctx;
>  
> -- 
> 2.9.5
> 

Ming Lei Oct. 12, 2017, 10:01 a.m. UTC | #2
On Tue, Oct 10, 2017 at 11:23:45AM -0700, Omar Sandoval wrote:
> On Mon, Oct 09, 2017 at 07:24:23PM +0800, Ming Lei wrote:
> > SCSI devices use host-wide tagset, and the shared driver tag space is
> > often quite big. Meantime there is also queue depth for each lun(
> > .cmd_per_lun), which is often small, for example, on both lpfc and
> > qla2xxx, .cmd_per_lun is just 3.
> > 
> > So lots of requests may stay in sw queue, and we always flush all
> > belonging to same hw queue and dispatch them all to driver, unfortunately
> > it is easy to cause queue busy because of the small .cmd_per_lun.
> > Once these requests are flushed out, they have to stay in hctx->dispatch,
> > and no bio merge can participate into these requests, and sequential IO
> > performance is hurt a lot.
> > 
> > This patch introduces blk_mq_dequeue_from_ctx for dequeuing request from
> > sw queue so that we can dispatch them in scheduler's way, then we can
> > avoid to dequeue too many requests from sw queue when ->dispatch isn't
> > flushed completely.
> > 
> > This patch improves dispatching from sw queue when there is per-request-queue
> > queue depth by taking request one by one from sw queue, just like the way
> > of IO scheduler.
> 
> This still didn't address Jens' concern about using q->queue_depth as
> the heuristic for whether to do the full sw queue flush or one-by-one
> dispatch. The EWMA approach is a bit too complex for now, can you please
> try the heuristic of whether the driver ever returned BLK_STS_RESOURCE?

That can be done easily, but I am not sure it is a good idea.

For example, in NVMe's queue_rq path, kmalloc(GFP_ATOMIC) is often
used. If kmalloc() returns NULL just once, BLK_STS_RESOURCE is returned
to blk-mq, and blk-mq will then never do a full sw queue flush again,
even if kmalloc() always succeeds from that time on.

Even the EWMA approach isn't good for SCSI-MQ, because
some SCSI drivers' .cmd_per_lun is very small, such as 3 on
lpfc and qla2xxx, so one full flush will trigger
BLK_STS_RESOURCE easily.

So I suggest using the q->queue_depth check first, since we haven't had
any performance-degradation reports for other devices (!q->queue_depth)
with blk-mq. We can improve on this in the future if we find a better
approach.

What do you think about it?
Jens Axboe Oct. 12, 2017, 2:52 p.m. UTC | #3
On 10/12/2017 04:01 AM, Ming Lei wrote:
> On Tue, Oct 10, 2017 at 11:23:45AM -0700, Omar Sandoval wrote:
>> On Mon, Oct 09, 2017 at 07:24:23PM +0800, Ming Lei wrote:
>>> SCSI devices use host-wide tagset, and the shared driver tag space is
>>> often quite big. Meantime there is also queue depth for each lun(
>>> .cmd_per_lun), which is often small, for example, on both lpfc and
>>> qla2xxx, .cmd_per_lun is just 3.
>>>
>>> So lots of requests may stay in sw queue, and we always flush all
>>> belonging to same hw queue and dispatch them all to driver, unfortunately
>>> it is easy to cause queue busy because of the small .cmd_per_lun.
>>> Once these requests are flushed out, they have to stay in hctx->dispatch,
>>> and no bio merge can participate into these requests, and sequential IO
>>> performance is hurt a lot.
>>>
>>> This patch introduces blk_mq_dequeue_from_ctx for dequeuing request from
>>> sw queue so that we can dispatch them in scheduler's way, then we can
>>> avoid to dequeue too many requests from sw queue when ->dispatch isn't
>>> flushed completely.
>>>
>>> This patch improves dispatching from sw queue when there is per-request-queue
>>> queue depth by taking request one by one from sw queue, just like the way
>>> of IO scheduler.
>>
>> This still didn't address Jens' concern about using q->queue_depth as
>> the heuristic for whether to do the full sw queue flush or one-by-one
>> dispatch. The EWMA approach is a bit too complex for now, can you please
>> try the heuristic of whether the driver ever returned BLK_STS_RESOURCE?
> 
> That can be done easily, but I am not sure if it is good.
> 
> For example, inside queue rq path of NVMe, kmalloc(GFP_ATOMIC) is
> often used, if kmalloc() returns NULL just once, BLK_STS_RESOURCE
> will be returned to blk-mq, then blk-mq will never do full sw
> queue flush even when kmalloc() always succeed from that time
> on.

Have it be a bit more than a single bit, then. Reset it every x IOs or
something like that; that'll be more representative of transient busy
conditions anyway.
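
(A rough sketch of that "more than a single bit" idea, with hypothetical
field names and an arbitrary reset interval; illustration only, not code
from this series.)

#define BLK_MQ_BUSY_RESET_INTERVAL	128	/* arbitrary "x IOs" */

static void blk_mq_update_busy_hint(struct blk_mq_hw_ctx *hctx,
				    blk_status_t ret)
{
	if (ret == BLK_STS_RESOURCE) {
		hctx->busy_hint = true;
		hctx->ios_since_busy = 0;
	} else if (hctx->busy_hint &&
		   ++hctx->ios_since_busy >= BLK_MQ_BUSY_RESET_INTERVAL) {
		/* no BLK_STS_RESOURCE for a while: allow full flushes again */
		hctx->busy_hint = false;
	}
}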

> Even EWMA approach isn't good on SCSI-MQ too, because
> some SCSI's .cmd_per_lun is very small, such as 3 on
> lpfc and qla2xxx, and one full flush will trigger
> BLK_STS_RESOURCE easily.
> 
> So I suggest to use the way of q->queue_depth first, since we
> don't get performance degrade report on other devices(!q->queue_depth)
> with blk-mq. We can improve this way in the future if we
> have better approach.
> 
> What do you think about it?

I think it's absolutely horrible, and I already explained why in great
detail in an earlier review. The tl;dr is that the fact that only scsi sets
->queue_depth right now is only coincidentally related to the fact that
only scsi also shares tags. Every driver should set the queue depth, so
that signal will go to zero very quickly.
Ming Lei Oct. 12, 2017, 3:22 p.m. UTC | #4
On Thu, Oct 12, 2017 at 08:52:12AM -0600, Jens Axboe wrote:
> On 10/12/2017 04:01 AM, Ming Lei wrote:
> > On Tue, Oct 10, 2017 at 11:23:45AM -0700, Omar Sandoval wrote:
> >> On Mon, Oct 09, 2017 at 07:24:23PM +0800, Ming Lei wrote:
> >>> SCSI devices use host-wide tagset, and the shared driver tag space is
> >>> often quite big. Meantime there is also queue depth for each lun(
> >>> .cmd_per_lun), which is often small, for example, on both lpfc and
> >>> qla2xxx, .cmd_per_lun is just 3.
> >>>
> >>> So lots of requests may stay in sw queue, and we always flush all
> >>> belonging to same hw queue and dispatch them all to driver, unfortunately
> >>> it is easy to cause queue busy because of the small .cmd_per_lun.
> >>> Once these requests are flushed out, they have to stay in hctx->dispatch,
> >>> and no bio merge can participate into these requests, and sequential IO
> >>> performance is hurt a lot.
> >>>
> >>> This patch introduces blk_mq_dequeue_from_ctx for dequeuing request from
> >>> sw queue so that we can dispatch them in scheduler's way, then we can
> >>> avoid to dequeue too many requests from sw queue when ->dispatch isn't
> >>> flushed completely.
> >>>
> >>> This patch improves dispatching from sw queue when there is per-request-queue
> >>> queue depth by taking request one by one from sw queue, just like the way
> >>> of IO scheduler.
> >>
> >> This still didn't address Jens' concern about using q->queue_depth as
> >> the heuristic for whether to do the full sw queue flush or one-by-one
> >> dispatch. The EWMA approach is a bit too complex for now, can you please
> >> try the heuristic of whether the driver ever returned BLK_STS_RESOURCE?
> > 
> > That can be done easily, but I am not sure if it is good.
> > 
> > For example, inside queue rq path of NVMe, kmalloc(GFP_ATOMIC) is
> > often used, if kmalloc() returns NULL just once, BLK_STS_RESOURCE
> > will be returned to blk-mq, then blk-mq will never do full sw
> > queue flush even when kmalloc() always succeed from that time
> > on.
> 
> Have it be a bit more than a single bit, then. Reset it every x IOs or
> something like that, that'll be more representative of transient busy
> conditions anyway.

OK, that can be done via a simplified EWMA by considering
the dispatch result only.

I will address it in V6.
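
(A sketch of what such a simplified EWMA of the dispatch result might look
like; the field name, constants and weights are assumptions for
illustration, not code that was posted.)

#define BLK_MQ_BUSY_EWMA_WEIGHT	8
#define BLK_MQ_BUSY_EWMA_UNIT	16

/* called once per dispatch attempt; busy = "driver returned BLK_STS_RESOURCE" */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
	unsigned int ewma = hctx->dispatch_busy;

	if (!ewma && !busy)
		return;

	/* ewma = (ewma * (W - 1) + busy * UNIT) / W */
	ewma *= BLK_MQ_BUSY_EWMA_WEIGHT - 1;
	if (busy)
		ewma += BLK_MQ_BUSY_EWMA_UNIT;
	ewma /= BLK_MQ_BUSY_EWMA_WEIGHT;

	hctx->dispatch_busy = ewma;
}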
Jens Axboe Oct. 12, 2017, 3:24 p.m. UTC | #5
On 10/12/2017 09:22 AM, Ming Lei wrote:
> On Thu, Oct 12, 2017 at 08:52:12AM -0600, Jens Axboe wrote:
>> On 10/12/2017 04:01 AM, Ming Lei wrote:
>>> On Tue, Oct 10, 2017 at 11:23:45AM -0700, Omar Sandoval wrote:
>>>> On Mon, Oct 09, 2017 at 07:24:23PM +0800, Ming Lei wrote:
>>>>> SCSI devices use host-wide tagset, and the shared driver tag space is
>>>>> often quite big. Meantime there is also queue depth for each lun(
>>>>> .cmd_per_lun), which is often small, for example, on both lpfc and
>>>>> qla2xxx, .cmd_per_lun is just 3.
>>>>>
>>>>> So lots of requests may stay in sw queue, and we always flush all
>>>>> belonging to same hw queue and dispatch them all to driver, unfortunately
>>>>> it is easy to cause queue busy because of the small .cmd_per_lun.
>>>>> Once these requests are flushed out, they have to stay in hctx->dispatch,
>>>>> and no bio merge can participate into these requests, and sequential IO
>>>>> performance is hurt a lot.
>>>>>
>>>>> This patch introduces blk_mq_dequeue_from_ctx for dequeuing request from
>>>>> sw queue so that we can dispatch them in scheduler's way, then we can
>>>>> avoid to dequeue too many requests from sw queue when ->dispatch isn't
>>>>> flushed completely.
>>>>>
>>>>> This patch improves dispatching from sw queue when there is per-request-queue
>>>>> queue depth by taking request one by one from sw queue, just like the way
>>>>> of IO scheduler.
>>>>
>>>> This still didn't address Jens' concern about using q->queue_depth as
>>>> the heuristic for whether to do the full sw queue flush or one-by-one
>>>> dispatch. The EWMA approach is a bit too complex for now, can you please
>>>> try the heuristic of whether the driver ever returned BLK_STS_RESOURCE?
>>>
>>> That can be done easily, but I am not sure if it is good.
>>>
>>> For example, inside queue rq path of NVMe, kmalloc(GFP_ATOMIC) is
>>> often used, if kmalloc() returns NULL just once, BLK_STS_RESOURCE
>>> will be returned to blk-mq, then blk-mq will never do full sw
>>> queue flush even when kmalloc() always succeed from that time
>>> on.
>>
>> Have it be a bit more than a single bit, then. Reset it every x IOs or
>> something like that, that'll be more representative of transient busy
>> conditions anyway.
> 
> OK, that can be done via a simplified EWMA by considering
> the dispatch result only.

Yes, if it's kept simple enough, then that would be fine. I'm not totally
against EWMA; I just don't want to have any of this over-engineered.
Especially not when it's a pretty simple thing: we don't care about
averages, basically only whether we ever see BLK_STS_RESOURCE in any kind
of recurring fashion.
Bart Van Assche Oct. 12, 2017, 3:33 p.m. UTC | #6
On Thu, 2017-10-12 at 18:01 +0800, Ming Lei wrote:
> Even EWMA approach isn't good on SCSI-MQ too, because
> some SCSI's .cmd_per_lun is very small, such as 3 on
> lpfc and qla2xxx, and one full flush will trigger
> BLK_STS_RESOURCE easily.
> 
> So I suggest to use the way of q->queue_depth first, since we
> don't get performance degrade report on other devices(!q->queue_depth)
> with blk-mq. We can improve this way in the future if we
> have better approach.

Measurements have shown that even with this patch series applied, sequential
I/O performance is still below that of the legacy block and SCSI layers, so
this patch series is not the final solution. (See also John Garry's e-mail
of October 10th - https://lkml.org/lkml/2017/10/10/401). I have been
wondering what could be causing that performance difference. Maybe it's
because requests can reside for a while in the hctx dispatch queue and hence
are invisible to the scheduler during that time? Should we
modify blk_mq_dispatch_rq_list() such that it puts back requests that have
not been accepted by .queue_rq() onto the scheduler queue(s) instead of onto
the hctx dispatch queue? If we made that change, would it allow us to
drop the patch "blk-mq-sched: improve dispatching from sw queue"?

Bart.
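
(A rough sketch of the alternative described above: hand requests that
.queue_rq() did not accept back to the I/O scheduler rather than parking
them on hctx->dispatch, so they stay visible for merging. Illustration
only; the helper name is an assumption and this is not a drop-in change
to blk_mq_dispatch_rq_list().)

static void blk_mq_return_to_sched(struct blk_mq_hw_ctx *hctx,
				   struct list_head *list)
{
	struct elevator_queue *e = hctx->queue->elevator;

	if (e && e->type->ops.mq.insert_requests) {
		/* give the requests back to the scheduler for merging */
		e->type->ops.mq.insert_requests(hctx, list, false);
	} else {
		/* no scheduler: fall back to the current behaviour */
		spin_lock(&hctx->lock);
		list_splice_tail_init(list, &hctx->dispatch);
		spin_unlock(&hctx->lock);
	}
}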

Jens Axboe Oct. 12, 2017, 3:37 p.m. UTC | #7
On 10/12/2017 09:33 AM, Bart Van Assche wrote:
> On Thu, 2017-10-12 at 18:01 +0800, Ming Lei wrote:
>> Even EWMA approach isn't good on SCSI-MQ too, because
>> some SCSI's .cmd_per_lun is very small, such as 3 on
>> lpfc and qla2xxx, and one full flush will trigger
>> BLK_STS_RESOURCE easily.
>>
>> So I suggest to use the way of q->queue_depth first, since we
>> don't get performance degrade report on other devices(!q->queue_depth)
>> with blk-mq. We can improve this way in the future if we
>> have better approach.
> 
> Measurements have shown that even with this patch series applied sequential
> I/O performance is still below that of the legacy block and SCSI layers. So
> this patch series is not the final solution. (See also John Garry's e-mail
> of October 10th - https://lkml.org/lkml/2017/10/10/401). I have been
> wondering what could be causing that performance difference. Maybe it's
> because requests can reside for a while in the hctx dispatch queue and hence
> are unvisible for the scheduler while in the hctx dispatch queue? Should we
> modify blk_mq_dispatch_rq_list() such that it puts back requests that have
> not been accepted by .queue_rq() onto the scheduler queue(s) instead of to
> the hctx dispatch queue? If we would make that change, would it allow us to
> drop patch "blk-mq-sched: improve dispatching from sw queue"?

Yes, it's clear that even with the full series, we're not completely there
yet. We are closer, though, and I do want to close that gap up as much
as we can. I think everybody will be more motivated and have an easier time
getting the last bit of the way there, once we have a good foundation in.

It may be the reason that you hint at, if we do see a lot of requeueing
or BUSY in the test case. That would prematurely move requests from the
scheduler's knowledge and into the hctx->dispatch holding area. It'd be
useful to have a standard SATA test run and see if we're missing merging
in that case (since merging is what it boils down to). If we are, then
it's not hctx->dispatch issues.
Ming Lei Oct. 12, 2017, 3:49 p.m. UTC | #8
On Thu, Oct 12, 2017 at 09:37:11AM -0600, Jens Axboe wrote:
> On 10/12/2017 09:33 AM, Bart Van Assche wrote:
> > On Thu, 2017-10-12 at 18:01 +0800, Ming Lei wrote:
> >> Even EWMA approach isn't good on SCSI-MQ too, because
> >> some SCSI's .cmd_per_lun is very small, such as 3 on
> >> lpfc and qla2xxx, and one full flush will trigger
> >> BLK_STS_RESOURCE easily.
> >>
> >> So I suggest to use the way of q->queue_depth first, since we
> >> don't get performance degrade report on other devices(!q->queue_depth)
> >> with blk-mq. We can improve this way in the future if we
> >> have better approach.
> > 
> > Measurements have shown that even with this patch series applied sequential
> > I/O performance is still below that of the legacy block and SCSI layers. So
> > this patch series is not the final solution. (See also John Garry's e-mail
> > of October 10th - https://lkml.org/lkml/2017/10/10/401). I have been
> > wondering what could be causing that performance difference. Maybe it's
> > because requests can reside for a while in the hctx dispatch queue and hence
> > are unvisible for the scheduler while in the hctx dispatch queue? Should we
> > modify blk_mq_dispatch_rq_list() such that it puts back requests that have
> > not been accepted by .queue_rq() onto the scheduler queue(s) instead of to
> > the hctx dispatch queue? If we would make that change, would it allow us to
> > drop patch "blk-mq-sched: improve dispatching from sw queue"?
> 
> Yes, it's clear that even with the full series, we're not completely there
> yet. We are closer, though, and I do want to close that gap up as much
> as we can. I think everybody will be more motivated and have an easier time
> getting the last bit of the way there, once we have a good foundation in.
> 
> It may be the reason that you hint at, if we do see a lot of requeueing
> or BUSY in the test case. That would prematurely move requests from the
> schedulers knowledge and into the hctx->dispatch holding area. It'd be
> useful to have a standard SATA test run and see if we're missing merging
> in that case (since merging is what it boils down to). If we are, then
> it's not hctx->dispatch issues.

From Garry's test result on the .get_budget()/.put_budget() patches [1],
the sequential I/O performance is still not good. That suggests the
issue may not be in IO merging, because .get_budget()/.put_budget() is
more helpful for I/O merging than the legacy block path.

Actually, in my virtio-scsi test, blk-mq with the
.get_budget()/.put_budget() approach has been better than the legacy
block path.


[1] https://github.com/ming1/linux/commits/blk_mq_improve_scsi_mpath_perf_V6.2_test
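
(The budget idea in a nutshell: before pulling a request out of the
sw/scheduler queue, ask the driver whether the target, e.g. the SCSI LUN
capped by .cmd_per_lun, can take one more command; if not, stop dequeuing
so the remaining bios can still be merged. The helper names and signatures
below are assumptions for illustration; see the referenced branch for the
actual interface.)

static struct request *blk_mq_dequeue_with_budget(struct blk_mq_hw_ctx *hctx,
						  struct blk_mq_ctx *ctx)
{
	struct request *rq;

	if (!blk_mq_get_dispatch_budget(hctx))
		return NULL;		/* LUN already full, leave requests queued */

	rq = blk_mq_dequeue_from_ctx(hctx, ctx);
	if (!rq)
		blk_mq_put_dispatch_budget(hctx);	/* nothing taken, give it back */

	return rq;
}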

Patch

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index be29ba849408..14b354f617e5 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -104,6 +104,39 @@  static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 	} while (blk_mq_dispatch_rq_list(q, &rq_list));
 }
 
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+					  struct blk_mq_ctx *ctx)
+{
+	unsigned idx = ctx->index_hw;
+
+	if (++idx == hctx->nr_ctx)
+		idx = 0;
+
+	return hctx->ctxs[idx];
+}
+
+static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+	LIST_HEAD(rq_list);
+	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+
+	do {
+		struct request *rq;
+
+		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
+		if (!rq)
+			break;
+		list_add(&rq->queuelist, &rq_list);
+
+		/* round robin for fair dispatch */
+		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+	} while (blk_mq_dispatch_rq_list(q, &rq_list));
+
+	WRITE_ONCE(hctx->dispatch_from, ctx);
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
@@ -143,10 +176,23 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
-			blk_mq_do_dispatch_sched(hctx);
+		if (blk_mq_dispatch_rq_list(q, &rq_list)) {
+			if (has_sched_dispatch)
+				blk_mq_do_dispatch_sched(hctx);
+			else
+				blk_mq_do_dispatch_ctx(hctx);
+		}
 	} else if (has_sched_dispatch) {
 		blk_mq_do_dispatch_sched(hctx);
+	} else if (q->queue_depth) {
+		/*
+		 * If there is per-request_queue depth, we dequeue
+		 * request one by one from sw queue for avoiding to mess
+		 * up I/O merge when dispatch runs out of resource, which
+		 * can be triggered easily when there is per-request_queue
+		 * queue depth or .cmd_per_lun, such as SCSI device.
+		 */
+		blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 076cbab9c3e0..394cb75d66fa 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -911,6 +911,45 @@  void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 }
 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
+struct dispatch_rq_data {
+	struct blk_mq_hw_ctx *hctx;
+	struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
+		void *data)
+{
+	struct dispatch_rq_data *dispatch_data = data;
+	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
+	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+	spin_lock(&ctx->lock);
+	if (unlikely(!list_empty(&ctx->rq_list))) {
+		dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+		list_del_init(&dispatch_data->rq->queuelist);
+		if (list_empty(&ctx->rq_list))
+			sbitmap_clear_bit(sb, bitnr);
+	}
+	spin_unlock(&ctx->lock);
+
+	return !dispatch_data->rq;
+}
+
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+					struct blk_mq_ctx *start)
+{
+	unsigned off = start ? start->index_hw : 0;
+	struct dispatch_rq_data data = {
+		.hctx = hctx,
+		.rq   = NULL,
+	};
+
+	__sbitmap_for_each_set(&hctx->ctx_map, off,
+			       dispatch_rq_from_ctx, &data);
+
+	return data.rq;
+}
+
 static inline unsigned int queued_to_index(unsigned int queued)
 {
 	if (!queued)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ef15b3414da5..231cfb0d973b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -35,6 +35,8 @@  void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 				bool wait);
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+					struct blk_mq_ctx *start);
 
 /*
  * Internal helpers for allocating/freeing the request map
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 50c6485cb04f..7b7a366a97f3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -30,6 +30,8 @@  struct blk_mq_hw_ctx {
 
 	struct sbitmap		ctx_map;
 
+	struct blk_mq_ctx	*dispatch_from;
+
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;