[05/14] blk-mq-sched: don't dequeue request until all in ->dispatch are flushed

Message ID 20170731165111.11536-7-ming.lei@redhat.com (mailing list archive)
State Changes Requested, archived

Commit Message

Ming Lei July 31, 2017, 4:51 p.m. UTC
During dispatch, we move all requests from hctx->dispatch to
a temporary list, then dispatch them one by one from this list.
Unfortunately, during this period a queue run from another context
may think the queue is idle and start to dequeue from the sw/scheduler
queue and try to dispatch, because ->dispatch is empty.

This hurts sequential I/O performance because requests are
dequeued while the queue is busy.
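
A simplified sketch of the race window (an illustrative timeline, not
actual code):

    CPU0                                    CPU1
    blk_mq_sched_dispatch_requests()
      splice ->dispatch into local rq_list
      (->dispatch now looks empty)
                                            blk_mq_sched_dispatch_requests()
                                              ->dispatch is empty, queue
                                              looks idle: dequeue from the
                                              sw/scheduler queue and dispatch
      dispatch rq_list (queue actually busy)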

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c   | 24 ++++++++++++++++++------
 include/linux/blk-mq.h |  1 +
 2 files changed, 19 insertions(+), 6 deletions(-)

Comments

Bart Van Assche July 31, 2017, 11:42 p.m. UTC | #1
On Tue, 2017-08-01 at 00:51 +0800, Ming Lei wrote:
> During dispatch, we move all requests from hctx->dispatch to
> a temporary list, then dispatch them one by one from this list.
> Unfortunately, during this period a queue run from another context
> may think the queue is idle and start to dequeue from the sw/scheduler
> queue and try to dispatch, because ->dispatch is empty.
> 
> This hurts sequential I/O performance because requests are
> dequeued while the queue is busy.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-sched.c   | 24 ++++++++++++++++++------
>  include/linux/blk-mq.h |  1 +
>  2 files changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 3510c01cb17b..eb638063673f 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -112,8 +112,15 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  	 */
>  	if (!list_empty_careful(&hctx->dispatch)) {
>  		spin_lock(&hctx->lock);
> -		if (!list_empty(&hctx->dispatch))
> +		if (!list_empty(&hctx->dispatch)) {
>  			list_splice_init(&hctx->dispatch, &rq_list);
> +
> +			/*
> +			 * BUSY won't be cleared until all requests
> +			 * in hctx->dispatch are dispatched successfully
> +			 */
> +			set_bit(BLK_MQ_S_BUSY, &hctx->state);
> +		}
>  		spin_unlock(&hctx->lock);
>  	}
>  
> @@ -129,15 +136,20 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  	if (!list_empty(&rq_list)) {
>  		blk_mq_sched_mark_restart_hctx(hctx);
>  		can_go = blk_mq_dispatch_rq_list(q, &rq_list);
> -	} else if (!has_sched_dispatch && !q->queue_depth) {
> -		blk_mq_flush_busy_ctxs(hctx, &rq_list);
> -		blk_mq_dispatch_rq_list(q, &rq_list);
> -		can_go = false;
> +		if (can_go)
> +			clear_bit(BLK_MQ_S_BUSY, &hctx->state);
>  	}
>  
> -	if (!can_go)
> +	/* can't go until ->dispatch is flushed */
> +	if (!can_go || test_bit(BLK_MQ_S_BUSY, &hctx->state))
>  		return;
>  
> +	if (!has_sched_dispatch && !q->queue_depth) {
> +		blk_mq_flush_busy_ctxs(hctx, &rq_list);
> +		blk_mq_dispatch_rq_list(q, &rq_list);
> +		return;
> +	}

Hello Ming,

Since setting, clearing and testing of BLK_MQ_S_BUSY can happen concurrently,
and since clearing and testing happen without any locks held, I'm afraid this
patch introduces the following race conditions:
* Clearing of BLK_MQ_S_BUSY immediately after this bit has been set, resulting
  in this bit not being set although there are requests on the dispatch list.
* Checking BLK_MQ_S_BUSY after requests have been added to the dispatch list
  but before that bit is set, resulting in test_bit(BLK_MQ_S_BUSY, &hctx->state)
  reporting that BLK_MQ_S_BUSY has not been set although there are requests
  on the dispatch list.
* Checking BLK_MQ_S_BUSY after requests have been removed from the dispatch list
  but before that bit is cleared, resulting in test_bit(BLK_MQ_S_BUSY, &hctx->state)
  reporting that BLK_MQ_S_BUSY has been set although there are no requests
  on the dispatch list.

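For example, narrowing these windows would require manipulating the bit
under hctx->lock, along the lines of this hypothetical, untested sketch:

	/* set the bit under hctx->lock, together with the splice */
	spin_lock(&hctx->lock);
	if (!list_empty(&hctx->dispatch)) {
		list_splice_init(&hctx->dispatch, &rq_list);
		set_bit(BLK_MQ_S_BUSY, &hctx->state);
	}
	spin_unlock(&hctx->lock);

	...

	/* clear it under the same lock, only while ->dispatch is empty */
	spin_lock(&hctx->lock);
	if (list_empty(&hctx->dispatch))
		clear_bit(BLK_MQ_S_BUSY, &hctx->state);
	spin_unlock(&hctx->lock);
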
Bart.
Ming Lei Aug. 1, 2017, 10:44 a.m. UTC | #2
On Mon, Jul 31, 2017 at 11:42:21PM +0000, Bart Van Assche wrote:
> On Tue, 2017-08-01 at 00:51 +0800, Ming Lei wrote:
> > During dispatch, we move all requests from hctx->dispatch to
> > a temporary list, then dispatch them one by one from this list.
> > Unfortunately, during this period a queue run from another context
> > may think the queue is idle and start to dequeue from the sw/scheduler
> > queue and try to dispatch, because ->dispatch is empty.
> > 
> > This hurts sequential I/O performance because requests are
> > dequeued while the queue is busy.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq-sched.c   | 24 ++++++++++++++++++------
> >  include/linux/blk-mq.h |  1 +
> >  2 files changed, 19 insertions(+), 6 deletions(-)
> > 
> > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > index 3510c01cb17b..eb638063673f 100644
> > --- a/block/blk-mq-sched.c
> > +++ b/block/blk-mq-sched.c
> > @@ -112,8 +112,15 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> >  	 */
> >  	if (!list_empty_careful(&hctx->dispatch)) {
> >  		spin_lock(&hctx->lock);
> > -		if (!list_empty(&hctx->dispatch))
> > +		if (!list_empty(&hctx->dispatch)) {
> >  			list_splice_init(&hctx->dispatch, &rq_list);
> > +
> > +			/*
> > +			 * BUSY won't be cleared until all requests
> > +			 * in hctx->dispatch are dispatched successfully
> > +			 */
> > +			set_bit(BLK_MQ_S_BUSY, &hctx->state);
> > +		}
> >  		spin_unlock(&hctx->lock);
> >  	}
> >  
> > @@ -129,15 +136,20 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> >  	if (!list_empty(&rq_list)) {
> >  		blk_mq_sched_mark_restart_hctx(hctx);
> >  		can_go = blk_mq_dispatch_rq_list(q, &rq_list);
> > -	} else if (!has_sched_dispatch && !q->queue_depth) {
> > -		blk_mq_flush_busy_ctxs(hctx, &rq_list);
> > -		blk_mq_dispatch_rq_list(q, &rq_list);
> > -		can_go = false;
> > +		if (can_go)
> > +			clear_bit(BLK_MQ_S_BUSY, &hctx->state);
> >  	}
> >  
> > -	if (!can_go)
> > +	/* can't go until ->dispatch is flushed */
> > +	if (!can_go || test_bit(BLK_MQ_S_BUSY, &hctx->state))
> >  		return;
> >  
> > +	if (!has_sched_dispatch && !q->queue_depth) {
> > +		blk_mq_flush_busy_ctxs(hctx, &rq_list);
> > +		blk_mq_dispatch_rq_list(q, &rq_list);
> > +		return;
> > +	}
> 
> Hello Ming,
> 
> Since setting, clearing and testing of BLK_MQ_S_BUSY can happen concurrently,
> and since clearing and testing happen without any locks held, I'm afraid this

Yes, I really want to avoid locking here.

> patch introduces the following race conditions:
> * Clearing of BLK_MQ_S_BUSY immediately after this bit has been set, resulting
>   in this bit not being set although there are requests on the dispatch list.

The window is small enough to be harmless in practice.

And in the context that sets the BUSY bit, dispatch still can't move on,
because 'can_go' will stop it.

Even if it happens, it's no big deal; it just means one request is dequeued
a bit early. What we really need to avoid is an I/O hang.


> * Checking BLK_MQ_S_BUSY after requests have been added to the dispatch list
>   but before that bit is set, resulting in test_bit(BLK_MQ_S_BUSY, &hctx->state)
>   reporting that BLK_MQ_S_BUSY has not been set although there are requests
>   on the dispatch list.

Same as above: no big deal, we can survive that.


> * Checking BLK_MQ_S_BUSY after requests have been removed from the dispatch list
>   but before that bit is cleared, resulting in test_bit(BLK_MQ_S_BUSY, &hctx->state)
>   reporting that BLK_MQ_S_BUSY has been set although there are no requests
>   on the dispatch list.

That won't be a problem, because dispatch will be started in the
context in which the dispatch list is flushed, since the BUSY bit
is cleared after blk_mq_dispatch_rq_list() returns. So there is no
I/O hang.
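
In other words (a simplified timeline, assuming just these two contexts):

    CPU0 (flushing ->dispatch)              CPU1 (another queue run)
    blk_mq_dispatch_rq_list() returns true
                                            test_bit(BLK_MQ_S_BUSY): still set
                                            return without dequeuing
    clear_bit(BLK_MQ_S_BUSY)
    continue: dequeue from the sw/scheduler
    queue and dispatch

CPU1 backs off spuriously, but CPU0 itself keeps making progress, so nothing
is stalled.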
Bart Van Assche Aug. 1, 2017, 4:14 p.m. UTC | #3
On Tue, 2017-08-01 at 18:44 +0800, Ming Lei wrote:
> On Mon, Jul 31, 2017 at 11:42:21PM +0000, Bart Van Assche wrote:
> > Since setting, clearing and testing of BLK_MQ_S_BUSY can happen concurrently,
> > and since clearing and testing happen without any locks held, I'm afraid this
> > patch introduces the following race conditions:
> > [ ... ]
> > * Checking BLK_MQ_S_BUSY after requests have been removed from the dispatch list
> >   but before that bit is cleared, resulting in test_bit(BLK_MQ_S_BUSY, &hctx->state)
> >   reporting that BLK_MQ_S_BUSY has been set although there are no requests
> >   on the dispatch list.
> 
> That won't be a problem, because dispatch will be started in the
> context in which the dispatch list is flushed, since the BUSY bit
> is cleared after blk_mq_dispatch_rq_list() returns. So there is no
> I/O hang.

Hello Ming,

Please consider changing the name of the BLK_MQ_S_BUSY constant. That bit
is used to serialize dispatching requests from the hctx dispatch list but
that's not clear from the name of that constant.

Thanks,

Bart.
Ming Lei Aug. 2, 2017, 3:01 a.m. UTC | #4
On Tue, Aug 01, 2017 at 04:14:07PM +0000, Bart Van Assche wrote:
> On Tue, 2017-08-01 at 18:44 +0800, Ming Lei wrote:
> > On Mon, Jul 31, 2017 at 11:42:21PM +0000, Bart Van Assche wrote:
> > > Since setting, clearing and testing of BLK_MQ_S_BUSY can happen concurrently,
> > > and since clearing and testing happen without any locks held, I'm afraid this
> > > patch introduces the following race conditions:
> > > [ ... ]
> > > * Checking BLK_MQ_S_BUSY after requests have been removed from the dispatch list
> > >   but before that bit is cleared, resulting in test_bit(BLK_MQ_S_BUSY, &hctx->state)
> > >   reporting that BLK_MQ_S_BUSY has been set although there are no requests
> > >   on the dispatch list.
> > 
> > That won't be a problem, because dispatch will be started in the
> > context in which the dispatch list is flushed, since the BUSY bit
> > is cleared after blk_mq_dispatch_rq_list() returns. So there is no
> > I/O hang.
> 
> Hello Ming,
> 
> Please consider changing the name of the BLK_MQ_S_BUSY constant. That bit
> is used to serialize dispatching requests from the hctx dispatch list but
> that's not clear from the name of that constant.

Actually what we want to do is to stop taking requests from the sw/scheduler
queue while ->dispatch isn't flushed completely. I think BUSY isn't
a bad name for this case, or how about DISPATCH_BUSY? Or
FLUSHING_DISPATCH?

After thinking about the handling further: we can set the
BUSY bit just when adding requests to ->dispatch, and clear the
bit after returning from blk_mq_dispatch_rq_list(), once the
current local list (spliced from ->dispatch) is flushed completely and
->dispatch is empty. This minimizes the race window and is
still safe, because dispatch always moves on: either a new
request has been added to ->dispatch, or ->dispatch has been flushed
completely.

Anyway, a comment should be added to clarify this.
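
A minimal sketch of that refinement (the insert-side helper name
blk_mq_add_rq_to_dispatch() is made up for illustration; the real insert
paths differ):

	static void blk_mq_add_rq_to_dispatch(struct blk_mq_hw_ctx *hctx,
					      struct request *rq)
	{
		spin_lock(&hctx->lock);
		list_add_tail(&rq->queuelist, &hctx->dispatch);
		/* mark busy as soon as ->dispatch becomes non-empty */
		set_bit(BLK_MQ_S_BUSY, &hctx->state);
		spin_unlock(&hctx->lock);
	}

	/* in blk_mq_sched_dispatch_requests(), after flushing rq_list */
	if (blk_mq_dispatch_rq_list(q, &rq_list) &&
	    list_empty_careful(&hctx->dispatch))
		/* local list fully dispatched and nothing was re-added */
		clear_bit(BLK_MQ_S_BUSY, &hctx->state);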
Bart Van Assche Aug. 3, 2017, 1:33 a.m. UTC | #5
On Wed, 2017-08-02 at 11:01 +0800, Ming Lei wrote:
> On Tue, Aug 01, 2017 at 04:14:07PM +0000, Bart Van Assche wrote:
> > On Tue, 2017-08-01 at 18:44 +0800, Ming Lei wrote:
> > > On Mon, Jul 31, 2017 at 11:42:21PM +0000, Bart Van Assche wrote:
> > > > Since setting, clearing and testing of BLK_MQ_S_BUSY can happen concurrently,
> > > > and since clearing and testing happen without any locks held, I'm afraid this
> > > > patch introduces the following race conditions:
> > > > [ ... ]
> > > > * Checking BLK_MQ_S_BUSY after requests have been removed from the dispatch list
> > > >   but before that bit is cleared, resulting in test_bit(BLK_MQ_S_BUSY, &hctx->state)
> > > >   reporting that BLK_MQ_S_BUSY has been set although there are no requests
> > > >   on the dispatch list.
> > > 
> > > That won't be a problem, because dispatch will be started in the
> > > context in which the dispatch list is flushed, since the BUSY bit
> > > is cleared after blk_mq_dispatch_rq_list() returns. So there is no
> > > I/O hang.
> > 
> > Hello Ming,
> > 
> > Please consider changing the name of the BLK_MQ_S_BUSY constant. That bit
> > is used to serialize dispatching requests from the hctx dispatch list but
> > that's not clear from the name of that constant.
> 
> Actually what we want to do is to stop taking requests from the sw/scheduler
> queue while ->dispatch isn't flushed completely. I think BUSY isn't
> a bad name for this case, or how about DISPATCH_BUSY? Or
> FLUSHING_DISPATCH?

Hello Ming,

FLUSHING_DISPATCH sounds fine to me. In case you would prefer a shorter name,
how about BLK_MQ_S_DISPATCHING (refers to dispatching requests to the driver)?

Bart.

Patch

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 3510c01cb17b..eb638063673f 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -112,8 +112,15 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty_careful(&hctx->dispatch)) {
 		spin_lock(&hctx->lock);
-		if (!list_empty(&hctx->dispatch))
+		if (!list_empty(&hctx->dispatch)) {
 			list_splice_init(&hctx->dispatch, &rq_list);
+
+			/*
+			 * BUSY won't be cleared until all requests
+			 * in hctx->dispatch are dispatched successfully
+			 */
+			set_bit(BLK_MQ_S_BUSY, &hctx->state);
+		}
 		spin_unlock(&hctx->lock);
 	}
 
@@ -129,15 +136,20 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
 		can_go = blk_mq_dispatch_rq_list(q, &rq_list);
-	} else if (!has_sched_dispatch && !q->queue_depth) {
-		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list);
-		can_go = false;
+		if (can_go)
+			clear_bit(BLK_MQ_S_BUSY, &hctx->state);
 	}
 
-	if (!can_go)
+	/* can't go until ->dispatch is flushed */
+	if (!can_go || test_bit(BLK_MQ_S_BUSY, &hctx->state))
 		return;
 
+	if (!has_sched_dispatch && !q->queue_depth) {
+		blk_mq_flush_busy_ctxs(hctx, &rq_list);
+		blk_mq_dispatch_rq_list(q, &rq_list);
+		return;
+	}
+
 	/*
 	 * We want to dispatch from the scheduler if we had no work left
 	 * on the dispatch list, OR if we did have work but weren't able
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 14542308d25b..6d44b242b495 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -172,6 +172,7 @@  enum {
 	BLK_MQ_S_SCHED_RESTART	= 2,
 	BLK_MQ_S_TAG_WAITING	= 3,
 	BLK_MQ_S_START_ON_RUN	= 4,
+	BLK_MQ_S_BUSY		= 5,
 
 	BLK_MQ_MAX_DEPTH	= 10240,