diff mbox series

block: Limit number of items taken from the I/O scheduler in one go

Message ID 20200206211222.83170-1-sqazi@google.com (mailing list archive)
State New, archived
Headers show
Series block: Limit number of items taken from the I/O scheduler in one go | expand

Commit Message

Salman Qazi Feb. 6, 2020, 9:12 p.m. UTC
Flushes bypass the I/O scheduler and get added to hctx->dispatch
in blk_mq_sched_bypass_insert.  This can happen while a kworker is running
hctx->run_work work item and is past the point in
blk_mq_sched_dispatch_requests where hctx->dispatch is checked.

The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
because the I/O scheduler can feed an arbitrary number of commands.

Since we have only one hctx->run_work, the commands waiting in
hctx->dispatch will wait an arbitrary length of time for run_work to be
rerun.

A similar phenomenon exists with dispatches from the software queue.

The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
blk_mq_do_dispatch_ctx and return from the run_work handler and let it
rerun.

Signed-off-by: Salman Qazi <sqazi@google.com>
---
 block/blk-mq-sched.c | 47 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

Comments

Ming Lei Feb. 7, 2020, 2:07 a.m. UTC | #1
On Thu, Feb 06, 2020 at 01:12:22PM -0800, Salman Qazi wrote:
> Flushes bypass the I/O scheduler and get added to hctx->dispatch
> in blk_mq_sched_bypass_insert.  This can happen while a kworker is running
> hctx->run_work work item and is past the point in
> blk_mq_sched_dispatch_requests where hctx->dispatch is checked.
> 
> The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
> because the I/O scheduler can feed an arbitrary number of commands.
> 
> Since we have only one hctx->run_work, the commands waiting in
> hctx->dispatch will wait an arbitrary length of time for run_work to be
> rerun.
> 
> A similar phenomenon exists with dispatches from the software queue.
> 
> The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
> blk_mq_do_dispatch_ctx and return from the run_work handler and let it
> rerun.
> 
> Signed-off-by: Salman Qazi <sqazi@google.com>
> ---
>  block/blk-mq-sched.c | 47 ++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 41 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index ca22afd47b3d..84dde147f646 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -84,12 +84,16 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
>   * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
>   * its queue by itself in its completion handler, so we don't need to
>   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> + *
> + * Returns true if hctx->dispatch was found non-empty and
> + * run_work has to be run again.
>   */
> -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct request_queue *q = hctx->queue;
>  	struct elevator_queue *e = q->elevator;
>  	LIST_HEAD(rq_list);
> +	bool ret = false;
>  
>  	do {
>  		struct request *rq;
> @@ -97,6 +101,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>  		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
>  			break;
>  
> +		if (!list_empty_careful(&hctx->dispatch)) {
> +			ret = true;
> +			break;
> +		}
> +
>  		if (!blk_mq_get_dispatch_budget(hctx))
>  			break;
>  
> @@ -113,6 +122,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>  		 */
>  		list_add(&rq->queuelist, &rq_list);
>  	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> +
> +	return ret;
>  }
>  
>  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> @@ -130,16 +141,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
>   * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
>   * its queue by itself in its completion handler, so we don't need to
>   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> + *
> + * Returns true if hctx->dispatch was found non-empty and
> + * run_work has to be run again.
>   */
> -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct request_queue *q = hctx->queue;
>  	LIST_HEAD(rq_list);
>  	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> +	bool ret = false;
>  
>  	do {
>  		struct request *rq;
>  
> +		if (!list_empty_careful(&hctx->dispatch)) {
> +			ret = true;
> +			break;
> +		}
> +
>  		if (!sbitmap_any_bit_set(&hctx->ctx_map))
>  			break;
>  
> @@ -165,6 +185,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
>  	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
>  
>  	WRITE_ONCE(hctx->dispatch_from, ctx);
> +	return ret;
>  }
>  
>  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> @@ -172,6 +193,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  	struct request_queue *q = hctx->queue;
>  	struct elevator_queue *e = q->elevator;
>  	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> +	bool run_again;
> +	bool restarted = false;
>  	LIST_HEAD(rq_list);
>  
>  	/* RCU or SRCU read lock is needed before checking quiesced flag */
> @@ -180,6 +203,9 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  
>  	hctx->run++;
>  
> +again:
> +	run_again = false;
> +
>  	/*
>  	 * If we have previous entries on our dispatch list, grab them first for
>  	 * more fair dispatch.
> @@ -208,19 +234,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  		blk_mq_sched_mark_restart_hctx(hctx);
>  		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
>  			if (has_sched_dispatch)
> -				blk_mq_do_dispatch_sched(hctx);
> +				run_again = blk_mq_do_dispatch_sched(hctx);
>  			else
> -				blk_mq_do_dispatch_ctx(hctx);
> +				run_again = blk_mq_do_dispatch_ctx(hctx);
>  		}
>  	} else if (has_sched_dispatch) {
> -		blk_mq_do_dispatch_sched(hctx);
> +		run_again = blk_mq_do_dispatch_sched(hctx);
>  	} else if (hctx->dispatch_busy) {
>  		/* dequeue request one by one from sw queue if queue is busy */
> -		blk_mq_do_dispatch_ctx(hctx);
> +		run_again = blk_mq_do_dispatch_ctx(hctx);
>  	} else {
>  		blk_mq_flush_busy_ctxs(hctx, &rq_list);
>  		blk_mq_dispatch_rq_list(q, &rq_list, false);
>  	}
> +
> +	if (run_again) {
> +		if (!restarted) {
> +			restarted = true;
> +			goto again;
> +		}
> +
> +		blk_mq_run_hw_queue(hctx, true);
> +	}
>  }
>  
>  bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
> -- 
> 2.25.0.341.g760bfbb309-goog
> 

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Bart Van Assche Feb. 7, 2020, 3:26 p.m. UTC | #2
On 2020-02-06 13:12, Salman Qazi wrote:
> + *
> + * Returns true if hctx->dispatch was found non-empty and
> + * run_work has to be run again.

Please elaborate this comment and explain why this is necessary (to
avoid that flush processing is postponed forever).

> + * Returns true if hctx->dispatch was found non-empty and
> + * run_work has to be run again.

Same comment here.

> +again:
> +	run_again = false;
> +
>  	/*
>  	 * If we have previous entries on our dispatch list, grab them first for
>  	 * more fair dispatch.
> @@ -208,19 +234,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  		blk_mq_sched_mark_restart_hctx(hctx);
>  		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
>  			if (has_sched_dispatch)
> -				blk_mq_do_dispatch_sched(hctx);
> +				run_again = blk_mq_do_dispatch_sched(hctx);
>  			else
> -				blk_mq_do_dispatch_ctx(hctx);
> +				run_again = blk_mq_do_dispatch_ctx(hctx);
>  		}
>  	} else if (has_sched_dispatch) {
> -		blk_mq_do_dispatch_sched(hctx);
> +		run_again = blk_mq_do_dispatch_sched(hctx);
>  	} else if (hctx->dispatch_busy) {
>  		/* dequeue request one by one from sw queue if queue is busy */
> -		blk_mq_do_dispatch_ctx(hctx);
> +		run_again = blk_mq_do_dispatch_ctx(hctx);
>  	} else {
>  		blk_mq_flush_busy_ctxs(hctx, &rq_list);
>  		blk_mq_dispatch_rq_list(q, &rq_list, false);
>  	}
> +
> +	if (run_again) {
> +		if (!restarted) {
> +			restarted = true;
> +			goto again;
> +		}
> +
> +		blk_mq_run_hw_queue(hctx, true);
> +	}

So this patch changes blk_mq_sched_dispatch_requests() such that it
iterates at most two times? How about implementing that loop with an
explicit for-loop? I think that will make
blk_mq_sched_dispatch_requests() easier to read. As you may know forward
goto's are accepted in kernel code but backward goto's are frowned upon.

Thanks,

Bart.
Salman Qazi Feb. 7, 2020, 6:45 p.m. UTC | #3
On Fri, Feb 7, 2020 at 7:26 AM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 2020-02-06 13:12, Salman Qazi wrote:
> > + *
> > + * Returns true if hctx->dispatch was found non-empty and
> > + * run_work has to be run again.
>
> Please elaborate this comment and explain why this is necessary (to
> avoid that flush processing is postponed forever).
>
> > + * Returns true if hctx->dispatch was found non-empty and
> > + * run_work has to be run again.
>
> Same comment here.

Will do.

>
> > +again:
> > +     run_again = false;
> > +
> >       /*
> >        * If we have previous entries on our dispatch list, grab them first for
> >        * more fair dispatch.
> > @@ -208,19 +234,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> >               blk_mq_sched_mark_restart_hctx(hctx);
> >               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> >                       if (has_sched_dispatch)
> > -                             blk_mq_do_dispatch_sched(hctx);
> > +                             run_again = blk_mq_do_dispatch_sched(hctx);
> >                       else
> > -                             blk_mq_do_dispatch_ctx(hctx);
> > +                             run_again = blk_mq_do_dispatch_ctx(hctx);
> >               }
> >       } else if (has_sched_dispatch) {
> > -             blk_mq_do_dispatch_sched(hctx);
> > +             run_again = blk_mq_do_dispatch_sched(hctx);
> >       } else if (hctx->dispatch_busy) {
> >               /* dequeue request one by one from sw queue if queue is busy */
> > -             blk_mq_do_dispatch_ctx(hctx);
> > +             run_again = blk_mq_do_dispatch_ctx(hctx);
> >       } else {
> >               blk_mq_flush_busy_ctxs(hctx, &rq_list);
> >               blk_mq_dispatch_rq_list(q, &rq_list, false);
> >       }
> > +
> > +     if (run_again) {
> > +             if (!restarted) {
> > +                     restarted = true;
> > +                     goto again;
> > +             }
> > +
> > +             blk_mq_run_hw_queue(hctx, true);
> > +     }
>
> So this patch changes blk_mq_sched_dispatch_requests() such that it
> iterates at most two times? How about implementing that loop with an
> explicit for-loop? I think that will make
> blk_mq_sched_dispatch_requests() easier to read. As you may know forward
> goto's are accepted in kernel code but backward goto's are frowned upon.
>

About the goto, I don't know if backwards gotos in general are frowned
upon.  There are plenty of examples
in the kernel source.  This particular label, 'again' for instance:

$ grep -r again: mm/|wc -l
22
$ grep -r again: block/|wc -l
4

But, just because others have done it doesn't mean I should.  So, I
will attempt to explain why I think this is a good idea.
If I were to write this as a for-loop, it will look like this:

for (i = 0; i == 0 || (run_again && i < 2); i++) {
/* another level of 8 character wide indentation */
    run_again = false;
   /* a bunch of code that possibly sets run_again to true
}

if (run_again)
    blk_mq_run_hw_queue(hctx, true);

[Another alternative is to set run_again to true, and simplify the for-loop
condition to run_again && i < 2.  But, again, lots of verbiage and a boolean
in the for-loop condition.]

The for-loop is far from idiomatic.  It's not clear what it does when
you first look at it.
It distracts from the common path of the code, which is something that
almost always
runs exactly once.  There is now an additional level of indentation.
The readers of the
code aren't any better off, because they still have to figure out what
run_again is and if
they care about it.  And the only way to do that is to read the entire
body of the loop, and
comments at the top of the functions.

The goto in this case preserves the intent of the code better.  It is
dealing with an exceptional
and unusual case.  Indeed this kind of use is not unusual in the
kernel, for instance to deal
with possible but unlikely races.

Just my $0.02.

> Thanks,
>
> Bart.
Bart Van Assche Feb. 7, 2020, 8:19 p.m. UTC | #4
On 2/7/20 10:45 AM, Salman Qazi wrote:
> If I were to write this as a for-loop, it will look like this:
> 
> for (i = 0; i == 0 || (run_again && i < 2); i++) {
> /* another level of 8 character wide indentation */
>      run_again = false;
>     /* a bunch of code that possibly sets run_again to true
> }
> 
> if (run_again)
>      blk_mq_run_hw_queue(hctx, true);

That's not what I meant. What I meant is a loop that iterates at most 
two times and also to break out of the loop if run_again == false.

BTW, I share your concern about the additional indentation by eight 
positions. How about avoiding deeper indentation by introducing a new 
function?

Thanks,

Bart.
Salman Qazi Feb. 7, 2020, 8:37 p.m. UTC | #5
On Fri, Feb 7, 2020 at 12:19 PM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 2/7/20 10:45 AM, Salman Qazi wrote:
> > If I were to write this as a for-loop, it will look like this:
> >
> > for (i = 0; i == 0 || (run_again && i < 2); i++) {
> > /* another level of 8 character wide indentation */
> >      run_again = false;
> >     /* a bunch of code that possibly sets run_again to true
> > }
> >
> > if (run_again)
> >      blk_mq_run_hw_queue(hctx, true);
>
> That's not what I meant. What I meant is a loop that iterates at most
> two times and also to break out of the loop if run_again == false.
>

I picked the most compact variant to demonstrate the problem.  Adding
breaks isn't
really helping the readability.

for (i = 0; i < 2; i++) {
  run_again = false;
/* bunch of code that possibly sets it to true */
...
 if (!run_again)
    break;
}
if (run_again)
    blk_mq_run_hw_queue(hctx, true);

When I read this, I initially assume that the loop in general runs
twice and that this is the common case.  It has the
same problem with conveying intent.  Perhaps, more importantly, the
point of using programming constructs is to shorten and simplify the
code.
There are still two if-statements in addition to the loop. We haven't
gained much by introducing the loop.

> BTW, I share your concern about the additional indentation by eight
> positions. How about avoiding deeper indentation by introducing a new
> function?

If there was a benefit to introducing the loop, this would be a good
call.  But the way I see it, the introduction of another
function is yet another way in which the introduction of the loop
makes the code less readable.

This is not a hill I want to die on.  If the maintainer agrees with
you on this point, I will use a loop.
>
> Thanks,
>
> Bart.
Doug Anderson April 20, 2020, 4:42 p.m. UTC | #6
Hi,

On Fri, Feb 7, 2020 at 12:38 PM Salman Qazi <sqazi@google.com> wrote:
>
> On Fri, Feb 7, 2020 at 12:19 PM Bart Van Assche <bvanassche@acm.org> wrote:
> >
> > On 2/7/20 10:45 AM, Salman Qazi wrote:
> > > If I were to write this as a for-loop, it will look like this:
> > >
> > > for (i = 0; i == 0 || (run_again && i < 2); i++) {
> > > /* another level of 8 character wide indentation */
> > >      run_again = false;
> > >     /* a bunch of code that possibly sets run_again to true
> > > }
> > >
> > > if (run_again)
> > >      blk_mq_run_hw_queue(hctx, true);
> >
> > That's not what I meant. What I meant is a loop that iterates at most
> > two times and also to break out of the loop if run_again == false.
> >
>
> I picked the most compact variant to demonstrate the problem.  Adding
> breaks isn't
> really helping the readability.
>
> for (i = 0; i < 2; i++) {
>   run_again = false;
> /* bunch of code that possibly sets it to true */
> ...
>  if (!run_again)
>     break;
> }
> if (run_again)
>     blk_mq_run_hw_queue(hctx, true);
>
> When I read this, I initially assume that the loop in general runs
> twice and that this is the common case.  It has the
> same problem with conveying intent.  Perhaps, more importantly, the
> point of using programming constructs is to shorten and simplify the
> code.
> There are still two if-statements in addition to the loop. We haven't
> gained much by introducing the loop.
>
> > BTW, I share your concern about the additional indentation by eight
> > positions. How about avoiding deeper indentation by introducing a new
> > function?
>
> If there was a benefit to introducing the loop, this would be a good
> call.  But the way I see it, the introduction of another
> function is yet another way in which the introduction of the loop
> makes the code less readable.
>
> This is not a hill I want to die on.  If the maintainer agrees with
> you on this point, I will use a loop.

I haven't done a massive amount of analysis of this patch, but since I
noticed it while debugging my own block series I've been keeping track
of it.  Is there any status update here?  We've been carrying this
"FROMLIST" in our Chrome OS trees for a little while, but that's not a
state we want to be in long-term.  If it needs to spin before landing
upstream we should get the spin out and land it.  If it's OK as-is
then it'd be nice to see it in mainline.

From the above I guess Salman was waiting for Jens to weigh in with an
opinion on the prefered bike shed color?

-Doug
Jesse Barnes April 23, 2020, 8:13 p.m. UTC | #7
Jens, Bart, Ming, any update here?  Or is this already applied (I didn't check)?

Thanks,
Jesse


On Mon, Apr 20, 2020 at 9:43 AM Doug Anderson <dianders@chromium.org> wrote:
>
> Hi,
>
> On Fri, Feb 7, 2020 at 12:38 PM Salman Qazi <sqazi@google.com> wrote:
> >
> > On Fri, Feb 7, 2020 at 12:19 PM Bart Van Assche <bvanassche@acm.org> wrote:
> > >
> > > On 2/7/20 10:45 AM, Salman Qazi wrote:
> > > > If I were to write this as a for-loop, it will look like this:
> > > >
> > > > for (i = 0; i == 0 || (run_again && i < 2); i++) {
> > > > /* another level of 8 character wide indentation */
> > > >      run_again = false;
> > > >     /* a bunch of code that possibly sets run_again to true
> > > > }
> > > >
> > > > if (run_again)
> > > >      blk_mq_run_hw_queue(hctx, true);
> > >
> > > That's not what I meant. What I meant is a loop that iterates at most
> > > two times and also to break out of the loop if run_again == false.
> > >
> >
> > I picked the most compact variant to demonstrate the problem.  Adding
> > breaks isn't
> > really helping the readability.
> >
> > for (i = 0; i < 2; i++) {
> >   run_again = false;
> > /* bunch of code that possibly sets it to true */
> > ...
> >  if (!run_again)
> >     break;
> > }
> > if (run_again)
> >     blk_mq_run_hw_queue(hctx, true);
> >
> > When I read this, I initially assume that the loop in general runs
> > twice and that this is the common case.  It has the
> > same problem with conveying intent.  Perhaps, more importantly, the
> > point of using programming constructs is to shorten and simplify the
> > code.
> > There are still two if-statements in addition to the loop. We haven't
> > gained much by introducing the loop.
> >
> > > BTW, I share your concern about the additional indentation by eight
> > > positions. How about avoiding deeper indentation by introducing a new
> > > function?
> >
> > If there was a benefit to introducing the loop, this would be a good
> > call.  But the way I see it, the introduction of another
> > function is yet another way in which the introduction of the loop
> > makes the code less readable.
> >
> > This is not a hill I want to die on.  If the maintainer agrees with
> > you on this point, I will use a loop.
>
> I haven't done a massive amount of analysis of this patch, but since I
> noticed it while debugging my own block series I've been keeping track
> of it.  Is there any status update here?  We've been carrying this
> "FROMLIST" in our Chrome OS trees for a little while, but that's not a
> state we want to be in long-term.  If it needs to spin before landing
> upstream we should get the spin out and land it.  If it's OK as-is
> then it'd be nice to see it in mainline.
>
> From the above I guess Salman was waiting for Jens to weigh in with an
> opinion on the prefered bike shed color?
>
> -Doug
Jens Axboe April 23, 2020, 8:34 p.m. UTC | #8
On 4/23/20 2:13 PM, Jesse Barnes wrote:
> Jens, Bart, Ming, any update here?  Or is this already applied (I didn't check)?

No updates on my end. I was expecting that a v2 would be posted after
all the discussion on it, but that doesn't seem to be the case.
Salman Qazi April 23, 2020, 8:40 p.m. UTC | #9
I had posted a version on Feb 7th, but I guess I neglected to
explicitly label it as v2.  I am sorry I am not well-acquainted with
the processes here.

On Thu, Apr 23, 2020 at 1:34 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 4/23/20 2:13 PM, Jesse Barnes wrote:
> > Jens, Bart, Ming, any update here?  Or is this already applied (I didn't check)?
>
> No updates on my end. I was expecting that a v2 would be posted after
> all the discussion on it, but that doesn't seem to be the case.
>
> --
> Jens Axboe
>
diff mbox series

Patch

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..84dde147f646 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -84,12 +84,16 @@  void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns true if hctx->dispatch was found non-empty and
+ * run_work has to be run again.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	bool ret = false;
 
 	do {
 		struct request *rq;
@@ -97,6 +101,11 @@  static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = true;
+			break;
+		}
+
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -113,6 +122,8 @@  static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 */
 		list_add(&rq->queuelist, &rq_list);
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	return ret;
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -130,16 +141,25 @@  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns true if hctx->dispatch was found non-empty and
+ * run_work has to be run again.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	bool ret = false;
 
 	do {
 		struct request *rq;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = true;
+			break;
+		}
+
 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;
 
@@ -165,6 +185,7 @@  static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
+	return ret;
 }
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -172,6 +193,8 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+	bool run_again;
+	bool restarted = false;
 	LIST_HEAD(rq_list);
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -180,6 +203,9 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 
 	hctx->run++;
 
+again:
+	run_again = false;
+
 	/*
 	 * If we have previous entries on our dispatch list, grab them first for
 	 * more fair dispatch.
@@ -208,19 +234,28 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		blk_mq_sched_mark_restart_hctx(hctx);
 		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
 			if (has_sched_dispatch)
-				blk_mq_do_dispatch_sched(hctx);
+				run_again = blk_mq_do_dispatch_sched(hctx);
 			else
-				blk_mq_do_dispatch_ctx(hctx);
+				run_again = blk_mq_do_dispatch_ctx(hctx);
 		}
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		run_again = blk_mq_do_dispatch_sched(hctx);
 	} else if (hctx->dispatch_busy) {
 		/* dequeue request one by one from sw queue if queue is busy */
-		blk_mq_do_dispatch_ctx(hctx);
+		run_again = blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list, false);
 	}
+
+	if (run_again) {
+		if (!restarted) {
+			restarted = true;
+			goto again;
+		}
+
+		blk_mq_run_hw_queue(hctx, true);
+	}
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,