diff mbox series

[RESEND,RFC,2/8] block: Allow sending a batch of requests from the scheduler to hardware

Message ID c2e62e5a9942fb833dfc0cdc8c967a12f3c34b03.1584350380.git.baolin.wang7@gmail.com (mailing list archive)
State New, archived
Headers show
Series Add MMC packed request support | expand

Commit Message

Baolin Wang March 16, 2020, 10:01 a.m. UTC
As we know, some SD/MMC host controllers can support packed request,
that means we can package several requests to host controller at one
time to improve performence. So the hardware driver expects the blk-mq
can dispatch a batch of requests at one time, and driver can use bd.last
to indicate if it is the last request in the batch to help to combine
requests as much as possible.

Thus we should add batch requests setting from the block driver to tell
the scheduler how many requests can be dispatched in a batch, as well
as changing the scheduler to dispatch more than one request if setting
the maximum batch requests number.

Signed-off-by: Baolin Wang <baolin.wang7@gmail.com>
---
 block/bfq-iosched.c    | 32 ++++++++++++++--------
 block/blk-mq.c         |  2 --
 block/blk-settings.c   | 13 +++++++++
 block/kyber-iosched.c  | 74 +++++++++++++++++++++++++++-----------------------
 block/mq-deadline.c    | 20 ++++++++++----
 include/linux/blkdev.h |  8 ++++++
 6 files changed, 95 insertions(+), 54 deletions(-)

Comments

Ming Lei March 18, 2020, 10:01 a.m. UTC | #1
On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> As we know, some SD/MMC host controllers can support packed request,
> that means we can package several requests to host controller at one
> time to improve performence. So the hardware driver expects the blk-mq
> can dispatch a batch of requests at one time, and driver can use bd.last
> to indicate if it is the last request in the batch to help to combine
> requests as much as possible.
> 
> Thus we should add batch requests setting from the block driver to tell
> the scheduler how many requests can be dispatched in a batch, as well
> as changing the scheduler to dispatch more than one request if setting
> the maximum batch requests number.
> 

I feel this batch dispatch style is more complicated, and some other
drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
.queue_rq().

So what about the following way by extending .commit_rqs() to this usage?
And you can do whatever batch processing in .commit_rqs() which will be
guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 856356b1619e..cd2bbe56f83f 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	bool ret = false;
 
 	do {
 		struct request *rq;
@@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 * in blk_mq_dispatch_rq_list().
 		 */
 		list_add(&rq->queuelist, &rq_list);
-	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+		ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
+	} while (ret);
+
+	return ret;
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	bool ret = false;
 
 	do {
 		struct request *rq;
@@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 
 		/* round robin for fair dispatch */
 		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
-
-	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+		ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
+	} while (ret);
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
+
+	return ret;
 }
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
 	LIST_HEAD(rq_list);
+	bool dispatch_ret;
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
 	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+		dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
+		if (dispatch_ret) {
 			if (has_sched_dispatch)
-				blk_mq_do_dispatch_sched(hctx);
+				dispatch_ret = blk_mq_do_dispatch_sched(hctx);
 			else
-				blk_mq_do_dispatch_ctx(hctx);
+				dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
 		}
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		dispatch_ret = blk_mq_do_dispatch_sched(hctx);
 	} else if (hctx->dispatch_busy) {
 		/* dequeue request one by one from sw queue if queue is busy */
-		blk_mq_do_dispatch_ctx(hctx);
+		dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list, false);
+		dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
+	}
+
+	if (dispatch_ret) {
+		if (hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)
+			hctx->queue->mq_ops->commit_rqs(hctx);
 	}
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 87c6699f35ae..9b46f5d6c7fd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1238,11 +1238,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		 * Flag last if we have no more requests, or if we have more
 		 * but can't assign a driver tag to it.
 		 */
-		if (list_empty(list))
-			bd.last = true;
-		else {
-			nxt = list_first_entry(list, struct request, queuelist);
-			bd.last = !blk_mq_get_driver_tag(nxt);
+		if (!(hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)) {
+			if (list_empty(list))
+				bd.last = true;
+			else {
+				nxt = list_first_entry(list, struct request, queuelist);
+				bd.last = !blk_mq_get_driver_tag(nxt);
+			}
+		} else {
+			bd.last = false;
 		}
 
 		ret = q->mq_ops->queue_rq(hctx, &bd);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 07fa767bff86..c0ef6990b698 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -394,6 +394,7 @@ enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_TAG_SHARED	= 1 << 1,
 	BLK_MQ_F_NO_MANAGED_IRQ	= 1 << 2,
+	BLK_MQ_F_FORCE_COMMIT_RQS = 1 << 3,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,

Thanks, 
Ming
Baolin Wang March 18, 2020, 10:26 a.m. UTC | #2
Hi Ming,

On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > As we know, some SD/MMC host controllers can support packed request,
> > that means we can package several requests to host controller at one
> > time to improve performence. So the hardware driver expects the blk-mq
> > can dispatch a batch of requests at one time, and driver can use bd.last
> > to indicate if it is the last request in the batch to help to combine
> > requests as much as possible.
> >
> > Thus we should add batch requests setting from the block driver to tell
> > the scheduler how many requests can be dispatched in a batch, as well
> > as changing the scheduler to dispatch more than one request if setting
> > the maximum batch requests number.
> >
>
> I feel this batch dispatch style is more complicated, and some other
> drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> .queue_rq().
>
> So what about the following way by extending .commit_rqs() to this usage?
> And you can do whatever batch processing in .commit_rqs() which will be
> guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.

I'm very appreciated for your good suggestion, which is much simpler than mine.
It seems to solve my problem, and I will try it on my platform to see
if it can work and give you the feadback. Thanks again.

> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 856356b1619e..cd2bbe56f83f 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
>   * its queue by itself in its completion handler, so we don't need to
>   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
>   */
> -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>  {
>         struct request_queue *q = hctx->queue;
>         struct elevator_queue *e = q->elevator;
>         LIST_HEAD(rq_list);
> +       bool ret = false;
>
>         do {
>                 struct request *rq;
> @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
>                  * in blk_mq_dispatch_rq_list().
>                  */
>                 list_add(&rq->queuelist, &rq_list);
> -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> +       } while (ret);
> +
> +       return ret;
>  }
>
>  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
>   * its queue by itself in its completion handler, so we don't need to
>   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
>   */
> -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
>  {
>         struct request_queue *q = hctx->queue;
>         LIST_HEAD(rq_list);
>         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> +       bool ret = false;
>
>         do {
>                 struct request *rq;
> @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
>
>                 /* round robin for fair dispatch */
>                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> -
> -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> +       } while (ret);
>
>         WRITE_ONCE(hctx->dispatch_from, ctx);
> +
> +       return ret;
>  }
>
>  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>         struct elevator_queue *e = q->elevator;
>         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
>         LIST_HEAD(rq_list);
> +       bool dispatch_ret;
>
>         /* RCU or SRCU read lock is needed before checking quiesced flag */
>         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>          */
>         if (!list_empty(&rq_list)) {
>                 blk_mq_sched_mark_restart_hctx(hctx);
> -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> +               if (dispatch_ret) {
>                         if (has_sched_dispatch)
> -                               blk_mq_do_dispatch_sched(hctx);
> +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
>                         else
> -                               blk_mq_do_dispatch_ctx(hctx);
> +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
>                 }
>         } else if (has_sched_dispatch) {
> -               blk_mq_do_dispatch_sched(hctx);
> +               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
>         } else if (hctx->dispatch_busy) {
>                 /* dequeue request one by one from sw queue if queue is busy */
> -               blk_mq_do_dispatch_ctx(hctx);
> +               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
>         } else {
>                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
> -               blk_mq_dispatch_rq_list(q, &rq_list, false);
> +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> +       }
> +
> +       if (dispatch_ret) {
> +               if (hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)
> +                       hctx->queue->mq_ops->commit_rqs(hctx);
>         }
>  }
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 87c6699f35ae..9b46f5d6c7fd 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1238,11 +1238,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
>                  * Flag last if we have no more requests, or if we have more
>                  * but can't assign a driver tag to it.
>                  */
> -               if (list_empty(list))
> -                       bd.last = true;
> -               else {
> -                       nxt = list_first_entry(list, struct request, queuelist);
> -                       bd.last = !blk_mq_get_driver_tag(nxt);
> +               if (!(hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)) {
> +                       if (list_empty(list))
> +                               bd.last = true;
> +                       else {
> +                               nxt = list_first_entry(list, struct request, queuelist);
> +                               bd.last = !blk_mq_get_driver_tag(nxt);
> +                       }
> +               } else {
> +                       bd.last = false;
>                 }
>
>                 ret = q->mq_ops->queue_rq(hctx, &bd);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 07fa767bff86..c0ef6990b698 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -394,6 +394,7 @@ enum {
>         BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
>         BLK_MQ_F_TAG_SHARED     = 1 << 1,
>         BLK_MQ_F_NO_MANAGED_IRQ = 1 << 2,
> +       BLK_MQ_F_FORCE_COMMIT_RQS = 1 << 3,
>         BLK_MQ_F_BLOCKING       = 1 << 5,
>         BLK_MQ_F_NO_SCHED       = 1 << 6,
>         BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
>
> Thanks,
> Ming
>
Baolin Wang March 20, 2020, 10:27 a.m. UTC | #3
Hi Ming,

On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
>
> Hi Ming,
>
> On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > As we know, some SD/MMC host controllers can support packed request,
> > > that means we can package several requests to host controller at one
> > > time to improve performence. So the hardware driver expects the blk-mq
> > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > to indicate if it is the last request in the batch to help to combine
> > > requests as much as possible.
> > >
> > > Thus we should add batch requests setting from the block driver to tell
> > > the scheduler how many requests can be dispatched in a batch, as well
> > > as changing the scheduler to dispatch more than one request if setting
> > > the maximum batch requests number.
> > >
> >
> > I feel this batch dispatch style is more complicated, and some other
> > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > .queue_rq().
> >
> > So what about the following way by extending .commit_rqs() to this usage?
> > And you can do whatever batch processing in .commit_rqs() which will be
> > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
>
> I'm very appreciated for your good suggestion, which is much simpler than mine.
> It seems to solve my problem, and I will try it on my platform to see
> if it can work and give you the feadback. Thanks again.

I tried your approach on my platform, but met some problems, see below.

>
> > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > index 856356b1619e..cd2bbe56f83f 100644
> > --- a/block/blk-mq-sched.c
> > +++ b/block/blk-mq-sched.c
> > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> >   * its queue by itself in its completion handler, so we don't need to
> >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> >   */
> > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> >  {
> >         struct request_queue *q = hctx->queue;
> >         struct elevator_queue *e = q->elevator;
> >         LIST_HEAD(rq_list);
> > +       bool ret = false;
> >
> >         do {
> >                 struct request *rq;
> > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> >                  * in blk_mq_dispatch_rq_list().
> >                  */
> >                 list_add(&rq->queuelist, &rq_list);
> > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > +       } while (ret);
> > +
> > +       return ret;
> >  }
> >
> >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> >   * its queue by itself in its completion handler, so we don't need to
> >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> >   */
> > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> >  {
> >         struct request_queue *q = hctx->queue;
> >         LIST_HEAD(rq_list);
> >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > +       bool ret = false;
> >
> >         do {
> >                 struct request *rq;
> > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> >
> >                 /* round robin for fair dispatch */
> >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > -
> > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > +       } while (ret);
> >
> >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > +
> > +       return ret;
> >  }
> >
> >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> >         struct elevator_queue *e = q->elevator;
> >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> >         LIST_HEAD(rq_list);
> > +       bool dispatch_ret;
> >
> >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> >          */
> >         if (!list_empty(&rq_list)) {
> >                 blk_mq_sched_mark_restart_hctx(hctx);
> > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > +               if (dispatch_ret) {
> >                         if (has_sched_dispatch)
> > -                               blk_mq_do_dispatch_sched(hctx);
> > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);

If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
and got dispatch_ret = true now. Then we will try to dispatch more
reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
more requests in scheduler, then we will got dispatch_ret = false. In
this case, we will not issue commit_rqs() to tell the hardware to
handle previous request dispatched from &rq_list.

So I think we should not overlap the 'dispatch_ret'? Or do you have
any other thoughts to fix?

> >                         else
> > -                               blk_mq_do_dispatch_ctx(hctx);
> > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> >                 }
> >         } else if (has_sched_dispatch) {
> > -               blk_mq_do_dispatch_sched(hctx);
> > +               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> >         } else if (hctx->dispatch_busy) {
> >                 /* dequeue request one by one from sw queue if queue is busy */
> > -               blk_mq_do_dispatch_ctx(hctx);
> > +               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> >         } else {
> >                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
> > -               blk_mq_dispatch_rq_list(q, &rq_list, false);
> > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > +       }
> > +
> > +       if (dispatch_ret) {
> > +               if (hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)
> > +                       hctx->queue->mq_ops->commit_rqs(hctx);
> >         }
> >  }
> >
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 87c6699f35ae..9b46f5d6c7fd 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -1238,11 +1238,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> >                  * Flag last if we have no more requests, or if we have more
> >                  * but can't assign a driver tag to it.
> >                  */
> > -               if (list_empty(list))
> > -                       bd.last = true;
> > -               else {
> > -                       nxt = list_first_entry(list, struct request, queuelist);
> > -                       bd.last = !blk_mq_get_driver_tag(nxt);
> > +               if (!(hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)) {
> > +                       if (list_empty(list))
> > +                               bd.last = true;
> > +                       else {
> > +                               nxt = list_first_entry(list, struct request, queuelist);
> > +                               bd.last = !blk_mq_get_driver_tag(nxt);
> > +                       }
> > +               } else {
> > +                       bd.last = false;

If we enabled BLK_MQ_F_FORCE_COMMIT_RQS flag, we will always get
bd.last = false even for the real last request in the IO scheduler. I
know you already use commit_irqs() to help to kick driver. But I
worried if it is reasonable that drivers always get bd.last = false.

Thanks for your approach.

> >                 }
> >
> >                 ret = q->mq_ops->queue_rq(hctx, &bd);
> > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> > index 07fa767bff86..c0ef6990b698 100644
> > --- a/include/linux/blk-mq.h
> > +++ b/include/linux/blk-mq.h
> > @@ -394,6 +394,7 @@ enum {
> >         BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
> >         BLK_MQ_F_TAG_SHARED     = 1 << 1,
> >         BLK_MQ_F_NO_MANAGED_IRQ = 1 << 2,
> > +       BLK_MQ_F_FORCE_COMMIT_RQS = 1 << 3,
> >         BLK_MQ_F_BLOCKING       = 1 << 5,
> >         BLK_MQ_F_NO_SCHED       = 1 << 6,
> >         BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
> >
> > Thanks,
> > Ming
> >
>
>
> --
> Baolin Wang
Ming Lei March 23, 2020, 3:44 a.m. UTC | #4
On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> Hi Ming,
> 
> On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> >
> > Hi Ming,
> >
> > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > As we know, some SD/MMC host controllers can support packed request,
> > > > that means we can package several requests to host controller at one
> > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > to indicate if it is the last request in the batch to help to combine
> > > > requests as much as possible.
> > > >
> > > > Thus we should add batch requests setting from the block driver to tell
> > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > as changing the scheduler to dispatch more than one request if setting
> > > > the maximum batch requests number.
> > > >
> > >
> > > I feel this batch dispatch style is more complicated, and some other
> > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > .queue_rq().
> > >
> > > So what about the following way by extending .commit_rqs() to this usage?
> > > And you can do whatever batch processing in .commit_rqs() which will be
> > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> >
> > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > It seems to solve my problem, and I will try it on my platform to see
> > if it can work and give you the feadback. Thanks again.
> 
> I tried your approach on my platform, but met some problems, see below.
> 
> >
> > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > index 856356b1619e..cd2bbe56f83f 100644
> > > --- a/block/blk-mq-sched.c
> > > +++ b/block/blk-mq-sched.c
> > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > >   * its queue by itself in its completion handler, so we don't need to
> > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > >   */
> > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > >  {
> > >         struct request_queue *q = hctx->queue;
> > >         struct elevator_queue *e = q->elevator;
> > >         LIST_HEAD(rq_list);
> > > +       bool ret = false;
> > >
> > >         do {
> > >                 struct request *rq;
> > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > >                  * in blk_mq_dispatch_rq_list().
> > >                  */
> > >                 list_add(&rq->queuelist, &rq_list);
> > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > +       } while (ret);
> > > +
> > > +       return ret;
> > >  }
> > >
> > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > >   * its queue by itself in its completion handler, so we don't need to
> > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > >   */
> > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > >  {
> > >         struct request_queue *q = hctx->queue;
> > >         LIST_HEAD(rq_list);
> > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > +       bool ret = false;
> > >
> > >         do {
> > >                 struct request *rq;
> > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > >
> > >                 /* round robin for fair dispatch */
> > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > -
> > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > +       } while (ret);
> > >
> > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > +
> > > +       return ret;
> > >  }
> > >
> > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > >         struct elevator_queue *e = q->elevator;
> > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > >         LIST_HEAD(rq_list);
> > > +       bool dispatch_ret;
> > >
> > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > >          */
> > >         if (!list_empty(&rq_list)) {
> > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > +               if (dispatch_ret) {
> > >                         if (has_sched_dispatch)
> > > -                               blk_mq_do_dispatch_sched(hctx);
> > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> 
> If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> and got dispatch_ret = true now. Then we will try to dispatch more
> reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> more requests in scheduler, then we will got dispatch_ret = false. In

'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
When any one request has been dispatched successfully, 'dispatch_ret'
is true. New request is always added to list before calling
blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
false, it means that .commit_rqs() has been called.

> this case, we will not issue commit_rqs() to tell the hardware to
> handle previous request dispatched from &rq_list.
> 
> So I think we should not overlap the 'dispatch_ret'? Or do you have
> any other thoughts to fix?
> 
> > >                         else
> > > -                               blk_mq_do_dispatch_ctx(hctx);
> > > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > >                 }
> > >         } else if (has_sched_dispatch) {
> > > -               blk_mq_do_dispatch_sched(hctx);
> > > +               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > >         } else if (hctx->dispatch_busy) {
> > >                 /* dequeue request one by one from sw queue if queue is busy */
> > > -               blk_mq_do_dispatch_ctx(hctx);
> > > +               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > >         } else {
> > >                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
> > > -               blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > +       }
> > > +
> > > +       if (dispatch_ret) {
> > > +               if (hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)
> > > +                       hctx->queue->mq_ops->commit_rqs(hctx);
> > >         }
> > >  }
> > >
> > > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > > index 87c6699f35ae..9b46f5d6c7fd 100644
> > > --- a/block/blk-mq.c
> > > +++ b/block/blk-mq.c
> > > @@ -1238,11 +1238,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> > >                  * Flag last if we have no more requests, or if we have more
> > >                  * but can't assign a driver tag to it.
> > >                  */
> > > -               if (list_empty(list))
> > > -                       bd.last = true;
> > > -               else {
> > > -                       nxt = list_first_entry(list, struct request, queuelist);
> > > -                       bd.last = !blk_mq_get_driver_tag(nxt);
> > > +               if (!(hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)) {
> > > +                       if (list_empty(list))
> > > +                               bd.last = true;
> > > +                       else {
> > > +                               nxt = list_first_entry(list, struct request, queuelist);
> > > +                               bd.last = !blk_mq_get_driver_tag(nxt);
> > > +                       }
> > > +               } else {
> > > +                       bd.last = false;
> 
> If we enabled BLK_MQ_F_FORCE_COMMIT_RQS flag, we will always get
> bd.last = false even for the real last request in the IO scheduler. I
> know you already use commit_irqs() to help to kick driver. But I
> worried if it is reasonable that drivers always get bd.last = false.
> 

BLK_MQ_F_FORCE_COMMIT_RQS means the .last flag is ignored, and we can
document this usage.


Thanks, 
Ming
Baolin Wang March 23, 2020, 5:36 a.m. UTC | #5
On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > Hi Ming,
> >
> > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > >
> > > Hi Ming,
> > >
> > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > that means we can package several requests to host controller at one
> > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > to indicate if it is the last request in the batch to help to combine
> > > > > requests as much as possible.
> > > > >
> > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > the maximum batch requests number.
> > > > >
> > > >
> > > > I feel this batch dispatch style is more complicated, and some other
> > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > .queue_rq().
> > > >
> > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > >
> > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > It seems to solve my problem, and I will try it on my platform to see
> > > if it can work and give you the feadback. Thanks again.
> >
> > I tried your approach on my platform, but met some problems, see below.
> >
> > >
> > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > --- a/block/blk-mq-sched.c
> > > > +++ b/block/blk-mq-sched.c
> > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > >   * its queue by itself in its completion handler, so we don't need to
> > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > >   */
> > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > >  {
> > > >         struct request_queue *q = hctx->queue;
> > > >         struct elevator_queue *e = q->elevator;
> > > >         LIST_HEAD(rq_list);
> > > > +       bool ret = false;
> > > >
> > > >         do {
> > > >                 struct request *rq;
> > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > >                  * in blk_mq_dispatch_rq_list().
> > > >                  */
> > > >                 list_add(&rq->queuelist, &rq_list);
> > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > +       } while (ret);
> > > > +
> > > > +       return ret;
> > > >  }
> > > >
> > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > >   * its queue by itself in its completion handler, so we don't need to
> > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > >   */
> > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > >  {
> > > >         struct request_queue *q = hctx->queue;
> > > >         LIST_HEAD(rq_list);
> > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > +       bool ret = false;
> > > >
> > > >         do {
> > > >                 struct request *rq;
> > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > >
> > > >                 /* round robin for fair dispatch */
> > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > -
> > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > +       } while (ret);
> > > >
> > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > +
> > > > +       return ret;
> > > >  }
> > > >
> > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > >         struct elevator_queue *e = q->elevator;
> > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > >         LIST_HEAD(rq_list);
> > > > +       bool dispatch_ret;
> > > >
> > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > >          */
> > > >         if (!list_empty(&rq_list)) {
> > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > +               if (dispatch_ret) {
> > > >                         if (has_sched_dispatch)
> > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> >
> > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > and got dispatch_ret = true now. Then we will try to dispatch more
> > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > more requests in scheduler, then we will got dispatch_ret = false. In
>
> 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> When any one request has been dispatched successfully, 'dispatch_ret'
> is true. New request is always added to list before calling
> blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> false, it means that .commit_rqs() has been called.

Not really, if no requests int the IO cheduler, we will break the loop
in blk_mq_do_dispatch_sched() and return false without calling
.commit_rqs().

So in this case, blk_mq_do_dispatch_sched() will return 'false', which
overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
and did not call .commit_rqs(). Then the IO processing will be stuck.

static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        LIST_HEAD(rq_list);
        bool ret = false;

       do {
              struct request *rq;

              if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
                     break;

              .......
       } while (ret);

       return ret;
}

>
> > this case, we will not issue commit_rqs() to tell the hardware to
> > handle previous request dispatched from &rq_list.
> >
> > So I think we should not overlap the 'dispatch_ret'? Or do you have
> > any other thoughts to fix?
> >
> > > >                         else
> > > > -                               blk_mq_do_dispatch_ctx(hctx);
> > > > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > > >                 }
> > > >         } else if (has_sched_dispatch) {
> > > > -               blk_mq_do_dispatch_sched(hctx);
> > > > +               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > >         } else if (hctx->dispatch_busy) {
> > > >                 /* dequeue request one by one from sw queue if queue is busy */
> > > > -               blk_mq_do_dispatch_ctx(hctx);
> > > > +               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > > >         } else {
> > > >                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
> > > > -               blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > +       }
> > > > +
> > > > +       if (dispatch_ret) {
> > > > +               if (hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)
> > > > +                       hctx->queue->mq_ops->commit_rqs(hctx);
> > > >         }
> > > >  }
> > > >
> > > > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > > > index 87c6699f35ae..9b46f5d6c7fd 100644
> > > > --- a/block/blk-mq.c
> > > > +++ b/block/blk-mq.c
> > > > @@ -1238,11 +1238,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> > > >                  * Flag last if we have no more requests, or if we have more
> > > >                  * but can't assign a driver tag to it.
> > > >                  */
> > > > -               if (list_empty(list))
> > > > -                       bd.last = true;
> > > > -               else {
> > > > -                       nxt = list_first_entry(list, struct request, queuelist);
> > > > -                       bd.last = !blk_mq_get_driver_tag(nxt);
> > > > +               if (!(hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)) {
> > > > +                       if (list_empty(list))
> > > > +                               bd.last = true;
> > > > +                       else {
> > > > +                               nxt = list_first_entry(list, struct request, queuelist);
> > > > +                               bd.last = !blk_mq_get_driver_tag(nxt);
> > > > +                       }
> > > > +               } else {
> > > > +                       bd.last = false;
> >
> > If we enabled BLK_MQ_F_FORCE_COMMIT_RQS flag, we will always get
> > bd.last = false even for the real last request in the IO scheduler. I
> > know you already use commit_irqs() to help to kick driver. But I
> > worried if it is reasonable that drivers always get bd.last = false.
> >
>
> BLK_MQ_F_FORCE_COMMIT_RQS means the .last flag is ignored, and we can
> document this usage.

OK. Thanks for your comments.
Ming Lei March 23, 2020, 7:26 a.m. UTC | #6
On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > Hi Ming,
> > >
> > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > >
> > > > Hi Ming,
> > > >
> > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > >
> > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > that means we can package several requests to host controller at one
> > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > requests as much as possible.
> > > > > >
> > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > the maximum batch requests number.
> > > > > >
> > > > >
> > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > .queue_rq().
> > > > >
> > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > >
> > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > It seems to solve my problem, and I will try it on my platform to see
> > > > if it can work and give you the feadback. Thanks again.
> > >
> > > I tried your approach on my platform, but met some problems, see below.
> > >
> > > >
> > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > --- a/block/blk-mq-sched.c
> > > > > +++ b/block/blk-mq-sched.c
> > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > >   */
> > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > >  {
> > > > >         struct request_queue *q = hctx->queue;
> > > > >         struct elevator_queue *e = q->elevator;
> > > > >         LIST_HEAD(rq_list);
> > > > > +       bool ret = false;
> > > > >
> > > > >         do {
> > > > >                 struct request *rq;
> > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > >                  * in blk_mq_dispatch_rq_list().
> > > > >                  */
> > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > +       } while (ret);
> > > > > +
> > > > > +       return ret;
> > > > >  }
> > > > >
> > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > >   */
> > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > >  {
> > > > >         struct request_queue *q = hctx->queue;
> > > > >         LIST_HEAD(rq_list);
> > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > +       bool ret = false;
> > > > >
> > > > >         do {
> > > > >                 struct request *rq;
> > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > >
> > > > >                 /* round robin for fair dispatch */
> > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > -
> > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > +       } while (ret);
> > > > >
> > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > +
> > > > > +       return ret;
> > > > >  }
> > > > >
> > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > >         struct elevator_queue *e = q->elevator;
> > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > >         LIST_HEAD(rq_list);
> > > > > +       bool dispatch_ret;
> > > > >
> > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > >          */
> > > > >         if (!list_empty(&rq_list)) {
> > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > +               if (dispatch_ret) {
> > > > >                         if (has_sched_dispatch)
> > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > >
> > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > more requests in scheduler, then we will got dispatch_ret = false. In
> >
> > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > When any one request has been dispatched successfully, 'dispatch_ret'
> > is true. New request is always added to list before calling
> > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > false, it means that .commit_rqs() has been called.
> 
> Not really, if no requests int the IO cheduler, we will break the loop
> in blk_mq_do_dispatch_sched() and return false without calling
> .commit_rqs().

If there isn't any request to dispatch, false is returned. Otherwise,
always return the return value of last 'blk_mq_dispatch_rq_list'.

> 
> So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> and did not call .commit_rqs(). Then the IO processing will be stuck.

See below.

> 
> static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> {
>         struct request_queue *q = hctx->queue;
>         struct elevator_queue *e = q->elevator;
>         LIST_HEAD(rq_list);
>         bool ret = false;

The above initialization is just done once.

> 
>        do {
>               struct request *rq;
> 
>               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
>                      break;
> 
>               .......
			    ret = blk_mq_dispatch_rq_list(q, list, ...);

list includes one request, so blk_mq_dispatch_rq_list() won't return
false in case of no request in list.

>        } while (ret);
> 
>        return ret;

'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
if at least one request is dispatched. So if it becomes false, the loop
breaks, that means .commit_rqs() has been called cause 'list' does
include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
still returned.

thanks, 
Ming
Baolin Wang March 23, 2020, 8:22 a.m. UTC | #7
On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > Hi Ming,
> > > >
> > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > >
> > > > > Hi Ming,
> > > > >
> > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > that means we can package several requests to host controller at one
> > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > requests as much as possible.
> > > > > > >
> > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > the maximum batch requests number.
> > > > > > >
> > > > > >
> > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > .queue_rq().
> > > > > >
> > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > >
> > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > if it can work and give you the feadback. Thanks again.
> > > >
> > > > I tried your approach on my platform, but met some problems, see below.
> > > >
> > > > >
> > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > --- a/block/blk-mq-sched.c
> > > > > > +++ b/block/blk-mq-sched.c
> > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > >   */
> > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > >  {
> > > > > >         struct request_queue *q = hctx->queue;
> > > > > >         struct elevator_queue *e = q->elevator;
> > > > > >         LIST_HEAD(rq_list);
> > > > > > +       bool ret = false;
> > > > > >
> > > > > >         do {
> > > > > >                 struct request *rq;
> > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > >                  */
> > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > +       } while (ret);
> > > > > > +
> > > > > > +       return ret;
> > > > > >  }
> > > > > >
> > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > >   */
> > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > >  {
> > > > > >         struct request_queue *q = hctx->queue;
> > > > > >         LIST_HEAD(rq_list);
> > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > +       bool ret = false;
> > > > > >
> > > > > >         do {
> > > > > >                 struct request *rq;
> > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > >
> > > > > >                 /* round robin for fair dispatch */
> > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > -
> > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > +       } while (ret);
> > > > > >
> > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > +
> > > > > > +       return ret;
> > > > > >  }
> > > > > >
> > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > >         struct elevator_queue *e = q->elevator;
> > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > >         LIST_HEAD(rq_list);
> > > > > > +       bool dispatch_ret;
> > > > > >
> > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > >          */
> > > > > >         if (!list_empty(&rq_list)) {
> > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > +               if (dispatch_ret) {
> > > > > >                         if (has_sched_dispatch)
> > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > >
> > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > >
> > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > is true. New request is always added to list before calling
> > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > false, it means that .commit_rqs() has been called.
> >
> > Not really, if no requests int the IO cheduler, we will break the loop
> > in blk_mq_do_dispatch_sched() and return false without calling
> > .commit_rqs().
>
> If there isn't any request to dispatch, false is returned. Otherwise,
> always return the return value of last 'blk_mq_dispatch_rq_list'.
>
> >
> > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > and did not call .commit_rqs(). Then the IO processing will be stuck.
>
> See below.
>
> >
> > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > {
> >         struct request_queue *q = hctx->queue;
> >         struct elevator_queue *e = q->elevator;
> >         LIST_HEAD(rq_list);
> >         bool ret = false;
>
> The above initialization is just done once.
>
> >
> >        do {
> >               struct request *rq;
> >
> >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> >                      break;
> >
> >               .......
>                             ret = blk_mq_dispatch_rq_list(q, list, ...);
>
> list includes one request, so blk_mq_dispatch_rq_list() won't return
> false in case of no request in list.
>
> >        } while (ret);
> >
> >        return ret;
>
> 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> if at least one request is dispatched. So if it becomes false, the loop
> breaks, that means .commit_rqs() has been called cause 'list' does
> include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> still returned.

Sorry for my confusing description, let me try again to describe the problem.
When I try to mount the block device, I got the IO stuck with your
patch, and I did some debugging. I found we missed calling
commit_rqs() for one case:

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
blk_mq_hw_ctx *hctx)
        struct elevator_queue *e = q->elevator;
        const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
        LIST_HEAD(rq_list);
+       bool dispatch_ret;

        /* RCU or SRCU read lock is needed before checking quiesced flag */
        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
blk_mq_hw_ctx *hctx)
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
-               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);

Suppose we dispatch one request to block driver, and return 'true' here.

+               if (dispatch_ret) {
                        if (has_sched_dispatch)
-                               blk_mq_do_dispatch_sched(hctx);
+                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);

Then we will continue to try to dispatch more requests from IO
scheduler, but if there are no requests in IO scheduler now, it will
return 'false' here, and set dispatch_ret as false.

                        else
-                               blk_mq_do_dispatch_ctx(hctx);
+                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
                }
        } else if (has_sched_dispatch) {
-               blk_mq_do_dispatch_sched(hctx);
+               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
        } else if (hctx->dispatch_busy) {
                /* dequeue request one by one from sw queue if queue is busy */
-               blk_mq_do_dispatch_ctx(hctx);
+               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
        } else {
                blk_mq_flush_busy_ctxs(hctx, &rq_list);
-               blk_mq_dispatch_rq_list(q, &rq_list, false);
+               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
+       }
+
+       if (dispatch_ret) {

So in this case, we will not call commit_rqs() to kick block driver
due to dispatch_ret == false, though we've dispatched one request to
block driver by blk_mq_dispatch_rq_list(), which will cause the IO
stuck.

+               if (hctx->flags & BLK_MQ_F_FORCE_COMMIT_RQS)
+                       hctx->queue->mq_ops->commit_rqs(hctx);
        }
 }
Ming Lei March 23, 2020, 8:28 a.m. UTC | #8
On Mon, Mar 23, 2020 at 04:22:38PM +0800, Baolin Wang wrote:
> On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > > Hi Ming,
> > > > >
> > > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > > >
> > > > > > Hi Ming,
> > > > > >
> > > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > > that means we can package several requests to host controller at one
> > > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > > requests as much as possible.
> > > > > > > >
> > > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > > the maximum batch requests number.
> > > > > > > >
> > > > > > >
> > > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > > .queue_rq().
> > > > > > >
> > > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > > >
> > > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > > if it can work and give you the feadback. Thanks again.
> > > > >
> > > > > I tried your approach on my platform, but met some problems, see below.
> > > > >
> > > > > >
> > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > > --- a/block/blk-mq-sched.c
> > > > > > > +++ b/block/blk-mq-sched.c
> > > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > >   */
> > > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > >  {
> > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > >         LIST_HEAD(rq_list);
> > > > > > > +       bool ret = false;
> > > > > > >
> > > > > > >         do {
> > > > > > >                 struct request *rq;
> > > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > > >                  */
> > > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > +       } while (ret);
> > > > > > > +
> > > > > > > +       return ret;
> > > > > > >  }
> > > > > > >
> > > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > >   */
> > > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > >  {
> > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > >         LIST_HEAD(rq_list);
> > > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > > +       bool ret = false;
> > > > > > >
> > > > > > >         do {
> > > > > > >                 struct request *rq;
> > > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > >
> > > > > > >                 /* round robin for fair dispatch */
> > > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > > -
> > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > +       } while (ret);
> > > > > > >
> > > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > > +
> > > > > > > +       return ret;
> > > > > > >  }
> > > > > > >
> > > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > >         LIST_HEAD(rq_list);
> > > > > > > +       bool dispatch_ret;
> > > > > > >
> > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > >          */
> > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > +               if (dispatch_ret) {
> > > > > > >                         if (has_sched_dispatch)
> > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > >
> > > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > > >
> > > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > > is true. New request is always added to list before calling
> > > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > > false, it means that .commit_rqs() has been called.
> > >
> > > Not really, if no requests int the IO cheduler, we will break the loop
> > > in blk_mq_do_dispatch_sched() and return false without calling
> > > .commit_rqs().
> >
> > If there isn't any request to dispatch, false is returned. Otherwise,
> > always return the return value of last 'blk_mq_dispatch_rq_list'.
> >
> > >
> > > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > > and did not call .commit_rqs(). Then the IO processing will be stuck.
> >
> > See below.
> >
> > >
> > > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > {
> > >         struct request_queue *q = hctx->queue;
> > >         struct elevator_queue *e = q->elevator;
> > >         LIST_HEAD(rq_list);
> > >         bool ret = false;
> >
> > The above initialization is just done once.
> >
> > >
> > >        do {
> > >               struct request *rq;
> > >
> > >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> > >                      break;
> > >
> > >               .......
> >                             ret = blk_mq_dispatch_rq_list(q, list, ...);
> >
> > list includes one request, so blk_mq_dispatch_rq_list() won't return
> > false in case of no request in list.
> >
> > >        } while (ret);
> > >
> > >        return ret;
> >
> > 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> > if at least one request is dispatched. So if it becomes false, the loop
> > breaks, that means .commit_rqs() has been called cause 'list' does
> > include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> > still returned.
> 
> Sorry for my confusing description, let me try again to describe the problem.
> When I try to mount the block device, I got the IO stuck with your
> patch, and I did some debugging. I found we missed calling
> commit_rqs() for one case:
> 
> void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
> blk_mq_hw_ctx *hctx)
>         struct elevator_queue *e = q->elevator;
>         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
>         LIST_HEAD(rq_list);
> +       bool dispatch_ret;
> 
>         /* RCU or SRCU read lock is needed before checking quiesced flag */
>         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
> blk_mq_hw_ctx *hctx)
>          */
>         if (!list_empty(&rq_list)) {
>                 blk_mq_sched_mark_restart_hctx(hctx);
> -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> 
> Suppose we dispatch one request to block driver, and return 'true' here.
> 
> +               if (dispatch_ret) {
>                         if (has_sched_dispatch)
> -                               blk_mq_do_dispatch_sched(hctx);
> +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> 
> Then we will continue to try to dispatch more requests from IO
> scheduler, but if there are no requests in IO scheduler now, it will
> return 'false' here, and set dispatch_ret as false.
> 
>                         else
> -                               blk_mq_do_dispatch_ctx(hctx);
> +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);

OK, this one is an issue, but it can be fixed simply by not updating
'dispatch_ret' for the following dispatch, something like the below
way:

	if (dispatch_ret) {
	        if (has_sched_dispatch)
	                blk_mq_do_dispatch_sched(hctx);
	        else
	                blk_mq_do_dispatch_ctx(hctx);
	}


Thanks,
Ming
Baolin Wang March 23, 2020, 9:13 a.m. UTC | #9
On Mon, Mar 23, 2020 at 4:29 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Mon, Mar 23, 2020 at 04:22:38PM +0800, Baolin Wang wrote:
> > On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > > > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > > >
> > > > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > > > Hi Ming,
> > > > > >
> > > > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > > > >
> > > > > > > Hi Ming,
> > > > > > >
> > > > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > > > that means we can package several requests to host controller at one
> > > > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > > > requests as much as possible.
> > > > > > > > >
> > > > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > > > the maximum batch requests number.
> > > > > > > > >
> > > > > > > >
> > > > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > > > .queue_rq().
> > > > > > > >
> > > > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > > > >
> > > > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > > > if it can work and give you the feadback. Thanks again.
> > > > > >
> > > > > > I tried your approach on my platform, but met some problems, see below.
> > > > > >
> > > > > > >
> > > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > > > --- a/block/blk-mq-sched.c
> > > > > > > > +++ b/block/blk-mq-sched.c
> > > > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > >   */
> > > > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > >  {
> > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > +       bool ret = false;
> > > > > > > >
> > > > > > > >         do {
> > > > > > > >                 struct request *rq;
> > > > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > > > >                  */
> > > > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > +       } while (ret);
> > > > > > > > +
> > > > > > > > +       return ret;
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > >   */
> > > > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > >  {
> > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > > > +       bool ret = false;
> > > > > > > >
> > > > > > > >         do {
> > > > > > > >                 struct request *rq;
> > > > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > >
> > > > > > > >                 /* round robin for fair dispatch */
> > > > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > > > -
> > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > +       } while (ret);
> > > > > > > >
> > > > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > > > +
> > > > > > > > +       return ret;
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > +       bool dispatch_ret;
> > > > > > > >
> > > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > >          */
> > > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > > +               if (dispatch_ret) {
> > > > > > > >                         if (has_sched_dispatch)
> > > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > >
> > > > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > > > >
> > > > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > > > is true. New request is always added to list before calling
> > > > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > > > false, it means that .commit_rqs() has been called.
> > > >
> > > > Not really, if no requests int the IO cheduler, we will break the loop
> > > > in blk_mq_do_dispatch_sched() and return false without calling
> > > > .commit_rqs().
> > >
> > > If there isn't any request to dispatch, false is returned. Otherwise,
> > > always return the return value of last 'blk_mq_dispatch_rq_list'.
> > >
> > > >
> > > > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > > > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > > > and did not call .commit_rqs(). Then the IO processing will be stuck.
> > >
> > > See below.
> > >
> > > >
> > > > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > {
> > > >         struct request_queue *q = hctx->queue;
> > > >         struct elevator_queue *e = q->elevator;
> > > >         LIST_HEAD(rq_list);
> > > >         bool ret = false;
> > >
> > > The above initialization is just done once.
> > >
> > > >
> > > >        do {
> > > >               struct request *rq;
> > > >
> > > >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> > > >                      break;
> > > >
> > > >               .......
> > >                             ret = blk_mq_dispatch_rq_list(q, list, ...);
> > >
> > > list includes one request, so blk_mq_dispatch_rq_list() won't return
> > > false in case of no request in list.
> > >
> > > >        } while (ret);
> > > >
> > > >        return ret;
> > >
> > > 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> > > if at least one request is dispatched. So if it becomes false, the loop
> > > breaks, that means .commit_rqs() has been called cause 'list' does
> > > include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> > > still returned.
> >
> > Sorry for my confusing description, let me try again to describe the problem.
> > When I try to mount the block device, I got the IO stuck with your
> > patch, and I did some debugging. I found we missed calling
> > commit_rqs() for one case:
> >
> > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
> > blk_mq_hw_ctx *hctx)
> >         struct elevator_queue *e = q->elevator;
> >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> >         LIST_HEAD(rq_list);
> > +       bool dispatch_ret;
> >
> >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
> > blk_mq_hw_ctx *hctx)
> >          */
> >         if (!list_empty(&rq_list)) {
> >                 blk_mq_sched_mark_restart_hctx(hctx);
> > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> >
> > Suppose we dispatch one request to block driver, and return 'true' here.
> >
> > +               if (dispatch_ret) {
> >                         if (has_sched_dispatch)
> > -                               blk_mq_do_dispatch_sched(hctx);
> > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> >
> > Then we will continue to try to dispatch more requests from IO
> > scheduler, but if there are no requests in IO scheduler now, it will
> > return 'false' here, and set dispatch_ret as false.
> >
> >                         else
> > -                               blk_mq_do_dispatch_ctx(hctx);
> > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
>
> OK, this one is an issue, but it can be fixed simply by not updating
> 'dispatch_ret' for the following dispatch, something like the below
> way:
>
>         if (dispatch_ret) {
>                 if (has_sched_dispatch)
>                         blk_mq_do_dispatch_sched(hctx);
>                 else
>                         blk_mq_do_dispatch_ctx(hctx);
>         }

Yes, this can work.

But I found your patch will drop some performance comparing with my
method in patch 1/2. My method can fetch several requests from IO
scheduler and dispatch them to block driver at one time, but in your
patch we still need dispatch request one by one, which will drop some
performance I think.
What do you think? Thanks.
Ming Lei March 23, 2020, 9:58 a.m. UTC | #10
On Mon, Mar 23, 2020 at 05:13:27PM +0800, Baolin Wang wrote:
> On Mon, Mar 23, 2020 at 4:29 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Mon, Mar 23, 2020 at 04:22:38PM +0800, Baolin Wang wrote:
> > > On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > > > > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > > > > Hi Ming,
> > > > > > >
> > > > > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > > > > >
> > > > > > > > Hi Ming,
> > > > > > > >
> > > > > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > > > > that means we can package several requests to host controller at one
> > > > > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > > > > requests as much as possible.
> > > > > > > > > >
> > > > > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > > > > the maximum batch requests number.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > > > > .queue_rq().
> > > > > > > > >
> > > > > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > > > > >
> > > > > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > > > > if it can work and give you the feadback. Thanks again.
> > > > > > >
> > > > > > > I tried your approach on my platform, but met some problems, see below.
> > > > > > >
> > > > > > > >
> > > > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > > > > --- a/block/blk-mq-sched.c
> > > > > > > > > +++ b/block/blk-mq-sched.c
> > > > > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > >   */
> > > > > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > >  {
> > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > +       bool ret = false;
> > > > > > > > >
> > > > > > > > >         do {
> > > > > > > > >                 struct request *rq;
> > > > > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > > > > >                  */
> > > > > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > +       } while (ret);
> > > > > > > > > +
> > > > > > > > > +       return ret;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > >   */
> > > > > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > >  {
> > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > > > > +       bool ret = false;
> > > > > > > > >
> > > > > > > > >         do {
> > > > > > > > >                 struct request *rq;
> > > > > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > >
> > > > > > > > >                 /* round robin for fair dispatch */
> > > > > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > > > > -
> > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > +       } while (ret);
> > > > > > > > >
> > > > > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > > > > +
> > > > > > > > > +       return ret;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > +       bool dispatch_ret;
> > > > > > > > >
> > > > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > >          */
> > > > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > > > +               if (dispatch_ret) {
> > > > > > > > >                         if (has_sched_dispatch)
> > > > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > > >
> > > > > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > > > > >
> > > > > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > > > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > > > > is true. New request is always added to list before calling
> > > > > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > > > > false, it means that .commit_rqs() has been called.
> > > > >
> > > > > Not really, if no requests int the IO cheduler, we will break the loop
> > > > > in blk_mq_do_dispatch_sched() and return false without calling
> > > > > .commit_rqs().
> > > >
> > > > If there isn't any request to dispatch, false is returned. Otherwise,
> > > > always return the return value of last 'blk_mq_dispatch_rq_list'.
> > > >
> > > > >
> > > > > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > > > > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > > > > and did not call .commit_rqs(). Then the IO processing will be stuck.
> > > >
> > > > See below.
> > > >
> > > > >
> > > > > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > {
> > > > >         struct request_queue *q = hctx->queue;
> > > > >         struct elevator_queue *e = q->elevator;
> > > > >         LIST_HEAD(rq_list);
> > > > >         bool ret = false;
> > > >
> > > > The above initialization is just done once.
> > > >
> > > > >
> > > > >        do {
> > > > >               struct request *rq;
> > > > >
> > > > >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> > > > >                      break;
> > > > >
> > > > >               .......
> > > >                             ret = blk_mq_dispatch_rq_list(q, list, ...);
> > > >
> > > > list includes one request, so blk_mq_dispatch_rq_list() won't return
> > > > false in case of no request in list.
> > > >
> > > > >        } while (ret);
> > > > >
> > > > >        return ret;
> > > >
> > > > 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> > > > if at least one request is dispatched. So if it becomes false, the loop
> > > > breaks, that means .commit_rqs() has been called cause 'list' does
> > > > include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> > > > still returned.
> > >
> > > Sorry for my confusing description, let me try again to describe the problem.
> > > When I try to mount the block device, I got the IO stuck with your
> > > patch, and I did some debugging. I found we missed calling
> > > commit_rqs() for one case:
> > >
> > > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
> > > blk_mq_hw_ctx *hctx)
> > >         struct elevator_queue *e = q->elevator;
> > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > >         LIST_HEAD(rq_list);
> > > +       bool dispatch_ret;
> > >
> > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
> > > blk_mq_hw_ctx *hctx)
> > >          */
> > >         if (!list_empty(&rq_list)) {
> > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > >
> > > Suppose we dispatch one request to block driver, and return 'true' here.
> > >
> > > +               if (dispatch_ret) {
> > >                         if (has_sched_dispatch)
> > > -                               blk_mq_do_dispatch_sched(hctx);
> > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > >
> > > Then we will continue to try to dispatch more requests from IO
> > > scheduler, but if there are no requests in IO scheduler now, it will
> > > return 'false' here, and set dispatch_ret as false.
> > >
> > >                         else
> > > -                               blk_mq_do_dispatch_ctx(hctx);
> > > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> >
> > OK, this one is an issue, but it can be fixed simply by not updating
> > 'dispatch_ret' for the following dispatch, something like the below
> > way:
> >
> >         if (dispatch_ret) {
> >                 if (has_sched_dispatch)
> >                         blk_mq_do_dispatch_sched(hctx);
> >                 else
> >                         blk_mq_do_dispatch_ctx(hctx);
> >         }
> 
> Yes, this can work.
> 
> But I found your patch will drop some performance comparing with my
> method in patch 1/2. My method can fetch several requests from IO
> scheduler and dispatch them to block driver at one time, but in your
> patch we still need dispatch request one by one, which will drop some
> performance I think.
> What do you think? Thanks.

Please run your test and see if performance drop can be observed.

Thanks,
Ming
Baolin Wang March 24, 2020, 8:29 a.m. UTC | #11
Hi Ming,

On Mon, Mar 23, 2020 at 5:58 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Mon, Mar 23, 2020 at 05:13:27PM +0800, Baolin Wang wrote:
> > On Mon, Mar 23, 2020 at 4:29 PM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Mon, Mar 23, 2020 at 04:22:38PM +0800, Baolin Wang wrote:
> > > > On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > >
> > > > > On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > > > > > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > > > > > Hi Ming,
> > > > > > > >
> > > > > > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > > > > > >
> > > > > > > > > Hi Ming,
> > > > > > > > >
> > > > > > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > > > > > that means we can package several requests to host controller at one
> > > > > > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > > > > > requests as much as possible.
> > > > > > > > > > >
> > > > > > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > > > > > the maximum batch requests number.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > > > > > .queue_rq().
> > > > > > > > > >
> > > > > > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > > > > > >
> > > > > > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > > > > > if it can work and give you the feadback. Thanks again.
> > > > > > > >
> > > > > > > > I tried your approach on my platform, but met some problems, see below.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > > > > > --- a/block/blk-mq-sched.c
> > > > > > > > > > +++ b/block/blk-mq-sched.c
> > > > > > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > >   */
> > > > > > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > >  {
> > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > +       bool ret = false;
> > > > > > > > > >
> > > > > > > > > >         do {
> > > > > > > > > >                 struct request *rq;
> > > > > > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > > > > > >                  */
> > > > > > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > +       } while (ret);
> > > > > > > > > > +
> > > > > > > > > > +       return ret;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > >   */
> > > > > > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > >  {
> > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > > > > > +       bool ret = false;
> > > > > > > > > >
> > > > > > > > > >         do {
> > > > > > > > > >                 struct request *rq;
> > > > > > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > >
> > > > > > > > > >                 /* round robin for fair dispatch */
> > > > > > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > > > > > -
> > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > +       } while (ret);
> > > > > > > > > >
> > > > > > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > > > > > +
> > > > > > > > > > +       return ret;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > +       bool dispatch_ret;
> > > > > > > > > >
> > > > > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > >          */
> > > > > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > > > > +               if (dispatch_ret) {
> > > > > > > > > >                         if (has_sched_dispatch)
> > > > > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > > > >
> > > > > > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > > > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > > > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > > > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > > > > > >
> > > > > > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > > > > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > > > > > is true. New request is always added to list before calling
> > > > > > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > > > > > false, it means that .commit_rqs() has been called.
> > > > > >
> > > > > > Not really, if no requests int the IO cheduler, we will break the loop
> > > > > > in blk_mq_do_dispatch_sched() and return false without calling
> > > > > > .commit_rqs().
> > > > >
> > > > > If there isn't any request to dispatch, false is returned. Otherwise,
> > > > > always return the return value of last 'blk_mq_dispatch_rq_list'.
> > > > >
> > > > > >
> > > > > > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > > > > > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > > > > > and did not call .commit_rqs(). Then the IO processing will be stuck.
> > > > >
> > > > > See below.
> > > > >
> > > > > >
> > > > > > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > {
> > > > > >         struct request_queue *q = hctx->queue;
> > > > > >         struct elevator_queue *e = q->elevator;
> > > > > >         LIST_HEAD(rq_list);
> > > > > >         bool ret = false;
> > > > >
> > > > > The above initialization is just done once.
> > > > >
> > > > > >
> > > > > >        do {
> > > > > >               struct request *rq;
> > > > > >
> > > > > >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> > > > > >                      break;
> > > > > >
> > > > > >               .......
> > > > >                             ret = blk_mq_dispatch_rq_list(q, list, ...);
> > > > >
> > > > > list includes one request, so blk_mq_dispatch_rq_list() won't return
> > > > > false in case of no request in list.
> > > > >
> > > > > >        } while (ret);
> > > > > >
> > > > > >        return ret;
> > > > >
> > > > > 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> > > > > if at least one request is dispatched. So if it becomes false, the loop
> > > > > breaks, that means .commit_rqs() has been called cause 'list' does
> > > > > include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> > > > > still returned.
> > > >
> > > > Sorry for my confusing description, let me try again to describe the problem.
> > > > When I try to mount the block device, I got the IO stuck with your
> > > > patch, and I did some debugging. I found we missed calling
> > > > commit_rqs() for one case:
> > > >
> > > > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
> > > > blk_mq_hw_ctx *hctx)
> > > >         struct elevator_queue *e = q->elevator;
> > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > >         LIST_HEAD(rq_list);
> > > > +       bool dispatch_ret;
> > > >
> > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
> > > > blk_mq_hw_ctx *hctx)
> > > >          */
> > > >         if (!list_empty(&rq_list)) {
> > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > >
> > > > Suppose we dispatch one request to block driver, and return 'true' here.
> > > >
> > > > +               if (dispatch_ret) {
> > > >                         if (has_sched_dispatch)
> > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > >
> > > > Then we will continue to try to dispatch more requests from IO
> > > > scheduler, but if there are no requests in IO scheduler now, it will
> > > > return 'false' here, and set dispatch_ret as false.
> > > >
> > > >                         else
> > > > -                               blk_mq_do_dispatch_ctx(hctx);
> > > > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > >
> > > OK, this one is an issue, but it can be fixed simply by not updating
> > > 'dispatch_ret' for the following dispatch, something like the below
> > > way:
> > >
> > >         if (dispatch_ret) {
> > >                 if (has_sched_dispatch)
> > >                         blk_mq_do_dispatch_sched(hctx);
> > >                 else
> > >                         blk_mq_do_dispatch_ctx(hctx);
> > >         }
> >
> > Yes, this can work.
> >
> > But I found your patch will drop some performance comparing with my
> > method in patch 1/2. My method can fetch several requests from IO
> > scheduler and dispatch them to block driver at one time, but in your
> > patch we still need dispatch request one by one, which will drop some
> > performance I think.
> > What do you think? Thanks.
>
> Please run your test and see if performance drop can be observed.

From my testing (using the same fio configuration in cover letter), I
found your method will drop some performance from below data.

My original patches:
Sequential read: 229.6MiB/s
Random read:180.8MiB/s
Sequential write: 172MiB/s
Random write:169.2MiB/s

Your patches:
Sequential read: 209MiB/s
Random read:177MiB/s
Sequential write: 148MiB/s
Random write:147MiB/s
Baolin Wang March 27, 2020, 8:30 a.m. UTC | #12
Hi Ming,

On Tue, Mar 24, 2020 at 4:29 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
>
> Hi Ming,
>
> On Mon, Mar 23, 2020 at 5:58 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Mon, Mar 23, 2020 at 05:13:27PM +0800, Baolin Wang wrote:
> > > On Mon, Mar 23, 2020 at 4:29 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Mon, Mar 23, 2020 at 04:22:38PM +0800, Baolin Wang wrote:
> > > > > On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > > > > > > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > > > > > > Hi Ming,
> > > > > > > > >
> > > > > > > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > > > > > > >
> > > > > > > > > > Hi Ming,
> > > > > > > > > >
> > > > > > > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > > > > > > that means we can package several requests to host controller at one
> > > > > > > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > > > > > > requests as much as possible.
> > > > > > > > > > > >
> > > > > > > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > > > > > > the maximum batch requests number.
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > > > > > > .queue_rq().
> > > > > > > > > > >
> > > > > > > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > > > > > > >
> > > > > > > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > > > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > > > > > > if it can work and give you the feadback. Thanks again.
> > > > > > > > >
> > > > > > > > > I tried your approach on my platform, but met some problems, see below.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > > > > > > --- a/block/blk-mq-sched.c
> > > > > > > > > > > +++ b/block/blk-mq-sched.c
> > > > > > > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > > >   */
> > > > > > > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > >  {
> > > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > +       bool ret = false;
> > > > > > > > > > >
> > > > > > > > > > >         do {
> > > > > > > > > > >                 struct request *rq;
> > > > > > > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > > > > > > >                  */
> > > > > > > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > > +       } while (ret);
> > > > > > > > > > > +
> > > > > > > > > > > +       return ret;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > > >   */
> > > > > > > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > >  {
> > > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > > > > > > +       bool ret = false;
> > > > > > > > > > >
> > > > > > > > > > >         do {
> > > > > > > > > > >                 struct request *rq;
> > > > > > > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > >
> > > > > > > > > > >                 /* round robin for fair dispatch */
> > > > > > > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > > > > > > -
> > > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > > +       } while (ret);
> > > > > > > > > > >
> > > > > > > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > > > > > > +
> > > > > > > > > > > +       return ret;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > +       bool dispatch_ret;
> > > > > > > > > > >
> > > > > > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > >          */
> > > > > > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > > > > > +               if (dispatch_ret) {
> > > > > > > > > > >                         if (has_sched_dispatch)
> > > > > > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > > > > >
> > > > > > > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > > > > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > > > > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > > > > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > > > > > > >
> > > > > > > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > > > > > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > > > > > > is true. New request is always added to list before calling
> > > > > > > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > > > > > > false, it means that .commit_rqs() has been called.
> > > > > > >
> > > > > > > Not really, if no requests int the IO cheduler, we will break the loop
> > > > > > > in blk_mq_do_dispatch_sched() and return false without calling
> > > > > > > .commit_rqs().
> > > > > >
> > > > > > If there isn't any request to dispatch, false is returned. Otherwise,
> > > > > > always return the return value of last 'blk_mq_dispatch_rq_list'.
> > > > > >
> > > > > > >
> > > > > > > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > > > > > > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > > > > > > and did not call .commit_rqs(). Then the IO processing will be stuck.
> > > > > >
> > > > > > See below.
> > > > > >
> > > > > > >
> > > > > > > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > {
> > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > >         LIST_HEAD(rq_list);
> > > > > > >         bool ret = false;
> > > > > >
> > > > > > The above initialization is just done once.
> > > > > >
> > > > > > >
> > > > > > >        do {
> > > > > > >               struct request *rq;
> > > > > > >
> > > > > > >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> > > > > > >                      break;
> > > > > > >
> > > > > > >               .......
> > > > > >                             ret = blk_mq_dispatch_rq_list(q, list, ...);
> > > > > >
> > > > > > list includes one request, so blk_mq_dispatch_rq_list() won't return
> > > > > > false in case of no request in list.
> > > > > >
> > > > > > >        } while (ret);
> > > > > > >
> > > > > > >        return ret;
> > > > > >
> > > > > > 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> > > > > > if at least one request is dispatched. So if it becomes false, the loop
> > > > > > breaks, that means .commit_rqs() has been called cause 'list' does
> > > > > > include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> > > > > > still returned.
> > > > >
> > > > > Sorry for my confusing description, let me try again to describe the problem.
> > > > > When I try to mount the block device, I got the IO stuck with your
> > > > > patch, and I did some debugging. I found we missed calling
> > > > > commit_rqs() for one case:
> > > > >
> > > > > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
> > > > > blk_mq_hw_ctx *hctx)
> > > > >         struct elevator_queue *e = q->elevator;
> > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > >         LIST_HEAD(rq_list);
> > > > > +       bool dispatch_ret;
> > > > >
> > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
> > > > > blk_mq_hw_ctx *hctx)
> > > > >          */
> > > > >         if (!list_empty(&rq_list)) {
> > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > >
> > > > > Suppose we dispatch one request to block driver, and return 'true' here.
> > > > >
> > > > > +               if (dispatch_ret) {
> > > > >                         if (has_sched_dispatch)
> > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > >
> > > > > Then we will continue to try to dispatch more requests from IO
> > > > > scheduler, but if there are no requests in IO scheduler now, it will
> > > > > return 'false' here, and set dispatch_ret as false.
> > > > >
> > > > >                         else
> > > > > -                               blk_mq_do_dispatch_ctx(hctx);
> > > > > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > > >
> > > > OK, this one is an issue, but it can be fixed simply by not updating
> > > > 'dispatch_ret' for the following dispatch, something like the below
> > > > way:
> > > >
> > > >         if (dispatch_ret) {
> > > >                 if (has_sched_dispatch)
> > > >                         blk_mq_do_dispatch_sched(hctx);
> > > >                 else
> > > >                         blk_mq_do_dispatch_ctx(hctx);
> > > >         }
> > >
> > > Yes, this can work.
> > >
> > > But I found your patch will drop some performance comparing with my
> > > method in patch 1/2. My method can fetch several requests from IO
> > > scheduler and dispatch them to block driver at one time, but in your
> > > patch we still need dispatch request one by one, which will drop some
> > > performance I think.
> > > What do you think? Thanks.
> >
> > Please run your test and see if performance drop can be observed.
>
> From my testing (using the same fio configuration in cover letter), I
> found your method will drop some performance from below data.
>
> My original patches:
> Sequential read: 229.6MiB/s
> Random read:180.8MiB/s
> Sequential write: 172MiB/s
> Random write:169.2MiB/s
>
> Your patches:
> Sequential read: 209MiB/s
> Random read:177MiB/s
> Sequential write: 148MiB/s
> Random write:147MiB/s

After some optimiziton and I did more testing, I did not found any
performance issue with your patch comparing with my old method. Sorry
for noise in my last email.

So will you send out a formal patch? If yes, please add my test-by
tag. Thanks for your help.
Tested-by: Baolin Wang <baolin.wang7@gmail.com>
Baolin Wang April 22, 2020, 9:21 a.m. UTC | #13
Hi Ming,

On Fri, Mar 27, 2020 at 4:30 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
>
> Hi Ming,
>
> On Tue, Mar 24, 2020 at 4:29 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> >
> > Hi Ming,
> >
> > On Mon, Mar 23, 2020 at 5:58 PM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Mon, Mar 23, 2020 at 05:13:27PM +0800, Baolin Wang wrote:
> > > > On Mon, Mar 23, 2020 at 4:29 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > >
> > > > > On Mon, Mar 23, 2020 at 04:22:38PM +0800, Baolin Wang wrote:
> > > > > > On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > > > > > > > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > > > > > > > Hi Ming,
> > > > > > > > > >
> > > > > > > > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > Hi Ming,
> > > > > > > > > > >
> > > > > > > > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > > > > > > > that means we can package several requests to host controller at one
> > > > > > > > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > > > > > > > requests as much as possible.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > > > > > > > the maximum batch requests number.
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > > > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > > > > > > > .queue_rq().
> > > > > > > > > > > >
> > > > > > > > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > > > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > > > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > > > > > > > >
> > > > > > > > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > > > > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > > > > > > > if it can work and give you the feadback. Thanks again.
> > > > > > > > > >
> > > > > > > > > > I tried your approach on my platform, but met some problems, see below.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > > > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > > > > > > > --- a/block/blk-mq-sched.c
> > > > > > > > > > > > +++ b/block/blk-mq-sched.c
> > > > > > > > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > > > >   */
> > > > > > > > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > >  {
> > > > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > > +       bool ret = false;
> > > > > > > > > > > >
> > > > > > > > > > > >         do {
> > > > > > > > > > > >                 struct request *rq;
> > > > > > > > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > > > > > > > >                  */
> > > > > > > > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > > > +       } while (ret);
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return ret;
> > > > > > > > > > > >  }
> > > > > > > > > > > >
> > > > > > > > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > > > >   */
> > > > > > > > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > >  {
> > > > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > > > > > > > +       bool ret = false;
> > > > > > > > > > > >
> > > > > > > > > > > >         do {
> > > > > > > > > > > >                 struct request *rq;
> > > > > > > > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > >
> > > > > > > > > > > >                 /* round robin for fair dispatch */
> > > > > > > > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > > > > > > > -
> > > > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > > > +       } while (ret);
> > > > > > > > > > > >
> > > > > > > > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return ret;
> > > > > > > > > > > >  }
> > > > > > > > > > > >
> > > > > > > > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > > +       bool dispatch_ret;
> > > > > > > > > > > >
> > > > > > > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > >          */
> > > > > > > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > > > > > > +               if (dispatch_ret) {
> > > > > > > > > > > >                         if (has_sched_dispatch)
> > > > > > > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > >
> > > > > > > > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > > > > > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > > > > > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > > > > > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > > > > > > > >
> > > > > > > > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > > > > > > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > > > > > > > is true. New request is always added to list before calling
> > > > > > > > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > > > > > > > false, it means that .commit_rqs() has been called.
> > > > > > > >
> > > > > > > > Not really, if no requests int the IO cheduler, we will break the loop
> > > > > > > > in blk_mq_do_dispatch_sched() and return false without calling
> > > > > > > > .commit_rqs().
> > > > > > >
> > > > > > > If there isn't any request to dispatch, false is returned. Otherwise,
> > > > > > > always return the return value of last 'blk_mq_dispatch_rq_list'.
> > > > > > >
> > > > > > > >
> > > > > > > > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > > > > > > > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > > > > > > > and did not call .commit_rqs(). Then the IO processing will be stuck.
> > > > > > >
> > > > > > > See below.
> > > > > > >
> > > > > > > >
> > > > > > > > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > {
> > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > >         bool ret = false;
> > > > > > >
> > > > > > > The above initialization is just done once.
> > > > > > >
> > > > > > > >
> > > > > > > >        do {
> > > > > > > >               struct request *rq;
> > > > > > > >
> > > > > > > >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> > > > > > > >                      break;
> > > > > > > >
> > > > > > > >               .......
> > > > > > >                             ret = blk_mq_dispatch_rq_list(q, list, ...);
> > > > > > >
> > > > > > > list includes one request, so blk_mq_dispatch_rq_list() won't return
> > > > > > > false in case of no request in list.
> > > > > > >
> > > > > > > >        } while (ret);
> > > > > > > >
> > > > > > > >        return ret;
> > > > > > >
> > > > > > > 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> > > > > > > if at least one request is dispatched. So if it becomes false, the loop
> > > > > > > breaks, that means .commit_rqs() has been called cause 'list' does
> > > > > > > include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> > > > > > > still returned.
> > > > > >
> > > > > > Sorry for my confusing description, let me try again to describe the problem.
> > > > > > When I try to mount the block device, I got the IO stuck with your
> > > > > > patch, and I did some debugging. I found we missed calling
> > > > > > commit_rqs() for one case:
> > > > > >
> > > > > > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
> > > > > > blk_mq_hw_ctx *hctx)
> > > > > >         struct elevator_queue *e = q->elevator;
> > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > >         LIST_HEAD(rq_list);
> > > > > > +       bool dispatch_ret;
> > > > > >
> > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
> > > > > > blk_mq_hw_ctx *hctx)
> > > > > >          */
> > > > > >         if (!list_empty(&rq_list)) {
> > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > >
> > > > > > Suppose we dispatch one request to block driver, and return 'true' here.
> > > > > >
> > > > > > +               if (dispatch_ret) {
> > > > > >                         if (has_sched_dispatch)
> > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > >
> > > > > > Then we will continue to try to dispatch more requests from IO
> > > > > > scheduler, but if there are no requests in IO scheduler now, it will
> > > > > > return 'false' here, and set dispatch_ret as false.
> > > > > >
> > > > > >                         else
> > > > > > -                               blk_mq_do_dispatch_ctx(hctx);
> > > > > > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > > > >
> > > > > OK, this one is an issue, but it can be fixed simply by not updating
> > > > > 'dispatch_ret' for the following dispatch, something like the below
> > > > > way:
> > > > >
> > > > >         if (dispatch_ret) {
> > > > >                 if (has_sched_dispatch)
> > > > >                         blk_mq_do_dispatch_sched(hctx);
> > > > >                 else
> > > > >                         blk_mq_do_dispatch_ctx(hctx);
> > > > >         }
> > > >
> > > > Yes, this can work.
> > > >
> > > > But I found your patch will drop some performance comparing with my
> > > > method in patch 1/2. My method can fetch several requests from IO
> > > > scheduler and dispatch them to block driver at one time, but in your
> > > > patch we still need dispatch request one by one, which will drop some
> > > > performance I think.
> > > > What do you think? Thanks.
> > >
> > > Please run your test and see if performance drop can be observed.
> >
> > From my testing (using the same fio configuration in cover letter), I
> > found your method will drop some performance from below data.
> >
> > My original patches:
> > Sequential read: 229.6MiB/s
> > Random read:180.8MiB/s
> > Sequential write: 172MiB/s
> > Random write:169.2MiB/s
> >
> > Your patches:
> > Sequential read: 209MiB/s
> > Random read:177MiB/s
> > Sequential write: 148MiB/s
> > Random write:147MiB/s
>
> After some optimiziton and I did more testing, I did not found any
> performance issue with your patch comparing with my old method. Sorry
> for noise in my last email.
>
> So will you send out a formal patch? If yes, please add my test-by
> tag. Thanks for your help.
> Tested-by: Baolin Wang <baolin.wang7@gmail.com>

Can I take this patch into my patch set with your authority? Or you
want to send it out by yourself? Thanks.
Ming Lei April 22, 2020, 9:25 a.m. UTC | #14
On Wed, Apr 22, 2020 at 05:21:01PM +0800, Baolin Wang wrote:
> Hi Ming,
> 
> On Fri, Mar 27, 2020 at 4:30 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> >
> > Hi Ming,
> >
> > On Tue, Mar 24, 2020 at 4:29 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > >
> > > Hi Ming,
> > >
> > > On Mon, Mar 23, 2020 at 5:58 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Mon, Mar 23, 2020 at 05:13:27PM +0800, Baolin Wang wrote:
> > > > > On Mon, Mar 23, 2020 at 4:29 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 23, 2020 at 04:22:38PM +0800, Baolin Wang wrote:
> > > > > > > On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > > > > > > > > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > > > > > > > > Hi Ming,
> > > > > > > > > > >
> > > > > > > > > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > Hi Ming,
> > > > > > > > > > > >
> > > > > > > > > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > > > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > > > > > > > > that means we can package several requests to host controller at one
> > > > > > > > > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > > > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > > > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > > > > > > > > requests as much as possible.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > > > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > > > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > > > > > > > > the maximum batch requests number.
> > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > > > > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > > > > > > > > .queue_rq().
> > > > > > > > > > > > >
> > > > > > > > > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > > > > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > > > > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > > > > > > > > >
> > > > > > > > > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > > > > > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > > > > > > > > if it can work and give you the feadback. Thanks again.
> > > > > > > > > > >
> > > > > > > > > > > I tried your approach on my platform, but met some problems, see below.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > > > > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > > > > > > > > --- a/block/blk-mq-sched.c
> > > > > > > > > > > > > +++ b/block/blk-mq-sched.c
> > > > > > > > > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > > > > >   */
> > > > > > > > > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > >  {
> > > > > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > > > +       bool ret = false;
> > > > > > > > > > > > >
> > > > > > > > > > > > >         do {
> > > > > > > > > > > > >                 struct request *rq;
> > > > > > > > > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > > > > > > > > >                  */
> > > > > > > > > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > > > > +       } while (ret);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return ret;
> > > > > > > > > > > > >  }
> > > > > > > > > > > > >
> > > > > > > > > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > > > > >   */
> > > > > > > > > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > >  {
> > > > > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > > > > > > > > +       bool ret = false;
> > > > > > > > > > > > >
> > > > > > > > > > > > >         do {
> > > > > > > > > > > > >                 struct request *rq;
> > > > > > > > > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > >
> > > > > > > > > > > > >                 /* round robin for fair dispatch */
> > > > > > > > > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > > > > > > > > -
> > > > > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > > > > +       } while (ret);
> > > > > > > > > > > > >
> > > > > > > > > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return ret;
> > > > > > > > > > > > >  }
> > > > > > > > > > > > >
> > > > > > > > > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > > > +       bool dispatch_ret;
> > > > > > > > > > > > >
> > > > > > > > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > > > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > >          */
> > > > > > > > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > > > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > > > > > > > +               if (dispatch_ret) {
> > > > > > > > > > > > >                         if (has_sched_dispatch)
> > > > > > > > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > > >
> > > > > > > > > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > > > > > > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > > > > > > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > > > > > > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > > > > > > > > >
> > > > > > > > > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > > > > > > > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > > > > > > > > is true. New request is always added to list before calling
> > > > > > > > > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > > > > > > > > false, it means that .commit_rqs() has been called.
> > > > > > > > >
> > > > > > > > > Not really, if no requests int the IO cheduler, we will break the loop
> > > > > > > > > in blk_mq_do_dispatch_sched() and return false without calling
> > > > > > > > > .commit_rqs().
> > > > > > > >
> > > > > > > > If there isn't any request to dispatch, false is returned. Otherwise,
> > > > > > > > always return the return value of last 'blk_mq_dispatch_rq_list'.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > > > > > > > > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > > > > > > > > and did not call .commit_rqs(). Then the IO processing will be stuck.
> > > > > > > >
> > > > > > > > See below.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > {
> > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > >         bool ret = false;
> > > > > > > >
> > > > > > > > The above initialization is just done once.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >        do {
> > > > > > > > >               struct request *rq;
> > > > > > > > >
> > > > > > > > >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> > > > > > > > >                      break;
> > > > > > > > >
> > > > > > > > >               .......
> > > > > > > >                             ret = blk_mq_dispatch_rq_list(q, list, ...);
> > > > > > > >
> > > > > > > > list includes one request, so blk_mq_dispatch_rq_list() won't return
> > > > > > > > false in case of no request in list.
> > > > > > > >
> > > > > > > > >        } while (ret);
> > > > > > > > >
> > > > > > > > >        return ret;
> > > > > > > >
> > > > > > > > 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> > > > > > > > if at least one request is dispatched. So if it becomes false, the loop
> > > > > > > > breaks, that means .commit_rqs() has been called cause 'list' does
> > > > > > > > include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> > > > > > > > still returned.
> > > > > > >
> > > > > > > Sorry for my confusing description, let me try again to describe the problem.
> > > > > > > When I try to mount the block device, I got the IO stuck with your
> > > > > > > patch, and I did some debugging. I found we missed calling
> > > > > > > commit_rqs() for one case:
> > > > > > >
> > > > > > > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
> > > > > > > blk_mq_hw_ctx *hctx)
> > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > >         LIST_HEAD(rq_list);
> > > > > > > +       bool dispatch_ret;
> > > > > > >
> > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
> > > > > > > blk_mq_hw_ctx *hctx)
> > > > > > >          */
> > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > >
> > > > > > > Suppose we dispatch one request to block driver, and return 'true' here.
> > > > > > >
> > > > > > > +               if (dispatch_ret) {
> > > > > > >                         if (has_sched_dispatch)
> > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > > >
> > > > > > > Then we will continue to try to dispatch more requests from IO
> > > > > > > scheduler, but if there are no requests in IO scheduler now, it will
> > > > > > > return 'false' here, and set dispatch_ret as false.
> > > > > > >
> > > > > > >                         else
> > > > > > > -                               blk_mq_do_dispatch_ctx(hctx);
> > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > > > > >
> > > > > > OK, this one is an issue, but it can be fixed simply by not updating
> > > > > > 'dispatch_ret' for the following dispatch, something like the below
> > > > > > way:
> > > > > >
> > > > > >         if (dispatch_ret) {
> > > > > >                 if (has_sched_dispatch)
> > > > > >                         blk_mq_do_dispatch_sched(hctx);
> > > > > >                 else
> > > > > >                         blk_mq_do_dispatch_ctx(hctx);
> > > > > >         }
> > > > >
> > > > > Yes, this can work.
> > > > >
> > > > > But I found your patch will drop some performance comparing with my
> > > > > method in patch 1/2. My method can fetch several requests from IO
> > > > > scheduler and dispatch them to block driver at one time, but in your
> > > > > patch we still need dispatch request one by one, which will drop some
> > > > > performance I think.
> > > > > What do you think? Thanks.
> > > >
> > > > Please run your test and see if performance drop can be observed.
> > >
> > > From my testing (using the same fio configuration in cover letter), I
> > > found your method will drop some performance from below data.
> > >
> > > My original patches:
> > > Sequential read: 229.6MiB/s
> > > Random read:180.8MiB/s
> > > Sequential write: 172MiB/s
> > > Random write:169.2MiB/s
> > >
> > > Your patches:
> > > Sequential read: 209MiB/s
> > > Random read:177MiB/s
> > > Sequential write: 148MiB/s
> > > Random write:147MiB/s
> >
> > After some optimiziton and I did more testing, I did not found any
> > performance issue with your patch comparing with my old method. Sorry
> > for noise in my last email.
> >
> > So will you send out a formal patch? If yes, please add my test-by
> > tag. Thanks for your help.
> > Tested-by: Baolin Wang <baolin.wang7@gmail.com>
> 
> Can I take this patch into my patch set with your authority? Or you
> want to send it out by yourself? Thanks.

Hi Baolin,

You can fold the patch into your series.

Thanks,
Ming
Baolin Wang April 22, 2020, 9:28 a.m. UTC | #15
On Wed, Apr 22, 2020 at 5:26 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Wed, Apr 22, 2020 at 05:21:01PM +0800, Baolin Wang wrote:
> > Hi Ming,
> >
> > On Fri, Mar 27, 2020 at 4:30 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > >
> > > Hi Ming,
> > >
> > > On Tue, Mar 24, 2020 at 4:29 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > >
> > > > Hi Ming,
> > > >
> > > > On Mon, Mar 23, 2020 at 5:58 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > >
> > > > > On Mon, Mar 23, 2020 at 05:13:27PM +0800, Baolin Wang wrote:
> > > > > > On Mon, Mar 23, 2020 at 4:29 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, Mar 23, 2020 at 04:22:38PM +0800, Baolin Wang wrote:
> > > > > > > > On Mon, Mar 23, 2020 at 3:27 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Mar 23, 2020 at 01:36:34PM +0800, Baolin Wang wrote:
> > > > > > > > > > On Mon, Mar 23, 2020 at 11:44 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Fri, Mar 20, 2020 at 06:27:41PM +0800, Baolin Wang wrote:
> > > > > > > > > > > > Hi Ming,
> > > > > > > > > > > >
> > > > > > > > > > > > On Wed, Mar 18, 2020 at 6:26 PM Baolin Wang <baolin.wang7@gmail.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > Hi Ming,
> > > > > > > > > > > > >
> > > > > > > > > > > > > On Wed, Mar 18, 2020 at 6:01 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > On Mon, Mar 16, 2020 at 06:01:19PM +0800, Baolin Wang wrote:
> > > > > > > > > > > > > > > As we know, some SD/MMC host controllers can support packed request,
> > > > > > > > > > > > > > > that means we can package several requests to host controller at one
> > > > > > > > > > > > > > > time to improve performence. So the hardware driver expects the blk-mq
> > > > > > > > > > > > > > > can dispatch a batch of requests at one time, and driver can use bd.last
> > > > > > > > > > > > > > > to indicate if it is the last request in the batch to help to combine
> > > > > > > > > > > > > > > requests as much as possible.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Thus we should add batch requests setting from the block driver to tell
> > > > > > > > > > > > > > > the scheduler how many requests can be dispatched in a batch, as well
> > > > > > > > > > > > > > > as changing the scheduler to dispatch more than one request if setting
> > > > > > > > > > > > > > > the maximum batch requests number.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I feel this batch dispatch style is more complicated, and some other
> > > > > > > > > > > > > > drivers(virtio blk/scsi) still may get benefit if we can pass real 'last' flag in
> > > > > > > > > > > > > > .queue_rq().
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So what about the following way by extending .commit_rqs() to this usage?
> > > > > > > > > > > > > > And you can do whatever batch processing in .commit_rqs() which will be
> > > > > > > > > > > > > > guaranteed to be called if BLK_MQ_F_FORCE_COMMIT_RQS is set by driver.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I'm very appreciated for your good suggestion, which is much simpler than mine.
> > > > > > > > > > > > > It seems to solve my problem, and I will try it on my platform to see
> > > > > > > > > > > > > if it can work and give you the feadback. Thanks again.
> > > > > > > > > > > >
> > > > > > > > > > > > I tried your approach on my platform, but met some problems, see below.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > > > > > > > > > > index 856356b1619e..cd2bbe56f83f 100644
> > > > > > > > > > > > > > --- a/block/blk-mq-sched.c
> > > > > > > > > > > > > > +++ b/block/blk-mq-sched.c
> > > > > > > > > > > > > > @@ -85,11 +85,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > > > > > >   */
> > > > > > > > > > > > > > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > >  {
> > > > > > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > > > > +       bool ret = false;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         do {
> > > > > > > > > > > > > >                 struct request *rq;
> > > > > > > > > > > > > > @@ -112,7 +113,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > >                  * in blk_mq_dispatch_rq_list().
> > > > > > > > > > > > > >                  */
> > > > > > > > > > > > > >                 list_add(&rq->queuelist, &rq_list);
> > > > > > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > > > > > +       } while (ret);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return ret;
> > > > > > > > > > > > > >  }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >  static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > > > > > @@ -131,11 +135,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > > > > > > > > > >   * its queue by itself in its completion handler, so we don't need to
> > > > > > > > > > > > > >   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> > > > > > > > > > > > > >   */
> > > > > > > > > > > > > > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > >  {
> > > > > > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > > > >         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > > > > > > > > > > +       bool ret = false;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         do {
> > > > > > > > > > > > > >                 struct request *rq;
> > > > > > > > > > > > > > @@ -161,10 +166,12 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >                 /* round robin for fair dispatch */
> > > > > > > > > > > > > >                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > > > > > > > > > > > -
> > > > > > > > > > > > > > -       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > > > > > > > > > > > > > +               ret = blk_mq_dispatch_rq_list(q, &rq_list, true);
> > > > > > > > > > > > > > +       } while (ret);
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         WRITE_ONCE(hctx->dispatch_from, ctx);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return ret;
> > > > > > > > > > > > > >  }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > > > > > > +       bool dispatch_ret;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > > > > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > > > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > > > > >          */
> > > > > > > > > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > > > > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > > > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > > > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > > > > > > > > +               if (dispatch_ret) {
> > > > > > > > > > > > > >                         if (has_sched_dispatch)
> > > > > > > > > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > > > > > > > >
> > > > > > > > > > > > If we dispatched a request successfully by blk_mq_dispatch_rq_list(),
> > > > > > > > > > > > and got dispatch_ret = true now. Then we will try to dispatch more
> > > > > > > > > > > > reuqests from scheduler by blk_mq_do_dispatch_sched(), but if now no
> > > > > > > > > > > > more requests in scheduler, then we will got dispatch_ret = false. In
> > > > > > > > > > >
> > > > > > > > > > > 'dispatch_ret' always holds result of the last blk_mq_do_dispatch_sched().
> > > > > > > > > > > When any one request has been dispatched successfully, 'dispatch_ret'
> > > > > > > > > > > is true. New request is always added to list before calling
> > > > > > > > > > > blk_mq_do_dispatch_sched(), so once blk_mq_do_dispatch_sched() returns
> > > > > > > > > > > false, it means that .commit_rqs() has been called.
> > > > > > > > > >
> > > > > > > > > > Not really, if no requests int the IO cheduler, we will break the loop
> > > > > > > > > > in blk_mq_do_dispatch_sched() and return false without calling
> > > > > > > > > > .commit_rqs().
> > > > > > > > >
> > > > > > > > > If there isn't any request to dispatch, false is returned. Otherwise,
> > > > > > > > > always return the return value of last 'blk_mq_dispatch_rq_list'.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > So in this case, blk_mq_do_dispatch_sched() will return 'false', which
> > > > > > > > > > overlapped the return value of 'true' from blk_mq_dispatch_rq_list(),
> > > > > > > > > > and did not call .commit_rqs(). Then the IO processing will be stuck.
> > > > > > > > >
> > > > > > > > > See below.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > > > > > > > > > {
> > > > > > > > > >         struct request_queue *q = hctx->queue;
> > > > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > > >         bool ret = false;
> > > > > > > > >
> > > > > > > > > The above initialization is just done once.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >        do {
> > > > > > > > > >               struct request *rq;
> > > > > > > > > >
> > > > > > > > > >               if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
> > > > > > > > > >                      break;
> > > > > > > > > >
> > > > > > > > > >               .......
> > > > > > > > >                             ret = blk_mq_dispatch_rq_list(q, list, ...);
> > > > > > > > >
> > > > > > > > > list includes one request, so blk_mq_dispatch_rq_list() won't return
> > > > > > > > > false in case of no request in list.
> > > > > > > > >
> > > > > > > > > >        } while (ret);
> > > > > > > > > >
> > > > > > > > > >        return ret;
> > > > > > > > >
> > > > > > > > > 'ret' is always updated by return value of last blk_mq_dispatch_rq_list()
> > > > > > > > > if at least one request is dispatched. So if it becomes false, the loop
> > > > > > > > > breaks, that means .commit_rqs() has been called cause 'list' does
> > > > > > > > > include one request for blk_mq_dispatch_rq_list(). Otherwise, true is
> > > > > > > > > still returned.
> > > > > > > >
> > > > > > > > Sorry for my confusing description, let me try again to describe the problem.
> > > > > > > > When I try to mount the block device, I got the IO stuck with your
> > > > > > > > patch, and I did some debugging. I found we missed calling
> > > > > > > > commit_rqs() for one case:
> > > > > > > >
> > > > > > > > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> > > > > > > > @@ -173,6 +180,7 @@ void blk_mq_sched_dispatch_requests(struct
> > > > > > > > blk_mq_hw_ctx *hctx)
> > > > > > > >         struct elevator_queue *e = q->elevator;
> > > > > > > >         const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
> > > > > > > >         LIST_HEAD(rq_list);
> > > > > > > > +       bool dispatch_ret;
> > > > > > > >
> > > > > > > >         /* RCU or SRCU read lock is needed before checking quiesced flag */
> > > > > > > >         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
> > > > > > > > @@ -206,20 +214,26 @@ void blk_mq_sched_dispatch_requests(struct
> > > > > > > > blk_mq_hw_ctx *hctx)
> > > > > > > >          */
> > > > > > > >         if (!list_empty(&rq_list)) {
> > > > > > > >                 blk_mq_sched_mark_restart_hctx(hctx);
> > > > > > > > -               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
> > > > > > > > +               dispatch_ret = blk_mq_dispatch_rq_list(q, &rq_list, false);
> > > > > > > >
> > > > > > > > Suppose we dispatch one request to block driver, and return 'true' here.
> > > > > > > >
> > > > > > > > +               if (dispatch_ret) {
> > > > > > > >                         if (has_sched_dispatch)
> > > > > > > > -                               blk_mq_do_dispatch_sched(hctx);
> > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_sched(hctx);
> > > > > > > >
> > > > > > > > Then we will continue to try to dispatch more requests from IO
> > > > > > > > scheduler, but if there are no requests in IO scheduler now, it will
> > > > > > > > return 'false' here, and set dispatch_ret as false.
> > > > > > > >
> > > > > > > >                         else
> > > > > > > > -                               blk_mq_do_dispatch_ctx(hctx);
> > > > > > > > +                               dispatch_ret = blk_mq_do_dispatch_ctx(hctx);
> > > > > > >
> > > > > > > OK, this one is an issue, but it can be fixed simply by not updating
> > > > > > > 'dispatch_ret' for the following dispatch, something like the below
> > > > > > > way:
> > > > > > >
> > > > > > >         if (dispatch_ret) {
> > > > > > >                 if (has_sched_dispatch)
> > > > > > >                         blk_mq_do_dispatch_sched(hctx);
> > > > > > >                 else
> > > > > > >                         blk_mq_do_dispatch_ctx(hctx);
> > > > > > >         }
> > > > > >
> > > > > > Yes, this can work.
> > > > > >
> > > > > > But I found your patch will drop some performance comparing with my
> > > > > > method in patch 1/2. My method can fetch several requests from IO
> > > > > > scheduler and dispatch them to block driver at one time, but in your
> > > > > > patch we still need dispatch request one by one, which will drop some
> > > > > > performance I think.
> > > > > > What do you think? Thanks.
> > > > >
> > > > > Please run your test and see if performance drop can be observed.
> > > >
> > > > From my testing (using the same fio configuration in cover letter), I
> > > > found your method will drop some performance from below data.
> > > >
> > > > My original patches:
> > > > Sequential read: 229.6MiB/s
> > > > Random read:180.8MiB/s
> > > > Sequential write: 172MiB/s
> > > > Random write:169.2MiB/s
> > > >
> > > > Your patches:
> > > > Sequential read: 209MiB/s
> > > > Random read:177MiB/s
> > > > Sequential write: 148MiB/s
> > > > Random write:147MiB/s
> > >
> > > After some optimiziton and I did more testing, I did not found any
> > > performance issue with your patch comparing with my old method. Sorry
> > > for noise in my last email.
> > >
> > > So will you send out a formal patch? If yes, please add my test-by
> > > tag. Thanks for your help.
> > > Tested-by: Baolin Wang <baolin.wang7@gmail.com>
> >
> > Can I take this patch into my patch set with your authority? Or you
> > want to send it out by yourself? Thanks.
>
> Hi Baolin,
>
> You can fold the patch into your series.

OK. Much thanks for your help.
diff mbox series

Patch

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index d7a128e..9c1e3aa 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4793,29 +4793,37 @@  static int bfq_dispatch_requests(struct blk_mq_hw_ctx *hctx,
 				 struct list_head *list)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+	unsigned int batch_reqs = queue_max_batch_requests(hctx->queue) ? : 1;
 	struct request *rq;
 	struct bfq_queue *in_serv_queue;
 	bool waiting_rq, idle_timer_disabled;
+	int i;
 
-	spin_lock_irq(&bfqd->lock);
+	for (i = 0; i < batch_reqs; i++) {
+		spin_lock_irq(&bfqd->lock);
 
-	in_serv_queue = bfqd->in_service_queue;
-	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
+		in_serv_queue = bfqd->in_service_queue;
+		waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
 
-	rq = __bfq_dispatch_request(hctx);
+		rq = __bfq_dispatch_request(hctx);
 
-	idle_timer_disabled =
-		waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+		idle_timer_disabled =
+			waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
 
-	spin_unlock_irq(&bfqd->lock);
+		spin_unlock_irq(&bfqd->lock);
 
-	bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
-				  idle_timer_disabled);
+		bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
+					  idle_timer_disabled);
 
-	if (!rq)
-		return 0;
+		if (!rq) {
+			if (list_empty(list))
+				return 0;
 
-	list_add(&rq->queuelist, list);
+			return 1;
+		}
+
+		list_add(&rq->queuelist, list);
+	}
 
 	return 1;
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a12b176..2588e7a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1193,8 +1193,6 @@  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 	if (list_empty(list))
 		return false;
 
-	WARN_ON(!list_is_singular(list) && got_budget);
-
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c8eda2e..8c0b325 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -59,6 +59,7 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->zoned = BLK_ZONED_NONE;
+	lim->max_batch_reqs = 1;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -871,6 +872,18 @@  bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
 
+/**
+ * blk_queue_max_batch_requests - set max requests for batch processing
+ * @q:	the request queue for the device
+ * @max_batch_requests: maximum number of requests in a batch
+ **/
+void blk_queue_max_batch_requests(struct request_queue *q,
+				  unsigned int max_batch_requests)
+{
+	q->limits.max_batch_reqs = max_batch_requests;
+}
+EXPORT_SYMBOL(blk_queue_max_batch_requests);
+
 static int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 8f58434..3a84a5f 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -801,50 +801,56 @@  static int kyber_dispatch_requests(struct blk_mq_hw_ctx *hctx,
 {
 	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 	struct kyber_hctx_data *khd = hctx->sched_data;
+	unsigned int batch_reqs = queue_max_batch_requests(hctx->queue) ? : 1;
 	struct request *rq;
-	int i, ret = 0;
+	int i, j, ret = 0;
 
 	spin_lock(&khd->lock);
 
-	/*
-	 * First, if we are still entitled to batch, try to dispatch a request
-	 * from the batch.
-	 */
-	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
-		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
-		if (rq) {
-			list_add(&rq->queuelist, list);
-			ret = 1;
-			goto out;
+	for (j = 0; j < batch_reqs; j++) {
+		/*
+		 * First, if we are still entitled to batch, try to dispatch a
+		 * request from the batch.
+		 */
+		if (khd->batching < kyber_batch_size[khd->cur_domain]) {
+			rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
+			if (rq) {
+				list_add(&rq->queuelist, list);
+				ret = 1;
+				continue;
+			}
 		}
-	}
-
-	/*
-	 * Either,
-	 * 1. We were no longer entitled to a batch.
-	 * 2. The domain we were batching didn't have any requests.
-	 * 3. The domain we were batching was out of tokens.
-	 *
-	 * Start another batch. Note that this wraps back around to the original
-	 * domain if no other domains have requests or tokens.
-	 */
-	khd->batching = 0;
-	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
-		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
-			khd->cur_domain = 0;
-		else
-			khd->cur_domain++;
 
-		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
-		if (rq) {
-			list_add(&rq->queuelist, list);
-			ret = 1;
-			goto out;
+		/*
+		 * Either,
+		 * 1. We were no longer entitled to a batch.
+		 * 2. The domain we were batching didn't have any requests.
+		 * 3. The domain we were batching was out of tokens.
+		 *
+		 * Start another batch. Note that this wraps back around to the
+		 * original domain if no other domains have requests or tokens.
+		 */
+		khd->batching = 0;
+		for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+			if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
+				khd->cur_domain = 0;
+			else
+				khd->cur_domain++;
+
+			rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
+			if (rq) {
+				list_add(&rq->queuelist, list);
+				ret = 1;
+				break;
+			}
 		}
 	}
 
-out:
 	spin_unlock(&khd->lock);
+
+	if (list_empty(list))
+		ret = 0;
+
 	return ret;
 }
 
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 9fbffba..4e3d58a 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -382,16 +382,24 @@  static int dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
 				struct list_head *list)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	unsigned int batch_reqs = queue_max_batch_requests(hctx->queue) ? : 1;
 	struct request *rq;
+	int i;
 
-	spin_lock(&dd->lock);
-	rq = __dd_dispatch_request(dd);
-	spin_unlock(&dd->lock);
+	for (i = 0; i < batch_reqs; i++) {
+		spin_lock(&dd->lock);
+		rq = __dd_dispatch_request(dd);
+		spin_unlock(&dd->lock);
 
-	if (!rq)
-		return 0;
+		if (!rq) {
+			if (list_empty(list))
+				return 0;
 
-	list_add(&rq->queuelist, list);
+			return 1;
+		}
+
+		list_add(&rq->queuelist, list);
+	}
 
 	return 1;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 053ea4b..d7032a0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -338,6 +338,7 @@  struct queue_limits {
 	unsigned int		max_write_zeroes_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
+	unsigned int		max_batch_reqs;
 
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
@@ -1109,6 +1110,8 @@  extern void blk_queue_required_elevator_features(struct request_queue *q,
 						 unsigned int features);
 extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 					      struct device *dev);
+extern void blk_queue_max_batch_requests(struct request_queue *q,
+					 unsigned int max_batch_requests);
 
 /*
  * Number of physical segments as sent to the device.
@@ -1291,6 +1294,11 @@  static inline unsigned int queue_max_segment_size(const struct request_queue *q)
 	return q->limits.max_segment_size;
 }
 
+static inline unsigned int queue_max_batch_requests(const struct request_queue *q)
+{
+	return q->limits.max_batch_reqs;
+}
+
 static inline unsigned queue_logical_block_size(const struct request_queue *q)
 {
 	int retval = 512;