diff mbox

[V9,4/7] blk-mq: introduce .get_budget and .put_budget in blk_mq_ops

Message ID 20171013180532.29304-5-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ming Lei Oct. 13, 2017, 6:05 p.m. UTC
For SCSI devices, there is often per-request-queue depth, which need
to be respected before queuing one request.

The current blk-mq always dequeues request first, then calls .queue_rq()
to dispatch the request to lld. One obvious issue of this way is that I/O
merge may not be good, because when the per-request-queue depth can't be
respected, .queue_rq() has to return BLK_STS_RESOURCE, then this request
has to stay in hctx->dispatch list, and never got chance to participate
into I/O merge.

This patch introduces .get_budget and .put_budget callback in blk_mq_ops,
then we can try to get reserved budget first before dequeuing request.
Once the budget for queueing I/O can't be satisfied, we don't need to
dequeue request at all, then I/O merge can get improved a lot.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c   | 43 +++++++++++++++++++++++++++++++++++--------
 block/blk-mq.c         | 33 ++++++++++++++++++++++++++++++---
 block/blk-mq.h         | 21 ++++++++++++++++++++-
 include/linux/blk-mq.h | 11 +++++++++++
 4 files changed, 96 insertions(+), 12 deletions(-)

Comments

Bart Van Assche Oct. 13, 2017, 11:43 p.m. UTC | #1
On Sat, 2017-10-14 at 02:05 +0800, Ming Lei wrote:
> @@ -89,19 +89,36 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)

>  	return false;

>  }

>  

> -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)

> +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)


Shouldn't the meaning of the return value of this function be documented?

>  {

>  	struct request_queue *q = hctx->queue;

>  	struct elevator_queue *e = q->elevator;

>  	LIST_HEAD(rq_list);

>  

>  	do {

> -		struct request *rq = e->type->ops.mq.dispatch_request(hctx);

> +		struct request *rq;

> +		blk_status_t ret;

>  

> -		if (!rq)

> +		if (e->type->ops.mq.has_work &&

> +				!e->type->ops.mq.has_work(hctx))

>  			break;

> +

> +		ret = blk_mq_get_dispatch_budget(hctx);

> +		if (ret == BLK_STS_RESOURCE)

> +			return true;

> +

> +		rq = e->type->ops.mq.dispatch_request(hctx);

> +		if (!rq) {

> +			blk_mq_put_dispatch_budget(hctx, true);

> +			break;

> +		} else if (ret != BLK_STS_OK) {

> +			blk_mq_end_request(rq, ret);

> +			continue;

> +		}

>  		list_add(&rq->queuelist, &rq_list);

> -	} while (blk_mq_dispatch_rq_list(q, &rq_list));

> +	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));

> +

> +	return false;

>  }


This means that the request in rq_list becomes the owner of the budget allocated
by blk_mq_get_dispatch_budget(). Shouldn't that be mentioned as a comment above
blk_mq_dispatch_rq_list()?

> +	if (run_queue) {

> +		if (!blk_mq_sched_needs_restart(hctx) &&

> +		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {

> +			blk_mq_sched_mark_restart_hctx(hctx);

> +			blk_mq_run_hw_queue(hctx, true);

> +		}

>  	}

>  }


The above if-statement can be changed from a nested if into a single
if-statement.
 
Additionally, why has the code been added to blk_mq_sched_dispatch_requests()
that reruns the queue if blk_mq_get_dispatch_budget() returned BLK_STS_RESOURCE?
Is that code necessary or can it be left out?

> +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx,

> +		bool got_budget)

> +{

> +	struct request_queue *q = hctx->queue;

> +

> +	if (q->mq_ops->put_budget && got_budget)

> +		q->mq_ops->put_budget(hctx);

> +}


So the above function is passed a boolean as second argument and all what
that boolean is used for is to decide whether or not the function is executed?
Sorry but I think that's wrong and that the second argument should be removed
and that it should be evaluated by the caller instead of inside
blk_mq_put_dispatch_budget().

Bart.
Ming Lei Oct. 14, 2017, 7:34 a.m. UTC | #2
On Fri, Oct 13, 2017 at 11:43:44PM +0000, Bart Van Assche wrote:
> On Sat, 2017-10-14 at 02:05 +0800, Ming Lei wrote:
> > @@ -89,19 +89,36 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
> >  	return false;
> >  }
> >  
> > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
> 
> Shouldn't the meaning of the return value of this function be documented?

OK, will do it in V10.

> 
> >  {
> >  	struct request_queue *q = hctx->queue;
> >  	struct elevator_queue *e = q->elevator;
> >  	LIST_HEAD(rq_list);
> >  
> >  	do {
> > -		struct request *rq = e->type->ops.mq.dispatch_request(hctx);
> > +		struct request *rq;
> > +		blk_status_t ret;
> >  
> > -		if (!rq)
> > +		if (e->type->ops.mq.has_work &&
> > +				!e->type->ops.mq.has_work(hctx))
> >  			break;
> > +
> > +		ret = blk_mq_get_dispatch_budget(hctx);
> > +		if (ret == BLK_STS_RESOURCE)
> > +			return true;
> > +
> > +		rq = e->type->ops.mq.dispatch_request(hctx);
> > +		if (!rq) {
> > +			blk_mq_put_dispatch_budget(hctx, true);
> > +			break;
> > +		} else if (ret != BLK_STS_OK) {
> > +			blk_mq_end_request(rq, ret);
> > +			continue;
> > +		}
> >  		list_add(&rq->queuelist, &rq_list);
> > -	} while (blk_mq_dispatch_rq_list(q, &rq_list));
> > +	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
> > +
> > +	return false;
> >  }
> 
> This means that the request in rq_list becomes the owner of the budget allocated
> by blk_mq_get_dispatch_budget(). Shouldn't that be mentioned as a comment above
> blk_mq_dispatch_rq_list()?

OK.

> 
> > +	if (run_queue) {
> > +		if (!blk_mq_sched_needs_restart(hctx) &&
> > +		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {
> > +			blk_mq_sched_mark_restart_hctx(hctx);
> > +			blk_mq_run_hw_queue(hctx, true);
> > +		}
> >  	}
> >  }
> 
> The above if-statement can be changed from a nested if into a single
> if-statement.

OK

>  
> Additionally, why has the code been added to blk_mq_sched_dispatch_requests()
> that reruns the queue if blk_mq_get_dispatch_budget() returned BLK_STS_RESOURCE?

That is because it is the place where we get return of BLK_STS_RESOURCE,
and it can be moved to __blk_mq_run_hw_queue(), but looks no big
difference.

> Is that code necessary or can it be left out?

It is necessary.

Without dispatch budget obtained, we don't dequeue now, and actually
do nothing, so we have to rerun the hw queue.

> 
> > +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx,
> > +		bool got_budget)
> > +{
> > +	struct request_queue *q = hctx->queue;
> > +
> > +	if (q->mq_ops->put_budget && got_budget)
> > +		q->mq_ops->put_budget(hctx);
> > +}
> 
> So the above function is passed a boolean as second argument and all what
> that boolean is used for is to decide whether or not the function is executed?
> Sorry but I think that's wrong and that the second argument should be removed
> and that it should be evaluated by the caller instead of inside
> blk_mq_put_dispatch_budget().

This way makes code clean. And not a big deal, I may change back to the
way you suggested.
diff mbox

Patch

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index be29ba849408..6bb92bbc3135 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,19 +89,36 @@  static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 	return false;
 }
 
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
 
 	do {
-		struct request *rq = e->type->ops.mq.dispatch_request(hctx);
+		struct request *rq;
+		blk_status_t ret;
 
-		if (!rq)
+		if (e->type->ops.mq.has_work &&
+				!e->type->ops.mq.has_work(hctx))
 			break;
+
+		ret = blk_mq_get_dispatch_budget(hctx);
+		if (ret == BLK_STS_RESOURCE)
+			return true;
+
+		rq = e->type->ops.mq.dispatch_request(hctx);
+		if (!rq) {
+			blk_mq_put_dispatch_budget(hctx, true);
+			break;
+		} else if (ret != BLK_STS_OK) {
+			blk_mq_end_request(rq, ret);
+			continue;
+		}
 		list_add(&rq->queuelist, &rq_list);
-	} while (blk_mq_dispatch_rq_list(q, &rq_list));
+	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	return false;
 }
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -110,6 +127,7 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
 	LIST_HEAD(rq_list);
+	bool run_queue = false;
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
 	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -143,13 +161,22 @@  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch)
-			blk_mq_do_dispatch_sched(hctx);
+		if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
+				has_sched_dispatch)
+			run_queue = blk_mq_do_dispatch_sched(hctx);
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		run_queue = blk_mq_do_dispatch_sched(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list);
+		blk_mq_dispatch_rq_list(q, &rq_list, false);
+	}
+
+	if (run_queue) {
+		if (!blk_mq_sched_needs_restart(hctx) &&
+		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) {
+			blk_mq_sched_mark_restart_hctx(hctx);
+			blk_mq_run_hw_queue(hctx, true);
+		}
 	}
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 40cba1b1978f..feb89f1fcc63 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1048,7 +1048,8 @@  static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
 	return true;
 }
 
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
+		bool got_budget)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq;
@@ -1057,6 +1058,8 @@  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 	if (list_empty(list))
 		return false;
 
+	WARN_ON(!list_is_singular(list) && got_budget);
+
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
@@ -1074,16 +1077,28 @@  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 			 * The initial allocation attempt failed, so we need to
 			 * rerun the hardware queue when a tag is freed.
 			 */
-			if (!blk_mq_dispatch_wait_add(hctx))
+			if (!blk_mq_dispatch_wait_add(hctx)) {
+				blk_mq_put_dispatch_budget(hctx, got_budget);
 				break;
+			}
 
 			/*
 			 * It's possible that a tag was freed in the window
 			 * between the allocation failure and adding the
 			 * hardware queue to the wait queue.
 			 */
-			if (!blk_mq_get_driver_tag(rq, &hctx, false))
+			if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+				blk_mq_put_dispatch_budget(hctx, got_budget);
+				break;
+			}
+		}
+
+		if (!got_budget) {
+			ret = blk_mq_get_dispatch_budget(hctx);
+			if (ret == BLK_STS_RESOURCE)
 				break;
+			if (ret != BLK_STS_OK)
+				goto fail_rq;
 		}
 
 		list_del_init(&rq->queuelist);
@@ -1111,6 +1126,7 @@  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 			break;
 		}
 
+ fail_rq:
 		if (unlikely(ret != BLK_STS_OK)) {
 			errors++;
 			blk_mq_end_request(rq, BLK_STS_IOERR);
@@ -1582,6 +1598,13 @@  static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if (!blk_mq_get_driver_tag(rq, NULL, false))
 		goto insert;
 
+	ret = blk_mq_get_dispatch_budget(hctx);
+	if (ret == BLK_STS_RESOURCE) {
+		blk_mq_put_driver_tag(rq);
+		goto insert;
+	} else if (ret != BLK_STS_OK)
+		goto fail_rq;
+
 	new_cookie = request_to_qc_t(hctx, rq);
 
 	/*
@@ -1598,6 +1621,7 @@  static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		__blk_mq_requeue_request(rq);
 		goto insert;
 	default:
+ fail_rq:
 		*cookie = BLK_QC_T_NONE;
 		blk_mq_end_request(rq, ret);
 		return;
@@ -2582,6 +2606,9 @@  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (!set->ops->queue_rq)
 		return -EINVAL;
 
+	if (!set->ops->get_budget ^ !set->ops->put_budget)
+		return -EINVAL;
+
 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
 		pr_info("blk-mq: reduced tag depth to %u\n",
 			BLK_MQ_MAX_DEPTH);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ef15b3414da5..fab566ce0bbf 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -30,7 +30,7 @@  void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
@@ -137,4 +137,23 @@  static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
 			unsigned int inflight[2]);
 
+static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx,
+		bool got_budget)
+{
+	struct request_queue *q = hctx->queue;
+
+	if (q->mq_ops->put_budget && got_budget)
+		q->mq_ops->put_budget(hctx);
+}
+
+static inline blk_status_t blk_mq_get_dispatch_budget(
+		struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+
+	if (q->mq_ops->get_budget)
+		return q->mq_ops->get_budget(hctx);
+	return BLK_STS_OK;
+}
+
 #endif
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 50c6485cb04f..901457df3d64 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -90,6 +90,8 @@  struct blk_mq_queue_data {
 
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
+typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *);
+typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
@@ -112,6 +114,15 @@  struct blk_mq_ops {
 	queue_rq_fn		*queue_rq;
 
 	/*
+	 * Reserve budget before queue request, once .queue_rq is
+	 * run, it is driver's responsibility to release the
+	 * reserved budget. Also we have to handle failure case
+	 * of .get_budget for avoiding I/O deadlock.
+	 */
+	get_budget_fn		*get_budget;
+	put_budget_fn		*put_budget;
+
+	/*
 	 * Called on request timeout
 	 */
 	timeout_fn		*timeout;