
[4/5] blk-mq: introduce blk_get_request_notify

Message ID 20180122033550.27855-5-ming.lei@redhat.com (mailing list archive)
State Deferred, archived
Delegated to: Mike Snitzer
Headers show

Commit Message

Ming Lei Jan. 22, 2018, 3:35 a.m. UTC
DM-MPATH needs to allocate requests from the underlying queue, but when the
allocation fails, there is no way for the underlying queue's RESTART
mechanism to restart DM's queue.

This patch introduces blk_get_request_notify() for this purpose. The
caller needs to pass a 'wait_queue_entry_t' to this function and make
sure it is properly initialized, so that after the current allocation
fails, DM gets notified once a request becomes available from the
underlying queue.

This approach was suggested by Jens and has been used in the blk-mq
dispatch path for a while; see blk_mq_mark_tag_wait().

Suggested-by: Jens Axboe <axboe@kernel.dk>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Laurence Oberman <loberman@redhat.com>
Cc: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-tag.c     | 27 ++++++++++++++++++++++++++-
 block/blk-mq.c         | 24 +++++++++++++++++++++---
 block/blk-mq.h         |  1 +
 include/linux/blk-mq.h |  5 +++++
 4 files changed, 53 insertions(+), 4 deletions(-)
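
For context, here is a minimal caller-side sketch of the interface proposed by
this series. It is only an illustration: the wake callback, the helper names
and the dm-side plumbing hinted at in the comments are made up; only
blk_mq_alloc_request_notify(), BLK_MQ_REQ_NOWAIT and BLK_MQ_REQ_ALLOC_NOTIFY
come from this patch.

#include <linux/blk-mq.h>
#include <linux/wait.h>

/* Hypothetical wake callback: a real user (e.g. dm-mpath) would kick its
 * own queue here, for instance via blk_mq_run_hw_queues(). */
static int clone_alloc_wake(wait_queue_entry_t *wait, unsigned mode,
			    int flags, void *key)
{
	list_del_init(&wait->entry);
	/* ...rerun the upper queue here... */
	return 1;
}

/* Hypothetical helper showing how the wait entry is prepared and passed. */
static struct request *clone_alloc(struct request_queue *underlying_q,
				   wait_queue_entry_t *wait)
{
	struct request *clone;

	/* The entry must be initialized up front: blk_mq_get_tag() only
	 * queues it when list_empty(&wait->entry) is true. */
	init_waitqueue_func_entry(wait, clone_alloc_wake);
	INIT_LIST_HEAD(&wait->entry);

	clone = blk_mq_alloc_request_notify(underlying_q, REQ_OP_READ,
					    BLK_MQ_REQ_NOWAIT |
					    BLK_MQ_REQ_ALLOC_NOTIFY, wait);
	if (IS_ERR(clone))
		return NULL;	/* wait entry is now armed on the tag wait queue */

	return clone;
}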

Comments

Ming Lei Jan. 22, 2018, 10:19 a.m. UTC | #1
On Mon, Jan 22, 2018 at 11:35:49AM +0800, Ming Lei wrote:
> DM-MPATH needs to allocate requests from the underlying queue, but when the
> allocation fails, there is no way for the underlying queue's RESTART
> mechanism to restart DM's queue.
> 
> This patch introduces blk_get_request_notify() for this purpose. The
> caller needs to pass a 'wait_queue_entry_t' to this function and make
> sure it is properly initialized, so that after the current allocation
> fails, DM gets notified once a request becomes available from the
> underlying queue.
> 
> This approach was suggested by Jens and has been used in the blk-mq
> dispatch path for a while; see blk_mq_mark_tag_wait().
> 
> Suggested-by: Jens Axboe <axboe@kernel.dk>
> Cc: Mike Snitzer <snitzer@redhat.com>
> Cc: Laurence Oberman <loberman@redhat.com>
> Cc: Bart Van Assche <bart.vanassche@sandisk.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-tag.c     | 27 ++++++++++++++++++++++++++-
>  block/blk-mq.c         | 24 +++++++++++++++++++++---
>  block/blk-mq.h         |  1 +
>  include/linux/blk-mq.h |  5 +++++
>  4 files changed, 53 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 336dde07b230..911fc9bd1bab 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -128,10 +128,35 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>  	if (tag != -1)
>  		goto found_tag;
>  
> -	if (data->flags & BLK_MQ_REQ_NOWAIT)
> +	if ((data->flags & BLK_MQ_REQ_NOWAIT) && !(data->notifier &&
> +			(data->flags & BLK_MQ_REQ_ALLOC_NOTIFY)))
>  		return BLK_MQ_TAG_FAIL;
>  
>  	ws = bt_wait_ptr(bt, data->hctx);
> +
> +	/*
> +	 * If caller requires notification when tag is available, add
> +	 * wait entry of 'data->notifier' to the wait queue.
> +	 */
> +	if (data->flags & BLK_MQ_REQ_NOWAIT) {
> +		bool added = false;
> +
> +		spin_lock_irq(&ws->wait.lock);
> +		if (list_empty(&data->notifier->entry))
> +			__add_wait_queue(&ws->wait, data->notifier);
> +		else
> +			added = true;
> +		spin_unlock_irq(&ws->wait.lock);

The above needs a per-notifier lock too, since the same notifier may
be added to different wait queues at the same time.
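
One possible shape for that fix, sketched purely for illustration (the
'struct alloc_notifier' wrapper, its fields and the helper name are
hypothetical and not part of this patch; 'ws' is the sbq_wait_state returned
by bt_wait_ptr(), as in the hunk above):

#include <linux/sbitmap.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/* Hypothetical per-notifier lock; not part of this patch. */
struct alloc_notifier {
	spinlock_t		lock;	/* serializes arming the same entry */
	wait_queue_entry_t	wait;
};

static bool alloc_notifier_arm(struct sbq_wait_state *ws,
			       struct alloc_notifier *nf)
{
	bool added = false;

	spin_lock_irq(&ws->wait.lock);
	spin_lock(&nf->lock);
	/* Two hctxs may try to arm the same notifier concurrently; the
	 * per-notifier lock makes the list_empty() check and the add
	 * atomic with respect to each other. */
	if (list_empty(&nf->wait.entry))
		__add_wait_queue(&ws->wait, &nf->wait);
	else
		added = true;
	spin_unlock(&nf->lock);
	spin_unlock_irq(&ws->wait.lock);

	return added;
}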
Bart Van Assche Jan. 22, 2018, 5:13 p.m. UTC | #2
On Mon, 2018-01-22 at 11:35 +0800, Ming Lei wrote:
> DM-MPATH needs to allocate requests from the underlying queue, but when the
> allocation fails, there is no way for the underlying queue's RESTART
> mechanism to restart DM's queue.
> 
> This patch introduces blk_get_request_notify() for this purpose. The
> caller needs to pass a 'wait_queue_entry_t' to this function and make
> sure it is properly initialized, so that after the current allocation
> fails, DM gets notified once a request becomes available from the
> underlying queue.

Please mention that this is only a partial solution because the case when
e.g. blk_insert_cloned_request() returns BLK_STS_RESOURCE is not handled.
This could help for drivers that support a very low queue depth (lpfc) but
probably won't be that useful for other drivers.
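
For context, a purely illustrative sketch of the two failure points referred
to here (the helper and its flow are made up and heavily simplified; only the
block-layer calls are real, and the actual dm-rq code differs):

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/wait.h>

/* Illustrative only; clone setup (bios, end_io, etc.) is omitted. */
static blk_status_t dispatch_clone(struct request_queue *under_q,
				   struct request *rq,
				   wait_queue_entry_t *wait)
{
	struct request *clone;

	/* Failure point 1: tag allocation on the underlying queue.
	 * This is what the proposed notifier covers - on failure the
	 * wait entry is armed and the dm queue can be rerun later. */
	clone = blk_mq_alloc_request_notify(under_q, req_op(rq),
					    BLK_MQ_REQ_NOWAIT |
					    BLK_MQ_REQ_ALLOC_NOTIFY, wait);
	if (IS_ERR(clone))
		return BLK_STS_RESOURCE;

	/* Failure point 2: the dispatch itself.  blk_insert_cloned_request()
	 * can still return BLK_STS_RESOURCE when the driver is out of
	 * resources, and nothing in this patch rearms the notifier then. */
	return blk_insert_cloned_request(under_q, clone);
}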

> +	/*
> +	 * If caller requires notification when tag is available, add
> +	 * wait entry of 'data->notifier' to the wait queue.
> +	 */
> +	if (data->flags & BLK_MQ_REQ_NOWAIT) {
> +		bool added = false;
> +
> +		spin_lock_irq(&ws->wait.lock);
> +		if (list_empty(&data->notifier->entry))
> +			__add_wait_queue(&ws->wait, data->notifier);
> +		else
> +			added = true;
> +		spin_unlock_irq(&ws->wait.lock);
> +
> +		if (added)
> +			return BLK_MQ_TAG_FAIL;
> +
> +		tag = __blk_mq_get_tag(data, bt);
> +		if (tag != -1)
> +			goto found_tag;
> +		return BLK_MQ_TAG_FAIL;
> +	}

Sorry but I don't like this approach. Adding "data->notifier" to the wait
queue creates a link between two request queues, e.g. a dm-mpath queue and
one of the paths that is a member of that dm-mpath queue. This creates the
potential for ugly races between e.g. "data->notifier" being triggered and
removal of the dm-mpath queue.

> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 88c558f71819..bec2f675f8f1 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -160,6 +160,7 @@ struct blk_mq_alloc_data {
>  	struct request_queue *q;
>  	blk_mq_req_flags_t flags;
>  	unsigned int shallow_depth;
> +	wait_queue_entry_t *notifier;

If others would agree with the approach of this patch please use another name
than "notifier". In the context of the Linux kernel a notifier is an instance
of struct notifier_block. The above "notifier" member is not a notifier but a
wait queue entry.

Thanks,

Bart.

Ming Lei Jan. 23, 2018, 1:29 a.m. UTC | #3
On Mon, Jan 22, 2018 at 05:13:03PM +0000, Bart Van Assche wrote:
> On Mon, 2018-01-22 at 11:35 +0800, Ming Lei wrote:
> > DM-MPATH needs to allocate requests from the underlying queue, but when the
> > allocation fails, there is no way for the underlying queue's RESTART
> > mechanism to restart DM's queue.
> > 
> > This patch introduces blk_get_request_notify() for this purpose. The
> > caller needs to pass a 'wait_queue_entry_t' to this function and make
> > sure it is properly initialized, so that after the current allocation
> > fails, DM gets notified once a request becomes available from the
> > underlying queue.
> 
> Please mention that this is only a partial solution because the case when
> e.g. blk_insert_cloned_request() returns BLK_STS_RESOURCE is not handled.

I don't think it is necessary to mention that in the commit log, because
one patch won't deal with everything.

But there will definitely be a patch that deals with that in V2;
otherwise the performance issue can't be solved completely.

> This could help for drivers that support a very low queue depth (lpfc) but
> probably won't be that useful for other drivers.

So far this is a dm-mpath-specific performance issue in one specific
situation: I/O is submitted to the dm-mpath device and the underlying
queue at the same time, and the underlying queue depth is very low,
such as 1.

> 
> > +	/*
> > +	 * If caller requires notification when tag is available, add
> > +	 * wait entry of 'data->notifier' to the wait queue.
> > +	 */
> > +	if (data->flags & BLK_MQ_REQ_NOWAIT) {
> > +		bool added = false;
> > +
> > +		spin_lock_irq(&ws->wait.lock);
> > +		if (list_empty(&data->notifier->entry))
> > +			__add_wait_queue(&ws->wait, data->notifier);
> > +		else
> > +			added = true;
> > +		spin_unlock_irq(&ws->wait.lock);
> > +
> > +		if (added)
> > +			return BLK_MQ_TAG_FAIL;
> > +
> > +		tag = __blk_mq_get_tag(data, bt);
> > +		if (tag != -1)
> > +			goto found_tag;
> > +		return BLK_MQ_TAG_FAIL;
> > +	}
> 
> Sorry but I don't like this approach. Adding "data->notifier" to the wait
> queue creates a link between two request queues, e.g. a dm-mpath queue and
> one of the paths that is a member of that dm-mpath queue. This creates the
> potential for ugly races between e.g. "data->notifier" being triggered and
> removal of the dm-mpath queue.

I thought there was no such race because the dm request is still in the
dispatch list before the 'notifier' is handled. Now it looks like this isn't
true, but the issue can be dealt with easily by calling blk_queue_enter_live()
before the allocation and releasing the references once the notifier is
triggered.

Anyway, this interface needs to be well documented.
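
Roughly, that could look like the sketch below. It is illustrative only:
blk_queue_enter_live() and blk_queue_exit() are block-layer internals declared
in block/blk.h, so this assumes they would be made available to the caller in
some form, and it ignores error handling as well as the corner case where the
tag retry succeeds after the wait entry was already armed.

#include <linux/blk-mq.h>

/* Hypothetical sketch of pinning the underlying queue while the wait
 * entry is armed; not part of this patch.  blk_queue_enter_live() and
 * blk_queue_exit() come from block/blk.h. */
static struct request *clone_alloc_pinned(struct request_queue *q,
					  wait_queue_entry_t *wait)
{
	struct request *clone;

	/* Extra usage reference so the underlying queue cannot go away
	 * while our wait entry sits on its tag wait queue. */
	blk_queue_enter_live(q);

	clone = blk_mq_alloc_request_notify(q, REQ_OP_READ,
					    BLK_MQ_REQ_NOWAIT |
					    BLK_MQ_REQ_ALLOC_NOTIFY, wait);
	if (!IS_ERR(clone)) {
		blk_queue_exit(q);	/* allocation succeeded, drop it */
		return clone;
	}

	/*
	 * Allocation failed and the wait entry is armed: keep the
	 * reference and call blk_queue_exit() once the notifier is
	 * triggered (or the entry is removed).
	 */
	return NULL;
}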

> 
> > diff --git a/block/blk-mq.h b/block/blk-mq.h
> > index 88c558f71819..bec2f675f8f1 100644
> > --- a/block/blk-mq.h
> > +++ b/block/blk-mq.h
> > @@ -160,6 +160,7 @@ struct blk_mq_alloc_data {
> >  	struct request_queue *q;
> >  	blk_mq_req_flags_t flags;
> >  	unsigned int shallow_depth;
> > +	wait_queue_entry_t *notifier;
> 
> If others would agree with the approach of this patch please use another name
> than "notifier". In the context of the Linux kernel a notifier is an instance
> of struct notifier_block. The above "notifier" member is not a notifier but a
> wait queue entry.

OK, I will change it to 'wait', similar to 'dispatch_wait'.

Patch

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 336dde07b230..911fc9bd1bab 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -128,10 +128,35 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	if (tag != -1)
 		goto found_tag;
 
-	if (data->flags & BLK_MQ_REQ_NOWAIT)
+	if ((data->flags & BLK_MQ_REQ_NOWAIT) && !(data->notifier &&
+			(data->flags & BLK_MQ_REQ_ALLOC_NOTIFY)))
 		return BLK_MQ_TAG_FAIL;
 
 	ws = bt_wait_ptr(bt, data->hctx);
+
+	/*
+	 * If caller requires notification when tag is available, add
+	 * wait entry of 'data->notifier' to the wait queue.
+	 */
+	if (data->flags & BLK_MQ_REQ_NOWAIT) {
+		bool added = false;
+
+		spin_lock_irq(&ws->wait.lock);
+		if (list_empty(&data->notifier->entry))
+			__add_wait_queue(&ws->wait, data->notifier);
+		else
+			added = true;
+		spin_unlock_irq(&ws->wait.lock);
+
+		if (added)
+			return BLK_MQ_TAG_FAIL;
+
+		tag = __blk_mq_get_tag(data, bt);
+		if (tag != -1)
+			goto found_tag;
+		return BLK_MQ_TAG_FAIL;
+	}
+
 	drop_ctx = data->ctx == NULL;
 	do {
 		/*
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e91d688792a9..a71e44f17e7f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -384,10 +384,14 @@  static struct request *blk_mq_get_request(struct request_queue *q,
 	return rq;
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
-		blk_mq_req_flags_t flags)
+static struct request *__blk_mq_alloc_request(struct request_queue *q,
+		unsigned int op, blk_mq_req_flags_t flags,
+		wait_queue_entry_t *notifier)
 {
-	struct blk_mq_alloc_data alloc_data = { .flags = flags };
+	struct blk_mq_alloc_data alloc_data = {
+		.flags = flags,
+		.notifier = notifier,
+	};
 	struct request *rq;
 	int ret;
 
@@ -408,8 +412,22 @@  struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 	rq->bio = rq->biotail = NULL;
 	return rq;
 }
+
+struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
+		blk_mq_req_flags_t flags)
+{
+	return __blk_mq_alloc_request(q, op, flags, NULL);
+}
 EXPORT_SYMBOL(blk_mq_alloc_request);
 
+struct request *blk_mq_alloc_request_notify(struct request_queue *q,
+		unsigned int op, blk_mq_req_flags_t flags,
+		wait_queue_entry_t *notifier)
+{
+	return __blk_mq_alloc_request(q, op, flags, notifier);
+}
+EXPORT_SYMBOL(blk_mq_alloc_request_notify);
+
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 88c558f71819..bec2f675f8f1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -160,6 +160,7 @@  struct blk_mq_alloc_data {
 	struct request_queue *q;
 	blk_mq_req_flags_t flags;
 	unsigned int shallow_depth;
+	wait_queue_entry_t *notifier;
 
 	/* input & output parameter */
 	struct blk_mq_ctx *ctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8efcf49796a3..335c7dace5a7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -219,10 +219,15 @@  enum {
 	BLK_MQ_REQ_INTERNAL	= (__force blk_mq_req_flags_t)(1 << 2),
 	/* set RQF_PREEMPT */
 	BLK_MQ_REQ_PREEMPT	= (__force blk_mq_req_flags_t)(1 << 3),
+	/* notify when new rq is available */
+	BLK_MQ_REQ_ALLOC_NOTIFY	= (__force blk_mq_req_flags_t)(1 << 4),
 };
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		blk_mq_req_flags_t flags);
+struct request *blk_mq_alloc_request_notify(struct request_queue *q,
+		unsigned int op, blk_mq_req_flags_t flags,
+		wait_queue_entry_t *notifier);
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		unsigned int op, blk_mq_req_flags_t flags,
 		unsigned int hctx_idx);