
[v2,1/2] blk-mq: add tagset quiesce interface

Message ID 20221013094450.5947-2-lengchao@huawei.com (mailing list archive)
State New, archived
Series: improve nvme quiesce time for large amount of namespaces

Commit Message

Chao Leng Oct. 13, 2022, 9:44 a.m. UTC
Drivers that have shared tagsets may need to quiesce a potentially large
number of request queues that all share a single tagset (e.g. nvme). Add an
interface to quiesce all the queues on a given tagset. This interface is
useful because it can speed up the quiesce by doing it in parallel.

For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu on all queues
in parallel such that all of them wait for the same SRCU grace period with
a per-queue heap-allocated rcu_synchronize. For tagsets that don't have
BLK_MQ_F_BLOCKING set, a single synchronize_rcu is sufficient.

Some queues never need to be quiesced (e.g. the nvme connect_q), so
introduce QUEUE_FLAG_NOQUIESCED and have the tagset quiesce interface
skip such queues.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Chao Leng <lengchao@huawei.com>
---
 block/blk-mq.c         | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  2 ++
 include/linux/blkdev.h |  2 ++
 3 files changed, 79 insertions(+)
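
A minimal usage sketch, not part of the posted patch: instead of looping over
its request queues and waiting for each quiesce individually, a driver with a
shared tagset would use a single call pair. The nvme names in the comment are
only illustrative.

	#include <linux/blk-mq.h>

	/*
	 * Hypothetical caller (sketch).  Instead of something like
	 *
	 *	list_for_each_entry(ns, &ctrl->namespaces, list)
	 *		blk_mq_quiesce_queue(ns->queue);
	 *
	 * a driver whose queues share one tag_set can do a single
	 * tagset-wide call, paying for only one grace-period wait.
	 */
	static void quiesce_whole_tagset(struct blk_mq_tag_set *set)
	{
		blk_mq_quiesce_tagset(set);
	}

	static void unquiesce_whole_tagset(struct blk_mq_tag_set *set)
	{
		blk_mq_unquiesce_tagset(set);
	}
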

Comments

Sagi Grimberg Oct. 13, 2022, 10:28 a.m. UTC | #1
On 10/13/22 12:44, Chao Leng wrote:
> Drivers that have shared tagsets may need to quiesce potentially a lot
> of request queues that all share a single tagset (e.g. nvme). Add an interface
> to quiesce all the queues on a given tagset. This interface is useful because
> it can speedup the quiesce by doing it in parallel.
> 
> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
> in parallel such that all of them wait for the same rcu elapsed period with
> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
> sufficient.
> 
> Because some queues never need to be quiesced(e.g. nvme connect_q).
> So introduce QUEUE_FLAG_NOQUIESCED to tagset quiesce interface to
> skip the queue.

I wouldn't say it is never quiesced, nor that it ever will be; we just don't
happen to quiesce it today...

> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> Signed-off-by: Chao Leng <lengchao@huawei.com>
> ---
>   block/blk-mq.c         | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   include/linux/blk-mq.h |  2 ++
>   include/linux/blkdev.h |  2 ++
>   3 files changed, 79 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 8070b6c10e8d..ebe25da08156 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -29,6 +29,7 @@
>   #include <linux/prefetch.h>
>   #include <linux/blk-crypto.h>
>   #include <linux/part_stat.h>
> +#include <linux/rcupdate_wait.h>
>   
>   #include <trace/events/block.h>
>   
> @@ -311,6 +312,80 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
>   
> +static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
> +{
> +	int i = 0;
> +	int count = 0;
> +	struct request_queue *q;
> +	struct rcu_synchronize *rcu;
> +
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +		if (blk_queue_noquiesced(q))
> +			continue;
> +
> +		blk_mq_quiesce_queue_nowait(q);
> +		count++;
> +	}
> +
> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> +	if (rcu) {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +			if (blk_queue_noquiesced(q))
> +				continue;
> +
> +			init_rcu_head(&rcu[i].head);
> +			init_completion(&rcu[i].completion);
> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
> +			i++;
> +		}
> +
> +		for (i = 0; i < count; i++) {
> +			wait_for_completion(&rcu[i].completion);
> +			destroy_rcu_head(&rcu[i].head);
> +		}
> +		kvfree(rcu);
> +	} else {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			synchronize_srcu(q->srcu);
> +	}
> +}
> +
> +static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set)
> +{
> +	struct request_queue *q;
> +
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +		if (blk_queue_noquiesced(q))
> +			continue;
> +
> +		blk_mq_quiesce_queue_nowait(q);
> +	}
> +	synchronize_rcu();
> +}
> +
> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
> +{
> +	mutex_lock(&set->tag_list_lock);
> +	if (set->flags & BLK_MQ_F_BLOCKING)
> +		blk_mq_quiesce_blocking_tagset(set);
> +	else
> +		blk_mq_quiesce_nonblocking_tagset(set);
> +
> +	mutex_unlock(&set->tag_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
> +
> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
> +{
> +	struct request_queue *q;
> +
> +	mutex_lock(&set->tag_list_lock);
> +	list_for_each_entry(q, &set->tag_list, tag_set_list)
> +		blk_mq_unquiesce_queue(q);
> +	mutex_unlock(&set->tag_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
> +
>   void blk_mq_wake_waiters(struct request_queue *q)
>   {
>   	struct blk_mq_hw_ctx *hctx;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index ba18e9bdb799..1df47606d0a7 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -877,6 +877,8 @@ void blk_mq_start_hw_queues(struct request_queue *q);
>   void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
>   void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
>   void blk_mq_quiesce_queue(struct request_queue *q);
> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
>   void blk_mq_wait_quiesce_done(struct request_queue *q);
>   void blk_mq_unquiesce_queue(struct request_queue *q);
>   void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 50e358a19d98..f15544299a67 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -579,6 +579,7 @@ struct request_queue {
>   #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
>   #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
>   #define QUEUE_FLAG_SQ_SCHED     30	/* single queue style io dispatch */
> +#define QUEUE_FLAG_NOQUIESCED	31	/* queue is never quiesced */

the comment is misleading. If this truly is a queue that is never
quiesced, then blk_mq_quiesce_queue() and friends need to skip it too.

I'd call it self_quiesce or something that would reflect that it is
black-listed from tagset-wide quiesce.
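
For reference, the opt-out in this version is a per-queue flag; a driver would
exclude a queue from tagset-wide quiesce roughly like the sketch below. It uses
the v2 flag name, and the nvme call site shown is hypothetical.

	static void nvme_exclude_connect_q_sketch(struct nvme_ctrl *ctrl)
	{
		/*
		 * connect_q only carries fabrics connect commands and is not
		 * quiesced today, so leave it out of tagset-wide quiesce.
		 */
		blk_queue_flag_set(QUEUE_FLAG_NOQUIESCED, ctrl->connect_q);
	}
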
Chao Leng Oct. 14, 2022, 2:09 a.m. UTC | #2
On 2022/10/13 18:28, Sagi Grimberg wrote:
> 
> 
> On 10/13/22 12:44, Chao Leng wrote:
>> Drivers that have shared tagsets may need to quiesce potentially a lot
>> of request queues that all share a single tagset (e.g. nvme). Add an interface
>> to quiesce all the queues on a given tagset. This interface is useful because
>> it can speedup the quiesce by doing it in parallel.
>>
>> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
>> in parallel such that all of them wait for the same rcu elapsed period with
>> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
>> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
>> sufficient.
>>
>> Because some queues never need to be quiesced(e.g. nvme connect_q).
>> So introduce QUEUE_FLAG_NOQUIESCED to tagset quiesce interface to
>> skip the queue.
> 
> I wouldn't say it never nor will ever quiesce, we just don't happen to
> quiesce it today...
> 
>>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> Signed-off-by: Chao Leng <lengchao@huawei.com>
>> ---
>>   block/blk-mq.c         | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/blk-mq.h |  2 ++
>>   include/linux/blkdev.h |  2 ++
>>   3 files changed, 79 insertions(+)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 8070b6c10e8d..ebe25da08156 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -29,6 +29,7 @@
>>   #include <linux/prefetch.h>
>>   #include <linux/blk-crypto.h>
>>   #include <linux/part_stat.h>
>> +#include <linux/rcupdate_wait.h>
>>   #include <trace/events/block.h>
>> @@ -311,6 +312,80 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
>>   }
>>   EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
>> +static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
>> +{
>> +    int i = 0;
>> +    int count = 0;
>> +    struct request_queue *q;
>> +    struct rcu_synchronize *rcu;
>> +
>> +    list_for_each_entry(q, &set->tag_list, tag_set_list) {
>> +        if (blk_queue_noquiesced(q))
>> +            continue;
>> +
>> +        blk_mq_quiesce_queue_nowait(q);
>> +        count++;
>> +    }
>> +
>> +    rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>> +    if (rcu) {
>> +        list_for_each_entry(q, &set->tag_list, tag_set_list) {
>> +            if (blk_queue_noquiesced(q))
>> +                continue;
>> +
>> +            init_rcu_head(&rcu[i].head);
>> +            init_completion(&rcu[i].completion);
>> +            call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
>> +            i++;
>> +        }
>> +
>> +        for (i = 0; i < count; i++) {
>> +            wait_for_completion(&rcu[i].completion);
>> +            destroy_rcu_head(&rcu[i].head);
>> +        }
>> +        kvfree(rcu);
>> +    } else {
>> +        list_for_each_entry(q, &set->tag_list, tag_set_list)
>> +            synchronize_srcu(q->srcu);
>> +    }
>> +}
>> +
>> +static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set)
>> +{
>> +    struct request_queue *q;
>> +
>> +    list_for_each_entry(q, &set->tag_list, tag_set_list) {
>> +        if (blk_queue_noquiesced(q))
>> +            continue;
>> +
>> +        blk_mq_quiesce_queue_nowait(q);
>> +    }
>> +    synchronize_rcu();
>> +}
>> +
>> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
>> +{
>> +    mutex_lock(&set->tag_list_lock);
>> +    if (set->flags & BLK_MQ_F_BLOCKING)
>> +        blk_mq_quiesce_blocking_tagset(set);
>> +    else
>> +        blk_mq_quiesce_nonblocking_tagset(set);
>> +
>> +    mutex_unlock(&set->tag_list_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
>> +
>> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
>> +{
>> +    struct request_queue *q;
>> +
>> +    mutex_lock(&set->tag_list_lock);
>> +    list_for_each_entry(q, &set->tag_list, tag_set_list)
>> +        blk_mq_unquiesce_queue(q);
>> +    mutex_unlock(&set->tag_list_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
>> +
>>   void blk_mq_wake_waiters(struct request_queue *q)
>>   {
>>       struct blk_mq_hw_ctx *hctx;
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
>> index ba18e9bdb799..1df47606d0a7 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -877,6 +877,8 @@ void blk_mq_start_hw_queues(struct request_queue *q);
>>   void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
>>   void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
>>   void blk_mq_quiesce_queue(struct request_queue *q);
>> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
>> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
>>   void blk_mq_wait_quiesce_done(struct request_queue *q);
>>   void blk_mq_unquiesce_queue(struct request_queue *q);
>>   void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 50e358a19d98..f15544299a67 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -579,6 +579,7 @@ struct request_queue {
>>   #define QUEUE_FLAG_HCTX_ACTIVE    28    /* at least one blk-mq hctx is active */
>>   #define QUEUE_FLAG_NOWAIT       29    /* device supports NOWAIT */
>>   #define QUEUE_FLAG_SQ_SCHED     30    /* single queue style io dispatch */
>> +#define QUEUE_FLAG_NOQUIESCED    31    /* queue is never quiesced */
> 
> the comment is misleading. If this is truely queue that is never
> quiescing then blk_mq_quiesce_queue() and friends need to skip it.
> 
> I'd call it self_quiesce or something that would reflect that it is
> black-listed from tagset-wide quiesce.
Yes, you are right. I will modify the comment in patch V3.
> .
Christoph Hellwig Oct. 17, 2022, 1:39 p.m. UTC | #3
On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> +	if (rcu) {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +			if (blk_queue_noquiesced(q))
> +				continue;
> +
> +			init_rcu_head(&rcu[i].head);
> +			init_completion(&rcu[i].completion);
> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
> +			i++;
> +		}
> +
> +		for (i = 0; i < count; i++) {
> +			wait_for_completion(&rcu[i].completion);
> +			destroy_rcu_head(&rcu[i].head);
> +		}
> +		kvfree(rcu);
> +	} else {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			synchronize_srcu(q->srcu);
> +	}

Having to allocate a struct rcu_synchronize for each of the potentially
many queues here is a bit sad.

Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
last week, so I wonder if something like that would also be feasible
for SRCU, as that would come in really handy here.
Christoph Hellwig Oct. 17, 2022, 1:42 p.m. UTC | #4
On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
> Having to allocate a struct rcu_synchronize for each of the potentially
> many queues here is a bit sad.
> 
> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS
> last week, so I wonder if something like that would also be feasible
> for SRCU, as that would come in really handy here.

Alternatively I wonder if we could simply use a single srcu_struct
in the tag_set instead of one per-queue.  That would potentially
increase the number of read-side critical sections
blk_mq_wait_quiesce_done would have to wait for in tagsets with
multiple queues, but I wonder how much overhead that would be in
practice.
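
For comparison, a per-tagset srcu_struct would collapse the blocking case to
something like the sketch below. It assumes a new set->srcu member, which this
patch does not add.

	static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
	{
		struct request_queue *q;

		list_for_each_entry(q, &set->tag_list, tag_set_list) {
			if (blk_queue_noquiesced(q))
				continue;
			blk_mq_quiesce_queue_nowait(q);
		}
		/* one SRCU grace period covers dispatch on every queue in the set */
		synchronize_srcu(&set->srcu);
	}
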
Christoph Hellwig Oct. 17, 2022, 1:43 p.m. UTC | #5
On Thu, Oct 13, 2022 at 01:28:37PM +0300, Sagi Grimberg wrote:
>> Because some queues never need to be quiesced(e.g. nvme connect_q).
>> So introduce QUEUE_FLAG_NOQUIESCED to tagset quiesce interface to
>> skip the queue.
>
> I wouldn't say it never nor will ever quiesce, we just don't happen to
> quiesce it today...

Yes.  It really is a QUEUE_FLAG_SKIP_TAGSET_QUIESCE.
Paul E. McKenney Oct. 17, 2022, 3:21 p.m. UTC | #6
On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
> > +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> > +	if (rcu) {
> > +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> > +			if (blk_queue_noquiesced(q))
> > +				continue;
> > +
> > +			init_rcu_head(&rcu[i].head);
> > +			init_completion(&rcu[i].completion);
> > +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
> > +			i++;
> > +		}
> > +
> > +		for (i = 0; i < count; i++) {
> > +			wait_for_completion(&rcu[i].completion);
> > +			destroy_rcu_head(&rcu[i].head);
> > +		}
> > +		kvfree(rcu);
> > +	} else {
> > +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> > +			synchronize_srcu(q->srcu);
> > +	}
> 
> Having to allocate a struct rcu_synchronize for each of the potentially
> many queues here is a bit sad.
> 
> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS
> last week, so I wonder if something like that would also be feasible
> for SRCU, as that would come in really handy here.

There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(),
but there would need to be an unsigned long for each srcu_struct from
which an SRCU grace period was required.  This would be half the size
of the "rcu" array above, but still maybe larger than you would like.

The resulting code might look something like this, with "rcu" now being
a pointer to unsigned long:

	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
	if (rcu) {
		list_for_each_entry(q, &set->tag_list, tag_set_list) {
			if (blk_queue_noquiesced(q))
				continue;
			rcu[i] = start_poll_synchronize_srcu(q->srcu);
			i++;
		}

		for (i = 0; i < count; i++)
			if (!poll_state_synchronize_srcu(q->srcu))
				synchronize_srcu(q->srcu);
		kvfree(rcu);
	} else {
		list_for_each_entry(q, &set->tag_list, tag_set_list)
			synchronize_srcu(q->srcu);
	}

Or as Christoph suggested, just have a single srcu_struct for the
whole group.

The main reason for having multiple srcu_struct structures is to
prevent the readers from one from holding up the updaters from another.
Except that by waiting for the multiple grace periods, you are losing
that property anyway, correct?  Or is this code waiting on only a small
fraction of the srcu_struct structures associated with blk_queue?

							Thanx, Paul
Christoph Hellwig Oct. 17, 2022, 3:31 p.m. UTC | #7
On Mon, Oct 17, 2022 at 08:21:36AM -0700, Paul E. McKenney wrote:
> The main reason for having multiple srcu_struct structures is to
> prevent the readers from one from holding up the updaters from another.
> Except that by waiting for the multiple grace periods, you are losing
> that property anyway, correct?  Or is this code waiting on only a small
> fraction of the srcu_struct structures associated with blk_queue?

There are a few places that do this.  SCSI and MMC could probably switch
to this scheme, or at least an open-coded version of it (if we move
to a per-tagset srcu_struct, open coding it might be a better idea
anyway).

del_gendisk is one where we have to go one queue at a time, and
that might actually matter for a device removal.  elevator_switch
is another one; there is a variant of it that works on the whole
tagset, but also ones that don't.
Paul E. McKenney Oct. 17, 2022, 10:41 p.m. UTC | #8
On Mon, Oct 17, 2022 at 05:31:05PM +0200, Christoph Hellwig wrote:
> On Mon, Oct 17, 2022 at 08:21:36AM -0700, Paul E. McKenney wrote:
> > The main reason for having multiple srcu_struct structures is to
> > prevent the readers from one from holding up the updaters from another.
> > Except that by waiting for the multiple grace periods, you are losing
> > that property anyway, correct?  Or is this code waiting on only a small
> > fraction of the srcu_struct structures associated with blk_queue?
> 
> There are a few places that do this.  SCSI and MMC could probably switch
> to this scheme or at least and open coded version of it (if we move
> to a per_tagsect srcu_struct open coding it might be a better idea
> anyway).
> 
> del_gendisk is one where we have to go one queue at a time, and
> that might actually matter for a device removal.  elevator_switch
> is another one, there is a variant for it that works on the whole
> tagset, but also those that don't.

OK, thank you for the info!

Then the big question is "how long do the SRCU readers run?"

If all of the readers ran for exactly the same duration, there would be
little point in having more than one srcu_struct.

If the kernel knew up front how long the SRCU readers for a given entry
would run, it could provide an srcu_struct structure for each duration.
For a (fanciful) example, you could have one srcu_struct structure for
SSDs, another for rotating rust, a third for LAN-attached storage, and
a fourth for WAN-attached storage.  Maybe a fifth for lunar-based storage.

If it is impossible to know up front which entry has what SRCU read-side
latency characteristics, then the current approach of just giving each
entry its own srcu_struct structure is not at all a bad plan.

Does that help, or am I off in the weeds here?

							Thanx, Paul
Christoph Hellwig Oct. 18, 2022, 5:19 a.m. UTC | #9
On Mon, Oct 17, 2022 at 03:41:15PM -0700, Paul E. McKenney wrote:
> Then the big question is "how long do the SRCU readers run?"
> 
> If all of the readers ran for exactly the same duration, there would be
> little point in having more than one srcu_struct.

The SRCU readers are the I/O dispatch, which will have quite similar
runtimes for the different queues.

> If the kernel knew up front how long the SRCU readers for a given entry
> would run, it could provide an srcu_struct structure for each duration.
> For a (fanciful) example, you could have one srcu_struct structure for
> SSDs, another for rotating rust, a third for LAN-attached storage, and
> a fourth for WAN-attached storage.  Maybe a fifth for lunar-based storage.

All the different request_queues in a tag_set are for the same device.
There might be some corner cases like storage arrays where they have
different latencies.  But we're not even waiting for the I/O completion
here; this just protects the submission.

> Does that help, or am I off in the weeds here?

I think this was very helpful, and at the least moving the srcu_struct
to the tag_set sounds like a good idea to explore.

Ming, anything I might have missed?
Sagi Grimberg Oct. 18, 2022, 8:39 a.m. UTC | #10
>> Having to allocate a struct rcu_synchronize for each of the potentially
>> many queues here is a bit sad.
>>
>> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS
>> last week, so I wonder if something like that would also be feasible
>> for SRCU, as that would come in really handy here.
> 
> Alternatively I wonder if we could simply use a single srcu_struct
> in the tag_set instead of one per-queue.  That would potentially
> increase the number of read side critical sections
> blk_mq_wait_quiesce_done would have to wait for in tagsets with
> multiple queues, but I wonder how much overhead that would be in
> practive.

It used to be on the hctx actually. It was moved to the request_queue
as part of an attempt at parallel quiesce that never fully materialized,
afaict.

For nvme, we never really quiesce a single namespace, so it will be a
net positive IMO. I also think that quiesce is really more of a tagset
operation, so I would think that it's a more natural location.

The only question in my mind is regarding patch 2/2 with the subtle
ordering of nvme_set_queue_dying...
Christoph Hellwig Oct. 18, 2022, 8:55 a.m. UTC | #11
On Tue, Oct 18, 2022 at 11:39:30AM +0300, Sagi Grimberg wrote:
> The only question in my mind is regarding patch 2/2 with the subtle
> ordering of nvme_set_queue_dying...

I think we need to sort that out as a pre-patch.  There is quite some
history there: there was a fair amount of deadlock potential earlier, but
I think I sorted quite a lot of it out, including a new lock for setting
the gendisk size, which is what the comment refers to.

Something like this:

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 059737c1a2c19..cb7623b5d2c8b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -5105,21 +5105,21 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns)
 
 /*
  * Prepare a queue for teardown.
- *
- * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
- * the capacity to 0 after that to avoid blocking dispatchers that may be
- * holding bd_butex.  This will end buffered writers dirtying pages that can't
- * be synced.
  */
 static void nvme_set_queue_dying(struct nvme_ns *ns)
 {
 	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
 		return;
 
+	/*
+	 * Mark the disk dead to prevent new opens, and set the capacity to 0
+	 * to end buffered writers dirtying pages that can't be synced.
+	 */
 	blk_mark_disk_dead(ns->disk);
-	nvme_start_ns_queue(ns);
-
 	set_capacity_and_notify(ns->disk, 0);
+
+	/* forcibly unquiesce queues to avoid blocking dispatch */
+	nvme_start_ns_queue(ns);
 }
 
 /**
Sagi Grimberg Oct. 18, 2022, 9:06 a.m. UTC | #12
>> The only question in my mind is regarding patch 2/2 with the subtle
>> ordering of nvme_set_queue_dying...
> 
> I think we need to sort that out as a pre-patch.  There is quite some
> history there was a fair amount of dead lock potential earlier, but
> I think I sorted quite a lot out there, including a new lock for setting
> the gendisk size, which is what the comment refers to.
> 
> Something like this:
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 059737c1a2c19..cb7623b5d2c8b 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -5105,21 +5105,21 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns)
>   
>   /*
>    * Prepare a queue for teardown.
> - *
> - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
> - * the capacity to 0 after that to avoid blocking dispatchers that may be
> - * holding bd_butex.  This will end buffered writers dirtying pages that can't
> - * be synced.
>    */
>   static void nvme_set_queue_dying(struct nvme_ns *ns)
>   {
>   	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
>   		return;
>   
> +	/*
> +	 * Mark the disk dead to prevent new opens, and set the capacity to 0
> +	 * to end buffered writers dirtying pages that can't be synced.
> +	 */
>   	blk_mark_disk_dead(ns->disk);
> -	nvme_start_ns_queue(ns);
> -
>   	set_capacity_and_notify(ns->disk, 0);
> +
> +	/* forcibly unquiesce queues to avoid blocking dispatch */
> +	nvme_start_ns_queue(ns);
>   }

If we no longer have this ordering requirement, then I don't see why
the unquiesce cannot move to before or after nvme_set_queue_dying, so we
can apply a tagset-wide quiesce/unquiesce...
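
A rough sketch of that direction (hypothetical, not part of this series): mark
every namespace dead first, then do a single tagset-wide unquiesce instead of
unquiescing inside nvme_set_queue_dying().

	static void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
	{
		struct nvme_ns *ns;

		down_read(&ctrl->namespaces_rwsem);
		list_for_each_entry(ns, &ctrl->namespaces, list) {
			/* prevent new opens and end buffered writers */
			blk_mark_disk_dead(ns->disk);
			set_capacity_and_notify(ns->disk, 0);
		}
		up_read(&ctrl->namespaces_rwsem);

		/* forcibly unquiesce all queues in one go to avoid blocking dispatch */
		blk_mq_unquiesce_tagset(ctrl->tagset);
	}
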
Chao Leng Oct. 18, 2022, 9:51 a.m. UTC | #13
On 2022/10/17 21:43, Christoph Hellwig wrote:
> On Thu, Oct 13, 2022 at 01:28:37PM +0300, Sagi Grimberg wrote:
>>> Because some queues never need to be quiesced(e.g. nvme connect_q).
>>> So introduce QUEUE_FLAG_NOQUIESCED to tagset quiesce interface to
>>> skip the queue.
>>
>> I wouldn't say it never nor will ever quiesce, we just don't happen to
>> quiesce it today...
> 
> Yes.  It really is a QUEUE_FLAG_SKIP_TAGSET_QUIESCE.
I will modify it in patch V3.
> 
> 
> .
>
Chao Leng Oct. 18, 2022, 9:52 a.m. UTC | #14
On 2022/10/17 23:21, Paul E. McKenney wrote:
> On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
>> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
>>> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>>> +	if (rcu) {
>>> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
>>> +			if (blk_queue_noquiesced(q))
>>> +				continue;
>>> +
>>> +			init_rcu_head(&rcu[i].head);
>>> +			init_completion(&rcu[i].completion);
>>> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
>>> +			i++;
>>> +		}
>>> +
>>> +		for (i = 0; i < count; i++) {
>>> +			wait_for_completion(&rcu[i].completion);
>>> +			destroy_rcu_head(&rcu[i].head);
>>> +		}
>>> +		kvfree(rcu);
>>> +	} else {
>>> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
>>> +			synchronize_srcu(q->srcu);
>>> +	}
>>
>> Having to allocate a struct rcu_synchronize for each of the potentially
>> many queues here is a bit sad.
>>
>> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS
>> last week, so I wonder if something like that would also be feasible
>> for SRCU, as that would come in really handy here.
> 
> There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(),
> but there would need to be an unsigned long for each srcu_struct from
> which an SRCU grace period was required.  This would be half the size
> of the "rcu" array above, but still maybe larger than you would like.
> 
> The resulting code might look something like this, with "rcu" now being
> a pointer to unsigned long:
> 
> 	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> 	if (rcu) {
> 		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> 			if (blk_queue_noquiesced(q))
> 				continue;
> 			rcu[i] = start_poll_synchronize_srcu(q->srcu);
> 			i++;
> 		}
> 
> 		for (i = 0; i < count; i++)
> 			if (!poll_state_synchronize_srcu(q->srcu))
> 				synchronize_srcu(q->srcu);
synchronize_srcu() will start a new grace period.
Maybe it would be better like this:
			while (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
				schedule_timeout_uninterruptible(1);
> 		kvfree(rcu);
> 	} else {
> 		list_for_each_entry(q, &set->tag_list, tag_set_list)
> 			synchronize_srcu(q->srcu);
> 	}
> 
> Or as Christoph suggested, just have a single srcu_struct for the
> whole group.
> 
> The main reason for having multiple srcu_struct structures is to
> prevent the readers from one from holding up the updaters from another.
> Except that by waiting for the multiple grace periods, you are losing
> that property anyway, correct?  Or is this code waiting on only a small
> fraction of the srcu_struct structures associated with blk_queue?
> 
> 							Thanx, Paul
> .
>
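
Putting the polling suggestion and the fix above together, the blocking path
could look something like this sketch (hypothetical code, not posted as a
patch; the cookie array replaces the rcu_synchronize array):

	static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
	{
		int i = 0;
		int count = 0;
		unsigned long *cookie;
		struct request_queue *q;

		list_for_each_entry(q, &set->tag_list, tag_set_list) {
			if (blk_queue_noquiesced(q))
				continue;
			blk_mq_quiesce_queue_nowait(q);
			count++;
		}

		cookie = kvmalloc_array(count, sizeof(*cookie), GFP_KERNEL);
		if (!cookie) {
			list_for_each_entry(q, &set->tag_list, tag_set_list)
				synchronize_srcu(q->srcu);
			return;
		}

		/* kick off one grace period per queue, without waiting yet */
		list_for_each_entry(q, &set->tag_list, tag_set_list) {
			if (blk_queue_noquiesced(q))
				continue;
			cookie[i++] = start_poll_synchronize_srcu(q->srcu);
		}

		/* now poll each cookie until its grace period has elapsed */
		i = 0;
		list_for_each_entry(q, &set->tag_list, tag_set_list) {
			if (blk_queue_noquiesced(q))
				continue;
			while (!poll_state_synchronize_srcu(q->srcu, cookie[i]))
				schedule_timeout_uninterruptible(1);
			i++;
		}
		kvfree(cookie);
	}
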
Chao Leng Oct. 18, 2022, 9:52 a.m. UTC | #15
On 2022/10/17 21:39, Christoph Hellwig wrote:
> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
>> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>> +	if (rcu) {
>> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
>> +			if (blk_queue_noquiesced(q))
>> +				continue;
>> +
>> +			init_rcu_head(&rcu[i].head);
>> +			init_completion(&rcu[i].completion);
>> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
>> +			i++;
>> +		}
>> +
>> +		for (i = 0; i < count; i++) {
>> +			wait_for_completion(&rcu[i].completion);
>> +			destroy_rcu_head(&rcu[i].head);
>> +		}
>> +		kvfree(rcu);
>> +	} else {
>> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
>> +			synchronize_srcu(q->srcu);
>> +	}
> 
> Having to allocate a struct rcu_synchronize for each of the potentially
> many queues here is a bit sad.
> 
> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS
> last week, so I wonder if something like that would also be feasible
> for SRCU, as that would come in really handy here.
Using start_poll_synchronize_rcu would save some memory.
This is a good idea.
> 
> .
>
Chao Leng Oct. 18, 2022, 9:52 a.m. UTC | #16
On 2022/10/17 21:42, Christoph Hellwig wrote:
> On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
>> Having to allocate a struct rcu_synchronize for each of the potentially
>> many queues here is a bit sad.
>>
>> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS
>> last week, so I wonder if something like that would also be feasible
>> for SRCU, as that would come in really handy here.
> 
> Alternatively I wonder if we could simply use a single srcu_struct
> in the tag_set instead of one per-queue.  That would potentially
> increase the number of read side critical sections
> blk_mq_wait_quiesce_done would have to wait for in tagsets with
> multiple queues, but I wonder how much overhead that would be in
> practive.
Right now we submit requests based on queues, so it may be difficult to use
a single srcu_struct in the tag_set instead of one per queue.
Using start_poll_synchronize_rcu would still need an unsigned long array.
> 
> .
>
Christoph Hellwig Oct. 18, 2022, 11:05 a.m. UTC | #17
On Tue, Oct 18, 2022 at 12:06:41PM +0300, Sagi Grimberg wrote:
>>   +	/*
>> +	 * Mark the disk dead to prevent new opens, and set the capacity to 0
>> +	 * to end buffered writers dirtying pages that can't be synced.
>> +	 */
>>   	blk_mark_disk_dead(ns->disk);
>> -	nvme_start_ns_queue(ns);
>> -
>>   	set_capacity_and_notify(ns->disk, 0);
>> +
>> +	/* forcibly unquiesce queues to avoid blocking dispatch */
>> +	nvme_start_ns_queue(ns);
>>   }
>
> If we no longer have this ordering requirement, then I don't see why
> the unquiesce cannot move before or after nvme_set_queue_dying and apply
> a tagset-wide quiesce/unquiesce...

Yes.  After this patch we can simply do the tagset-wide unquiesce
after the loop calling blk_mark_disk_dead and set_capacity_and_notify.

We can then also move the NS_DEAD flag to the controller..
Paul E. McKenney Oct. 18, 2022, 3:04 p.m. UTC | #18
On Tue, Oct 18, 2022 at 05:52:06PM +0800, Chao Leng wrote:
> On 2022/10/17 23:21, Paul E. McKenney wrote:
> > On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
> > > On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
> > > > +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> > > > +	if (rcu) {
> > > > +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> > > > +			if (blk_queue_noquiesced(q))
> > > > +				continue;
> > > > +
> > > > +			init_rcu_head(&rcu[i].head);
> > > > +			init_completion(&rcu[i].completion);
> > > > +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
> > > > +			i++;
> > > > +		}
> > > > +
> > > > +		for (i = 0; i < count; i++) {
> > > > +			wait_for_completion(&rcu[i].completion);
> > > > +			destroy_rcu_head(&rcu[i].head);
> > > > +		}
> > > > +		kvfree(rcu);
> > > > +	} else {
> > > > +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> > > > +			synchronize_srcu(q->srcu);
> > > > +	}
> > > 
> > > Having to allocate a struct rcu_synchronize for each of the potentially
> > > many queues here is a bit sad.
> > > 
> > > Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS
> > > last week, so I wonder if something like that would also be feasible
> > > for SRCU, as that would come in really handy here.
> > 
> > There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(),
> > but there would need to be an unsigned long for each srcu_struct from
> > which an SRCU grace period was required.  This would be half the size
> > of the "rcu" array above, but still maybe larger than you would like.
> > 
> > The resulting code might look something like this, with "rcu" now being
> > a pointer to unsigned long:
> > 
> > 	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> > 	if (rcu) {
> > 		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> > 			if (blk_queue_noquiesced(q))
> > 				continue;
> > 			rcu[i] = start_poll_synchronize_srcu(q->srcu);
> > 			i++;
> > 		}
> > 
> > 		for (i = 0; i < count; i++)
> > 			if (!poll_state_synchronize_srcu(q->srcu))
> > 				synchronize_srcu(q->srcu);
> synchronize_srcu will restart a new period of grace.

True, but SRCU grace periods normally complete reasonably quickly, so
the synchronize_srcu() might well be faster than the loop, depending on
what the corresponding SRCU readers are doing.

> Maybe it would be better like this:
> 			while (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
> 				schedule_timeout_uninterruptible(1);

Why not try it both ways and see what happens?  Assuming, that is, that
the differences matter in this code.

							Thanx, Paul

> > 		kvfree(rcu);
> > 	} else {
> > 		list_for_each_entry(q, &set->tag_list, tag_set_list)
> > 			synchronize_srcu(q->srcu);
> > 	}
> > 
> > Or as Christoph suggested, just have a single srcu_struct for the
> > whole group.
> > 
> > The main reason for having multiple srcu_struct structures is to
> > prevent the readers from one from holding up the updaters from another.
> > Except that by waiting for the multiple grace periods, you are losing
> > that property anyway, correct?  Or is this code waiting on only a small
> > fraction of the srcu_struct structures associated with blk_queue?
> > 
> > 							Thanx, Paul
> > .
> >
Ming Lei Oct. 19, 2022, 12:35 a.m. UTC | #19
On Tue, Oct 18, 2022 at 07:19:56AM +0200, Christoph Hellwig wrote:
> On Mon, Oct 17, 2022 at 03:41:15PM -0700, Paul E. McKenney wrote:
> > Then the big question is "how long do the SRCU readers run?"
> > 
> > If all of the readers ran for exactly the same duration, there would be
> > little point in having more than one srcu_struct.
> 
> The SRCU readers are the I/O dispatch, which will have quite similar
> runtimes for the different queues.
> 
> > If the kernel knew up front how long the SRCU readers for a given entry
> > would run, it could provide an srcu_struct structure for each duration.
> > For a (fanciful) example, you could have one srcu_struct structure for
> > SSDs, another for rotating rust, a third for LAN-attached storage, and
> > a fourth for WAN-attached storage.  Maybe a fifth for lunar-based storage.
> 
> All the different request_queues in a tag_set are for the same device.
> There might be some corner cases like storare arrays where they have
> different latencies.  But we're not even waiting for the I/O completion
> here, this just protects the submission.
> 
> > Does that help, or am I off in the weeds here?
> 
> I think this was very helpful, and at least to be moving the srcu_struct
> to the tag_set sounds like a good idea to explore.
> 
> Ming, anything I might have missed?

I think it is fine to move it to the tag_set; that could simplify things a
lot.

The effect could be that blk_mq_quiesce_queue() becomes a little
slower, but it is always in the slow path, and synchronize_srcu() won't
wait for new read-side critical sections.

Just one corner case: in the BLK_MQ_F_BLOCKING case, is there any driver
that may sometimes block too long in ->queue_rq(), such as waiting for dozens
of seconds?


thanks,
Ming
Chao Leng Oct. 19, 2022, 2:39 a.m. UTC | #20
On 2022/10/18 23:04, Paul E. McKenney wrote:
> On Tue, Oct 18, 2022 at 05:52:06PM +0800, Chao Leng wrote:
>> On 2022/10/17 23:21, Paul E. McKenney wrote:
>>> On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
>>>> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
>>>>> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>>>>> +	if (rcu) {
>>>>> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
>>>>> +			if (blk_queue_noquiesced(q))
>>>>> +				continue;
>>>>> +
>>>>> +			init_rcu_head(&rcu[i].head);
>>>>> +			init_completion(&rcu[i].completion);
>>>>> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
>>>>> +			i++;
>>>>> +		}
>>>>> +
>>>>> +		for (i = 0; i < count; i++) {
>>>>> +			wait_for_completion(&rcu[i].completion);
>>>>> +			destroy_rcu_head(&rcu[i].head);
>>>>> +		}
>>>>> +		kvfree(rcu);
>>>>> +	} else {
>>>>> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
>>>>> +			synchronize_srcu(q->srcu);
>>>>> +	}
>>>>
>>>> Having to allocate a struct rcu_synchronize for each of the potentially
>>>> many queues here is a bit sad.
>>>>
>>>> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS
>>>> last week, so I wonder if something like that would also be feasible
>>>> for SRCU, as that would come in really handy here.
>>>
>>> There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(),
>>> but there would need to be an unsigned long for each srcu_struct from
>>> which an SRCU grace period was required.  This would be half the size
>>> of the "rcu" array above, but still maybe larger than you would like.
>>>
>>> The resulting code might look something like this, with "rcu" now being
>>> a pointer to unsigned long:
>>>
>>> 	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>>> 	if (rcu) {
>>> 		list_for_each_entry(q, &set->tag_list, tag_set_list) {
>>> 			if (blk_queue_noquiesced(q))
>>> 				continue;
>>> 			rcu[i] = start_poll_synchronize_srcu(q->srcu);
>>> 			i++;
>>> 		}
>>>
>>> 		for (i = 0; i < count; i++)
>>> 			if (!poll_state_synchronize_srcu(q->srcu))
>>> 				synchronize_srcu(q->srcu);
>> synchronize_srcu will restart a new period of grace.
> 
> True, but SRCU grace periods normally complete reasonably quickly, so
> the synchronize_srcu() might well be faster than the loop, depending on
> what the corresponding SRCU readers are doing.
Yes, different runtimes have different wait times, and it's hard to say
which is better or worse.
> 
>> Maybe it would be better like this:
>> 			while (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
>> 				schedule_timeout_uninterruptible(1);
> 
> Why not try it both ways and see what happens?  Assuming that is, that
> the differences matter in this code.
> 
> 							Thanx, Paul
> 
>>> 		kvfree(rcu);
>>> 	} else {
>>> 		list_for_each_entry(q, &set->tag_list, tag_set_list)
>>> 			synchronize_srcu(q->srcu);
>>> 	}
>>>
>>> Or as Christoph suggested, just have a single srcu_struct for the
>>> whole group.
>>>
>>> The main reason for having multiple srcu_struct structures is to
>>> prevent the readers from one from holding up the updaters from another.
>>> Except that by waiting for the multiple grace periods, you are losing
>>> that property anyway, correct?  Or is this code waiting on only a small
>>> fraction of the srcu_struct structures associated with blk_queue?
>>>
>>> 							Thanx, Paul
>>> .
>>>
> .
>
Sagi Grimberg Oct. 19, 2022, 7:15 a.m. UTC | #21
>>> Then the big question is "how long do the SRCU readers run?"
>>>
>>> If all of the readers ran for exactly the same duration, there would be
>>> little point in having more than one srcu_struct.
>>
>> The SRCU readers are the I/O dispatch, which will have quite similar
>> runtimes for the different queues.
>>
>>> If the kernel knew up front how long the SRCU readers for a given entry
>>> would run, it could provide an srcu_struct structure for each duration.
>>> For a (fanciful) example, you could have one srcu_struct structure for
>>> SSDs, another for rotating rust, a third for LAN-attached storage, and
>>> a fourth for WAN-attached storage.  Maybe a fifth for lunar-based storage.
>>
>> All the different request_queues in a tag_set are for the same device.
>> There might be some corner cases like storare arrays where they have
>> different latencies.  But we're not even waiting for the I/O completion
>> here, this just protects the submission.
>>
>>> Does that help, or am I off in the weeds here?
>>
>> I think this was very helpful, and at least to be moving the srcu_struct
>> to the tag_set sounds like a good idea to explore.
>>
>> Ming, anything I might have missed?
> 
> I think it is fine to move it to tag_set, this way could simplify a
> lot.
> 
> The effect could be that blk_mq_quiesce_queue() becomes a little
> slow, but it is always in slow path, and synchronize_srcu() won't
> wait new read-side critical-section.
> 
> Just one corner case, in case of BLK_MQ_F_BLOCKING, is there any such driver
> which may block too long in ->queue_rq() sometime? such as wait for dozens
> of seconds.

nvme-tcp will opportunistically try a network send directly from
.queue_rq(), but always with MSG_DONTWAIT, so that is not a problem.

nbd, though, can block in .queue_rq() with blocking network sends; however,
afaict nbd allocates a tagset per nbd_device, and also a request_queue
per device, so it's effectively the same whether the srcu is in the tagset or
in the request_queue.
Christoph Hellwig Oct. 19, 2022, 7:25 a.m. UTC | #22
On Wed, Oct 19, 2022 at 10:15:26AM +0300, Sagi Grimberg wrote:
> nvme-tcp will opportunistically try a network send directly from
> .queue_rq(), but always with MSG_DONTWAIT, so that is not a problem.
>
> nbd though can block in .queue_rq() with blocking network sends, however
> afaict nbd allocates a tagset per nbd_device, and also a request_queue
> per device, so its effectively the same if the srcu is in the tagset or
> in the request_queue.

Yes, the only multiple-request_queue-per-tag_set cases right now
are nvme-tcp and mmc.  There have been patches for iSCSI, but they
seem to be stuck.
Christoph Hellwig Oct. 19, 2022, 7:27 a.m. UTC | #23
On Wed, Oct 19, 2022 at 09:25:35AM +0200, Christoph Hellwig wrote:
> Yes, the only multiple request_queue per tag_set cases right now
> are nvme-tcp and mmc.  There have been patches from iSCSI, but they
> seem to be stuck.

... and looking at the only blk_mq_quiesce_queue caller in mmc,
it would benefit from a tagset-wide quiesce after a bit of refactoring
as well.
Sagi Grimberg Oct. 19, 2022, 7:30 a.m. UTC | #24
>> nvme-tcp will opportunistically try a network send directly from
>> .queue_rq(), but always with MSG_DONTWAIT, so that is not a problem.
>>
>> nbd though can block in .queue_rq() with blocking network sends, however
>> afaict nbd allocates a tagset per nbd_device, and also a request_queue
>> per device, so its effectively the same if the srcu is in the tagset or
>> in the request_queue.
> 
> Yes, the only multiple request_queue per tag_set cases right now
> are nvme-tcp and mmc.  There have been patches from iSCSI, but they
> seem to be stuck.

iscsi defers network sends to a workqueue anyway, and no scsi lld sets
BLK_MQ_F_BLOCKING.
Christoph Hellwig Oct. 19, 2022, 7:32 a.m. UTC | #25
On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote:
> iscsi defers network sends to a workqueue anyways, and no scsi lld sets
> BLK_MQ_F_BLOCKING.

Yes, but Mike had a series a while ago to implement a fast path similar to
nvme-tcp's that included supporting BLK_MQ_F_BLOCKING in the scsi
midlayer.  We'll also need it to convert the paride drivers to libata,
but those are single request_queue per tag_set as far as I know.
Sagi Grimberg Oct. 19, 2022, 7:57 a.m. UTC | #26
On 10/19/22 10:32, Christoph Hellwig wrote:
> On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote:
>> iscsi defers network sends to a workqueue anyways, and no scsi lld sets
>> BLK_MQ_F_BLOCKING.
> 
> Yes, but Mike had a series to implement a similar fast path as nvme-tcp
> as while ago that included supporting BLK_MQ_F_BLOCKING in the scsi
> midlayer.

Hmmm, didn't see that, pointer?

>  We'll also need it to convert the paride drivers to libata,
> but those are single request_queue per tag_set as far as I know.

didn't see paride even attempting to quiesce...
Christoph Hellwig Oct. 19, 2022, 8:17 a.m. UTC | #27
On Wed, Oct 19, 2022 at 10:57:11AM +0300, Sagi Grimberg wrote:
>
>
> On 10/19/22 10:32, Christoph Hellwig wrote:
>> On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote:
>>> iscsi defers network sends to a workqueue anyways, and no scsi lld sets
>>> BLK_MQ_F_BLOCKING.
>>
>> Yes, but Mike had a series to implement a similar fast path as nvme-tcp
>> as while ago that included supporting BLK_MQ_F_BLOCKING in the scsi
>> midlayer.
>
> Hmmm, didn't see that, pointer?

https://www.spinics.net/lists/linux-scsi/msg170849.html

>>  We'll also need it to convert the paride drivers to libata,
>> but those are single request_queue per tag_set as far as I know.
>
> didn't see paride even attempting to quiesce...

The block layer might.  But it really does not matter for our
considerations here..
Sagi Grimberg Oct. 19, 2022, 8:29 a.m. UTC | #28
>>>> iscsi defers network sends to a workqueue anyways, and no scsi lld sets
>>>> BLK_MQ_F_BLOCKING.
>>>
>>> Yes, but Mike had a series to implement a similar fast path as nvme-tcp
>>> as while ago that included supporting BLK_MQ_F_BLOCKING in the scsi
>>> midlayer.
>>
>> Hmmm, didn't see that, pointer?
> 
> https://www.spinics.net/lists/linux-scsi/msg170849.html

Nice.

>>>   We'll also need it to convert the paride drivers to libata,
>>> but those are single request_queue per tag_set as far as I know.
>>
>> didn't see paride even attempting to quiesce...
> 
> The block layer might.  But it really does not matter for our
> considerations here..

Right.

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8070b6c10e8d..ebe25da08156 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -29,6 +29,7 @@ 
 #include <linux/prefetch.h>
 #include <linux/blk-crypto.h>
 #include <linux/part_stat.h>
+#include <linux/rcupdate_wait.h>
 
 #include <trace/events/block.h>
 
@@ -311,6 +312,80 @@  void blk_mq_unquiesce_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 
+static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
+{
+	int i = 0;
+	int count = 0;
+	struct request_queue *q;
+	struct rcu_synchronize *rcu;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		if (blk_queue_noquiesced(q))
+			continue;
+
+		blk_mq_quiesce_queue_nowait(q);
+		count++;
+	}
+
+	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
+	if (rcu) {
+		list_for_each_entry(q, &set->tag_list, tag_set_list) {
+			if (blk_queue_noquiesced(q))
+				continue;
+
+			init_rcu_head(&rcu[i].head);
+			init_completion(&rcu[i].completion);
+			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
+			i++;
+		}
+
+		for (i = 0; i < count; i++) {
+			wait_for_completion(&rcu[i].completion);
+			destroy_rcu_head(&rcu[i].head);
+		}
+		kvfree(rcu);
+	} else {
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			synchronize_srcu(q->srcu);
+	}
+}
+
+static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		if (blk_queue_noquiesced(q))
+			continue;
+
+		blk_mq_quiesce_queue_nowait(q);
+	}
+	synchronize_rcu();
+}
+
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+	mutex_lock(&set->tag_list_lock);
+	if (set->flags & BLK_MQ_F_BLOCKING)
+		blk_mq_quiesce_blocking_tagset(set);
+	else
+		blk_mq_quiesce_nonblocking_tagset(set);
+
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	mutex_lock(&set->tag_list_lock);
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_unquiesce_queue(q);
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
 void blk_mq_wake_waiters(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ba18e9bdb799..1df47606d0a7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -877,6 +877,8 @@  void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_quiesce_queue(struct request_queue *q);
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
 void blk_mq_wait_quiesce_done(struct request_queue *q);
 void blk_mq_unquiesce_queue(struct request_queue *q);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50e358a19d98..f15544299a67 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -579,6 +579,7 @@  struct request_queue {
 #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
 #define QUEUE_FLAG_SQ_SCHED     30	/* single queue style io dispatch */
+#define QUEUE_FLAG_NOQUIESCED	31	/* queue is never quiesced */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1UL << QUEUE_FLAG_IO_STAT) |		\
 				 (1UL << QUEUE_FLAG_SAME_COMP) |	\
@@ -619,6 +620,7 @@  bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_pm_only(q)	atomic_read(&(q)->pm_only)
 #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
 #define blk_queue_sq_sched(q)	test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
+#define blk_queue_noquiesced(q)	test_bit(QUEUE_FLAG_NOQUIESCED, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);