Message ID | 20221013094450.5947-2-lengchao@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | improve nvme quiesce time for large amount of namespaces | expand |
On 10/13/22 12:44, Chao Leng wrote: > Drivers that have shared tagsets may need to quiesce potentially a lot > of request queues that all share a single tagset (e.g. nvme). Add an interface > to quiesce all the queues on a given tagset. This interface is useful because > it can speedup the quiesce by doing it in parallel. > > For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs > in parallel such that all of them wait for the same rcu elapsed period with > a per-hctx heap allocated rcu_synchronize. for tagsets that don't have > BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is > sufficient. > > Because some queues never need to be quiesced(e.g. nvme connect_q). > So introduce QUEUE_FLAG_NOQUIESCED to tagset quiesce interface to > skip the queue. I wouldn't say it never nor will ever quiesce, we just don't happen to quiesce it today... > > Signed-off-by: Sagi Grimberg <sagi@grimberg.me> > Signed-off-by: Chao Leng <lengchao@huawei.com> > --- > block/blk-mq.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/blk-mq.h | 2 ++ > include/linux/blkdev.h | 2 ++ > 3 files changed, 79 insertions(+) > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 8070b6c10e8d..ebe25da08156 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -29,6 +29,7 @@ > #include <linux/prefetch.h> > #include <linux/blk-crypto.h> > #include <linux/part_stat.h> > +#include <linux/rcupdate_wait.h> > > #include <trace/events/block.h> > > @@ -311,6 +312,80 @@ void blk_mq_unquiesce_queue(struct request_queue *q) > } > EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); > > +static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set) > +{ > + int i = 0; > + int count = 0; > + struct request_queue *q; > + struct rcu_synchronize *rcu; > + > + list_for_each_entry(q, &set->tag_list, tag_set_list) { > + if (blk_queue_noquiesced(q)) > + continue; > + > + blk_mq_quiesce_queue_nowait(q); > + count++; > + } > + > + rcu = kvmalloc(count * 
sizeof(*rcu), GFP_KERNEL); > + if (rcu) { > + list_for_each_entry(q, &set->tag_list, tag_set_list) { > + if (blk_queue_noquiesced(q)) > + continue; > + > + init_rcu_head(&rcu[i].head); > + init_completion(&rcu[i].completion); > + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); > + i++; > + } > + > + for (i = 0; i < count; i++) { > + wait_for_completion(&rcu[i].completion); > + destroy_rcu_head(&rcu[i].head); > + } > + kvfree(rcu); > + } else { > + list_for_each_entry(q, &set->tag_list, tag_set_list) > + synchronize_srcu(q->srcu); > + } > +} > + > +static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set) > +{ > + struct request_queue *q; > + > + list_for_each_entry(q, &set->tag_list, tag_set_list) { > + if (blk_queue_noquiesced(q)) > + continue; > + > + blk_mq_quiesce_queue_nowait(q); > + } > + synchronize_rcu(); > +} > + > +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) > +{ > + mutex_lock(&set->tag_list_lock); > + if (set->flags & BLK_MQ_F_BLOCKING) > + blk_mq_quiesce_blocking_tagset(set); > + else > + blk_mq_quiesce_nonblocking_tagset(set); > + > + mutex_unlock(&set->tag_list_lock); > +} > +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); > + > +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) > +{ > + struct request_queue *q; > + > + mutex_lock(&set->tag_list_lock); > + list_for_each_entry(q, &set->tag_list, tag_set_list) > + blk_mq_unquiesce_queue(q); > + mutex_unlock(&set->tag_list_lock); > +} > +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); > + > void blk_mq_wake_waiters(struct request_queue *q) > { > struct blk_mq_hw_ctx *hctx; > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index ba18e9bdb799..1df47606d0a7 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -877,6 +877,8 @@ void blk_mq_start_hw_queues(struct request_queue *q); > void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); > void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); > void 
blk_mq_quiesce_queue(struct request_queue *q); > +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); > +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); > void blk_mq_wait_quiesce_done(struct request_queue *q); > void blk_mq_unquiesce_queue(struct request_queue *q); > void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index 50e358a19d98..f15544299a67 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -579,6 +579,7 @@ struct request_queue { > #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ > #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ > #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ > +#define QUEUE_FLAG_NOQUIESCED 31 /* queue is never quiesced */ the comment is misleading. If this is truely queue that is never quiescing then blk_mq_quiesce_queue() and friends need to skip it. I'd call it self_quiesce or something that would reflect that it is black-listed from tagset-wide quiesce.
On 2022/10/13 18:28, Sagi Grimberg wrote: > > > On 10/13/22 12:44, Chao Leng wrote: >> Drivers that have shared tagsets may need to quiesce potentially a lot >> of request queues that all share a single tagset (e.g. nvme). Add an interface >> to quiesce all the queues on a given tagset. This interface is useful because >> it can speedup the quiesce by doing it in parallel. >> >> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs >> in parallel such that all of them wait for the same rcu elapsed period with >> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have >> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is >> sufficient. >> >> Because some queues never need to be quiesced(e.g. nvme connect_q). >> So introduce QUEUE_FLAG_NOQUIESCED to tagset quiesce interface to >> skip the queue. > > I wouldn't say it never nor will ever quiesce, we just don't happen to > quiesce it today... > >> >> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> >> Signed-off-by: Chao Leng <lengchao@huawei.com> >> --- >> block/blk-mq.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ >> include/linux/blk-mq.h | 2 ++ >> include/linux/blkdev.h | 2 ++ >> 3 files changed, 79 insertions(+) >> >> diff --git a/block/blk-mq.c b/block/blk-mq.c >> index 8070b6c10e8d..ebe25da08156 100644 >> --- a/block/blk-mq.c >> +++ b/block/blk-mq.c >> @@ -29,6 +29,7 @@ >> #include <linux/prefetch.h> >> #include <linux/blk-crypto.h> >> #include <linux/part_stat.h> >> +#include <linux/rcupdate_wait.h> >> #include <trace/events/block.h> >> @@ -311,6 +312,80 @@ void blk_mq_unquiesce_queue(struct request_queue *q) >> } >> EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); >> +static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set) >> +{ >> + int i = 0; >> + int count = 0; >> + struct request_queue *q; >> + struct rcu_synchronize *rcu; >> + >> + list_for_each_entry(q, &set->tag_list, tag_set_list) { >> + if (blk_queue_noquiesced(q)) >> + 
continue; >> + >> + blk_mq_quiesce_queue_nowait(q); >> + count++; >> + } >> + >> + rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); >> + if (rcu) { >> + list_for_each_entry(q, &set->tag_list, tag_set_list) { >> + if (blk_queue_noquiesced(q)) >> + continue; >> + >> + init_rcu_head(&rcu[i].head); >> + init_completion(&rcu[i].completion); >> + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); >> + i++; >> + } >> + >> + for (i = 0; i < count; i++) { >> + wait_for_completion(&rcu[i].completion); >> + destroy_rcu_head(&rcu[i].head); >> + } >> + kvfree(rcu); >> + } else { >> + list_for_each_entry(q, &set->tag_list, tag_set_list) >> + synchronize_srcu(q->srcu); >> + } >> +} >> + >> +static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set) >> +{ >> + struct request_queue *q; >> + >> + list_for_each_entry(q, &set->tag_list, tag_set_list) { >> + if (blk_queue_noquiesced(q)) >> + continue; >> + >> + blk_mq_quiesce_queue_nowait(q); >> + } >> + synchronize_rcu(); >> +} >> + >> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) >> +{ >> + mutex_lock(&set->tag_list_lock); >> + if (set->flags & BLK_MQ_F_BLOCKING) >> + blk_mq_quiesce_blocking_tagset(set); >> + else >> + blk_mq_quiesce_nonblocking_tagset(set); >> + >> + mutex_unlock(&set->tag_list_lock); >> +} >> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); >> + >> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) >> +{ >> + struct request_queue *q; >> + >> + mutex_lock(&set->tag_list_lock); >> + list_for_each_entry(q, &set->tag_list, tag_set_list) >> + blk_mq_unquiesce_queue(q); >> + mutex_unlock(&set->tag_list_lock); >> +} >> +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); >> + >> void blk_mq_wake_waiters(struct request_queue *q) >> { >> struct blk_mq_hw_ctx *hctx; >> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h >> index ba18e9bdb799..1df47606d0a7 100644 >> --- a/include/linux/blk-mq.h >> +++ b/include/linux/blk-mq.h >> @@ -877,6 +877,8 @@ void blk_mq_start_hw_queues(struct request_queue 
*q); >> void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); >> void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); >> void blk_mq_quiesce_queue(struct request_queue *q); >> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); >> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); >> void blk_mq_wait_quiesce_done(struct request_queue *q); >> void blk_mq_unquiesce_queue(struct request_queue *q); >> void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); >> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h >> index 50e358a19d98..f15544299a67 100644 >> --- a/include/linux/blkdev.h >> +++ b/include/linux/blkdev.h >> @@ -579,6 +579,7 @@ struct request_queue { >> #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ >> #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ >> #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ >> +#define QUEUE_FLAG_NOQUIESCED 31 /* queue is never quiesced */ > > the comment is misleading. If this is truely queue that is never > quiescing then blk_mq_quiesce_queue() and friends need to skip it. > > I'd call it self_quiesce or something that would reflect that it is > black-listed from tagset-wide quiesce. Yes, you are right. I will modify the comment in patch V3. > .
On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote: > + rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); > + if (rcu) { > + list_for_each_entry(q, &set->tag_list, tag_set_list) { > + if (blk_queue_noquiesced(q)) > + continue; > + > + init_rcu_head(&rcu[i].head); > + init_completion(&rcu[i].completion); > + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); > + i++; > + } > + > + for (i = 0; i < count; i++) { > + wait_for_completion(&rcu[i].completion); > + destroy_rcu_head(&rcu[i].head); > + } > + kvfree(rcu); > + } else { > + list_for_each_entry(q, &set->tag_list, tag_set_list) > + synchronize_srcu(q->srcu); > + } Having to allocate a struct rcu_synchronize for each of the potentially many queues here is a bit sad. Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS last week, so I wonder if something like that would also be feasible for SRCU, as that would come in really handy here.
On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote: > Having to allocate a struct rcu_synchronize for each of the potentially > many queues here is a bit sad. > > Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS > last week, so I wonder if something like that would also be feasible > for SRCU, as that would come in really handy here. Alternatively I wonder if we could simply use a single srcu_struct in the tag_set instead of one per-queue. That would potentially increase the number of read side critical sections blk_mq_wait_quiesce_done would have to wait for in tagsets with multiple queues, but I wonder how much overhead that would be in practice.
On Thu, Oct 13, 2022 at 01:28:37PM +0300, Sagi Grimberg wrote: >> Because some queues never need to be quiesced(e.g. nvme connect_q). >> So introduce QUEUE_FLAG_NOQUIESCED to tagset quiesce interface to >> skip the queue. > > I wouldn't say it never nor will ever quiesce, we just don't happen to > quiesce it today... Yes. It really is a QUEUE_FLAG_SKIP_TAGSET_QUIESCE.
On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote: > On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote: > > + rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); > > + if (rcu) { > > + list_for_each_entry(q, &set->tag_list, tag_set_list) { > > + if (blk_queue_noquiesced(q)) > > + continue; > > + > > + init_rcu_head(&rcu[i].head); > > + init_completion(&rcu[i].completion); > > + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); > > + i++; > > + } > > + > > + for (i = 0; i < count; i++) { > > + wait_for_completion(&rcu[i].completion); > > + destroy_rcu_head(&rcu[i].head); > > + } > > + kvfree(rcu); > > + } else { > > + list_for_each_entry(q, &set->tag_list, tag_set_list) > > + synchronize_srcu(q->srcu); > > + } > > Having to allocate a struct rcu_synchronize for each of the potentially > many queues here is a bit sad. > > Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS > last week, so I wonder if something like that would also be feasible > for SRCU, as that would come in really handy here. There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(), but there would need to be an unsigned long for each srcu_struct from which an SRCU grace period was required. This would be half the size of the "rcu" array above, but still maybe larger than you would like. The resulting code might look something like this, with "rcu" now being a pointer to unsigned long: rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); if (rcu) { list_for_each_entry(q, &set->tag_list, tag_set_list) { if (blk_queue_noquiesced(q)) continue; rcu[i] = start_poll_synchronize_srcu(q->srcu); i++; } for (i = 0; i < count; i++) if (!poll_state_synchronize_srcu(q->srcu)) synchronize_srcu(q->srcu); kvfree(rcu); } else { list_for_each_entry(q, &set->tag_list, tag_set_list) synchronize_srcu(q->srcu); } Or as Christoph suggested, just have a single srcu_struct for the whole group. 
The main reason for having multiple srcu_struct structures is to prevent the readers from one from holding up the updaters from another. Except that by waiting for the multiple grace periods, you are losing that property anyway, correct? Or is this code waiting on only a small fraction of the srcu_struct structures associated with blk_queue? Thanx, Paul
On Mon, Oct 17, 2022 at 08:21:36AM -0700, Paul E. McKenney wrote: > The main reason for having multiple srcu_struct structures is to > prevent the readers from one from holding up the updaters from another. > Except that by waiting for the multiple grace periods, you are losing > that property anyway, correct? Or is this code waiting on only a small > fraction of the srcu_struct structures associated with blk_queue? There are a few places that do this. SCSI and MMC could probably switch to this scheme or at least an open coded version of it (if we move to a per-tagset srcu_struct open coding it might be a better idea anyway). del_gendisk is one where we have to go one queue at a time, and that might actually matter for a device removal. elevator_switch is another one, there is a variant for it that works on the whole tagset, but also those that don't.
On Mon, Oct 17, 2022 at 05:31:05PM +0200, Christoph Hellwig wrote: > On Mon, Oct 17, 2022 at 08:21:36AM -0700, Paul E. McKenney wrote: > > The main reason for having multiple srcu_struct structures is to > > prevent the readers from one from holding up the updaters from another. > > Except that by waiting for the multiple grace periods, you are losing > > that property anyway, correct? Or is this code waiting on only a small > > fraction of the srcu_struct structures associated with blk_queue? > > There are a few places that do this. SCSI and MMC could probably switch > to this scheme or at least and open coded version of it (if we move > to a per_tagsect srcu_struct open coding it might be a better idea > anyway). > > del_gendisk is one where we have to go one queue at a time, and > that might actually matter for a device removal. elevator_switch > is another one, there is a variant for it that works on the whole > tagset, but also those that don't. OK, thank you for the info! Then the big question is "how long do the SRCU readers run?" If all of the readers ran for exactly the same duration, there would be little point in having more than one srcu_struct. If the kernel knew up front how long the SRCU readers for a given entry would run, it could provide an srcu_struct structure for each duration. For a (fanciful) example, you could have one srcu_struct structure for SSDs, another for rotating rust, a third for LAN-attached storage, and a fourth for WAN-attached storage. Maybe a fifth for lunar-based storage. If it is impossible to know up front which entry has what SRCU read-side latency characteristics, then the current approach of just giving each entry its own srcu_struct structure is not at all a bad plan. Does that help, or am I off in the weeds here? Thanx, Paul
On Mon, Oct 17, 2022 at 03:41:15PM -0700, Paul E. McKenney wrote: > Then the big question is "how long do the SRCU readers run?" > > If all of the readers ran for exactly the same duration, there would be > little point in having more than one srcu_struct. The SRCU readers are the I/O dispatch, which will have quite similar runtimes for the different queues. > If the kernel knew up front how long the SRCU readers for a given entry > would run, it could provide an srcu_struct structure for each duration. > For a (fanciful) example, you could have one srcu_struct structure for > SSDs, another for rotating rust, a third for LAN-attached storage, and > a fourth for WAN-attached storage. Maybe a fifth for lunar-based storage. All the different request_queues in a tag_set are for the same device. There might be some corner cases like storage arrays where they have different latencies. But we're not even waiting for the I/O completion here, this just protects the submission. > Does that help, or am I off in the weeds here? I think this was very helpful, and at least to me moving the srcu_struct to the tag_set sounds like a good idea to explore. Ming, anything I might have missed?
>> Having to allocate a struct rcu_synchronize for each of the potentially >> many queues here is a bit sad. >> >> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS >> last week, so I wonder if something like that would also be feasible >> for SRCU, as that would come in really handy here. > > Alternatively I wonder if we could simply use a single srcu_struct > in the tag_set instead of one per-queue. That would potentially > increase the number of read side critical sections > blk_mq_wait_quiesce_done would have to wait for in tagsets with > multiple queues, but I wonder how much overhead that would be in > practive. It used to be on the hctx actually. It was moved to the request_queue in an attempt to make parallel quiesce that didn't eventually complete afaict. For nvme, we never really quiesce a single namespace, so it will be a net positive IMO. I also think that quiesce is really more a tagset operation so I would think that its a more natural location. The only question in my mind is regarding patch 2/2 with the subtle ordering of nvme_set_queue_dying...
On Tue, Oct 18, 2022 at 11:39:30AM +0300, Sagi Grimberg wrote: > The only question in my mind is regarding patch 2/2 with the subtle > ordering of nvme_set_queue_dying... I think we need to sort that out as a pre-patch. There is quite some history there was a fair amount of dead lock potential earlier, but I think I sorted quite a lot out there, including a new lock for setting the gendisk size, which is what the comment refers to. Something like this: diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 059737c1a2c19..cb7623b5d2c8b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5105,21 +5105,21 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns) /* * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. */ static void nvme_set_queue_dying(struct nvme_ns *ns) { if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; + /* + * Mark the disk dead to prevent new opens, and set the capacity to 0 + * to end buffered writers dirtying pages that can't be synced. + */ blk_mark_disk_dead(ns->disk); - nvme_start_ns_queue(ns); - set_capacity_and_notify(ns->disk, 0); + + /* forcibly unquiesce queues to avoid blocking dispatch */ + nvme_start_ns_queue(ns); } /**
>> The only question in my mind is regarding patch 2/2 with the subtle >> ordering of nvme_set_queue_dying... > > I think we need to sort that out as a pre-patch. There is quite some > history there was a fair amount of dead lock potential earlier, but > I think I sorted quite a lot out there, including a new lock for setting > the gendisk size, which is what the comment refers to. > > Something like this: > > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c > index 059737c1a2c19..cb7623b5d2c8b 100644 > --- a/drivers/nvme/host/core.c > +++ b/drivers/nvme/host/core.c > @@ -5105,21 +5105,21 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns) > > /* > * Prepare a queue for teardown. > - * > - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set > - * the capacity to 0 after that to avoid blocking dispatchers that may be > - * holding bd_butex. This will end buffered writers dirtying pages that can't > - * be synced. > */ > static void nvme_set_queue_dying(struct nvme_ns *ns) > { > if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) > return; > > + /* > + * Mark the disk dead to prevent new opens, and set the capacity to 0 > + * to end buffered writers dirtying pages that can't be synced. > + */ > blk_mark_disk_dead(ns->disk); > - nvme_start_ns_queue(ns); > - > set_capacity_and_notify(ns->disk, 0); > + > + /* forcibly unquiesce queues to avoid blocking dispatch */ > + nvme_start_ns_queue(ns); > } If we no longer have this ordering requirement, then I don't see why the unquiesce cannot move before or after nvme_set_queue_dying and apply a tagset-wide quiesce/unquiesce...
On 2022/10/17 21:43, Christoph Hellwig wrote: > On Thu, Oct 13, 2022 at 01:28:37PM +0300, Sagi Grimberg wrote: >>> Because some queues never need to be quiesced(e.g. nvme connect_q). >>> So introduce QUEUE_FLAG_NOQUIESCED to tagset quiesce interface to >>> skip the queue. >> >> I wouldn't say it never nor will ever quiesce, we just don't happen to >> quiesce it today... > > Yes. It really is a QUEUE_FLAG_SKIP_TAGSET_QUIESCE. I will modify it in patch V3. > > > . >
On 2022/10/17 23:21, Paul E. McKenney wrote: > On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote: >> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote: >>> + rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); >>> + if (rcu) { >>> + list_for_each_entry(q, &set->tag_list, tag_set_list) { >>> + if (blk_queue_noquiesced(q)) >>> + continue; >>> + >>> + init_rcu_head(&rcu[i].head); >>> + init_completion(&rcu[i].completion); >>> + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); >>> + i++; >>> + } >>> + >>> + for (i = 0; i < count; i++) { >>> + wait_for_completion(&rcu[i].completion); >>> + destroy_rcu_head(&rcu[i].head); >>> + } >>> + kvfree(rcu); >>> + } else { >>> + list_for_each_entry(q, &set->tag_list, tag_set_list) >>> + synchronize_srcu(q->srcu); >>> + } >> >> Having to allocate a struct rcu_synchronize for each of the potentially >> many queues here is a bit sad. >> >> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS >> last week, so I wonder if something like that would also be feasible >> for SRCU, as that would come in really handy here. > > There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(), > but there would need to be an unsigned long for each srcu_struct from > which an SRCU grace period was required. This would be half the size > of the "rcu" array above, but still maybe larger than you would like. > > The resulting code might look something like this, with "rcu" now being > a pointer to unsigned long: > > rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); > if (rcu) { > list_for_each_entry(q, &set->tag_list, tag_set_list) { > if (blk_queue_noquiesced(q)) > continue; > rcu[i] = start_poll_synchronize_srcu(q->srcu); > i++; > } > > for (i = 0; i < count; i++) > if (!poll_state_synchronize_srcu(q->srcu)) > synchronize_srcu(q->srcu); synchronize_srcu will restart a new period of grace. 
Maybe it would be better like this: while (!poll_state_synchronize_srcu(q->srcu, rcu[i])) schedule_timeout_uninterruptible(1); > kvfree(rcu); > } else { > list_for_each_entry(q, &set->tag_list, tag_set_list) > synchronize_srcu(q->srcu); > } > > Or as Christoph suggested, just have a single srcu_struct for the > whole group. > > The main reason for having multiple srcu_struct structures is to > prevent the readers from one from holding up the updaters from another. > Except that by waiting for the multiple grace periods, you are losing > that property anyway, correct? Or is this code waiting on only a small > fraction of the srcu_struct structures associated with blk_queue? > > Thanx, Paul > . >
On 2022/10/17 21:39, Christoph Hellwig wrote: > On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote: >> + rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); >> + if (rcu) { >> + list_for_each_entry(q, &set->tag_list, tag_set_list) { >> + if (blk_queue_noquiesced(q)) >> + continue; >> + >> + init_rcu_head(&rcu[i].head); >> + init_completion(&rcu[i].completion); >> + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); >> + i++; >> + } >> + >> + for (i = 0; i < count; i++) { >> + wait_for_completion(&rcu[i].completion); >> + destroy_rcu_head(&rcu[i].head); >> + } >> + kvfree(rcu); >> + } else { >> + list_for_each_entry(q, &set->tag_list, tag_set_list) >> + synchronize_srcu(q->srcu); >> + } > > Having to allocate a struct rcu_synchronize for each of the potentially > many queues here is a bit sad. > > Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS > last week, so I wonder if something like that would also be feasible > for SRCU, as that would come in really handy here. Using start_poll_synchronize_rcu will reduce some memory. This is a good idea. > > . >
On 2022/10/17 21:42, Christoph Hellwig wrote: > On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote: >> Having to allocate a struct rcu_synchronize for each of the potentially >> many queues here is a bit sad. >> >> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS >> last week, so I wonder if something like that would also be feasible >> for SRCU, as that would come in really handy here. > > Alternatively I wonder if we could simply use a single srcu_struct > in the tag_set instead of one per-queue. That would potentially > increase the number of read side critical sections > blk_mq_wait_quiesce_done would have to wait for in tagsets with > multiple queues, but I wonder how much overhead that would be in > practive. Now we submit request based on queues, maybe it is difficult to use a single srcu_struct in the tag instead of one per-queue. Using start_poll_synchronize_rcu still need a unsigned long array. > > . >
On Tue, Oct 18, 2022 at 12:06:41PM +0300, Sagi Grimberg wrote: >> + /* >> + * Mark the disk dead to prevent new opens, and set the capacity to 0 >> + * to end buffered writers dirtying pages that can't be synced. >> + */ >> blk_mark_disk_dead(ns->disk); >> - nvme_start_ns_queue(ns); >> - >> set_capacity_and_notify(ns->disk, 0); >> + >> + /* forcibly unquiesce queues to avoid blocking dispatch */ >> + nvme_start_ns_queue(ns); >> } > > If we no longer have this ordering requirement, then I don't see why > the unquiesce cannot move before or after nvme_set_queue_dying and apply > a tagset-wide quiesce/unquiesce... Yes. After this patch we can simply do the tagset-wide unquiesce after the loop calling blk_mark_disk_dead and set_capacity_and_notify. We can then also move the NS_DEAD flag to the controller..
On Tue, Oct 18, 2022 at 05:52:06PM +0800, Chao Leng wrote: > On 2022/10/17 23:21, Paul E. McKenney wrote: > > On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote: > > > On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote: > > > > + rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); > > > > + if (rcu) { > > > > + list_for_each_entry(q, &set->tag_list, tag_set_list) { > > > > + if (blk_queue_noquiesced(q)) > > > > + continue; > > > > + > > > > + init_rcu_head(&rcu[i].head); > > > > + init_completion(&rcu[i].completion); > > > > + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); > > > > + i++; > > > > + } > > > > + > > > > + for (i = 0; i < count; i++) { > > > > + wait_for_completion(&rcu[i].completion); > > > > + destroy_rcu_head(&rcu[i].head); > > > > + } > > > > + kvfree(rcu); > > > > + } else { > > > > + list_for_each_entry(q, &set->tag_list, tag_set_list) > > > > + synchronize_srcu(q->srcu); > > > > + } > > > > > > Having to allocate a struct rcu_synchronize for each of the potentially > > > many queues here is a bit sad. > > > > > > Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS > > > last week, so I wonder if something like that would also be feasible > > > for SRCU, as that would come in really handy here. > > > > There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(), > > but there would need to be an unsigned long for each srcu_struct from > > which an SRCU grace period was required. This would be half the size > > of the "rcu" array above, but still maybe larger than you would like. 
> > > > The resulting code might look something like this, with "rcu" now being > > a pointer to unsigned long: > > > > rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); > > if (rcu) { > > list_for_each_entry(q, &set->tag_list, tag_set_list) { > > if (blk_queue_noquiesced(q)) > > continue; > > rcu[i] = start_poll_synchronize_srcu(q->srcu); > > i++; > > } > > > > for (i = 0; i < count; i++) > > if (!poll_state_synchronize_srcu(q->srcu)) > > synchronize_srcu(q->srcu); > synchronize_srcu will restart a new period of grace. True, but SRCU grace periods normally complete reasonably quickly, so the synchronize_srcu() might well be faster than the loop, depending on what the corresponding SRCU readers are doing. > Maybe it would be better like this: > while (!poll_state_synchronize_srcu(q->srcu, rcu[i])) > schedule_timeout_uninterruptible(1); Why not try it both ways and see what happens? Assuming that is, that the differences matter in this code. Thanx, Paul > > kvfree(rcu); > > } else { > > list_for_each_entry(q, &set->tag_list, tag_set_list) > > synchronize_srcu(q->srcu); > > } > > > > Or as Christoph suggested, just have a single srcu_struct for the > > whole group. > > > > The main reason for having multiple srcu_struct structures is to > > prevent the readers from one from holding up the updaters from another. > > Except that by waiting for the multiple grace periods, you are losing > > that property anyway, correct? Or is this code waiting on only a small > > fraction of the srcu_struct structures associated with blk_queue? > > > > Thanx, Paul > > . > >
On Tue, Oct 18, 2022 at 07:19:56AM +0200, Christoph Hellwig wrote: > On Mon, Oct 17, 2022 at 03:41:15PM -0700, Paul E. McKenney wrote: > > Then the big question is "how long do the SRCU readers run?" > > > > If all of the readers ran for exactly the same duration, there would be > > little point in having more than one srcu_struct. > > The SRCU readers are the I/O dispatch, which will have quite similar > runtimes for the different queues. > > > If the kernel knew up front how long the SRCU readers for a given entry > > would run, it could provide an srcu_struct structure for each duration. > > For a (fanciful) example, you could have one srcu_struct structure for > > SSDs, another for rotating rust, a third for LAN-attached storage, and > > a fourth for WAN-attached storage. Maybe a fifth for lunar-based storage. > > All the different request_queues in a tag_set are for the same device. > There might be some corner cases like storage arrays where they have > different latencies. But we're not even waiting for the I/O completion > here, this just protects the submission. > > > Does that help, or am I off in the weeds here? > > I think this was very helpful, and at least to be moving the srcu_struct > to the tag_set sounds like a good idea to explore. > > Ming, anything I might have missed? I think it is fine to move it to tag_set, this way could simplify a lot. The effect could be that blk_mq_quiesce_queue() becomes a little slow, but it is always in slow path, and synchronize_srcu() won't wait new read-side critical-section. Just one corner case, in case of BLK_MQ_F_BLOCKING, is there any such driver which may block too long in ->queue_rq() sometime? such as wait for dozens of seconds. thanks, Ming
On 2022/10/18 23:04, Paul E. McKenney wrote: > On Tue, Oct 18, 2022 at 05:52:06PM +0800, Chao Leng wrote: >> On 2022/10/17 23:21, Paul E. McKenney wrote: >>> On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote: >>>> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote: >>>>> + rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); >>>>> + if (rcu) { >>>>> + list_for_each_entry(q, &set->tag_list, tag_set_list) { >>>>> + if (blk_queue_noquiesced(q)) >>>>> + continue; >>>>> + >>>>> + init_rcu_head(&rcu[i].head); >>>>> + init_completion(&rcu[i].completion); >>>>> + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); >>>>> + i++; >>>>> + } >>>>> + >>>>> + for (i = 0; i < count; i++) { >>>>> + wait_for_completion(&rcu[i].completion); >>>>> + destroy_rcu_head(&rcu[i].head); >>>>> + } >>>>> + kvfree(rcu); >>>>> + } else { >>>>> + list_for_each_entry(q, &set->tag_list, tag_set_list) >>>>> + synchronize_srcu(q->srcu); >>>>> + } >>>> >>>> Having to allocate a struct rcu_synchronize for each of the potentially >>>> many queues here is a bit sad. >>>> >>>> Pull just explained the start_poll_synchronize_rcu interfaces at ALPSS >>>> last week, so I wonder if something like that would also be feasible >>>> for SRCU, as that would come in really handy here. >>> >>> There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(), >>> but there would need to be an unsigned long for each srcu_struct from >>> which an SRCU grace period was required. This would be half the size >>> of the "rcu" array above, but still maybe larger than you would like. 
>>> >>> The resulting code might look something like this, with "rcu" now being >>> a pointer to unsigned long: >>> >>> rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); >>> if (rcu) { >>> list_for_each_entry(q, &set->tag_list, tag_set_list) { >>> if (blk_queue_noquiesced(q)) >>> continue; >>> rcu[i] = start_poll_synchronize_srcu(q->srcu); >>> i++; >>> } >>> >>> for (i = 0; i < count; i++) >>> if (!poll_state_synchronize_srcu(q->srcu)) >>> synchronize_srcu(q->srcu); >> synchronize_srcu will restart a new period of grace. > > True, but SRCU grace periods normally complete reasonably quickly, so > the synchronize_srcu() might well be faster than the loop, depending on > what the corresponding SRCU readers are doing. Yes, Different runtimes have different wait times, and it's hard to say which is better or worse. > >> Maybe it would be better like this: >> while (!poll_state_synchronize_srcu(q->srcu, rcu[i])) >> schedule_timeout_uninterruptible(1); > > Why not try it both ways and see what happens? Assuming that is, that > the differences matter in this code. > > Thanx, Paul > >>> kvfree(rcu); >>> } else { >>> list_for_each_entry(q, &set->tag_list, tag_set_list) >>> synchronize_srcu(q->srcu); >>> } >>> >>> Or as Christoph suggested, just have a single srcu_struct for the >>> whole group. >>> >>> The main reason for having multiple srcu_struct structures is to >>> prevent the readers from one from holding up the updaters from another. >>> Except that by waiting for the multiple grace periods, you are losing >>> that property anyway, correct? Or is this code waiting on only a small >>> fraction of the srcu_struct structures associated with blk_queue? >>> >>> Thanx, Paul >>> . >>> > . >
>>> Then the big question is "how long do the SRCU readers run?" >>> >>> If all of the readers ran for exactly the same duration, there would be >>> little point in having more than one srcu_struct. >> >> The SRCU readers are the I/O dispatch, which will have quite similar >> runtimes for the different queues. >> >>> If the kernel knew up front how long the SRCU readers for a given entry >>> would run, it could provide an srcu_struct structure for each duration. >>> For a (fanciful) example, you could have one srcu_struct structure for >>> SSDs, another for rotating rust, a third for LAN-attached storage, and >>> a fourth for WAN-attached storage. Maybe a fifth for lunar-based storage. >> >> All the different request_queues in a tag_set are for the same device. >> There might be some corner cases like storage arrays where they have >> different latencies. But we're not even waiting for the I/O completion >> here, this just protects the submission. >> >>> Does that help, or am I off in the weeds here? >> >> I think this was very helpful, and at least to be moving the srcu_struct >> to the tag_set sounds like a good idea to explore. >> >> Ming, anything I might have missed? > > I think it is fine to move it to tag_set, this way could simplify a > lot. > > The effect could be that blk_mq_quiesce_queue() becomes a little > slow, but it is always in slow path, and synchronize_srcu() won't > wait new read-side critical-section. > > Just one corner case, in case of BLK_MQ_F_BLOCKING, is there any such driver > which may block too long in ->queue_rq() sometime? such as wait for dozens > of seconds. nvme-tcp will opportunistically try a network send directly from .queue_rq(), but always with MSG_DONTWAIT, so that is not a problem. nbd though can block in .queue_rq() with blocking network sends, however afaict nbd allocates a tagset per nbd_device, and also a request_queue per device, so its effectively the same if the srcu is in the tagset or in the request_queue.
On Wed, Oct 19, 2022 at 10:15:26AM +0300, Sagi Grimberg wrote: > nvme-tcp will opportunistically try a network send directly from > .queue_rq(), but always with MSG_DONTWAIT, so that is not a problem. > > nbd though can block in .queue_rq() with blocking network sends, however > afaict nbd allocates a tagset per nbd_device, and also a request_queue > per device, so its effectively the same if the srcu is in the tagset or > in the request_queue. Yes, the only multiple request_queue per tag_set cases right now are nvme-tcp and mmc. There have been patches from iSCSI, but they seem to be stuck.
On Wed, Oct 19, 2022 at 09:25:35AM +0200, Christoph Hellwig wrote: > Yes, the only multiple request_queue per tag_set cases right now > are nvme-tcp and mmc. There have been patches from iSCSI, but they > seem to be stuck. ... and looking at the only blk_mq_quiesce_queue caller in mmc, it would benefit from a tagset-wide quiesce after a bit of refactoring as well.
>> nvme-tcp will opportunistically try a network send directly from >> .queue_rq(), but always with MSG_DONTWAIT, so that is not a problem. >> >> nbd though can block in .queue_rq() with blocking network sends, however >> afaict nbd allocates a tagset per nbd_device, and also a request_queue >> per device, so its effectively the same if the srcu is in the tagset or >> in the request_queue. > > Yes, the only multiple request_queue per tag_set cases right now > are nvme-tcp and mmc. There have been patches from iSCSI, but they > seem to be stuck. iscsi defers network sends to a workqueue anyways, and no scsi lld sets BLK_MQ_F_BLOCKING.
On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote: > iscsi defers network sends to a workqueue anyways, and no scsi lld sets > BLK_MQ_F_BLOCKING. Yes, but Mike had a series to implement a similar fast path as nvme-tcp a while ago that included supporting BLK_MQ_F_BLOCKING in the scsi midlayer. We'll also need it to convert the paride drivers to libata, but those are single request_queue per tag_set as far as I know.
On 10/19/22 10:32, Christoph Hellwig wrote: > On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote: >> iscsi defers network sends to a workqueue anyways, and no scsi lld sets >> BLK_MQ_F_BLOCKING. > > Yes, but Mike had a series to implement a similar fast path as nvme-tcp > a while ago that included supporting BLK_MQ_F_BLOCKING in the scsi > midlayer. Hmmm, didn't see that, pointer? > We'll also need it to convert the paride drivers to libata, > but those are single request_queue per tag_set as far as I know. didn't see paride even attempting to quiesce...
On Wed, Oct 19, 2022 at 10:57:11AM +0300, Sagi Grimberg wrote: > > > On 10/19/22 10:32, Christoph Hellwig wrote: >> On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote: >>> iscsi defers network sends to a workqueue anyways, and no scsi lld sets >>> BLK_MQ_F_BLOCKING. >> >> Yes, but Mike had a series to implement a similar fast path as nvme-tcp >> a while ago that included supporting BLK_MQ_F_BLOCKING in the scsi >> midlayer. > > Hmmm, didn't see that, pointer? https://www.spinics.net/lists/linux-scsi/msg170849.html >> We'll also need it to convert the paride drivers to libata, >> but those are single request_queue per tag_set as far as I know. > > didn't see paride even attempting to quiesce... The block layer might. But it really does not matter for our considerations here..
>>>> iscsi defers network sends to a workqueue anyways, and no scsi lld sets >>>> BLK_MQ_F_BLOCKING. >>> >>> Yes, but Mike had a series to implement a similar fast path as nvme-tcp >>> a while ago that included supporting BLK_MQ_F_BLOCKING in the scsi >>> midlayer. >> >> Hmmm, didn't see that, pointer? > > https://www.spinics.net/lists/linux-scsi/msg170849.html Nice. >>> We'll also need it to convert the paride drivers to libata, >>> but those are single request_queue per tag_set as far as I know. >> >> didn't see paride even attempting to quiesce... > > The block layer might. But it really does not matter for our > considerations here.. Right.
diff --git a/block/blk-mq.c b/block/blk-mq.c index 8070b6c10e8d..ebe25da08156 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -29,6 +29,7 @@ #include <linux/prefetch.h> #include <linux/blk-crypto.h> #include <linux/part_stat.h> +#include <linux/rcupdate_wait.h> #include <trace/events/block.h> @@ -311,6 +312,80 @@ void blk_mq_unquiesce_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); +static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set) +{ + int i = 0; + int count = 0; + struct request_queue *q; + struct rcu_synchronize *rcu; + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (blk_queue_noquiesced(q)) + continue; + + blk_mq_quiesce_queue_nowait(q); + count++; + } + + rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL); + if (rcu) { + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (blk_queue_noquiesced(q)) + continue; + + init_rcu_head(&rcu[i].head); + init_completion(&rcu[i].completion); + call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu); + i++; + } + + for (i = 0; i < count; i++) { + wait_for_completion(&rcu[i].completion); + destroy_rcu_head(&rcu[i].head); + } + kvfree(rcu); + } else { + list_for_each_entry(q, &set->tag_list, tag_set_list) + synchronize_srcu(q->srcu); + } +} + +static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (blk_queue_noquiesced(q)) + continue; + + blk_mq_quiesce_queue_nowait(q); + } + synchronize_rcu(); +} + +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) +{ + mutex_lock(&set->tag_list_lock); + if (set->flags & BLK_MQ_F_BLOCKING) + blk_mq_quiesce_blocking_tagset(set); + else + blk_mq_quiesce_nonblocking_tagset(set); + + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); + +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + 
list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_unquiesce_queue(q); + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ba18e9bdb799..1df47606d0a7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -877,6 +877,8 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_wait_quiesce_done(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50e358a19d98..f15544299a67 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -579,6 +579,7 @@ struct request_queue { #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ +#define QUEUE_FLAG_NOQUIESCED 31 /* queue is never quiesced */ #define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ (1UL << QUEUE_FLAG_SAME_COMP) | \ @@ -619,6 +620,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) +#define blk_queue_noquiesced(q) test_bit(QUEUE_FLAG_NOQUIESCED, &(q)->queue_flags) extern void blk_set_pm_only(struct 
request_queue *q); extern void blk_clear_pm_only(struct request_queue *q);