diff mbox series

[RFC,v7,06/12] blk-mq: Record active_queues_shared_sbitmap per tag_set for when using shared sbitmap

Message ID 1591810159-240929-7-git-send-email-john.garry@huawei.com (mailing list archive)
State New, archived
Headers show
Series blk-mq/scsi: Provide hostwide shared tags for SCSI HBAs | expand

Commit Message

John Garry June 10, 2020, 5:29 p.m. UTC
For when using a shared sbitmap, no longer should the number of active
request queues per hctx be relied on for when judging how to share the tag
bitmap.

Instead maintain the number of active request queues per tag_set, and make
the judgment based on that.

And since the blk_mq_tags.active_queues is no longer maintained, do not
show it in debugfs.

Originally-from: Kashyap Desai <kashyap.desai@broadcom.com>
Signed-off-by: John Garry <john.garry@huawei.com>
---
 block/blk-mq-debugfs.c | 25 ++++++++++++++++++++--
 block/blk-mq-tag.c     | 47 ++++++++++++++++++++++++++++++++----------
 block/blk-mq.c         |  2 ++
 include/linux/blk-mq.h |  1 +
 include/linux/blkdev.h |  1 +
 5 files changed, 63 insertions(+), 13 deletions(-)

Comments

Hannes Reinecke June 11, 2020, 1:16 p.m. UTC | #1
On 6/10/20 7:29 PM, John Garry wrote:
> For when using a shared sbitmap, no longer should the number of active
> request queues per hctx be relied on for when judging how to share the tag
> bitmap.
> 
> Instead maintain the number of active request queues per tag_set, and make
> the judgment based on that.
> 
> And since the blk_mq_tags.active_queues is no longer maintained, do not
> show it in debugfs.
> 
> Originally-from: Kashyap Desai <kashyap.desai@broadcom.com>
> Signed-off-by: John Garry <john.garry@huawei.com>
> ---
>   block/blk-mq-debugfs.c | 25 ++++++++++++++++++++--
>   block/blk-mq-tag.c     | 47 ++++++++++++++++++++++++++++++++----------
>   block/blk-mq.c         |  2 ++
>   include/linux/blk-mq.h |  1 +
>   include/linux/blkdev.h |  1 +
>   5 files changed, 63 insertions(+), 13 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 0fa3af41ab65..05b4be0c03d9 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -458,17 +458,37 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
>   	}
>   }
>   
> +static void blk_mq_debugfs_tags_shared_sbitmap_show(struct seq_file *m,
> +				     struct blk_mq_tags *tags)
> +{
> +	seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
> +	seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
> +
> +	seq_puts(m, "\nbitmap_tags:\n");
> +	sbitmap_queue_show(tags->bitmap_tags, m);
> +
> +	if (tags->nr_reserved_tags) {
> +		seq_puts(m, "\nbreserved_tags:\n");
> +		sbitmap_queue_show(tags->breserved_tags, m);
> +	}
> +}
> +
>   static int hctx_tags_show(void *data, struct seq_file *m)
>   {
>   	struct blk_mq_hw_ctx *hctx = data;
>   	struct request_queue *q = hctx->queue;
> +	struct blk_mq_tag_set *set = q->tag_set;
>   	int res;
>   
>   	res = mutex_lock_interruptible(&q->sysfs_lock);
>   	if (res)
>   		goto out;
> -	if (hctx->tags)
> -		blk_mq_debugfs_tags_show(m, hctx->tags);
> +	if (hctx->tags) {
> +		if (blk_mq_is_sbitmap_shared(set))
> +			blk_mq_debugfs_tags_shared_sbitmap_show(m, hctx->tags);
> +		else
> +			blk_mq_debugfs_tags_show(m, hctx->tags);
> +	}
>   	mutex_unlock(&q->sysfs_lock);
>   
>   out:
> @@ -802,6 +822,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_shared_sbitmap_attrs
>   	{"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops},
>   	{"busy", 0400, hctx_busy_show},
>   	{"ctx_map", 0400, hctx_ctx_map_show},
> +	{"tags", 0400, hctx_tags_show},
>   	{"sched_tags", 0400, hctx_sched_tags_show},
>   	{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
>   	{"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},

I had been pondering this, too, when creating v6. Problem is that it'll 
show the tags per hctx, but as they are shared I guess the list looks 
pretty identical per hctx.
So I guess we should filter the tags per hctx to have only those active 
on that hctx displayed. But when doing so we can only print the 
in-flight tags, the others are not assigned to a hctx and as such we 
can't make a decision on which hctx they'll end up.

> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 7db16e49f6f6..6ca06b1c3a99 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -23,9 +23,19 @@
>    */
>   bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>   {
> -	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
> -	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> -		atomic_inc(&hctx->tags->active_queues);
> +	struct request_queue *q = hctx->queue;
> +	struct blk_mq_tag_set *set = q->tag_set;
> +
> +	if (blk_mq_is_sbitmap_shared(set)){
> +		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
> +		    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
> +			atomic_inc(&set->active_queues_shared_sbitmap);
> +
> +	} else {
> +		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
> +		    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> +			atomic_inc(&hctx->tags->active_queues);
> +	}
>   
>   	return true;
>   }
At one point someone would need to educate me what this double 
'test_bit' and 'test_and_set_bit' is supposed to achieve.
Other than deliberately injecting a race condition ...

> @@ -47,11 +57,19 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
>   void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
>   {
>   	struct blk_mq_tags *tags = hctx->tags;
> -
> -	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> -		return;
> -
> -	atomic_dec(&tags->active_queues);
> +	struct request_queue *q = hctx->queue;
> +	struct blk_mq_tag_set *set = q->tag_set;
> +
> +	if (blk_mq_is_sbitmap_shared(q->tag_set)){
> +		if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
> +					&q->queue_flags))
> +			return;
> +		atomic_dec(&set->active_queues_shared_sbitmap);
> +	} else {
> +		if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> +			return;
> +		atomic_dec(&tags->active_queues);
> +	}
>   
>   	blk_mq_tag_wakeup_all(tags, false);
>   }
> @@ -65,12 +83,11 @@ static inline bool hctx_may_queue(struct blk_mq_alloc_data *data,
>   {
>   	struct blk_mq_hw_ctx *hctx = data->hctx;
>   	struct request_queue *q = data->q;
> +	struct blk_mq_tag_set *set = q->tag_set;
>   	unsigned int depth, users;
>   
>   	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
>   		return true;
> -	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> -		return true;
>   
>   	/*
>   	 * Don't try dividing an ant
> @@ -78,7 +95,15 @@ static inline bool hctx_may_queue(struct blk_mq_alloc_data *data,
>   	if (bt->sb.depth == 1)
>   		return true;
>   
> -	users = atomic_read(&hctx->tags->active_queues);
> +	if (blk_mq_is_sbitmap_shared(q->tag_set)) {
> +		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &q->queue_flags))
> +			return true;
> +		users = atomic_read(&set->active_queues_shared_sbitmap);
> +	} else {
> +		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> +			return true;
> +		users = atomic_read(&hctx->tags->active_queues);
> +	}
>   	if (!users)
>   		return true;
>   
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0f7e062a1665..f73a2f9c58bd 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3350,6 +3350,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
>   		goto out_free_mq_map;
>   
>   	if (blk_mq_is_sbitmap_shared(set)) {
> +		atomic_set(&set->active_queues_shared_sbitmap, 0);
> +
>   		if (!blk_mq_init_shared_sbitmap(set)) {
>   			ret = -ENOMEM;
>   			goto out_free_mq_rq_maps;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 7b31cdb92a71..66711c7234db 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -252,6 +252,7 @@ struct blk_mq_tag_set {
>   	unsigned int		timeout;
>   	unsigned int		flags;
>   	void			*driver_data;
> +	atomic_t		active_queues_shared_sbitmap;
>   
>   	struct sbitmap_queue	__bitmap_tags;
>   	struct sbitmap_queue	__breserved_tags;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c536278bec9e..1b0087e8d01a 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -619,6 +619,7 @@ struct request_queue {
>   #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
>   #define QUEUE_FLAG_ZONE_RESETALL 26	/* supports Zone Reset All */
>   #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
> +#define QUEUE_FLAG_HCTX_ACTIVE 28	/* at least one blk-mq hctx is active */
>   
>   #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
>   				 (1 << QUEUE_FLAG_SAME_COMP))
> 
Other than that it looks fine.

Cheers,

Hannes
John Garry June 11, 2020, 2:22 p.m. UTC | #2
>>   out:
>> @@ -802,6 +822,7 @@ static const struct blk_mq_debugfs_attr 
>> blk_mq_debugfs_hctx_shared_sbitmap_attrs
>>       {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops},
>>       {"busy", 0400, hctx_busy_show},
>>       {"ctx_map", 0400, hctx_ctx_map_show},
>> +    {"tags", 0400, hctx_tags_show},
>>       {"sched_tags", 0400, hctx_sched_tags_show},
>>       {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
>>       {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
> 
> I had been pondering this, too, when creating v6. Problem is that it'll 
> show the tags per hctx, but as they are shared I guess the list looks 
> pretty identical per hctx.

Right, so my main concern in this series is debugfs, and how to present 
the tag/hctx info. We can hide things under the hood mostly, but not so 
much for debugfs.

> So I guess we should filter the tags per hctx to have only those active 
> on that hctx displayed. But when doing so we can only print the 
> in-flight tags, the others are not assigned to a hctx and as such we 
> can't make a decision on which hctx they'll end up.

So we filter the tags in 7/12, and that should be ok.

But a concern is that we present identical hctx_tags_show() -> 
blk_mq_debugfs_shared_sitmap_show() -> sbitmap_queue_show() per-hctx 
info, which may be inappropriate/misleading/wrong. I need to audit that 
more thoroughly.

> 
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index 7db16e49f6f6..6ca06b1c3a99 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -23,9 +23,19 @@
>>    */
>>   bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>>   {
>> -    if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
>> -        !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
>> -        atomic_inc(&hctx->tags->active_queues);
>> +    struct request_queue *q = hctx->queue;
>> +    struct blk_mq_tag_set *set = q->tag_set;
>> +
>> +    if (blk_mq_is_sbitmap_shared(set)){
>> +        if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
>> +            !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
>> +            atomic_inc(&set->active_queues_shared_sbitmap);
>> +
>> +    } else {
>> +        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
>> +            !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
>> +            atomic_inc(&hctx->tags->active_queues);
>> +    }
>>       return true;
>>   }
> At one point someone would need to educate me what this double 
> 'test_bit' and 'test_and_set_bit' is supposed to achieve.
> Other than deliberately injecting a race condition ...

As I see, it's not a dangerous race, and meant as a performance 
optimization.

test_bit() should be is quick compared to test_and_set_bit(), so nicer 
to avoid always doing a test_and_set_bit().

So if we race with failed test_bit() calls and have multiple attempts 
with the test_and_set_bit(), only 1 will succeed and do the increment.

> 
>> @@ -47,11 +57,19 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags 
>> *tags, bool include_reserve)
>>   void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
>>   {
>>       struct blk_mq_tags *tags = hctx->tags;
>> -
>> -    if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
>> -        return;
>> -
>> -    atomic_dec(&tags->active_queues);
>> +    struct request_queue *q = hctx->queue;
>> +    struct blk_mq_tag_set *set = q->tag_set;
>> +
>> +    if (blk_mq_is_sbitmap_shared(q->tag_set)){
>> +        if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
>> +                    &q->queue_flags))
>> +            return;
>> +        atomic_dec(&set->active_queues_shared_sbitmap);
>> +    } else {
>> +        if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
>> +            return;
>> +        atomic_dec(&tags->active_queues);
>> +    }
>>       blk_mq_tag_wakeup_all(tags, false);
>>   }
>> @@ -65,12 +83,11 @@ static inline bool hctx_may_queue(struct 
>> blk_mq_alloc_data *data,
>>   {
>>       struct blk_mq_hw_ctx *hctx = data->hctx;
>>       struct request_queue *q = data->q;
>> +    struct blk_mq_tag_set *set = q->tag_set;
>>       unsigned int depth, users;
>>       if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
>>           return true;
>> -    if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
>> -        return true;
>>       /*
>>        * Don't try dividing an ant
>> @@ -78,7 +95,15 @@ static inline bool hctx_may_queue(struct 
>> blk_mq_alloc_data *data,
>>       if (bt->sb.depth == 1)
>>           return true;
>> -    users = atomic_read(&hctx->tags->active_queues);
>> +    if (blk_mq_is_sbitmap_shared(q->tag_set)) {
>> +        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &q->queue_flags))
>> +            return true;
>> +        users = atomic_read(&set->active_queues_shared_sbitmap);
>> +    } else {
>> +        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
>> +            return true;
>> +        users = atomic_read(&hctx->tags->active_queues);
>> +    }
>>       if (!users)
>>           return true;
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 0f7e062a1665..f73a2f9c58bd 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -3350,6 +3350,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set 
>> *set)
>>           goto out_free_mq_map;
>>       if (blk_mq_is_sbitmap_shared(set)) {
>> +        atomic_set(&set->active_queues_shared_sbitmap, 0);
>> +
>>           if (!blk_mq_init_shared_sbitmap(set)) {
>>               ret = -ENOMEM;
>>               goto out_free_mq_rq_maps;
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
>> index 7b31cdb92a71..66711c7234db 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -252,6 +252,7 @@ struct blk_mq_tag_set {
>>       unsigned int        timeout;
>>       unsigned int        flags;
>>       void            *driver_data;
>> +    atomic_t        active_queues_shared_sbitmap;

I'm not sure if we should present this in debugfs. It is not according 
to this series.

>>       struct sbitmap_queue    __bitmap_tags;
>>       struct sbitmap_queue    __breserved_tags;
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index c536278bec9e..1b0087e8d01a 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -619,6 +619,7 @@ struct request_queue {
>>   #define QUEUE_FLAG_PCI_P2PDMA    25    /* device supports PCI p2p 
>> requests */
>>   #define QUEUE_FLAG_ZONE_RESETALL 26    /* supports Zone Reset All */
>>   #define QUEUE_FLAG_RQ_ALLOC_TIME 27    /* record rq->alloc_time_ns */
>> +#define QUEUE_FLAG_HCTX_ACTIVE 28    /* at least one blk-mq hctx is 
>> active */
>>   #define QUEUE_FLAG_MQ_DEFAULT    ((1 << QUEUE_FLAG_IO_STAT) |        \
>>                    (1 << QUEUE_FLAG_SAME_COMP))
>>
> Other than that it looks fine.

Cheers
diff mbox series

Patch

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 0fa3af41ab65..05b4be0c03d9 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -458,17 +458,37 @@  static void blk_mq_debugfs_tags_show(struct seq_file *m,
 	}
 }
 
+static void blk_mq_debugfs_tags_shared_sbitmap_show(struct seq_file *m,
+				     struct blk_mq_tags *tags)
+{
+	seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
+	seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
+
+	seq_puts(m, "\nbitmap_tags:\n");
+	sbitmap_queue_show(tags->bitmap_tags, m);
+
+	if (tags->nr_reserved_tags) {
+		seq_puts(m, "\nbreserved_tags:\n");
+		sbitmap_queue_show(tags->breserved_tags, m);
+	}
+}
+
 static int hctx_tags_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_hw_ctx *hctx = data;
 	struct request_queue *q = hctx->queue;
+	struct blk_mq_tag_set *set = q->tag_set;
 	int res;
 
 	res = mutex_lock_interruptible(&q->sysfs_lock);
 	if (res)
 		goto out;
-	if (hctx->tags)
-		blk_mq_debugfs_tags_show(m, hctx->tags);
+	if (hctx->tags) {
+		if (blk_mq_is_sbitmap_shared(set))
+			blk_mq_debugfs_tags_shared_sbitmap_show(m, hctx->tags);
+		else
+			blk_mq_debugfs_tags_show(m, hctx->tags);
+	}
 	mutex_unlock(&q->sysfs_lock);
 
 out:
@@ -802,6 +822,7 @@  static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_shared_sbitmap_attrs
 	{"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops},
 	{"busy", 0400, hctx_busy_show},
 	{"ctx_map", 0400, hctx_ctx_map_show},
+	{"tags", 0400, hctx_tags_show},
 	{"sched_tags", 0400, hctx_sched_tags_show},
 	{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
 	{"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 7db16e49f6f6..6ca06b1c3a99 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -23,9 +23,19 @@ 
  */
 bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
-	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
-	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
-		atomic_inc(&hctx->tags->active_queues);
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	if (blk_mq_is_sbitmap_shared(set)){
+		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
+		    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
+			atomic_inc(&set->active_queues_shared_sbitmap);
+
+	} else {
+		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
+		    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+			atomic_inc(&hctx->tags->active_queues);
+	}
 
 	return true;
 }
@@ -47,11 +57,19 @@  void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
 	struct blk_mq_tags *tags = hctx->tags;
-
-	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
-		return;
-
-	atomic_dec(&tags->active_queues);
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	if (blk_mq_is_sbitmap_shared(q->tag_set)){
+		if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
+					&q->queue_flags))
+			return;
+		atomic_dec(&set->active_queues_shared_sbitmap);
+	} else {
+		if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+			return;
+		atomic_dec(&tags->active_queues);
+	}
 
 	blk_mq_tag_wakeup_all(tags, false);
 }
@@ -65,12 +83,11 @@  static inline bool hctx_may_queue(struct blk_mq_alloc_data *data,
 {
 	struct blk_mq_hw_ctx *hctx = data->hctx;
 	struct request_queue *q = data->q;
+	struct blk_mq_tag_set *set = q->tag_set;
 	unsigned int depth, users;
 
 	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
 		return true;
-	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
-		return true;
 
 	/*
 	 * Don't try dividing an ant
@@ -78,7 +95,15 @@  static inline bool hctx_may_queue(struct blk_mq_alloc_data *data,
 	if (bt->sb.depth == 1)
 		return true;
 
-	users = atomic_read(&hctx->tags->active_queues);
+	if (blk_mq_is_sbitmap_shared(q->tag_set)) {
+		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &q->queue_flags))
+			return true;
+		users = atomic_read(&set->active_queues_shared_sbitmap);
+	} else {
+		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+			return true;
+		users = atomic_read(&hctx->tags->active_queues);
+	}
 	if (!users)
 		return true;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0f7e062a1665..f73a2f9c58bd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3350,6 +3350,8 @@  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		goto out_free_mq_map;
 
 	if (blk_mq_is_sbitmap_shared(set)) {
+		atomic_set(&set->active_queues_shared_sbitmap, 0);
+
 		if (!blk_mq_init_shared_sbitmap(set)) {
 			ret = -ENOMEM;
 			goto out_free_mq_rq_maps;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7b31cdb92a71..66711c7234db 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -252,6 +252,7 @@  struct blk_mq_tag_set {
 	unsigned int		timeout;
 	unsigned int		flags;
 	void			*driver_data;
+	atomic_t		active_queues_shared_sbitmap;
 
 	struct sbitmap_queue	__bitmap_tags;
 	struct sbitmap_queue	__breserved_tags;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c536278bec9e..1b0087e8d01a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -619,6 +619,7 @@  struct request_queue {
 #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
 #define QUEUE_FLAG_ZONE_RESETALL 26	/* supports Zone Reset All */
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
+#define QUEUE_FLAG_HCTX_ACTIVE 28	/* at least one blk-mq hctx is active */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_SAME_COMP))