blk-mq: allow hardware queue to get more tag while sharing a tag set

Message ID 20210712031818.31918-1-yukuai3@huawei.com (mailing list archive)
State New, archived
Series blk-mq: allow hardware queue to get more tag while sharing a tag set

Commit Message

Yu Kuai July 12, 2021, 3:18 a.m. UTC
If multiple active queues share a tag set, it is not necessary to limit each
active queue to an equal share of the available tags as long as no queue has
ever failed to get a driver tag. Fall back to equal shares once any queue
does fail to get a driver tag.

This modification is beneficial if the total queue_depth of the disks on the
same host is less than the total number of tags.
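
To make the trade-off concrete: with BLK_MQ_F_TAG_QUEUE_SHARED set today,
hctx_may_queue() caps each active queue at roughly an equal share of the tag
space. A minimal standalone sketch of that arithmetic (illustrative only; the
rounding and the minimum used here are assumptions, not the literal kernel
code):

#include <stdio.h>

/*
 * Hypothetical model of the per-queue cap implied by "equal share for
 * each active queue".  The real check lives in hctx_may_queue().
 */
static unsigned int fair_share(unsigned int total_tags, unsigned int active)
{
	unsigned int share = (total_tags + active - 1) / active;

	return share < 4 ? 4 : share;	/* never starve a queue completely */
}

int main(void)
{
	unsigned int tags = 1024;	/* hypothetical shared tag set depth */
	unsigned int active;

	for (active = 1; active <= 16; active *= 2)
		printf("%2u active queues -> cap of %4u tags each\n",
		       active, fair_share(tags, active));
	return 0;
}

With this patch the cap is only enforced once some queue has actually failed
to get a driver tag; until then every queue may allocate tags freely.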

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq-debugfs.c |  2 ++
 block/blk-mq-tag.c     | 43 +++++++++++++++++++++++++++++++++++++++++-
 block/blk-mq-tag.h     | 27 ++++++++++++++++++++++++--
 block/blk-mq.c         | 13 ++++++++++---
 block/blk-mq.h         |  8 ++++++++
 include/linux/blk-mq.h |  4 ++++
 include/linux/blkdev.h |  1 +
 7 files changed, 92 insertions(+), 6 deletions(-)

Comments

Yu Kuai July 12, 2021, 3:19 a.m. UTC | #1
On 2021/07/12 11:18, Yu Kuai wrote:
> If there are multiple active queues while sharing a tag set, it's not
> necessary to limit the available tags as same share for each active queue
> if no one ever failed to get driver tag. And fall back to same share if
> someone do failed to get driver tag.
> 
> This modification will be beneficial if total queue_depth of disks
> on the same host is less than total tags.

Sorry, this should be "more than"; I'll correct it in the next iteration
if anyone thinks this patch is acceptable.

Thanks,
Yu Kuai
Yu Kuai July 20, 2021, 12:33 p.m. UTC | #2
On 2021/07/12 11:18, Yu Kuai wrote:
> If there are multiple active queues while sharing a tag set, it's not
> necessary to limit the available tags as same share for each active queue
> if no one ever failed to get driver tag. And fall back to same share if
> someone do failed to get driver tag.
> 
> This modification will be beneficial if total queue_depth of disks
> on the same host is less than total tags.

ping ...
Yu Kuai July 31, 2021, 7:13 a.m. UTC | #3
On 2021/07/20 20:33, yukuai (C) wrote:
> On 2021/07/12 11:18, Yu Kuai wrote:
>> If there are multiple active queues while sharing a tag set, it's not
>> necessary to limit the available tags as same share for each active queue
>> if no one ever failed to get driver tag. And fall back to same share if
>> someone do failed to get driver tag.
>>
>> This modification will be beneficial if total queue_depth of disks
>> on the same host is less than total tags.
> 
> ping ...

ping ...
Bart Van Assche July 31, 2021, 5:15 p.m. UTC | #4
On 7/11/21 8:18 PM, Yu Kuai wrote:
> If there are multiple active queues while sharing a tag set, it's not
> necessary to limit the available tags as same share for each active queue
> if no one ever failed to get driver tag. And fall back to same share if
> someone do failed to get driver tag.
> 
> This modification will be beneficial if total queue_depth of disks
> on the same host is less than total tags.

This patch adds new atomic operations in the hot path and hence probably
has a negative performance impact. What is the performance impact of
this patch for e.g. null_blk when submitting I/O from all CPU cores?

Thanks,

Bart.
Yu Kuai Aug. 2, 2021, 1:34 p.m. UTC | #5
On 2021/08/01 1:15, Bart Van Assche wrote:
> On 7/11/21 8:18 PM, Yu Kuai wrote:
>> If there are multiple active queues while sharing a tag set, it's not
>> necessary to limit the available tags as same share for each active queue
>> if no one ever failed to get driver tag. And fall back to same share if
>> someone do failed to get driver tag.
>>
>> This modification will be beneficial if total queue_depth of disks
>> on the same host is less than total tags.
> 
> This patch adds new atomic operations in the hot path and hence probably
> has a negative performance impact. What is the performance impact of
> this patch for e.g. null_blk when submitting I/O from all CPU cores?
> 
> Thanks,
> 
> Bart.
> .
> 

Hi, Bart

I ran a test on both null_blk and nvme; the results show no performance
degradation:

test platform: x86
test cpu: 2 nodes, total 72
test scheduler: none
test device: null_blk / nvme

test cmd: fio -filename=/dev/xxx -name=test -ioengine=libaio -direct=1
-numjobs=72 -iodepth=16 -bs=4k -rw=write -offset_increment=1G
-cpus_allowed=0:71 -cpus_allowed_policy=split -group_reporting
-runtime=120

test results: iops
1) null_blk before this patch: 280k
2) null_blk after this patch: 282k
3) nvme before this patch: 378k
4) nvme after this patch: 384k
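
In relative terms that is roughly +0.7% for null_blk (282k vs 280k) and
+1.6% for nvme (384k vs 378k).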

details:
1) null_blk before this patch:
test: (g=0): rw=write, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=libaio, iodepth=16
...
fio-3.13-42-g8066f
Starting 72 processes
Jobs: 72 (f=72): [W(72)][100.0%][w=1095MiB/s][w=280k IOPS][eta 00m:00s]
test: (groupid=0, jobs=72): err= 0: pid=4986: Mon Aug  2 11:25:33 2021
   write: IOPS=279k, BW=1091MiB/s (1144MB/s)(128GiB/120009msec); 0 zone 
resets
     slat (nsec): min=1069, max=1837.6M, avg=240280.55, stdev=3604257.00
     clat (usec): min=89, max=1837.9k, avg=3882.70, stdev=13528.70
      lat (usec): min=175, max=1837.9k, avg=4123.03, stdev=13939.66
     clat percentiles (usec):
      |  1.00th=[   223],  5.00th=[   223], 10.00th=[   225], 20.00th=[ 
231],
      | 30.00th=[   478], 40.00th=[   873], 50.00th=[  1811], 60.00th=[ 
2737],
      | 70.00th=[  4293], 80.00th=[  6915], 90.00th=[  9372], 95.00th=[ 
12780],
      | 99.00th=[ 18482], 99.50th=[ 22676], 99.90th=[ 62129], 
99.95th=[231736],
      | 99.99th=[641729]
    bw (  MiB/s): min=   32, max= 3681, per=100.00%, avg=1106.55, 
stdev=25.25, samples=17006
    iops        : min= 8405, max=942588, avg=283276.25, stdev=6464.60, 
samples=17006
   lat (usec)   : 100=0.01%, 250=24.18%, 500=8.74%, 750=4.72%, 1000=4.01%
   lat (msec)   : 2=12.28%, 4=12.86%, 10=24.23%, 20=8.06%, 50=0.81%
   lat (msec)   : 100=0.02%, 250=0.04%, 500=0.03%, 750=0.01%, 1000=0.01%
   lat (msec)   : 2000=0.01%
   cpu          : usr=0.35%, sys=0.79%, ctx=35473919, majf=0, minf=1419
   IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, 
 >=64=0.0%
      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, 
 >=64=0.0%
      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, 
 >=64=0.0%
      issued rwts: total=0,33525688,0,0 short=0,0,0,0 dropped=0,0,0,0
      latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   WRITE: bw=1091MiB/s (1144MB/s), 1091MiB/s-1091MiB/s 
(1144MB/s-1144MB/s), io=128GiB (137GB), run=120009-120009msec

Disk stats (read/write):
   nullb0: ios=0/33485328, merge=0/0, ticks=0/4817009, in_queue=4817009, 
util=100.00%

2) null_blk after this patch:
test: (g=0): rw=write, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=libaio, iodepth=16
...
fio-3.13-42-g8066f
Starting 72 processes
Jobs: 72 (f=72): [W(72)][100.0%][w=1101MiB/s][w=282k IOPS][eta 00m:00s]
test: (groupid=0, jobs=72): err= 0: pid=5001: Mon Aug  2 10:36:52 2021
   write: IOPS=281k, BW=1097MiB/s (1150MB/s)(129GiB/120009msec); 0 zone 
resets
     slat (nsec): min=1104, max=5358.9M, avg=239050.23, stdev=4040598.71
     clat (usec): min=2, max=5359.3k, avg=3862.86, stdev=15270.20
      lat (usec): min=4, max=5359.3k, avg=4101.96, stdev=15742.32
     clat percentiles (usec):
      |  1.00th=[   221],  5.00th=[   223], 10.00th=[   225], 20.00th=[ 
231],
      | 30.00th=[   482], 40.00th=[  1106], 50.00th=[  1909], 60.00th=[ 
3163],
      | 70.00th=[  4490], 80.00th=[  5538], 90.00th=[ 10683], 95.00th=[ 
14877],
      | 99.00th=[ 16450], 99.50th=[ 19530], 99.90th=[ 30802], 99.95th=[ 
34341],
      | 99.99th=[650118]
    bw (  MiB/s): min=   23, max= 4395, per=100.00%, avg=1119.48, 
stdev=27.64, samples=16872
    iops        : min= 5906, max=1125367, avg=286585.88, stdev=7075.29, 
samples=16872
   lat (usec)   : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 250=24.77%
   lat (usec)   : 500=6.12%, 750=4.51%, 1000=3.97%
   lat (msec)   : 2=11.02%, 4=15.75%, 10=23.34%, 20=10.15%, 50=0.34%
   lat (msec)   : 100=0.01%, 250=0.01%, 500=0.01%, 750=0.01%, 1000=0.01%
   lat (msec)   : 2000=0.01%, >=2000=0.01%
   cpu          : usr=0.36%, sys=0.79%, ctx=35506798, majf=0, minf=966
   IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, 
 >=64=0.0%
      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, 
 >=64=0.0%
      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, 
 >=64=0.0%
      issued rwts: total=0,33697894,0,0 short=0,0,0,0 dropped=0,0,0,0
      latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   WRITE: bw=1097MiB/s (1150MB/s), 1097MiB/s-1097MiB/s 
(1150MB/s-1150MB/s), io=129GiB (138GB), run=120009-120009msec

Disk stats (read/write):
   nullb0: ios=0/33657152, merge=0/0, ticks=0/4812746, in_queue=4812745, 
util=100.00%

3) nvme before this patch:
test: (g=0): rw=write, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=libaio, iodepth=16
...
fio-3.13-42-g8066f
Starting 72 processes
Jobs: 72 (f=72): [W(72)][100.0%][w=1478MiB/s][w=378k IOPS][eta 00m:00s]
test: (groupid=0, jobs=72): err= 0: pid=4780: Mon Aug  2 11:22:36 2021
   write: IOPS=382k, BW=1491MiB/s (1564MB/s)(175GiB/120113msec); 0 zone 
resets
     slat (nsec): min=1234, max=328006k, avg=102467.85, stdev=4967629.26
     clat (nsec): min=1788, max=329044k, avg=2899631.83, stdev=24819488.97
      lat (usec): min=31, max=424004, avg=3004.41, stdev=25334.53
     clat percentiles (usec):
      |  1.00th=[    39],  5.00th=[    39], 10.00th=[    39], 20.00th=[ 
39],
      | 30.00th=[    40], 40.00th=[    40], 50.00th=[    40], 60.00th=[ 
40],
      | 70.00th=[    41], 80.00th=[    41], 90.00th=[    42], 95.00th=[ 
45],
      | 99.00th=[132645], 99.50th=[252707], 99.90th=[287310], 
99.95th=[291505],
      | 99.99th=[304088]
    bw (  MiB/s): min=  783, max= 2394, per=100.00%, avg=1492.49, stdev= 
5.56, samples=17278
    iops        : min=200590, max=613014, avg=382076.48, stdev=1424.37, 
samples=17278
   lat (usec)   : 2=0.01%, 4=0.01%, 20=0.01%, 50=95.89%, 100=0.06%
   lat (usec)   : 250=0.06%, 500=0.15%, 750=0.18%, 1000=0.22%
   lat (msec)   : 2=0.96%, 4=0.60%, 10=0.21%, 20=0.05%, 50=0.18%
   lat (msec)   : 100=0.27%, 250=0.65%, 500=0.51%
   cpu          : usr=0.44%, sys=0.94%, ctx=123991, majf=0, minf=988
   IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, 
 >=64=0.0%
      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, 
 >=64=0.0%
      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, 
 >=64=0.0%
      issued rwts: total=0,45859799,0,0 short=0,0,0,0 dropped=0,0,0,0
      latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   WRITE: bw=1491MiB/s (1564MB/s), 1491MiB/s-1491MiB/s 
(1564MB/s-1564MB/s), io=175GiB (188GB), run=120113-120113msec

Disk stats (read/write):
   nvme0n1: ios=308/45807739, merge=0/0, ticks=57/2334550, 
in_queue=2334607, util=100.00%

4) nvme after this patch:
after: nvme
test: (g=0): rw=write, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=libaio, iodepth=16
...
fio-3.13-42-g8066f
Starting 72 processes
Jobs: 72 (f=72): [W(72)][100.0%][w=1502MiB/s][w=384k IOPS][eta 00m:00s]
test: (groupid=0, jobs=72): err= 0: pid=5320: Mon Aug  2 10:42:07 2021
   write: IOPS=383k, BW=1496MiB/s (1569MB/s)(175GiB/120098msec); 0 zone 
resets
     slat (nsec): min=1229, max=370007k, avg=100549.47, stdev=4919208.81
     clat (nsec): min=1634, max=370050k, avg=2892105.62, stdev=24891976.05
      lat (usec): min=31, max=380005, avg=2995.16, stdev=25391.59
     clat percentiles (usec):
      |  1.00th=[    38],  5.00th=[    39], 10.00th=[    39], 20.00th=[ 
39],
      | 30.00th=[    39], 40.00th=[    40], 50.00th=[    40], 60.00th=[ 
40],
      | 70.00th=[    41], 80.00th=[    41], 90.00th=[    42], 95.00th=[ 
44],
      | 99.00th=[135267], 99.50th=[252707], 99.90th=[287310], 
99.95th=[291505],
      | 99.99th=[304088]
    bw (  MiB/s): min=  827, max= 2248, per=100.00%, avg=1496.99, stdev= 
5.51, samples=17278
    iops        : min=211931, max=575591, avg=383228.21, stdev=1411.19, 
samples=17278
   lat (usec)   : 2=0.01%, 4=0.01%, 10=0.01%, 20=0.01%, 50=95.83%
   lat (usec)   : 100=0.15%, 250=0.05%, 500=0.13%, 750=0.18%, 1000=0.21%
   lat (msec)   : 2=0.85%, 4=0.84%, 10=0.14%, 20=0.05%, 50=0.14%
   lat (msec)   : 100=0.25%, 250=0.65%, 500=0.51%
   cpu          : usr=0.43%, sys=0.95%, ctx=123368, majf=0, minf=989
   IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, 
 >=64=0.0%
      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, 
 >=64=0.0%
      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, 
 >=64=0.0%
      issued rwts: total=0,45995620,0,0 short=0,0,0,0 dropped=0,0,0,0
      latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   WRITE: bw=1496MiB/s (1569MB/s), 1496MiB/s-1496MiB/s 
(1569MB/s-1569MB/s), io=175GiB (188GB), run=120098-120098msec

Disk stats (read/write):
   nvme0n1: ios=190/45976809, merge=0/0, ticks=34/2374865, 
in_queue=2374900, util=100.00%

Thanks
Kuai
Bart Van Assche Aug. 2, 2021, 4:17 p.m. UTC | #6
On 8/2/21 6:34 AM, yukuai (C) wrote:
> I run a test on both null_blk and nvme, results show that there are no
> performance degradation:
> 
> test platform: x86
> test cpu: 2 nodes, total 72
> test scheduler: none
> test device: null_blk / nvme
> 
> test cmd: fio -filename=/dev/xxx -name=test -ioengine=libaio -direct=1
> -numjobs=72 -iodepth=16 -bs=4k -rw=write -offset_increment=1G
> -cpus_allowed=0:71 -cpus_allowed_policy=split -group_reporting
> -runtime=120
> 
> test results: iops
> 1) null_blk before this patch: 280k
> 2) null_blk after this patch: 282k
> 3) nvme before this patch: 378k
> 4) nvme after this patch: 384k

Please use io_uring for performance tests.

The null_blk numbers seem way too low to me. If I run a null_blk 
performance test inside a VM with 6 CPU cores (Xeon W-2135 CPU) I see 
about 6 million IOPS for synchronous I/O and about 4.4 million IOPS when 
using libaio. The options I used that are not in the above command line 
are: --thread --gtod_reduce=1 --ioscheduler=none.

Thanks,

Bart.
Yu Kuai Aug. 3, 2021, 2:57 a.m. UTC | #7
On 2021/08/03 0:17, Bart Van Assche wrote:
> On 8/2/21 6:34 AM, yukuai (C) wrote:
>> I run a test on both null_blk and nvme, results show that there are no
>> performance degradation:
>>
>> test platform: x86
>> test cpu: 2 nodes, total 72
>> test scheduler: none
>> test device: null_blk / nvme
>>
>> test cmd: fio -filename=/dev/xxx -name=test -ioengine=libaio -direct=1
>> -numjobs=72 -iodepth=16 -bs=4k -rw=write -offset_increment=1G
>> -cpus_allowed=0:71 -cpus_allowed_policy=split -group_reporting
>> -runtime=120
>>
>> test results: iops
>> 1) null_blk before this patch: 280k
>> 2) null_blk after this patch: 282k
>> 3) nvme before this patch: 378k
>> 4) nvme after this patch: 384k
> 
> Please use io_uring for performance tests.
> 
> The null_blk numbers seem way too low to me. If I run a null_blk 
> performance test inside a VM with 6 CPU cores (Xeon W-2135 CPU) I see 
> about 6 million IOPS for synchronous I/O and about 4.4 million IOPS when 
> using libaio. The options I used and that are not in the above command 
> line are: --thread --gtod_reduce=1 --ioscheduler=none.
> 

Hi, Bart

The CPU I'm testing on is an Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz, and
after switching to io_uring with "--thread --gtod_reduce=1
--ioscheduler=none", the numbers increase to 330k, still far behind 6000k.

The only new atomic operation in the hot path is the atomic_read() in
hctx_may_queue(), and the atomic variable changes in just two situations:

a. a driver tag allocation fails while dbusy is not set: increment the
counter and set dbusy.
b. dbusy is set when the queue switches from busy to idle: decrement the
counter and clear dbusy.

During one "idle -> busy -> idle" period of a device, the new atomic
variable is therefore written at most twice, which means it is almost
read-only in the test situation above. So I guess the impact on
performance is minimal?
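
A hypothetical userspace model of that pattern (names mirror the patch, but
this is not the kernel code): the busy/idle sides do a plain read before the
atomic RMW so that, once the state has settled, further failures cost no
extra write, and the hot-path check is a single atomic load.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool dtag_wait;		/* models the DTAG_WAIT/HCTX_WAIT flag */
static atomic_int pending_queues;	/* models tags->pending_queues         */

/* situation a: a driver tag allocation failed */
static void dtag_busy(void)
{
	if (!atomic_load_explicit(&dtag_wait, memory_order_relaxed) &&
	    !atomic_exchange(&dtag_wait, true))
		atomic_fetch_add(&pending_queues, 1);
}

/* situation b: the queue switches from busy to idle */
static void dtag_idle(void)
{
	if (atomic_exchange(&dtag_wait, false))
		atomic_fetch_sub(&pending_queues, 1);
}

/* models the new hot-path check in hctx_may_queue(): one atomic read */
static bool skip_fair_share(void)
{
	return atomic_load(&pending_queues) == 0;
}

int main(void)
{
	printf("skip limit: %d\n", skip_fair_share());	/* 1 */
	dtag_busy();	/* first failure: one RMW plus a counter increment */
	dtag_busy();	/* repeated failures: plain read only, no write    */
	printf("skip limit: %d\n", skip_fair_share());	/* 0 */
	dtag_idle();	/* back to idle: counter drops, writes stop again  */
	printf("skip limit: %d\n", skip_fair_share());	/* 1 */
	return 0;
}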

Thanks!
Kuai
Bart Van Assche Aug. 3, 2021, 6:38 p.m. UTC | #8
On 8/2/21 7:57 PM, yukuai (C) wrote:
> The cpu I'm testing is Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz, and
> after switching to io_uring with "--thread --gtod_reduce=1
> --ioscheduler=none", the numbers can increase to 330k, yet still
> far behind 6000k.

On 
https://ark.intel.com/content/www/us/en/ark/products/120485/intel-xeon-gold-6140-processor-24-75m-cache-2-30-ghz.html 
I found the following information about that CPU:
18 CPU cores
36 hyperthreads

so 36 fio jobs should be sufficient. Maybe IOPS are lower than expected 
because of how null_blk has been configured? This is the configuration 
that I used in my test:

modprobe null_blk nr_devices=0 &&
     udevadm settle &&
     cd /sys/kernel/config/nullb &&
     mkdir nullb0 &&
     cd nullb0 &&
     echo 0 > completion_nsec &&
     echo 512 > blocksize &&
     echo 0 > home_node &&
     echo 0 > irqmode &&
     echo 1024 > size &&
     echo 0 > memory_backed &&
     echo 2 > queue_mode &&
     echo 1 > power ||
     exit $?

> The new atomic operations in the hot path is atomic_read() from
> hctx_may_queue(), and the atomic variable will change in two
> situations:
> 
> a. fail to get driver tag with dbusy not set, increase and set dbusy.
> b. if dbusy is set when queue switch from busy to dile, decrease and
> clear dbusy.
> 
> During the period a device "idle -> busy -> idle", the new atomic
> variable can be writen twice at most, which means this is almost
> readonly in the above test situation. So I guess the impact on
> performance is minimal ?

Please measure the performance impact of your patch.

Thanks,

Bart.
Yu Kuai Aug. 6, 2021, 1:50 a.m. UTC | #9
On 2021/08/04 2:38, Bart Van Assche wrote:
> On 8/2/21 7:57 PM, yukuai (C) wrote:
>> The cpu I'm testing is Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz, and
>> after switching to io_uring with "--thread --gtod_reduce=1
>> --ioscheduler=none", the numbers can increase to 330k, yet still
>> far behind 6000k.
> 
> On 
> https://ark.intel.com/content/www/us/en/ark/products/120485/intel-xeon-gold-6140-processor-24-75m-cache-2-30-ghz.html 
> I found the following information about that CPU:
> 18 CPU cores
> 36 hyperthreads
> 
> so 36 fio jobs should be sufficient. Maybe IOPS are lower than expected 
> because of how null_blk has been configured? This is the configuration 
> that I used in my test:
> 
> modprobe null_blk nr_devices=0 &&
>      udevadm settle &&
>      cd /sys/kernel/config/nullb &&
>      mkdir nullb0 &&
>      cd nullb0 &&
>      echo 0 > completion_nsec &&
>      echo 512 > blocksize &&
>      echo 0 > home_node &&
>      echo 0 > irqmode &&
>      echo 1024 > size &&
>      echo 0 > memory_backed &&
>      echo 2 > queue_mode &&
>      echo 1 > power ||
>      exit $?

hi Bart,

After applying this configuration, the null_blk IOPS on my machine are
about 650k (330k before). Is this still too low?

By the way, there is no performance degradation.

Thanks
Kuai
> 
>> The new atomic operations in the hot path is atomic_read() from
>> hctx_may_queue(), and the atomic variable will change in two
>> situations:
>>
>> a. fail to get driver tag with dbusy not set, increase and set dbusy.
>> b. if dbusy is set when queue switch from busy to dile, decrease and
>> clear dbusy.
>>
>> During the period a device "idle -> busy -> idle", the new atomic
>> variable can be writen twice at most, which means this is almost
>> readonly in the above test situation. So I guess the impact on
>> performance is minimal ?
> 
> Please measure the performance impact of your patch.
> 
> Thanks,
> 
> Bart.
> 
> .
>
Bart Van Assche Aug. 6, 2021, 2:43 a.m. UTC | #10
On 8/5/21 6:50 PM, yukuai (C) wrote:
> After applying this configuration, the number of null_blk in my
> machine is about 650k(330k before). Is this still too low?

That seems low to me. If I run the attached script on a six-year-old
desktop with an eight-core i7-4790 CPU it reports a little more than 5
million IOPS. Has kernel debugging perhaps been enabled in the kernel on
the test setup? Or is the system perhaps slowed down by security
mitigations?

> By the way, there are no performance degradation.

Please wait with drawing a conclusion until you can run a workload on
your setup of several million IOPS.

Thanks,

Bart.
#!/bin/bash

if [ -e /sys/kernel/config/nullb ]; then
    for d in /sys/kernel/config/nullb/*; do
        [ -d "$d" ] && rmdir "$d"
    done
fi
numcpus=$(grep -c ^processor /proc/cpuinfo)
modprobe -r null_blk
[ -e /sys/module/null_blk ] && exit $?
modprobe null_blk nr_devices=0 &&
    udevadm settle &&
    cd /sys/kernel/config/nullb &&
    mkdir nullb0 &&
    cd nullb0 &&
    echo 0 > completion_nsec &&
    echo 512 > blocksize &&
    echo 0 > home_node &&
    echo 0 > irqmode &&
    echo 1024 > size &&
    echo 0 > memory_backed &&
    echo 2 > queue_mode &&
    echo 1 > power ||
    exit $?

(
    cd /sys/block/nullb0/queue &&
	echo 2 > rq_affinity
) || exit $?

iodepth=${1:-1}
runtime=30
args=()
if [ "$iodepth" = 1 ]; then
	args+=(--ioengine=psync)
else
	args+=(--ioengine=io_uring --iodepth_batch=$((iodepth/2)))
fi
args+=(--iodepth=$iodepth --name=nullb0 --filename=/dev/nullb0\
    --rw=read --bs=4096 --loops=$((1<<20)) --direct=1 --numjobs=$numcpus \
    --thread --runtime=$runtime --invalidate=1 --gtod_reduce=1 \
    --group_reporting=1 --ioscheduler=none)
if numactl -m 0 -N 0 echo >&/dev/null; then
	numactl -m 0 -N 0 -- fio "${args[@]}"
else
	fio "${args[@]}"
fi
Yu Kuai Aug. 14, 2021, 9:43 a.m. UTC | #11
On 2021/08/06 10:43, Bart Van Assche wrote:
> On 8/5/21 6:50 PM, yukuai (C) wrote:
>> After applying this configuration, the number of null_blk in my
>> machine is about 650k(330k before). Is this still too low?
> 
> That seems low to me. If I run the attached script on a six year old
> desktop with an eight core i7-4790 CPU it reports a little more than 5
> million IOPS. Has kernel debugging perhaps been enabled in the kernel on
> the test setup? Or is the system perhaps slowed down by security
> mitigations?
> 

Hi, Bart

Sorry for the delay. I was too busy with other things recently.

After disabling all the kernel debugging configs I can think of, the
numbers increase to several million.

setup cmd:
modprobe null_blk nr_devices=0 &&
     udevadm settle &&
     cd /sys/kernel/config/nullb &&
     mkdir nullb0 &&
     cd nullb0 &&
     echo 0 > completion_nsec &&
     echo 512 > blocksize &&
     echo 0 > home_node &&
     echo 0 > irqmode &&
     echo 1024 > size &&
     echo 0 > memory_backed &&
     echo 2 > queue_mode &&
     echo 1 > power ||
     exit $?

test cmd:
fio -filename=/dev/nullb0 -name=test -ioengine=io_uring -direct=1
-numjobs=32 -iodepth=32 -bs=4k -rw=write -group_reporting -runtime=30
--thread --gtod_reduce=1 --ioscheduler=none -time_based

test result:
| test round | with this patch | without this patch |
| ---------- | --------------- | ------------------ |
| 1          | 4310k           | 4265k              |
| 2          | 4295k           | 4327k              |
| 3          | 4217k           | 4213k              |
| 4          | 4355k           | 4236k              |
| 5          | 4315k           | 4337k              |
| average    | 4294k           | 4275k              |

Thanks
Kuai
Bart Van Assche Aug. 16, 2021, 4:16 a.m. UTC | #12
On 8/14/21 2:43 AM, yukuai (C) wrote:
> test result:
> | test round | with this patch | without this patch |
> | ---------- | --------------- | ------------------ |
> | 1          | 4310k           | 4265k              |
> | 2          | 4295k           | 4327k              |
> | 3          | 4217k           | 4213k              |
> | 4          | 4355k           | 4236k              |
> | 5          | 4315k           | 4337k              |
> | average    | 4294k           | 4275k              |

Hi Kuai,

Thank you for having taken the time to rerun the IOPS measurements with
kernel debugging disabled. According to my calculations the standard
deviation (50K) is larger than the difference between the averages
(19K). Unfortunately that makes it hard to draw any conclusion ...
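
Working that through from the per-round numbers in the table: the with-patch
column has a mean of roughly 4298k with a sample standard deviation of about
51k, and the without-patch column a mean of roughly 4276k with a standard
deviation of about 55k, so the gap between the means is well within one
standard deviation of either series.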

Bart.

Patch

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4b66d2776eda..35f1f01d93ae 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -450,6 +450,8 @@  static void blk_mq_debugfs_tags_show(struct seq_file *m,
 	seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
 	seq_printf(m, "active_queues=%d\n",
 		   atomic_read(&tags->active_queues));
+	seq_printf(m, "pending_queues=%d\n",
+		   atomic_read(&tags->pending_queues));
 
 	seq_puts(m, "\nbitmap_tags:\n");
 	sbitmap_queue_show(tags->bitmap_tags, m);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 86f87346232a..618624b359d6 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -40,6 +40,22 @@  bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 	return true;
 }
 
+void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+		struct request_queue *q = hctx->queue;
+		struct blk_mq_tag_set *set = q->tag_set;
+
+		if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) &&
+		    !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags))
+			atomic_inc(&set->pending_queues_shared_sbitmap);
+	} else {
+		if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) &&
+		    !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state))
+			atomic_inc(&hctx->tags->pending_queues);
+	}
+}
+
 /*
  * Wakeup all potentially sleeping on tags
  */
@@ -74,6 +90,24 @@  void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	blk_mq_tag_wakeup_all(tags, false);
 }
 
+void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+		if (!test_and_clear_bit(QUEUE_FLAG_HCTX_WAIT,
+					&q->queue_flags))
+			return;
+		atomic_dec(&set->pending_queues_shared_sbitmap);
+	} else {
+		if (!test_and_clear_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state))
+			return;
+		atomic_dec(&tags->pending_queues);
+	}
+}
+
 static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 			    struct sbitmap_queue *bt)
 {
@@ -112,8 +146,12 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	if (tag != BLK_MQ_NO_TAG)
 		goto found_tag;
 
-	if (data->flags & BLK_MQ_REQ_NOWAIT)
+	if (data->flags & BLK_MQ_REQ_NOWAIT) {
+		if (!data->q->elevator)
+			blk_mq_dtag_busy(data->hctx);
+
 		return BLK_MQ_NO_TAG;
+	}
 
 	ws = bt_wait_ptr(bt, data->hctx);
 	do {
@@ -140,6 +178,9 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		if (!data->q->elevator)
+			blk_mq_dtag_busy(data->hctx);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 8ed55af08427..badcf3693749 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -10,6 +10,11 @@  struct blk_mq_tags {
 	unsigned int nr_reserved_tags;
 
 	atomic_t active_queues;
+	/*
+	 * if multiple queues share a tag set, pending_queues record the
+	 * number of queues that can't get driver tag.
+	 */
+	atomic_t pending_queues;
 
 	struct sbitmap_queue *bitmap_tags;
 	struct sbitmap_queue *breserved_tags;
@@ -69,8 +74,10 @@  enum {
 	BLK_MQ_TAG_MAX		= BLK_MQ_NO_TAG - 1,
 };
 
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
-extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx);
+extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx);
+extern void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx);
+extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx);
 
 static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
@@ -88,6 +95,22 @@  static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	__blk_mq_tag_idle(hctx);
 }
 
+static inline void blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
+		return;
+
+	__blk_mq_dtag_busy(hctx);
+}
+
+static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
+		return;
+
+	__blk_mq_dtag_idle(hctx);
+}
+
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
 					  unsigned int tag)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2c4ac51e54eb..1bb52bd71da8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -991,8 +991,10 @@  static void blk_mq_timeout_work(struct work_struct *work)
 		 */
 		queue_for_each_hw_ctx(q, hctx, i) {
 			/* the hctx may be unmapped, so check it here */
-			if (blk_mq_hw_queue_mapped(hctx))
+			if (blk_mq_hw_queue_mapped(hctx)) {
 				blk_mq_tag_idle(hctx);
+				blk_mq_dtag_idle(hctx);
+			}
 		}
 	}
 	blk_queue_exit(q);
@@ -1097,8 +1099,10 @@  static bool __blk_mq_get_driver_tag(struct request *rq)
 	}
 
 	tag = __sbitmap_queue_get(bt);
-	if (tag == BLK_MQ_NO_TAG)
+	if (tag == BLK_MQ_NO_TAG) {
+		blk_mq_dtag_busy(rq->mq_hctx);
 		return false;
+	}
 
 	rq->tag = tag + tag_offset;
 	return true;
@@ -2676,8 +2680,10 @@  static void blk_mq_exit_hctx(struct request_queue *q,
 {
 	struct request *flush_rq = hctx->fq->flush_rq;
 
-	if (blk_mq_hw_queue_mapped(hctx))
+	if (blk_mq_hw_queue_mapped(hctx)) {
 		blk_mq_tag_idle(hctx);
+		blk_mq_dtag_idle(hctx);
+	}
 
 	blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
 			set->queue_depth, flush_rq);
@@ -3536,6 +3542,7 @@  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 
 	if (blk_mq_is_sbitmap_shared(set->flags)) {
 		atomic_set(&set->active_queues_shared_sbitmap, 0);
+		atomic_set(&set->pending_queues_shared_sbitmap, 0);
 
 		if (blk_mq_init_shared_sbitmap(set)) {
 			ret = -ENOMEM;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d08779f77a26..9e646ade81a8 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -337,10 +337,18 @@  static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 
 		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
 			return true;
+
+		if (!atomic_read(&set->pending_queues_shared_sbitmap))
+			return true;
+
 		users = atomic_read(&set->active_queues_shared_sbitmap);
 	} else {
 		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
 			return true;
+
+		if (!atomic_read(&hctx->tags->pending_queues))
+			return true;
+
 		users = atomic_read(&hctx->tags->active_queues);
 	}
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1d18447ebebc..3bc0faf0e2cf 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -256,6 +256,7 @@  struct blk_mq_tag_set {
 	unsigned int		flags;
 	void			*driver_data;
 	atomic_t		active_queues_shared_sbitmap;
+	atomic_t		pending_queues_shared_sbitmap;
 
 	struct sbitmap_queue	__bitmap_tags;
 	struct sbitmap_queue	__breserved_tags;
@@ -415,6 +416,9 @@  enum {
 	/* hw queue is inactive after all its CPUs become offline */
 	BLK_MQ_S_INACTIVE	= 3,
 
+	/* hw queue is waiting for driver tag */
+	BLK_MQ_S_DTAG_WAIT	= 1,
+
 	BLK_MQ_MAX_DEPTH	= 10240,
 
 	BLK_MQ_CPU_WORK_BATCH	= 8,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3177181c4326..55e0965c9c3c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -603,6 +603,7 @@  struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
+#define QUEUE_FLAG_HCTX_WAIT	30	/* at least one blk-mq hctx can't get driver tag */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_SAME_COMP) |		\