diff mbox series

[3/3] block: Add support for sharing tags across hardware queues

Message ID 20191126175656.67638-4-bvanassche@acm.org (mailing list archive)
State New, archived
Headers show
Series blk-mq: Support sharing tags across hardware queues | expand

Commit Message

Bart Van Assche Nov. 26, 2019, 5:56 p.m. UTC
Add a boolean member 'share_tags' in struct blk_mq_tag_set. If that member
variable is set, make all hctx->tags[] pointers identical. Implement the
necessary changes in the functions that allocate, free and resize tag sets.
Modify blk_mq_tagset_busy_iter() such that it continues to call the
callback function once per request. Modify blk_mq_queue_tag_busy_iter()
such that the callback function is only called with the correct hctx
as first argument. Modify the debugfs code such that it keeps showing only
matching tags per hctx.

This patch has been tested by running blktests on top of a kernel that
includes the following change to enable shared tags for all block drivers
except the NVMe drivers:

Comments

John Garry Nov. 27, 2019, 9:51 a.m. UTC | #1
On 26/11/2019 17:56, Bart Van Assche wrote:
> Add a boolean member 'share_tags' in struct blk_mq_tag_set. If that member
> variable is set, make all hctx->tags[] pointers identical. Implement the
> necessary changes in the functions that allocate, free and resize tag sets.
> Modify blk_mq_tagset_busy_iter() such that it continues to call the
> callback function once per request. Modify blk_mq_queue_tag_busy_iter()
> such that the callback function is only called with the correct hctx
> as first argument. Modify the debugfs code such that it keeps showing only
> matching tags per hctx.
> 

Hi Bart,

 > This patch has been tested by running blktests on top of a kernel that
 > includes the following change to enable shared tags for all block drivers
 > except the NVMe drivers:

Could something be broken here with this approach, see ***:

static int
nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
unsigned int hctx_idx, unsigned int numa_node)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq; ***

	nvme_req(req)->ctrl = &dev->ctrl;
	return 0;
}

All iods are from hctx0, but could use different hctx's and nvme queues.

Obviously NVMe would not want shared tags, but I am just trying to 
illustrate a potential problem in how requests are associated with 
queues. I haven't audited all users.

Thanks,
John
diff mbox series

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
@@ -3037,6 +3037,10 @@  int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)

        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

+       /* Test code: enable tag sharing for all block drivers except NVMe */
+       if (!set->ops->poll)
+               set->share_tags = true;
+
        if (!set->nr_hw_queues)
                return -EINVAL;
        if (!set->queue_depth)

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: John Garry <john.garry@huawei.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-mq-debugfs.c | 40 ++++++++++++++++++++++++++++++++++++++--
 block/blk-mq-tag.c     |  7 +++++--
 block/blk-mq.c         | 28 +++++++++++++++++++++-------
 include/linux/blk-mq.h |  8 ++++++--
 4 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3678e95ec947..653e80ede3bd 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -472,20 +472,56 @@  static int hctx_tags_show(void *data, struct seq_file *m)
 	return res;
 }
 
+struct hctx_sb_data {
+	struct sbitmap		*sb;	/* output bitmap */
+	struct blk_mq_hw_ctx	*hctx;	/* input hctx */
+};
+
+static bool hctx_filter_fn(struct blk_mq_hw_ctx *hctx, struct request *req,
+			   void *priv, bool reserved)
+{
+	struct hctx_sb_data *hctx_sb_data = priv;
+
+	if (hctx == hctx_sb_data->hctx)
+		sbitmap_set_bit(hctx_sb_data->sb, req->tag);
+	return true;
+}
+
+static void hctx_filter_sb(struct sbitmap *sb, struct blk_mq_hw_ctx *hctx)
+{
+	struct hctx_sb_data hctx_sb_data = { .sb = sb, .hctx = hctx };
+
+	blk_mq_queue_tag_busy_iter(hctx->queue, hctx_filter_fn, &hctx_sb_data);
+}
+
 static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_hw_ctx *hctx = data;
 	struct request_queue *q = hctx->queue;
+	struct sbitmap sb, *hctx_sb;
 	int res;
 
+	if (!hctx->tags)
+		return 0;
+	hctx_sb = &hctx->tags->bitmap_tags.sb;
+	res = sbitmap_init_node(&sb, hctx_sb->depth, hctx_sb->shift, GFP_KERNEL,
+				NUMA_NO_NODE);
+	if (res)
+		return res;
+
 	res = mutex_lock_interruptible(&q->sysfs_lock);
 	if (res)
 		goto out;
-	if (hctx->tags)
-		sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
+	/*
+	 * If tags are shared across hardware queues, hctx_sb contains tags
+	 * for multiple hardware queues. Filter the tags for 'hctx' into 'sb'.
+	 */
+	hctx_filter_sb(&sb, hctx);
 	mutex_unlock(&q->sysfs_lock);
+	sbitmap_bitmap_show(&sb, m);
 
 out:
+	sbitmap_free(&sb);
 	return res;
 }
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index a60e1b4a8158..770fe2324230 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -220,7 +220,7 @@  static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
 	 */
-	if (rq && rq->q == hctx->queue)
+	if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
 		return iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
@@ -341,8 +341,11 @@  void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 	int i;
 
 	for (i = 0; i < tagset->nr_hw_queues; i++) {
-		if (tagset->tags && tagset->tags[i])
+		if (tagset->tags && tagset->tags[i]) {
 			blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
+			if (tagset->share_tags)
+				break;
+		}
 	}
 }
 EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fec4b82ff91c..fa4cfc4b7e7c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2404,6 +2404,12 @@  static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
 {
 	int ret = 0;
 
+	if (hctx_idx > 0 && set->share_tags) {
+		WARN_ON_ONCE(!set->tags[0]);
+		set->tags[hctx_idx] = set->tags[0];
+		return 0;
+	}
+
 	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
 					set->queue_depth, set->reserved_tags);
 	if (!set->tags[hctx_idx])
@@ -2423,8 +2429,10 @@  static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 					 unsigned int hctx_idx)
 {
 	if (set->tags && set->tags[hctx_idx]) {
-		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-		blk_mq_free_rq_map(set->tags[hctx_idx]);
+		if (hctx_idx == 0 || !set->share_tags) {
+			blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+			blk_mq_free_rq_map(set->tags[hctx_idx]);
+		}
 		set->tags[hctx_idx] = NULL;
 	}
 }
@@ -2568,7 +2576,7 @@  static void blk_mq_del_queue_tag_set(struct request_queue *q)
 
 	mutex_lock(&set->tag_list_lock);
 	list_del_rcu(&q->tag_set_list);
-	if (list_is_singular(&set->tag_list)) {
+	if (list_is_singular(&set->tag_list) && !set->share_tags) {
 		/* just transitioned to unshared */
 		set->flags &= ~BLK_MQ_F_TAG_SHARED;
 		/* update existing queue */
@@ -2586,7 +2594,7 @@  static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 	/*
 	 * Check to see if we're transitioning to shared (from 1 to 2 queues).
 	 */
-	if (!list_empty(&set->tag_list) &&
+	if ((!list_empty(&set->tag_list) || set->share_tags) &&
 	    !(set->flags & BLK_MQ_F_TAG_SHARED)) {
 		set->flags |= BLK_MQ_F_TAG_SHARED;
 		/* update existing queue */
@@ -2911,15 +2919,21 @@  static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
 	int i;
 
-	for (i = 0; i < set->nr_hw_queues; i++)
-		if (!__blk_mq_alloc_rq_map(set, i))
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		if (i > 0 && set->share_tags) {
+			set->tags[i] = set->tags[0];
+		} else if (!__blk_mq_alloc_rq_map(set, i))
 			goto out_unwind;
+	}
 
 	return 0;
 
 out_unwind:
-	while (--i >= 0)
+	while (--i >= 0) {
+		if (i > 0 && set->share_tags)
+			continue;
 		blk_mq_free_rq_map(set->tags[i]);
+	}
 
 	return -ENOMEM;
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 522631d108af..dd5517476314 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -224,10 +224,13 @@  enum hctx_type {
  * @numa_node:	   NUMA node the storage adapter has been connected to.
  * @timeout:	   Request processing timeout in jiffies.
  * @flags:	   Zero or more BLK_MQ_F_* flags.
+ * @share_tags:	   Whether or not to share one tag set across hardware queues.
  * @driver_data:   Pointer to data owned by the block driver that created this
  *		   tag set.
- * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
- *		   elements.
+ * @tags:	   Array of tag set pointers. Has @nr_hw_queues elements. If
+ *		   share_tags has not been set, all tag set pointers are
+ *		   different. If share_tags has been set, all tag_set pointers
+ *		   are identical.
  * @tag_list_lock: Serializes tag_list accesses.
  * @tag_list:	   List of the request queues that use this tag set. See also
  *		   request_queue.tag_set_list.
@@ -243,6 +246,7 @@  struct blk_mq_tag_set {
 	int			numa_node;
 	unsigned int		timeout;
 	unsigned int		flags;
+	bool			share_tags;
 	void			*driver_data;
 
 	struct blk_mq_tags	**tags;