
[RFC,v2,8/8] blk-mq-tag: allow shared queue/hctx to get more driver tags

Message ID 20231021154806.4019417-9-yukuai1@huaweicloud.com (mailing list archive)
State New, archived
Series blk-mq: improve tag fair sharing

Commit Message

Yu Kuai Oct. 21, 2023, 3:48 p.m. UTC
From: Yu Kuai <yukuai3@huawei.com>

Currently tags are fairly shared among nodes, and if some node only
issues a little IO, its share of the tags is wasted and can't be used
by other nodes that are under high IO pressure.

This patch implements a new way to allow one node to borrow tags from
other nodes:

1) multiple nodes start issuing IO and all tags are exhausted, so the
   tags start being fairly shared;
2) while sharing tags, detect whether any node needs more tags, and if
   so start a timer;
3) in the timer function
   - if a node has borrowed tags and is not busy anymore, return the
     borrowed tags;
   - find the busy node under the highest pressure, calculate the free
     tags available from nodes that are not busy, and borrow tags
     (see the sketch below);

A simple test with 32 tags shared by two nodes:
[global]
ioengine=libaio
iodepth=2
bs=4k
direct=1
rw=randrw
group_reporting

[sda]
numjobs=32
filename=/dev/sda

[sdb]
numjobs=1
filename=/dev/sdb

Test result (monitored via the new debugfs entry):

time	active		available
	sda 	sdb	sda	sdb
0	0	0	32	32
1	16	2	16	16	-> start fair sharing
2	19	2	20	16
3	24	2	24	16
4	26	2	28	16 	-> borrow 32/8=4 tags each round
5	28	2	28	16	-> save at least 4 tags for sdb
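
For context on the "32/8=4" above: in the patch the per-round borrow
step is clamp(nr_tags / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH); assuming
the current sbitmap defaults (SBQ_WAIT_QUEUES == 8, SBQ_WAKE_BATCH == 8),
32 tags give a step of 4, which matches the table. A minimal sketch of
that computation:

/* Per-round borrow step, as computed in tag_sharing_ctl_timer_fn():
 * clamp(nr_tags / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH).
 * The constants 8 below assume the sbitmap defaults.
 */
static unsigned int borrow_step(unsigned int nr_tags)
{
	unsigned int step = nr_tags / 8;	/* SBQ_WAIT_QUEUES */

	if (step < 1)
		step = 1;
	if (step > 8)				/* SBQ_WAKE_BATCH */
		step = 8;
	return step;			/* 32 tags -> 4 tags per round */
}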

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq-debugfs.c |  1 +
 block/blk-mq-tag.c     | 92 +++++++++++++++++++++++++++++++++++++++---
 include/linux/blk-mq.h |  3 ++
 include/linux/blkdev.h |  1 +
 4 files changed, 91 insertions(+), 6 deletions(-)

Comments

Bart Van Assche Oct. 23, 2023, 8:46 p.m. UTC | #1
On 10/21/23 08:48, Yu Kuai wrote:
> +	if (!busy) {
> +		ctl->timer_running = false;
> +	} else {
> +		ctl->timer.expires = jiffies + HZ;
> +		add_timer(&ctl->timer);
> +	}
> +	spin_unlock_irq(&tags->lock);

[ ... ]

> +inc_busy:
> +	atomic_inc(&info->busy_count);
> +	if (!tags->ctl.timer_running &&
> +	    try_cmpxchg_relaxed(&tags->ctl.timer_running, &timer_running, true)) {
> +		tags->ctl.timer.expires = jiffies + HZ;
> +		add_timer(&tags->ctl.timer);
> +	}
>   }

So the choice has been made to let the timer expire after one second? I
think the choice of the timer value is important enough to mention it in
the patch description.

Thanks,

Bart.
Yu Kuai Oct. 24, 2023, 1:07 a.m. UTC | #2
Hi,

On 2023/10/24 4:46, Bart Van Assche wrote:
> On 10/21/23 08:48, Yu Kuai wrote:
>> +    if (!busy) {
>> +        ctl->timer_running = false;
>> +    } else {
>> +        ctl->timer.expires = jiffies + HZ;
>> +        add_timer(&ctl->timer);
>> +    }
>> +    spin_unlock_irq(&tags->lock);
> 
> [ ... ]
> 
>> +inc_busy:
>> +    atomic_inc(&info->busy_count);
>> +    if (!tags->ctl.timer_running &&
>> +        try_cmpxchg_relaxed(&tags->ctl.timer_running, &timer_running, 
>> true)) {
>> +        tags->ctl.timer.expires = jiffies + HZ;
>> +        add_timer(&tags->ctl.timer);
>> +    }
>>   }
> 
> So the choice has been made to let the timer expire after one second? I
> think the choice of the timer value is important enough to mention it in
> the patch description.

Yes, I agree. I admit that I was not that cautious when choosing 1 HZ
in this version; I'm not sure yet whether this approach will be
accepted, but it is the best (simple and workable) way that I can
think of.

Thanks,
Kuai

> 
> Thanks,
> 
> Bart.
> 
> 

Patch

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 170bc2236e81..a987f4228b67 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -160,6 +160,7 @@  static void shared_tag_info_show(struct shared_tag_info *info,
 {
 	seq_printf(m, "active tags %d\n", atomic_read(&info->active_tags));
 	seq_printf(m, "available tags %u\n", READ_ONCE(info->available_tags));
+	seq_printf(m, "busy count %u\n", atomic_read(&info->busy_count));
 }
 
 static int queue_shared_tag_info_show(void *data, struct seq_file *m)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index a98b25c8d594..798b90d3f11a 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -35,6 +35,7 @@  void blk_mq_init_shared_tag_info(struct shared_tag_info *info,
 				 unsigned int nr_tags)
 {
 	atomic_set(&info->active_tags, 0);
+	atomic_set(&info->busy_count, 0);
 	INIT_LIST_HEAD(&info->node);
 	info->available_tags = nr_tags;
 }
@@ -143,26 +144,93 @@  void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	spin_unlock_irq(&tags->lock);
 }
 
+static void tag_sharing_ctl_timer_fn(struct timer_list *t)
+{
+	struct tag_sharing_ctl *ctl = from_timer(ctl, t, timer);
+	struct blk_mq_tags *tags = container_of(ctl, struct blk_mq_tags, ctl);
+	struct shared_tag_info *busy = NULL;
+	struct shared_tag_info *info;
+	unsigned int nr_tags;
+	unsigned int step;
+	unsigned int free_tags = 0;
+	unsigned int borrowed_tags = 0;
+	unsigned int max_busy_count = 0;
+
+	spin_lock_irq(&tags->lock);
+
+	if (tags->ctl.busy_queues <= 1)
+		goto out;
+
+	/* First round, decrease busy_count by half. */
+	list_for_each_entry(info, &tags->ctl.head, node) {
+		int count = atomic_read(&info->busy_count);
+
+		if (count > tags->nr_tags)
+			count = count >> 1;
+		atomic_sub(count, &info->busy_count);
+	}
+
+	/* Second round, find borrowed tags that can be returned. */
+	nr_tags = shared_tags(tags, tags->ctl.busy_queues);
+	step = clamp_t(unsigned int, tags->nr_tags / SBQ_WAIT_QUEUES, 1,
+		       SBQ_WAKE_BATCH);
+	list_for_each_entry(info, &tags->ctl.head, node) {
+		if (info->available_tags > nr_tags &&
+		    atomic_read(&info->active_tags) <= nr_tags &&
+		    atomic_read(&info->busy_count) <= tags->nr_tags)
+			info->available_tags = nr_tags;
+	}
+
+	/* Last round, find available free tags, and which node need more tags. */
+	list_for_each_entry(info, &tags->ctl.head, node) {
+		unsigned int busy_count;
+
+		if (info->available_tags > nr_tags)
+			borrowed_tags += info->available_tags - nr_tags;
+		else
+			free_tags += info->available_tags - max(step,
+				(unsigned int)atomic_read(&info->active_tags));
+
+		busy_count = atomic_read(&info->busy_count);
+		if (busy_count > tags->nr_tags && busy_count > max_busy_count) {
+			busy = info;
+			max_busy_count = busy_count;
+		}
+	}
+
+	/* Borrow tags. */
+	if (busy && borrowed_tags < free_tags)
+		busy->available_tags += min(free_tags - borrowed_tags, step);
+
+out:
+	if (!busy) {
+		ctl->timer_running = false;
+	} else {
+		ctl->timer.expires = jiffies + HZ;
+		add_timer(&ctl->timer);
+	}
+	spin_unlock_irq(&tags->lock);
+}
+
 void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
 	unsigned int users;
 	struct blk_mq_tags *tags = hctx->tags;
 	struct shared_tag_info *info;
+	bool timer_running = false;
 
 	if (blk_mq_is_shared_tags(hctx->flags)) {
 		struct request_queue *q = hctx->queue;
 
+		info = &q->shared_tag_info;
 		if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
 		    test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
-			return;
-
-		info = &q->shared_tag_info;
+			goto inc_busy;
 	} else {
+		info = &hctx->shared_tag_info;
 		if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
 		    test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
-			return;
-
-		info = &hctx->shared_tag_info;
+			goto inc_busy;
 	}
 
 	spin_lock_irq(&tags->lock);
@@ -172,6 +240,15 @@  void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
 	blk_mq_update_wake_batch(tags, users);
 	WRITE_ONCE(tags->ctl.busy_queues, users);
 	spin_unlock_irq(&tags->lock);
+	return;
+
+inc_busy:
+	atomic_inc(&info->busy_count);
+	if (!tags->ctl.timer_running &&
+	    try_cmpxchg_relaxed(&tags->ctl.timer_running, &timer_running, true)) {
+		tags->ctl.timer.expires = jiffies + HZ;
+		add_timer(&tags->ctl.timer);
+	}
 }
 
 void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
@@ -204,6 +281,7 @@  void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
 		info = &hctx->shared_tag_info;
 	}
 
+	atomic_set(&info->busy_count, 0);
 	list_del_init(&info->node);
 	users = tags->ctl.busy_queues - 1;
 	blk_mq_update_available_driver_tags(tags, info, users);
@@ -705,6 +783,7 @@  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	tags->nr_reserved_tags = reserved_tags;
 	spin_lock_init(&tags->lock);
 	INIT_LIST_HEAD(&tags->ctl.head);
+	timer_setup(&tags->ctl.timer, tag_sharing_ctl_timer_fn, 0);
 
 	if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
 				total_tags, reserved_tags, node,
@@ -717,6 +796,7 @@  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 
 void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
+	del_timer_sync(&tags->ctl.timer);
 	sbitmap_queue_free(&tags->bitmap_tags);
 	sbitmap_queue_free(&tags->breserved_tags);
 	kfree(tags);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9182ceca8c7a..1e4aa733e1ab 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -736,6 +736,9 @@  struct tag_sharing_ctl {
 	 * a list with request_queue/hctx->shared_tag_info.node entries.
 	 */
 	struct list_head head;
+	bool timer_running;
+	/* Start when any queue/hctx failed to get driver tag. */
+	struct timer_list timer;
 };
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8fd6a0a92233..8bfae877dd27 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -378,6 +378,7 @@  struct blk_independent_access_ranges {
 struct shared_tag_info {
 	atomic_t		active_tags;
 	unsigned int		available_tags;
+	atomic_t		busy_count;
 	struct list_head	node;
 };