diff mbox series

[for-6.12,2/2] blk-throttle: support prioritized processing of metadata

Message ID 20240903135149.271857-3-yukuai1@huaweicloud.com (mailing list archive)
State New, archived
Headers show
Series blk-throttle: support prioritized processing of metadata | expand

Commit Message

Yu Kuai Sept. 3, 2024, 1:51 p.m. UTC
From: Yu Kuai <yukuai3@huawei.com>

Currently, blk-throttle handle all IO fifo, hence if data IO is
throttled and then meta IO is dispatched, the meta IO will have to wait
for the data IO, causing priority inversion problems.

This patch support to handle metadata first and then pay debt while
throttling data.

Test script: use cgroup v1 to throttle root cgroup, then create new
dir and file while write back is throttled

test() {
  mkdir /mnt/test/xxx
  touch /mnt/test/xxx/1
  sync /mnt/test/xxx
  sync /mnt/test/xxx
}

mkfs.ext4 -F /dev/nvme0n1 -E lazy_itable_init=0,lazy_journal_init=0
mount /dev/nvme0n1 /mnt/test

echo "259:0 $((1024*1024))" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device
dd if=/dev/zero of=/mnt/test/foo1 bs=16M count=1 conv=fdatasync status=none &
sleep 4

time test
echo "259:0 0" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device

sleep 1
umount /dev/nvme0n1

Test result: time cost for creating new dir and file
before this patch:  14s
after this patch:   0.1s

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 65 +++++++++++++++++++++++++++++---------------
 1 file changed, 43 insertions(+), 22 deletions(-)

Comments

Tejun Heo Sept. 10, 2024, 9:19 p.m. UTC | #1
On Tue, Sep 03, 2024 at 09:51:49PM +0800, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> Currently, blk-throttle handle all IO fifo, hence if data IO is
> throttled and then meta IO is dispatched, the meta IO will have to wait
> for the data IO, causing priority inversion problems.
> 
> This patch support to handle metadata first and then pay debt while
> throttling data.
> 
> Test script: use cgroup v1 to throttle root cgroup, then create new
> dir and file while write back is throttled
> 
> test() {
>   mkdir /mnt/test/xxx
>   touch /mnt/test/xxx/1
>   sync /mnt/test/xxx
>   sync /mnt/test/xxx
> }
> 
> mkfs.ext4 -F /dev/nvme0n1 -E lazy_itable_init=0,lazy_journal_init=0
> mount /dev/nvme0n1 /mnt/test
> 
> echo "259:0 $((1024*1024))" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device
> dd if=/dev/zero of=/mnt/test/foo1 bs=16M count=1 conv=fdatasync status=none &
> sleep 4
> 
> time test
> echo "259:0 0" > /sys/fs/cgroup/blkio/blkio.throttle.write_bps_device
> 
> sleep 1
> umount /dev/nvme0n1
> 
> Test result: time cost for creating new dir and file
> before this patch:  14s
> after this patch:   0.1s
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

This is a lot simpler than I expected. Great.

 Acked-by: Tejun Heo <tj@kernel.org>

Thanks.
diff mbox series

Patch

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index eb859c44c9f3..9c5bbd261724 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1595,6 +1595,22 @@  void blk_throtl_cancel_bios(struct gendisk *disk)
 	spin_unlock_irq(&q->queue_lock);
 }
 
+static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
+{
+	/* throtl is FIFO - if bios are already queued, should queue */
+	if (tg->service_queue.nr_queued[rw])
+		return false;
+
+	return tg_may_dispatch(tg, bio, NULL);
+}
+
+static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw)
+{
+	if (!bio_flagged(bio, BIO_BPS_THROTTLED))
+		tg->carryover_bytes[rw] -= throtl_bio_data_size(bio);
+	tg->carryover_ios[rw]--;
+}
+
 bool __blk_throtl_bio(struct bio *bio)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -1611,29 +1627,34 @@  bool __blk_throtl_bio(struct bio *bio)
 	sq = &tg->service_queue;
 
 	while (true) {
-		/* throtl is FIFO - if bios are already queued, should queue */
-		if (sq->nr_queued[rw])
+		if (tg_within_limit(tg, bio, rw)) {
+			/* within limits, let's charge and dispatch directly */
+			throtl_charge_bio(tg, bio);
+
+			/*
+			 * We need to trim slice even when bios are not being
+			 * queued otherwise it might happen that a bio is not
+			 * queued for a long time and slice keeps on extending
+			 * and trim is not called for a long time. Now if limits
+			 * are reduced suddenly we take into account all the IO
+			 * dispatched so far at new low rate and * newly queued
+			 * IO gets a really long dispatch time.
+			 *
+			 * So keep on trimming slice even if bio is not queued.
+			 */
+			throtl_trim_slice(tg, rw);
+		} else if (bio_issue_as_root_blkg(bio)) {
+			/*
+			 * IOs which may cause priority inversions are
+			 * dispatched directly, even if they're over limit.
+			 * Debts are handled by carryover_bytes/ios while
+			 * calculating wait time.
+			 */
+			tg_dispatch_in_debt(tg, bio, rw);
+		} else {
+			/* if above limits, break to queue */
 			break;
-
-		/* if above limits, break to queue */
-		if (!tg_may_dispatch(tg, bio, NULL))
-			break;
-
-		/* within limits, let's charge and dispatch directly */
-		throtl_charge_bio(tg, bio);
-
-		/*
-		 * We need to trim slice even when bios are not being queued
-		 * otherwise it might happen that a bio is not queued for
-		 * a long time and slice keeps on extending and trim is not
-		 * called for a long time. Now if limits are reduced suddenly
-		 * we take into account all the IO dispatched so far at new
-		 * low rate and * newly queued IO gets a really long dispatch
-		 * time.
-		 *
-		 * So keep on trimming slice even if bio is not queued.
-		 */
-		throtl_trim_slice(tg, rw);
+		}
 
 		/*
 		 * @bio passed through this layer without being throttled.