diff mbox series

[linux-4.19.y] blk-mq: fix divide by zero crash in tg_may_dispatch()

Message ID 20210914125402.4068844-1-yukuai3@huawei.com (mailing list archive)
State New, archived
Headers show
Series [linux-4.19.y] blk-mq: fix divide by zero crash in tg_may_dispatch() | expand

Commit Message

Yu Kuai Sept. 14, 2021, 12:54 p.m. UTC
If blk-throttle is enabled and io is issued before
blk_throtl_register_queue() is done. Divide by zero crash will be
triggered in tg_may_dispatch() because 'throtl_slice' is uninitialized.

The problem is fixed in commit 75f4dca59694 ("block: call
blk_register_queue earlier in device_add_disk") from mainline, however
it's too hard to backport this patch due to lots of refactoring.

Thus introduce a new flag QUEUE_FLAG_THROTL_INIT_DONE. It will be set
after blk_throtl_register_queue() is done, and will be checked before
applying any config.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-sysfs.c      |  2 ++
 block/blk-throttle.c   | 41 +++++++++++++++++++++++++++++++++++++++--
 include/linux/blkdev.h |  1 +
 3 files changed, 42 insertions(+), 2 deletions(-)

Comments

Greg Kroah-Hartman Sept. 15, 2021, 11:27 a.m. UTC | #1
On Tue, Sep 14, 2021 at 08:54:02PM +0800, Yu Kuai wrote:
> If blk-throttle is enabled and io is issued before
> blk_throtl_register_queue() is done. Divide by zero crash will be
> triggered in tg_may_dispatch() because 'throtl_slice' is uninitialized.
> 
> The problem is fixed in commit 75f4dca59694 ("block: call
> blk_register_queue earlier in device_add_disk") from mainline, however
> it's too hard to backport this patch due to lots of refactoring.
> 
> Thus introduce a new flag QUEUE_FLAG_THROTL_INIT_DONE. It will be set
> after blk_throtl_register_queue() is done, and will be checked before
> applying any config.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  block/blk-sysfs.c      |  2 ++
>  block/blk-throttle.c   | 41 +++++++++++++++++++++++++++++++++++++++--
>  include/linux/blkdev.h |  1 +
>  3 files changed, 42 insertions(+), 2 deletions(-)


The commit you reference above is in 5.15-rc1.  What about all of the
other stable kernel releases newer than 4.19.y?  You do not want to move
to a newer release and have a regression.

And I would _REALLY_ like to take the identical commits that are
upstream if at all possible.  What is needed to backport instead of
doing this one-off patch instead?

When we take changes that are not upstream, almost always they are
broken so we almost never want to do that.

thanks,

greg k-h
Yu Kuai Sept. 15, 2021, 11:58 a.m. UTC | #2
On 2021/09/15 19:27, Greg KH wrote:
> On Tue, Sep 14, 2021 at 08:54:02PM +0800, Yu Kuai wrote:
>> If blk-throttle is enabled and io is issued before
>> blk_throtl_register_queue() is done. Divide by zero crash will be
>> triggered in tg_may_dispatch() because 'throtl_slice' is uninitialized.
>>
>> The problem is fixed in commit 75f4dca59694 ("block: call
>> blk_register_queue earlier in device_add_disk") from mainline, however
>> it's too hard to backport this patch due to lots of refactoring.
>>
>> Thus introduce a new flag QUEUE_FLAG_THROTL_INIT_DONE. It will be set
>> after blk_throtl_register_queue() is done, and will be checked before
>> applying any config.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   block/blk-sysfs.c      |  2 ++
>>   block/blk-throttle.c   | 41 +++++++++++++++++++++++++++++++++++++++--
>>   include/linux/blkdev.h |  1 +
>>   3 files changed, 42 insertions(+), 2 deletions(-)
> 
> 
> The commit you reference above is in 5.15-rc1.  What about all of the
> other stable kernel releases newer than 4.19.y?  You do not want to move
> to a newer release and have a regression.

All other kernel releases without this patch have the same issure,
including v5.14.
> 
> And I would _REALLY_ like to take the identical commits that are
> upstream if at all possible.  What is needed to backport instead of
> doing this one-off patch instead?

The function __device_add_disk() is quite different compared from
4.19.y to mainline, I haven't finish to collect that how many patches is
needed yet, which is not easy to do...
> 
> When we take changes that are not upstream, almost always they are
> broken so we almost never want to do that.

I understand that, more proper fix should be calling blk_register_queue
earlier in device_add_disk like the commit did. I'll try to do that and
hope conflicts are not too much...

Thanks,
Kuai
> 
> thanks,
> 
> greg k-h
> .
>
diff mbox series

Patch

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 07494deb1a26..7d250acf8ece 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -954,6 +954,7 @@  int blk_register_queue(struct gendisk *disk)
 	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 	wbt_enable_default(q);
 	blk_throtl_register_queue(q);
+	blk_queue_flag_set(QUEUE_FLAG_THROTL_INIT_DONE, q);
 
 	/* Now everything is ready and send out KOBJ_ADD uevent */
 	kobject_uevent(&q->kobj, KOBJ_ADD);
@@ -986,6 +987,7 @@  void blk_unregister_queue(struct gendisk *disk)
 	if (!blk_queue_registered(q))
 		return;
 
+	blk_queue_flag_clear(QUEUE_FLAG_THROTL_INIT_DONE, q);
 	/*
 	 * Since sysfs_remove_dir() prevents adding new directory entries
 	 * before removal of existing entries starts, protect against
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index caee658609d7..b2eeca9a9962 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -11,6 +11,8 @@ 
 #include <linux/bio.h>
 #include <linux/blktrace_api.h>
 #include <linux/blk-cgroup.h>
+#include <linux/sched/signal.h>
+#include <linux/delay.h>
 #include "blk.h"
 
 /* Max dispatch from a group in 1 round */
@@ -1428,6 +1430,31 @@  static void tg_conf_updated(struct throtl_grp *tg, bool global)
 	}
 }
 
+static inline int throtl_check_init_done(struct request_queue *q)
+{
+	if (test_bit(QUEUE_FLAG_THROTL_INIT_DONE, &q->queue_flags))
+		return 0;
+
+	return blk_queue_dying(q) ? -ENODEV : -EBUSY;
+}
+
+/*
+ * If throtl_check_init_done() return -EBUSY, we should retry after a short
+ * msleep(), since that throttle init will be completed in blk_register_queue()
+ * soon.
+ */
+static inline int throtl_restart_syscall_when_busy(int errno)
+{
+	int ret = errno;
+
+	if (ret == -EBUSY) {
+		msleep(10);
+		ret = restart_syscall();
+	}
+
+	return ret;
+}
+
 static ssize_t tg_set_conf(struct kernfs_open_file *of,
 			   char *buf, size_t nbytes, loff_t off, bool is_u64)
 {
@@ -1441,6 +1468,10 @@  static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	if (ret)
 		return ret;
 
+	ret = throtl_check_init_done(ctx.disk->queue);
+	if (ret)
+		goto out_finish;
+
 	ret = -EINVAL;
 	if (sscanf(ctx.body, "%llu", &v) != 1)
 		goto out_finish;
@@ -1448,7 +1479,6 @@  static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		v = U64_MAX;
 
 	tg = blkg_to_tg(ctx.blkg);
-
 	if (is_u64)
 		*(u64 *)((void *)tg + of_cft(of)->private) = v;
 	else
@@ -1458,6 +1488,8 @@  static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
+	ret = throtl_restart_syscall_when_busy(ret);
+
 	return ret ?: nbytes;
 }
 
@@ -1607,8 +1639,11 @@  static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	if (ret)
 		return ret;
 
-	tg = blkg_to_tg(ctx.blkg);
+	ret = throtl_check_init_done(ctx.disk->queue);
+	if (ret)
+		goto out_finish;
 
+	tg = blkg_to_tg(ctx.blkg);
 	v[0] = tg->bps_conf[READ][index];
 	v[1] = tg->bps_conf[WRITE][index];
 	v[2] = tg->iops_conf[READ][index];
@@ -1704,6 +1739,8 @@  static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
+	ret = throtl_restart_syscall_when_busy(ret);
+
 	return ret ?: nbytes;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 209ba8e7bd31..22be9f090bf1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -684,6 +684,7 @@  struct request_queue {
 #define QUEUE_FLAG_NOMERGES     5	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	6	/* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO	7	/* fake timeout */
+#define QUEUE_FLAG_THROTL_INIT_DONE 8	/* io throttle can be online */
 #define QUEUE_FLAG_NONROT	9	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT     10	/* do IO stats */