diff mbox series

block: Bail out iteration functions upon SIGKILL.

Message ID 8fde32da-d5e5-11b7-9ed7-e3aa5b003647@i-love.sakura.ne.jp (mailing list archive)
State New, archived
Headers show
Series block: Bail out iteration functions upon SIGKILL. | expand

Commit Message

Tetsuo Handa Nov. 8, 2019, 11:41 a.m. UTC
syzbot found that a thread can stall for minutes inside fallocate()
after that thread was killed by SIGKILL [1]. While trying to allocate
64TB of disk space using fallocate() is legal, delaying termination of
killed thread for minutes is bad. Thus, allow iteration functions in
block/blk-lib.c to be killable.

[1] https://syzkaller.appspot.com/bug?id=9386d051e11e09973d5a4cf79af5e8cedf79386d

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+b48daca8639150bc5e73@syzkaller.appspotmail.com>
---
 block/blk-lib.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

Comments

Chaitanya Kulkarni Nov. 8, 2019, 6:13 p.m. UTC | #1
Thanks for the patch, this looks good to me. Let me test this patch and
I will send a review then.

On 11/08/2019 03:54 AM, Tetsuo Handa wrote:
> syzbot found that a thread can stall for minutes inside fallocate()
> after that thread was killed by SIGKILL [1]. While trying to allocate
> 64TB of disk space using fallocate() is legal, delaying termination of
> killed thread for minutes is bad. Thus, allow iteration functions in
> block/blk-lib.c to be killable.
>
> [1]https://syzkaller.appspot.com/bug?id=9386d051e11e09973d5a4cf79af5e8cedf79386d
>
> Signed-off-by: Tetsuo Handa<penguin-kernel@I-love.SAKURA.ne.jp>
> Reported-by: syzbot<syzbot+b48daca8639150bc5e73@syzkaller.appspotmail.com>
Chaitanya Kulkarni Nov. 8, 2019, 10:18 p.m. UTC | #2
Looks good.

Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
On 11/08/2019 10:13 AM, Chaitanya Kulkarni wrote:
> Thanks for the patch, this looks good to me, let me test this patch
> will send a review then.
>
> On 11/08/2019 03:54 AM, Tetsuo Handa wrote:
>> syzbot found that a thread can stall for minutes inside fallocate()
>> after that thread was killed by SIGKILL [1]. While trying to allocate
>> 64TB of disk space using fallocate() is legal, delaying termination of
>> killed thread for minutes is bad. Thus, allow iteration functions in
>> block/blk-lib.c to be killable.
>>
>> [1]https://syzkaller.appspot.com/bug?id=9386d051e11e09973d5a4cf79af5e8cedf79386d
>>
>> Signed-off-by: Tetsuo Handa<penguin-kernel@I-love.SAKURA.ne.jp>
>> Reported-by: syzbot<syzbot+b48daca8639150bc5e73@syzkaller.appspotmail.com>
>
>
Damien Le Moal Nov. 12, 2019, 4:05 a.m. UTC | #3
On 2019/11/08 20:54, Tetsuo Handa wrote:
> syzbot found that a thread can stall for minutes inside fallocate()
> after that thread was killed by SIGKILL [1]. While trying to allocate
> 64TB of disk space using fallocate() is legal, delaying termination of
> killed thread for minutes is bad. Thus, allow iteration functions in
> block/blk-lib.c to be killable.
> 
> [1] https://syzkaller.appspot.com/bug?id=9386d051e11e09973d5a4cf79af5e8cedf79386d
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Reported-by: syzbot <syzbot+b48daca8639150bc5e73@syzkaller.appspotmail.com>
> ---
>  block/blk-lib.c | 44 ++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 40 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 5f2c429..6ca7cae 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -7,9 +7,22 @@
>  #include <linux/bio.h>
>  #include <linux/blkdev.h>
>  #include <linux/scatterlist.h>
> +#include <linux/sched/signal.h>
>  
>  #include "blk.h"
>  
> +static int blk_should_abort(struct bio *bio)
> +{
> +	int ret;
> +
> +	cond_resched();
> +	if (!fatal_signal_pending(current))
> +		return 0;
> +	ret = submit_bio_wait(bio);

This will change the behavior of __blkdev_issue_discard() to a sync IO
execution instead of the current async execution since submit_bio_wait()
call is the responsibility of the caller (e.g. blkdev_issue_discard()).
Have you checked if users of __blkdev_issue_discard() are OK with that ?
f2fs, ext4, xfs, dm and nvme use this function.

Looking at f2fs, this does not look like it is going to work as expected
since the bio setup, including end_io callback, is done after this
function is called and a regular submit_bio() execution is being used.

> +	bio_put(bio);
> +	return ret ? ret : -EINTR;
> +}
> +
>  struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
>  {
>  	struct bio *new = bio_alloc(gfp, nr_pages);
> @@ -55,6 +68,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  		return -EINVAL;
>  
>  	while (nr_sects) {
> +		int ret;

Please add a blank line after the declaration, similar to your change
in __blkdev_issue_write_same() and __blkdev_issue_zero_pages().

>  		sector_t req_sects = min_t(sector_t, nr_sects,
>  				bio_allowed_max_sectors(q));
>  
> @@ -75,7 +89,11 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  		 * us to schedule out to avoid softlocking if preempt
>  		 * is disabled.
>  		 */
> -		cond_resched();
> +		ret = blk_should_abort(bio);
> +		if (ret) {
> +			*biop = NULL;
> +			return ret;
> +		}
>  	}
>  
>  	*biop = bio;
> @@ -154,6 +172,8 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
>  	max_write_same_sectors = bio_allowed_max_sectors(q);
>  
>  	while (nr_sects) {
> +		int ret;
> +
>  		bio = blk_next_bio(bio, 1, gfp_mask);
>  		bio->bi_iter.bi_sector = sector;
>  		bio_set_dev(bio, bdev);
> @@ -171,7 +191,11 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
>  			bio->bi_iter.bi_size = nr_sects << 9;
>  			nr_sects = 0;
>  		}
> -		cond_resched();
> +		ret = blk_should_abort(bio);
> +		if (ret) {
> +			*biop = NULL;
> +			return ret;
> +		}
>  	}
>  
>  	*biop = bio;
> @@ -230,6 +254,8 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
>  		return -EOPNOTSUPP;
>  
>  	while (nr_sects) {
> +		int ret;
> +
>  		bio = blk_next_bio(bio, 0, gfp_mask);
>  		bio->bi_iter.bi_sector = sector;
>  		bio_set_dev(bio, bdev);
> @@ -245,7 +271,11 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
>  			bio->bi_iter.bi_size = nr_sects << 9;
>  			nr_sects = 0;
>  		}
> -		cond_resched();
> +		ret = blk_should_abort(bio);
> +		if (ret) {
> +			*biop = NULL;
> +			return ret;
> +		}
>  	}
>  
>  	*biop = bio;
> @@ -281,6 +311,8 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
>  		return -EPERM;
>  
>  	while (nr_sects != 0) {
> +		int ret;
> +
>  		bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
>  				   gfp_mask);
>  		bio->bi_iter.bi_sector = sector;
> @@ -295,7 +327,11 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
>  			if (bi_size < sz)
>  				break;
>  		}
> -		cond_resched();
> +		ret = blk_should_abort(bio);
> +		if (ret) {
> +			*biop = NULL;
> +			return ret;
> +		}
>  	}
>  
>  	*biop = bio;
>
Tetsuo Handa Nov. 12, 2019, 2:47 p.m. UTC | #4
On 2019/11/12 13:05, Damien Le Moal wrote:
> On 2019/11/08 20:54, Tetsuo Handa wrote:
>> syzbot found that a thread can stall for minutes inside fallocate()
>> after that thread was killed by SIGKILL [1]. While trying to allocate
>> 64TB of disk space using fallocate() is legal, delaying termination of
>> killed thread for minutes is bad. Thus, allow iteration functions in
>> block/blk-lib.c to be killable.
>>
>> [1] https://syzkaller.appspot.com/bug?id=9386d051e11e09973d5a4cf79af5e8cedf79386d
>>
>> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
>> Reported-by: syzbot <syzbot+b48daca8639150bc5e73@syzkaller.appspotmail.com>
>> ---
>>  block/blk-lib.c | 44 ++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 40 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>> index 5f2c429..6ca7cae 100644
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -7,9 +7,22 @@
>>  #include <linux/bio.h>
>>  #include <linux/blkdev.h>
>>  #include <linux/scatterlist.h>
>> +#include <linux/sched/signal.h>
>>  
>>  #include "blk.h"
>>  
>> +static int blk_should_abort(struct bio *bio)
>> +{
>> +	int ret;
>> +
>> +	cond_resched();
>> +	if (!fatal_signal_pending(current))
>> +		return 0;
>> +	ret = submit_bio_wait(bio);
> 
> This will change the behavior of __blkdev_issue_discard() to a sync IO
> execution instead of the current async execution since submit_bio_wait()
> call is the responsibility of the caller (e.g. blkdev_issue_discard()).
> Have you checked if users of __blkdev_issue_discard() are OK with that ?
> f2fs, ext4, xfs, dm and nvme use this function.

I'm not sure...

> 
> Looking at f2fs, this does not look like it is going to work as expected
> since the bio setup, including end_io callback, is done after this
> function is called and a regular submit_bio() execution is being used.

Then, how about just breaking out of the iteration like below?
nvmet_bdev_execute_write_zeroes() ignores -EINTR if "*biop = bio;" is done. Is that not a problem?

--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -7,6 +7,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/scatterlist.h>
+#include <linux/sched/signal.h>
 
 #include "blk.h"
 
@@ -30,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	struct bio *bio = *biop;
 	unsigned int op;
 	sector_t bs_mask;
+	int ret = 0;
 
 	if (!q)
 		return -ENXIO;
@@ -76,10 +78,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		 * is disabled.
 		 */
 		cond_resched();
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
 	}
 
 	*biop = bio;
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(__blkdev_issue_discard);
 
@@ -136,6 +142,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	unsigned int max_write_same_sectors;
 	struct bio *bio = *biop;
 	sector_t bs_mask;
+	int ret = 0;
 
 	if (!q)
 		return -ENXIO;
@@ -172,10 +179,14 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 			nr_sects = 0;
 		}
 		cond_resched();
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
 	}
 
 	*biop = bio;
-	return 0;
+	return ret;
 }
 
 /**
@@ -216,6 +227,7 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 	struct bio *bio = *biop;
 	unsigned int max_write_zeroes_sectors;
 	struct request_queue *q = bdev_get_queue(bdev);
+	int ret = 0;
 
 	if (!q)
 		return -ENXIO;
@@ -246,10 +258,14 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 			nr_sects = 0;
 		}
 		cond_resched();
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
 	}
 
 	*biop = bio;
-	return 0;
+	return ret;
 }
 
 /*
@@ -273,6 +289,7 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
 	struct bio *bio = *biop;
 	int bi_size = 0;
 	unsigned int sz;
+	int ret = 0;
 
 	if (!q)
 		return -ENXIO;
@@ -296,10 +313,14 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
 				break;
 		}
 		cond_resched();
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
 	}
 
 	*biop = bio;
-	return 0;
+	return ret;
 }
 
 /**

> 
>> +	bio_put(bio);
>> +	return ret ? ret : -EINTR;
>> +}
>> +
>>  struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
>>  {
>>  	struct bio *new = bio_alloc(gfp, nr_pages);
Damien Le Moal Nov. 13, 2019, 1:54 a.m. UTC | #5
On 2019/11/12 23:48, Tetsuo Handa wrote:
[...]
>>> +static int blk_should_abort(struct bio *bio)
>>> +{
>>> +	int ret;
>>> +
>>> +	cond_resched();
>>> +	if (!fatal_signal_pending(current))
>>> +		return 0;
>>> +	ret = submit_bio_wait(bio);
>>
>> This will change the behavior of __blkdev_issue_discard() to a sync IO
>> execution instead of the current async execution since submit_bio_wait()
>> call is the responsibility of the caller (e.g. blkdev_issue_discard()).
>> Have you checked if users of __blkdev_issue_discard() are OK with that ?
>> f2fs, ext4, xfs, dm and nvme use this function.
> 
> I'm not sure...
> 
>>
>> Looking at f2fs, this does not look like it is going to work as expected
>> since the bio setup, including end_io callback, is done after this
>> function is called and a regular submit_bio() execution is being used.
> 
> Then, just breaking the iteration like below?
> nvmet_bdev_execute_write_zeroes() ignores -EINTR if "*biop = bio;" is done. Is that no problem?
> 
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -7,6 +7,7 @@
>  #include <linux/bio.h>
>  #include <linux/blkdev.h>
>  #include <linux/scatterlist.h>
> +#include <linux/sched/signal.h>
>  
>  #include "blk.h"
>  
> @@ -30,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  	struct bio *bio = *biop;
>  	unsigned int op;
>  	sector_t bs_mask;
> +	int ret = 0;
>  
>  	if (!q)
>  		return -ENXIO;
> @@ -76,10 +78,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  		 * is disabled.
>  		 */
>  		cond_resched();
> +		if (fatal_signal_pending(current)) {
> +			ret = -EINTR;
> +			break;
> +		}
>  	}
>  
>  	*biop = bio;
> -	return 0;
> +	return ret;

This will leak a bio as blkdev_issue_discard() executes the bio only in
the case "if (!ret && bio)". So that does not work as is, unless all
callers of __blkdev_issue_discard() are also changed. Same problem for
the other __blkdev_issue_xxx() functions.

Looking more into this, if an error is returned here, no bio should be
returned and we need to make sure that all started bios are also
completed. So your helper blk_should_abort() did the right thing calling
submit_bio_wait(). However, I think it would be better to immediately
fail the current loop bio instead of executing it, and then report the
-EINTR error unconditionally, regardless of the completion status of
the bios already started.

This could be done with the help of a function like this, very similar
to submit_bio_wait().

/*
 * End @bio without submitting it and wait until its completion is signaled.
 *
 * The sequence mirrors a synchronous submit-and-wait, except that
 * bio_endio() is called directly in place of a submission.
 */
void bio_chain_end_wait(struct bio *bio)
{
	/* On-stack completion, tied to the disk's lockdep map. */
	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);

	/*
	 * Install the completion and the end_io callback before ending the bio.
	 * NOTE(review): submit_bio_wait_endio() presumably completes the
	 * completion stored in bi_private -- confirm against block/bio.c,
	 * where it may not be visible from this file.
	 */
	bio->bi_private = &done;
	bio->bi_end_io = submit_bio_wait_endio;
	bio->bi_opf |= REQ_SYNC;
	/* End the bio here instead of handing it to the device. */
	bio_endio(bio);
	/* Block until the end_io callback has signaled the completion. */
	wait_for_completion_io(&done);
}

And then your helper function becomes something like this:

/*
 * Give the scheduler a chance to run and check whether the bio issuing
 * loop has to stop because the current task received a fatal signal.
 *
 * Returns 0 when the caller may keep iterating. Returns -EINTR when a
 * fatal signal is pending; in that case @bio has been ended and waited
 * for if it is flagged BIO_CHAIN, its reference has been dropped with
 * bio_put(), and the caller must not use it any more.
 */
static int blk_should_abort(struct bio *bio)
{
	cond_resched();
	if (!fatal_signal_pending(current))
		return 0;

	/* Only a chained bio has previously started bios to wait for. */
	if (bio_flagged(bio, BIO_CHAIN))
		bio_chain_end_wait(bio);
	bio_put(bio);

	/* Reported unconditionally, whatever the started bios' status was. */
	return -EINTR;
}

Thoughts ?


>  }
>  EXPORT_SYMBOL(__blkdev_issue_discard);
>  
> @@ -136,6 +142,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
>  	unsigned int max_write_same_sectors;
>  	struct bio *bio = *biop;
>  	sector_t bs_mask;
> +	int ret = 0;
>  
>  	if (!q)
>  		return -ENXIO;
> @@ -172,10 +179,14 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
>  			nr_sects = 0;
>  		}
>  		cond_resched();
> +		if (fatal_signal_pending(current)) {
> +			ret = -EINTR;
> +			break;
> +		}
>  	}
>  
>  	*biop = bio;
> -	return 0;
> +	return ret;
>  }
>  
>  /**
> @@ -216,6 +227,7 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
>  	struct bio *bio = *biop;
>  	unsigned int max_write_zeroes_sectors;
>  	struct request_queue *q = bdev_get_queue(bdev);
> +	int ret = 0;
>  
>  	if (!q)
>  		return -ENXIO;
> @@ -246,10 +258,14 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
>  			nr_sects = 0;
>  		}
>  		cond_resched();
> +		if (fatal_signal_pending(current)) {
> +			ret = -EINTR;
> +			break;
> +		}
>  	}
>  
>  	*biop = bio;
> -	return 0;
> +	return ret;
>  }
>  
>  /*
> @@ -273,6 +289,7 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
>  	struct bio *bio = *biop;
>  	int bi_size = 0;
>  	unsigned int sz;
> +	int ret = 0;
>  
>  	if (!q)
>  		return -ENXIO;
> @@ -296,10 +313,14 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
>  				break;
>  		}
>  		cond_resched();
> +		if (fatal_signal_pending(current)) {
> +			ret = -EINTR;
> +			break;
> +		}
>  	}
>  
>  	*biop = bio;
> -	return 0;
> +	return ret;
>  }
>  
>  /**
> 
>>
>>> +	bio_put(bio);
>>> +	return ret ? ret : -EINTR;
>>> +}
>>> +
>>>  struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
>>>  {
>>>  	struct bio *new = bio_alloc(gfp, nr_pages);
>
Ming Lei Nov. 13, 2019, 6:55 a.m. UTC | #6
On Wed, Nov 13, 2019 at 01:54:14AM +0000, Damien Le Moal wrote:
> On 2019/11/12 23:48, Tetsuo Handa wrote:
> [...]
> >>> +static int blk_should_abort(struct bio *bio)
> >>> +{
> >>> +	int ret;
> >>> +
> >>> +	cond_resched();
> >>> +	if (!fatal_signal_pending(current))
> >>> +		return 0;
> >>> +	ret = submit_bio_wait(bio);
> >>
> >> This will change the behavior of __blkdev_issue_discard() to a sync IO
> >> execution instead of the current async execution since submit_bio_wait()
> >> call is the responsibility of the caller (e.g. blkdev_issue_discard()).
> >> Have you checked if users of __blkdev_issue_discard() are OK with that ?
> >> f2fs, ext4, xfs, dm and nvme use this function.
> > 
> > I'm not sure...
> > 
> >>
> >> Looking at f2fs, this does not look like it is going to work as expected
> >> since the bio setup, including end_io callback, is done after this
> >> function is called and a regular submit_bio() execution is being used.
> > 
> > Then, just breaking the iteration like below?
> > nvmet_bdev_execute_write_zeroes() ignores -EINTR if "*biop = bio;" is done. Is that no problem?
> > 
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -7,6 +7,7 @@
> >  #include <linux/bio.h>
> >  #include <linux/blkdev.h>
> >  #include <linux/scatterlist.h>
> > +#include <linux/sched/signal.h>
> >  
> >  #include "blk.h"
> >  
> > @@ -30,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >  	struct bio *bio = *biop;
> >  	unsigned int op;
> >  	sector_t bs_mask;
> > +	int ret = 0;
> >  
> >  	if (!q)
> >  		return -ENXIO;
> > @@ -76,10 +78,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >  		 * is disabled.
> >  		 */
> >  		cond_resched();
> > +		if (fatal_signal_pending(current)) {
> > +			ret = -EINTR;
> > +			break;
> > +		}
> >  	}
> >  
> >  	*biop = bio;
> > -	return 0;
> > +	return ret;
> 
> This will leak a bio as blkdev_issue_discard() executes the bio only in
> the case "if (!ret && bio)". So that does not work as is, unless all
> callers of __blkdev_issue_discard() are also changed. Same problem for
> the other __blkdev_issue_xxx() functions.
> 
> Looking more into this, if an error is returned here, no bio should be
> returned and we need to make sure that all started bios are also
> completed. So your helper blk_should_abort() did the right thing calling
> submit_bio_wait(). However, I Think it would be better to fail
> immediately the current loop bio instead of executing it and then
> reporting the -EINTR error, unconditionally, regardless of what the
> started bios completion status is.
> 
> This could be done with the help of a function like this, very similar
> to submit_bio_wait().
> 
> void bio_chain_end_wait(struct bio *bio)
> {
> 	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
> 
> 	bio->bi_private = &done;
> 	bio->bi_end_io = submit_bio_wait_endio;
> 	bio->bi_opf |= REQ_SYNC;
> 	bio_endio(bio);
> 	wait_for_completion_io(&done);
> }
> 
> And then your helper function becomes something like this:
> 
> static int blk_should_abort(struct bio *bio)
> {
> 	int ret;
> 
> 	cond_resched();
> 	if (!fatal_signal_pending(current))
> 		return 0;
> 
> 	if (bio_flagged(bio, BIO_CHAIN))
> 		bio_chain_end_wait(bio);
> 	bio_put(bio);
> 
> 	return -EINTR;
> }
> 
> Thoughts ?

DISCARD request can be quite big, and any sync bio submission may cause
serious performance regression.

Not to mention that blkdev_issue_discard() may be called in a non-blocking context.

Thanks,
Ming
Damien Le Moal Nov. 13, 2019, 7:11 a.m. UTC | #7
On 2019/11/13 15:55, Ming Lei wrote:
> On Wed, Nov 13, 2019 at 01:54:14AM +0000, Damien Le Moal wrote:
>> On 2019/11/12 23:48, Tetsuo Handa wrote:
>> [...]
>>>>> +static int blk_should_abort(struct bio *bio)
>>>>> +{
>>>>> +	int ret;
>>>>> +
>>>>> +	cond_resched();
>>>>> +	if (!fatal_signal_pending(current))
>>>>> +		return 0;
>>>>> +	ret = submit_bio_wait(bio);
>>>>
>>>> This will change the behavior of __blkdev_issue_discard() to a sync IO
>>>> execution instead of the current async execution since submit_bio_wait()
>>>> call is the responsibility of the caller (e.g. blkdev_issue_discard()).
>>>> Have you checked if users of __blkdev_issue_discard() are OK with that ?
>>>> f2fs, ext4, xfs, dm and nvme use this function.
>>>
>>> I'm not sure...
>>>
>>>>
>>>> Looking at f2fs, this does not look like it is going to work as expected
>>>> since the bio setup, including end_io callback, is done after this
>>>> function is called and a regular submit_bio() execution is being used.
>>>
>>> Then, just breaking the iteration like below?
>>> nvmet_bdev_execute_write_zeroes() ignores -EINTR if "*biop = bio;" is done. Is that no problem?
>>>
>>> --- a/block/blk-lib.c
>>> +++ b/block/blk-lib.c
>>> @@ -7,6 +7,7 @@
>>>  #include <linux/bio.h>
>>>  #include <linux/blkdev.h>
>>>  #include <linux/scatterlist.h>
>>> +#include <linux/sched/signal.h>
>>>  
>>>  #include "blk.h"
>>>  
>>> @@ -30,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>>>  	struct bio *bio = *biop;
>>>  	unsigned int op;
>>>  	sector_t bs_mask;
>>> +	int ret = 0;
>>>  
>>>  	if (!q)
>>>  		return -ENXIO;
>>> @@ -76,10 +78,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>>>  		 * is disabled.
>>>  		 */
>>>  		cond_resched();
>>> +		if (fatal_signal_pending(current)) {
>>> +			ret = -EINTR;
>>> +			break;
>>> +		}
>>>  	}
>>>  
>>>  	*biop = bio;
>>> -	return 0;
>>> +	return ret;
>>
>> This will leak a bio as blkdev_issue_discard() executes the bio only in
>> the case "if (!ret && bio)". So that does not work as is, unless all
>> callers of __blkdev_issue_discard() are also changed. Same problem for
>> the other __blkdev_issue_xxx() functions.
>>
>> Looking more into this, if an error is returned here, no bio should be
>> returned and we need to make sure that all started bios are also
>> completed. So your helper blk_should_abort() did the right thing calling
>> submit_bio_wait(). However, I Think it would be better to fail
>> immediately the current loop bio instead of executing it and then
>> reporting the -EINTR error, unconditionally, regardless of what the
>> started bios completion status is.
>>
>> This could be done with the help of a function like this, very similar
>> to submit_bio_wait().
>>
>> void bio_chain_end_wait(struct bio *bio)
>> {
>> 	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
>>
>> 	bio->bi_private = &done;
>> 	bio->bi_end_io = submit_bio_wait_endio;
>> 	bio->bi_opf |= REQ_SYNC;
>> 	bio_endio(bio);
>> 	wait_for_completion_io(&done);
>> }
>>
>> And then your helper function becomes something like this:
>>
>> static int blk_should_abort(struct bio *bio)
>> {
>> 	int ret;
>>
>> 	cond_resched();
>> 	if (!fatal_signal_pending(current))
>> 		return 0;
>>
>> 	if (bio_flagged(bio, BIO_CHAIN))
>> 		bio_chain_end_wait(bio);
>> 	bio_put(bio);
>>
>> 	return -EINTR;
>> }
>>
>> Thoughts ?
> 
> DISCARD request can be quite big, and any sync bio submission may cause
> serious performance regression.

Yes indeed. But if the bio issuing loop is interrupted with discard BIOs
already issued, I do not think there is any other choice but to wait for
their completion before returning.

> Not mention blkdev_issue_discard() may be called in non-block context.

This loop is calling cond_resched(), which checks might_sleep(). So
certainly this function can block, no ?
Ming Lei Nov. 13, 2019, 7:49 a.m. UTC | #8
On Wed, Nov 13, 2019 at 07:11:36AM +0000, Damien Le Moal wrote:
> On 2019/11/13 15:55, Ming Lei wrote:
> > On Wed, Nov 13, 2019 at 01:54:14AM +0000, Damien Le Moal wrote:
> >> On 2019/11/12 23:48, Tetsuo Handa wrote:
> >> [...]
> >>>>> +static int blk_should_abort(struct bio *bio)
> >>>>> +{
> >>>>> +	int ret;
> >>>>> +
> >>>>> +	cond_resched();
> >>>>> +	if (!fatal_signal_pending(current))
> >>>>> +		return 0;
> >>>>> +	ret = submit_bio_wait(bio);
> >>>>
> >>>> This will change the behavior of __blkdev_issue_discard() to a sync IO
> >>>> execution instead of the current async execution since submit_bio_wait()
> >>>> call is the responsibility of the caller (e.g. blkdev_issue_discard()).
> >>>> Have you checked if users of __blkdev_issue_discard() are OK with that ?
> >>>> f2fs, ext4, xfs, dm and nvme use this function.
> >>>
> >>> I'm not sure...
> >>>
> >>>>
> >>>> Looking at f2fs, this does not look like it is going to work as expected
> >>>> since the bio setup, including end_io callback, is done after this
> >>>> function is called and a regular submit_bio() execution is being used.
> >>>
> >>> Then, just breaking the iteration like below?
> >>> nvmet_bdev_execute_write_zeroes() ignores -EINTR if "*biop = bio;" is done. Is that no problem?
> >>>
> >>> --- a/block/blk-lib.c
> >>> +++ b/block/blk-lib.c
> >>> @@ -7,6 +7,7 @@
> >>>  #include <linux/bio.h>
> >>>  #include <linux/blkdev.h>
> >>>  #include <linux/scatterlist.h>
> >>> +#include <linux/sched/signal.h>
> >>>  
> >>>  #include "blk.h"
> >>>  
> >>> @@ -30,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >>>  	struct bio *bio = *biop;
> >>>  	unsigned int op;
> >>>  	sector_t bs_mask;
> >>> +	int ret = 0;
> >>>  
> >>>  	if (!q)
> >>>  		return -ENXIO;
> >>> @@ -76,10 +78,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> >>>  		 * is disabled.
> >>>  		 */
> >>>  		cond_resched();
> >>> +		if (fatal_signal_pending(current)) {
> >>> +			ret = -EINTR;
> >>> +			break;
> >>> +		}
> >>>  	}
> >>>  
> >>>  	*biop = bio;
> >>> -	return 0;
> >>> +	return ret;
> >>
> >> This will leak a bio as blkdev_issue_discard() executes the bio only in
> >> the case "if (!ret && bio)". So that does not work as is, unless all
> >> callers of __blkdev_issue_discard() are also changed. Same problem for
> >> the other __blkdev_issue_xxx() functions.
> >>
> >> Looking more into this, if an error is returned here, no bio should be
> >> returned and we need to make sure that all started bios are also
> >> completed. So your helper blk_should_abort() did the right thing calling
> >> submit_bio_wait(). However, I Think it would be better to fail
> >> immediately the current loop bio instead of executing it and then
> >> reporting the -EINTR error, unconditionally, regardless of what the
> >> started bios completion status is.
> >>
> >> This could be done with the help of a function like this, very similar
> >> to submit_bio_wait().
> >>
> >> void bio_chain_end_wait(struct bio *bio)
> >> {
> >> 	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
> >>
> >> 	bio->bi_private = &done;
> >> 	bio->bi_end_io = submit_bio_wait_endio;
> >> 	bio->bi_opf |= REQ_SYNC;
> >> 	bio_endio(bio);
> >> 	wait_for_completion_io(&done);
> >> }
> >>
> >> And then your helper function becomes something like this:
> >>
> >> static int blk_should_abort(struct bio *bio)
> >> {
> >> 	int ret;
> >>
> >> 	cond_resched();
> >> 	if (!fatal_signal_pending(current))
> >> 		return 0;
> >>
> >> 	if (bio_flagged(bio, BIO_CHAIN))
> >> 		bio_chain_end_wait(bio);
> >> 	bio_put(bio);
> >>
> >> 	return -EINTR;
> >> }
> >>
> >> Thoughts ?
> > 
> > DISCARD request can be quite big, and any sync bio submission may cause
> > serious performance regression.
> 
> Yes indeed. But if the bio issuing loop is interrupted with discard BIOs
> already issued, I do not think there is any other choice but to wait for
> their completion before returning.

Looks like I missed the check on fatal_signal_pending(); in that case this
approach seems fine.

> 
> > Not mention blkdev_issue_discard() may be called in non-block context.
> 
> This loop is calling cond_resched(), which checks might_sleep(). So
> certainly this function can block, no ?

Indeed, it looks like I misunderstood it.

Thanks,
Ming
Tetsuo Handa Nov. 15, 2019, 10:05 a.m. UTC | #9
On 2019/11/13 10:54, Damien Le Moal wrote:
> On 2019/11/12 23:48, Tetsuo Handa wrote:
> [...]
>>>> +static int blk_should_abort(struct bio *bio)
>>>> +{
>>>> +	int ret;
>>>> +
>>>> +	cond_resched();
>>>> +	if (!fatal_signal_pending(current))
>>>> +		return 0;
>>>> +	ret = submit_bio_wait(bio);
>>>
>>> This will change the behavior of __blkdev_issue_discard() to a sync IO
>>> execution instead of the current async execution since submit_bio_wait()
>>> call is the responsibility of the caller (e.g. blkdev_issue_discard()).
>>> Have you checked if users of __blkdev_issue_discard() are OK with that ?
>>> f2fs, ext4, xfs, dm and nvme use this function.
>>
>> I'm not sure...
>>
>>>
>>> Looking at f2fs, this does not look like it is going to work as expected
>>> since the bio setup, including end_io callback, is done after this
>>> function is called and a regular submit_bio() execution is being used.
>>
>> Then, just breaking the iteration like below?
>> nvmet_bdev_execute_write_zeroes() ignores -EINTR if "*biop = bio;" is done. Is that no problem?
>>
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -7,6 +7,7 @@
>>  #include <linux/bio.h>
>>  #include <linux/blkdev.h>
>>  #include <linux/scatterlist.h>
>> +#include <linux/sched/signal.h>
>>  
>>  #include "blk.h"
>>  
>> @@ -30,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>>  	struct bio *bio = *biop;
>>  	unsigned int op;
>>  	sector_t bs_mask;
>> +	int ret = 0;
>>  
>>  	if (!q)
>>  		return -ENXIO;
>> @@ -76,10 +78,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>>  		 * is disabled.
>>  		 */
>>  		cond_resched();
>> +		if (fatal_signal_pending(current)) {
>> +			ret = -EINTR;
>> +			break;
>> +		}
>>  	}
>>  
>>  	*biop = bio;
>> -	return 0;
>> +	return ret;
> 
> This will leak a bio as blkdev_issue_discard() executes the bio only in
> the case "if (!ret && bio)". So that does not work as is, unless all
> callers of __blkdev_issue_discard() are also changed. Same problem for
> the other __blkdev_issue_xxx() functions.
> 
> Looking more into this, if an error is returned here, no bio should be
> returned and we need to make sure that all started bios are also
> completed. So your helper blk_should_abort() did the right thing calling
> submit_bio_wait(). However, I Think it would be better to fail
> immediately the current loop bio instead of executing it and then
> reporting the -EINTR error, unconditionally, regardless of what the
> started bios completion status is.
> 
> This could be done with the help of a function like this, very similar
> to submit_bio_wait().
> 
> void bio_chain_end_wait(struct bio *bio)
> {
> 	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
> 
> 	bio->bi_private = &done;
> 	bio->bi_end_io = submit_bio_wait_endio;
> 	bio->bi_opf |= REQ_SYNC;
> 	bio_endio(bio);
> 	wait_for_completion_io(&done);
> }
> 
> And then your helper function becomes something like this:
> 
> static int blk_should_abort(struct bio *bio)
> {
> 	int ret;
> 
> 	cond_resched();
> 	if (!fatal_signal_pending(current))
> 		return 0;
> 
> 	if (bio_flagged(bio, BIO_CHAIN))
> 		bio_chain_end_wait(bio);

I don't know much about the block layer, but I feel this is bad because bio_put()
will be called without submit_bio_wait() when bio_flagged() == false.
Who calls submit_bio_wait() if bio_flagged() == false ?

> 	bio_put(bio);
> 
> 	return -EINTR;
> }
> 
> Thoughts ?
>
Damien Le Moal Nov. 18, 2019, 12:02 a.m. UTC | #10
On 2019/11/15 19:05, Tetsuo Handa wrote:
> On 2019/11/13 10:54, Damien Le Moal wrote:
>> On 2019/11/12 23:48, Tetsuo Handa wrote:
>> [...]
>>>>> +static int blk_should_abort(struct bio *bio)
>>>>> +{
>>>>> +	int ret;
>>>>> +
>>>>> +	cond_resched();
>>>>> +	if (!fatal_signal_pending(current))
>>>>> +		return 0;
>>>>> +	ret = submit_bio_wait(bio);
>>>>
>>>> This will change the behavior of __blkdev_issue_discard() to a sync IO
>>>> execution instead of the current async execution since submit_bio_wait()
>>>> call is the responsibility of the caller (e.g. blkdev_issue_discard()).
>>>> Have you checked if users of __blkdev_issue_discard() are OK with that ?
>>>> f2fs, ext4, xfs, dm and nvme use this function.
>>>
>>> I'm not sure...
>>>
>>>>
>>>> Looking at f2fs, this does not look like it is going to work as expected
>>>> since the bio setup, including end_io callback, is done after this
>>>> function is called and a regular submit_bio() execution is being used.
>>>
>>> Then, just breaking the iteration like below?
>>> nvmet_bdev_execute_write_zeroes() ignores -EINTR if "*biop = bio;" is done. Is that no problem?
>>>
>>> --- a/block/blk-lib.c
>>> +++ b/block/blk-lib.c
>>> @@ -7,6 +7,7 @@
>>>  #include <linux/bio.h>
>>>  #include <linux/blkdev.h>
>>>  #include <linux/scatterlist.h>
>>> +#include <linux/sched/signal.h>
>>>  
>>>  #include "blk.h"
>>>  
>>> @@ -30,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>>>  	struct bio *bio = *biop;
>>>  	unsigned int op;
>>>  	sector_t bs_mask;
>>> +	int ret = 0;
>>>  
>>>  	if (!q)
>>>  		return -ENXIO;
>>> @@ -76,10 +78,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>>>  		 * is disabled.
>>>  		 */
>>>  		cond_resched();
>>> +		if (fatal_signal_pending(current)) {
>>> +			ret = -EINTR;
>>> +			break;
>>> +		}
>>>  	}
>>>  
>>>  	*biop = bio;
>>> -	return 0;
>>> +	return ret;
>>
>> This will leak a bio as blkdev_issue_discard() executes the bio only in
>> the case "if (!ret && bio)". So that does not work as is, unless all
>> callers of __blkdev_issue_discard() are also changed. Same problem for
>> the other __blkdev_issue_xxx() functions.
>>
>> Looking more into this, if an error is returned here, no bio should be
>> returned and we need to make sure that all started bios are also
>> completed. So your helper blk_should_abort() did the right thing calling
>> submit_bio_wait(). However, I Think it would be better to fail
>> immediately the current loop bio instead of executing it and then
>> reporting the -EINTR error, unconditionally, regardless of what the
>> started bios completion status is.
>>
>> This could be done with the help of a function like this, very similar
>> to submit_bio_wait().
>>
>> void bio_chain_end_wait(struct bio *bio)
>> {
>> 	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
>>
>> 	bio->bi_private = &done;
>> 	bio->bi_end_io = submit_bio_wait_endio;
>> 	bio->bi_opf |= REQ_SYNC;
>> 	bio_endio(bio);
>> 	wait_for_completion_io(&done);
>> }
>>
>> And then your helper function becomes something like this:
>>
>> static int blk_should_abort(struct bio *bio)
>> {
>> 	int ret;
>>
>> 	cond_resched();
>> 	if (!fatal_signal_pending(current))
>> 		return 0;
>>
>> 	if (bio_flagged(bio, BIO_CHAIN))
>> 		bio_chain_end_wait(bio);
> 
> I don't know about block layer, but I feel this is bad because bio_put()
> will be called without submit_bio_wait() when bio_flagged() == false.
> Who calls submit_bio_wait() if bio_flagged() == false ?

If the BIO is not flagged, then it is not chained, so it does not need
to be executed at all and can be dropped (freed) right away with
bio_put(). There is no need to execute it (and in fact we do not want to).

In the other case, where the BIO is flagged, it is chained, which means
that previous BIOs were already started by the submit_bio() call in
blk_next_bio(). In this case, the current BIO is still *not* executed at all
and bio_endio() is called for it instead of submit_bio_wait(). But since
bio_endio() is called after setting:

bio->bi_end_io = submit_bio_wait_endio;

the bio_endio() call has the same effect as the completion of the bio if
it were executed: the previous chained BIO completion is waited for.

> 
>> 	bio_put(bio);
>>
>> 	return -EINTR;
>> }
>>
>> Thoughts ?
>>
>
diff mbox series

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 5f2c429..6ca7cae 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -7,9 +7,22 @@ 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/scatterlist.h>
+#include <linux/sched/signal.h>
 
 #include "blk.h"
 
+static int blk_should_abort(struct bio *bio)
+{
+	int ret;
+
+	cond_resched();
+	if (!fatal_signal_pending(current))
+		return 0;
+	ret = submit_bio_wait(bio);
+	bio_put(bio);
+	return ret ? ret : -EINTR;
+}
+
 struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
 {
 	struct bio *new = bio_alloc(gfp, nr_pages);
@@ -55,6 +68,7 @@  int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		return -EINVAL;
 
 	while (nr_sects) {
+		int ret;
 		sector_t req_sects = min_t(sector_t, nr_sects,
 				bio_allowed_max_sectors(q));
 
@@ -75,7 +89,11 @@  int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		 * us to schedule out to avoid softlocking if preempt
 		 * is disabled.
 		 */
-		cond_resched();
+		ret = blk_should_abort(bio);
+		if (ret) {
+			*biop = NULL;
+			return ret;
+		}
 	}
 
 	*biop = bio;
@@ -154,6 +172,8 @@  static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	max_write_same_sectors = bio_allowed_max_sectors(q);
 
 	while (nr_sects) {
+		int ret;
+
 		bio = blk_next_bio(bio, 1, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio_set_dev(bio, bdev);
@@ -171,7 +191,11 @@  static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 			bio->bi_iter.bi_size = nr_sects << 9;
 			nr_sects = 0;
 		}
-		cond_resched();
+		ret = blk_should_abort(bio);
+		if (ret) {
+			*biop = NULL;
+			return ret;
+		}
 	}
 
 	*biop = bio;
@@ -230,6 +254,8 @@  static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		return -EOPNOTSUPP;
 
 	while (nr_sects) {
+		int ret;
+
 		bio = blk_next_bio(bio, 0, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio_set_dev(bio, bdev);
@@ -245,7 +271,11 @@  static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 			bio->bi_iter.bi_size = nr_sects << 9;
 			nr_sects = 0;
 		}
-		cond_resched();
+		ret = blk_should_abort(bio);
+		if (ret) {
+			*biop = NULL;
+			return ret;
+		}
 	}
 
 	*biop = bio;
@@ -281,6 +311,8 @@  static int __blkdev_issue_zero_pages(struct block_device *bdev,
 		return -EPERM;
 
 	while (nr_sects != 0) {
+		int ret;
+
 		bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
 				   gfp_mask);
 		bio->bi_iter.bi_sector = sector;
@@ -295,7 +327,11 @@  static int __blkdev_issue_zero_pages(struct block_device *bdev,
 			if (bi_size < sz)
 				break;
 		}
-		cond_resched();
+		ret = blk_should_abort(bio);
+		if (ret) {
+			*biop = NULL;
+			return ret;
+		}
 	}
 
 	*biop = bio;