diff mbox series

[37/44] block: convert to advancing variants of iov_iter_get_pages{,_alloc}()

Message ID 20220622041552.737754-37-viro@zeniv.linux.org.uk (mailing list archive)
State New, archived
Headers show
Series [01/44] 9p: handling Rerror without copy_from_iter_full() | expand

Commit Message

Al Viro June 22, 2022, 4:15 a.m. UTC
... doing revert if we end up not using some pages

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/bio.c     | 15 ++++++---------
 block/blk-map.c |  7 ++++---
 2 files changed, 10 insertions(+), 12 deletions(-)

Comments

Jeff Layton June 28, 2022, 12:16 p.m. UTC | #1
On Wed, 2022-06-22 at 05:15 +0100, Al Viro wrote:
> ... doing revert if we end up not using some pages
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
>  block/bio.c     | 15 ++++++---------
>  block/blk-map.c |  7 ++++---
>  2 files changed, 10 insertions(+), 12 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 51c99f2c5c90..01ab683e67be 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1190,7 +1190,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
>  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
>  
> -	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> +	size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset);
>  	if (unlikely(size <= 0))
>  		return size ? size : -EFAULT;
>  
> @@ -1205,6 +1205,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  		} else {
>  			if (WARN_ON_ONCE(bio_full(bio, len))) {
>  				bio_put_pages(pages + i, left, offset);
> +				iov_iter_revert(iter, left);
>  				return -EINVAL;
>  			}
>  			__bio_add_page(bio, page, len, offset);
> @@ -1212,7 +1213,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  		offset = 0;
>  	}
>  
> -	iov_iter_advance(iter, size);
>  	return 0;
>  }
>  
> @@ -1227,7 +1227,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
>  	ssize_t size, left;
>  	unsigned len, i;
>  	size_t offset;
> -	int ret = 0;
>  
>  	if (WARN_ON_ONCE(!max_append_sectors))
>  		return 0;
> @@ -1240,7 +1239,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
>  	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
>  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
>  
> -	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> +	size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset);
>  	if (unlikely(size <= 0))
>  		return size ? size : -EFAULT;
>  
> @@ -1252,16 +1251,14 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
>  		if (bio_add_hw_page(q, bio, page, len, offset,
>  				max_append_sectors, &same_page) != len) {
>  			bio_put_pages(pages + i, left, offset);
> -			ret = -EINVAL;
> -			break;
> +			iov_iter_revert(iter, left);
> +			return -EINVAL;
>  		}
>  		if (same_page)
>  			put_page(page);
>  		offset = 0;
>  	}
> -
> -	iov_iter_advance(iter, size - left);
> -	return ret;
> +	return 0;
>  }
>  
>  /**
> diff --git a/block/blk-map.c b/block/blk-map.c
> index df8b066cd548..7196a6b64c80 100644
> --- a/block/blk-map.c
> +++ b/block/blk-map.c
> @@ -254,7 +254,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
>  		size_t offs, added = 0;
>  		int npages;
>  
> -		bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
> +		bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs);
>  		if (unlikely(bytes <= 0)) {
>  			ret = bytes ? bytes : -EFAULT;
>  			goto out_unmap;
> @@ -284,7 +284,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
>  				bytes -= n;
>  				offs = 0;
>  			}
> -			iov_iter_advance(iter, added);
>  		}
>  		/*
>  		 * release the pages we didn't map into the bio, if any
> @@ -293,8 +292,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
>  			put_page(pages[j++]);
>  		kvfree(pages);
>  		/* couldn't stuff something into bio? */
> -		if (bytes)
> +		if (bytes) {
> +			iov_iter_revert(iter, bytes);
>  			break;
> +		}
>  	}
>  
>  	ret = blk_rq_append_bio(rq, bio);

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Al Viro June 30, 2022, 10:11 p.m. UTC | #2
On Wed, Jun 22, 2022 at 05:15:45AM +0100, Al Viro wrote:
> ... doing revert if we end up not using some pages
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

... and the first half of that thing conflicts with "block: relax direct
io memory alignment" in -next...

Joy.  It's not hard to redo on top of the commit in there; the
question is, how to deal with conflicts?

I can do a backmerge, provided that there's a sane tag or branch to
backmerge from.  Another fun (if trivial) issue in the same series
is around "iov: introduce iov_iter_aligned" (two commits prior).

Jens, Keith, do you have any suggestions?  AFAICS, variants include
	* tag or branch covering b1a000d3b8ec582da64bb644be633e5a0beffcbf
(I'd rather not grab the entire for-5.20/block for obvious reasons)
It sits in the beginning of for-5.20/block, so that should be fairly
straightforward, provided that you are not going to do rebases there.
If you are, could you put that stuff into an invariant branch, so
I'd just pull it?
	* feeding the entire iov_iter pile through block.git;
bad idea, IMO, seeing that it contains a lot of stuff far from
anything block-related. 
	* doing a manual conflict resolution on top of my branch
and pushing that out.  Would get rid of the problem from -next, but
Linus hates that kind of stuff, AFAIK, and with good reasons.

	I would prefer the first variant (and that's what I'm
going to do locally for now - just
git tag keith_stuff bf8d08532bc19a14cfb54ae61099dccadefca446
and backmerge from it), but if you would prefer to deal with that
differently - please tell.
Al Viro June 30, 2022, 10:39 p.m. UTC | #3
On Thu, Jun 30, 2022 at 11:11:27PM +0100, Al Viro wrote:

> ... and the first half of that thing conflicts with "block: relax direct
> io memory alignment" in -next...

BTW, looking at that commit - are you sure that bio_put_pages() on failure
exit will do the right thing?  We have grabbed a bunch of page references;
the amount is DIV_ROUND_UP(offset + size, PAGE_SIZE).  And that's before
your
                size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
in there.  IMO the following would be more obviously correct:
        size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
        if (unlikely(size <= 0))
                return size ? size : -EFAULT;

	nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE);
	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));

        for (left = size, i = 0; left > 0; left -= len, i++) {
...
                if (ret) {
			while (i < nr_pages)
				put_page(pages[i++]);
                        return ret;
                }
...

and get rid of bio_put_pages() entirely.  Objections?
Jens Axboe June 30, 2022, 11:07 p.m. UTC | #4
On 6/30/22 4:11 PM, Al Viro wrote:
> On Wed, Jun 22, 2022 at 05:15:45AM +0100, Al Viro wrote:
>> ... doing revert if we end up not using some pages
>>
>> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> 
> ... and the first half of that thing conflicts with "block: relax direct
> io memory alignment" in -next...
> 
> Joy.  It's not hard to redo on top of the commit in there; the
> question is, how to deal with conflicts?
> 
> I can do a backmerge, provided that there's a sane tag or branch to
> backmerge from.  Another fun (if trivial) issue in the same series
> is around "iov: introduce iov_iter_aligned" (two commits prior).
> 
> Jens, Keith, do you have any suggestions?  AFAICS, variants include
> 	* tag or branch covering b1a000d3b8ec582da64bb644be633e5a0beffcbf
> (I'd rather not grab the entire for-5.20/block for obvious reasons)
> It sits in the beginning of for-5.20/block, so that should be fairly
> straightforward, provided that you are not going to do rebases there.
> If you are, could you put that stuff into an invariant branch, so
> I'd just pull it?
> 	* feeding the entire iov_iter pile through block.git;
> bad idea, IMO, seeing that it contains a lot of stuff far from
> anything block-related. 
> 	* doing a manual conflict resolution on top of my branch
> and pushing that out.  Would get rid of the problem from -next, but
> Linus hates that kind of stuff, AFAIK, and with good reasons.
> 
> 	I would prefer the first variant (and that's what I'm
> going to do locally for now - just
> git tag keith_stuff bf8d08532bc19a14cfb54ae61099dccadefca446
> and backmerge from it), but if you would prefer to deal with that
> differently - please tell.

I'm not going to rebase it, and I can create a tag for that commit for
you. Done, it's block-5.20-al. I did the former commit, or we can move
the tag so it includes bf8d08532bc19a14cfb54ae61099dccadefca446? That'd
be the whole series of that patchset, which is just that one extra
patch.
Keith Busch July 1, 2022, 2:07 a.m. UTC | #5
On Thu, Jun 30, 2022 at 11:39:36PM +0100, Al Viro wrote:
> On Thu, Jun 30, 2022 at 11:11:27PM +0100, Al Viro wrote:
> 
> > ... and the first half of that thing conflicts with "block: relax direct
> > io memory alignment" in -next...
> 
> BTW, looking at that commit - are you sure that bio_put_pages() on failure
> exit will do the right thing?  We have grabbed a bunch of page references;
> the amount is DIV_ROUND_UP(offset + size, PAGE_SIZE).  And that's before
> your
>                 size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));

Thanks for the catch, it does look like a page reference could get leaked here.

> in there.  IMO the following would be more obviously correct:
>         size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
>         if (unlikely(size <= 0))
>                 return size ? size : -EFAULT;
> 
> 	nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE);
> 	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> 
>         for (left = size, i = 0; left > 0; left -= len, i++) {
> ...
>                 if (ret) {
> 			while (i < nr_pages)
> 				put_page(pages[i++]);
>                         return ret;
>                 }
> ...
> 
> and get rid of bio_put_pages() entirely.  Objections?


I think that makes sense. I'll give your idea a test run tomorrow.
Al Viro July 1, 2022, 5:40 p.m. UTC | #6
On Thu, Jun 30, 2022 at 08:07:24PM -0600, Keith Busch wrote:
> On Thu, Jun 30, 2022 at 11:39:36PM +0100, Al Viro wrote:
> > On Thu, Jun 30, 2022 at 11:11:27PM +0100, Al Viro wrote:
> > 
> > > ... and the first half of that thing conflicts with "block: relax direct
> > > io memory alignment" in -next...
> > 
> > BTW, looking at that commit - are you sure that bio_put_pages() on failure
> > exit will do the right thing?  We have grabbed a bunch of page references;
> > the amount is DIV_ROUND_UP(offset + size, PAGE_SIZE).  And that's before
> > your
> >                 size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> 
> Thanks for the catch, it does look like a page reference could get leaked here.
> 
> > in there.  IMO the following would be more obviously correct:
> >         size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> >         if (unlikely(size <= 0))
> >                 return size ? size : -EFAULT;
> > 
> > 	nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE);
> > 	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > 
> >         for (left = size, i = 0; left > 0; left -= len, i++) {
> > ...
> >                 if (ret) {
> > 			while (i < nr_pages)
> > 				put_page(pages[i++]);
> >                         return ret;
> >                 }
> > ...
> > 
> > and get rid of bio_put_pages() entirely.  Objections?
> 
> 
> I think that makes sense. I'll give your idea a test run tomorrow.

See vfs.git#block-fixes, along with #work.iov_iter_get_pages-3 in there.
Seems to work here...

If you are OK with #block-fixes (it's one commit on top of
bf8d08532bc1 "iomap: add support for dma aligned direct-io" in
block.git), the easiest way to deal with the conflicts would be
to have that branch pulled into block.git.  Jens, would you be
OK with that in terms of tree topology?  Provided that patch
itself looks sane to you, of course...

FWOW, the patch in question is
commit 863965bb7e52997851af3a107ec3e4d8c7050cbd
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Fri Jul 1 13:15:36 2022 -0400

    __bio_iov_iter_get_pages(): make sure we don't leak page refs on failure
    
    Calculate the number of pages we'd grabbed before trimming size down.
    And don't bother with bio_put_pages() - an explicit cleanup loop is
    easier to follow...
    
    Fixes: b1a000d3b8ec "block: relax direct io memory alignment"
    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/block/bio.c b/block/bio.c
index 933ea3210954..59be4eca1192 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1151,14 +1151,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static void bio_put_pages(struct page **pages, size_t size, size_t off)
-{
-	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
-
-	for (i = 0; i < nr; i++)
-		put_page(pages[i]);
-}
-
 static int bio_iov_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int offset)
 {
@@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	 * the iov data will be picked up in the next bio iteration.
 	 */
 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (size > 0)
-		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
+	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
 
+	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
 		int ret;
@@ -1245,7 +1237,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 			ret = bio_iov_add_page(bio, page, len, offset);
 
 		if (ret) {
-			bio_put_pages(pages + i, left, offset);
+			while (i < nr_pages)
+				put_page(pages[i++]);
 			return ret;
 		}
 		offset = 0;
Keith Busch July 1, 2022, 5:53 p.m. UTC | #7
On Fri, Jul 01, 2022 at 06:40:40PM +0100, Al Viro wrote:
> -static void bio_put_pages(struct page **pages, size_t size, size_t off)
> -{
> -	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
> -
> -	for (i = 0; i < nr; i++)
> -		put_page(pages[i]);
> -}
> -
>  static int bio_iov_add_page(struct bio *bio, struct page *page,
>  		unsigned int len, unsigned int offset)
>  {
> @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	 * the iov data will be picked up in the next bio iteration.
>  	 */
>  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> -	if (size > 0)
> -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
>  	if (unlikely(size <= 0))
>  		return size ? size : -EFAULT;
> +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
>  
> +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));

This isn't quite right. The result of the ALIGN_DOWN could be 0, so whatever
page we got before would be leaked since unused pages are only released on an
add_page error. I was about to reply with a patch that fixes this, but here's
the one that I'm currently testing:

---
diff --git a/block/bio.c b/block/bio.c
index 933ea3210954..c4a1ce39c65c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1151,14 +1151,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static void bio_put_pages(struct page **pages, size_t size, size_t off)
-{
-	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
-
-	for (i = 0; i < nr; i++)
-		put_page(pages[i]);
-}
-
 static int bio_iov_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int offset)
 {
@@ -1208,9 +1200,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
+	unsigned len, i = 0;
 	ssize_t size, left;
-	unsigned len, i;
 	size_t offset;
+	int ret;
 
 	/*
 	 * Move page array up in the allocated memory for the bio vecs as far as
@@ -1228,14 +1221,19 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	 * the iov data will be picked up in the next bio iteration.
 	 */
 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (size > 0)
+	if (size > 0) {
+		nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE);
 		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
+	} else
+		nr_pages = 0;
+
+	if (unlikely(size <= 0)) {
+		ret = size ? size : -EFAULT;
+		goto out;
+	}
 
-	for (left = size, i = 0; left > 0; left -= len, i++) {
+	for (left = size; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
-		int ret;
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
 		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
@@ -1244,15 +1242,19 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		else
 			ret = bio_iov_add_page(bio, page, len, offset);
 
-		if (ret) {
-			bio_put_pages(pages + i, left, offset);
-			return ret;
-		}
+		if (ret)
+			goto out;
 		offset = 0;
 	}
 
 	iov_iter_advance(iter, size);
-	return 0;
+out:
+	while (i < nr_pages) {
+		put_page(pages[i]);
+		i++;
+	}
+
+	return ret;
 }
 
 /**
--
Al Viro July 1, 2022, 6:07 p.m. UTC | #8
On Fri, Jul 01, 2022 at 11:53:44AM -0600, Keith Busch wrote:
> On Fri, Jul 01, 2022 at 06:40:40PM +0100, Al Viro wrote:
> > -static void bio_put_pages(struct page **pages, size_t size, size_t off)
> > -{
> > -	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
> > -
> > -	for (i = 0; i < nr; i++)
> > -		put_page(pages[i]);
> > -}
> > -
> >  static int bio_iov_add_page(struct bio *bio, struct page *page,
> >  		unsigned int len, unsigned int offset)
> >  {
> > @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> >  	 * the iov data will be picked up in the next bio iteration.
> >  	 */
> >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > -	if (size > 0)
> > -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> >  	if (unlikely(size <= 0))
> >  		return size ? size : -EFAULT;
> > +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> >  
> > +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> 
> This isn't quite right. The result of the ALIGN_DOWN could be 0, so whatever
> page we got before would be leaked since unused pages are only released on an
> add_page error. I was about to reply with a patch that fixes this, but here's
> the one that I'm currently testing:

AFAICS, result is broken; you might end up consuming some data and leaving
iterator not advanced at all.  With no way for the caller to tell which way it
went.
Al Viro July 1, 2022, 6:12 p.m. UTC | #9
On Fri, Jul 01, 2022 at 07:07:45PM +0100, Al Viro wrote:
> On Fri, Jul 01, 2022 at 11:53:44AM -0600, Keith Busch wrote:
> > On Fri, Jul 01, 2022 at 06:40:40PM +0100, Al Viro wrote:
> > > -static void bio_put_pages(struct page **pages, size_t size, size_t off)
> > > -{
> > > -	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
> > > -
> > > -	for (i = 0; i < nr; i++)
> > > -		put_page(pages[i]);
> > > -}
> > > -
> > >  static int bio_iov_add_page(struct bio *bio, struct page *page,
> > >  		unsigned int len, unsigned int offset)
> > >  {
> > > @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> > >  	 * the iov data will be picked up in the next bio iteration.
> > >  	 */
> > >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > > -	if (size > 0)
> > > -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > >  	if (unlikely(size <= 0))
> > >  		return size ? size : -EFAULT;
> > > +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> > >  
> > > +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > 
> > This isn't quite right. The result of the ALIGN_DOWN could be 0, so whatever
> > page we got before would be leaked since unused pages are only released on an
> > add_page error. I was about to reply with a patch that fixes this, but here's
> > the one that I'm currently testing:
> 
> AFAICS, result is broken; you might end up consuming some data and leaving
> iterator not advanced at all.  With no way for the caller to tell which way it
> went.

How about the following?

commit 5e3e9769404de54734c110b2040bdb93593e0f1b
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Fri Jul 1 13:15:36 2022 -0400

    __bio_iov_iter_get_pages(): make sure we don't leak page refs on failure
    
    Calculate the number of pages we'd grabbed before trimming size down.
    And don't bother with bio_put_pages() - an explicit cleanup loop is
    easier to follow...
    
    Fixes: b1a000d3b8ec "block: relax direct io memory alignment"
    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/block/bio.c b/block/bio.c
index 933ea3210954..a9fe20cb71fe 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1151,14 +1151,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static void bio_put_pages(struct page **pages, size_t size, size_t off)
-{
-	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
-
-	for (i = 0; i < nr; i++)
-		put_page(pages[i]);
-}
-
 static int bio_iov_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int offset)
 {
@@ -1211,6 +1203,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	ssize_t size, left;
 	unsigned len, i;
 	size_t offset;
+	int ret;
 
 	/*
 	 * Move page array up in the allocated memory for the bio vecs as far as
@@ -1228,14 +1221,13 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	 * the iov data will be picked up in the next bio iteration.
 	 */
 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (size > 0)
-		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
+	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
 
+	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
-		int ret;
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
 		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
@@ -1244,15 +1236,15 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		else
 			ret = bio_iov_add_page(bio, page, len, offset);
 
-		if (ret) {
-			bio_put_pages(pages + i, left, offset);
-			return ret;
-		}
+		if (ret)
+			break;
 		offset = 0;
 	}
+	while (i < nr_pages)
+		put_page(pages[i++]);
 
-	iov_iter_advance(iter, size);
-	return 0;
+	iov_iter_advance(iter, size - left);
+	return ret;
 }
 
 /**
Keith Busch July 1, 2022, 6:38 p.m. UTC | #10
On Fri, Jul 01, 2022 at 07:12:17PM +0100, Al Viro wrote:
> On Fri, Jul 01, 2022 at 07:07:45PM +0100, Al Viro wrote:
> > > page we got before would be leaked since unused pages are only released on an
> > > add_page error. I was about to reply with a patch that fixes this, but here's
> > > the one that I'm currently testing:
> > 
> > AFAICS, result is broken; you might end up consuming some data and leaving
> > iterator not advanced at all.  With no way for the caller to tell which way it
> > went.

I think I see what you mean, though the issue with a non-advancing iterator on
a partially filled bio was happening prior to this patch.

> How about the following?

This looks close to what I was about to respond with. Just a couple issues
below:

> -static void bio_put_pages(struct page **pages, size_t size, size_t off)
> -{
> -	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
> -
> -	for (i = 0; i < nr; i++)
> -		put_page(pages[i]);
> -}
> -
>  static int bio_iov_add_page(struct bio *bio, struct page *page,
>  		unsigned int len, unsigned int offset)
>  {
> @@ -1211,6 +1203,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	ssize_t size, left;
>  	unsigned len, i;
>  	size_t offset;
> +	int ret;

'ret' might never be initialized if 'size' aligns down to 0.

>  	/*
>  	 * Move page array up in the allocated memory for the bio vecs as far as
> @@ -1228,14 +1221,13 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	 * the iov data will be picked up in the next bio iteration.
>  	 */
>  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> -	if (size > 0)
> -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
>  	if (unlikely(size <= 0))
>  		return size ? size : -EFAULT;
> +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
>  
> +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));

We still need to return EFAULT if size becomes 0 because that's the only way
bio_iov_iter_get_pages()'s loop will break out in this condition.
Keith Busch July 1, 2022, 7:05 p.m. UTC | #11
On Fri, Jul 01, 2022 at 07:07:45PM +0100, Al Viro wrote:
> On Fri, Jul 01, 2022 at 11:53:44AM -0600, Keith Busch wrote:
> > On Fri, Jul 01, 2022 at 06:40:40PM +0100, Al Viro wrote:
> > > -static void bio_put_pages(struct page **pages, size_t size, size_t off)
> > > -{
> > > -	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
> > > -
> > > -	for (i = 0; i < nr; i++)
> > > -		put_page(pages[i]);
> > > -}
> > > -
> > >  static int bio_iov_add_page(struct bio *bio, struct page *page,
> > >  		unsigned int len, unsigned int offset)
> > >  {
> > > @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> > >  	 * the iov data will be picked up in the next bio iteration.
> > >  	 */
> > >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > > -	if (size > 0)
> > > -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > >  	if (unlikely(size <= 0))
> > >  		return size ? size : -EFAULT;
> > > +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> > >  
> > > +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > 
> > This isn't quite right. The result of the ALIGN_DOWN could be 0, so whatever
> > page we got before would be leaked since unused pages are only released on an
> > add_page error. I was about to reply with a patch that fixes this, but here's
> > the one that I'm currently testing:
> 
> AFAICS, result is broken; you might end up consuming some data and leaving
> iterator not advanced at all.  With no way for the caller to tell which way it
> went.

Looks like the possibility of a non-advancing iterator goes all the way back to
the below commit.

  commit 576ed9135489c723fb39b97c4e2c73428d06dd78
  Author: Christoph Hellwig <hch@lst.de>
  Date:   Thu Sep 20 08:28:21 2018 +0200

      block: use bio_add_page in bio_iov_iter_get_pages

I guess the error condition never occurred, and nor should it if the bio's
available vectors is accounted for correctly.
Al Viro July 1, 2022, 7:08 p.m. UTC | #12
On Fri, Jul 01, 2022 at 12:38:36PM -0600, Keith Busch wrote:
> On Fri, Jul 01, 2022 at 07:12:17PM +0100, Al Viro wrote:
> > On Fri, Jul 01, 2022 at 07:07:45PM +0100, Al Viro wrote:
> > > > page we got before would be leaked since unused pages are only released on an
> > > > add_page error. I was about to reply with a patch that fixes this, but here's
> > > > the one that I'm currently testing:
> > > 
> > > AFAICS, result is broken; you might end up consuming some data and leaving
> > > iterator not advanced at all.  With no way for the caller to tell which way it
> > > went.
> 
> I think I see what you mean, though the issue with a non-advancing iterator on
> a partially filled bio was happening prior to this patch.
> 
> > How about the following?
> 
> This looks close to what I was about to respond with. Just a couple issues
> below:
> 
> > -static void bio_put_pages(struct page **pages, size_t size, size_t off)
> > -{
> > -	size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
> > -
> > -	for (i = 0; i < nr; i++)
> > -		put_page(pages[i]);
> > -}
> > -
> >  static int bio_iov_add_page(struct bio *bio, struct page *page,
> >  		unsigned int len, unsigned int offset)
> >  {
> > @@ -1211,6 +1203,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> >  	ssize_t size, left;
> >  	unsigned len, i;
> >  	size_t offset;
> > +	int ret;
> 
> 'ret' might never be initialized if 'size' aligns down to 0.

Point.

> >  	/*
> >  	 * Move page array up in the allocated memory for the bio vecs as far as
> > @@ -1228,14 +1221,13 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> >  	 * the iov data will be picked up in the next bio iteration.
> >  	 */
> >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > -	if (size > 0)
> > -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> >  	if (unlikely(size <= 0))
> >  		return size ? size : -EFAULT;
> > +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> >  
> > +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> 
> We still need to return EFAULT if size becomes 0 because that's the only way
> bio_iov_iter_get_pages()'s loop will break out in this condition.

I really don't like these calling conventions ;-/

What do you want to happen if we have that align-down to reduce size?
IOW, what should be the state after that loop in the caller?
Keith Busch July 1, 2022, 7:28 p.m. UTC | #13
On Fri, Jul 01, 2022 at 08:08:37PM +0100, Al Viro wrote:
> On Fri, Jul 01, 2022 at 12:38:36PM -0600, Keith Busch wrote:
> > >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > > -	if (size > 0)
> > > -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > >  	if (unlikely(size <= 0))
> > >  		return size ? size : -EFAULT;
> > > +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> > >  
> > > +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > 
> > We still need to return EFAULT if size becomes 0 because that's the only way
> > bio_iov_iter_get_pages()'s loop will break out in this condition.
> 
> I really don't like these calling conventions ;-/

No argument here; I'm just working in the space as I found it. :)
 
> What do you want to happen if we have that align-down to reduce size?
> IOW, what should be the state after that loop in the caller?

We fill up the bio to bi_max_vecs. If there are more pages than vectors, then
the bio is submitted as-is with the partially drained iov_iter. The remainder
of the iov is left for a subsequent bio to deal with.

The ALIGN_DOWN is required because I've replaced the artificial kernel alignment
with the underlying hardware's alignment. The hardware's alignment is usually
smaller than a block size. If the last bvec has a non-block aligned offset,
then it has to be truncated down, which could result in a 0 size when bi_vcnt
is already non-zero. If that happens, we just submit the bio as-is, and
allocate a new one to deal with the rest of the iov.
Al Viro July 1, 2022, 7:43 p.m. UTC | #14
On Fri, Jul 01, 2022 at 01:28:13PM -0600, Keith Busch wrote:
> On Fri, Jul 01, 2022 at 08:08:37PM +0100, Al Viro wrote:
> > On Fri, Jul 01, 2022 at 12:38:36PM -0600, Keith Busch wrote:
> > > >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > > > -	if (size > 0)
> > > > -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > > >  	if (unlikely(size <= 0))
> > > >  		return size ? size : -EFAULT;
> > > > +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> > > >  
> > > > +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > > 
> > > We still need to return EFAULT if size becomes 0 because that's the only way
> > > bio_iov_iter_get_pages()'s loop will break out in this condition.
> > 
> > I really don't like these calling conventions ;-/
> 
> No argument here; I'm just working in the space as I found it. :)
>  
> > What do you want to happen if we have that align-down to reduce size?
> > IOW, what should be the state after that loop in the caller?
> 
> We fill up the bio to bi_max_vecs. If there are more pages than vectors, then
> the bio is submitted as-is with the partially drained iov_iter. The remainder
> of the iov is left for a subsequent bio to deal with.
> 
> The ALIGN_DOWN is required because I've replaced the artificial kernel alignment
> with the underlying hardware's alignment. The hardware's alignment is usually
> smaller than a block size. If the last bvec has a non-block aligned offset,
> then it has to be truncated down, which could result in a 0 size when bi_vcnt
> is already non-zero. If that happens, we just submit the bio as-is, and
> allocate a new one to deal with the rest of the iov.

Wait a sec.  Looks like you are dealing with the round-down in the wrong place -
it applies to the *total* you've packed into the bio, no matter how it is
distributed over the segments you've gathered it from.  Looks like it would
be more natural to handle it after the loop in the caller, would it not?

I.e.
	while bio is not full
		grab pages
		if got nothing
			break
		pack pages into bio
		if can't add a page (bio_add_hw_page() failure)
			drop the ones not shoved there
			break
	see how much had we packed into the sucker
	if not a multiple of logical block size
		trim the bio, dropping what needs to be dropped
		iov_iter_revert()
	if nothing's packed
		return -EINVAL if it was a failed bio_add_hw_page()
		return -EFAULT otherwise
	return 0
Keith Busch July 1, 2022, 7:56 p.m. UTC | #15
On Fri, Jul 01, 2022 at 08:43:32PM +0100, Al Viro wrote:
> On Fri, Jul 01, 2022 at 01:28:13PM -0600, Keith Busch wrote:
> > On Fri, Jul 01, 2022 at 08:08:37PM +0100, Al Viro wrote:
> > > On Fri, Jul 01, 2022 at 12:38:36PM -0600, Keith Busch wrote:
> > > > >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > > > > -	if (size > 0)
> > > > > -		size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > > > >  	if (unlikely(size <= 0))
> > > > >  		return size ? size : -EFAULT;
> > > > > +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> > > > >  
> > > > > +	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
> > > > 
> > > > We still need to return EFAULT if size becomes 0 because that's the only way
> > > > bio_iov_iter_get_pages()'s loop will break out in this condition.
> > > 
> > > I really don't like these calling conventions ;-/
> > 
> > No argument here; I'm just working in the space as I found it. :)
> >  
> > > What do you want to happen if we have that align-down to reduce size?
> > > IOW, what should be the state after that loop in the caller?
> > 
> > We fill up the bio to bi_max_vecs. If there are more pages than vectors, then
> > the bio is submitted as-is with the partially drained iov_iter. The remainder
> > of the iov is left for a subsequent bio to deal with.
> > 
> > The ALIGN_DOWN is required because I've replaced the artificial kernel alignment
> > with the underlying hardware's alignment. The hardware's alignment is usually
> > smaller than a block size. If the last bvec has a non-block aligned offset,
> > then it has to be truncated down, which could result in a 0 size when bi_vcnt
> > is already non-zero. If that happens, we just submit the bio as-is, and
> > allocate a new one to deal with the rest of the iov.
> 
> Wait a sec.  Looks like you are dealing with the round-down in the wrong place -
> it applies to the *total* you've packed into the bio, no matter how it is
> distributed over the segments you've gathered it from.  Looks like it would
> be more natural to handle it after the loop in the caller, would it not?
> 
> I.e.
> 	while bio is not full
> 		grab pages
> 		if got nothing
> 			break
> 		pack pages into bio
> 		if can't add a page (bio_add_hw_page() failure)
> 			drop the ones not shoved there
> 			break
> 	see how much had we packed into the sucker
> 	if not a multiple of logical block size
> 		trim the bio, dropping what needs to be dropped
> 		iov_iter_revert()
> 	if nothing's packed
> 		return -EINVAL if it was a failed bio_add_hw_page()
> 		return -EFAULT otherwise
> 	return 0

Validating user requests gets really messy if we allow arbitrary segment
lengths. This particular patch just enables arbitrary address alignment, but
segment size is still required to be a block size. You found the commit that
enforces that earlier, "iov: introduce iov_iter_aligned", two commits prior.

The rest of the logic simplifies when we are guaranteed segment size is a block
size multiple: truncating a segment at a block boundary ensures both sides are
block size aligned, and we don't even need to consult lower level queue limits,
like segment count or segment length. The bio is valid after this step, and can
be split into valid bios later if needed.
Jens Axboe July 1, 2022, 9:30 p.m. UTC | #16
On 7/1/22 11:40 AM, Al Viro wrote:
> On Thu, Jun 30, 2022 at 08:07:24PM -0600, Keith Busch wrote:
>> On Thu, Jun 30, 2022 at 11:39:36PM +0100, Al Viro wrote:
>>> On Thu, Jun 30, 2022 at 11:11:27PM +0100, Al Viro wrote:
>>>
>>>> ... and the first half of that thing conflicts with "block: relax direct
>>>> io memory alignment" in -next...
>>>
>>> BTW, looking at that commit - are you sure that bio_put_pages() on failure
>>> exit will do the right thing?  We have grabbed a bunch of page references;
>>> the amount is DIV_ROUND_UP(offset + size, PAGE_SIZE).  And that's before
>>> your
>>>                 size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
>>
>> Thanks for the catch, it does look like a page reference could get leaked here.
>>
>>> in there.  IMO the following would be more obviously correct:
>>>         size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
>>>         if (unlikely(size <= 0))
>>>                 return size ? size : -EFAULT;
>>>
>>> 	nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE);
>>> 	size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
>>>
>>>         for (left = size, i = 0; left > 0; left -= len, i++) {
>>> ...
>>>                 if (ret) {
>>> 			while (i < nr_pages)
>>> 				put_page(pages[i++]);
>>>                         return ret;
>>>                 }
>>> ...
>>>
>>> and get rid of bio_put_pages() entirely.  Objections?
>>
>>
>> I think that makes sense. I'll give your idea a test run tomorrow.
> 
> See vfs.git#block-fixes, along with #work.iov_iter_get_pages-3 in there.
> Seems to work here...
> 
> If you are OK with #block-fixes (it's one commit on top of
> bf8d08532bc1 "iomap: add support for dma aligned direct-io" in
> block.git), the easiest way to deal with the conflicts would be
> to have that branch pulled into block.git.  Jens, would you be
> OK with that in terms of tree topology?  Provided that patch
> itself looks sane to you, of course...

I'm fine with that approach. Don't have too much time to look into this
stuff just yet, but looks like you and Keith are getting it sorted out.
I'll check in on emails later this weekend and we can get it pulled in
at that point when you guys deem it ready to check/pull.
Al Viro July 2, 2022, 5:35 a.m. UTC | #17
On Fri, Jul 01, 2022 at 01:56:53PM -0600, Keith Busch wrote:

> Validating user requests gets really messy if we allow arbitrary segment
> lengths. This particular patch just enables arbitrary address alignment, but
> segment size is still required to be a block size. You found the commit that
> enforces that earlier, "iov: introduce iov_iter_aligned", two commits prior.

BTW, where do you check it for this caller?
	fs/zonefs/super.c:786:  ret = bio_iov_iter_get_pages(bio, from);
Incidentally, we have an incorrect use of iov_iter_truncate() in that one (compare
with iomap case, where we reexpand it afterwards)...

I still don't get the logic of those round-downs.  You've *already* verified
that each segment is a multiple of logical block size.  And you are stuffing
as much as you can into bio, covering the data for as many segments as you
can.  Sure, you might end up e.g. running into an unmapped page at wrong
offset (since your requirements for initial offsets might be milder than
logical block size).  Or you might run out of pages bio would take.  Either
might put the end of bio at the wrong offset.

So why not trim it down *after* you are done adding pages into it?  And do it
once, outside of the loop.  IDGI...  Validation is already done; I'm not
suggesting to allow weird segment lengths or to change behaviour of your
iov_iter_is_aligned() in any other way.

Put it another way, is there any possibility for __bio_iov_iter_get_pages() to
do a non-trivial round-down on anything other than the last iteration of that
loop?
Keith Busch July 2, 2022, 9:02 p.m. UTC | #18
On Sat, Jul 02, 2022 at 06:35:58AM +0100, Al Viro wrote:
> On Fri, Jul 01, 2022 at 01:56:53PM -0600, Keith Busch wrote:
> 
> > Validating user requests gets really messy if we allow arbitrary segment
> > lengths. This particular patch just enables arbitrary address alignment, but
> > segment size is still required to be a block size. You found the commit that
> > enforces that earlier, "iov: introduce iov_iter_aligned", two commits prior.
> 
> BTW, where do you check it for this caller?
> 	fs/zonefs/super.c:786:  ret = bio_iov_iter_get_pages(bio, from);
> Incidentally, we have an incorrect use of iov_iter_truncate() in that one (compare
> with iomap case, where we reexpand it afterwards)...
> 
> I still don't get the logic of those round-downs.  You've *already* verified
> that each segment is a multiple of logical block size.  And you are stuffing
> as much as you can into bio, covering the data for as many segments as you
> can.  Sure, you might end up e.g. running into an unmapped page at wrong
> offset (since your requirements for initial offsets might be milder than
> logical block size).  Or you might run out of pages bio would take.  Either
> might put the end of bio at the wrong offset.

It is strange this function allows the possibility that bio_iov_add_page() can
fail. There's no reason to grab more pages than would exceed the bio_full() condition
(ignoring the special ZONE_APPEND case for the moment).

I think it will make more sense if we clean that part up first so the size for
all successfully gotten pages can skip subsequent bio add page checks, and make
the error handling unnecessary.
 
> So why not trim it down *after* you are done adding pages into it?  And do it
> once, outside of the loop.  IDGI...  Validation is already done; I'm not
> suggesting to allow weird segment lengths or to change behaviour of your
> iov_iter_is_aligned() in any other way.

I may have misunderstood your previous suggestion, but I still think this is
the right way to go. The ALIGN_DOWN in its current location ensures the size
we're appending to the bio is acceptable before we even start. It's easier to
prevent adding pages to a bio IO than to back them out later. The latter would
need something like a special cased version of bio_truncate().

Anyway, I have some changes testing right now that I think will fix up the
issues you've raised, and make the rest a bit more clear. I'll send them for
consideration this weekend if all is successful.

> Put it another way, is there any possibility for __bio_iov_iter_get_pages() to
> do a non-trivial round-down on anything other than the last iteration of that
> loop?
Sedat Dilek July 10, 2022, 6:04 p.m. UTC | #19
On Wed, Jun 22, 2022 at 6:56 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> ... doing revert if we end up not using some pages
>

With the version from latest vfs.git#for-next as from [1] (which
differs from this one) I see with LLVM-14:

5618:  clang -Wp,-MMD,block/.bio.o.d -nostdinc -I./arch/x86/include
-I./arch/x86/include/generated  -I./include -I./arch/x86/include/uapi
-I./arch/x86/include/generate
d/uapi -I./include/uapi -I./include/generated/uapi -include
./include/linux/compiler-version.h -include ./include/linux/kconfig.h
-include ./include/linux/compiler_typ
es.h -D__KERNEL__ -Qunused-arguments -fmacro-prefix-map=./= -Wall
-Wundef -Werror=strict-prototypes -Wno-trigraphs -fno-strict-aliasing
-fno-common -fshort-wchar -fno-
PIE -Werror=implicit-function-declaration -Werror=implicit-int
-Werror=return-type -Wno-format-security -std=gnu11
--target=x86_64-linux-gnu -fintegrated-as -Werror=un
known-warning-option -Werror=ignored-optimization-argument -mno-sse
-mno-mmx -mno-sse2 -mno-3dnow -mno-avx -fcf-protection=none -m64
-falign-loops=1 -mno-80387 -mno-fp
-ret-in-387 -mstack-alignment=8 -mskip-rax-setup -mtune=generic
-mno-red-zone -mcmodel=kernel -Wno-sign-compare
-fno-asynchronous-unwind-tables -mretpoline-external-th
unk -fno-delete-null-pointer-checks -Wno-frame-address
-Wno-address-of-packed-member -O2 -Wframe-larger-than=2048
-fstack-protector-strong -Wimplicit-fallthrough -Wno-
gnu -Wno-unused-but-set-variable -Wno-unused-const-variable
-fno-stack-clash-protection -pg -mfentry -DCC_USING_NOP_MCOUNT
-DCC_USING_FENTRY -fno-lto -flto=thin -fspli
t-lto-unit -fvisibility=hidden -Wdeclaration-after-statement -Wvla
-Wno-pointer-sign -Wcast-function-type -fno-strict-overflow
-fno-stack-check -Werror=date-time -Werr
or=incompatible-pointer-types -Wno-initializer-overrides -Wno-format
-Wno-sign-compare -Wno-format-zero-length -Wno-pointer-to-enum-cast
-Wno-tautological-constant-out
-of-range-compare -Wno-unaligned-access -g -gdwarf-5
-DKBUILD_MODFILE='"block/bio"' -DKBUILD_BASENAME='"bio"'
-DKBUILD_MODNAME='"bio"' -D__KBUILD_MODNAME=kmod_bio -
c -o block/bio.o block/bio.c
[ ... ]
5635:block/bio.c:1232:6: warning: variable 'i' is used uninitialized
whenever 'if' condition is true [-Wsometimes-uninitialized]
5636-        if (unlikely(!size)) {
5637-            ^~~~~~~~~~~~~~~
5638-./include/linux/compiler.h:78:22: note: expanded from macro 'unlikely'
5639-# define unlikely(x)    __builtin_expect(!!(x), 0)
5640-                        ^~~~~~~~~~~~~~~~~~~~~~~~~~
5641-block/bio.c:1254:9: note: uninitialized use occurs here
5642-        while (i < nr_pages)
5643-               ^
5644-block/bio.c:1232:2: note: remove the 'if' if its condition is always false
5645-        if (unlikely(!size)) {
5646-        ^~~~~~~~~~~~~~~~~~~~~~
5647-block/bio.c:1202:17: note: initialize the variable 'i' to silence
this warning
5648-        unsigned len, i;
5649-                       ^
5650-                        = 0

Patch from [1] is attached.

Regards,
-Sedat-

[1] https://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git/commit/block/bio.c?h=for-next&id=9a6469060316674230c0666c5706f7097e9278bb

> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
>  block/bio.c     | 15 ++++++---------
>  block/blk-map.c |  7 ++++---
>  2 files changed, 10 insertions(+), 12 deletions(-)
>
> diff --git a/block/bio.c b/block/bio.c
> index 51c99f2c5c90..01ab683e67be 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1190,7 +1190,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>         BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
>         pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
>
> -       size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> +       size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset);
>         if (unlikely(size <= 0))
>                 return size ? size : -EFAULT;
>
> @@ -1205,6 +1205,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>                 } else {
>                         if (WARN_ON_ONCE(bio_full(bio, len))) {
>                                 bio_put_pages(pages + i, left, offset);
> +                               iov_iter_revert(iter, left);
>                                 return -EINVAL;
>                         }
>                         __bio_add_page(bio, page, len, offset);
> @@ -1212,7 +1213,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>                 offset = 0;
>         }
>
> -       iov_iter_advance(iter, size);
>         return 0;
>  }
>
> @@ -1227,7 +1227,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
>         ssize_t size, left;
>         unsigned len, i;
>         size_t offset;
> -       int ret = 0;
>
>         if (WARN_ON_ONCE(!max_append_sectors))
>                 return 0;
> @@ -1240,7 +1239,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
>         BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
>         pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
>
> -       size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> +       size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset);
>         if (unlikely(size <= 0))
>                 return size ? size : -EFAULT;
>
> @@ -1252,16 +1251,14 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
>                 if (bio_add_hw_page(q, bio, page, len, offset,
>                                 max_append_sectors, &same_page) != len) {
>                         bio_put_pages(pages + i, left, offset);
> -                       ret = -EINVAL;
> -                       break;
> +                       iov_iter_revert(iter, left);
> +                       return -EINVAL;
>                 }
>                 if (same_page)
>                         put_page(page);
>                 offset = 0;
>         }
> -
> -       iov_iter_advance(iter, size - left);
> -       return ret;
> +       return 0;
>  }
>
>  /**
> diff --git a/block/blk-map.c b/block/blk-map.c
> index df8b066cd548..7196a6b64c80 100644
> --- a/block/blk-map.c
> +++ b/block/blk-map.c
> @@ -254,7 +254,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
>                 size_t offs, added = 0;
>                 int npages;
>
> -               bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
> +               bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs);
>                 if (unlikely(bytes <= 0)) {
>                         ret = bytes ? bytes : -EFAULT;
>                         goto out_unmap;
> @@ -284,7 +284,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
>                                 bytes -= n;
>                                 offs = 0;
>                         }
> -                       iov_iter_advance(iter, added);
>                 }
>                 /*
>                  * release the pages we didn't map into the bio, if any
> @@ -293,8 +292,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
>                         put_page(pages[j++]);
>                 kvfree(pages);
>                 /* couldn't stuff something into bio? */
> -               if (bytes)
> +               if (bytes) {
> +                       iov_iter_revert(iter, bytes);
>                         break;
> +               }
>         }
>
>         ret = blk_rq_append_bio(rq, bio);
> --
> 2.30.2
>
diff mbox series

Patch

diff --git a/block/bio.c b/block/bio.c
index 51c99f2c5c90..01ab683e67be 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1190,7 +1190,7 @@  static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset);
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
 
@@ -1205,6 +1205,7 @@  static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		} else {
 			if (WARN_ON_ONCE(bio_full(bio, len))) {
 				bio_put_pages(pages + i, left, offset);
+				iov_iter_revert(iter, left);
 				return -EINVAL;
 			}
 			__bio_add_page(bio, page, len, offset);
@@ -1212,7 +1213,6 @@  static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		offset = 0;
 	}
 
-	iov_iter_advance(iter, size);
 	return 0;
 }
 
@@ -1227,7 +1227,6 @@  static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
 	ssize_t size, left;
 	unsigned len, i;
 	size_t offset;
-	int ret = 0;
 
 	if (WARN_ON_ONCE(!max_append_sectors))
 		return 0;
@@ -1240,7 +1239,7 @@  static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
 	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset);
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
 
@@ -1252,16 +1251,14 @@  static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
 		if (bio_add_hw_page(q, bio, page, len, offset,
 				max_append_sectors, &same_page) != len) {
 			bio_put_pages(pages + i, left, offset);
-			ret = -EINVAL;
-			break;
+			iov_iter_revert(iter, left);
+			return -EINVAL;
 		}
 		if (same_page)
 			put_page(page);
 		offset = 0;
 	}
-
-	iov_iter_advance(iter, size - left);
-	return ret;
+	return 0;
 }
 
 /**
diff --git a/block/blk-map.c b/block/blk-map.c
index df8b066cd548..7196a6b64c80 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -254,7 +254,7 @@  static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 		size_t offs, added = 0;
 		int npages;
 
-		bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
+		bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs);
 		if (unlikely(bytes <= 0)) {
 			ret = bytes ? bytes : -EFAULT;
 			goto out_unmap;
@@ -284,7 +284,6 @@  static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 				bytes -= n;
 				offs = 0;
 			}
-			iov_iter_advance(iter, added);
 		}
 		/*
 		 * release the pages we didn't map into the bio, if any
@@ -293,8 +292,10 @@  static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 			put_page(pages[j++]);
 		kvfree(pages);
 		/* couldn't stuff something into bio? */
-		if (bytes)
+		if (bytes) {
+			iov_iter_revert(iter, bytes);
 			break;
+		}
 	}
 
 	ret = blk_rq_append_bio(rq, bio);