Message ID | 20220622041552.737754-37-viro@zeniv.linux.org.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/44] 9p: handling Rerror without copy_from_iter_full() | expand |
On Wed, 2022-06-22 at 05:15 +0100, Al Viro wrote: > ... doing revert if we end up not using some pages > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> > --- > block/bio.c | 15 ++++++--------- > block/blk-map.c | 7 ++++--- > 2 files changed, 10 insertions(+), 12 deletions(-) > > diff --git a/block/bio.c b/block/bio.c > index 51c99f2c5c90..01ab683e67be 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -1190,7 +1190,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); > pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); > > - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > + size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset); > if (unlikely(size <= 0)) > return size ? size : -EFAULT; > > @@ -1205,6 +1205,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > } else { > if (WARN_ON_ONCE(bio_full(bio, len))) { > bio_put_pages(pages + i, left, offset); > + iov_iter_revert(iter, left); > return -EINVAL; > } > __bio_add_page(bio, page, len, offset); > @@ -1212,7 +1213,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > offset = 0; > } > > - iov_iter_advance(iter, size); > return 0; > } > > @@ -1227,7 +1227,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) > ssize_t size, left; > unsigned len, i; > size_t offset; > - int ret = 0; > > if (WARN_ON_ONCE(!max_append_sectors)) > return 0; > @@ -1240,7 +1239,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) > BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); > pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); > > - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > + size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset); > if (unlikely(size <= 0)) > return size ? 
size : -EFAULT; > > @@ -1252,16 +1251,14 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) > if (bio_add_hw_page(q, bio, page, len, offset, > max_append_sectors, &same_page) != len) { > bio_put_pages(pages + i, left, offset); > - ret = -EINVAL; > - break; > + iov_iter_revert(iter, left); > + return -EINVAL; > } > if (same_page) > put_page(page); > offset = 0; > } > - > - iov_iter_advance(iter, size - left); > - return ret; > + return 0; > } > > /** > diff --git a/block/blk-map.c b/block/blk-map.c > index df8b066cd548..7196a6b64c80 100644 > --- a/block/blk-map.c > +++ b/block/blk-map.c > @@ -254,7 +254,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, > size_t offs, added = 0; > int npages; > > - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); > + bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); > if (unlikely(bytes <= 0)) { > ret = bytes ? bytes : -EFAULT; > goto out_unmap; > @@ -284,7 +284,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, > bytes -= n; > offs = 0; > } > - iov_iter_advance(iter, added); > } > /* > * release the pages we didn't map into the bio, if any > @@ -293,8 +292,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, > put_page(pages[j++]); > kvfree(pages); > /* couldn't stuff something into bio? */ > - if (bytes) > + if (bytes) { > + iov_iter_revert(iter, bytes); > break; > + } > } > > ret = blk_rq_append_bio(rq, bio); Reviewed-by: Jeff Layton <jlayton@kernel.org>
On Wed, Jun 22, 2022 at 05:15:45AM +0100, Al Viro wrote: > ... doing revert if we end up not using some pages > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> ... and the first half of that thing conflicts with "block: relax direct io memory alignment" in -next... Joy. It's not hard to redo on top of the commit in there; the question is, how to deal with conflicts? I can do a backmerge, provided that there's a sane tag or branch to backmerge from. Another fun (if trivial) issue in the same series is around "iov: introduce iov_iter_aligned" (two commits prior). Jens, Keith, do you have any suggestions? AFAICS, variants include * tag or branch covering b1a000d3b8ec582da64bb644be633e5a0beffcbf (I'd rather not grab the entire for-5.20/block for obvious reasons) It sits in the beginning of for-5.20/block, so that should be fairly straightforward, provided that you are not going to do rebases there. If you are, could you put that stuff into an invariant branch, so I'd just pull it? * feeding the entire iov_iter pile through block.git; bad idea, IMO, seeing that it contains a lot of stuff far from anything block-related. * doing a manual conflict resolution on top of my branch and pushing that out. Would get rid of the problem from -next, but Linus hates that kind of stuff, AFAIK, and with good reasons. I would prefer the first variant (and that's what I'm going to do locally for now - just git tag keith_stuff bf8d08532bc19a14cfb54ae61099dccadefca446 and backmerge from it), but if you would prefer to deal with that differently - please tell.
On Thu, Jun 30, 2022 at 11:11:27PM +0100, Al Viro wrote: > ... and the first half of that thing conflicts with "block: relax direct > io memory alignment" in -next... BTW, looking at that commit - are you sure that bio_put_pages() on failure exit will do the right thing? We have grabbed a bunch of page references; the amount is DIV_ROUND_UP(offset + size, PAGE_SIZE). And that's before your size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); in there. IMO the following would be more obviously correct: size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE); size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); for (left = size, i = 0; left > 0; left -= len, i++) { ... if (ret) { while (i < nr_pages) put_page(pages[i++]); return ret; } ... and get rid of bio_put_pages() entirely. Objections?
On 6/30/22 4:11 PM, Al Viro wrote: > On Wed, Jun 22, 2022 at 05:15:45AM +0100, Al Viro wrote: >> ... doing revert if we end up not using some pages >> >> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> > > ... and the first half of that thing conflicts with "block: relax direct > io memory alignment" in -next... > > Joy. It's not hard to redo on top of the commit in there; the > question is, how to deal with conflicts? > > I can do a backmerge, provided that there's a sane tag or branch to > backmerge from. Another fun (if trivial) issue in the same series > is around "iov: introduce iov_iter_aligned" (two commits prior). > > Jens, Keith, do you have any suggestions? AFAICS, variants include > * tag or branch covering b1a000d3b8ec582da64bb644be633e5a0beffcbf > (I'd rather not grab the entire for-5.20/block for obvious reasons) > It sits in the beginning of for-5.20/block, so that should be fairly > straightforward, provided that you are not going to do rebases there. > If you are, could you put that stuff into an invariant branch, so > I'd just pull it? > * feeding the entire iov_iter pile through block.git; > bad idea, IMO, seeing that it contains a lot of stuff far from > anything block-related. > * doing a manual conflict resolution on top of my branch > and pushing that out. Would get rid of the problem from -next, but > Linus hates that kind of stuff, AFAIK, and with good reasons. > > I would prefer the first variant (and that's what I'm > going to do locally for now - just > git tag keith_stuff bf8d08532bc19a14cfb54ae61099dccadefca446 > and backmerge from it), but if you would prefer to deal with that > differently - please tell. I'm not going to rebase it, and I can create a tag for that commit for you. Done, it's block-5.20-al. I did the former commit, or we can move the tag so it includes bf8d08532bc19a14cfb54ae61099dccadefca446? That'd be the whole series of that patchset, which is just that one extra patch.
On Thu, Jun 30, 2022 at 11:39:36PM +0100, Al Viro wrote: > On Thu, Jun 30, 2022 at 11:11:27PM +0100, Al Viro wrote: > > > ... and the first half of that thing conflicts with "block: relax direct > > io memory alignment" in -next... > > BTW, looking at that commit - are you sure that bio_put_pages() on failure > exit will do the right thing? We have grabbed a bunch of page references; > the amount if DIV_ROUND_UP(offset + size, PAGE_SIZE). And that's before > your > size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); Thanks for the catch, it does look like a page reference could get leaked here. > in there. IMO the following would be more obviously correct: > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > if (unlikely(size <= 0)) > return size ? size : -EFAULT; > > nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE); > size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > for (left = size, i = 0; left > 0; left -= len, i++) { > ... > if (ret) { > while (i < nr_pages) > put_page(pages[i++]); > return ret; > } > ... > > and get rid of bio_put_pages() entirely. Objections? I think that makes sense. I'll give your idea a test run tomorrow.
On Thu, Jun 30, 2022 at 08:07:24PM -0600, Keith Busch wrote: > On Thu, Jun 30, 2022 at 11:39:36PM +0100, Al Viro wrote: > > On Thu, Jun 30, 2022 at 11:11:27PM +0100, Al Viro wrote: > > > > > ... and the first half of that thing conflicts with "block: relax direct > > > io memory alignment" in -next... > > > > BTW, looking at that commit - are you sure that bio_put_pages() on failure > > exit will do the right thing? We have grabbed a bunch of page references; > > the amount if DIV_ROUND_UP(offset + size, PAGE_SIZE). And that's before > > your > > size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > Thanks for the catch, it does look like a page reference could get leaked here. > > > in there. IMO the following would be more obviously correct: > > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > > if (unlikely(size <= 0)) > > return size ? size : -EFAULT; > > > > nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE); > > size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > > for (left = size, i = 0; left > 0; left -= len, i++) { > > ... > > if (ret) { > > while (i < nr_pages) > > put_page(pages[i++]); > > return ret; > > } > > ... > > > > and get rid of bio_put_pages() entirely. Objections? > > > I think that makes sense. I'll give your idea a test run tomorrow. See vfs.git#block-fixes, along with #work.iov_iter_get_pages-3 in there. Seems to work here... If you are OK with #block-fixes (it's one commit on top of bf8d08532bc1 "iomap: add support for dma aligned direct-io" in block.git), the easiest way to deal with the conflicts would be to have that branch pulled into block.git. Jens, would you be OK with that in terms of tree topology? Provided that patch itself looks sane to you, of course... 
FWIW, the patch in question is commit 863965bb7e52997851af3a107ec3e4d8c7050cbd Author: Al Viro <viro@zeniv.linux.org.uk> Date: Fri Jul 1 13:15:36 2022 -0400 __bio_iov_iter_get_pages(): make sure we don't leak page refs on failure Calculate the number of pages we'd grabbed before trimming size down. And don't bother with bio_put_pages() - an explicit cleanup loop is easier to follow... Fixes: b1a000d3b8ec "block: relax direct io memory alignment" Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> diff --git a/block/bio.c b/block/bio.c index 933ea3210954..59be4eca1192 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1151,14 +1151,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static void bio_put_pages(struct page **pages, size_t size, size_t off) -{ - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); - - for (i = 0; i < nr; i++) - put_page(pages[i]); -} - static int bio_iov_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * the iov data will be picked up in the next bio iteration. */ size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); - if (size > 0) - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); if (unlikely(size <= 0)) return size ? size : -EFAULT; + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); for (left = size, i = 0; left > 0; left -= len, i++) { struct page *page = pages[i]; int ret; @@ -1245,7 +1237,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = bio_iov_add_page(bio, page, len, offset); if (ret) { - bio_put_pages(pages + i, left, offset); + while (i < nr_pages) + put_page(pages[i++]); return ret; } offset = 0;
On Fri, Jul 01, 2022 at 06:40:40PM +0100, Al Viro wrote: > -static void bio_put_pages(struct page **pages, size_t size, size_t off) > -{ > - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); > - > - for (i = 0; i < nr; i++) > - put_page(pages[i]); > -} > - > static int bio_iov_add_page(struct bio *bio, struct page *page, > unsigned int len, unsigned int offset) > { > @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > * the iov data will be picked up in the next bio iteration. > */ > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > - if (size > 0) > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > if (unlikely(size <= 0)) > return size ? size : -EFAULT; > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); This isn't quite right. The result of the ALIGN_DOWN could be 0, so whatever page we got before would be leaked since unused pages are only released on an add_page error. 
I was about to reply with a patch that fixes this, but here's the one that I'm currently testing: --- diff --git a/block/bio.c b/block/bio.c index 933ea3210954..c4a1ce39c65c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1151,14 +1151,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static void bio_put_pages(struct page **pages, size_t size, size_t off) -{ - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); - - for (i = 0; i < nr; i++) - put_page(pages[i]); -} - static int bio_iov_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { @@ -1208,9 +1200,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; + unsigned len, i = 0; ssize_t size, left; - unsigned len, i; size_t offset; + int ret; /* * Move page array up in the allocated memory for the bio vecs as far as @@ -1228,14 +1221,19 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * the iov data will be picked up in the next bio iteration. */ size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); - if (size > 0) + if (size > 0) { + nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE); size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); - if (unlikely(size <= 0)) - return size ? size : -EFAULT; + } else + nr_pages = 0; + + if (unlikely(size <= 0)) { + ret = size ? 
size : -EFAULT; + goto out; + } - for (left = size, i = 0; left > 0; left -= len, i++) { + for (left = size; left > 0; left -= len, i++) { struct page *page = pages[i]; - int ret; len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_op(bio) == REQ_OP_ZONE_APPEND) @@ -1244,15 +1242,19 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) else ret = bio_iov_add_page(bio, page, len, offset); - if (ret) { - bio_put_pages(pages + i, left, offset); - return ret; - } + if (ret) + goto out; offset = 0; } iov_iter_advance(iter, size); - return 0; +out: + while (i < nr_pages) { + put_page(pages[i]); + i++; + } + + return ret; } /** --
On Fri, Jul 01, 2022 at 11:53:44AM -0600, Keith Busch wrote: > On Fri, Jul 01, 2022 at 06:40:40PM +0100, Al Viro wrote: > > -static void bio_put_pages(struct page **pages, size_t size, size_t off) > > -{ > > - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); > > - > > - for (i = 0; i < nr; i++) > > - put_page(pages[i]); > > -} > > - > > static int bio_iov_add_page(struct bio *bio, struct page *page, > > unsigned int len, unsigned int offset) > > { > > @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > > * the iov data will be picked up in the next bio iteration. > > */ > > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > > - if (size > 0) > > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > if (unlikely(size <= 0)) > > return size ? size : -EFAULT; > > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > This isn't quite right. The result of the ALIGN_DOWN could be 0, so whatever > page we got before would be leaked since unused pages are only released on an > add_page error. I was about to reply with a patch that fixes this, but here's > the one that I'm currently testing: AFAICS, result is broken; you might end up consuming some data and leaving iterator not advanced at all. With no way for the caller to tell which way it went.
On Fri, Jul 01, 2022 at 07:07:45PM +0100, Al Viro wrote: > On Fri, Jul 01, 2022 at 11:53:44AM -0600, Keith Busch wrote: > > On Fri, Jul 01, 2022 at 06:40:40PM +0100, Al Viro wrote: > > > -static void bio_put_pages(struct page **pages, size_t size, size_t off) > > > -{ > > > - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); > > > - > > > - for (i = 0; i < nr; i++) > > > - put_page(pages[i]); > > > -} > > > - > > > static int bio_iov_add_page(struct bio *bio, struct page *page, > > > unsigned int len, unsigned int offset) > > > { > > > @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > > > * the iov data will be picked up in the next bio iteration. > > > */ > > > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > > > - if (size > 0) > > > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > if (unlikely(size <= 0)) > > > return size ? size : -EFAULT; > > > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > > > > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > > This isn't quite right. The result of the ALIGN_DOWN could be 0, so whatever > > page we got before would be leaked since unused pages are only released on an > > add_page error. I was about to reply with a patch that fixes this, but here's > > the one that I'm currently testing: > > AFAICS, result is broken; you might end up consuming some data and leaving > iterator not advanced at all. With no way for the caller to tell which way it > went. How about the following? commit 5e3e9769404de54734c110b2040bdb93593e0f1b Author: Al Viro <viro@zeniv.linux.org.uk> Date: Fri Jul 1 13:15:36 2022 -0400 __bio_iov_iter_get_pages(): make sure we don't leak page refs on failure Calculate the number of pages we'd grabbed before trimming size down. And don't bother with bio_put_pages() - an explicit cleanup loop is easier to follow... 
Fixes: b1a000d3b8ec "block: relax direct io memory alignment" Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> diff --git a/block/bio.c b/block/bio.c index 933ea3210954..a9fe20cb71fe 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1151,14 +1151,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static void bio_put_pages(struct page **pages, size_t size, size_t off) -{ - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); - - for (i = 0; i < nr; i++) - put_page(pages[i]); -} - static int bio_iov_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { @@ -1211,6 +1203,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ssize_t size, left; unsigned len, i; size_t offset; + int ret; /* * Move page array up in the allocated memory for the bio vecs as far as @@ -1228,14 +1221,13 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * the iov data will be picked up in the next bio iteration. */ size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); - if (size > 0) - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); if (unlikely(size <= 0)) return size ? size : -EFAULT; + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); for (left = size, i = 0; left > 0; left -= len, i++) { struct page *page = pages[i]; - int ret; len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_op(bio) == REQ_OP_ZONE_APPEND) @@ -1244,15 +1236,15 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) else ret = bio_iov_add_page(bio, page, len, offset); - if (ret) { - bio_put_pages(pages + i, left, offset); - return ret; - } + if (ret) + break; offset = 0; } + while (i < nr_pages) + put_page(pages[i++]); - iov_iter_advance(iter, size); - return 0; + iov_iter_advance(iter, size - left); + return ret; } /**
On Fri, Jul 01, 2022 at 07:12:17PM +0100, Al Viro wrote: > On Fri, Jul 01, 2022 at 07:07:45PM +0100, Al Viro wrote: > > > page we got before would be leaked since unused pages are only released on an > > > add_page error. I was about to reply with a patch that fixes this, but here's > > > the one that I'm currently testing: > > > > AFAICS, result is broken; you might end up consuming some data and leaving > > iterator not advanced at all. With no way for the caller to tell which way it > > went. I think I see what you mean, though the issue with a non-advancing iterator on a partially filled bio was happening prior to this patch. > How about the following? This looks close to what I was about to respond with. Just a couple issues below: > -static void bio_put_pages(struct page **pages, size_t size, size_t off) > -{ > - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); > - > - for (i = 0; i < nr; i++) > - put_page(pages[i]); > -} > - > static int bio_iov_add_page(struct bio *bio, struct page *page, > unsigned int len, unsigned int offset) > { > @@ -1211,6 +1203,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > ssize_t size, left; > unsigned len, i; > size_t offset; > + int ret; 'ret' might never be initialized if 'size' aligns down to 0. > /* > * Move page array up in the allocated memory for the bio vecs as far as > @@ -1228,14 +1221,13 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > * the iov data will be picked up in the next bio iteration. > */ > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > - if (size > 0) > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > if (unlikely(size <= 0)) > return size ? 
size : -EFAULT; > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); We still need to return EFAULT if size becomes 0 because that's the only way bio_iov_iter_get_pages()'s loop will break out in this condition.
On Fri, Jul 01, 2022 at 07:07:45PM +0100, Al Viro wrote: > On Fri, Jul 01, 2022 at 11:53:44AM -0600, Keith Busch wrote: > > On Fri, Jul 01, 2022 at 06:40:40PM +0100, Al Viro wrote: > > > -static void bio_put_pages(struct page **pages, size_t size, size_t off) > > > -{ > > > - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); > > > - > > > - for (i = 0; i < nr; i++) > > > - put_page(pages[i]); > > > -} > > > - > > > static int bio_iov_add_page(struct bio *bio, struct page *page, > > > unsigned int len, unsigned int offset) > > > { > > > @@ -1228,11 +1220,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > > > * the iov data will be picked up in the next bio iteration. > > > */ > > > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > > > - if (size > 0) > > > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > if (unlikely(size <= 0)) > > > return size ? size : -EFAULT; > > > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > > > > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > > This isn't quite right. The result of the ALIGN_DOWN could be 0, so whatever > > page we got before would be leaked since unused pages are only released on an > > add_page error. I was about to reply with a patch that fixes this, but here's > > the one that I'm currently testing: > > AFAICS, result is broken; you might end up consuming some data and leaving > iterator not advanced at all. With no way for the caller to tell which way it > went. Looks like the possibility of a non-advancing iterator goes all the way back to the below commit. commit 576ed9135489c723fb39b97c4e2c73428d06dd78 Author: Christoph Hellwig <hch@lst.de> Date: Thu Sep 20 08:28:21 2018 +0200 block: use bio_add_page in bio_iov_iter_get_pages I guess the error condition never occured, and nor should it if the bio's available vectors is accounted for correctly.
On Fri, Jul 01, 2022 at 12:38:36PM -0600, Keith Busch wrote: > On Fri, Jul 01, 2022 at 07:12:17PM +0100, Al Viro wrote: > > On Fri, Jul 01, 2022 at 07:07:45PM +0100, Al Viro wrote: > > > > page we got before would be leaked since unused pages are only released on an > > > > add_page error. I was about to reply with a patch that fixes this, but here's > > > > the one that I'm currently testing: > > > > > > AFAICS, result is broken; you might end up consuming some data and leaving > > > iterator not advanced at all. With no way for the caller to tell which way it > > > went. > > I think I see what you mean, though the issue with a non-advancing iterator on > a partially filled bio was happening prior to this patch. > > > How about the following? > > This looks close to what I was about to respond with. Just a couple issues > below: > > > -static void bio_put_pages(struct page **pages, size_t size, size_t off) > > -{ > > - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); > > - > > - for (i = 0; i < nr; i++) > > - put_page(pages[i]); > > -} > > - > > static int bio_iov_add_page(struct bio *bio, struct page *page, > > unsigned int len, unsigned int offset) > > { > > @@ -1211,6 +1203,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > > ssize_t size, left; > > unsigned len, i; > > size_t offset; > > + int ret; > > 'ret' might never be initialized if 'size' aligns down to 0. Point. > > /* > > * Move page array up in the allocated memory for the bio vecs as far as > > @@ -1228,14 +1221,13 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > > * the iov data will be picked up in the next bio iteration. > > */ > > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > > - if (size > 0) > > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > if (unlikely(size <= 0)) > > return size ? 
size : -EFAULT; > > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > We still need to return EFAULT if size becomes 0 because that's the only way > bio_iov_iter_get_pages()'s loop will break out in this condition. I really don't like these calling conventions ;-/ What do you want to happen if we have that align-down to reduce size? IOW, what should be the state after that loop in the caller?
On Fri, Jul 01, 2022 at 08:08:37PM +0100, Al Viro wrote: > On Fri, Jul 01, 2022 at 12:38:36PM -0600, Keith Busch wrote: > > > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > > > - if (size > 0) > > > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > if (unlikely(size <= 0)) > > > return size ? size : -EFAULT; > > > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > > > > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > > We still need to return EFAULT if size becomes 0 because that's the only way > > bio_iov_iter_get_pages()'s loop will break out in this condition. > > I really don't like these calling conventions ;-/ No argument here; I'm just working in the space as I found it. :) > What do you want to happen if we have that align-down to reduce size? > IOW, what should be the state after that loop in the caller? We fill up the bio to bi_max_vecs. If there are more pages than vectors, then the bio is submitted as-is with the partially drained iov_iter. The remainder of the iov is left for a subsequent bio to deal with. The ALIGN_DOWN is required because I've replaced the artificial kernel alignment with the underlying hardware's alignment. The hardware's alignment is usually smaller than a block size. If the last bvec has a non-block aligned offset, then it has to be truncated down, which could result in a 0 size when bi_vcnt is already non-zero. If that happens, we just submit the bio as-is, and allocate a new one to deal with the rest of the iov.
On Fri, Jul 01, 2022 at 01:28:13PM -0600, Keith Busch wrote: > On Fri, Jul 01, 2022 at 08:08:37PM +0100, Al Viro wrote: > > On Fri, Jul 01, 2022 at 12:38:36PM -0600, Keith Busch wrote: > > > > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > > > > - if (size > 0) > > > > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > > if (unlikely(size <= 0)) > > > > return size ? size : -EFAULT; > > > > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > > > > > > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > > > > We still need to return EFAULT if size becomes 0 because that's the only way > > > bio_iov_iter_get_pages()'s loop will break out in this condition. > > > > I really don't like these calling conventions ;-/ > > No argument here; I'm just working in the space as I found it. :) > > > What do you want to happen if we have that align-down to reduce size? > > IOW, what should be the state after that loop in the caller? > > We fill up the bio to bi_max_vecs. If there are more pages than vectors, then > the bio is submitted as-is with the partially drained iov_iter. The remainder > of the iov is left for a subsequent bio to deal with. > > The ALIGN_DOWN is required because I've replaced the artificial kernel aligment > with the underlying hardware's alignment. The hardware's alignment is usually > smaller than a block size. If the last bvec has a non-block aligned offset, > then it has to be truncated down, which could result in a 0 size when bi_vcnt > is already non-zero. If that happens, we just submit the bio as-is, and > allocate a new one to deal with the rest of the iov. Wait a sec. Looks like you are dealing with the round-down in the wrong place - it applies to the *total* you've packed into the bio, no matter how it is distributed over the segments you've gathered it from. Looks like it would be more natural to handle it after the loop in the caller, would it not? I.e. 
while bio is not full grab pages if got nothing break pack pages into bio if can't add a page (bio_add_hw_page() failure) drop the ones not shoved there break see how much had we packed into the sucker if not a multiple of logical block size trim the bio, dropping what needs to be dropped iov_iter_revert() if nothing's packed return -EINVAL if it was a failed bio_add_hw_page() return -EFAULT otherwise return 0
On Fri, Jul 01, 2022 at 08:43:32PM +0100, Al Viro wrote: > On Fri, Jul 01, 2022 at 01:28:13PM -0600, Keith Busch wrote: > > On Fri, Jul 01, 2022 at 08:08:37PM +0100, Al Viro wrote: > > > On Fri, Jul 01, 2022 at 12:38:36PM -0600, Keith Busch wrote: > > > > > size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > > > > > - if (size > 0) > > > > > - size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > > > if (unlikely(size <= 0)) > > > > > return size ? size : -EFAULT; > > > > > + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); > > > > > > > > > > + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); > > > > > > > > We still need to return EFAULT if size becomes 0 because that's the only way > > > > bio_iov_iter_get_pages()'s loop will break out in this condition. > > > > > > I really don't like these calling conventions ;-/ > > > > No argument here; I'm just working in the space as I found it. :) > > > > > What do you want to happen if we have that align-down to reduce size? > > > IOW, what should be the state after that loop in the caller? > > > > We fill up the bio to bi_max_vecs. If there are more pages than vectors, then > > the bio is submitted as-is with the partially drained iov_iter. The remainder > > of the iov is left for a subsequent bio to deal with. > > > > The ALIGN_DOWN is required because I've replaced the artificial kernel aligment > > with the underlying hardware's alignment. The hardware's alignment is usually > > smaller than a block size. If the last bvec has a non-block aligned offset, > > then it has to be truncated down, which could result in a 0 size when bi_vcnt > > is already non-zero. If that happens, we just submit the bio as-is, and > > allocate a new one to deal with the rest of the iov. > > Wait a sec. 
Looks like you are dealing with the round-down in the wrong place - > it applies to the *total* you've packed into the bio, no matter how it is > distributed over the segments you've gathered it from. Looks like it would > be more natural to handle it after the loop in the caller, would it not? > > I.e. > while bio is not full > grab pages > if got nothing > break > pack pages into bio > if can't add a page (bio_add_hw_page() failure) > drop the ones not shoved there > break > see how much had we packed into the sucker > if not a multiple of logical block size > trim the bio, dropping what needs to be dropped > iov_iter_revert() > if nothing's packed > return -EINVAL if it was a failed bio_add_hw_page() > return -EFAULT otherwise > return 0 Validating user requests gets really messy if we allow arbitrary segment lengths. This particular patch just enables arbitrary address alignment, but segment size is still required to be a block size. You found the commit that enforces that earlier, "iov: introduce iov_iter_aligned", two commits prior. The rest of the logic simplifies when we are guaranteed segment size is a block size multiple: truncating a segment at a block boundary ensures both sides are block size aligned, and we don't even need to consult lower level queue limits, like segment count or segment length. The bio is valid after this step, and can be split into valid bios later if needed.
On 7/1/22 11:40 AM, Al Viro wrote: > On Thu, Jun 30, 2022 at 08:07:24PM -0600, Keith Busch wrote: >> On Thu, Jun 30, 2022 at 11:39:36PM +0100, Al Viro wrote: >>> On Thu, Jun 30, 2022 at 11:11:27PM +0100, Al Viro wrote: >>> >>>> ... and the first half of that thing conflicts with "block: relax direct >>>> io memory alignment" in -next... >>> >>> BTW, looking at that commit - are you sure that bio_put_pages() on failure >>> exit will do the right thing? We have grabbed a bunch of page references; >>> the amount if DIV_ROUND_UP(offset + size, PAGE_SIZE). And that's before >>> your >>> size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); >> >> Thanks for the catch, it does look like a page reference could get leaked here. >> >>> in there. IMO the following would be more obviously correct: >>> size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); >>> if (unlikely(size <= 0)) >>> return size ? size : -EFAULT; >>> >>> nr_pages = DIV_ROUND_UP(size + offset, PAGE_SIZE); >>> size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); >>> >>> for (left = size, i = 0; left > 0; left -= len, i++) { >>> ... >>> if (ret) { >>> while (i < nr_pages) >>> put_page(pages[i++]); >>> return ret; >>> } >>> ... >>> >>> and get rid of bio_put_pages() entirely. Objections? >> >> >> I think that makes sense. I'll give your idea a test run tomorrow. > > See vfs.git#block-fixes, along with #work.iov_iter_get_pages-3 in there. > Seems to work here... > > If you are OK with #block-fixes (it's one commit on top of > bf8d08532bc1 "iomap: add support for dma aligned direct-io" in > block.git), the easiest way to deal with the conflicts would be > to have that branch pulled into block.git. Jens, would you be > OK with that in terms of tree topology? Provided that patch > itself looks sane to you, of course... I'm fine with that approach. Don't have too much time to look into this stuff just yet, but looks like you and Keith are getting it sorted out. 
I'll check in on emails later this weekend and we can get it pulled in at that point when you guys deem it ready to check/pull.
On Fri, Jul 01, 2022 at 01:56:53PM -0600, Keith Busch wrote: > Validating user requests gets really messy if we allow arbitrary segment > lengths. This particular patch just enables arbitrary address alignment, but > segment size is still required to be a block size. You found the commit that > enforces that earlier, "iov: introduce iov_iter_aligned", two commits prior. BTW, where do you check it for this caller? fs/zonefs/super.c:786: ret = bio_iov_iter_get_pages(bio, from); Incidentally, we have an incorrect use of iov_iter_truncate() in that one (compare with iomap case, where we reexpand it afterwards)... I still don't get the logics of those round-downs. You've *already* verified that each segment is a multiple of logical block size. And you are stuffing as much as you can into bio, covering the data for as many segments as you can. Sure, you might end up e.g. running into an unmapped page at wrong offset (since your requirements for initial offsets might be milder than logical block size). Or you might run out of pages bio would take. Either might put the end of bio at the wrong offset. So why not trim it down *after* you are done adding pages into it? And do it once, outside of the loop. IDGI... Validation is already done; I'm not suggesting to allow weird segment lengths or to change behaviour of your iov_iter_is_aligned() in any other way. Put it another way, is there any possibility for __bio_iov_iter_get_pages() to do a non-trivial round-down on anything other than the last iteration of that loop?
On Sat, Jul 02, 2022 at 06:35:58AM +0100, Al Viro wrote: > On Fri, Jul 01, 2022 at 01:56:53PM -0600, Keith Busch wrote: > > > Validating user requests gets really messy if we allow arbitrary segment > > lengths. This particular patch just enables arbitrary address alignment, but > > segment size is still required to be a block size. You found the commit that > > enforces that earlier, "iov: introduce iov_iter_aligned", two commits prior. > > BTW, where do you check it for this caller? > fs/zonefs/super.c:786: ret = bio_iov_iter_get_pages(bio, from); > Incidentally, we have an incorrect use of iov_iter_truncate() in that one (compare > with iomap case, where we reexpand it afterwards)... > > I still don't get the logics of those round-downs. You've *already* verified > that each segment is a multiple of logical block size. And you are stuffing > as much as you can into bio, covering the data for as many segments as you > can. Sure, you might end up e.g. running into an unmapped page at wrong > offset (since your requirements for initial offsets might be milder than > logical block size). Or you might run out of pages bio would take. Either > might put the end of bio at the wrong offset. It is strange this function allows the possibility that bio_iov_add_page() can fail. There's no reason to grab more pages that exceed the bio_full() condition (ignoring the special ZONE_APPEND case for the moment). I think it will make more sense if we clean that part up first so the size for all successfully gotten pages can skip subsequent bio add page checks, and make the error handling unnecessary. > So why not trim it down *after* you are done adding pages into it? And do it > once, outside of the loop. IDGI... Validation is already done; I'm not > suggesting to allow weird segment lengths or to change behaviour of your > iov_iter_is_aligned() in any other way. I may have misunderstood your previous suggestion, but I still think this is the right way to go. 
The ALIGN_DOWN in its current location ensures the size we're appending to the bio is acceptable before we even start. It's easier to prevent adding pages to a bio IO than to back them out later. The latter would need something like a special cased version of bio_truncate(). Anyway, I have some changes testing right now that I think will fix up the issues you've raised, and make the rest a bit more clear. I'll send them for consideration this weekend if all is successful. > Put it another way, is there any possibility for __bio_iov_iter_get_pages() to > do a non-trivial round-down on anything other than the last iteration of that > loop?
On Wed, Jun 22, 2022 at 6:56 AM Al Viro <viro@zeniv.linux.org.uk> wrote: > > ... doing revert if we end up not using some pages > With the version from latest vfs.git#for-next as from [1] (which differs from this one) I see with LLVM-14: 5618: clang -Wp,-MMD,block/.bio.o.d -nostdinc -I./arch/x86/include -I./arch/x86/include/generated -I./include -I./arch/x86/include/uapi -I./arch/x86/include/generate d/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/compiler-version.h -include ./include/linux/kconfig.h -include ./include/linux/compiler_typ es.h -D__KERNEL__ -Qunused-arguments -fmacro-prefix-map=./= -Wall -Wundef -Werror=strict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -fshort-wchar -fno- PIE -Werror=implicit-function-declaration -Werror=implicit-int -Werror=return-type -Wno-format-security -std=gnu11 --target=x86_64-linux-gnu -fintegrated-as -Werror=un known-warning-option -Werror=ignored-optimization-argument -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -fcf-protection=none -m64 -falign-loops=1 -mno-80387 -mno-fp -ret-in-387 -mstack-alignment=8 -mskip-rax-setup -mtune=generic -mno-red-zone -mcmodel=kernel -Wno-sign-compare -fno-asynchronous-unwind-tables -mretpoline-external-th unk -fno-delete-null-pointer-checks -Wno-frame-address -Wno-address-of-packed-member -O2 -Wframe-larger-than=2048 -fstack-protector-strong -Wimplicit-fallthrough -Wno- gnu -Wno-unused-but-set-variable -Wno-unused-const-variable -fno-stack-clash-protection -pg -mfentry -DCC_USING_NOP_MCOUNT -DCC_USING_FENTRY -fno-lto -flto=thin -fspli t-lto-unit -fvisibility=hidden -Wdeclaration-after-statement -Wvla -Wno-pointer-sign -Wcast-function-type -fno-strict-overflow -fno-stack-check -Werror=date-time -Werr or=incompatible-pointer-types -Wno-initializer-overrides -Wno-format -Wno-sign-compare -Wno-format-zero-length -Wno-pointer-to-enum-cast -Wno-tautological-constant-out -of-range-compare -Wno-unaligned-access -g -gdwarf-5 -DKBUILD_MODFILE='"block/bio"' 
-DKBUILD_BASENAME='"bio"' -DKBUILD_MODNAME='"bio"' -D__KBUILD_MODNAME=kmod_bio - c -o block/bio.o block/bio.c [ ... ] 5635:block/bio.c:1232:6: warning: variable 'i' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] 5636- if (unlikely(!size)) { 5637- ^~~~~~~~~~~~~~~ 5638-./include/linux/compiler.h:78:22: note: expanded from macro 'unlikely' 5639-# define unlikely(x) __builtin_expect(!!(x), 0) 5640- ^~~~~~~~~~~~~~~~~~~~~~~~~~ 5641-block/bio.c:1254:9: note: uninitialized use occurs here 5642- while (i < nr_pages) 5643- ^ 5644-block/bio.c:1232:2: note: remove the 'if' if its condition is always false 5645- if (unlikely(!size)) { 5646- ^~~~~~~~~~~~~~~~~~~~~~ 5647-block/bio.c:1202:17: note: initialize the variable 'i' to silence this warning 5648- unsigned len, i; 5649- ^ 5650- = 0 Patch from [1] is attached. Regards, -Sedat- [1] https://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git/commit/block/bio.c?h=for-next&id=9a6469060316674230c0666c5706f7097e9278bb > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> > --- > block/bio.c | 15 ++++++--------- > block/blk-map.c | 7 ++++--- > 2 files changed, 10 insertions(+), 12 deletions(-) > > diff --git a/block/bio.c b/block/bio.c > index 51c99f2c5c90..01ab683e67be 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -1190,7 +1190,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); > pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); > > - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > + size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset); > if (unlikely(size <= 0)) > return size ? 
size : -EFAULT; > > @@ -1205,6 +1205,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > } else { > if (WARN_ON_ONCE(bio_full(bio, len))) { > bio_put_pages(pages + i, left, offset); > + iov_iter_revert(iter, left); > return -EINVAL; > } > __bio_add_page(bio, page, len, offset); > @@ -1212,7 +1213,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) > offset = 0; > } > > - iov_iter_advance(iter, size); > return 0; > } > > @@ -1227,7 +1227,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) > ssize_t size, left; > unsigned len, i; > size_t offset; > - int ret = 0; > > if (WARN_ON_ONCE(!max_append_sectors)) > return 0; > @@ -1240,7 +1239,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) > BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); > pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); > > - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); > + size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset); > if (unlikely(size <= 0)) > return size ? 
size : -EFAULT; > > @@ -1252,16 +1251,14 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) > if (bio_add_hw_page(q, bio, page, len, offset, > max_append_sectors, &same_page) != len) { > bio_put_pages(pages + i, left, offset); > - ret = -EINVAL; > - break; > + iov_iter_revert(iter, left); > + return -EINVAL; > } > if (same_page) > put_page(page); > offset = 0; > } > - > - iov_iter_advance(iter, size - left); > - return ret; > + return 0; > } > > /** > diff --git a/block/blk-map.c b/block/blk-map.c > index df8b066cd548..7196a6b64c80 100644 > --- a/block/blk-map.c > +++ b/block/blk-map.c > @@ -254,7 +254,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, > size_t offs, added = 0; > int npages; > > - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); > + bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); > if (unlikely(bytes <= 0)) { > ret = bytes ? bytes : -EFAULT; > goto out_unmap; > @@ -284,7 +284,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, > bytes -= n; > offs = 0; > } > - iov_iter_advance(iter, added); > } > /* > * release the pages we didn't map into the bio, if any > @@ -293,8 +292,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, > put_page(pages[j++]); > kvfree(pages); > /* couldn't stuff something into bio? */ > - if (bytes) > + if (bytes) { > + iov_iter_revert(iter, bytes); > break; > + } > } > > ret = blk_rq_append_bio(rq, bio); > -- > 2.30.2 >
diff --git a/block/bio.c b/block/bio.c index 51c99f2c5c90..01ab683e67be 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1190,7 +1190,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; @@ -1205,6 +1205,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) } else { if (WARN_ON_ONCE(bio_full(bio, len))) { bio_put_pages(pages + i, left, offset); + iov_iter_revert(iter, left); return -EINVAL; } __bio_add_page(bio, page, len, offset); @@ -1212,7 +1213,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) offset = 0; } - iov_iter_advance(iter, size); return 0; } @@ -1227,7 +1227,6 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) ssize_t size, left; unsigned len, i; size_t offset; - int ret = 0; if (WARN_ON_ONCE(!max_append_sectors)) return 0; @@ -1240,7 +1239,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + size = iov_iter_get_pages2(iter, pages, LONG_MAX, nr_pages, &offset); if (unlikely(size <= 0)) return size ? 
size : -EFAULT; @@ -1252,16 +1251,14 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) if (bio_add_hw_page(q, bio, page, len, offset, max_append_sectors, &same_page) != len) { bio_put_pages(pages + i, left, offset); - ret = -EINVAL; - break; + iov_iter_revert(iter, left); + return -EINVAL; } if (same_page) put_page(page); offset = 0; } - - iov_iter_advance(iter, size - left); - return ret; + return 0; } /** diff --git a/block/blk-map.c b/block/blk-map.c index df8b066cd548..7196a6b64c80 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -254,7 +254,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, size_t offs, added = 0; int npages; - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); + bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; @@ -284,7 +284,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, bytes -= n; offs = 0; } - iov_iter_advance(iter, added); } /* * release the pages we didn't map into the bio, if any @@ -293,8 +292,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, put_page(pages[j++]); kvfree(pages); /* couldn't stuff something into bio? */ - if (bytes) + if (bytes) { + iov_iter_revert(iter, bytes); break; + } } ret = blk_rq_append_bio(rq, bio);
... doing revert if we end up not using some pages Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- block/bio.c | 15 ++++++--------- block/blk-map.c | 7 ++++--- 2 files changed, 10 insertions(+), 12 deletions(-)