[11/19] block: implement bio helper to add iter bvec pages to bio
diff mbox series

Message ID 20190211190049.7888-13-axboe@kernel.dk
State New
Headers show
Series
  • [01/19] fs: add an iopoll method to struct file_operations
Related show

Commit Message

Jens Axboe Feb. 11, 2019, 7 p.m. UTC
For an ITER_BVEC, we can just iterate the iov and add the pages
to the bio directly. This requires that the caller doesn't releases
the pages on IO completion, we add a BIO_NO_PAGE_REF flag for that.

The current two callers of bio_iov_iter_get_pages() are updated to
check if they need to release pages on completion. This makes them
work with bvecs that contain kernel mapped pages already.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 59 ++++++++++++++++++++++++++++++++-------
 fs/block_dev.c            |  5 ++--
 fs/iomap.c                |  5 ++--
 include/linux/blk_types.h |  1 +
 4 files changed, 56 insertions(+), 14 deletions(-)

Comments

Ming Lei Feb. 20, 2019, 10:58 p.m. UTC | #1
On Mon, Feb 11, 2019 at 12:00:41PM -0700, Jens Axboe wrote:
> For an ITER_BVEC, we can just iterate the iov and add the pages
> to the bio directly. This requires that the caller doesn't releases
> the pages on IO completion, we add a BIO_NO_PAGE_REF flag for that.
> 
> The current two callers of bio_iov_iter_get_pages() are updated to
> check if they need to release pages on completion. This makes them
> work with bvecs that contain kernel mapped pages already.
> 
> Reviewed-by: Hannes Reinecke <hare@suse.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/bio.c               | 59 ++++++++++++++++++++++++++++++++-------
>  fs/block_dev.c            |  5 ++--
>  fs/iomap.c                |  5 ++--
>  include/linux/blk_types.h |  1 +
>  4 files changed, 56 insertions(+), 14 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 4db1008309ed..330df572cfb8 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -828,6 +828,23 @@ int bio_add_page(struct bio *bio, struct page *page,
>  }
>  EXPORT_SYMBOL(bio_add_page);
>  
> +static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
> +{
> +	const struct bio_vec *bv = iter->bvec;
> +	unsigned int len;
> +	size_t size;
> +
> +	len = min_t(size_t, bv->bv_len, iter->count);
> +	size = bio_add_page(bio, bv->bv_page, len,
> +				bv->bv_offset + iter->iov_offset);

iter->iov_offset needs to be subtracted from 'len', looks
the following delta change[1] is required, otherwise memory corruption
can be observed when running xfstests over loop/dio.

Another interesting thing is that bio_add_page() is capable of
adding multi contiguous pages actually, especially loop uses
ITER_BVEC to pass multi-page bvecs. Even though pages in loop's
ITER_BVEC may belong to user-space, looks it is still safe to not
grab the page ref given it has been done by fs. 

[1]
diff --git a/block/bio.c b/block/bio.c
index 3b49963676fc..df99bb3816a1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -842,7 +842,10 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned int len;
 	size_t size;
 
-	len = min_t(size_t, bv->bv_len, iter->count);
+	if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len))
+		return -EINVAL;
+
+	len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count);
 	size = bio_add_page(bio, bv->bv_page, len,
 				bv->bv_offset + iter->iov_offset);
 	if (size == len) {

Thanks,
Ming
Jens Axboe Feb. 21, 2019, 5:45 p.m. UTC | #2
On 2/20/19 3:58 PM, Ming Lei wrote:
> On Mon, Feb 11, 2019 at 12:00:41PM -0700, Jens Axboe wrote:
>> For an ITER_BVEC, we can just iterate the iov and add the pages
>> to the bio directly. This requires that the caller doesn't releases
>> the pages on IO completion, we add a BIO_NO_PAGE_REF flag for that.
>>
>> The current two callers of bio_iov_iter_get_pages() are updated to
>> check if they need to release pages on completion. This makes them
>> work with bvecs that contain kernel mapped pages already.
>>
>> Reviewed-by: Hannes Reinecke <hare@suse.com>
>> Reviewed-by: Christoph Hellwig <hch@lst.de>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  block/bio.c               | 59 ++++++++++++++++++++++++++++++++-------
>>  fs/block_dev.c            |  5 ++--
>>  fs/iomap.c                |  5 ++--
>>  include/linux/blk_types.h |  1 +
>>  4 files changed, 56 insertions(+), 14 deletions(-)
>>
>> diff --git a/block/bio.c b/block/bio.c
>> index 4db1008309ed..330df572cfb8 100644
>> --- a/block/bio.c
>> +++ b/block/bio.c
>> @@ -828,6 +828,23 @@ int bio_add_page(struct bio *bio, struct page *page,
>>  }
>>  EXPORT_SYMBOL(bio_add_page);
>>  
>> +static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
>> +{
>> +	const struct bio_vec *bv = iter->bvec;
>> +	unsigned int len;
>> +	size_t size;
>> +
>> +	len = min_t(size_t, bv->bv_len, iter->count);
>> +	size = bio_add_page(bio, bv->bv_page, len,
>> +				bv->bv_offset + iter->iov_offset);
> 
> iter->iov_offset needs to be subtracted from 'len', looks
> the following delta change[1] is required, otherwise memory corruption
> can be observed when running xfstests over loop/dio.

Thanks, I folded this in.

Patch
diff mbox series

diff --git a/block/bio.c b/block/bio.c
index 4db1008309ed..330df572cfb8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -828,6 +828,23 @@  int bio_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_add_page);
 
+static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
+{
+	const struct bio_vec *bv = iter->bvec;
+	unsigned int len;
+	size_t size;
+
+	len = min_t(size_t, bv->bv_len, iter->count);
+	size = bio_add_page(bio, bv->bv_page, len,
+				bv->bv_offset + iter->iov_offset);
+	if (size == len) {
+		iov_iter_advance(iter, size);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
 
 /**
@@ -876,23 +893,43 @@  static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 }
 
 /**
- * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
- * @iter: iov iterator describing the region to be mapped
+ * @iter: iov iterator describing the region to be added
+ *
+ * This takes either an iterator pointing to user memory, or one pointing to
+ * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
+ * map them into the kernel. On IO completion, the caller should put those
+ * pages. If we're adding kernel pages, we just have to add the pages to the
+ * bio directly. We don't grab an extra reference to those pages (the user
+ * should already have that), and we don't put the page on IO completion.
+ * The caller needs to check if the bio is flagged BIO_NO_PAGE_REF on IO
+ * completion. If it isn't, then pages should be released.
  *
- * Pins pages from *iter and appends them to @bio's bvec array. The
- * pages will have to be released using put_page() when done.
  * The function tries, but does not guarantee, to pin as many pages as
- * fit into the bio, or are requested in *iter, whatever is smaller.
- * If MM encounters an error pinning the requested pages, it stops.
- * Error is returned only if 0 pages could be pinned.
+ * fit into the bio, or are requested in *iter, whatever is smaller. If
+ * MM encounters an error pinning the requested pages, it stops. Error
+ * is returned only if 0 pages could be pinned.
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
+	const bool is_bvec = iov_iter_is_bvec(iter);
 	unsigned short orig_vcnt = bio->bi_vcnt;
 
+	/*
+	 * If this is a BVEC iter, then the pages are kernel pages. Don't
+	 * release them on IO completion.
+	 */
+	if (is_bvec)
+		bio_set_flag(bio, BIO_NO_PAGE_REF);
+
 	do {
-		int ret = __bio_iov_iter_get_pages(bio, iter);
+		int ret;
+
+		if (is_bvec)
+			ret = __bio_iov_bvec_add_pages(bio, iter);
+		else
+			ret = __bio_iov_iter_get_pages(bio, iter);
 
 		if (unlikely(ret))
 			return bio->bi_vcnt > orig_vcnt ? 0 : ret;
@@ -1634,7 +1671,8 @@  static void bio_dirty_fn(struct work_struct *work)
 		next = bio->bi_private;
 
 		bio_set_pages_dirty(bio);
-		bio_release_pages(bio);
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+			bio_release_pages(bio);
 		bio_put(bio);
 	}
 }
@@ -1650,7 +1688,8 @@  void bio_check_pages_dirty(struct bio *bio)
 			goto defer;
 	}
 
-	bio_release_pages(bio);
+	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+		bio_release_pages(bio);
 	bio_put(bio);
 	return;
 defer:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 392e2bfb636f..051ab41d1c61 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -338,8 +338,9 @@  static void blkdev_bio_end_io(struct bio *bio)
 		struct bio_vec *bvec;
 		int i;
 
-		bio_for_each_segment_all(bvec, bio, i)
-			put_page(bvec->bv_page);
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+			bio_for_each_segment_all(bvec, bio, i)
+				put_page(bvec->bv_page);
 		bio_put(bio);
 	}
 }
diff --git a/fs/iomap.c b/fs/iomap.c
index 2ac9eb746d44..9389cf0a1c6f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1591,8 +1591,9 @@  static void iomap_dio_bio_end_io(struct bio *bio)
 		struct bio_vec *bvec;
 		int i;
 
-		bio_for_each_segment_all(bvec, bio, i)
-			put_page(bvec->bv_page);
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+			bio_for_each_segment_all(bvec, bio, i)
+				put_page(bvec->bv_page);
 		bio_put(bio);
 	}
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d66bf5f32610..791fee35df88 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -215,6 +215,7 @@  struct bio {
 /*
  * bio flags
  */
+#define BIO_NO_PAGE_REF	0	/* don't put release vec pages */
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */