diff mbox series

[PATCHv3,2/5] block: bio-integrity: directly map user buffers

Message ID 20231120224058.2750705-3-kbusch@meta.com (mailing list archive)
State New, archived
Headers show
Series block integrity: directly map user space addresses | expand

Commit Message

Keith Busch Nov. 20, 2023, 10:40 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

Passthrough commands that utilize metadata currently bounce the user
space buffer through the kernel. Add support for mapping user space
directly so that we can avoid this costly overhead. This is similar to
how the normal bio data payload utilizes user addresses with
bio_map_user_iov().

If the user address can't be used directly for reasons like too many
segments or address misalignment, fall back to a copy of the user vec
while keeping the user address pinned for the IO duration so that it
can safely be copied on completion in any process context.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio-integrity.c | 212 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h   |  12 +++
 2 files changed, 224 insertions(+)

Comments

Jens Axboe Nov. 20, 2023, 11:19 p.m. UTC | #1
On 11/20/23 3:40 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Passthrough commands that utilize metadata currently bounce the user
> space buffer through the kernel. Add support for mapping user space
> directly so that we can avoid this costly overhead. This is similiar to
> how the normal bio data payload utilizes user addresses with
> bio_map_user_iov().
> 
> If the user address can't directly be used for reasons like too many
> segments or address unalignement, fallback to a copy of the user vec
> while keeping the user address pinned for the IO duration so that it
> can safely be copied on completion in any process context.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  block/bio-integrity.c | 212 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/bio.h   |  12 +++
>  2 files changed, 224 insertions(+)
> 
> diff --git a/block/bio-integrity.c b/block/bio-integrity.c
> index ec8ac8cf6e1b9..b761058bfb92f 100644
> --- a/block/bio-integrity.c
> +++ b/block/bio-integrity.c
> @@ -91,6 +91,37 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
>  }
>  EXPORT_SYMBOL(bio_integrity_alloc);
>  
> +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
> +{
> +	bool dirty = bio_data_dir(bip->bip_bio) == READ;
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +
> +	if (bip->bip_flags & BIP_COPY_USER) {
> +		unsigned short nr_vecs = bip->bip_max_vcnt - 1;
> +		struct bio_vec *copy = bvec_virt(&bip->bip_vec[nr_vecs]);
> +		size_t bytes = bip->bip_iter.bi_size;
> +		void *buf = bvec_virt(bip->bip_vec);
> +
> +		if (dirty) {
> +			struct iov_iter iter;
> +
> +			iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
> +			WARN_ON_ONCE(copy_to_iter(buf, bytes, &iter) != bytes);
> +		}

Minor nit, but I don't like hiding functions with side effects inside
potentially debug statements. Would be better to do:

	ret = copy_to_iter(buf, bytes, &iter);
	WARN_ON_ONCE(ret != bytes);

which is also easier to read, imho.

Apart from that, looks good to me.
Christoph Hellwig Nov. 21, 2023, 5:04 a.m. UTC | #2
On Mon, Nov 20, 2023 at 02:40:55PM -0800, Keith Busch wrote:
> +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
> +{
> +	bool dirty = bio_data_dir(bip->bip_bio) == READ;
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +
> +	if (bip->bip_flags & BIP_COPY_USER) {
> +		unsigned short nr_vecs = bip->bip_max_vcnt - 1;
> +		struct bio_vec *copy = bvec_virt(&bip->bip_vec[nr_vecs]);
> +		size_t bytes = bip->bip_iter.bi_size;
> +		void *buf = bvec_virt(bip->bip_vec);
> +
> +		if (dirty) {
> +			struct iov_iter iter;
> +
> +			iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
> +			WARN_ON_ONCE(copy_to_iter(buf, bytes, &iter) != bytes);
> +		}
> +
> +		memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy));
> +		kfree(copy);
> +		kfree(buf);

Nit: but I'd probably just split the user copy version into a separate
helper for clarity.  Nice trick with the temporary iter, we could probably
use this for the data path too.

> +extern int bio_integrity_map_user(struct bio *, void __user *, ssize_t, u32);

Can you drop the pointless extern and just spell out the parameters?
I know this follows the existing style, but that style is pretty
horrible :)
Anuj gupta Nov. 21, 2023, 4:10 p.m. UTC | #3
On Tue, Nov 21, 2023 at 4:11 AM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> Passthrough commands that utilize metadata currently bounce the user
> space buffer through the kernel. Add support for mapping user space
> directly so that we can avoid this costly overhead. This is similiar to

Nit: s/similiar/similar

 >  /**
>   * bio_integrity_free - Free bio integrity payload
>   * @bio:       bio containing bip to be freed
> @@ -105,6 +136,8 @@ void bio_integrity_free(struct bio *bio)
>
>         if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
>                 kfree(bvec_virt(bip->bip_vec));
> +       else if (bip->bip_flags & BIP_INTEGRITY_USER)
> +               bio_integrity_unmap_user(bip);;

Nit: extra semicolon here

--
Anuj Gupta
diff mbox series

Patch

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6e1b9..b761058bfb92f 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -91,6 +91,37 @@  struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{ /* Tear down a BIP_INTEGRITY_USER payload: copy back if bounced, unpin. */
+	bool dirty = bio_data_dir(bip->bip_bio) == READ; /* true for reads */
+	struct bvec_iter iter;
+	struct bio_vec bv;
+
+	if (bip->bip_flags & BIP_COPY_USER) {
+		unsigned short nr_vecs = bip->bip_max_vcnt - 1; /* slot holding 'copy' */
+		struct bio_vec *copy = bvec_virt(&bip->bip_vec[nr_vecs]);
+		size_t bytes = bip->bip_iter.bi_size;
+		void *buf = bvec_virt(bip->bip_vec); /* kernel bounce buffer */
+
+		if (dirty) { /* copy the bounce buffer out to the user bvecs */
+			struct iov_iter iter; /* shadows the outer bvec_iter 'iter' */
+
+			iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
+			WARN_ON_ONCE(copy_to_iter(buf, bytes, &iter) != bytes);
+		}
+
+		memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy)); /* restore user vecs */
+		kfree(copy);
+		kfree(buf);
+	}
+
+	bip_for_each_mp_vec(bv, bip, iter) { /* one unpin per bvec */
+		if (dirty && !PageCompound(bv.bv_page))
+			set_page_dirty_lock(bv.bv_page);
+		unpin_user_page(bv.bv_page);
+	}
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -105,6 +136,8 @@  void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	else if (bip->bip_flags & BIP_INTEGRITY_USER)
+		bio_integrity_unmap_user(bip);;
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -160,6 +193,185 @@  int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
+				   int nr_vecs, unsigned int len,
+				   unsigned int direction, u32 seed)
+{ /* Bounce path: attach a kmalloc'd copy as integrity data; 0 or -errno. */
+	struct bio_integrity_payload *bip;
+	struct bio_vec *copy_vec = NULL;
+	struct iov_iter iter;
+	void *buf;
+	int ret; /* NOTE(review): 'seed' is never consumed in this function */
+
+	/* We need to allocate a copy for the completion if bvec is on stack */
+	if (nr_vecs <= UIO_FASTIOV) { /* NOTE(review): inferred from count only */
+		copy_vec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL);
+		if (!copy_vec)
+			return -ENOMEM;
+		memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
+		bvec = copy_vec;
+	}
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto free_copy;
+	}
+
+	if (direction == ITER_SOURCE) { /* fill the bounce buffer from the bvecs */
+		iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+		if (!copy_from_iter_full(buf, len, &iter)) {
+			ret = -EFAULT;
+			goto free_buf;
+		}
+	} else { /* nothing to copy in yet: hand out a zeroed buffer */
+		memset(buf, 0, len);
+	}
+
+	/*
+	 * We need just one vec for this bip, but we also need to preserve a
+	 * pointer to the original bvec and the number of vecs in it for
+	 * completion handling
+	 */
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1);
+	if (IS_ERR(bip)) {
+		ret = PTR_ERR(bip);
+		goto free_buf;
+	}
+
+	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+				     offset_in_page(buf));
+	if (ret != len) {
+		ret = -ENOMEM;
+		goto free_bip;
+	}
+
+	/*
+	 * Save a pointer to the user bvec at the end of this bip's bvec for
+	 * completion handling: we know the index won't be used for anything
+	 * else.
+	 */
+	bvec_set_page(&bip->bip_vec[nr_vecs], virt_to_page(bvec), 0,
+		      offset_in_page(bvec));
+	bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
+	return 0;
+
+free_bip:
+	bio_integrity_free(bio);
+free_buf:
+	kfree(buf);
+free_copy:
+	kfree(copy_vec); /* NULL when no private copy was made */
+	return ret;
+}
+
+static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
+				    int nr_vecs, ssize_t bytes, ssize_t offset)
+{ /* Fill @bvec from @pages, merging adjacent same-folio pages; returns count. */
+	unsigned int nr_bvecs = 0;
+	int i, j;
+
+	for (i = 0; i < nr_vecs; i = j) { /* i: first page of a run, j: next run */
+		size_t size = min_t(size_t, bytes, PAGE_SIZE - offset);
+		struct folio *folio = page_folio(pages[i]);
+
+		bytes -= size;
+		for (j = i + 1; j < nr_vecs; j++) {
+			size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+			if (page_folio(pages[j]) != folio ||
+			    pages[j] != pages[j - 1] + 1)
+				break; /* other folio or not adjacent: run ends */
+			unpin_user_page(pages[j]); /* only pages[i] stays pinned */
+			size += next;
+			bytes -= next;
+		}
+
+		bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset);
+		offset = 0; /* only the first bvec starts at a nonzero offset */
+		nr_bvecs++;
+	}
+
+	return nr_bvecs;
+}
+
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
+			   u32 seed)
+{ /* Pin @ubuf as @bio's integrity payload, bouncing if needed; 0 or -errno. */
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
+	struct bio_vec bv, stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+	struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
+	struct bvec_iter bi = { .bi_size = bytes, };
+	unsigned int direction, nr_bvecs;
+	struct iov_iter iter;
+	int ret, nr_vecs;
+	size_t offset;
+	bool copy; /* NOTE(review): 'seed' is not consumed on the direct path */
+
+	if (bio_integrity(bio))
+		return -EINVAL;
+	if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
+		return -E2BIG;
+
+	if (bio_data_dir(bio) == READ)
+		direction = ITER_DEST;
+	else
+		direction = ITER_SOURCE;
+
+	iov_iter_ubuf(&iter, direction, ubuf, bytes);
+	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+	if (nr_vecs > BIO_MAX_VECS)
+		return -E2BIG;
+	if (nr_vecs > UIO_FASTIOV) { /* too many pages for the on-stack arrays */
+		bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL);
+		if (!bvec)
+			return -ENOMEM;
+		pages = NULL; /* presumably makes extract_pages allocate; see kvfree */
+	}
+
+	copy = !iov_iter_is_aligned(&iter, align, align);
+	ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+	if (unlikely(ret < 0))
+		goto free_bvec;
+
+	nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
+	if (pages != stack_pages)
+		kvfree(pages);
+
+	if (nr_bvecs > queue_max_integrity_segments(q) || copy) {
+		ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
+					      direction, seed);
+		if (ret)
+			goto release_pages;
+	} else {
+		struct bio_integrity_payload *bip;
+
+		bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_bvecs);
+		if (IS_ERR(bip)) {
+			ret = PTR_ERR(bip);
+			goto release_pages;
+		}
+
+		memcpy(bip->bip_vec, bvec, nr_bvecs * sizeof(*bvec));
+		bip->bip_flags |= BIP_INTEGRITY_USER;
+		bip->bip_iter.bi_size = bytes;
+		if (bvec != stack_vec) /* bip has its own copy of the vecs now */
+			kfree(bvec);
+	}
+
+	return 0; /* NOTE(review): copy path may leave a heap bvec unfreed; confirm */
+
+release_pages:
+	for_each_mp_bvec(bv, bvec, bi, bi) /* one pin left per bvec, not per page */
+		unpin_user_page(bv.bv_page);
+free_bvec:
+	if (bvec != stack_vec)
+		kfree(bvec);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_map_user);
+
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 41d417ee13499..09e123e7c4941 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,8 @@  enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_INTEGRITY_USER	= 1 << 5, /* Integrity payload is user address */
+	BIP_COPY_USER		= 1 << 6, /* Kernel bounce buffer in use */
 };
 
 /*
@@ -714,12 +716,16 @@  static inline bool bioset_initialized(struct bio_set *bs)
 #define bip_for_each_vec(bvl, bip, iter)				\
 	for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter)
 
+#define bip_for_each_mp_vec(bvl, bip, iter)				\
+	for_each_mp_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) /* mp form of bip_for_each_vec */
+
 #define bio_for_each_integrity_vec(_bvl, _bio, _iter)			\
 	for_each_bio(_bio)						\
 		bip_for_each_vec(_bvl, _bio->bi_integrity, _iter)
 
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
+extern int bio_integrity_map_user(struct bio *, void __user *, ssize_t, u32);
 extern bool bio_integrity_prep(struct bio *);
 extern void bio_integrity_advance(struct bio *, unsigned int);
 extern void bio_integrity_trim(struct bio *);
@@ -789,6 +795,12 @@  static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 	return 0;
 }
 
+static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
+					 ssize_t len, u32 seed)
+{ /* Stub: always -EINVAL; presumably the !CONFIG_BLK_DEV_INTEGRITY variant */
+	return -EINVAL;
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 /*