
[PATCHv2,1/4] block: bio-integrity: directly map user buffers

Message ID 20231027181929.2589937-2-kbusch@meta.com (mailing list archive)
State New, archived
Series block integrity: directly map user space addresses

Commit Message

Keith Busch Oct. 27, 2023, 6:19 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

Passthrough commands that utilize metadata currently need to bounce the
user space buffer through the kernel. Add support for mapping user space
directly so that we can avoid this costly overhead. This is similar to
how the normal bio data payload utilizes user addresses with
bio_map_user_iov().

If the user address can't directly be used for reasons like too many
segments or address misalignment, fall back to a copy of the user vec
while keeping the user address pinned for the IO duration so that it
can safely be copied on completion in any process context.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio-integrity.c | 195 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h   |   9 ++
 2 files changed, 204 insertions(+)
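
As a usage sketch, a hypothetical caller (for example, a passthrough
driver wired up later in this series) would attach the user metadata
buffer as below; only the bio_integrity_map_user() signature comes from
this patch, the surrounding names and error label are illustrative:

	/* attach a user-space metadata buffer to a passthrough bio; on
	 * success the pages stay pinned until bio_integrity_free() unmaps
	 * them at completion
	 */
	if (meta_ubuf) {
		ret = bio_integrity_map_user(bio, meta_ubuf, meta_len, seed);
		if (ret)
			goto out_unmap_data;
	}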

Comments

Pankaj Raghav Oct. 30, 2023, 2:40 p.m. UTC | #1
> +static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
> +				   int nr_vecs, unsigned int len,
> +				   unsigned int direction, u32 seed)
> +{
> +	struct bio_integrity_payload *bip;
> +	struct bio_vec *copy_vec = NULL;
> +	struct iov_iter iter;
> +	void *buf;
> +	int ret;
> +
> +	/* if bvec is on the stack, we need to allocate a copy for the completion */
> +	if (nr_vecs <= UIO_FASTIOV) {
> +		copy_vec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> +		if (!copy_vec)
> +			return -ENOMEM;
> +		memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
> +	}
> +
> +	buf = kmalloc(len, GFP_KERNEL);
> +	if (!buf)
> +		goto free_copy;

ret is not set to -ENOMEM here.
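
A minimal fix (sketch) would set the error code before the goto:

	buf = kmalloc(len, GFP_KERNEL);
	if (!buf) {
		ret = -ENOMEM;
		goto free_copy;
	}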

> +
> +	if (direction == ITER_SOURCE) {
> +		iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
> +		if (!copy_from_iter_full(buf, len, &iter)) {
> +			ret = -EFAULT;
> +			goto free_buf;
> +		}
> +	} else {
> +		memset(buf, 0, len);
> +	}
> +
> +	/*
> +	 * We just need one vec for this bip, but we need to preserve the
> +	 * number of vecs in the user bvec for the completion handling, so use
> +	 * nr_vecs.
> +	 */
> +	bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
> +	if (IS_ERR(bip)) {
> +		ret = PTR_ERR(bip);
> +		goto free_buf;
> +	}
> +
> +	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
> +				     offset_in_page(buf));
> +	if (ret != len) {
> +		ret = -ENOMEM;
> +		goto free_bip;
> +	}
> +
> +	bip->bip_flags |= BIP_INTEGRITY_USER;
> +	bip->copy_vec = copy_vec ?: bvec;
> +	return 0;
> +free_bip:
> +	bio_integrity_free(bio);
> +free_buf:
> +	kfree(buf);
> +free_copy:
> +	kfree(copy_vec);
> +	return ret;
> +}
> +
Keith Busch Oct. 30, 2023, 2:54 p.m. UTC | #2
On Mon, Oct 30, 2023 at 03:40:47PM +0100, Pankaj Raghav wrote:
> > +	int ret;
> > +
> > +	/* if bvec is on the stack, we need to allocate a copy for the completion */
> > +	if (nr_vecs <= UIO_FASTIOV) {
> > +		copy_vec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> > +		if (!copy_vec)
> > +			return -ENOMEM;
> > +		memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
> > +	}
> > +
> > +	buf = kmalloc(len, GFP_KERNEL);
> > +	if (!buf)
> > +		goto free_copy;
> 
> ret is not set to -ENOMEM here.

Indeed, thanks for pointing that out. I'll wait a bit longer before
posting a v3.
kernel test robot Oct. 30, 2023, 3:27 p.m. UTC | #3
Hi Keith,

kernel test robot noticed the following build errors:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on linus/master v6.6 next-20231030]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/block-bio-integrity-directly-map-user-buffers/20231028-022107
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20231027181929.2589937-2-kbusch%40meta.com
patch subject: [PATCHv2 1/4] block: bio-integrity: directly map user buffers
config: powerpc-allnoconfig (https://download.01.org/0day-ci/archive/20231030/202310302318.pPweFNME-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231030/202310302318.pPweFNME-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310302318.pPweFNME-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from include/linux/blkdev.h:17,
                    from lib/vsprintf.c:47:
   include/linux/bio.h: In function 'bio_integrity_map_user':
>> include/linux/bio.h:799:1: error: expected ';' before '}' token
     799 | }
         | ^
   lib/vsprintf.c: In function 'va_format':
   lib/vsprintf.c:1682:9: warning: function 'va_format' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format]
    1682 |         buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va);
         |         ^~~


vim +799 include/linux/bio.h

   794	
   795	static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
   796						 unsigned int len, u32 seed)
   797	{
   798		return -EINVAL
 > 799	}
   800
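
The error is just the missing semicolon in the !CONFIG_BLK_DEV_INTEGRITY
stub; the corrected inline would read:

	static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
						 unsigned int len, u32 seed)
	{
		return -EINVAL;
	}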
Kanchan Joshi Oct. 30, 2023, 9:02 p.m. UTC | #4
On 10/27/2023 11:49 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Passthrough commands that utilize metadata currently need to bounce the
> user space buffer through the kernel. Add support for mapping user space
> directly so that we can avoid this costly overhead. This is similar to
> how the normal bio data payload utilizes user addresses with
> bio_map_user_iov().
> 
> If the user address can't directly be used for reasons like too many
> segments or address misalignment, fall back to a copy of the user vec
> while keeping the user address pinned for the IO duration so that it
> can safely be copied on completion in any process context.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   block/bio-integrity.c | 195 ++++++++++++++++++++++++++++++++++++++++++
>   include/linux/bio.h   |   9 ++
>   2 files changed, 204 insertions(+)
> 
> diff --git a/block/bio-integrity.c b/block/bio-integrity.c
> index ec8ac8cf6e1b9..7f9d242ad79df 100644
> --- a/block/bio-integrity.c
> +++ b/block/bio-integrity.c
> @@ -91,6 +91,37 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
>   }
>   EXPORT_SYMBOL(bio_integrity_alloc);
>   
> +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
> +{
> +	bool dirty = bio_data_dir(bip->bip_bio) == READ;
> +	struct bio_vec *copy = bip->copy_vec;
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +
> +	if (copy) {
> +		unsigned short nr_vecs = bip->bip_max_vcnt;
> +		size_t bytes = bip->bip_iter.bi_size;
> +		void *buf = bvec_virt(bip->bip_vec);
> +
> +		if (dirty) {
> +			struct iov_iter iter;
> +
> +			iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
> +			WARN_ON(copy_to_iter(buf, bytes, &iter) != bytes);
> +		}
> +
> +		memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy));
> +		kfree(copy);
> +		kfree(buf);
> +	}
> +
> +	bip_for_each_vec(bv, bip, iter) {
> +		if (dirty && !PageCompound(bv.bv_page))
> +			set_page_dirty_lock(bv.bv_page);
> +		unpin_user_page(bv.bv_page);
> +	}
> +}

Leak here, page-unpinning loop will not execute for the common (i.e., 
no-copy) case...


> +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> +			   u32 seed)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
> +	int ret, direction, nr_vecs, i, j, folios = 0;
> +	struct bio_vec stack_vec[UIO_FASTIOV];
> +	struct bio_vec bv, *bvec = stack_vec;
> +	struct page *stack_pages[UIO_FASTIOV];
> +	struct page **pages = stack_pages;
> +	struct bio_integrity_payload *bip;
> +	struct iov_iter iter;
> +	struct bvec_iter bi;
> +	u32 bytes;
> +
> +	if (bio_integrity(bio))
> +		return -EINVAL;
> +	if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
> +		return -E2BIG;
> +
> +	if (bio_data_dir(bio) == READ)
> +		direction = ITER_DEST;
> +	else
> +		direction = ITER_SOURCE;
> +
> +	iov_iter_ubuf(&iter, direction, ubuf, len);
> +	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
> +	if (nr_vecs > BIO_MAX_VECS)
> +		return -E2BIG;
> +	if (nr_vecs > UIO_FASTIOV) {
> +		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> +		if (!bvec)
> +			return -ENOMEM;
> +		pages = NULL;
> +	}
> +
> +	bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
> +	if (unlikely(bytes < 0)) {
> +		ret =  bytes;
> +		goto free_bvec;
> +	}
> +
> +	for (i = 0; i < nr_vecs; i = j) {
> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> +		struct folio *folio = page_folio(pages[i]);
> +
> +		bytes -= size;
> +		for (j = i + 1; j < nr_vecs; j++) {
> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +
> +			if (page_folio(pages[j]) != folio ||
> +			    pages[j] != pages[j - 1] + 1)
> +				break;
> +			unpin_user_page(pages[j]);
> +			size += next;
> +			bytes -= next;
> +		}
> +
> +		bvec_set_page(&bvec[folios], pages[i], size, offs);
> +		offs = 0;
> +		folios++;
> +	}
> +
> +	if (pages != stack_pages)
> +		kvfree(pages);
> +
> +	if (folios > queue_max_integrity_segments(q) ||
> +	    !iov_iter_is_aligned(&iter, align, align)) {
> +		ret = bio_integrity_copy_user(bio, bvec, folios, len,
> +					      direction, seed);
> +		if (ret)
> +			goto release_pages;
> +		return 0;
> +	}
> +
> +	bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
> +	if (IS_ERR(bip)) {
> +		ret = PTR_ERR(bip);
> +		goto release_pages;
> +	}
> +
> +	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));

Because with this way of copying, bip->bip_iter.bi_size will remain zero.

Second, is it fine not to have those virt-alignment checks that are done 
by bvec_gap_to_prev() when the pages are added using 
bio_integrity_add_page()?
Keith Busch Oct. 30, 2023, 9:25 p.m. UTC | #5
On Tue, Oct 31, 2023 at 02:32:48AM +0530, Kanchan Joshi wrote:
> On 10/27/2023 11:49 PM, Keith Busch wrote:
> > +
> > +	bip_for_each_vec(bv, bip, iter) {
> > +		if (dirty && !PageCompound(bv.bv_page))
> > +			set_page_dirty_lock(bv.bv_page);
> > +		unpin_user_page(bv.bv_page);
> > +	}
> > +}
> 
> Leak here, page-unpinning loop will not execute for the common (i.e., 
> no-copy) case...
> 
> > +	bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
> > +	if (IS_ERR(bip)) {
> > +		ret = PTR_ERR(bip);
> > +		goto release_pages;
> > +	}
> > +
> > +	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
> 
> Because with this way of copying, bip->bip_iter.bi_size will remain zero.

Good catch.
 
> Second, is it fine not to have those virt-alignment checks that are done 
> by bvec_gap_to_prev() when the pages are added using 
> bio_integrity_add_page()?

We're mapping a single user buffer. It's guaranteed to be virtually
contiguous, so no gaps.
kernel test robot Oct. 31, 2023, 12:13 a.m. UTC | #6
Hi Keith,

kernel test robot noticed the following build warnings:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.6 next-20231030]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/block-bio-integrity-directly-map-user-buffers/20231028-022107
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20231027181929.2589937-2-kbusch%40meta.com
patch subject: [PATCHv2 1/4] block: bio-integrity: directly map user buffers
config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20231031/202310310704.l4FwoJDd-lkp@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231031/202310310704.l4FwoJDd-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310310704.l4FwoJDd-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> block/bio-integrity.c:215:6: warning: variable 'ret' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
           if (!buf)
               ^~~~
   block/bio-integrity.c:255:9: note: uninitialized use occurs here
           return ret;
                  ^~~
   block/bio-integrity.c:215:2: note: remove the 'if' if its condition is always false
           if (!buf)
           ^~~~~~~~~
   block/bio-integrity.c:204:9: note: initialize the variable 'ret' to silence this warning
           int ret;
                  ^
                   = 0
   1 warning generated.


vim +215 block/bio-integrity.c

   195	
   196	static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
   197					   int nr_vecs, unsigned int len,
   198					   unsigned int direction, u32 seed)
   199	{
   200		struct bio_integrity_payload *bip;
   201		struct bio_vec *copy_vec = NULL;
   202		struct iov_iter iter;
   203		void *buf;
   204		int ret;
   205	
   206		/* if bvec is on the stack, we need to allocate a copy for the completion */
   207		if (nr_vecs <= UIO_FASTIOV) {
   208			copy_vec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
   209			if (!copy_vec)
   210				return -ENOMEM;
   211			memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
   212		}
   213	
   214		buf = kmalloc(len, GFP_KERNEL);
 > 215		if (!buf)
   216			goto free_copy;
   217	
   218		if (direction == ITER_SOURCE) {
   219			iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
   220			if (!copy_from_iter_full(buf, len, &iter)) {
   221				ret = -EFAULT;
   222				goto free_buf;
   223			}
   224		} else {
   225			memset(buf, 0, len);
   226		}
   227	
   228		/*
   229		 * We just need one vec for this bip, but we need to preserve the
   230		 * number of vecs in the user bvec for the completion handling, so use
   231		 * nr_vecs.
   232		 */
   233		bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
   234		if (IS_ERR(bip)) {
   235			ret = PTR_ERR(bip);
   236			goto free_buf;
   237		}
   238	
   239		ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
   240					     offset_in_page(buf));
   241		if (ret != len) {
   242			ret = -ENOMEM;
   243			goto free_bip;
   244		}
   245	
   246		bip->bip_flags |= BIP_INTEGRITY_USER;
   247		bip->copy_vec = copy_vec ?: bvec;
   248		return 0;
   249	free_bip:
   250		bio_integrity_free(bio);
   251	free_buf:
   252		kfree(buf);
   253	free_copy:
   254		kfree(copy_vec);
   255		return ret;
   256	}
   257
kernel test robot Oct. 31, 2023, 2:46 a.m. UTC | #7
Hi Keith,

kernel test robot noticed the following build errors:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on linus/master v6.6 next-20231030]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/block-bio-integrity-directly-map-user-buffers/20231028-022107
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20231027181929.2589937-2-kbusch%40meta.com
patch subject: [PATCHv2 1/4] block: bio-integrity: directly map user buffers
config: mips-allmodconfig (https://download.01.org/0day-ci/archive/20231031/202310311041.38ISTxlo-lkp@intel.com/config)
compiler: mips-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231031/202310311041.38ISTxlo-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310311041.38ISTxlo-lkp@intel.com/

All errors (new ones prefixed by >>):

   block/bio-integrity.c: In function 'bio_integrity_map_user':
>> block/bio-integrity.c:294:72: error: passing argument 6 of 'iov_iter_extract_pages' from incompatible pointer type [-Werror=incompatible-pointer-types]
     294 |         bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
         |                                                                        ^~~~~
         |                                                                        |
         |                                                                        long unsigned int *
   In file included from include/linux/bio.h:11,
                    from include/linux/blkdev.h:17,
                    from include/linux/blk-mq.h:5,
                    from include/linux/blk-integrity.h:5,
                    from block/bio-integrity.c:9:
   include/linux/uio.h:400:40: note: expected 'size_t *' {aka 'unsigned int *'} but argument is of type 'long unsigned int *'
     400 |                                size_t *offset0);
         |                                ~~~~~~~~^~~~~~~
   cc1: some warnings being treated as errors


vim +/iov_iter_extract_pages +294 block/bio-integrity.c

   257	
   258	int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
   259				   u32 seed)
   260	{
   261		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
   262		unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
   263		int ret, direction, nr_vecs, i, j, folios = 0;
   264		struct bio_vec stack_vec[UIO_FASTIOV];
   265		struct bio_vec bv, *bvec = stack_vec;
   266		struct page *stack_pages[UIO_FASTIOV];
   267		struct page **pages = stack_pages;
   268		struct bio_integrity_payload *bip;
   269		struct iov_iter iter;
   270		struct bvec_iter bi;
   271		u32 bytes;
   272	
   273		if (bio_integrity(bio))
   274			return -EINVAL;
   275		if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
   276			return -E2BIG;
   277	
   278		if (bio_data_dir(bio) == READ)
   279			direction = ITER_DEST;
   280		else
   281			direction = ITER_SOURCE;
   282	
   283		iov_iter_ubuf(&iter, direction, ubuf, len);
   284		nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
   285		if (nr_vecs > BIO_MAX_VECS)
   286			return -E2BIG;
   287		if (nr_vecs > UIO_FASTIOV) {
   288			bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
   289			if (!bvec)
   290				return -ENOMEM;
   291			pages = NULL;
   292		}
   293	
 > 294		bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
   295		if (unlikely(bytes < 0)) {
   296			ret =  bytes;
   297			goto free_bvec;
   298		}
   299	
   300		for (i = 0; i < nr_vecs; i = j) {
   301			size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
   302			struct folio *folio = page_folio(pages[i]);
   303	
   304			bytes -= size;
   305			for (j = i + 1; j < nr_vecs; j++) {
   306				size_t next = min_t(size_t, PAGE_SIZE, bytes);
   307	
   308				if (page_folio(pages[j]) != folio ||
   309				    pages[j] != pages[j - 1] + 1)
   310					break;
   311				unpin_user_page(pages[j]);
   312				size += next;
   313				bytes -= next;
   314			}
   315	
   316			bvec_set_page(&bvec[folios], pages[i], size, offs);
   317			offs = 0;
   318			folios++;
   319		}
   320	
   321		if (pages != stack_pages)
   322			kvfree(pages);
   323	
   324		if (folios > queue_max_integrity_segments(q) ||
   325		    !iov_iter_is_aligned(&iter, align, align)) {
   326			ret = bio_integrity_copy_user(bio, bvec, folios, len,
   327						      direction, seed);
   328			if (ret)
   329				goto release_pages;
   330			return 0;
   331		}
   332	
   333		bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
   334		if (IS_ERR(bip)) {
   335			ret = PTR_ERR(bip);
   336			goto release_pages;
   337		}
   338	
   339		memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
   340		if (bvec != stack_vec)
   341			kfree(bvec);
   342	
   343		bip->bip_flags |= BIP_INTEGRITY_USER;
   344		bip->copy_vec = NULL;
   345		return 0;
   346	
   347	release_pages:
   348		bi.bi_size = len;
   349		for_each_bvec(bv, bvec, bi, bi)
   350			unpin_user_page(bv.bv_page);
   351	free_bvec:
   352		if (bvec != stack_vec)
   353			kfree(bvec);
   354		return ret;
   355	}
   356	EXPORT_SYMBOL_GPL(bio_integrity_map_user);
   357
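
The mismatch shows up on 32-bit targets where size_t is unsigned int. A
likely fix (sketch) is to give 'offs' the type iov_iter_extract_pages()
takes; and since that function returns ssize_t, 'bytes' arguably wants a
signed type too so the 'bytes < 0' check can ever fire:

	size_t offs;
	ssize_t bytes;
	...
	bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
	if (unlikely(bytes < 0)) {
		ret = bytes;
		goto free_bvec;
	}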
Kanchan Joshi Nov. 6, 2023, 5:48 a.m. UTC | #8
On 10/27/2023 11:49 PM, Keith Busch wrote:
> +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> +			   u32 seed)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
> +	int ret, direction, nr_vecs, i, j, folios = 0;
> +	struct bio_vec stack_vec[UIO_FASTIOV];
> +	struct bio_vec bv, *bvec = stack_vec;
> +	struct page *stack_pages[UIO_FASTIOV];
> +	struct page **pages = stack_pages;
> +	struct bio_integrity_payload *bip;
> +	struct iov_iter iter;
> +	struct bvec_iter bi;
> +	u32 bytes;
> +
> +	if (bio_integrity(bio))
> +		return -EINVAL;
> +	if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
> +		return -E2BIG;
> +
> +	if (bio_data_dir(bio) == READ)
> +		direction = ITER_DEST;
> +	else
> +		direction = ITER_SOURCE;
> +
> +	iov_iter_ubuf(&iter, direction, ubuf, len);
> +	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
> +	if (nr_vecs > BIO_MAX_VECS)
> +		return -E2BIG;
> +	if (nr_vecs > UIO_FASTIOV) {
> +		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> +		if (!bvec)
> +			return -ENOMEM;
> +		pages = NULL;
> +	}
> +
> +	bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
> +	if (unlikely(bytes < 0)) {
> +		ret =  bytes;
> +		goto free_bvec;
> +	}
> +
> +	for (i = 0; i < nr_vecs; i = j) {
> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> +		struct folio *folio = page_folio(pages[i]);
> +
> +		bytes -= size;
> +		for (j = i + 1; j < nr_vecs; j++) {
> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +
> +			if (page_folio(pages[j]) != folio ||
> +			    pages[j] != pages[j - 1] + 1)
> +				break;
> +			unpin_user_page(pages[j]);

Is this unpin correct here?
Keith Busch Nov. 6, 2023, 3:02 p.m. UTC | #9
On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
> On 10/27/2023 11:49 PM, Keith Busch wrote:
> > +	for (i = 0; i < nr_vecs; i = j) {
> > +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> > +		struct folio *folio = page_folio(pages[i]);
> > +
> > +		bytes -= size;
> > +		for (j = i + 1; j < nr_vecs; j++) {
> > +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> > +
> > +			if (page_folio(pages[j]) != folio ||
> > +			    pages[j] != pages[j - 1] + 1)
> > +				break;
> > +			unpin_user_page(pages[j]);
> 
> Is this unpin correct here?

Should be. The pages are bound to the folio, so this doesn't really
unpin the user page. It just drops a reference, and the folio holds the
final reference to the contiguous pages, which is released on
completion. You can find the same idea in io_uring/rsrc.c,
io_sqe_buffer_register().
Kanchan Joshi Nov. 7, 2023, 10:25 a.m. UTC | #10
On 11/6/2023 8:32 PM, Keith Busch wrote:
> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
>> On 10/27/2023 11:49 PM, Keith Busch wrote:
>>> +	for (i = 0; i < nr_vecs; i = j) {
>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
>>> +		struct folio *folio = page_folio(pages[i]);
>>> +
>>> +		bytes -= size;
>>> +		for (j = i + 1; j < nr_vecs; j++) {
>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>> +
>>> +			if (page_folio(pages[j]) != folio ||
>>> +			    pages[j] != pages[j - 1] + 1)
>>> +				break;
>>> +			unpin_user_page(pages[j]);
>>
>> Is this unpin correct here?
> 
> Should be. The pages are bound to the folio, so this doesn't really
> unpin the user page. It just drops a reference, and the folio holds the
> final reference to the contiguous pages, which is released on
> completion. 

But the completion is still going to see multiple pages and not one 
(folio). The bip_for_each_vec loop is going to drop the reference again.
I suspect it is not folio-aware.
Keith Busch Nov. 7, 2023, 3:08 p.m. UTC | #11
On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
> On 11/6/2023 8:32 PM, Keith Busch wrote:
> > On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
> >> On 10/27/2023 11:49 PM, Keith Busch wrote:
> >>> +	for (i = 0; i < nr_vecs; i = j) {
> >>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> >>> +		struct folio *folio = page_folio(pages[i]);
> >>> +
> >>> +		bytes -= size;
> >>> +		for (j = i + 1; j < nr_vecs; j++) {
> >>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> >>> +
> >>> +			if (page_folio(pages[j]) != folio ||
> >>> +			    pages[j] != pages[j - 1] + 1)
> >>> +				break;
> >>> +			unpin_user_page(pages[j]);
> >>
> >> Is this unpin correct here?
> > 
> > Should be. The pages are bound to the folio, so this doesn't really
> > unpin the user page. It just drops a reference, and the folio holds the
> > final reference to the contiguous pages, which is released on
> > completion. 
> 
> But the completion is still going to see multiple pages and not one 
> (folio). The bip_for_each_vec loop is going to drop the reference again.
> I suspect it is not folio-aware.

The completion unpins once per bvec, not individual pages. The setup
creates multipage bvecs with only one pin remaining per bvec for all of
the bvec's pages. If a page can't be merged into the current bvec, then
that page is not unpinned and becomes the first page of the next
bvec.
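
(Worked example, assuming 4K pages: a metadata buffer straddling one page
boundary inside a single folio gets two pages pinned at extract time; the
merge loop drops one pin and builds one two-page bvec, so completion must
drop exactly one more pin for that bvec, not one per page.)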
Kanchan Joshi Nov. 8, 2023, 12:15 p.m. UTC | #12
On 11/7/2023 8:38 PM, Keith Busch wrote:
> On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
>> On 11/6/2023 8:32 PM, Keith Busch wrote:
>>> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
>>>> On 10/27/2023 11:49 PM, Keith Busch wrote:
>>>>> +	for (i = 0; i < nr_vecs; i = j) {
>>>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
>>>>> +		struct folio *folio = page_folio(pages[i]);
>>>>> +
>>>>> +		bytes -= size;
>>>>> +		for (j = i + 1; j < nr_vecs; j++) {
>>>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>>>> +
>>>>> +			if (page_folio(pages[j]) != folio ||
>>>>> +			    pages[j] != pages[j - 1] + 1)
>>>>> +				break;
>>>>> +			unpin_user_page(pages[j]);
>>>>
>>>> Is this unpin correct here?
>>>
>>> Should be. The pages are bound to the folio, so this doesn't really
>>> unpin the user page. It just drops a reference, and the folio holds the
>>> final reference to the contiguous pages, which is released on
>>> completion.
>>
>> But the completion is still going to see multiple pages and not one
>> (folio). The bip_for_each_vec loop is going to drop the reference again.
>> I suspect it is not folio-aware.
> 
> The completion unpins once per bvec, not individual pages. The setup
> creates multipage bvecs with only one pin remaining per bvec for all of
> the bvec's pages. If a page can't be merged into the current bvec, then
> that page is not unpinned and becomes the first page of the next
> bvec.
> 

Here is a test program [2] that creates this scenario: a single 8KB+16b
read on a 4KB+8b formatted namespace. It prepares the meta-buffer out of a
huge page in a way that it spans two regular 4K pages. With this, I see
more unpins than expected.

And I had also added this [1] on top of your patch.

[1]
@@ -339,7 +367,22 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
         memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
         if (bvec != stack_vec)
                 kfree(bvec);
+       // quick fix for completion
+       bip->bip_vcnt = folios;
+       bip->bip_iter.bi_size = len;


[2]
#define _GNU_SOURCE
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <liburing.h>
#include <libnvme.h>

#define DEV             "/dev/ng0n1"
#define NSID            1
#define DBCNT           2
#define DATA_BUFLEN     (4096 * DBCNT)
#define OFFSET          0
#define LBA_SHIFT       12

/* This assumes 4K + 8b lba format */
#define MD_BUFLEN       (8 * DBCNT)
#define MD_OFFSET       (4096 - 8)
#define HP_SIZE         (2*2*1024*1024) /*Two 2M pages*/

#define APPTAG_MASK     (0xFFFF)
#define APPTAG          (0x8888)

void *alloc_meta_buf_hp()
{
         void *ptr;

         ptr = mmap(NULL, HP_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB,
                         -1, 0);
         if (ptr == MAP_FAILED)
                 return NULL;
         return ptr;
}

void free_meta_buf(void *ptr)
{
         munmap(ptr, HP_SIZE);
}

int main()
{
         struct io_uring ring;
         struct io_uring_cqe *cqe;
         struct io_uring_sqe *sqe;
         struct io_uring_params p = { };
         int fd, ret;
         struct nvme_uring_cmd *cmd;
         void *buffer, *md_buf;
         __u64 slba;
         __u16 nlb;

         ret = posix_memalign(&buffer, DATA_BUFLEN, DATA_BUFLEN);
         if (ret) {
                 fprintf(stderr, "data buffer allocation failed: %d\n", 
ret);
                 return 1;
         }
         memset(buffer, 'x', DATA_BUFLEN);

         md_buf = alloc_meta_buf_hp();
         if (!md_buf) {
                 fprintf(stderr, "meta buffer allocation failed: %d\n", 
ret);
                 return 1;
         }

         p.flags = IORING_SETUP_CQE32 | IORING_SETUP_SQE128;
         ret = io_uring_queue_init_params(4, &ring, &p);
         if (ret) {
                 fprintf(stderr, "ring create failed: %d\n", ret);
                 return 1;
         }

         fd = open(DEV, O_RDWR);
         if (fd < 0) {
                 perror("file open");
                 exit(1);
         }

         sqe = io_uring_get_sqe(&ring);
         io_uring_prep_read(sqe, fd, buffer, DATA_BUFLEN, OFFSET);
         sqe->cmd_op = NVME_URING_CMD_IO;
         sqe->opcode = IORING_OP_URING_CMD;
         sqe->user_data = 1234;

         cmd = (struct nvme_uring_cmd *)sqe->cmd;
         memset(cmd, 0, sizeof(struct nvme_uring_cmd));
         cmd->opcode = nvme_cmd_read;
         cmd->addr = (__u64)(uintptr_t)buffer;
         cmd->data_len = DATA_BUFLEN;
         cmd->nsid = NSID;

         slba = OFFSET >> LBA_SHIFT;
         nlb = (DATA_BUFLEN >> LBA_SHIFT) - 1;
         cmd->cdw10 = slba & 0xffffffff;
         cmd->cdw11 = slba >> 32;
         cmd->cdw12 = nlb;
         /* set the pract and prchk (Guard, App, RefTag) bits in cdw12 */
         //cmd->cdw12 |= 15 << 26;
         cmd->cdw12 |= 7 << 26;

         cmd->metadata = ((__u64)(uintptr_t)md_buf) + MD_OFFSET;
         cmd->metadata_len = MD_BUFLEN;

         /* reftag */
         cmd->cdw14 = (__u32)slba;
         /* apptag mask and apptag */
         cmd->cdw15 = APPTAG_MASK << 16 | APPTAG;

         ret = io_uring_submit(&ring);
         if (ret != 1) {
                 fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
                 goto err;
         }
         ret = io_uring_wait_cqe(&ring, &cqe);
         if (ret) {
                 fprintf(stderr, "wait_cqe=%d\n", ret);
                 goto err;
         }
         if (cqe->res != 0) {
                 fprintf(stderr, "cqe res %d, wanted success\n", cqe->res);
                 goto err;
         }

         io_uring_cqe_seen(&ring, cqe);
         free_meta_buf(md_buf);
         close(fd);
         io_uring_queue_exit(&ring);
         return 0;
err:
         if (fd != -1)
                 close(fd);
         io_uring_queue_exit(&ring);
         return 1;
}
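
(For reference: the program builds against liburing and libnvme, with
something like 'cc -O2 test.c -o test -luring -lnvme', and assumes a
4KB+8b formatted namespace at /dev/ng0n1.)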
Keith Busch Nov. 8, 2023, 5:19 p.m. UTC | #13
On Wed, Nov 08, 2023 at 05:45:19PM +0530, Kanchan Joshi wrote:
> On 11/7/2023 8:38 PM, Keith Busch wrote:
> > On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
> >> On 11/6/2023 8:32 PM, Keith Busch wrote:
> >>> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
> >>>> On 10/27/2023 11:49 PM, Keith Busch wrote:
> >>>>> +	for (i = 0; i < nr_vecs; i = j) {
> >>>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> >>>>> +		struct folio *folio = page_folio(pages[i]);
> >>>>> +
> >>>>> +		bytes -= size;
> >>>>> +		for (j = i + 1; j < nr_vecs; j++) {
> >>>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> >>>>> +
> >>>>> +			if (page_folio(pages[j]) != folio ||
> >>>>> +			    pages[j] != pages[j - 1] + 1)
> >>>>> +				break;
> >>>>> +			unpin_user_page(pages[j]);
> >>>>
> >>>> Is this unpin correct here?
> >>>
> >>> Should be. The pages are bound to the folio, so this doesn't really
> >>> unpin the user page. It just drops a reference, and the folio holds the
> >>> final reference to the contiguous pages, which is released on
> >>> completion.
> >>
> >> But the completion is still going to see multiple pages and not one
> >> (folio). The bip_for_each_vec loop is going to drop the reference again.
> >> I suspect it is not folio-aware.
> > 
> > The completion unpins once per bvec, not individual pages. The setup
> > creates multipage bvecs with only one pin remaining per bvec for all of
> > the bvec's pages. If a page can't be merged into the current bvec, then
> > that page is not unpinned and becomes the first page of the next
> > bvec.
> > 
> 
> Here is a test program [2] that creates this scenario.
> Single 8KB+16b read on a 4KB+8b formatted namespace. It prepares 
> meta-buffer out of a huge-page in a way that it spans two regular 4K pages.
> With this, I see more unpins than expected.

I understand now. The bip_for_each_vec loop is using single-page vector
iterators. Will fix it up for next time.
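
A folio-aware unmap sketch matching that invariant (untested; assumes
bip_vcnt is populated as in Kanchan's quick fix above, and walks the bvec
array directly instead of iterating page by page):

	/* drop exactly one pin per (possibly multipage) bvec */
	for (i = 0; i < bip->bip_vcnt; i++) {
		struct bio_vec *bv = &bip->bip_vec[i];

		if (dirty && !PageCompound(bv->bv_page))
			set_page_dirty_lock(bv->bv_page);
		unpin_user_page(bv->bv_page);
	}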

Patch

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6e1b9..7f9d242ad79df 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -91,6 +91,37 @@  struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+	bool dirty = bio_data_dir(bip->bip_bio) == READ;
+	struct bio_vec *copy = bip->copy_vec;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+
+	if (copy) {
+		unsigned short nr_vecs = bip->bip_max_vcnt;
+		size_t bytes = bip->bip_iter.bi_size;
+		void *buf = bvec_virt(bip->bip_vec);
+
+		if (dirty) {
+			struct iov_iter iter;
+
+			iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
+			WARN_ON(copy_to_iter(buf, bytes, &iter) != bytes);
+		}
+
+		memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy));
+		kfree(copy);
+		kfree(buf);
+	}
+
+	bip_for_each_vec(bv, bip, iter) {
+		if (dirty && !PageCompound(bv.bv_page))
+			set_page_dirty_lock(bv.bv_page);
+		unpin_user_page(bv.bv_page);
+	}
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -105,6 +136,8 @@  void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	else if (bip->bip_flags & BIP_INTEGRITY_USER)
+		bio_integrity_unmap_user(bip);
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -160,6 +193,168 @@  int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
+				   int nr_vecs, unsigned int len,
+				   unsigned int direction, u32 seed)
+{
+	struct bio_integrity_payload *bip;
+	struct bio_vec *copy_vec = NULL;
+	struct iov_iter iter;
+	void *buf;
+	int ret;
+
+	/* if bvec is on the stack, we need to allocate a copy for the completion */
+	if (nr_vecs <= UIO_FASTIOV) {
+		copy_vec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
+		if (!copy_vec)
+			return -ENOMEM;
+		memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
+	}
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		goto free_copy;
+
+	if (direction == ITER_SOURCE) {
+		iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+		if (!copy_from_iter_full(buf, len, &iter)) {
+			ret = -EFAULT;
+			goto free_buf;
+		}
+	} else {
+		memset(buf, 0, len);
+	}
+
+	/*
+	 * We just need one vec for this bip, but we need to preserve the
+	 * number of vecs in the user bvec for the completion handling, so use
+	 * nr_vecs.
+	 */
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
+	if (IS_ERR(bip)) {
+		ret = PTR_ERR(bip);
+		goto free_buf;
+	}
+
+	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+				     offset_in_page(buf));
+	if (ret != len) {
+		ret = -ENOMEM;
+		goto free_bip;
+	}
+
+	bip->bip_flags |= BIP_INTEGRITY_USER;
+	bip->copy_vec = copy_vec ?: bvec;
+	return 0;
+free_bip:
+	bio_integrity_free(bio);
+free_buf:
+	kfree(buf);
+free_copy:
+	kfree(copy_vec);
+	return ret;
+}
+
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
+			   u32 seed)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
+	int ret, direction, nr_vecs, i, j, folios = 0;
+	struct bio_vec stack_vec[UIO_FASTIOV];
+	struct bio_vec bv, *bvec = stack_vec;
+	struct page *stack_pages[UIO_FASTIOV];
+	struct page **pages = stack_pages;
+	struct bio_integrity_payload *bip;
+	struct iov_iter iter;
+	struct bvec_iter bi;
+	u32 bytes;
+
+	if (bio_integrity(bio))
+		return -EINVAL;
+	if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
+		return -E2BIG;
+
+	if (bio_data_dir(bio) == READ)
+		direction = ITER_DEST;
+	else
+		direction = ITER_SOURCE;
+
+	iov_iter_ubuf(&iter, direction, ubuf, len);
+	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+	if (nr_vecs > BIO_MAX_VECS)
+		return -E2BIG;
+	if (nr_vecs > UIO_FASTIOV) {
+		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
+		if (!bvec)
+			return -ENOMEM;
+		pages = NULL;
+	}
+
+	bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
+	if (unlikely(bytes < 0)) {
+		ret =  bytes;
+		goto free_bvec;
+	}
+
+	for (i = 0; i < nr_vecs; i = j) {
+		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
+		struct folio *folio = page_folio(pages[i]);
+
+		bytes -= size;
+		for (j = i + 1; j < nr_vecs; j++) {
+			size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+			if (page_folio(pages[j]) != folio ||
+			    pages[j] != pages[j - 1] + 1)
+				break;
+			unpin_user_page(pages[j]);
+			size += next;
+			bytes -= next;
+		}
+
+		bvec_set_page(&bvec[folios], pages[i], size, offs);
+		offs = 0;
+		folios++;
+	}
+
+	if (pages != stack_pages)
+		kvfree(pages);
+
+	if (folios > queue_max_integrity_segments(q) ||
+	    !iov_iter_is_aligned(&iter, align, align)) {
+		ret = bio_integrity_copy_user(bio, bvec, folios, len,
+					      direction, seed);
+		if (ret)
+			goto release_pages;
+		return 0;
+	}
+
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
+	if (IS_ERR(bip)) {
+		ret = PTR_ERR(bip);
+		goto release_pages;
+	}
+
+	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
+	if (bvec != stack_vec)
+		kfree(bvec);
+
+	bip->bip_flags |= BIP_INTEGRITY_USER;
+	bip->copy_vec = NULL;
+	return 0;
+
+release_pages:
+	bi.bi_size = len;
+	for_each_bvec(bv, bvec, bi, bi)
+		unpin_user_page(bv.bv_page);
+free_bvec:
+	if (bvec != stack_vec)
+		kfree(bvec);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_map_user);
+
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 41d417ee13499..2b4a0de838ed1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,7 @@  enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_INTEGRITY_USER	= 1 << 5, /* Integrity payload is user address */
 };
 
 /*
@@ -342,6 +343,7 @@  struct bio_integrity_payload {
 
 	struct work_struct	bip_work;	/* I/O completion */
 
+	struct bio_vec		*copy_vec;	/* for bounce buffering */
 	struct bio_vec		*bip_vec;
 	struct bio_vec		bip_inline_vecs[];/* embedded bvec array */
 };
@@ -720,6 +722,7 @@  static inline bool bioset_initialized(struct bio_set *bs)
 
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
+extern int bio_integrity_map_user(struct bio *, void __user *, unsigned int, u32);
 extern bool bio_integrity_prep(struct bio *);
 extern void bio_integrity_advance(struct bio *, unsigned int);
 extern void bio_integrity_trim(struct bio *);
@@ -789,6 +792,12 @@  static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 	return 0;
 }
 
+static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
+					 unsigned int len, u32 seed)
+{
+	return -EINVAL
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 /*