[RFC,1/8] block: add support for REQ_OP_VERIFY

Message ID 20211104064634.4481-2-chaitanyak@nvidia.com (mailing list archive)
State Not Applicable
Series block: add support for REQ_OP_VERIFY

Commit Message

Chaitanya Kulkarni Nov. 4, 2021, 6:46 a.m. UTC
From: Chaitanya Kulkarni <kch@nvidia.com>

This adds a new block layer operation to offload verifying a range of
LBAs. This support is needed so that file systems and kernel components
such as fabrics can offload LBA verification when it is supported by the
hardware controller; the prominent example of that is the NVMe Verify
command. When the hardware does not support verify, we provide an
emulation of the same operation. This is still useful when the block
device is remotely attached, e.g. using NVMeOF.

Signed-off-by: Chaitanya Kulkarni <kch@nvidia.com>
---
 Documentation/ABI/testing/sysfs-block |  14 ++
 block/blk-core.c                      |   5 +
 block/blk-lib.c                       | 192 ++++++++++++++++++++++++++
 block/blk-merge.c                     |  19 +++
 block/blk-settings.c                  |  17 +++
 block/blk-sysfs.c                     |   8 ++
 block/blk-zoned.c                     |   1 +
 block/bounce.c                        |   1 +
 block/ioctl.c                         |  35 +++++
 include/linux/bio.h                   |  10 +-
 include/linux/blk_types.h             |   2 +
 include/linux/blkdev.h                |  31 +++++
 include/uapi/linux/fs.h               |   1 +
 13 files changed, 332 insertions(+), 4 deletions(-)
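
For context, a minimal sketch (not part of the patch, purely illustrative) of
how an in-kernel user could call the new helper added to block/blk-lib.c; the
caller shown here is hypothetical:

#include <linux/blkdev.h>

/* Hypothetical caller: offload verification of the first 1 MiB of @bdev. */
static int example_verify_first_mib(struct block_device *bdev)
{
	sector_t nr_sects = (1024 * 1024) >> SECTOR_SHIFT;

	/*
	 * blkdev_issue_verify() issues REQ_OP_VERIFY when the queue reports
	 * a non-zero max_verify_sectors and otherwise falls back to the
	 * read-based emulation (blkdev_emulate_verify).
	 */
	return blkdev_issue_verify(bdev, 0, nr_sects, GFP_KERNEL);
}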

Comments

Darrick J. Wong Nov. 4, 2021, 5:25 p.m. UTC | #1
On Wed, Nov 03, 2021 at 11:46:27PM -0700, Chaitanya Kulkarni wrote:
> From: Chaitanya Kulkarni <kch@nvidia.com>
> 
> This adds a new block layer operation to offload verifying a range of
> LBAs. This support is needed in order to provide file systems and
> fabrics, kernel components to offload LBA verification when it is
> supported by the hardware controller. In case hardware offloading is
> not supported then we provide APIs to emulate the same. The prominent
> example of that is NVMe Verify command. We also provide an emulation of
> the same operation which can be used in case H/W does not support
> verify. This is still useful when block device is remotely attached e.g.
> using NVMeOF.
> 
> Signed-off-by: Chaitanya Kulkarni <kch@nvidia.com>
> ---
>  Documentation/ABI/testing/sysfs-block |  14 ++
>  block/blk-core.c                      |   5 +
>  block/blk-lib.c                       | 192 ++++++++++++++++++++++++++
>  block/blk-merge.c                     |  19 +++
>  block/blk-settings.c                  |  17 +++
>  block/blk-sysfs.c                     |   8 ++
>  block/blk-zoned.c                     |   1 +
>  block/bounce.c                        |   1 +
>  block/ioctl.c                         |  35 +++++
>  include/linux/bio.h                   |  10 +-
>  include/linux/blk_types.h             |   2 +
>  include/linux/blkdev.h                |  31 +++++
>  include/uapi/linux/fs.h               |   1 +
>  13 files changed, 332 insertions(+), 4 deletions(-)
> 

(skipping to the ioctl part; I didn't see anything obviously weird in
the block/ changes)

> diff --git a/block/ioctl.c b/block/ioctl.c
> index d61d652078f4..5e1b3c4660bf 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -168,6 +168,39 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
>  			BLKDEV_ZERO_NOUNMAP);
>  }
>  
> +static int blk_ioctl_verify(struct block_device *bdev, fmode_t mode,
> +		unsigned long arg)
> +{
> +	uint64_t range[2];
> +	struct address_space *mapping;
> +	uint64_t start, end, len;
> +
> +	if (!(mode & FMODE_WRITE))
> +		return -EBADF;

Why does the fd have to be opened writable?  Isn't this a read test?

> +
> +	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
> +		return -EFAULT;
> +
> +	start = range[0];
> +	len = range[1];
> +	end = start + len - 1;
> +
> +	if (start & 511)
> +		return -EINVAL;
> +	if (len & 511)
> +		return -EINVAL;
> +	if (end >= (uint64_t)i_size_read(bdev->bd_inode))
> +		return -EINVAL;
> +	if (end < start)
> +		return -EINVAL;
> +
> +	/* Invalidate the page cache, including dirty pages */
> +	mapping = bdev->bd_inode->i_mapping;
> +	truncate_inode_pages_range(mapping, start, end);

Why do we need to invalidate the page cache to verify media?  Won't that
cause data loss if those pages were dirty and about to be flushed?

--D

> +
> +	return blkdev_issue_verify(bdev, start >> 9, len >> 9, GFP_KERNEL);
> +}
> +
>  static int put_ushort(unsigned short __user *argp, unsigned short val)
>  {
>  	return put_user(val, argp);
> @@ -460,6 +493,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
>  				BLKDEV_DISCARD_SECURE);
>  	case BLKZEROOUT:
>  		return blk_ioctl_zeroout(bdev, mode, arg);
> +	case BLKVERIFY:
> +		return blk_ioctl_verify(bdev, mode, arg);
>  	case BLKREPORTZONE:
>  		return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
>  	case BLKRESETZONE:
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index c74857cf1252..d660c37b7d6c 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -63,7 +63,8 @@ static inline bool bio_has_data(struct bio *bio)
>  	    bio->bi_iter.bi_size &&
>  	    bio_op(bio) != REQ_OP_DISCARD &&
>  	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
> -	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
> +	    bio_op(bio) != REQ_OP_WRITE_ZEROES &&
> +	    bio_op(bio) != REQ_OP_VERIFY)
>  		return true;
>  
>  	return false;
> @@ -73,8 +74,8 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
>  {
>  	return bio_op(bio) == REQ_OP_DISCARD ||
>  	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
> -	       bio_op(bio) == REQ_OP_WRITE_SAME ||
> -	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
> +	       bio_op(bio) == REQ_OP_WRITE_ZEROES ||
> +	       bio_op(bio) == REQ_OP_VERIFY;
>  }
>  
>  static inline bool bio_mergeable(struct bio *bio)
> @@ -198,7 +199,7 @@ static inline unsigned bio_segments(struct bio *bio)
>  	struct bvec_iter iter;
>  
>  	/*
> -	 * We special case discard/write same/write zeroes, because they
> +	 * We special case discard/write same/write zeroes/verify, because they
>  	 * interpret bi_size differently:
>  	 */
>  
> @@ -206,6 +207,7 @@ static inline unsigned bio_segments(struct bio *bio)
>  	case REQ_OP_DISCARD:
>  	case REQ_OP_SECURE_ERASE:
>  	case REQ_OP_WRITE_ZEROES:
> +	case REQ_OP_VERIFY:
>  		return 0;
>  	case REQ_OP_WRITE_SAME:
>  		return 1;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 1bc6f6a01070..8877711c4c56 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -366,6 +366,8 @@ enum req_opf {
>  	REQ_OP_SECURE_ERASE	= 5,
>  	/* write the same sector many times */
>  	REQ_OP_WRITE_SAME	= 7,
> +	/* verify the sectors */
> +	REQ_OP_VERIFY		= 8,
>  	/* write the zero filled sector many times */
>  	REQ_OP_WRITE_ZEROES	= 9,
>  	/* Open a zone */
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 0dea268bd61b..99c41d90584b 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -334,6 +334,7 @@ struct queue_limits {
>  	unsigned int		max_hw_discard_sectors;
>  	unsigned int		max_write_same_sectors;
>  	unsigned int		max_write_zeroes_sectors;
> +	unsigned int		max_verify_sectors;
>  	unsigned int		max_zone_append_sectors;
>  	unsigned int		discard_granularity;
>  	unsigned int		discard_alignment;
> @@ -621,6 +622,7 @@ struct request_queue {
>  #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
>  #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
>  #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
> +#define QUEUE_FLAG_VERIFY	30	/* supports Verify */
>  
>  #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
>  				 (1 << QUEUE_FLAG_SAME_COMP) |		\
> @@ -667,6 +669,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
>  #define blk_queue_fua(q)	test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
>  #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
>  #define blk_queue_nowait(q)	test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
> +#define blk_queue_verify(q)	test_bit(QUEUE_FLAG_VERIFY, &(q)->queue_flags)
>  
>  extern void blk_set_pm_only(struct request_queue *q);
>  extern void blk_clear_pm_only(struct request_queue *q);
> @@ -814,6 +817,9 @@ static inline bool rq_mergeable(struct request *rq)
>  	if (req_op(rq) == REQ_OP_WRITE_ZEROES)
>  		return false;
>  
> +	if (req_op(rq) == REQ_OP_VERIFY)
> +		return false;
> +
>  	if (req_op(rq) == REQ_OP_ZONE_APPEND)
>  		return false;
>  
> @@ -1072,6 +1078,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
>  	if (unlikely(op == REQ_OP_WRITE_ZEROES))
>  		return q->limits.max_write_zeroes_sectors;
>  
> +	if (unlikely(op == REQ_OP_VERIFY))
> +		return q->limits.max_verify_sectors;
> +
>  	return q->limits.max_sectors;
>  }
>  
> @@ -1154,6 +1163,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
>  		unsigned int max_discard_sectors);
>  extern void blk_queue_max_write_same_sectors(struct request_queue *q,
>  		unsigned int max_write_same_sectors);
> +extern void blk_queue_max_verify_sectors(struct request_queue *q,
> +		unsigned int max_verify_sectors);
>  extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
>  		unsigned int max_write_same_sectors);
>  extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
> @@ -1348,6 +1359,16 @@ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
>  		unsigned flags);
>  extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
>  		sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
> +extern int __blkdev_emulate_verify(struct block_device *bdev, sector_t sector,
> +		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
> +		char *buf);
> +extern int blkdev_emulate_verify(struct block_device *bdev, sector_t sector,
> +		sector_t nr_sects, gfp_t gfp_mask);
> +extern int __blkdev_issue_verify(struct block_device *bdev,
> +		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
> +		struct bio **biop);
> +extern int blkdev_issue_verify(struct block_device *bdev, sector_t sector,
> +		sector_t nr_sects, gfp_t gfp_mask);
>  
>  static inline int sb_issue_discard(struct super_block *sb, sector_t block,
>  		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
> @@ -1553,6 +1574,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev)
>  	return 0;
>  }
>  
> +static inline unsigned int bdev_verify_sectors(struct block_device *bdev)
> +{
> +	struct request_queue *q = bdev_get_queue(bdev);
> +
> +	if (q)
> +		return q->limits.max_verify_sectors;
> +
> +	return 0;
> +}
> +
>  static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
>  {
>  	struct request_queue *q = bdev_get_queue(bdev);
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index f44eb0a04afd..5eda16bd2c3d 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -184,6 +184,7 @@ struct fsxattr {
>  #define BLKSECDISCARD _IO(0x12,125)
>  #define BLKROTATIONAL _IO(0x12,126)
>  #define BLKZEROOUT _IO(0x12,127)
> +#define BLKVERIFY _IO(0x12,128)
>  /*
>   * A jump here: 130-131 are reserved for zoned block devices
>   * (see uapi/linux/blkzoned.h)
> -- 
> 2.22.1
>
Chaitanya Kulkarni Nov. 11, 2021, 8:01 a.m. UTC | #2
On 11/4/2021 10:25 AM, Darrick J. Wong wrote:
> 
> On Wed, Nov 03, 2021 at 11:46:27PM -0700, Chaitanya Kulkarni wrote:
>> From: Chaitanya Kulkarni <kch@nvidia.com>
>>
>> This adds a new block layer operation to offload verifying a range of
>> LBAs. This support is needed in order to provide file systems and
>> fabrics, kernel components to offload LBA verification when it is
>> supported by the hardware controller. In case hardware offloading is
>> not supported then we provide APIs to emulate the same. The prominent
>> example of that is NVMe Verify command. We also provide an emulation of
>> the same operation which can be used in case H/W does not support
>> verify. This is still useful when block device is remotely attached e.g.
>> using NVMeOF.
>>
>> Signed-off-by: Chaitanya Kulkarni <kch@nvidia.com>
>> ---
>>   Documentation/ABI/testing/sysfs-block |  14 ++
>>   block/blk-core.c                      |   5 +
>>   block/blk-lib.c                       | 192 ++++++++++++++++++++++++++
>>   block/blk-merge.c                     |  19 +++
>>   block/blk-settings.c                  |  17 +++
>>   block/blk-sysfs.c                     |   8 ++
>>   block/blk-zoned.c                     |   1 +
>>   block/bounce.c                        |   1 +
>>   block/ioctl.c                         |  35 +++++
>>   include/linux/bio.h                   |  10 +-
>>   include/linux/blk_types.h             |   2 +
>>   include/linux/blkdev.h                |  31 +++++
>>   include/uapi/linux/fs.h               |   1 +
>>   13 files changed, 332 insertions(+), 4 deletions(-)
>>
> 
> (skipping to the ioctl part; I didn't see anything obviously weird in
> the block/ changes)
> 

Yes, it is pretty straightforward.

>> diff --git a/block/ioctl.c b/block/ioctl.c
>> index d61d652078f4..5e1b3c4660bf 100644
>> --- a/block/ioctl.c
>> +++ b/block/ioctl.c
>> @@ -168,6 +168,39 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
>>                        BLKDEV_ZERO_NOUNMAP);
>>   }
>>
>> +static int blk_ioctl_verify(struct block_device *bdev, fmode_t mode,
>> +             unsigned long arg)
>> +{
>> +     uint64_t range[2];
>> +     struct address_space *mapping;
>> +     uint64_t start, end, len;
>> +
>> +     if (!(mode & FMODE_WRITE))
>> +             return -EBADF;
> 
> Why does the fd have to be opened writable?  Isn't this a read test?
> 

Yes, this needs to be removed; I will fix it in V1.

>> +
>> +     if (copy_from_user(range, (void __user *)arg, sizeof(range)))
>> +             return -EFAULT;
>> +
>> +     start = range[0];
>> +     len = range[1];
>> +     end = start + len - 1;
>> +
>> +     if (start & 511)
>> +             return -EINVAL;
>> +     if (len & 511)
>> +             return -EINVAL;
>> +     if (end >= (uint64_t)i_size_read(bdev->bd_inode))
>> +             return -EINVAL;
>> +     if (end < start)
>> +             return -EINVAL;
>> +
>> +     /* Invalidate the page cache, including dirty pages */
>> +     mapping = bdev->bd_inode->i_mapping;
>> +     truncate_inode_pages_range(mapping, start, end);
> 
> Why do we need to invalidate the page cache to verify media?  Won't that
> cause data loss if those pages were dirty and about to be flushed?
> 
> --D
> 

Yes, I will fix it in V1.

>> +
>> +     return blkdev_issue_verify(bdev, start >> 9, len >> 9, GFP_KERNEL);
>> +}
>> +

Thanks a lot Darrick for your comments, I'll add the fixes in V1.
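
As an aside for readers of the archive, a minimal userspace sketch (not part
of the patch) of driving the BLKVERIFY ioctl defined below: the argument is a
uint64_t range[2] holding { start offset, length } in bytes, both multiples of
512. The device path, open mode and error handling are illustrative only; the
RFC as posted still requires a writable file descriptor.

#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKVERIFY, once this patch is applied */

static int verify_range(const char *dev, uint64_t start, uint64_t len)
{
	/* range[0] = start byte offset, range[1] = length in bytes */
	uint64_t range[2] = { start, len };
	/* the RFC currently checks FMODE_WRITE, hence O_WRONLY here */
	int fd = open(dev, O_WRONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, BLKVERIFY, range);
	close(fd);
	return ret;
}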

Patch

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index e34cdeeeb9d4..ba97f7a9cbec 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -252,6 +252,20 @@  Description:
 		write_zeroes_max_bytes is 0, write zeroes is not supported
 		by the device.
 
+What:		/sys/block/<disk>/queue/verify_max_bytes
+Date:		Nov 2021
+Contact:	Chaitanya Kulkarni <kch@nvidia.com>
+Description:
+		Devices that support the verify operation can verify a
+		range of contiguous blocks on the storage medium with a
+		single request that carries no data payload. This can be
+		used to offload LBA verification to the device instead of
+		reading the data back for checking. verify_max_bytes
+		indicates how many bytes can be verified in a single
+		verify command. If verify_max_bytes is 0, the verify
+		operation is not supported by the device.
+
+
 What:		/sys/block/<disk>/queue/zoned
 Date:		September 2016
 Contact:	Damien Le Moal <damien.lemoal@wdc.com>
diff --git a/block/blk-core.c b/block/blk-core.c
index 5e752840b41a..62160e729e7d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -141,6 +141,7 @@  static const char *const blk_op_name[] = {
 	REQ_OP_NAME(ZONE_APPEND),
 	REQ_OP_NAME(WRITE_SAME),
 	REQ_OP_NAME(WRITE_ZEROES),
+	REQ_OP_NAME(VERIFY),
 	REQ_OP_NAME(SCSI_IN),
 	REQ_OP_NAME(SCSI_OUT),
 	REQ_OP_NAME(DRV_IN),
@@ -851,6 +852,10 @@  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 		if (!q->limits.max_write_same_sectors)
 			goto not_supported;
 		break;
+	case REQ_OP_VERIFY:
+		if (!q->limits.max_verify_sectors)
+			goto not_supported;
+		break;
 	case REQ_OP_ZONE_APPEND:
 		status = blk_check_zone_append(q, bio);
 		if (status != BLK_STS_OK)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 752f9c722062..fdbb765b369e 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -439,3 +439,195 @@  int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+/**
+ * __blkdev_emulate_verify - emulate a number of verify operations
+ * 				asynchronously
+ * @bdev:	blockdev to issue
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to verify
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @biop:	pointer to anchor bio
+ * @buf:	data buffer to map onto the bio
+ *
+ * Description:
+ *  Verify a block range asynchronously by emulating REQ_OP_VERIFY with
+ *  regular reads; use this when H/W offloading is not supported. The
+ *  caller is responsible for handling the anchored bio.
+ */
+int __blkdev_emulate_verify(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, char *buf)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct bio *bio = *biop;
+	unsigned int sz;
+	int bi_size;
+
+	if (!q)
+		return -ENXIO;
+
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
+	while (nr_sects != 0) {
+		bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
+				gfp_mask);
+		bio->bi_iter.bi_sector = sector;
+		bio_set_dev(bio, bdev);
+		bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+		while (nr_sects != 0) {
+			bool is_vaddr = is_vmalloc_addr(buf);
+			struct page *p;
+
+			p = is_vaddr ? vmalloc_to_page(buf) : virt_to_page(buf);
+			sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
+			bi_size = bio_add_page(bio, p, sz, offset_in_page(buf));
+			nr_sects -= bi_size >> 9;
+			sector += bi_size >> 9;
+			buf += bi_size;
+
+			if (bi_size < sz)
+				break;
+		}
+		cond_resched();
+	}
+
+	*biop = bio;
+	return 0;
+}
+EXPORT_SYMBOL(__blkdev_emulate_verify);
+
+/**
+ * blkdev_emulate_verify - emulate a number of verify operations synchronously
+ * @bdev:	blockdev to issue
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to verify
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *  Verify a block range synchronously by emulating REQ_OP_VERIFY with
+ *  regular reads; use this when H/W offloading is not supported.
+ */
+int blkdev_emulate_verify(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask)
+{
+	sector_t min_io_sect = (BIO_MAX_VECS << PAGE_SHIFT) >> 9;
+	int ret = 0;
+	char *buf;
+
+	/* allow the buffer to hold up to BIO_MAX_VECS pages */
+	buf = kzalloc(min_io_sect << 9, GFP_KERNEL);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	while (nr_sects > 0) {
+		sector_t curr_sects = min_t(sector_t, nr_sects, min_io_sect);
+		struct bio *bio = NULL;
+
+		ret = __blkdev_emulate_verify(bdev, sector, curr_sects,
+				GFP_KERNEL, &bio, buf);
+
+		if (!(ret == 0 && bio))
+			break;
+
+		ret = submit_bio_wait(bio);
+		bio_put(bio);
+
+		nr_sects -= curr_sects;
+		sector += curr_sects;
+	}
+out:
+	kfree(buf);
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_emulate_verify);
+
+/**
+ * __blkdev_issue_verify - generate a number of verify operations
+ * @bdev:	blockdev to issue
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to verify
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @biop:	pointer to anchor bio
+ *
+ * Description:
+ *  Verify a block range using hardware offload.
+ *
+ * The function will fall back to emulating the verify operation if the
+ * device does not provide a hardware verify offload.
+ */
+int __blkdev_issue_verify(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned int max_verify_sectors;
+	struct bio *bio = *biop;
+
+	if (!q)
+		return -ENXIO;
+
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
+	max_verify_sectors = bdev_verify_sectors(bdev);
+
+	if (max_verify_sectors == 0)
+		return blkdev_emulate_verify(bdev, sector, nr_sects, gfp_mask);
+
+	while (nr_sects) {
+		bio = blk_next_bio(bio, 0, gfp_mask);
+		bio->bi_iter.bi_sector = sector;
+		bio_set_dev(bio, bdev);
+		bio->bi_opf = REQ_OP_VERIFY;
+		if (nr_sects > max_verify_sectors) {
+			bio->bi_iter.bi_size = max_verify_sectors << 9;
+			nr_sects -= max_verify_sectors;
+			sector += max_verify_sectors;
+		} else {
+			bio->bi_iter.bi_size = nr_sects << 9;
+			nr_sects = 0;
+		}
+		cond_resched();
+	}
+
+	*biop = bio;
+	return 0;
+}
+EXPORT_SYMBOL(__blkdev_issue_verify);
+
+/**
+ * blkdev_issue_verify - verify a block range
+ * @bdev:	blockdev to verify
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to verify
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *  Verify a block range using hardware offload.
+ */
+int blkdev_issue_verify(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask)
+{
+	int ret = 0;
+	sector_t bs_mask;
+	struct bio *bio = NULL;
+	struct blk_plug plug;
+
+	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+	if ((sector | nr_sects) & bs_mask)
+		return -EINVAL;
+
+	blk_start_plug(&plug);
+	ret = __blkdev_issue_verify(bdev, sector, nr_sects, gfp_mask, &bio);
+	if (ret == 0 && bio) {
+		ret = submit_bio_wait(bio);
+		bio_put(bio);
+	}
+	blk_finish_plug(&plug);
+
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_verify);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ffb4aa0ea68b..c28632cb936b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -117,6 +117,20 @@  static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
 	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
 }
 
+static struct bio *blk_bio_verify_split(struct request_queue *q,
+		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+{
+	*nsegs = 0;
+
+	if (!q->limits.max_verify_sectors)
+		return NULL;
+
+	if (bio_sectors(bio) <= q->limits.max_verify_sectors)
+		return NULL;
+
+	return bio_split(bio, q->limits.max_verify_sectors, GFP_NOIO, bs);
+}
+
 static struct bio *blk_bio_write_same_split(struct request_queue *q,
 					    struct bio *bio,
 					    struct bio_set *bs,
@@ -316,6 +330,10 @@  void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
 		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
 				nr_segs);
 		break;
+	case REQ_OP_VERIFY:
+		split = blk_bio_verify_split(q, *bio, &q->bio_split,
+				nr_segs);
+		break;
 	case REQ_OP_WRITE_SAME:
 		split = blk_bio_write_same_split(q, *bio, &q->bio_split,
 				nr_segs);
@@ -383,6 +401,7 @@  unsigned int blk_recalc_rq_segments(struct request *rq)
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_VERIFY:
 		return 0;
 	case REQ_OP_WRITE_SAME:
 		return 1;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 4c974340f1a9..f34cbd3678b6 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -48,6 +48,7 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->chunk_sectors = 0;
 	lim->max_write_same_sectors = 0;
 	lim->max_write_zeroes_sectors = 0;
+	lim->max_verify_sectors = 0;
 	lim->max_zone_append_sectors = 0;
 	lim->max_discard_sectors = 0;
 	lim->max_hw_discard_sectors = 0;
@@ -84,6 +85,7 @@  void blk_set_stacking_limits(struct queue_limits *lim)
 	lim->max_dev_sectors = UINT_MAX;
 	lim->max_write_same_sectors = UINT_MAX;
 	lim->max_write_zeroes_sectors = UINT_MAX;
+	lim->max_verify_sectors = UINT_MAX;
 	lim->max_zone_append_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -227,6 +229,19 @@  void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
 
+/**
+ * blk_queue_max_verify_sectors - set max sectors for a single verify
+ *
+ * @q:  the request queue for the device
+ * @max_verify_sectors: maximum number of sectors to verify per command
+ **/
+void blk_queue_max_verify_sectors(struct request_queue *q,
+		unsigned int max_verify_sectors)
+{
+	q->limits.max_verify_sectors = max_verify_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_verify_sectors);
+
 /**
  * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
  * @q:  the request queue for the device
@@ -514,6 +529,8 @@  int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 					b->max_write_same_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
+	t->max_verify_sectors = min(t->max_verify_sectors,
+				    b->max_verify_sectors);
 	t->max_zone_append_sectors = min(t->max_zone_append_sectors,
 					b->max_zone_append_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..f918c83dd8d4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -108,6 +108,12 @@  queue_ra_store(struct request_queue *q, const char *page, size_t count)
 	return ret;
 }
 
+static ssize_t queue_verify_max_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%llu\n",
+		(unsigned long long)q->limits.max_verify_sectors << 9);
+}
+
 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
 {
 	int max_sectors_kb = queue_max_sectors(q) >> 1;
@@ -584,6 +590,7 @@  QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
 
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
+QUEUE_RO_ENTRY(queue_verify_max, "verify_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
@@ -638,6 +645,7 @@  static struct attribute *queue_attrs[] = {
 	&queue_discard_zeroes_data_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
+	&queue_verify_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_zoned_entry.attr,
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 7a68b6e4300c..c9c51ee22a49 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -73,6 +73,7 @@  bool blk_req_needs_zone_write_lock(struct request *rq)
 
 	switch (req_op(rq)) {
 	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_VERIFY:
 	case REQ_OP_WRITE_SAME:
 	case REQ_OP_WRITE:
 		return blk_rq_zone_is_seq(rq);
diff --git a/block/bounce.c b/block/bounce.c
index fc55314aa426..86cdb900b88f 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -259,6 +259,7 @@  static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_VERIFY:
 		break;
 	case REQ_OP_WRITE_SAME:
 		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
diff --git a/block/ioctl.c b/block/ioctl.c
index d61d652078f4..5e1b3c4660bf 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -168,6 +168,39 @@  static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
 			BLKDEV_ZERO_NOUNMAP);
 }
 
+static int blk_ioctl_verify(struct block_device *bdev, fmode_t mode,
+		unsigned long arg)
+{
+	uint64_t range[2];
+	struct address_space *mapping;
+	uint64_t start, end, len;
+
+	if (!(mode & FMODE_WRITE))
+		return -EBADF;
+
+	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
+		return -EFAULT;
+
+	start = range[0];
+	len = range[1];
+	end = start + len - 1;
+
+	if (start & 511)
+		return -EINVAL;
+	if (len & 511)
+		return -EINVAL;
+	if (end >= (uint64_t)i_size_read(bdev->bd_inode))
+		return -EINVAL;
+	if (end < start)
+		return -EINVAL;
+
+	/* Invalidate the page cache, including dirty pages */
+	mapping = bdev->bd_inode->i_mapping;
+	truncate_inode_pages_range(mapping, start, end);
+
+	return blkdev_issue_verify(bdev, start >> 9, len >> 9, GFP_KERNEL);
+}
+
 static int put_ushort(unsigned short __user *argp, unsigned short val)
 {
 	return put_user(val, argp);
@@ -460,6 +493,8 @@  static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
 				BLKDEV_DISCARD_SECURE);
 	case BLKZEROOUT:
 		return blk_ioctl_zeroout(bdev, mode, arg);
+	case BLKVERIFY:
+		return blk_ioctl_verify(bdev, mode, arg);
 	case BLKREPORTZONE:
 		return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
 	case BLKRESETZONE:
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c74857cf1252..d660c37b7d6c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -63,7 +63,8 @@  static inline bool bio_has_data(struct bio *bio)
 	    bio->bi_iter.bi_size &&
 	    bio_op(bio) != REQ_OP_DISCARD &&
 	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
-	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
+	    bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+	    bio_op(bio) != REQ_OP_VERIFY)
 		return true;
 
 	return false;
@@ -73,8 +74,8 @@  static inline bool bio_no_advance_iter(const struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
 	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
-	       bio_op(bio) == REQ_OP_WRITE_SAME ||
-	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
+	       bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+	       bio_op(bio) == REQ_OP_VERIFY;
 }
 
 static inline bool bio_mergeable(struct bio *bio)
@@ -198,7 +199,7 @@  static inline unsigned bio_segments(struct bio *bio)
 	struct bvec_iter iter;
 
 	/*
-	 * We special case discard/write same/write zeroes, because they
+	 * We special case discard/write same/write zeroes/verify, because they
 	 * interpret bi_size differently:
 	 */
 
@@ -206,6 +207,7 @@  static inline unsigned bio_segments(struct bio *bio)
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_VERIFY:
 		return 0;
 	case REQ_OP_WRITE_SAME:
 		return 1;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1bc6f6a01070..8877711c4c56 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -366,6 +366,8 @@  enum req_opf {
 	REQ_OP_SECURE_ERASE	= 5,
 	/* write the same sector many times */
 	REQ_OP_WRITE_SAME	= 7,
+	/* verify the sectors */
+	REQ_OP_VERIFY		= 8,
 	/* write the zero filled sector many times */
 	REQ_OP_WRITE_ZEROES	= 9,
 	/* Open a zone */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0dea268bd61b..99c41d90584b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -334,6 +334,7 @@  struct queue_limits {
 	unsigned int		max_hw_discard_sectors;
 	unsigned int		max_write_same_sectors;
 	unsigned int		max_write_zeroes_sectors;
+	unsigned int		max_verify_sectors;
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
@@ -621,6 +622,7 @@  struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
+#define QUEUE_FLAG_VERIFY	30	/* supports Verify */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_SAME_COMP) |		\
@@ -667,6 +669,7 @@  bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_fua(q)	test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
 #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
 #define blk_queue_nowait(q)	test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
+#define blk_queue_verify(q)	test_bit(QUEUE_FLAG_VERIFY, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
@@ -814,6 +817,9 @@  static inline bool rq_mergeable(struct request *rq)
 	if (req_op(rq) == REQ_OP_WRITE_ZEROES)
 		return false;
 
+	if (req_op(rq) == REQ_OP_VERIFY)
+		return false;
+
 	if (req_op(rq) == REQ_OP_ZONE_APPEND)
 		return false;
 
@@ -1072,6 +1078,9 @@  static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 	if (unlikely(op == REQ_OP_WRITE_ZEROES))
 		return q->limits.max_write_zeroes_sectors;
 
+	if (unlikely(op == REQ_OP_VERIFY))
+		return q->limits.max_verify_sectors;
+
 	return q->limits.max_sectors;
 }
 
@@ -1154,6 +1163,8 @@  extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
 extern void blk_queue_max_write_same_sectors(struct request_queue *q,
 		unsigned int max_write_same_sectors);
+extern void blk_queue_max_verify_sectors(struct request_queue *q,
+		unsigned int max_verify_sectors);
 extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
 		unsigned int max_write_same_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
@@ -1348,6 +1359,16 @@  extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		unsigned flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
+extern int __blkdev_emulate_verify(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
+		char *buf);
+extern int blkdev_emulate_verify(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask);
+extern int __blkdev_issue_verify(struct block_device *bdev,
+		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+		struct bio **biop);
+extern int blkdev_issue_verify(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask);
 
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
@@ -1553,6 +1574,16 @@  static inline unsigned int bdev_write_same(struct block_device *bdev)
 	return 0;
 }
 
+static inline unsigned int bdev_verify_sectors(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return q->limits.max_verify_sectors;
+
+	return 0;
+}
+
 static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f44eb0a04afd..5eda16bd2c3d 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -184,6 +184,7 @@  struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+#define BLKVERIFY _IO(0x12,128)
 /*
  * A jump here: 130-131 are reserved for zoned block devices
  * (see uapi/linux/blkzoned.h)