
[RFC,1/2] block: add simple copy support

Message ID: 20201201053949.143175-2-selvakuma.s1@samsung.com (mailing list archive)
State: New, archived
Series: add simple copy support

Commit Message

SelvaKumar S Dec. 1, 2020, 5:39 a.m. UTC
Add a new BLKCOPY ioctl that offloads to the device the copying of
multiple source ranges to a destination. It accepts a copy_range
structure that contains the destination, the number of source ranges
and a pointer to the array of source ranges. Each range_entry holds
the start and length of a source range (in bytes).
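
For reference, a rough userspace sketch of how the ioctl is driven,
using the uapi structures added further down (device path and offsets
are made up, error handling trimmed):

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* BLKCOPY, struct copy_range, struct range_entry */

	int main(void)
	{
		struct range_entry srcs[2] = {
			{ .src = 0,           .len = 8 << 9 },	/* 8 sectors at byte offset 0 */
			{ .src = 128ULL << 9, .len = 8 << 9 },	/* 8 sectors starting at sector 128 */
		};
		struct copy_range cr = {
			.dest = 4096ULL << 9,			/* destination, byte offset */
			.nr_range = 2,
			.range_list = (__u64)(uintptr_t)srcs,
		};
		int fd = open("/dev/nvme0n1", O_RDWR);

		if (fd < 0 || ioctl(fd, BLKCOPY, &cr) < 0)
			perror("BLKCOPY");
		return 0;
	}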

Introduce REQ_OP_COPY, a no-merge copy offload operation. A bio is
created with the control information as its payload and submitted to
the device. REQ_OP_COPY (19) is a write op and takes the zone write
lock when submitted to a zoned device.
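
In-kernel users go through blkdev_issue_copy() (added in blk-lib.c
below); a hypothetical caller, assuming it already has a struct
block_device *bdev and a destination sector dest_sector, would look
roughly like:

	struct range_entry rlist[] = {
		{ .src = 0, .len = 8 << 9 },	/* source start/len in bytes */
	};
	int ret;

	/* dest is a sector_t; the ranges in rlist are byte granular */
	ret = blkdev_issue_copy(bdev, dest_sector, ARRAY_SIZE(rlist), rlist,
				GFP_KERNEL, 0);
	if (ret)
		pr_err("simple copy failed: %d\n", ret);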

Introduce queue limits for simple copy along with helper functions.
Expose the device limits as sysfs entries:
	- max_copy_sectors
	- max_copy_range_sectors
	- max_copy_nr_ranges

max_copy_sectors = 0 indicates the device doesn't support copy.
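
A driver advertising the capability is expected to set the queue flag
and the limits above during queue setup; a rough sketch (the values
are purely illustrative):

	/* e.g. in the driver's queue configuration path */
	blk_queue_flag_set(QUEUE_FLAG_COPY, q);
	q->limits.max_copy_sectors = 1 << 18;		/* total sectors per copy */
	q->limits.max_copy_range_sectors = 4096;	/* sectors per source range */
	q->limits.max_copy_nr_ranges = 64;		/* source ranges per copy */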

Signed-off-by: SelvaKumar S <selvakuma.s1@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Javier González <javier.gonz@samsung.com>
---
 block/blk-core.c          | 104 +++++++++++++++++++++++++++++++---
 block/blk-lib.c           | 116 ++++++++++++++++++++++++++++++++++++++
 block/blk-merge.c         |   2 +
 block/blk-settings.c      |  11 ++++
 block/blk-sysfs.c         |  23 ++++++++
 block/blk-zoned.c         |   1 +
 block/bounce.c            |   1 +
 block/ioctl.c             |  43 ++++++++++++++
 include/linux/bio.h       |   1 +
 include/linux/blk_types.h |   7 +++
 include/linux/blkdev.h    |  15 +++++
 include/uapi/linux/fs.h   |  21 +++++++
 12 files changed, 337 insertions(+), 8 deletions(-)

Comments

Johannes Thumshirn Dec. 1, 2020, 9:57 a.m. UTC | #1
On 01/12/2020 08:14, SelvaKumar S wrote:
> +static inline int bio_check_copy_eod(struct bio *bio, sector_t start,
> +		sector_t nr_sectors, sector_t maxsector)
> +{
> +	if (nr_sectors && maxsector && (nr_sectors > maxsector ||
> +				start > maxsector - nr_sectors)) {

Nit: I don't like the line break here, maybe:

	if (nr_sectors && maxsector &&
	    (nr_sectors > maxsector || start > maxsector - nr_sectors)) {

> +		handle_bad_sector(bio, maxsector);
> +		return -EIO;
> +	}
> +	return 0;
> +}
> +
>  /*
>   * Check whether this bio extends beyond the end of the device or partition.
>   * This may well happen - the kernel calls bread() without checking the size of
> @@ -737,6 +748,75 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
>  	return 0;
>  }
>  
> +/*
> + * Check for copy limits and remap source ranges if needed.
> + */
> +static inline int blk_check_copy(struct bio *bio)

That function is a bit big to be marked as inline, isn't it?


> +{
> +	struct hd_struct *p = NULL;
> +	struct request_queue *q = bio->bi_disk->queue;
> +	struct blk_copy_payload *payload;
> +	unsigned short nr_range;
> +	int ret = -EIO;
> +	int i, copy_len = 0;
> +
> +	rcu_read_lock();
> +
> +	if (bio->bi_partno) {
> +		p = __disk_get_part(bio->bi_disk, bio->bi_partno);
> +		if (unlikely(!p))
> +			goto out;
> +		if (unlikely(bio_check_ro(bio, p)))
> +			goto out;
> +	} else {
> +		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
> +			goto out;
> +	}
> +
> +	payload = bio_data(bio);
> +	nr_range = payload->copy_range;
> +
> +	/* cannot handle copy crossing nr_ranges limit */
> +	if (payload->copy_range > q->limits.max_copy_nr_ranges)
> +		goto out;
> +
> +	for (i = 0; i < nr_range; i++) {
> +		copy_len += payload->range[i].len;
> +		if (p) {
> +			if (bio_check_copy_eod(bio, payload->range[i].src,
> +					payload->range[i].len, part_nr_sects_read(p)))
> +				goto out;
> +			payload->range[i].src += p->start_sect;
> +		} else {
> +			if (unlikely(bio_check_copy_eod(bio, payload->range[i].src,
> +					payload->range[i].len,
> +					get_capacity(bio->bi_disk))))
> +				goto out;
> +		}
> +	}
> +
> +	/* cannot handle copy more than copy limits */
> +	if (copy_len > q->limits.max_copy_sectors)
> +		goto out;
> +
> +	if (p) {
> +		if (unlikely(bio_check_copy_eod(bio, bio->bi_iter.bi_sector, copy_len,
> +					part_nr_sects_read(p))))
> +			goto out;
> +	} else {
> +		if (unlikely(bio_check_copy_eod(bio, bio->bi_iter.bi_sector, copy_len,
> +					get_capacity(bio->bi_disk))))
> +			goto out;
> +
> +	}
> +


All these if (p) {} else {} branches make this function a bit hard to follow.

> +	if (p)
> +		bio->bi_partno = 0;
> +	ret = 0;
> +out:
> +	rcu_read_unlock();
> +	return ret;
> +}
>  /*
>   * Remap block n of partition p to block n+start(p) of the disk.
>   */


> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index e90614fd8d6a..db4947f7014d 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -150,6 +150,122 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  }
>  EXPORT_SYMBOL(blkdev_issue_discard);
>  
> +int __blkdev_issue_copy(struct block_device *bdev, sector_t dest,
> +		sector_t nr_srcs, struct range_entry *rlist, gfp_t gfp_mask,
> +		int flags, struct bio **biop)
> +{
> +	struct request_queue *q = bdev_get_queue(bdev);
> +	struct bio *bio;
> +	struct blk_copy_payload *payload;
> +	unsigned int op;

I don't think op is needed.

> +	sector_t bs_mask;
> +	sector_t src_sects, len = 0, total_len = 0;
> +	int i, ret, total_size;
> +
> +	if (!q)
> +		return -ENXIO;
> +
> +	if (!nr_srcs)
> +		return -EINVAL;
> +
> +	if (bdev_read_only(bdev))
> +		return -EPERM;
> +
> +	if (!blk_queue_copy(q))
> +		return -EOPNOTSUPP;
> +	op = REQ_OP_COPY;
> +
> +	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
> +	if (dest & bs_mask)
> +		return -EINVAL;
> +
> +	payload = kmalloc(sizeof(struct blk_copy_payload) +
> +			nr_srcs * sizeof(struct range_entry),
> +				GFP_ATOMIC | __GFP_NOWARN);

Please check if the use of struct_size() is possible. Probably even assign
total_size here so you don't need to do the size calculation twice.
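
For illustration, something along these lines (untested):

	total_size = struct_size(payload, range, nr_srcs);
	payload = kmalloc(total_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!payload)
		return -ENOMEM;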

> +	if (!payload)
> +		return -ENOMEM;
> +
> +	bio = bio_alloc(gfp_mask, 1);
> +	bio->bi_iter.bi_sector = dest;
> +	bio_set_dev(bio, bdev);
> +	bio_set_op_attrs(bio, op, REQ_NOMERGE);

bio_set_op_attrs() is deprecated, please don't use it.
	bio->bi_opf = REQ_OP_COPY | REQ_NOMERGE;

> +
> +	payload->dest = dest;
> +
> +	for (i = 0; i < nr_srcs; i++) {
> +		/*  copy payload provided are in bytes */
> +		src_sects = rlist[i].src;
> +		if (src_sects & bs_mask)
> +			return -EINVAL;
> +		src_sects = src_sects >> SECTOR_SHIFT;
> +
> +		if (len & bs_mask)
> +			return -EINVAL;
> +		len = rlist[i].len >> SECTOR_SHIFT;
> +		if (len > q->limits.max_copy_range_sectors)
> +			return -EINVAL;
> +
> +		total_len += len;
> +
> +		WARN_ON_ONCE((src_sects << 9) > UINT_MAX);
> +
> +		payload->range[i].src = src_sects;
> +		payload->range[i].len = len;
> +	}
> +
> +	/* storing # of source ranges */
> +	payload->copy_range = i;
> +	/* storing copy len so far */
> +	payload->copy_size = total_len;
> +
> +	total_size = sizeof(struct blk_copy_payload) + nr_srcs * sizeof(struct range_entry);

See above.

> +	ret = bio_add_page(bio, virt_to_page(payload), total_size,
> +					   offset_in_page(payload));
> +	if (ret != total_size) {
> +		kfree(payload);
> +		return -ENOMEM;
> +	}
> +
> +	*biop = bio;
> +	return 0;
> +}
> +EXPORT_SYMBOL(__blkdev_issue_copy);
> +
> +/**
> + * blkdev_issue_copy - queue a copy
> + * @bdev:       blockdev to issue copy for
> + * @dest:	dest sector
> + * @nr_srcs:	number of source ranges to copy
> + * @rlist:	list of range entries
> + * @gfp_mask:   memory allocation flags (for bio_alloc)
> + * @flags:      BLKDEV_COPY_* flags to control behaviour	//TODO
> + *
> + * Description:
> + *    Issue a copy request for dest sector with source in rlist
> + */
> +int blkdev_issue_copy(struct block_device *bdev, sector_t dest,
> +		int nr_srcs, struct range_entry *rlist,
> +		gfp_t gfp_mask, unsigned long flags)
> +{
> +	struct bio *bio = NULL;
> +	int ret;
> +
> +	ret = __blkdev_issue_copy(bdev, dest, nr_srcs, rlist, gfp_mask, flags,
> +			&bio);
> +	if (!ret && bio) {
> +		ret = submit_bio_wait(bio);
> +		if (ret == -EOPNOTSUPP)
> +			ret = 0;
> +
> +		kfree(page_address(bio_first_bvec_all(bio)->bv_page) +
> +				bio_first_bvec_all(bio)->bv_offset);
> +		bio_put(bio);
> +	}
> +
> +	return ret;

What happens with the bio here if __blkdev_issue_copy() returns, say,
-ENOMEM because bio_add_page() fails?

Also, please handle the failure case rather than the success case. Something along the lines of:

	ret = __blkdev_issue_copy(bdev, dest, nr_srcs, rlist, gfp_mask, flags,
				  &bio);
	if (ret)
		...

	ret = submit_bio_wait();
	if (ret)
		...

	...

	return ret;
}

> +}
> +EXPORT_SYMBOL(blkdev_issue_copy);
Aleksei Marov Dec. 1, 2020, 10:28 a.m. UTC | #2
On Tue, 2020-12-01 at 11:09 +0530, SelvaKumar S wrote:
> +	ret = __blkdev_issue_copy(bdev, dest, nr_srcs, rlist, gfp_mask, flags,
> +			&bio);
> +	if (!ret && bio) {
> +		ret = submit_bio_wait(bio);
> +		if (ret == -EOPNOTSUPP)
> +			ret = 0;
> +
> +		kfree(page_address(bio_first_bvec_all(bio)->bv_page) +
> +				bio_first_bvec_all(bio)->bv_offset);
> +		bio_put(bio);
> +	}
> +
> +	return ret;
> +}
I think there is an issue here: if bio_add_page() returns an error in
__blkdev_issue_copy(), then ret is -ENOMEM and we never do bio_put() on the
bio allocated in __blkdev_issue_copy(), so it is a small memory leak.
Selva Jove Dec. 1, 2020, 1:22 p.m. UTC | #3
Thanks for reporting the memory leak. Will add a fix.
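
For reference, one possible shape of the fix (an illustrative sketch,
not the actual follow-up patch): bail out early on __blkdev_issue_copy()
failure in blkdev_issue_copy(), and drop the bio reference in
__blkdev_issue_copy() when bio_add_page() fails:

	/* blkdev_issue_copy() */
	ret = __blkdev_issue_copy(bdev, dest, nr_srcs, rlist, gfp_mask, flags,
				  &bio);
	if (ret)
		return ret;

	ret = submit_bio_wait(bio);
	if (ret == -EOPNOTSUPP)
		ret = 0;

	kfree(page_address(bio_first_bvec_all(bio)->bv_page) +
			bio_first_bvec_all(bio)->bv_offset);
	bio_put(bio);
	return ret;

	/* __blkdev_issue_copy(), bio_add_page() failure path */
	if (ret != total_size) {
		kfree(payload);
		bio_put(bio);
		return -ENOMEM;
	}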

On Tue, Dec 1, 2020 at 3:58 PM Aleksei Marov <alekseymmm@mail.ru> wrote:
>
> On Tue, 2020-12-01 at 11:09 +0530, SelvaKumar S wrote:
> > +     ret = __blkdev_issue_copy(bdev, dest, nr_srcs, rlist, gfp_mask, flags,
> > +                     &bio);
> > +     if (!ret && bio) {
> > +             ret = submit_bio_wait(bio);
> > +             if (ret == -EOPNOTSUPP)
> > +                     ret = 0;
> > +
> > +             kfree(page_address(bio_first_bvec_all(bio)->bv_page) +
> > +                             bio_first_bvec_all(bio)->bv_offset);
> > +             bio_put(bio);
> > +     }
> > +
> > +     return ret;
> > +}
> I think  there is an issue here that if bio_add_page  returns error in
> __blkdev_issue_copy then ret is -ENOMEM and we never do bio_put for bio
> allocated in  __blkdev_issue_copy so it is small memory leak.
>
>

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 2db8bda43b6e..6818d0cdf627 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -719,6 +719,17 @@  static noinline int should_fail_bio(struct bio *bio)
 }
 ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
 
+static inline int bio_check_copy_eod(struct bio *bio, sector_t start,
+		sector_t nr_sectors, sector_t maxsector)
+{
+	if (nr_sectors && maxsector && (nr_sectors > maxsector ||
+				start > maxsector - nr_sectors)) {
+		handle_bad_sector(bio, maxsector);
+		return -EIO;
+	}
+	return 0;
+}
+
 /*
  * Check whether this bio extends beyond the end of the device or partition.
  * This may well happen - the kernel calls bread() without checking the size of
@@ -737,6 +748,75 @@  static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
 	return 0;
 }
 
+/*
+ * Check for copy limits and remap source ranges if needed.
+ */
+static inline int blk_check_copy(struct bio *bio)
+{
+	struct hd_struct *p = NULL;
+	struct request_queue *q = bio->bi_disk->queue;
+	struct blk_copy_payload *payload;
+	unsigned short nr_range;
+	int ret = -EIO;
+	int i, copy_len = 0;
+
+	rcu_read_lock();
+
+	if (bio->bi_partno) {
+		p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+		if (unlikely(!p))
+			goto out;
+		if (unlikely(bio_check_ro(bio, p)))
+			goto out;
+	} else {
+		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+			goto out;
+	}
+
+	payload = bio_data(bio);
+	nr_range = payload->copy_range;
+
+	/* cannot handle copy crossing nr_ranges limit */
+	if (payload->copy_range > q->limits.max_copy_nr_ranges)
+		goto out;
+
+	for (i = 0; i < nr_range; i++) {
+		copy_len += payload->range[i].len;
+		if (p) {
+			if (bio_check_copy_eod(bio, payload->range[i].src,
+					payload->range[i].len, part_nr_sects_read(p)))
+				goto out;
+			payload->range[i].src += p->start_sect;
+		} else {
+			if (unlikely(bio_check_copy_eod(bio, payload->range[i].src,
+					payload->range[i].len,
+					get_capacity(bio->bi_disk))))
+				goto out;
+		}
+	}
+
+	/* cannot handle copy more than copy limits */
+	if (copy_len > q->limits.max_copy_sectors)
+		goto out;
+
+	if (p) {
+		if (unlikely(bio_check_copy_eod(bio, bio->bi_iter.bi_sector, copy_len,
+					part_nr_sects_read(p))))
+			goto out;
+	} else {
+		if (unlikely(bio_check_copy_eod(bio, bio->bi_iter.bi_sector, copy_len,
+					get_capacity(bio->bi_disk))))
+			goto out;
+
+	}
+
+	if (p)
+		bio->bi_partno = 0;
+	ret = 0;
+out:
+	rcu_read_unlock();
+	return ret;
+}
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -825,14 +905,16 @@  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 	if (should_fail_bio(bio))
 		goto end_io;
 
-	if (bio->bi_partno) {
-		if (unlikely(blk_partition_remap(bio)))
-			goto end_io;
-	} else {
-		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
-			goto end_io;
-		if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
-			goto end_io;
+	if (likely(!op_is_copy(bio->bi_opf))) {
+		if (bio->bi_partno) {
+			if (unlikely(blk_partition_remap(bio)))
+				goto end_io;
+		} else {
+			if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+				goto end_io;
+			if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
+				goto end_io;
+		}
 	}
 
 	/*
@@ -856,6 +938,12 @@  static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 		if (!blk_queue_discard(q))
 			goto not_supported;
 		break;
+	case REQ_OP_COPY:
+		if (!blk_queue_copy(q))
+			goto not_supported;
+		if (unlikely(blk_check_copy(bio)))
+			goto end_io;
+		break;
 	case REQ_OP_SECURE_ERASE:
 		if (!blk_queue_secure_erase(q))
 			goto not_supported;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index e90614fd8d6a..db4947f7014d 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -150,6 +150,122 @@  int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+int __blkdev_issue_copy(struct block_device *bdev, sector_t dest,
+		sector_t nr_srcs, struct range_entry *rlist, gfp_t gfp_mask,
+		int flags, struct bio **biop)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct bio *bio;
+	struct blk_copy_payload *payload;
+	unsigned int op;
+	sector_t bs_mask;
+	sector_t src_sects, len = 0, total_len = 0;
+	int i, ret, total_size;
+
+	if (!q)
+		return -ENXIO;
+
+	if (!nr_srcs)
+		return -EINVAL;
+
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
+	if (!blk_queue_copy(q))
+		return -EOPNOTSUPP;
+	op = REQ_OP_COPY;
+
+	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+	if (dest & bs_mask)
+		return -EINVAL;
+
+	payload = kmalloc(sizeof(struct blk_copy_payload) +
+			nr_srcs * sizeof(struct range_entry),
+				GFP_ATOMIC | __GFP_NOWARN);
+	if (!payload)
+		return -ENOMEM;
+
+	bio = bio_alloc(gfp_mask, 1);
+	bio->bi_iter.bi_sector = dest;
+	bio_set_dev(bio, bdev);
+	bio_set_op_attrs(bio, op, REQ_NOMERGE);
+
+	payload->dest = dest;
+
+	for (i = 0; i < nr_srcs; i++) {
+		/*  copy payload provided are in bytes */
+		src_sects = rlist[i].src;
+		if (src_sects & bs_mask)
+			return -EINVAL;
+		src_sects = src_sects >> SECTOR_SHIFT;
+
+		if (len & bs_mask)
+			return -EINVAL;
+		len = rlist[i].len >> SECTOR_SHIFT;
+		if (len > q->limits.max_copy_range_sectors)
+			return -EINVAL;
+
+		total_len += len;
+
+		WARN_ON_ONCE((src_sects << 9) > UINT_MAX);
+
+		payload->range[i].src = src_sects;
+		payload->range[i].len = len;
+	}
+
+	/* storing # of source ranges */
+	payload->copy_range = i;
+	/* storing copy len so far */
+	payload->copy_size = total_len;
+
+	total_size = sizeof(struct blk_copy_payload) + nr_srcs * sizeof(struct range_entry);
+	ret = bio_add_page(bio, virt_to_page(payload), total_size,
+					   offset_in_page(payload));
+	if (ret != total_size) {
+		kfree(payload);
+		return -ENOMEM;
+	}
+
+	*biop = bio;
+	return 0;
+}
+EXPORT_SYMBOL(__blkdev_issue_copy);
+
+/**
+ * blkdev_issue_copy - queue a copy
+ * @bdev:       blockdev to issue copy for
+ * @dest:	dest sector
+ * @nr_srcs:	number of source ranges to copy
+ * @rlist:	list of range entries
+ * @gfp_mask:   memory allocation flags (for bio_alloc)
+ * @flags:      BLKDEV_COPY_* flags to control behaviour	//TODO
+ *
+ * Description:
+ *    Issue a copy request for dest sector with source in rlist
+ */
+int blkdev_issue_copy(struct block_device *bdev, sector_t dest,
+		int nr_srcs, struct range_entry *rlist,
+		gfp_t gfp_mask, unsigned long flags)
+{
+	struct bio *bio = NULL;
+	int ret;
+
+	ret = __blkdev_issue_copy(bdev, dest, nr_srcs, rlist, gfp_mask, flags,
+			&bio);
+	if (!ret && bio) {
+		ret = submit_bio_wait(bio);
+		if (ret == -EOPNOTSUPP)
+			ret = 0;
+
+		kfree(page_address(bio_first_bvec_all(bio)->bv_page) +
+				bio_first_bvec_all(bio)->bv_offset);
+		bio_put(bio);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
+
 /**
  * __blkdev_issue_write_same - generate number of bios with same page
  * @bdev:	target blockdev
diff --git a/block/blk-merge.c b/block/blk-merge.c
index bcf5e4580603..a16e7598d6ad 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -301,6 +301,8 @@  void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
 	struct bio *split = NULL;
 
 	switch (bio_op(*bio)) {
+	case REQ_OP_COPY:
+			break;
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 		split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9741d1d83e98..18e357939ed4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,6 +60,9 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->zoned = BLK_ZONED_NONE;
+	lim->max_copy_sectors = 0;
+	lim->max_copy_nr_ranges = 0;
+	lim->max_copy_range_sectors = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -549,6 +552,14 @@  int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
 	t->chunk_sectors = lcm_not_zero(t->chunk_sectors, b->chunk_sectors);
 
+	/* copy limits */
+	t->max_copy_sectors = min_not_zero(t->max_copy_sectors,
+			b->max_copy_sectors);
+	t->max_copy_range_sectors = min_not_zero(t->max_copy_range_sectors,
+			b->max_copy_range_sectors);
+	t->max_copy_nr_ranges = min_not_zero(t->max_copy_nr_ranges,
+			b->max_copy_nr_ranges);
+
 	/* Physical block size a multiple of the logical block size? */
 	if (t->physical_block_size & (t->logical_block_size - 1)) {
 		t->physical_block_size = t->logical_block_size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..e5aabb6a3ac1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -166,6 +166,23 @@  static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
 	return queue_var_show(q->limits.discard_granularity, page);
 }
 
+static ssize_t queue_max_copy_sectors_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.max_copy_sectors, page);
+}
+
+static ssize_t queue_max_copy_range_sectors_show(struct request_queue *q,
+		char *page)
+{
+	return queue_var_show(q->limits.max_copy_range_sectors, page);
+}
+
+static ssize_t queue_max_copy_nr_ranges_show(struct request_queue *q,
+		char *page)
+{
+	return queue_var_show(q->limits.max_copy_nr_ranges, page);
+}
+
 static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
 {
 
@@ -590,6 +607,9 @@  QUEUE_RO_ENTRY(queue_zoned, "zoned");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
 QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
 QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
+QUEUE_RO_ENTRY(queue_max_copy_sectors, "max_copy_sectors");
+QUEUE_RO_ENTRY(queue_max_copy_range_sectors, "max_copy_range_sectors");
+QUEUE_RO_ENTRY(queue_max_copy_nr_ranges, "max_copy_nr_ranges");
 
 QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
 QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
@@ -636,6 +656,9 @@  static struct attribute *queue_attrs[] = {
 	&queue_discard_max_entry.attr,
 	&queue_discard_max_hw_entry.attr,
 	&queue_discard_zeroes_data_entry.attr,
+	&queue_max_copy_sectors_entry.attr,
+	&queue_max_copy_range_sectors_entry.attr,
+	&queue_max_copy_nr_ranges_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 6817a673e5ce..6e5fef3cc615 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -75,6 +75,7 @@  bool blk_req_needs_zone_write_lock(struct request *rq)
 	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE_SAME:
 	case REQ_OP_WRITE:
+	case REQ_OP_COPY:
 		return blk_rq_zone_is_seq(rq);
 	default:
 		return false;
diff --git a/block/bounce.c b/block/bounce.c
index 162a6eee8999..7fbdc52decb3 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -254,6 +254,7 @@  static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
 
 	switch (bio_op(bio)) {
+	case REQ_OP_COPY:
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
diff --git a/block/ioctl.c b/block/ioctl.c
index 6b785181344f..a4a507d85e56 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -142,6 +142,47 @@  static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
 				    GFP_KERNEL, flags);
 }
 
+static int blk_ioctl_copy(struct block_device *bdev, fmode_t mode,
+		unsigned long arg, unsigned long flags)
+{
+	struct copy_range crange;
+	struct range_entry *rlist;
+	struct request_queue *q = bdev_get_queue(bdev);
+	sector_t dest;
+	int ret;
+
+	if (!(mode & FMODE_WRITE))
+		return -EBADF;
+
+	if (!blk_queue_copy(q))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&crange, (void __user *)arg, sizeof(crange)))
+		return -EFAULT;
+
+	if (crange.dest & ((1 << SECTOR_SHIFT) - 1))
+		return -EFAULT;
+	dest = crange.dest >> SECTOR_SHIFT;
+
+	rlist = kmalloc_array(crange.nr_range, sizeof(*rlist),
+			GFP_ATOMIC | __GFP_NOWARN);
+
+	if (!rlist)
+		return -ENOMEM;
+
+	if (copy_from_user(rlist, (void __user *)crange.range_list,
+				sizeof(*rlist) * crange.nr_range)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = blkdev_issue_copy(bdev, dest, crange.nr_range,
+			rlist, GFP_KERNEL, flags);
+out:
+	kfree(rlist);
+	return ret;
+}
+
 static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
 		unsigned long arg)
 {
@@ -467,6 +508,8 @@  static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
 	case BLKSECDISCARD:
 		return blk_ioctl_discard(bdev, mode, arg,
 				BLKDEV_DISCARD_SECURE);
+	case BLKCOPY:
+		return blk_ioctl_copy(bdev, mode, arg, 0);
 	case BLKZEROOUT:
 		return blk_ioctl_zeroout(bdev, mode, arg);
 	case BLKREPORTZONE:
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ecf67108f091..7e40a37f0ee5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -71,6 +71,7 @@  static inline bool bio_has_data(struct bio *bio)
 static inline bool bio_no_advance_iter(const struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
+	       bio_op(bio) == REQ_OP_COPY ||
 	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
 	       bio_op(bio) == REQ_OP_WRITE_SAME ||
 	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d9b69bbde5cc..fbb6396b0317 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -360,6 +360,8 @@  enum req_opf {
 	REQ_OP_ZONE_RESET	= 15,
 	/* reset all the zone present on the device */
 	REQ_OP_ZONE_RESET_ALL	= 17,
+	/* copy ranges within device */
+	REQ_OP_COPY		= 19,
 
 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
@@ -486,6 +488,11 @@  static inline bool op_is_discard(unsigned int op)
 	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
 }
 
+static inline bool op_is_copy(unsigned int op)
+{
+	return (op & REQ_OP_MASK) == REQ_OP_COPY;
+}
+
 /*
  * Check if a bio or request operation is a zone management operation, with
  * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 05b346a68c2e..dbeaeebf41c4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -340,10 +340,13 @@  struct queue_limits {
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
+	unsigned int		max_copy_sectors;
 
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
 	unsigned short		max_discard_segments;
+	unsigned short		max_copy_range_sectors;
+	unsigned short		max_copy_nr_ranges;
 
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
@@ -625,6 +628,7 @@  struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
+#define QUEUE_FLAG_COPY		30	/* supports copy */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_SAME_COMP) |		\
@@ -647,6 +651,7 @@  bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
 #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
+#define blk_queue_copy(q)	test_bit(QUEUE_FLAG_COPY, &(q)->queue_flags)
 #define blk_queue_zone_resetall(q)	\
 	test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
 #define blk_queue_secure_erase(q) \
@@ -1059,6 +1064,9 @@  static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 		return min(q->limits.max_discard_sectors,
 			   UINT_MAX >> SECTOR_SHIFT);
 
+	if (unlikely(op == REQ_OP_COPY))
+		return q->limits.max_copy_sectors;
+
 	if (unlikely(op == REQ_OP_WRITE_SAME))
 		return q->limits.max_write_same_sectors;
 
@@ -1330,6 +1338,13 @@  extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, int flags,
 		struct bio **biop);
 
+extern int __blkdev_issue_copy(struct block_device *bdev, sector_t dest,
+		sector_t nr_srcs, struct range_entry *rlist, gfp_t gfp_mask,
+		int flags, struct bio **biop);
+extern int blkdev_issue_copy(struct block_device *bdev, sector_t dest,
+		int nr_srcs, struct range_entry *rlist,
+		gfp_t gfp_mask, unsigned long flags);
+
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f44eb0a04afd..e6d7b1232f3a 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -64,6 +64,26 @@  struct fstrim_range {
 	__u64 minlen;
 };
 
+struct range_entry {
+	__u64 src;
+	__u64 len;
+};
+
+struct copy_range {
+	__u64 dest;
+	__u64 nr_range;
+	__u64 range_list;
+	__u64 rsvd;
+};
+
+struct blk_copy_payload {
+	sector_t	dest;
+	int		copy_range;
+	int		copy_size;
+	int		err;
+	struct	range_entry	range[];
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME		0
 #define FILE_DEDUPE_RANGE_DIFFERS	1
@@ -184,6 +204,7 @@  struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+#define BLKCOPY _IOWR(0x12, 128, struct copy_range)
 /*
  * A jump here: 130-131 are reserved for zoned block devices
  * (see uapi/linux/blkzoned.h)