diff mbox series

[v13,2/9] block: Add copy offload support infrastructure

Message ID 20230627183629.26571-3-nj.shetty@samsung.com (mailing list archive)
State New, archived
Headers show
Series [v13,1/9] block: Introduce queue limits for copy-offload support | expand

Commit Message

Nitesh Shetty June 27, 2023, 6:36 p.m. UTC
Introduce blkdev_copy_offload which takes similar arguments as
copy_file_range and performs copy offload between two bdevs.
Introduce REQ_OP_COPY_DST, REQ_OP_COPY_SRC operation.
Issue REQ_OP_COPY_DST with destination info along with taking a plug.
This flows till request layer and waits for src bio to get merged.
Issue REQ_OP_COPY_SRC with source info and this bio reaches request
layer and merges with dst request.
For any reason, if request comes to driver with either only one of src/dst
info we fail the copy offload.

Larger copy will be divided, based on max_copy_sectors limit.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
 block/blk-core.c          |   5 ++
 block/blk-lib.c           | 177 ++++++++++++++++++++++++++++++++++++++
 block/blk-merge.c         |  21 +++++
 block/blk.h               |   9 ++
 block/elevator.h          |   1 +
 include/linux/bio.h       |   4 +-
 include/linux/blk_types.h |  21 +++++
 include/linux/blkdev.h    |   4 +
 8 files changed, 241 insertions(+), 1 deletion(-)

Comments

Nitesh Shetty June 28, 2023, 4:03 p.m. UTC | #1
On 23/06/28 03:45PM, Damien Le Moal wrote:
>On 6/28/23 03:36, Nitesh Shetty wrote:
>> Introduce blkdev_copy_offload which takes similar arguments as
>> copy_file_range and performs copy offload between two bdevs.
>
>I am confused... I thought it was discussed to only allow copy offload only
>within a single bdev for now... Did I missi something ?
>

Yes, you are right. copy is supported within single bdev only.
We will update this.

>> Introduce REQ_OP_COPY_DST, REQ_OP_COPY_SRC operation.
>> Issue REQ_OP_COPY_DST with destination info along with taking a plug.
>> This flows till request layer and waits for src bio to get merged.
>> Issue REQ_OP_COPY_SRC with source info and this bio reaches request
>> layer and merges with dst request.
>> For any reason, if request comes to driver with either only one of src/dst
>> info we fail the copy offload.
>>
>> Larger copy will be divided, based on max_copy_sectors limit.
>>
>> Suggested-by: Christoph Hellwig <hch@lst.de>
>> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>> ---
>>  block/blk-core.c          |   5 ++
>>  block/blk-lib.c           | 177 ++++++++++++++++++++++++++++++++++++++
>>  block/blk-merge.c         |  21 +++++
>>  block/blk.h               |   9 ++
>>  block/elevator.h          |   1 +
>>  include/linux/bio.h       |   4 +-
>>  include/linux/blk_types.h |  21 +++++
>>  include/linux/blkdev.h    |   4 +
>>  8 files changed, 241 insertions(+), 1 deletion(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 99d8b9812b18..e6714391c93f 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -796,6 +796,11 @@ void submit_bio_noacct(struct bio *bio)
>>  		if (!q->limits.max_write_zeroes_sectors)
>>  			goto not_supported;
>>  		break;
>> +	case REQ_OP_COPY_SRC:
>> +	case REQ_OP_COPY_DST:
>> +		if (!blk_queue_copy(q))
>> +			goto not_supported;
>> +		break;
>>  	default:
>>  		break;
>>  	}
>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>> index e59c3069e835..10c3eadd5bf6 100644
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -115,6 +115,183 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>>  }
>>  EXPORT_SYMBOL(blkdev_issue_discard);
>>
>> +/*
>> + * For synchronous copy offload/emulation, wait and process all in-flight BIOs.
>> + * This must only be called once all bios have been issued so that the refcount
>> + * can only decrease. This just waits for all bios to make it through
>> + * blkdev_copy_(offload/emulate)_(read/write)_endio.
>> + */
>> +static ssize_t blkdev_copy_wait_io_completion(struct cio *cio)
>> +{
>> +	ssize_t ret;
>> +
>> +	if (cio->endio)
>> +		return 0;
>> +
>> +	if (atomic_read(&cio->refcount)) {
>> +		__set_current_state(TASK_UNINTERRUPTIBLE);
>> +		blk_io_schedule();
>> +	}
>> +
>> +	ret = cio->comp_len;
>> +	kfree(cio);
>> +
>> +	return ret;
>> +}
>> +
>> +static void blkdev_copy_offload_read_endio(struct bio *bio)
>> +{
>> +	struct cio *cio = bio->bi_private;
>> +	sector_t clen;
>> +
>> +	if (bio->bi_status) {
>> +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_out;
>> +		cio->comp_len = min_t(sector_t, clen, cio->comp_len);
>> +	}
>> +	bio_put(bio);
>> +
>> +	if (!atomic_dec_and_test(&cio->refcount))
>> +		return;
>> +	if (cio->endio) {
>> +		cio->endio(cio->private, cio->comp_len);
>> +		kfree(cio);
>> +	} else
>> +		blk_wake_io_task(cio->waiter);
>
>Curly brackets around else missing.
>

Acked.

>> +}
>> +
>> +/*
>> + * __blkdev_copy_offload	- Use device's native copy offload feature.
>> + * we perform copy operation by sending 2 bio.
>> + * 1. We take a plug and send a REQ_OP_COPY_DST bio along with destination
>> + * sector and length. Once this bio reaches request layer, we form a request and
>> + * wait for src bio to arrive.
>> + * 2. We issue REQ_OP_COPY_SRC bio along with source sector and length. Once
>> + * this bio reaches request layer and find a request with previously sent
>> + * destination info we merge the source bio and return.
>> + * 3. Release the plug and request is sent to driver
>> + *
>> + * Returns the length of bytes copied or error if encountered
>> + */
>> +static ssize_t __blkdev_copy_offload(
>> +		struct block_device *bdev_in, loff_t pos_in,
>> +		struct block_device *bdev_out, loff_t pos_out,
>> +		size_t len, cio_iodone_t endio, void *private, gfp_t gfp_mask)
>> +{
>> +	struct cio *cio;
>> +	struct bio *read_bio, *write_bio;
>> +	sector_t rem, copy_len, max_copy_len;
>> +	struct blk_plug plug;
>> +
>> +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
>> +	if (!cio)
>> +		return -ENOMEM;
>> +	atomic_set(&cio->refcount, 0);
>> +	cio->waiter = current;
>> +	cio->endio = endio;
>> +	cio->private = private;
>> +
>> +	max_copy_len = min(bdev_max_copy_sectors(bdev_in),
>> +			bdev_max_copy_sectors(bdev_out)) << SECTOR_SHIFT;
>
>According to patch 1, this can end up being 0, so the loop below will be infinite.
>

Agreed. As you suggested earlier, once we remove copy_offload parameter
and checking copy_max_sector to identify copy offload capabilty should
solve this.

Thank you,
Nitesh Shetty
Christoph Hellwig July 20, 2023, 7:42 a.m. UTC | #2
I wonder if this might benefit if you split the actual block
layer copy infrastructure from the blkdev_copy_offload* helpers
that just make use of it.

> Suggested-by: Christoph Hellwig <hch@lst.de>

Hmm, I'm not sure I suggested adding copy offload..

> +/*
> + * For synchronous copy offload/emulation, wait and process all in-flight BIOs.
> + * This must only be called once all bios have been issued so that the refcount
> + * can only decrease. This just waits for all bios to make it through
> + * blkdev_copy_(offload/emulate)_(read/write)_endio.
> + */
> +static ssize_t blkdev_copy_wait_io_completion(struct cio *cio)
> +{
> +	ssize_t ret;
> +
> +	if (cio->endio)
> +		return 0;

I'd move this to the caller to make things more clear.

> +
> +	if (atomic_read(&cio->refcount)) {
> +		__set_current_state(TASK_UNINTERRUPTIBLE);
> +		blk_io_schedule();

I don't think the refcount scheme you have works, instead you need
to have an extra count for the submitter, which is dropped using
atomic_dec_and_test here.  Take a look at ref scheme in blkdev_dio
which should be applicable here.

> +	ret = cio->comp_len;

The comp_len name for this variable confuses me.  I think is the length
that has succesfully been copied.  So maybe name it copied?

> +static void blkdev_copy_offload_read_endio(struct bio *bio)
> +{
> +	struct cio *cio = bio->bi_private;
> +	sector_t clen;
> +
> +	if (bio->bi_status) {
> +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_out;
> +		cio->comp_len = min_t(sector_t, clen, cio->comp_len);

bi_sector can be thrashed once you hit the end_io handler.

> +	if (!atomic_dec_and_test(&cio->refcount))
> +		return;
> +	if (cio->endio) {
> +		cio->endio(cio->private, cio->comp_len);
> +		kfree(cio);
> +	} else
> +		blk_wake_io_task(cio->waiter);
> +}

This code is duplicated in a few places, please add a helper for it.

Also don't we need a way to return an error code through ->endio?

> +static ssize_t __blkdev_copy_offload(
> +		struct block_device *bdev_in, loff_t pos_in,
> +		struct block_device *bdev_out, loff_t pos_out,
> +		size_t len, cio_iodone_t endio, void *private, gfp_t gfp_mask)

I'd call this something like blkdev_copy_native, or maybe just
blkdev_copy_offlod.  Also given that we only want to support
single-device copies there i no need to pass two block_devices here.

Also please use the available space on the declaration line:

static ssize_t __blkdev_copy_offload(struct block_device *bdev_in,
		loff_t pos_in, struct block_device *bdev_out, loff_t pos_out,
		size_t len, cio_iodone_t endio, void *private, gfp_t gfp_mask)

Also the cio_iodone_t name is very generic.  Givne that there aren't
many places where we pass these callbacks I'd probably just drop the
typedef entirely.

> +	/* If there is a error, comp_len will be set to least successfully
> +	 * completed copied length
> +	 */

This is not the canonical kernel comment style.

> +	cio->comp_len = len;
> +	for (rem = len; rem > 0; rem -= copy_len) {
> +		copy_len = min(rem, max_copy_len);
> +
> +		write_bio = bio_alloc(bdev_out, 0, REQ_OP_COPY_DST, gfp_mask);
> +		if (!write_bio)
> +			goto err_write_bio_alloc;
> +		write_bio->bi_iter.bi_size = copy_len;
> +		write_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;
> +
> +		blk_start_plug(&plug);
> +		read_bio = blk_next_bio(write_bio, bdev_in, 0, REQ_OP_COPY_SRC,
> +						gfp_mask);
> +		read_bio->bi_iter.bi_size = copy_len;
> +		read_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
> +		read_bio->bi_end_io = blkdev_copy_offload_read_endio;
> +		read_bio->bi_private = cio;

The chaining order here seems inverse to what I'd expect.  At least
for NVMe the copy command supports multiple input ranges being copied
to a single output range, and that is a very useful and important
feature for garbage collection in out of place write file systems.

So I'd expect to see one or more read bios first, which get chained
to the write bio that drives the completion.  We don't need the
multiple input range support in the very first version, but I'd expect
it to be added soon later so we better get the infrastructure right
for it.

> +
> +static inline ssize_t blkdev_copy_sanity_check(
> +	struct block_device *bdev_in, loff_t pos_in,
> +	struct block_device *bdev_out, loff_t pos_out,
> +	size_t len)

Two tab indents for the prototype, please.

> +{
> +	unsigned int align = max(bdev_logical_block_size(bdev_out),
> +					bdev_logical_block_size(bdev_in)) - 1;
> +
> +	if (bdev_read_only(bdev_out))
> +		return -EPERM;
> +
> +	if ((pos_in & align) || (pos_out & align) || (len & align) || !len ||
> +		len >= COPY_MAX_BYTES)

This indentation should also use two tabs or alignent of the opening
brace, and not the same as the indented branch.

> +ssize_t blkdev_copy_offload(

Just blkdev_copy?  Especially as the non-offloaded version is added
later.

> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 65e75efa9bd3..bfd86c54df22 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -922,6 +922,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
>  	if (!rq_mergeable(rq) || !bio_mergeable(bio))
>  		return false;
>  
> +	if ((req_op(rq) == REQ_OP_COPY_DST) && (bio_op(bio) == REQ_OP_COPY_SRC))
> +		return true;

This seems to be equivalent to blk_copy_offload_mergable, so why not
use that?

> +static enum bio_merge_status bio_attempt_copy_offload_merge(
> +	struct request_queue *q, struct request *req, struct bio *bio)

Same comment about the indentation as above (I'm not going to mention
it further, please do a sweep).

Also we don't need the q argument, it must be req->q.

> +{
> +	if (req->__data_len != bio->bi_iter.bi_size)
> +		return BIO_MERGE_FAILED;
> +
> +	req->biotail->bi_next = bio;
> +	req->biotail = bio;
> +	req->nr_phys_segments = blk_rq_nr_phys_segments(req) + 1;

This should just be req->nr_phys_segments++, shouldn't it?

>  }
>  
> +static inline bool blk_copy_offload_mergable(struct request *req,
> +					     struct bio *bio)
> +{
> +	return ((req_op(req) == REQ_OP_COPY_DST)  &&
> +		(bio_op(bio) == REQ_OP_COPY_SRC));
> +}

Can you please add a comment explaining the logic of merging different
operations here?

Also all the braces in the function body are superflous and there is a
double whitespace before the &&.

>  static inline unsigned int blk_rq_get_max_segments(struct request *rq)
>  {
>  	if (req_op(rq) == REQ_OP_DISCARD)
> @@ -303,6 +310,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
>  		break;
>  	}
>  
> +	if (unlikely(op_is_copy(bio->bi_opf)))
> +		return false;

This looks wrong to me.  I think the copy ops need to be added to the
switch statement above as they have non-trivial splitting decisions.
Or at least should have those as we're missing the code to split
copy commands right now.

> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index c4f5b5228105..a2673f24e493 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -57,7 +57,9 @@ static inline bool bio_has_data(struct bio *bio)
>  	    bio->bi_iter.bi_size &&
>  	    bio_op(bio) != REQ_OP_DISCARD &&
>  	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
> -	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
> +	    bio_op(bio) != REQ_OP_WRITE_ZEROES &&
> +	    bio_op(bio) != REQ_OP_COPY_DST &&
> +	    bio_op(bio) != REQ_OP_COPY_SRC)

It probably make sense to replace this with a positive check
for the operations that do have data as a prep patch, which is
just REQ_OP_READ an  REQ_OP_WRITE.

>  	/* reset all the zone present on the device */
>  	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)17,
>  
> +	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
> +	REQ_OP_COPY_DST		= (__force blk_opf_t)19,

Little comments on these ops, please.

> +static inline bool op_is_copy(blk_opf_t op)
> +{
> +	return (((op & REQ_OP_MASK) == REQ_OP_COPY_SRC) ||
> +		((op & REQ_OP_MASK) == REQ_OP_COPY_DST));

All but the inner most braces here are superflous.

> +struct cio {
> +	struct task_struct *waiter;     /* waiting task (NULL if none) */
> +	loff_t pos_in;
> +	loff_t pos_out;
> +	ssize_t comp_len;
> +	cio_iodone_t *endio;		/* applicable for async operation */
> +	void *private;			/* applicable for async operation */
> +	atomic_t refcount;
> +};

The name for this structure is way to generic.  It also is only used
inside of blk-lib.c and should be moved there.
Nitesh Shetty July 27, 2023, 10:29 a.m. UTC | #3
On Thu, Jul 20, 2023 at 1:12 PM Christoph Hellwig <hch@lst.de> wrote:
> > Suggested-by: Christoph Hellwig <hch@lst.de>
>
> Hmm, I'm not sure I suggested adding copy offload..
>
We meant for request based design, we will remove it.

> >  static inline unsigned int blk_rq_get_max_segments(struct request *rq)
> >  {
> >       if (req_op(rq) == REQ_OP_DISCARD)
> > @@ -303,6 +310,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
> >               break;
> >       }
> >
> > +     if (unlikely(op_is_copy(bio->bi_opf)))
> > +             return false;
>
> This looks wrong to me.  I think the copy ops need to be added to the
> switch statement above as they have non-trivial splitting decisions.
> Or at least should have those as we're missing the code to split
> copy commands right now.
>

Agreed, copy will have non-trivial splitting decisions. But, I
couldn't think of scenarios where this could happen, as we check for
queue limits before issuing a copy. Do you see scenarios where split
could happen for copy here.

Acked for all other review comments.

Thank you,
Nitesh Shetty
diff mbox series

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 99d8b9812b18..e6714391c93f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -796,6 +796,11 @@  void submit_bio_noacct(struct bio *bio)
 		if (!q->limits.max_write_zeroes_sectors)
 			goto not_supported;
 		break;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		if (!blk_queue_copy(q))
+			goto not_supported;
+		break;
 	default:
 		break;
 	}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index e59c3069e835..10c3eadd5bf6 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -115,6 +115,183 @@  int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+/*
+ * For synchronous copy offload/emulation, wait and process all in-flight BIOs.
+ * This must only be called once all bios have been issued so that the refcount
+ * can only decrease. This just waits for all bios to make it through
+ * blkdev_copy_(offload/emulate)_(read/write)_endio.
+ */
+static ssize_t blkdev_copy_wait_io_completion(struct cio *cio)
+{
+	ssize_t ret;
+
+	if (cio->endio)
+		return 0;
+
+	if (atomic_read(&cio->refcount)) {
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		blk_io_schedule();
+	}
+
+	ret = cio->comp_len;
+	kfree(cio);
+
+	return ret;
+}
+
+static void blkdev_copy_offload_read_endio(struct bio *bio)
+{
+	struct cio *cio = bio->bi_private;
+	sector_t clen;
+
+	if (bio->bi_status) {
+		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_out;
+		cio->comp_len = min_t(sector_t, clen, cio->comp_len);
+	}
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&cio->refcount))
+		return;
+	if (cio->endio) {
+		cio->endio(cio->private, cio->comp_len);
+		kfree(cio);
+	} else
+		blk_wake_io_task(cio->waiter);
+}
+
+/*
+ * __blkdev_copy_offload	- Use device's native copy offload feature.
+ * we perform copy operation by sending 2 bio.
+ * 1. We take a plug and send a REQ_OP_COPY_DST bio along with destination
+ * sector and length. Once this bio reaches request layer, we form a request and
+ * wait for src bio to arrive.
+ * 2. We issue REQ_OP_COPY_SRC bio along with source sector and length. Once
+ * this bio reaches request layer and find a request with previously sent
+ * destination info we merge the source bio and return.
+ * 3. Release the plug and request is sent to driver
+ *
+ * Returns the length of bytes copied or error if encountered
+ */
+static ssize_t __blkdev_copy_offload(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len, cio_iodone_t endio, void *private, gfp_t gfp_mask)
+{
+	struct cio *cio;
+	struct bio *read_bio, *write_bio;
+	sector_t rem, copy_len, max_copy_len;
+	struct blk_plug plug;
+
+	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+	if (!cio)
+		return -ENOMEM;
+	atomic_set(&cio->refcount, 0);
+	cio->waiter = current;
+	cio->endio = endio;
+	cio->private = private;
+
+	max_copy_len = min(bdev_max_copy_sectors(bdev_in),
+			bdev_max_copy_sectors(bdev_out)) << SECTOR_SHIFT;
+
+	cio->pos_in = pos_in;
+	cio->pos_out = pos_out;
+	/* If there is a error, comp_len will be set to least successfully
+	 * completed copied length
+	 */
+	cio->comp_len = len;
+	for (rem = len; rem > 0; rem -= copy_len) {
+		copy_len = min(rem, max_copy_len);
+
+		write_bio = bio_alloc(bdev_out, 0, REQ_OP_COPY_DST, gfp_mask);
+		if (!write_bio)
+			goto err_write_bio_alloc;
+		write_bio->bi_iter.bi_size = copy_len;
+		write_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;
+
+		blk_start_plug(&plug);
+		read_bio = blk_next_bio(write_bio, bdev_in, 0, REQ_OP_COPY_SRC,
+						gfp_mask);
+		read_bio->bi_iter.bi_size = copy_len;
+		read_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
+		read_bio->bi_end_io = blkdev_copy_offload_read_endio;
+		read_bio->bi_private = cio;
+
+		atomic_inc(&cio->refcount);
+		submit_bio(read_bio);
+		blk_finish_plug(&plug);
+		pos_in += copy_len;
+		pos_out += copy_len;
+	}
+
+	return blkdev_copy_wait_io_completion(cio);
+
+err_write_bio_alloc:
+	cio->comp_len = min_t(sector_t, cio->comp_len, (len - rem));
+	if (!atomic_read(&cio->refcount)) {
+		kfree(cio);
+		return -ENOMEM;
+	}
+	return blkdev_copy_wait_io_completion(cio);
+}
+
+static inline ssize_t blkdev_copy_sanity_check(
+	struct block_device *bdev_in, loff_t pos_in,
+	struct block_device *bdev_out, loff_t pos_out,
+	size_t len)
+{
+	unsigned int align = max(bdev_logical_block_size(bdev_out),
+					bdev_logical_block_size(bdev_in)) - 1;
+
+	if (bdev_read_only(bdev_out))
+		return -EPERM;
+
+	if ((pos_in & align) || (pos_out & align) || (len & align) || !len ||
+		len >= COPY_MAX_BYTES)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * @bdev_in:	source block device
+ * @pos_in:	source offset
+ * @bdev_out:	destination block device
+ * @pos_out:	destination offset
+ * @len:	length in bytes to be copied
+ * @endio:	endio function to be called on completion of copy operation,
+ *		for synchronous operation this should be NULL
+ * @private:	endio function will be called with this private data, should be
+ *		NULL, if operation is synchronous in nature
+ * @gfp_mask:   memory allocation flags (for bio_alloc)
+ *
+ * Returns the length of bytes copied or error if encountered
+ *
+ * Description:
+ *	Copy source offset from source block device to destination block
+ *	device. If copy offload is not supported or fails, fallback to
+ *	emulation. Max total length of copy is limited to COPY_MAX_BYTES
+ */
+ssize_t blkdev_copy_offload(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len, cio_iodone_t endio, void *private, gfp_t gfp_mask)
+{
+	struct request_queue *q_in = bdev_get_queue(bdev_in);
+	struct request_queue *q_out = bdev_get_queue(bdev_out);
+	ssize_t ret;
+
+	ret = blkdev_copy_sanity_check(bdev_in, pos_in, bdev_out, pos_out, len);
+	if (ret)
+		return ret;
+
+	if (blk_queue_copy(q_in) && blk_queue_copy(q_out))
+		ret = __blkdev_copy_offload(bdev_in, pos_in, bdev_out, pos_out,
+			   len, endio, private, gfp_mask);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_offload);
+
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
 		struct bio **biop, unsigned flags)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 65e75efa9bd3..bfd86c54df22 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -922,6 +922,9 @@  bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (!rq_mergeable(rq) || !bio_mergeable(bio))
 		return false;
 
+	if ((req_op(rq) == REQ_OP_COPY_DST) && (bio_op(bio) == REQ_OP_COPY_SRC))
+		return true;
+
 	if (req_op(rq) != bio_op(bio))
 		return false;
 
@@ -951,6 +954,8 @@  enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
 {
 	if (blk_discard_mergable(rq))
 		return ELEVATOR_DISCARD_MERGE;
+	else if (blk_copy_offload_mergable(rq, bio))
+		return ELEVATOR_COPY_OFFLOAD_MERGE;
 	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
 		return ELEVATOR_BACK_MERGE;
 	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
@@ -1053,6 +1058,20 @@  static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
 	return BIO_MERGE_FAILED;
 }
 
+static enum bio_merge_status bio_attempt_copy_offload_merge(
+	struct request_queue *q, struct request *req, struct bio *bio)
+{
+	if (req->__data_len != bio->bi_iter.bi_size)
+		return BIO_MERGE_FAILED;
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->nr_phys_segments = blk_rq_nr_phys_segments(req) + 1;
+	req->__data_len += bio->bi_iter.bi_size;
+
+	return BIO_MERGE_OK;
+}
+
 static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
 						   struct request *rq,
 						   struct bio *bio,
@@ -1073,6 +1092,8 @@  static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
 		break;
 	case ELEVATOR_DISCARD_MERGE:
 		return bio_attempt_discard_merge(q, rq, bio);
+	case ELEVATOR_COPY_OFFLOAD_MERGE:
+		return bio_attempt_copy_offload_merge(q, rq, bio);
 	default:
 		return BIO_MERGE_NONE;
 	}
diff --git a/block/blk.h b/block/blk.h
index 608c5dcc516b..440bfa148461 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -156,6 +156,13 @@  static inline bool blk_discard_mergable(struct request *req)
 	return false;
 }
 
+static inline bool blk_copy_offload_mergable(struct request *req,
+					     struct bio *bio)
+{
+	return ((req_op(req) == REQ_OP_COPY_DST)  &&
+		(bio_op(bio) == REQ_OP_COPY_SRC));
+}
+
 static inline unsigned int blk_rq_get_max_segments(struct request *rq)
 {
 	if (req_op(rq) == REQ_OP_DISCARD)
@@ -303,6 +310,8 @@  static inline bool bio_may_exceed_limits(struct bio *bio,
 		break;
 	}
 
+	if (unlikely(op_is_copy(bio->bi_opf)))
+		return false;
 	/*
 	 * All drivers must accept single-segments bios that are <= PAGE_SIZE.
 	 * This is a quick and dirty check that relies on the fact that
diff --git a/block/elevator.h b/block/elevator.h
index 7ca3d7b6ed82..eec442bbf384 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -18,6 +18,7 @@  enum elv_merge {
 	ELEVATOR_FRONT_MERGE	= 1,
 	ELEVATOR_BACK_MERGE	= 2,
 	ELEVATOR_DISCARD_MERGE	= 3,
+	ELEVATOR_COPY_OFFLOAD_MERGE	= 4,
 };
 
 struct blk_mq_alloc_data;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c4f5b5228105..a2673f24e493 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -57,7 +57,9 @@  static inline bool bio_has_data(struct bio *bio)
 	    bio->bi_iter.bi_size &&
 	    bio_op(bio) != REQ_OP_DISCARD &&
 	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
-	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
+	    bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+	    bio_op(bio) != REQ_OP_COPY_DST &&
+	    bio_op(bio) != REQ_OP_COPY_SRC)
 		return true;
 
 	return false;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0bad62cca3d0..336146798e56 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -394,6 +394,9 @@  enum req_op {
 	/* reset all the zone present on the device */
 	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)17,
 
+	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
+	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
+
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
 	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
@@ -482,6 +485,12 @@  static inline bool op_is_write(blk_opf_t op)
 	return !!(op & (__force blk_opf_t)1);
 }
 
+static inline bool op_is_copy(blk_opf_t op)
+{
+	return (((op & REQ_OP_MASK) == REQ_OP_COPY_SRC) ||
+		((op & REQ_OP_MASK) == REQ_OP_COPY_DST));
+}
+
 /*
  * Check if the bio or request is one that needs special treatment in the
  * flush state machine.
@@ -541,4 +550,16 @@  struct blk_rq_stat {
 	u64 batch;
 };
 
+typedef void (cio_iodone_t)(void *private, int comp_len);
+
+struct cio {
+	struct task_struct *waiter;     /* waiting task (NULL if none) */
+	loff_t pos_in;
+	loff_t pos_out;
+	ssize_t comp_len;
+	cio_iodone_t *endio;		/* applicable for async operation */
+	void *private;			/* applicable for async operation */
+	atomic_t refcount;
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6098665953e6..963f5c97dec0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1043,6 +1043,10 @@  int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
 int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp);
+ssize_t blkdev_copy_offload(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len, cio_iodone_t end_io, void *private, gfp_t gfp_mask);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */