
[v15,03/12] block: add copy offload support

Message ID 20230906163844.18754-4-nj.shetty@samsung.com (mailing list archive)
State New, archived
Series [v15,01/12] block: Introduce queue limits and sysfs for copy-offload support

Commit Message

Nitesh Shetty Sept. 6, 2023, 4:38 p.m. UTC
Introduce blkdev_copy_offload to perform copy offload.
Take a plug and issue a REQ_OP_COPY_SRC bio carrying the source info.
This bio travels down to the request layer and waits there for the dst
bio to arrive.
Then issue a REQ_OP_COPY_DST bio carrying the destination info; once it
reaches the request layer it is merged with the src request.
If, for any reason, a request reaches the driver with only one of the
src/dst bios, the copy offload is failed.

Larger copies are split based on the max_copy_sectors limit.
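
For illustration, a synchronous caller would look roughly like the sketch
below (example_copy_fallback() is a made-up placeholder for whatever
emulation path the caller has; it is not part of this patch):

static ssize_t example_copy(struct block_device *bdev, loff_t pos_in,
			    loff_t pos_out, size_t len)
{
	ssize_t copied;

	/* endio == NULL, private == NULL: wait here for all copy IOs */
	copied = blkdev_copy_offload(bdev, pos_in, pos_out, len,
				     NULL, NULL, GFP_KERNEL);
	if (copied != (ssize_t)len)
		/* offload unsupported, failed or short: emulate instead */
		return example_copy_fallback(bdev, pos_in, pos_out, len);

	return copied;
}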

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
---
 block/blk-lib.c        | 202 +++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |   4 +
 2 files changed, 206 insertions(+)

Comments

Hannes Reinecke Sept. 7, 2023, 5:49 a.m. UTC | #1
On 9/6/23 18:38, Nitesh Shetty wrote:
> Introduce blkdev_copy_offload to perform copy offload.
> Take a plug and issue a REQ_OP_COPY_SRC bio carrying the source info.
> This bio travels down to the request layer and waits there for the dst
> bio to arrive.
> Then issue a REQ_OP_COPY_DST bio carrying the destination info; once it
> reaches the request layer it is merged with the src request.
> If, for any reason, a request reaches the driver with only one of the
> src/dst bios, the copy offload is failed.
> 
> Larger copies are split based on the max_copy_sectors limit.
> 
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> ---
>   block/blk-lib.c        | 202 +++++++++++++++++++++++++++++++++++++++++
>   include/linux/blkdev.h |   4 +
>   2 files changed, 206 insertions(+)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index e59c3069e835..d22e1e7417ca 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -10,6 +10,22 @@
>   
>   #include "blk.h"
>   
> +/* Keeps track of all outstanding copy IO */
> +struct blkdev_copy_io {
> +	atomic_t refcount;
> +	ssize_t copied;
> +	int status;
> +	struct task_struct *waiter;
> +	void (*endio)(void *private, int status, ssize_t copied);
> +	void *private;
> +};
> +
> +/* Keeps track of single outstanding copy offload IO */
> +struct blkdev_copy_offload_io {
> +	struct blkdev_copy_io *cio;
> +	loff_t offset;
> +};
> +
>   static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
>   {
>   	unsigned int discard_granularity = bdev_discard_granularity(bdev);
> @@ -115,6 +131,192 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>   }
>   EXPORT_SYMBOL(blkdev_issue_discard);
>   
> +static inline ssize_t blkdev_copy_sanity_check(struct block_device *bdev_in,
> +					       loff_t pos_in,
> +					       struct block_device *bdev_out,
> +					       loff_t pos_out, size_t len)
> +{
> +	unsigned int align = max(bdev_logical_block_size(bdev_out),
> +				 bdev_logical_block_size(bdev_in)) - 1;
> +
> +	if ((pos_in & align) || (pos_out & align) || (len & align) || !len ||
> +	    len >= BLK_COPY_MAX_BYTES)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static inline void blkdev_copy_endio(struct blkdev_copy_io *cio)
> +{
> +	if (cio->endio) {
> +		cio->endio(cio->private, cio->status, cio->copied);
> +		kfree(cio);
> +	} else {
> +		struct task_struct *waiter = cio->waiter;
> +
> +		WRITE_ONCE(cio->waiter, NULL);
> +		blk_wake_io_task(waiter);
> +	}
> +}
> +
> +/*
> + * This must only be called once all bios have been issued so that the refcount
> + * can only decrease. This just waits for all bios to complete.
> + * Returns the length of bytes copied or error
> + */
> +static ssize_t blkdev_copy_wait_io_completion(struct blkdev_copy_io *cio)
> +{
> +	ssize_t ret;
> +
> +	for (;;) {
> +		__set_current_state(TASK_UNINTERRUPTIBLE);
> +		if (!READ_ONCE(cio->waiter))
> +			break;
> +		blk_io_schedule();
> +	}
> +	__set_current_state(TASK_RUNNING);
> +	ret = cio->copied;
> +	kfree(cio);
> +
> +	return ret;
> +}
> +
> +static void blkdev_copy_offload_dst_endio(struct bio *bio)
> +{
> +	struct blkdev_copy_offload_io *offload_io = bio->bi_private;
> +	struct blkdev_copy_io *cio = offload_io->cio;
> +
> +	if (bio->bi_status) {
> +		cio->copied = min_t(ssize_t, offload_io->offset, cio->copied);
> +		if (!cio->status)
> +			cio->status = blk_status_to_errno(bio->bi_status);
> +	}
> +	bio_put(bio);
> +
> +	if (atomic_dec_and_test(&cio->refcount))
> +		blkdev_copy_endio(cio);
> +}
> +
> +/*
> + * @bdev:	block device
> + * @pos_in:	source offset
> + * @pos_out:	destination offset
> + * @len:	length in bytes to be copied
> + * @endio:	endio function to be called on completion of copy operation,
> + *		for synchronous operation this should be NULL
> + * @private:	endio function will be called with this private data,
> + *		for synchronous operation this should be NULL
> + * @gfp:	memory allocation flags (for bio_alloc)
> + *
> + * For synchronous operation returns the length of bytes copied or error
> + * For asynchronous operation returns -EIOCBQUEUED or error
> + *
> + * Description:
> + *	Copy from a source offset to a destination offset within a block
> + *	device, using the device's native copy offload feature. This
> + *	function can fail, in which case the caller can fall back to emulation.
> + *	We perform the copy operation using 2 bios.
> + *	1. We take a plug and send a REQ_OP_COPY_SRC bio with the source
> + *	sector and length. Once this bio reaches the request layer, we form
> + *	a request and wait for the dst bio to arrive.
> + *	2. We issue a REQ_OP_COPY_DST bio with the destination sector and
> + *	length. Once it reaches the request layer and finds the request with
> + *	the previously sent source info, we merge the destination bio.
> + *	3. Release the plug; the request is then sent to the driver.
> + *	This design works only for drivers with a request queue.
> + */
> +ssize_t blkdev_copy_offload(struct block_device *bdev, loff_t pos_in,
> +			    loff_t pos_out, size_t len,
> +			    void (*endio)(void *, int, ssize_t),
> +			    void *private, gfp_t gfp)
> +{
> +	struct blkdev_copy_io *cio;
> +	struct blkdev_copy_offload_io *offload_io;
> +	struct bio *src_bio, *dst_bio;
> +	ssize_t rem, chunk, ret;
> +	ssize_t max_copy_bytes = bdev_max_copy_sectors(bdev) << SECTOR_SHIFT;
> +	struct blk_plug plug;
> +
> +	if (!max_copy_bytes)
> +		return -EINVAL;
> +
> +	ret = blkdev_copy_sanity_check(bdev, pos_in, bdev, pos_out, len);
> +	if (ret)
> +		return ret;
> +
> +	cio = kzalloc(sizeof(*cio), GFP_KERNEL);
> +	if (!cio)
> +		return -ENOMEM;
> +	atomic_set(&cio->refcount, 1);
> +	cio->waiter = current;
> +	cio->endio = endio;
> +	cio->private = private;
> +
> +	/*
> +	 * If there is an error, copied will be truncated to the length that
> +	 * completed successfully before the first failing copy IO
> +	 */
> +	cio->copied = len;
> +	for (rem = len; rem > 0; rem -= chunk) {
> +		chunk = min(rem, max_copy_bytes);
> +
> +		offload_io = kzalloc(sizeof(*offload_io), GFP_KERNEL);
> +		if (!offload_io)
> +			goto err_free_cio;
> +		offload_io->cio = cio;
> +		/*
> +		 * For partial completion, we use offload_io->offset to truncate
> +		 * successful copy length
> +		 */
> +		offload_io->offset = len - rem;
> +
> +		src_bio = bio_alloc(bdev, 0, REQ_OP_COPY_SRC, gfp);
> +		if (!src_bio)
> +			goto err_free_offload_io;
> +		src_bio->bi_iter.bi_size = chunk;
> +		src_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
> +
> +		blk_start_plug(&plug);
> +		dst_bio = blk_next_bio(src_bio, bdev, 0, REQ_OP_COPY_DST, gfp);
> +		if (!dst_bio)
> +			goto err_free_src_bio;
> +		dst_bio->bi_iter.bi_size = chunk;
> +		dst_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;
> +		dst_bio->bi_end_io = blkdev_copy_offload_dst_endio;
> +		dst_bio->bi_private = offload_io;
> +
> +		atomic_inc(&cio->refcount);
> +		submit_bio(dst_bio);
> +		blk_finish_plug(&plug);
> +		pos_in += chunk;
> +		pos_out += chunk;
> +	}
> +
> +	if (atomic_dec_and_test(&cio->refcount))
> +		blkdev_copy_endio(cio);
> +	if (endio)
> +		return -EIOCBQUEUED;
> +
> +	return blkdev_copy_wait_io_completion(cio);
> +
> +err_free_src_bio:
> +	bio_put(src_bio);
> +err_free_offload_io:
> +	kfree(offload_io);
> +err_free_cio:
> +	cio->copied = min_t(ssize_t, cio->copied, (len - rem));
> +	cio->status = -ENOMEM;
> +	if (rem == len) {
> +		kfree(cio);
> +		return -ENOMEM;
> +	}
> +	if (endio)
> +		return -ENOMEM;
> +
> +	return blkdev_copy_wait_io_completion(cio);
> +}
> +EXPORT_SYMBOL_GPL(blkdev_copy_offload);

Hmm. That looks a bit odd. Why do you have to use wait_for_completion?
Can't you submit the 'src' bio, and then submit the 'dst' bio from the 
endio handler of the 'src' bio?

Cheers,

Hannes
Nitesh Shetty Sept. 7, 2023, 7:16 a.m. UTC | #2
On 07/09/23 07:49AM, Hannes Reinecke wrote:
>On 9/6/23 18:38, Nitesh Shetty wrote:
>
>Hmm. That looks a bit odd. Why do you have to use wait_for_completion?

wait_for_completion waits for all the copy IOs to complete when the
caller does not pass an endio handler.
Copy IO submissions are still async, as in previous revisions.
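
To illustrate the endio path (the example_* names below are made up for
this mail and are not part of the series), an asynchronous caller looks
roughly like this; for brevity the sketch just signals a completion:

struct example_copy_ctx {
	struct completion done;
	ssize_t copied;
	int status;
};

/* matches the endio prototype: void (*endio)(void *, int, ssize_t) */
static void example_copy_endio(void *private, int status, ssize_t copied)
{
	struct example_copy_ctx *ctx = private;

	ctx->status = status;
	ctx->copied = copied;
	complete(&ctx->done);	/* runs once every copy IO has completed */
}

static ssize_t example_copy_async(struct block_device *bdev, loff_t pos_in,
				  loff_t pos_out, size_t len,
				  struct example_copy_ctx *ctx)
{
	/* returns -EIOCBQUEUED once the copy IOs are queued, or an error */
	return blkdev_copy_offload(bdev, pos_in, pos_out, len,
				   example_copy_endio, ctx, GFP_KERNEL);
}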

>Can't you submit the 'src' bio, and then submit the 'dst' bio from the 
>endio handler of the 'src' bio?
We can't do this with the current bio merging approach.
The 'src' bio waits for the 'dst' bio to arrive in the request layer.
Note that both bios must be present in the request reaching the driver
in order to form the copy command.
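
To make the driver-side expectation concrete, a request-based driver
could pick the two bios out of the merged request roughly like this
(an illustrative sketch only, not code from this series):

static int example_driver_setup_copy(struct request *req)
{
	struct bio *bio, *src = NULL, *dst = NULL;

	__rq_for_each_bio(bio, req) {
		if (bio_op(bio) == REQ_OP_COPY_SRC)
			src = bio;
		else if (bio_op(bio) == REQ_OP_COPY_DST)
			dst = bio;
	}

	/* fail the offload if only one of the two bios made it through */
	if (!src || !dst)
		return -EIO;

	/*
	 * src->bi_iter.bi_sector, dst->bi_iter.bi_sector and the bi_size
	 * of either bio are what the device copy command is built from.
	 */
	return 0;
}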

Thank You,
Nitesh Shetty
Hannes Reinecke Sept. 8, 2023, 5:55 a.m. UTC | #3
On 9/7/23 09:16, Nitesh Shetty wrote:
> On 07/09/23 07:49AM, Hannes Reinecke wrote:
>> On 9/6/23 18:38, Nitesh Shetty wrote:
>>
>> Hmm. That looks a bit odd. Why do you have to use wait_for_completion?
> 
> wait_for_completion waits for all the copy IOs to complete when the
> caller does not pass an endio handler.
> Copy IO submissions are still async, as in previous revisions.
> 
>> Can't you submit the 'src' bio, and then submit the 'dst' bio from the 
>> endio handler of the 'src' bio?
> We can't do this with the current bio merging approach.
> The 'src' bio waits for the 'dst' bio to arrive in the request layer.
> Note that both bios must be present in the request reaching the driver
> in order to form the copy command.
> 
Hmm. I guess it would be possible, but in the end we can always change
it later once the infrastructure is in.

So:

Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index e59c3069e835..d22e1e7417ca 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -10,6 +10,22 @@ 
 
 #include "blk.h"
 
+/* Keeps track of all outstanding copy IO */
+struct blkdev_copy_io {
+	atomic_t refcount;
+	ssize_t copied;
+	int status;
+	struct task_struct *waiter;
+	void (*endio)(void *private, int status, ssize_t copied);
+	void *private;
+};
+
+/* Keeps track of single outstanding copy offload IO */
+struct blkdev_copy_offload_io {
+	struct blkdev_copy_io *cio;
+	loff_t offset;
+};
+
 static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
 {
 	unsigned int discard_granularity = bdev_discard_granularity(bdev);
@@ -115,6 +131,192 @@  int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+static inline ssize_t blkdev_copy_sanity_check(struct block_device *bdev_in,
+					       loff_t pos_in,
+					       struct block_device *bdev_out,
+					       loff_t pos_out, size_t len)
+{
+	unsigned int align = max(bdev_logical_block_size(bdev_out),
+				 bdev_logical_block_size(bdev_in)) - 1;
+
+	if ((pos_in & align) || (pos_out & align) || (len & align) || !len ||
+	    len >= BLK_COPY_MAX_BYTES)
+		return -EINVAL;
+
+	return 0;
+}
+
+static inline void blkdev_copy_endio(struct blkdev_copy_io *cio)
+{
+	if (cio->endio) {
+		cio->endio(cio->private, cio->status, cio->copied);
+		kfree(cio);
+	} else {
+		struct task_struct *waiter = cio->waiter;
+
+		WRITE_ONCE(cio->waiter, NULL);
+		blk_wake_io_task(waiter);
+	}
+}
+
+/*
+ * This must only be called once all bios have been issued so that the refcount
+ * can only decrease. This just waits for all bios to complete.
+ * Returns the length of bytes copied or error
+ */
+static ssize_t blkdev_copy_wait_io_completion(struct blkdev_copy_io *cio)
+{
+	ssize_t ret;
+
+	for (;;) {
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(cio->waiter))
+			break;
+		blk_io_schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+	ret = cio->copied;
+	kfree(cio);
+
+	return ret;
+}
+
+static void blkdev_copy_offload_dst_endio(struct bio *bio)
+{
+	struct blkdev_copy_offload_io *offload_io = bio->bi_private;
+	struct blkdev_copy_io *cio = offload_io->cio;
+
+	if (bio->bi_status) {
+		cio->copied = min_t(ssize_t, offload_io->offset, cio->copied);
+		if (!cio->status)
+			cio->status = blk_status_to_errno(bio->bi_status);
+	}
+	bio_put(bio);
+
+	if (atomic_dec_and_test(&cio->refcount))
+		blkdev_copy_endio(cio);
+}
+
+/*
+ * @bdev:	block device
+ * @pos_in:	source offset
+ * @pos_out:	destination offset
+ * @len:	length in bytes to be copied
+ * @endio:	endio function to be called on completion of copy operation,
+ *		for synchronous operation this should be NULL
+ * @private:	endio function will be called with this private data,
+ *		for synchronous operation this should be NULL
+ * @gfp:	memory allocation flags (for bio_alloc)
+ *
+ * For synchronous operation returns the length of bytes copied or error
+ * For asynchronous operation returns -EIOCBQUEUED or error
+ *
+ * Description:
+ *	Copy from a source offset to a destination offset within a block
+ *	device, using the device's native copy offload feature. This
+ *	function can fail, in which case the caller can fall back to emulation.
+ *	We perform the copy operation using 2 bios.
+ *	1. We take a plug and send a REQ_OP_COPY_SRC bio with the source
+ *	sector and length. Once this bio reaches the request layer, we form
+ *	a request and wait for the dst bio to arrive.
+ *	2. We issue a REQ_OP_COPY_DST bio with the destination sector and
+ *	length. Once it reaches the request layer and finds the request with
+ *	the previously sent source info, we merge the destination bio.
+ *	3. Release the plug; the request is then sent to the driver.
+ *	This design works only for drivers with a request queue.
+ */
+ssize_t blkdev_copy_offload(struct block_device *bdev, loff_t pos_in,
+			    loff_t pos_out, size_t len,
+			    void (*endio)(void *, int, ssize_t),
+			    void *private, gfp_t gfp)
+{
+	struct blkdev_copy_io *cio;
+	struct blkdev_copy_offload_io *offload_io;
+	struct bio *src_bio, *dst_bio;
+	ssize_t rem, chunk, ret;
+	ssize_t max_copy_bytes = bdev_max_copy_sectors(bdev) << SECTOR_SHIFT;
+	struct blk_plug plug;
+
+	if (!max_copy_bytes)
+		return -EINVAL;
+
+	ret = blkdev_copy_sanity_check(bdev, pos_in, bdev, pos_out, len);
+	if (ret)
+		return ret;
+
+	cio = kzalloc(sizeof(*cio), GFP_KERNEL);
+	if (!cio)
+		return -ENOMEM;
+	atomic_set(&cio->refcount, 1);
+	cio->waiter = current;
+	cio->endio = endio;
+	cio->private = private;
+
+	/*
+	 * If there is an error, copied will be truncated to the length that
+	 * completed successfully before the first failing copy IO
+	 */
+	cio->copied = len;
+	for (rem = len; rem > 0; rem -= chunk) {
+		chunk = min(rem, max_copy_bytes);
+
+		offload_io = kzalloc(sizeof(*offload_io), GFP_KERNEL);
+		if (!offload_io)
+			goto err_free_cio;
+		offload_io->cio = cio;
+		/*
+		 * For partial completion, we use offload_io->offset to truncate
+		 * successful copy length
+		 */
+		offload_io->offset = len - rem;
+
+		src_bio = bio_alloc(bdev, 0, REQ_OP_COPY_SRC, gfp);
+		if (!src_bio)
+			goto err_free_offload_io;
+		src_bio->bi_iter.bi_size = chunk;
+		src_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
+
+		blk_start_plug(&plug);
+		dst_bio = blk_next_bio(src_bio, bdev, 0, REQ_OP_COPY_DST, gfp);
+		if (!dst_bio)
+			goto err_free_src_bio;
+		dst_bio->bi_iter.bi_size = chunk;
+		dst_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;
+		dst_bio->bi_end_io = blkdev_copy_offload_dst_endio;
+		dst_bio->bi_private = offload_io;
+
+		atomic_inc(&cio->refcount);
+		submit_bio(dst_bio);
+		blk_finish_plug(&plug);
+		pos_in += chunk;
+		pos_out += chunk;
+	}
+
+	if (atomic_dec_and_test(&cio->refcount))
+		blkdev_copy_endio(cio);
+	if (endio)
+		return -EIOCBQUEUED;
+
+	return blkdev_copy_wait_io_completion(cio);
+
+err_free_src_bio:
+	bio_put(src_bio);
+err_free_offload_io:
+	kfree(offload_io);
+err_free_cio:
+	cio->copied = min_t(ssize_t, cio->copied, (len - rem));
+	cio->status = -ENOMEM;
+	if (rem == len) {
+		kfree(cio);
+		return -ENOMEM;
+	}
+	if (endio)
+		return -ENOMEM;
+
+	return blkdev_copy_wait_io_completion(cio);
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_offload);
+
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
 		struct bio **biop, unsigned flags)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7548f1685ee9..5405499bcf22 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1042,6 +1042,10 @@  int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
 int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp);
+ssize_t blkdev_copy_offload(struct block_device *bdev, loff_t pos_in,
+			    loff_t pos_out, size_t len,
+			    void (*endio)(void *, int, ssize_t),
+			    void *private, gfp_t gfp_mask);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */