[v13,3/9] block: add emulation for copy

Message ID 20230627183629.26571-4-nj.shetty@samsung.com (mailing list archive)
State New, archived
Series [v13,1/9] block: Introduce queue limits for copy-offload support

Commit Message

Nitesh Shetty June 27, 2023, 6:36 p.m. UTC
For devices that do not support copy offload, copy emulation is added.
It is required for in-kernel users like fabrics, where a file descriptor
is not available and hence copy_file_range() cannot be used.
Copy emulation is implemented by reading from the source into memory and
writing to the corresponding destination asynchronously.
Emulation is also used if copy offload fails or completes only partially.

Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Vincent Fu <vincent.fu@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 block/blk-lib.c           | 183 +++++++++++++++++++++++++++++++++++++-
 block/blk-map.c           |   4 +-
 include/linux/blk_types.h |   5 ++
 include/linux/blkdev.h    |   3 +
 4 files changed, 192 insertions(+), 3 deletions(-)

Comments

Nitesh Shetty June 28, 2023, 4:10 p.m. UTC | #1
On 23/06/28 03:50PM, Damien Le Moal wrote:
>On 6/28/23 03:36, Nitesh Shetty wrote:
>> For the devices which does not support copy, copy emulation is added.
>> It is required for in-kernel users like fabrics, where file descriptor is
>> not available and hence they can't use copy_file_range.
>> Copy-emulation is implemented by reading from source into memory and
>> writing to the corresponding destination asynchronously.
>> Also emulation is used, if copy offload fails or partially completes.
>>
>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>> Signed-off-by: Vincent Fu <vincent.fu@samsung.com>
>> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
>> ---
>>  block/blk-lib.c           | 183 +++++++++++++++++++++++++++++++++++++-
>>  block/blk-map.c           |   4 +-
>>  include/linux/blk_types.h |   5 ++
>>  include/linux/blkdev.h    |   3 +
>>  4 files changed, 192 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>> index 10c3eadd5bf6..09e0d5d51d03 100644
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -234,6 +234,180 @@ static ssize_t __blkdev_copy_offload(
>>  	return blkdev_copy_wait_io_completion(cio);
>>  }
>>
>> +static void *blkdev_copy_alloc_buf(sector_t req_size, sector_t *alloc_size,
>> +		gfp_t gfp_mask)
>> +{
>> +	int min_size = PAGE_SIZE;
>> +	void *buf;
>> +
>> +	while (req_size >= min_size) {
>> +		buf = kvmalloc(req_size, gfp_mask);
>> +		if (buf) {
>> +			*alloc_size = req_size;
>> +			return buf;
>> +		}
>> +		/* retry half the requested size */
>
>Kind of obvious :)

Acked, will remove.

>
>> +		req_size >>= 1;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static void blkdev_copy_emulate_write_endio(struct bio *bio)
>> +{
>> +	struct copy_ctx *ctx = bio->bi_private;
>> +	struct cio *cio = ctx->cio;
>> +	sector_t clen;
>> +
>> +	if (bio->bi_status) {
>> +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_out;
>> +		cio->comp_len = min_t(sector_t, clen, cio->comp_len);
>> +	}
>> +	kfree(bvec_virt(&bio->bi_io_vec[0]));
>> +	bio_map_kern_endio(bio);
>> +	kfree(ctx);
>> +	if (atomic_dec_and_test(&cio->refcount)) {
>> +		if (cio->endio) {
>> +			cio->endio(cio->private, cio->comp_len);
>> +			kfree(cio);
>> +		} else
>> +			blk_wake_io_task(cio->waiter);
>
>Curly brackets.
>
acked

>> +	}
>> +}
>> +
>> +static void blkdev_copy_emulate_read_endio(struct bio *read_bio)
>> +{
>> +	struct copy_ctx *ctx = read_bio->bi_private;
>> +	struct cio *cio = ctx->cio;
>> +	sector_t clen;
>> +
>> +	if (read_bio->bi_status) {
>> +		clen = (read_bio->bi_iter.bi_sector << SECTOR_SHIFT) -
>> +						cio->pos_in;
>> +		cio->comp_len = min_t(sector_t, clen, cio->comp_len);
>> +		kfree(bvec_virt(&read_bio->bi_io_vec[0]));
>> +		bio_map_kern_endio(read_bio);
>> +		kfree(ctx);
>> +
>> +		if (atomic_dec_and_test(&cio->refcount)) {
>> +			if (cio->endio) {
>> +				cio->endio(cio->private, cio->comp_len);
>> +				kfree(cio);
>> +			} else
>> +				blk_wake_io_task(cio->waiter);
>
>Same.

acked

>
>> +		}
>> +	}
>> +	schedule_work(&ctx->dispatch_work);
>
>ctx may have been freed above.

acked, will fix this.

>
>> +	kfree(read_bio);
>> +}
>> +
>> +static void blkdev_copy_dispatch_work(struct work_struct *work)
>> +{
>> +	struct copy_ctx *ctx = container_of(work, struct copy_ctx,
>> +			dispatch_work);
>
>Please align the argument, or even better: split the line after "=".


acked.

Thank you
Nitesh Shetty
Ming Lei June 29, 2023, 8:33 a.m. UTC | #2
Hi Nitesh,

On Wed, Jun 28, 2023 at 12:06:17AM +0530, Nitesh Shetty wrote:
> For the devices which does not support copy, copy emulation is added.
> It is required for in-kernel users like fabrics, where file descriptor is

I can understand that the copy command helps for FS GC and fabrics storage,
but it is still not very clear why copy emulation is needed for in-kernel
users. Is it just to cover both the copy command and emulation in a single
interface, or are there other purposes?

I'd suggest adding more words about the in-kernel users of copy emulation.

> not available and hence they can't use copy_file_range.
> Copy-emulation is implemented by reading from source into memory and
> writing to the corresponding destination asynchronously.
> Also emulation is used, if copy offload fails or partially completes.

Per my understanding, this kind of emulation may not be as efficient as
doing it in userspace (two linked io_uring SQEs, read & write with a
shared buffer). But it is fine if there are real in-kernel users like this.
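
For reference, the linked-SQE pattern mentioned above looks roughly like the
sketch below. This is purely illustrative userspace code assuming liburing
(ring setup and most error handling omitted; the ring needs at least two free
SQEs per chunk); it is not part of this series.

/* illustrative only; build with: gcc copy_chunk.c -luring */
#include <liburing.h>

static int copy_chunk(struct io_uring *ring, int fd_in, int fd_out,
		      void *buf, unsigned int len,
		      off_t off_in, off_t off_out)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret, err = 0, i;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd_in, buf, len, off_in);
	sqe->flags |= IOSQE_IO_LINK;	/* run the write only if the read succeeds */

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_write(sqe, fd_out, buf, len, off_out);

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	for (i = 0; i < 2; i++) {	/* reap both completions */
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		if (cqe->res < 0)
			err = cqe->res;
		io_uring_cqe_seen(ring, cqe);
	}
	return err;
}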


Thanks,
Ming
Nitesh Shetty June 30, 2023, 11:22 a.m. UTC | #3
On 23/06/29 04:33PM, Ming Lei wrote:
>Hi Nitesh,
>
>On Wed, Jun 28, 2023 at 12:06:17AM +0530, Nitesh Shetty wrote:
>> For the devices which does not support copy, copy emulation is added.
>> It is required for in-kernel users like fabrics, where file descriptor is
>
>I can understand copy command does help for FS GC and fabrics storages,
>but still not very clear why copy emulation is needed for kernel users,
>is it just for covering both copy command and emulation in single
>interface? Or other purposes?
>
>I'd suggest to add more words about in-kernel users of copy emulation.
>

As you mentioned above, we need a single interface covering both the
copy command and emulation.
This is needed in the fabrics case: we expose any target device without
copy command support as copy capable as well, so we fall back to emulation
once we receive a copy from the host/initiator.
Agreed, we will add more description to convey the same information.

>> not available and hence they can't use copy_file_range.
>> Copy-emulation is implemented by reading from source into memory and
>> writing to the corresponding destination asynchronously.
>> Also emulation is used, if copy offload fails or partially completes.
>
>Per my understanding, this kind of emulation may not be as efficient
>as doing it in userspace(two linked io_uring SQEs, read & write with
>shared buffer). But it is fine if there are real in-kernel such users.
>

We do have plans for a uring-based copy interface in the next phase,
once the current series is merged.
With the current design we really see the advantage of emulation in the
fabrics case.

Thank you,
Nitesh Shetty
Christoph Hellwig July 20, 2023, 7:50 a.m. UTC | #4
> +static void *blkdev_copy_alloc_buf(sector_t req_size, sector_t *alloc_size,
> +		gfp_t gfp_mask)
> +{
> +	int min_size = PAGE_SIZE;
> +	void *buf;
> +
> +	while (req_size >= min_size) {
> +		buf = kvmalloc(req_size, gfp_mask);
> +		if (buf) {
> +			*alloc_size = req_size;
> +			return buf;
> +		}
> +		/* retry half the requested size */
> +		req_size >>= 1;
> +	}
> +
> +	return NULL;

Is there any good reason for using vmalloc instead of a bunch
of discontiguous pages?

> +		ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> +		if (!ctx)
> +			goto err_ctx;

I'd suspect it would be better to just allocate a single buffer and
only have a single outstanding copy.  That will reduce the bandwidth
you can theoretically get, but copies tend to be background operations
anyway.  It will reduce the required memory, and thus the chance for
this operation to fail on a loaded system.  It will also dramatically
reduce the effect on memory management.
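
For illustration, such a single-buffer loop could look roughly like the
sketch below (as it might sit in block/blk-lib.c). The helper name, the
synchronous submit_bio_wait() structure and the page-sized chunking are
placeholders only; alignment and partial-completion handling are omitted.

static ssize_t blkdev_copy_emulate_simple(struct block_device *bdev_in,
		loff_t pos_in, struct block_device *bdev_out, loff_t pos_out,
		size_t len, gfp_t gfp_mask)
{
	struct page *page = alloc_page(gfp_mask);	/* one bounce page */
	size_t done = 0;
	int ret = 0;

	if (!page)
		return -ENOMEM;

	while (done < len) {
		size_t bytes = min_t(size_t, len - done, PAGE_SIZE);
		struct bio *bio;

		/* read one chunk into the bounce page and wait for it */
		bio = bio_alloc(bdev_in, 1, REQ_OP_READ, gfp_mask);
		bio->bi_iter.bi_sector = (pos_in + done) >> SECTOR_SHIFT;
		__bio_add_page(bio, page, bytes, 0);
		ret = submit_bio_wait(bio);
		bio_put(bio);
		if (ret)
			break;

		/* write the same page out and wait again */
		bio = bio_alloc(bdev_out, 1, REQ_OP_WRITE, gfp_mask);
		bio->bi_iter.bi_sector = (pos_out + done) >> SECTOR_SHIFT;
		__bio_add_page(bio, page, bytes, 0);
		ret = submit_bio_wait(bio);
		bio_put(bio);
		if (ret)
			break;

		done += bytes;
	}
	__free_page(page);
	return done ? done : ret;
}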

> +		read_bio = bio_map_kern(in, buf, buf_len, gfp_mask);
> +		if (IS_ERR(read_bio))
> +			goto err_read_bio;
> +
> +		write_bio = bio_map_kern(out, buf, buf_len, gfp_mask);
> +		if (IS_ERR(write_bio))
> +			goto err_write_bio;

bio_map_kern is only for passthrough I/Os.  You need to use
a bio_add_page loop here.
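
For illustration, a bio_add_page() based construction over a set of pages
could look roughly like this; the helper name and the page-array interface
are hypothetical and not part of this patch:

static struct bio *blkdev_copy_bio_from_pages(struct block_device *bdev,
		struct page **pages, unsigned int nr_pages, size_t len,
		sector_t sector, blk_opf_t opf, gfp_t gfp_mask)
{
	struct bio *bio;
	unsigned int i;

	bio = bio_alloc(bdev, nr_pages, opf, gfp_mask);
	if (!bio)
		return ERR_PTR(-ENOMEM);
	bio->bi_iter.bi_sector = sector;

	/* add each (possibly discontiguous) page, up to len bytes in total */
	for (i = 0; i < nr_pages && len; i++) {
		unsigned int bytes = min_t(size_t, len, PAGE_SIZE);

		if (bio_add_page(bio, pages[i], bytes, 0) != bytes) {
			bio_put(bio);
			return ERR_PTR(-ENOMEM);
		}
		len -= bytes;
	}
	return bio;
}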

> index 336146798e56..f8c80940c7ad 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -562,4 +562,9 @@ struct cio {
>  	atomic_t refcount;
>  };
>  
> +struct copy_ctx {
> +	struct cio *cio;
> +	struct work_struct dispatch_work;
> +	struct bio *write_bio;
> +};

This is misnamed as it's only used by the fallback code, and also
should be private to blk-lib.c.
Nitesh Shetty Aug. 1, 2023, 1:07 p.m. UTC | #5
On 23/07/20 09:50AM, Christoph Hellwig wrote:
>> +static void *blkdev_copy_alloc_buf(sector_t req_size, sector_t *alloc_size,
>> +		gfp_t gfp_mask)
>> +{
>> +	int min_size = PAGE_SIZE;
>> +	void *buf;
>> +
>> +	while (req_size >= min_size) {
>> +		buf = kvmalloc(req_size, gfp_mask);
>> +		if (buf) {
>> +			*alloc_size = req_size;
>> +			return buf;
>> +		}
>> +		/* retry half the requested size */
>> +		req_size >>= 1;
>> +	}
>> +
>> +	return NULL;
>
>Is there any good reason for using vmalloc instead of a bunch
>of distcontiguous pages?
>

kvmalloc seemed convenient for the purpose.
We will need to call alloc_page in a loop to guarantee discontiguous pages.
Do you prefer that over kvmalloc?
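
For reference, such a loop would look roughly like the sketch below; the
helper name is hypothetical and error handling is left to the caller:

static unsigned int blkdev_copy_alloc_pages(struct page **pages,
		unsigned int nr_pages, gfp_t gfp_mask)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(gfp_mask);
		if (!pages[i])
			break;	/* caller can proceed with a shorter copy */
	}
	return i;	/* number of pages actually allocated */
}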

>> +		ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
>> +		if (!ctx)
>> +			goto err_ctx;
>
>I'd suspect it would be better to just allocte a single buffer and
>only have a single outstanding copy.  That will reduce the bandwith
>you can theoretically get, but copies tend to be background operations
>anyway.  It will reduce the required memory, and thus the chance for
>this operation to fail on a loaded system.  It will also dramatically
>reduce the effect on memory managment.
>

Next version will have that change.

Thank You,
Nitesh Shetty
Kent Overstreet Aug. 2, 2023, 6:31 a.m. UTC | #6
On Tue, Aug 01, 2023 at 06:37:02PM +0530, Nitesh Shetty wrote:
> On 23/07/20 09:50AM, Christoph Hellwig wrote:
> > > +static void *blkdev_copy_alloc_buf(sector_t req_size, sector_t *alloc_size,
> > > +		gfp_t gfp_mask)
> > > +{
> > > +	int min_size = PAGE_SIZE;
> > > +	void *buf;
> > > +
> > > +	while (req_size >= min_size) {
> > > +		buf = kvmalloc(req_size, gfp_mask);
> > > +		if (buf) {
> > > +			*alloc_size = req_size;
> > > +			return buf;
> > > +		}
> > > +		/* retry half the requested size */
> > > +		req_size >>= 1;
> > > +	}
> > > +
> > > +	return NULL;
> > 
> > Is there any good reason for using vmalloc instead of a bunch
> > of distcontiguous pages?
> > 
> 
> kvmalloc seemed convenient for the purpose. We will need to call alloc_page
> in a loop to guarantee discontigous pages. Do you prefer that over kvmalloc?

No, kvmalloc should be the preferred approach here now: with large
folios, we're now getting better about doing more large memory
allocations and avoiding fragmentation, so in practice this won't be a
vmalloc allocation except in exceptional circumstances, and performance
will be better and the code will be simpler doing a single large
allocation.

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 10c3eadd5bf6..09e0d5d51d03 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -234,6 +234,180 @@  static ssize_t __blkdev_copy_offload(
 	return blkdev_copy_wait_io_completion(cio);
 }
 
+static void *blkdev_copy_alloc_buf(sector_t req_size, sector_t *alloc_size,
+		gfp_t gfp_mask)
+{
+	int min_size = PAGE_SIZE;
+	void *buf;
+
+	while (req_size >= min_size) {
+		buf = kvmalloc(req_size, gfp_mask);
+		if (buf) {
+			*alloc_size = req_size;
+			return buf;
+		}
+		/* retry half the requested size */
+		req_size >>= 1;
+	}
+
+	return NULL;
+}
+
+static void blkdev_copy_emulate_write_endio(struct bio *bio)
+{
+	struct copy_ctx *ctx = bio->bi_private;
+	struct cio *cio = ctx->cio;
+	sector_t clen;
+
+	if (bio->bi_status) {
+		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_out;
+		cio->comp_len = min_t(sector_t, clen, cio->comp_len);
+	}
+	kfree(bvec_virt(&bio->bi_io_vec[0]));
+	bio_map_kern_endio(bio);
+	kfree(ctx);
+	if (atomic_dec_and_test(&cio->refcount)) {
+		if (cio->endio) {
+			cio->endio(cio->private, cio->comp_len);
+			kfree(cio);
+		} else
+			blk_wake_io_task(cio->waiter);
+	}
+}
+
+static void blkdev_copy_emulate_read_endio(struct bio *read_bio)
+{
+	struct copy_ctx *ctx = read_bio->bi_private;
+	struct cio *cio = ctx->cio;
+	sector_t clen;
+
+	if (read_bio->bi_status) {
+		clen = (read_bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+						cio->pos_in;
+		cio->comp_len = min_t(sector_t, clen, cio->comp_len);
+		kfree(bvec_virt(&read_bio->bi_io_vec[0]));
+		bio_map_kern_endio(read_bio);
+		kfree(ctx);
+
+		if (atomic_dec_and_test(&cio->refcount)) {
+			if (cio->endio) {
+				cio->endio(cio->private, cio->comp_len);
+				kfree(cio);
+			} else
+				blk_wake_io_task(cio->waiter);
+		}
+	}
+	schedule_work(&ctx->dispatch_work);
+	kfree(read_bio);
+}
+
+static void blkdev_copy_dispatch_work(struct work_struct *work)
+{
+	struct copy_ctx *ctx = container_of(work, struct copy_ctx,
+			dispatch_work);
+
+	submit_bio(ctx->write_bio);
+}
+
+/*
+ * If native copy offload feature is absent, this function tries to emulate,
+ * by copying data from source to a temporary buffer and from buffer to
+ * destination device.
+ * Returns the length of bytes copied or error if encountered
+ */
+static ssize_t __blkdev_copy_emulate(
+		struct block_device *bdev_in, loff_t pos_in,
+		struct block_device *bdev_out, loff_t pos_out,
+		size_t len, cio_iodone_t endio, void *private, gfp_t gfp_mask)
+{
+	struct request_queue *in = bdev_get_queue(bdev_in);
+	struct request_queue *out = bdev_get_queue(bdev_out);
+	struct bio *read_bio, *write_bio;
+	void *buf = NULL;
+	struct copy_ctx *ctx;
+	struct cio *cio;
+	sector_t buf_len, req_len, rem = 0;
+	sector_t max_src_hw_len = min_t(unsigned int,
+			queue_max_hw_sectors(in),
+			queue_max_segments(in) << (PAGE_SHIFT - SECTOR_SHIFT))
+			<< SECTOR_SHIFT;
+	sector_t max_dst_hw_len = min_t(unsigned int,
+		queue_max_hw_sectors(out),
+			queue_max_segments(out) << (PAGE_SHIFT - SECTOR_SHIFT))
+			<< SECTOR_SHIFT;
+	sector_t max_hw_len = min_t(unsigned int,
+			max_src_hw_len, max_dst_hw_len);
+
+	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+	if (!cio)
+		return -ENOMEM;
+	atomic_set(&cio->refcount, 0);
+	cio->pos_in = pos_in;
+	cio->pos_out = pos_out;
+	cio->waiter = current;
+	cio->endio = endio;
+	cio->private = private;
+
+	for (rem = len; rem > 0; rem -= buf_len) {
+		req_len = min_t(int, max_hw_len, rem);
+
+		buf = blkdev_copy_alloc_buf(req_len, &buf_len, gfp_mask);
+		if (!buf)
+			goto err_alloc_buf;
+
+		ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
+		if (!ctx)
+			goto err_ctx;
+
+		read_bio = bio_map_kern(in, buf, buf_len, gfp_mask);
+		if (IS_ERR(read_bio))
+			goto err_read_bio;
+
+		write_bio = bio_map_kern(out, buf, buf_len, gfp_mask);
+		if (IS_ERR(write_bio))
+			goto err_write_bio;
+
+		ctx->cio = cio;
+		ctx->write_bio = write_bio;
+		INIT_WORK(&ctx->dispatch_work, blkdev_copy_dispatch_work);
+
+		read_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
+		read_bio->bi_iter.bi_size = buf_len;
+		read_bio->bi_opf = REQ_OP_READ | REQ_SYNC;
+		bio_set_dev(read_bio, bdev_in);
+		read_bio->bi_end_io = blkdev_copy_emulate_read_endio;
+		read_bio->bi_private = ctx;
+
+		write_bio->bi_iter.bi_size = buf_len;
+		write_bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
+		bio_set_dev(write_bio, bdev_out);
+		write_bio->bi_end_io = blkdev_copy_emulate_write_endio;
+		write_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;
+		write_bio->bi_private = ctx;
+
+		atomic_inc(&cio->refcount);
+		submit_bio(read_bio);
+
+		pos_in += buf_len;
+		pos_out += buf_len;
+	}
+	return blkdev_copy_wait_io_completion(cio);
+
+err_write_bio:
+	bio_put(read_bio);
+err_read_bio:
+	kfree(ctx);
+err_ctx:
+	kvfree(buf);
+err_alloc_buf:
+	cio->comp_len -= min_t(sector_t, cio->comp_len, len - rem);
+	if (!atomic_read(&cio->refcount)) {
+		kfree(cio);
+		return -ENOMEM;
+	}
+	return blkdev_copy_wait_io_completion(cio);
+}
+
 static inline ssize_t blkdev_copy_sanity_check(
 	struct block_device *bdev_in, loff_t pos_in,
 	struct block_device *bdev_out, loff_t pos_out,
@@ -284,9 +458,16 @@  ssize_t blkdev_copy_offload(
 	if (ret)
 		return ret;
 
-	if (blk_queue_copy(q_in) && blk_queue_copy(q_out))
+	if (blk_queue_copy(q_in) && blk_queue_copy(q_out)) {
 		ret = __blkdev_copy_offload(bdev_in, pos_in, bdev_out, pos_out,
 			   len, endio, private, gfp_mask);
+		if (ret < 0)
+			ret = 0;
+	}
+
+	if (ret != len)
+		ret = __blkdev_copy_emulate(bdev_in, pos_in + ret, bdev_out,
+			 pos_out + ret, len - ret, endio, private, gfp_mask);
 
 	return ret;
 }
diff --git a/block/blk-map.c b/block/blk-map.c
index 44d74a30ddac..ceeb70a95fd1 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -363,7 +363,7 @@  static void bio_invalidate_vmalloc_pages(struct bio *bio)
 #endif
 }
 
-static void bio_map_kern_endio(struct bio *bio)
+void bio_map_kern_endio(struct bio *bio)
 {
 	bio_invalidate_vmalloc_pages(bio);
 	bio_uninit(bio);
@@ -380,7 +380,7 @@  static void bio_map_kern_endio(struct bio *bio)
  *	Map the kernel address into a bio suitable for io to a block
  *	device. Returns an error pointer in case of error.
  */
-static struct bio *bio_map_kern(struct request_queue *q, void *data,
+struct bio *bio_map_kern(struct request_queue *q, void *data,
 		unsigned int len, gfp_t gfp_mask)
 {
 	unsigned long kaddr = (unsigned long)data;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 336146798e56..f8c80940c7ad 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -562,4 +562,9 @@  struct cio {
 	atomic_t refcount;
 };
 
+struct copy_ctx {
+	struct cio *cio;
+	struct work_struct dispatch_work;
+	struct bio *write_bio;
+};
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 963f5c97dec0..c176bf6173c5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1047,6 +1047,9 @@  ssize_t blkdev_copy_offload(
 		struct block_device *bdev_in, loff_t pos_in,
 		struct block_device *bdev_out, loff_t pos_out,
 		size_t len, cio_iodone_t end_io, void *private, gfp_t gfp_mask);
+struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
+		gfp_t gfp_mask);
+void bio_map_kern_endio(struct bio *bio);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */