diff mbox series

[v16,04/12] block: add emulation for copy

Message ID 20230920080756.11919-5-nj.shetty@samsung.com (mailing list archive)
State New, archived
Headers show
Series [v16,01/12] block: Introduce queue limits and sysfs for copy-offload support | expand

Commit Message

Nitesh Shetty Sept. 20, 2023, 8:07 a.m. UTC
For devices which do not support copy, copy emulation is added.
It is required for in-kernel users like fabrics, where file descriptor is
not available and hence they can't use copy_file_range.
Copy-emulation is implemented by reading from source into memory and
writing to the corresponding destination.
At present, the in-kernel user of emulation is fabrics.

Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Vincent Fu <vincent.fu@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 block/blk-lib.c        | 223 +++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |   4 +
 2 files changed, 227 insertions(+)

Comments

Jinyoung Choi Sept. 22, 2023, 1:08 p.m. UTC | #1
> +static void blkdev_copy_emulation_work(struct work_struct *work)
> +{
> +        struct blkdev_copy_emulation_io *emulation_io = container_of(work,
> +                        struct blkdev_copy_emulation_io, emulation_work);
> +        struct blkdev_copy_io *cio = emulation_io->cio;
> +        struct bio *read_bio, *write_bio;
> +        loff_t pos_in = emulation_io->pos_in, pos_out = emulation_io->pos_out;
> +        ssize_t rem, chunk;
> +        int ret = 0;
> +
> +        for (rem = emulation_io->len; rem > 0; rem -= chunk) {
> +                chunk = min_t(int, emulation_io->buf_len, rem);
> +
> +                read_bio = bio_map_buf(emulation_io->buf,
> +                                       emulation_io->buf_len,
> +                                       emulation_io->gfp);
> +                if (IS_ERR(read_bio)) {
> +                        ret = PTR_ERR(read_bio);
> +                        break;
> +                }
> +                read_bio->bi_opf = REQ_OP_READ | REQ_SYNC;
> +                bio_set_dev(read_bio, emulation_io->bdev_in);
> +                read_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT;
> +                read_bio->bi_iter.bi_size = chunk;
> +                ret = submit_bio_wait(read_bio);
> +                kfree(read_bio);

Hi, Nitesh,

blk_mq_map_bio_put(read_bio)?
or bio_uninit(read_bio); kfree(read_bio)?

> +                if (ret)
> +                        break;
> +
> +                write_bio = bio_map_buf(emulation_io->buf,
> +                                        emulation_io->buf_len,
> +                                        emulation_io->gfp);
> +                if (IS_ERR(write_bio)) {
> +                        ret = PTR_ERR(write_bio);
> +                        break;
> +                }
> +                write_bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
> +                bio_set_dev(write_bio, emulation_io->bdev_out);
> +                write_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT;
> +                write_bio->bi_iter.bi_size = chunk;
> +                ret = submit_bio_wait(write_bio);
> +                kfree(write_bio);

blk_mq_map_bio_put(write_bio) ?
or bio_uninit(write_bio); kfree(write_bio)?

hmm... 
It continuously allocates and releases memory for bio,
Why don't you just allocate and reuse bio outside the loop?

> +                if (ret)
> +                        break;
> +
> +                pos_in += chunk;
> +                pos_out += chunk;
> +        }
> +        cio->status = ret;
> +        kvfree(emulation_io->buf);
> +        kfree(emulation_io);

I have not usually seen an implementation that releases memory for
itself while performing a worker. ( I don't know what's right. :) )

Since blkdev_copy_emulation() allocates memory for the emulation 
and waits for it to be completed, wouldn't it be better to proceed
with the memory release for it in the same context?

That is, IMO, wouldn't it be better to free the memory related to
emulation in blkdev_copy_wait_io_completion()?

Best Regards,
Jinyoung.
Nitesh Shetty Sept. 26, 2023, 10:07 a.m. UTC | #2
>> +                write_bio->bi_iter.bi_size = chunk;
>> +                ret = submit_bio_wait(write_bio);
>> +                kfree(write_bio);
>
>blk_mq_map_bio_put(write_bio) ?
>or bio_uninit(write_bio); kfree(write_bio)?
>
>hmm...
>It continuously allocates and releases memory for bio,
>Why don't you just allocate and reuse bio outside the loop?
>

Agree, we will update this in next version.

>> +                if (ret)
>> +                        break;
>> +
>> +                pos_in += chunk;
>> +                pos_out += chunk;
>> +        }
>> +        cio->status = ret;
>> +        kvfree(emulation_io->buf);
>> +        kfree(emulation_io);
>
>I have not usually seen an implementation that releases memory for
>itself while performing a worker. ( I don't know what's right. :) )
>
The worker is already executing at this point.
We think releasing the reference after it starts executing should not
be an issue, and it didn't come-up in any of our testing too.

>Since blkdev_copy_emulation() allocates memory for the emulation
>and waits for it to be completed, wouldn't it be better to proceed
>with the memory release for it in the same context?
>
>That is, IMO, wouldn't it be better to free the memory related to
>emulation in blkdev_copy_wait_io_completion()?
>

Above mentioned design works for synchronous IOs. But for asynchronous
IOs emulation job is punted to worker and submitter task returns.
Submitter doesn't wait for emulation to complete and memory is freed
later by worker.

Thank you,
Nitesh Shetty
Nitesh Shetty Oct. 18, 2023, 10:08 a.m. UTC | #3
On 26/09/23 03:37PM, Nitesh Jagadeesh Shetty wrote:
>>>+                write_bio->bi_iter.bi_size = chunk;
>>>+                ret = submit_bio_wait(write_bio);
>>>+                kfree(write_bio);
>>
>>blk_mq_map_bio_put(write_bio) ?
>>or bio_uninit(write_bio); kfree(write_bio)?
>>
>>hmm...
>>It continuously allocates and releases memory for bio,
>>Why don't you just allocate and reuse bio outside the loop?
>>
>
>Agree, we will update this in next version.
>
Reusing the bio won't work in cases where the bio gets split.
So we decided to keep the previous design.

Thank you,
Nitesh Shetty
diff mbox series

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 50d10fa3c4c5..da3594d25a3f 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -26,6 +26,20 @@  struct blkdev_copy_offload_io {
 	loff_t offset;
 };
 
+/*
+ * Keeps track of single outstanding copy emulation IO.
+ *
+ * Set up by blkdev_copy_emulation() and released by
+ * blkdev_copy_emulation_work() once its copy loop has finished.
+ */
+struct blkdev_copy_emulation_io {
+	/* Copy request this emulation belongs to; receives the final status */
+	struct blkdev_copy_io *cio;
+	/* Work item whose handler is blkdev_copy_emulation_work() */
+	struct work_struct emulation_work;
+	/* Bounce buffer from blkdev_copy_alloc_buf(), freed with kvfree() */
+	void *buf;
+	/* Size of @buf in bytes as actually allocated; upper bound per chunk */
+	ssize_t buf_len;
+	/* Source start offset in bytes (shifted by SECTOR_SHIFT for the bio) */
+	loff_t pos_in;
+	/* Destination start offset in bytes */
+	loff_t pos_out;
+	/* Total number of bytes to copy */
+	ssize_t len;
+	/* Device the data is read from */
+	struct block_device *bdev_in;
+	/* Device the data is written to */
+	struct block_device *bdev_out;
+	/* Allocation flags reused for the per-chunk bios in the worker */
+	gfp_t gfp;
+};
+
 static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
 {
 	unsigned int discard_granularity = bdev_discard_granularity(bdev);
@@ -316,6 +330,215 @@  ssize_t blkdev_copy_offload(struct block_device *bdev, loff_t pos_in,
 }
 EXPORT_SYMBOL_GPL(blkdev_copy_offload);
 
+/*
+ * Allocate the bounce buffer used by copy emulation.
+ *
+ * Tries @req_size bytes first and halves the request after every failed
+ * attempt. A request larger than a page is never shrunk below PAGE_SIZE;
+ * a request smaller than a page gets exactly one attempt at its own size,
+ * so that short copies are not rejected with a bogus allocation failure.
+ *
+ * On success returns the buffer (release it with kvfree()) and stores the
+ * size that was actually obtained in @alloc_size. Returns NULL, leaving
+ * @alloc_size untouched, if @req_size is not positive or every attempt
+ * failed.
+ *
+ * NOTE(review): halving does not keep the size a multiple of the device
+ * logical block size - confirm that callers cope with that.
+ */
+static void *blkdev_copy_alloc_buf(ssize_t req_size, ssize_t *alloc_size,
+				   gfp_t gfp)
+{
+	/* Floor for the back-off; never larger than the request itself */
+	ssize_t min_size = min_t(ssize_t, req_size, PAGE_SIZE);
+	void *buf;
+
+	if (req_size <= 0)
+		return NULL;
+
+	for (; req_size >= min_size; req_size >>= 1) {
+		buf = kvmalloc(req_size, gfp);
+		if (buf) {
+			*alloc_size = req_size;
+			return buf;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Build a bio whose segments cover the @len bytes starting at kernel
+ * address @data. Both directly mapped and vmalloc addresses are accepted;
+ * for the latter the range is flushed with flush_kernel_vmap_range() and
+ * the address is remembered in bi_private.
+ *
+ * The bio comes from bio_kmalloc(). Returns the bio, ERR_PTR(-ENOMEM) if
+ * it cannot be allocated, or ERR_PTR(-EINVAL) if a page cannot be added
+ * in full.
+ */
+static struct bio *bio_map_buf(void *data, unsigned int len, gfp_t gfp)
+{
+	unsigned long addr = (unsigned long)data;
+	unsigned long page_lo = addr >> PAGE_SHIFT;
+	unsigned long page_hi = (addr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	const int nr_pages = page_hi - page_lo;
+	const bool vmapped = is_vmalloc_addr(data);
+	int off = offset_in_page(addr);
+	struct bio *bio;
+	int n;
+
+	bio = bio_kmalloc(nr_pages, gfp);
+	if (!bio)
+		return ERR_PTR(-ENOMEM);
+	bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
+
+	if (vmapped) {
+		flush_kernel_vmap_range(data, len);
+		bio->bi_private = data;
+	}
+
+	/* Only the first page can start at a non-zero offset */
+	for (n = 0; n < nr_pages && len; n++, off = 0) {
+		unsigned int bytes = min_t(unsigned int, PAGE_SIZE - off, len);
+		struct page *page = vmapped ? vmalloc_to_page(data) :
+					      virt_to_page(data);
+
+		if (bio_add_page(bio, page, bytes, off) < bytes)
+			goto err_partial;
+
+		data += bytes;
+		len -= bytes;
+	}
+
+	return bio;
+
+err_partial:
+	/* we don't support partial mappings */
+	bio_uninit(bio);
+	kfree(bio);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Synchronously transfer @len bytes between @buf and @bdev at byte offset
+ * @pos. @opf selects the direction (REQ_OP_READ or REQ_OP_WRITE); REQ_SYNC
+ * is added here. Returns 0 on success or a negative errno.
+ */
+static int blkdev_copy_emulation_rw(struct block_device *bdev, void *buf,
+				    ssize_t len, loff_t pos, blk_opf_t opf,
+				    gfp_t gfp)
+{
+	struct bio *bio;
+	int ret;
+
+	/* Map only what is transferred, so bi_size matches the mapping */
+	bio = bio_map_buf(buf, len, gfp);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	bio->bi_opf = opf | REQ_SYNC;
+	bio_set_dev(bio, bdev);
+	bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+	ret = submit_bio_wait(bio);
+
+	/*
+	 * Data read through the direct mapping may be stale in the vmalloc
+	 * alias on architectures with aliasing caches.
+	 */
+	if (!ret && !op_is_write(opf) && is_vmalloc_addr(buf))
+		invalidate_kernel_vmap_range(buf, len);
+
+	/* A bio_kmalloc()ed bio has to be bio_uninit()ed before kfree() */
+	bio_uninit(bio);
+	kfree(bio);
+
+	return ret;
+}
+
+/*
+ * Work handler performing the emulated copy: the range is processed in
+ * chunks of at most buf_len bytes, each chunk being read from bdev_in into
+ * the bounce buffer and then written to bdev_out. The loop stops at the
+ * first error.
+ *
+ * The final status is stored in the parent cio, the bounce buffer and the
+ * emulation descriptor (which embeds @work) are freed, and the request is
+ * completed through blkdev_copy_endio().
+ */
+static void blkdev_copy_emulation_work(struct work_struct *work)
+{
+	struct blkdev_copy_emulation_io *emulation_io = container_of(work,
+			struct blkdev_copy_emulation_io, emulation_work);
+	struct blkdev_copy_io *cio = emulation_io->cio;
+	loff_t pos_in = emulation_io->pos_in, pos_out = emulation_io->pos_out;
+	ssize_t rem, chunk;
+	int ret = 0;
+
+	for (rem = emulation_io->len; rem > 0; rem -= chunk) {
+		/* Both operands are ssize_t; do not narrow them to int */
+		chunk = min(emulation_io->buf_len, rem);
+
+		ret = blkdev_copy_emulation_rw(emulation_io->bdev_in,
+					       emulation_io->buf, chunk,
+					       pos_in, REQ_OP_READ,
+					       emulation_io->gfp);
+		if (ret)
+			break;
+
+		ret = blkdev_copy_emulation_rw(emulation_io->bdev_out,
+					       emulation_io->buf, chunk,
+					       pos_out, REQ_OP_WRITE,
+					       emulation_io->gfp);
+		if (ret)
+			break;
+
+		pos_in += chunk;
+		pos_out += chunk;
+	}
+	cio->status = ret;
+	kvfree(emulation_io->buf);
+	/* @work lives inside emulation_io; it must not be touched after this */
+	kfree(emulation_io);
+	blkdev_copy_endio(cio);
+}
+
+/*
+ * Largest transfer, in bytes, that @q accepts in one request: the smaller
+ * of the hardware sector limit and the segment limit (counting one page
+ * per segment).
+ */
+static inline ssize_t queue_max_hw_bytes(struct request_queue *q)
+{
+	/* Widen before shifting so that a large limit cannot wrap in 32 bits */
+	u64 sector_bytes = (u64)queue_max_hw_sectors(q) << SECTOR_SHIFT;
+	u64 segment_bytes = (u64)queue_max_segments(q) << PAGE_SHIFT;
+
+	return min_t(u64, sector_bytes, segment_bytes);
+}
+
+/*
+ * blkdev_copy_emulation - copy between block devices through a bounce buffer
+ * @bdev_in:	source block device
+ * @pos_in:	source offset
+ * @bdev_out:	destination block device
+ * @pos_out:	destination offset
+ * @len:	length in bytes to be copied
+ * @endio:	endio function to be called on completion of copy operation,
+ *		for synchronous operation this should be NULL
+ * @private:	endio function will be called with this private data,
+ *		for synchronous operation this should be NULL
+ * @gfp:	memory allocation flags, used for the request bookkeeping, the
+ *		bounce buffer and the bios issued by the worker
+ *
+ * For synchronous operation returns the length of bytes copied or error
+ * For asynchronous operation returns -EIOCBQUEUED or error
+ *
+ * The error is whatever blkdev_copy_sanity_check() reports for the given
+ * ranges, or -ENOMEM if one of the allocations fails.
+ *
+ * Description:
+ *	If native copy offload feature is absent, caller can use this function
+ *	to perform copy.
+ *	We store information required to perform the copy along with temporary
+ *	buffer allocation. We async punt copy emulation to a worker. And worker
+ *	performs copy in 2 steps.
+ *	1. Read data from source to temporary buffer
+ *	2. Write data to destination from temporary buffer
+ */
+ssize_t blkdev_copy_emulation(struct block_device *bdev_in, loff_t pos_in,
+			      struct block_device *bdev_out, loff_t pos_out,
+			      size_t len, void (*endio)(void *, int, ssize_t),
+			      void *private, gfp_t gfp)
+{
+	struct request_queue *in = bdev_get_queue(bdev_in);
+	struct request_queue *out = bdev_get_queue(bdev_out);
+	struct blkdev_copy_emulation_io *emulation_io;
+	struct blkdev_copy_io *cio;
+	ssize_t ret;
+	/* One chunk must be acceptable to both the source and destination */
+	size_t max_hw_bytes = min(queue_max_hw_bytes(in),
+				  queue_max_hw_bytes(out));
+
+	ret = blkdev_copy_sanity_check(bdev_in, pos_in, bdev_out, pos_out, len);
+	if (ret)
+		return ret;
+
+	/* Honour the caller's allocation context for every allocation */
+	cio = kzalloc(sizeof(*cio), gfp);
+	if (!cio)
+		return -ENOMEM;
+
+	cio->waiter = current;
+	cio->copied = len;
+	cio->endio = endio;
+	cio->private = private;
+
+	emulation_io = kzalloc(sizeof(*emulation_io), gfp);
+	if (!emulation_io)
+		goto err_free_cio;
+	emulation_io->cio = cio;
+	INIT_WORK(&emulation_io->emulation_work, blkdev_copy_emulation_work);
+	emulation_io->pos_in = pos_in;
+	emulation_io->pos_out = pos_out;
+	emulation_io->len = len;
+	emulation_io->bdev_in = bdev_in;
+	emulation_io->bdev_out = bdev_out;
+	emulation_io->gfp = gfp;
+
+	emulation_io->buf = blkdev_copy_alloc_buf(min(max_hw_bytes, len),
+						  &emulation_io->buf_len, gfp);
+	if (!emulation_io->buf)
+		goto err_free_emulation_io;
+
+	schedule_work(&emulation_io->emulation_work);
+
+	/*
+	 * From here on the worker owns emulation_io and may already have
+	 * completed the request, so decide on the caller-supplied @endio
+	 * rather than reading it back through cio.
+	 */
+	if (endio)
+		return -EIOCBQUEUED;
+
+	return blkdev_copy_wait_io_completion(cio);
+
+err_free_emulation_io:
+	kfree(emulation_io);
+err_free_cio:
+	kfree(cio);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blkdev_copy_emulation);
+
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
 		struct bio **biop, unsigned flags)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5405499bcf22..e0a832a1c3a7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1046,6 +1046,10 @@  ssize_t blkdev_copy_offload(struct block_device *bdev, loff_t pos_in,
 			    loff_t pos_out, size_t len,
 			    void (*endio)(void *, int, ssize_t),
 			    void *private, gfp_t gfp_mask);
+ssize_t blkdev_copy_emulation(struct block_device *bdev_in, loff_t pos_in,
+			      struct block_device *bdev_out, loff_t pos_out,
+			      size_t len, void (*endio)(void *, int, ssize_t),
+			      void *private, gfp_t gfp);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */