Message ID | 20230522104146.2856-3-nj.shetty@samsung.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Implement copy offload support | expand |
On Mon, May 22, 2023 at 04:11:33PM +0530, Nitesh Shetty wrote: > Introduce blkdev_issue_copy which takes similar arguments as > copy_file_range and performs copy offload between two bdevs. > Introduce REQ_COPY copy offload operation flag. Create a read-write > bio pair with a token as payload and submitted to the device in order. > Read request populates token with source specific information which > is then passed with write request. > This design is courtesy Mikulas Patocka's token based copy > > Larger copy will be divided, based on max_copy_sectors limit. > > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com> > Signed-off-by: Anuj Gupta <anuj20.g@samsung.com> > --- > block/blk-lib.c | 235 ++++++++++++++++++++++++++++++++++++++ > block/blk.h | 2 + > include/linux/blk_types.h | 25 ++++ > include/linux/blkdev.h | 3 + > 4 files changed, 265 insertions(+) > > diff --git a/block/blk-lib.c b/block/blk-lib.c > index e59c3069e835..ed089e703cb1 100644 > --- a/block/blk-lib.c > +++ b/block/blk-lib.c > @@ -115,6 +115,241 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, > } > EXPORT_SYMBOL(blkdev_issue_discard); > > +/* > + * For synchronous copy offload/emulation, wait and process all in-flight BIOs. > + * This must only be called once all bios have been issued so that the refcount > + * can only decrease. This just waits for all bios to make it through > + * blkdev_copy_write_endio. > + */ > +static int blkdev_copy_wait_completion(struct cio *cio) > +{ > + int ret; > + > + if (cio->endio) > + return 0; > + > + if (atomic_read(&cio->refcount)) { > + __set_current_state(TASK_UNINTERRUPTIBLE); > + blk_io_schedule(); > + } > + > + ret = cio->comp_len; > + kfree(cio); > + > + return ret; > +} > + > +static void blkdev_copy_offload_write_endio(struct bio *bio) > +{ > + struct copy_ctx *ctx = bio->bi_private; > + struct cio *cio = ctx->cio; > + sector_t clen; > + > + if (bio->bi_status) { > + clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_out; > + cio->comp_len = min_t(sector_t, clen, cio->comp_len); > + } > + __free_page(bio->bi_io_vec[0].bv_page); > + bio_put(bio); > + > + kfree(ctx); > + if (!atomic_dec_and_test(&cio->refcount)) > + return; > + if (cio->endio) { > + cio->endio(cio->private, cio->comp_len); > + kfree(cio); > + } else > + blk_wake_io_task(cio->waiter); > +} > + > +static void blkdev_copy_offload_read_endio(struct bio *read_bio) > +{ > + struct copy_ctx *ctx = read_bio->bi_private; > + struct cio *cio = ctx->cio; > + sector_t clen; > + > + if (read_bio->bi_status) { > + clen = (read_bio->bi_iter.bi_sector << SECTOR_SHIFT) > + - cio->pos_in; > + cio->comp_len = min_t(sector_t, clen, cio->comp_len); > + __free_page(read_bio->bi_io_vec[0].bv_page); > + bio_put(ctx->write_bio); > + bio_put(read_bio); > + kfree(ctx); > + if (atomic_dec_and_test(&cio->refcount)) { > + if (cio->endio) { > + cio->endio(cio->private, cio->comp_len); > + kfree(cio); > + } else > + blk_wake_io_task(cio->waiter); > + } > + return; > + } > + > + schedule_work(&ctx->dispatch_work); > + bio_put(read_bio); > +} > + > +static void blkdev_copy_dispatch_work(struct work_struct *work) > +{ > + struct copy_ctx *ctx = container_of(work, struct copy_ctx, > + dispatch_work); > + > + submit_bio(ctx->write_bio); > +} > + > +/* > + * __blkdev_copy_offload - Use device's native copy offload feature. > + * we perform copy operation by sending 2 bio. > + * 1. First we send a read bio with REQ_COPY flag along with a token and source > + * and length. Once read bio reaches driver layer, device driver adds all the > + * source info to token and does a fake completion. > + * 2. Once read operation completes, we issue write with REQ_COPY flag with same > + * token. In driver layer, token info is used to form a copy offload command. > + * > + * Returns the length of bytes copied or error if encountered > + */ > +static int __blkdev_copy_offload(struct block_device *bdev_in, loff_t pos_in, > + struct block_device *bdev_out, loff_t pos_out, size_t len, > + cio_iodone_t endio, void *private, gfp_t gfp_mask) > +{ > + struct cio *cio; > + struct copy_ctx *ctx; > + struct bio *read_bio, *write_bio; > + struct page *token; > + sector_t copy_len; > + sector_t rem, max_copy_len; > + > + cio = kzalloc(sizeof(struct cio), GFP_KERNEL); > + if (!cio) > + return -ENOMEM; > + atomic_set(&cio->refcount, 0); > + cio->waiter = current; > + cio->endio = endio; > + cio->private = private; > + > + max_copy_len = min(bdev_max_copy_sectors(bdev_in), > + bdev_max_copy_sectors(bdev_out)) << SECTOR_SHIFT; > + > + cio->pos_in = pos_in; > + cio->pos_out = pos_out; > + /* If there is a error, comp_len will be set to least successfully > + * completed copied length > + */ > + cio->comp_len = len; > + for (rem = len; rem > 0; rem -= copy_len) { > + copy_len = min(rem, max_copy_len); > + > + token = alloc_page(gfp_mask); > + if (unlikely(!token)) > + goto err_token; > + > + ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask); > + if (!ctx) > + goto err_ctx; > + read_bio = bio_alloc(bdev_in, 1, REQ_OP_READ | REQ_COPY > + | REQ_SYNC | REQ_NOMERGE, gfp_mask); > + if (!read_bio) > + goto err_read_bio; > + write_bio = bio_alloc(bdev_out, 1, REQ_OP_WRITE > + | REQ_COPY | REQ_SYNC | REQ_NOMERGE, gfp_mask); > + if (!write_bio) > + goto err_write_bio; > + > + ctx->cio = cio; > + ctx->write_bio = write_bio; > + INIT_WORK(&ctx->dispatch_work, blkdev_copy_dispatch_work); > + > + __bio_add_page(read_bio, token, PAGE_SIZE, 0); > + read_bio->bi_iter.bi_size = copy_len; > + read_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT; > + read_bio->bi_end_io = blkdev_copy_offload_read_endio; > + read_bio->bi_private = ctx; > + > + __bio_add_page(write_bio, token, PAGE_SIZE, 0); > + write_bio->bi_iter.bi_size = copy_len; > + write_bio->bi_end_io = blkdev_copy_offload_write_endio; > + write_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT; > + write_bio->bi_private = ctx; > + > + atomic_inc(&cio->refcount); > + submit_bio(read_bio); > + pos_in += copy_len; > + pos_out += copy_len; > + } > + > + /* Wait for completion of all IO's*/ > + return blkdev_copy_wait_completion(cio); > + > +err_write_bio: > + bio_put(read_bio); > +err_read_bio: > + kfree(ctx); > +err_ctx: > + __free_page(token); > +err_token: > + cio->comp_len = min_t(sector_t, cio->comp_len, (len - rem)); > + if (!atomic_read(&cio->refcount)) > + return -ENOMEM; > + /* Wait for submitted IOs to complete */ > + return blkdev_copy_wait_completion(cio); > +} > + > +static inline int blkdev_copy_sanity_check(struct block_device *bdev_in, > + loff_t pos_in, struct block_device *bdev_out, loff_t pos_out, > + size_t len) > +{ > + unsigned int align = max(bdev_logical_block_size(bdev_out), > + bdev_logical_block_size(bdev_in)) - 1; > + > + if (bdev_read_only(bdev_out)) > + return -EPERM; > + > + if ((pos_in & align) || (pos_out & align) || (len & align) || !len || > + len >= COPY_MAX_BYTES) > + return -EINVAL; > + > + return 0; > +} > + > +/* > + * @bdev_in: source block device > + * @pos_in: source offset > + * @bdev_out: destination block device > + * @pos_out: destination offset > + * @len: length in bytes to be copied > + * @endio: endio function to be called on completion of copy operation, > + * for synchronous operation this should be NULL > + * @private: endio function will be called with this private data, should be > + * NULL, if operation is synchronous in nature > + * @gfp_mask: memory allocation flags (for bio_alloc) > + * > + * Returns the length of bytes copied or error if encountered > + * > + * Description: > + * Copy source offset from source block device to destination block > + * device. Max total length of copy is limited to MAX_COPY_TOTAL_LENGTH > + */ > +int blkdev_issue_copy(struct block_device *bdev_in, loff_t pos_in, I'd have thought you'd return ssize_t here. If the two block devices are loopmounted xfs files, we can certainly reflink "copy" more than 2GB at a time. --D > + struct block_device *bdev_out, loff_t pos_out, size_t len, > + cio_iodone_t endio, void *private, gfp_t gfp_mask) > +{ > + struct request_queue *q_in = bdev_get_queue(bdev_in); > + struct request_queue *q_out = bdev_get_queue(bdev_out); > + int ret; > + > + ret = blkdev_copy_sanity_check(bdev_in, pos_in, bdev_out, pos_out, len); > + if (ret) > + return ret; > + > + if (blk_queue_copy(q_in) && blk_queue_copy(q_out)) > + ret = __blkdev_copy_offload(bdev_in, pos_in, bdev_out, pos_out, > + len, endio, private, gfp_mask); > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(blkdev_issue_copy); > + > static int __blkdev_issue_write_zeroes(struct block_device *bdev, > sector_t sector, sector_t nr_sects, gfp_t gfp_mask, > struct bio **biop, unsigned flags) > diff --git a/block/blk.h b/block/blk.h > index 45547bcf1119..ec48a237fe12 100644 > --- a/block/blk.h > +++ b/block/blk.h > @@ -303,6 +303,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio, > break; > } > > + if (unlikely(op_is_copy(bio->bi_opf))) > + return false; > /* > * All drivers must accept single-segments bios that are <= PAGE_SIZE. > * This is a quick and dirty check that relies on the fact that > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index 740afe80f297..0117e33087e1 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -420,6 +420,7 @@ enum req_flag_bits { > */ > /* for REQ_OP_WRITE_ZEROES: */ > __REQ_NOUNMAP, /* do not free blocks when zeroing */ > + __REQ_COPY, /* copy request */ > > __REQ_NR_BITS, /* stops here */ > }; > @@ -444,6 +445,7 @@ enum req_flag_bits { > #define REQ_POLLED (__force blk_opf_t)(1ULL << __REQ_POLLED) > #define REQ_ALLOC_CACHE (__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE) > #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) > +#define REQ_COPY ((__force blk_opf_t)(1ULL << __REQ_COPY)) > #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) > #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) > > @@ -474,6 +476,11 @@ static inline bool op_is_write(blk_opf_t op) > return !!(op & (__force blk_opf_t)1); > } > > +static inline bool op_is_copy(blk_opf_t op) > +{ > + return op & REQ_COPY; > +} > + > /* > * Check if the bio or request is one that needs special treatment in the > * flush state machine. > @@ -533,4 +540,22 @@ struct blk_rq_stat { > u64 batch; > }; > > +typedef void (cio_iodone_t)(void *private, int comp_len); > + > +struct cio { > + struct task_struct *waiter; /* waiting task (NULL if none) */ > + atomic_t refcount; > + loff_t pos_in; > + loff_t pos_out; > + size_t comp_len; > + cio_iodone_t *endio; /* applicable for async operation */ > + void *private; /* applicable for async operation */ > +}; > + > +struct copy_ctx { > + struct cio *cio; > + struct work_struct dispatch_work; > + struct bio *write_bio; > +}; > + > #endif /* __LINUX_BLK_TYPES_H */ > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index c9bf11adccb3..6f2814ab4741 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -1051,6 +1051,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, > sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); > int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, > sector_t nr_sects, gfp_t gfp); > +int blkdev_issue_copy(struct block_device *bdev_in, loff_t pos_in, > + struct block_device *bdev_out, loff_t pos_out, size_t len, > + cio_iodone_t end_io, void *private, gfp_t gfp_mask); > > #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ > #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ > -- > 2.35.1.500.gb896f729e2 > > -- > dm-devel mailing list > dm-devel@redhat.com > https://listman.redhat.com/mailman/listinfo/dm-devel >
> > +/* > > + * @bdev_in: source block device > > + * @pos_in: source offset > > + * @bdev_out: destination block device > > + * @pos_out: destination offset > > + * @len: length in bytes to be copied > > + * @endio: endio function to be called on completion of copy operation, > > + * for synchronous operation this should be NULL > > + * @private: endio function will be called with this private data, should be > > + * NULL, if operation is synchronous in nature > > + * @gfp_mask: memory allocation flags (for bio_alloc) > > + * > > + * Returns the length of bytes copied or error if encountered > > + * > > + * Description: > > + * Copy source offset from source block device to destination block > > + * device. Max total length of copy is limited to MAX_COPY_TOTAL_LENGTH > > + */ > > +int blkdev_issue_copy(struct block_device *bdev_in, loff_t pos_in, > > I'd have thought you'd return ssize_t here. If the two block devices > are loopmounted xfs files, we can certainly reflink "copy" more than 2GB > at a time. > > --D > Sure we will add this to make API future proof, but at present we do have a limit for copy. COPY_MAX_BYTES(=128MB) at present. This limit is based on our internal testing, we have plans to increase/remove with this limit in future phases. Thank you, Nitesh Shetty
On Mon, May 22, 2023 at 04:11:33PM +0530, Nitesh Shetty wrote:
> + token = alloc_page(gfp_mask);
Why is PAGE_SIZE the right size for 'token'? That seems quite unlikely.
I could understand it being SECTOR_SIZE or something that's dependent on
the device, but I cannot fathom it being dependent on the host' page size.
On 23/05/29 06:55PM, Matthew Wilcox wrote: >On Mon, May 22, 2023 at 04:11:33PM +0530, Nitesh Shetty wrote: >> + token = alloc_page(gfp_mask); > >Why is PAGE_SIZE the right size for 'token'? That seems quite unlikely. >I could understand it being SECTOR_SIZE or something that's dependent on >the device, but I cannot fathom it being dependent on the host' page size. This has nothing to do with block device at this point, merely a place holder to store information about copy offload such as src sectors, len. Token will be typecasted by driver to get copy info. SECTOR_SIZE also should work in this case, will update in next version.
po 22. 5. 2023 v 13:17 odesÃlatel Nitesh Shetty <nj.shetty@samsung.com> napsal: > > +static int __blkdev_copy_offload(struct block_device *bdev_in, loff_t pos_in, > + struct block_device *bdev_out, loff_t pos_out, size_t len, > + cio_iodone_t endio, void *private, gfp_t gfp_mask) > +{ > + struct cio *cio; > + struct copy_ctx *ctx; > + struct bio *read_bio, *write_bio; > + struct page *token; > + sector_t copy_len; > + sector_t rem, max_copy_len; > + > + cio = kzalloc(sizeof(struct cio), GFP_KERNEL); > + if (!cio) > + return -ENOMEM; > + atomic_set(&cio->refcount, 0); > + cio->waiter = current; > + cio->endio = endio; > + cio->private = private; > + > + max_copy_len = min(bdev_max_copy_sectors(bdev_in), > + bdev_max_copy_sectors(bdev_out)) << SECTOR_SHIFT; > + > + cio->pos_in = pos_in; > + cio->pos_out = pos_out; > + /* If there is a error, comp_len will be set to least successfully > + * completed copied length > + */ > + cio->comp_len = len; > + for (rem = len; rem > 0; rem -= copy_len) { > + copy_len = min(rem, max_copy_len); > + > + token = alloc_page(gfp_mask); > + if (unlikely(!token)) > + goto err_token; [...] > +err_token: > + cio->comp_len = min_t(sector_t, cio->comp_len, (len - rem)); > + if (!atomic_read(&cio->refcount)) > + return -ENOMEM; > + /* Wait for submitted IOs to complete */ > + return blkdev_copy_wait_completion(cio); > +} Suppose the first call to "token = alloc_page()" fails (and cio->refcount == 0), isn't "cio" going to be leaked here? Maurizio
On 23/05/30 01:29PM, Maurizio Lombardi wrote: >po 22. 5. 2023 v 13:17 odesÃlatel Nitesh Shetty <nj.shetty@samsung.com> napsal: >> >> +static int __blkdev_copy_offload(struct block_device *bdev_in, loff_t pos_in, >> + struct block_device *bdev_out, loff_t pos_out, size_t len, >> + cio_iodone_t endio, void *private, gfp_t gfp_mask) >> +{ >> + struct cio *cio; >> + struct copy_ctx *ctx; >> + struct bio *read_bio, *write_bio; >> + struct page *token; >> + sector_t copy_len; >> + sector_t rem, max_copy_len; >> + >> + cio = kzalloc(sizeof(struct cio), GFP_KERNEL); >> + if (!cio) >> + return -ENOMEM; >> + atomic_set(&cio->refcount, 0); >> + cio->waiter = current; >> + cio->endio = endio; >> + cio->private = private; >> + >> + max_copy_len = min(bdev_max_copy_sectors(bdev_in), >> + bdev_max_copy_sectors(bdev_out)) << SECTOR_SHIFT; >> + >> + cio->pos_in = pos_in; >> + cio->pos_out = pos_out; >> + /* If there is a error, comp_len will be set to least successfully >> + * completed copied length >> + */ >> + cio->comp_len = len; >> + for (rem = len; rem > 0; rem -= copy_len) { >> + copy_len = min(rem, max_copy_len); >> + >> + token = alloc_page(gfp_mask); >> + if (unlikely(!token)) >> + goto err_token; > >[...] > >> +err_token: >> + cio->comp_len = min_t(sector_t, cio->comp_len, (len - rem)); >> + if (!atomic_read(&cio->refcount)) >> + return -ENOMEM; >> + /* Wait for submitted IOs to complete */ >> + return blkdev_copy_wait_completion(cio); >> +} > >Suppose the first call to "token = alloc_page()" fails (and >cio->refcount == 0), isn't "cio" going to be leaked here? > >Maurizio > Agreed, will free it in "err_token", and will update next version. Thank you, Nitesh Shetty
diff --git a/block/blk-lib.c b/block/blk-lib.c index e59c3069e835..ed089e703cb1 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -115,6 +115,241 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_discard); +/* + * For synchronous copy offload/emulation, wait and process all in-flight BIOs. + * This must only be called once all bios have been issued so that the refcount + * can only decrease. This just waits for all bios to make it through + * blkdev_copy_write_endio. + */ +static int blkdev_copy_wait_completion(struct cio *cio) +{ + int ret; + + if (cio->endio) + return 0; + + if (atomic_read(&cio->refcount)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + blk_io_schedule(); + } + + ret = cio->comp_len; + kfree(cio); + + return ret; +} + +static void blkdev_copy_offload_write_endio(struct bio *bio) +{ + struct copy_ctx *ctx = bio->bi_private; + struct cio *cio = ctx->cio; + sector_t clen; + + if (bio->bi_status) { + clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - cio->pos_out; + cio->comp_len = min_t(sector_t, clen, cio->comp_len); + } + __free_page(bio->bi_io_vec[0].bv_page); + bio_put(bio); + + kfree(ctx); + if (!atomic_dec_and_test(&cio->refcount)) + return; + if (cio->endio) { + cio->endio(cio->private, cio->comp_len); + kfree(cio); + } else + blk_wake_io_task(cio->waiter); +} + +static void blkdev_copy_offload_read_endio(struct bio *read_bio) +{ + struct copy_ctx *ctx = read_bio->bi_private; + struct cio *cio = ctx->cio; + sector_t clen; + + if (read_bio->bi_status) { + clen = (read_bio->bi_iter.bi_sector << SECTOR_SHIFT) + - cio->pos_in; + cio->comp_len = min_t(sector_t, clen, cio->comp_len); + __free_page(read_bio->bi_io_vec[0].bv_page); + bio_put(ctx->write_bio); + bio_put(read_bio); + kfree(ctx); + if (atomic_dec_and_test(&cio->refcount)) { + if (cio->endio) { + cio->endio(cio->private, cio->comp_len); + kfree(cio); + } else + blk_wake_io_task(cio->waiter); + } + return; + } + + schedule_work(&ctx->dispatch_work); + bio_put(read_bio); +} + +static void blkdev_copy_dispatch_work(struct work_struct *work) +{ + struct copy_ctx *ctx = container_of(work, struct copy_ctx, + dispatch_work); + + submit_bio(ctx->write_bio); +} + +/* + * __blkdev_copy_offload - Use device's native copy offload feature. + * we perform copy operation by sending 2 bio. + * 1. First we send a read bio with REQ_COPY flag along with a token and source + * and length. Once read bio reaches driver layer, device driver adds all the + * source info to token and does a fake completion. + * 2. Once read operation completes, we issue write with REQ_COPY flag with same + * token. In driver layer, token info is used to form a copy offload command. + * + * Returns the length of bytes copied or error if encountered + */ +static int __blkdev_copy_offload(struct block_device *bdev_in, loff_t pos_in, + struct block_device *bdev_out, loff_t pos_out, size_t len, + cio_iodone_t endio, void *private, gfp_t gfp_mask) +{ + struct cio *cio; + struct copy_ctx *ctx; + struct bio *read_bio, *write_bio; + struct page *token; + sector_t copy_len; + sector_t rem, max_copy_len; + + cio = kzalloc(sizeof(struct cio), GFP_KERNEL); + if (!cio) + return -ENOMEM; + atomic_set(&cio->refcount, 0); + cio->waiter = current; + cio->endio = endio; + cio->private = private; + + max_copy_len = min(bdev_max_copy_sectors(bdev_in), + bdev_max_copy_sectors(bdev_out)) << SECTOR_SHIFT; + + cio->pos_in = pos_in; + cio->pos_out = pos_out; + /* If there is a error, comp_len will be set to least successfully + * completed copied length + */ + cio->comp_len = len; + for (rem = len; rem > 0; rem -= copy_len) { + copy_len = min(rem, max_copy_len); + + token = alloc_page(gfp_mask); + if (unlikely(!token)) + goto err_token; + + ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask); + if (!ctx) + goto err_ctx; + read_bio = bio_alloc(bdev_in, 1, REQ_OP_READ | REQ_COPY + | REQ_SYNC | REQ_NOMERGE, gfp_mask); + if (!read_bio) + goto err_read_bio; + write_bio = bio_alloc(bdev_out, 1, REQ_OP_WRITE + | REQ_COPY | REQ_SYNC | REQ_NOMERGE, gfp_mask); + if (!write_bio) + goto err_write_bio; + + ctx->cio = cio; + ctx->write_bio = write_bio; + INIT_WORK(&ctx->dispatch_work, blkdev_copy_dispatch_work); + + __bio_add_page(read_bio, token, PAGE_SIZE, 0); + read_bio->bi_iter.bi_size = copy_len; + read_bio->bi_iter.bi_sector = pos_in >> SECTOR_SHIFT; + read_bio->bi_end_io = blkdev_copy_offload_read_endio; + read_bio->bi_private = ctx; + + __bio_add_page(write_bio, token, PAGE_SIZE, 0); + write_bio->bi_iter.bi_size = copy_len; + write_bio->bi_end_io = blkdev_copy_offload_write_endio; + write_bio->bi_iter.bi_sector = pos_out >> SECTOR_SHIFT; + write_bio->bi_private = ctx; + + atomic_inc(&cio->refcount); + submit_bio(read_bio); + pos_in += copy_len; + pos_out += copy_len; + } + + /* Wait for completion of all IO's*/ + return blkdev_copy_wait_completion(cio); + +err_write_bio: + bio_put(read_bio); +err_read_bio: + kfree(ctx); +err_ctx: + __free_page(token); +err_token: + cio->comp_len = min_t(sector_t, cio->comp_len, (len - rem)); + if (!atomic_read(&cio->refcount)) + return -ENOMEM; + /* Wait for submitted IOs to complete */ + return blkdev_copy_wait_completion(cio); +} + +static inline int blkdev_copy_sanity_check(struct block_device *bdev_in, + loff_t pos_in, struct block_device *bdev_out, loff_t pos_out, + size_t len) +{ + unsigned int align = max(bdev_logical_block_size(bdev_out), + bdev_logical_block_size(bdev_in)) - 1; + + if (bdev_read_only(bdev_out)) + return -EPERM; + + if ((pos_in & align) || (pos_out & align) || (len & align) || !len || + len >= COPY_MAX_BYTES) + return -EINVAL; + + return 0; +} + +/* + * @bdev_in: source block device + * @pos_in: source offset + * @bdev_out: destination block device + * @pos_out: destination offset + * @len: length in bytes to be copied + * @endio: endio function to be called on completion of copy operation, + * for synchronous operation this should be NULL + * @private: endio function will be called with this private data, should be + * NULL, if operation is synchronous in nature + * @gfp_mask: memory allocation flags (for bio_alloc) + * + * Returns the length of bytes copied or error if encountered + * + * Description: + * Copy source offset from source block device to destination block + * device. Max total length of copy is limited to MAX_COPY_TOTAL_LENGTH + */ +int blkdev_issue_copy(struct block_device *bdev_in, loff_t pos_in, + struct block_device *bdev_out, loff_t pos_out, size_t len, + cio_iodone_t endio, void *private, gfp_t gfp_mask) +{ + struct request_queue *q_in = bdev_get_queue(bdev_in); + struct request_queue *q_out = bdev_get_queue(bdev_out); + int ret; + + ret = blkdev_copy_sanity_check(bdev_in, pos_in, bdev_out, pos_out, len); + if (ret) + return ret; + + if (blk_queue_copy(q_in) && blk_queue_copy(q_out)) + ret = __blkdev_copy_offload(bdev_in, pos_in, bdev_out, pos_out, + len, endio, private, gfp_mask); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_issue_copy); + static int __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) diff --git a/block/blk.h b/block/blk.h index 45547bcf1119..ec48a237fe12 100644 --- a/block/blk.h +++ b/block/blk.h @@ -303,6 +303,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio, break; } + if (unlikely(op_is_copy(bio->bi_opf))) + return false; /* * All drivers must accept single-segments bios that are <= PAGE_SIZE. * This is a quick and dirty check that relies on the fact that diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 740afe80f297..0117e33087e1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -420,6 +420,7 @@ enum req_flag_bits { */ /* for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_COPY, /* copy request */ __REQ_NR_BITS, /* stops here */ }; @@ -444,6 +445,7 @@ enum req_flag_bits { #define REQ_POLLED (__force blk_opf_t)(1ULL << __REQ_POLLED) #define REQ_ALLOC_CACHE (__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE) #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) +#define REQ_COPY ((__force blk_opf_t)(1ULL << __REQ_COPY)) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) @@ -474,6 +476,11 @@ static inline bool op_is_write(blk_opf_t op) return !!(op & (__force blk_opf_t)1); } +static inline bool op_is_copy(blk_opf_t op) +{ + return op & REQ_COPY; +} + /* * Check if the bio or request is one that needs special treatment in the * flush state machine. @@ -533,4 +540,22 @@ struct blk_rq_stat { u64 batch; }; +typedef void (cio_iodone_t)(void *private, int comp_len); + +struct cio { + struct task_struct *waiter; /* waiting task (NULL if none) */ + atomic_t refcount; + loff_t pos_in; + loff_t pos_out; + size_t comp_len; + cio_iodone_t *endio; /* applicable for async operation */ + void *private; /* applicable for async operation */ +}; + +struct copy_ctx { + struct cio *cio; + struct work_struct dispatch_work; + struct bio *write_bio; +}; + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c9bf11adccb3..6f2814ab4741 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1051,6 +1051,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp); +int blkdev_issue_copy(struct block_device *bdev_in, loff_t pos_in, + struct block_device *bdev_out, loff_t pos_out, size_t len, + cio_iodone_t end_io, void *private, gfp_t gfp_mask); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */