diff mbox series

[v20,02/12] Add infrastructure for copy offload in block and request layer.

Message ID 20240520102033.9361-3-nj.shetty@samsung.com (mailing list archive)
State New, archived
Headers show
Series Implement copy offload support | expand

Commit Message

Nitesh Shetty May 20, 2024, 10:20 a.m. UTC
We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
Since copy is a composite operation involving src and dst sectors/lba,
each needs to be represented by a separate bio to make it compatible
with device mapper.
We expect caller to take a plug and send bio with destination information,
followed by bio with source information.
Once the dst bio arrives we form a request and wait for source
bio. Upon arrival of source bio we merge these two bio's and send
corresponding request down to device driver.
Merging non copy offload bio is avoided by checking for copy specific
opcodes in merge function.

Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 block/blk-core.c          |  7 +++++++
 block/blk-merge.c         | 41 +++++++++++++++++++++++++++++++++++++++
 block/blk.h               | 16 +++++++++++++++
 block/elevator.h          |  1 +
 include/linux/bio.h       |  6 +-----
 include/linux/blk_types.h | 10 ++++++++++
 6 files changed, 76 insertions(+), 5 deletions(-)

Comments

Damien Le Moal May 20, 2024, 3 p.m. UTC | #1
On 2024/05/20 12:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.

Why ? The beginning of the sentence isn't justification enough for the two new
operation codes ? The 2 sentences should be reversed for easier reading:
justification first naturally leads to the reader understanding why the codes
are needed.

Also: s/opcode/operations


> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.

expect ? Plugging is optional. Does copy offload require it ? Please clarify this.

> Once the dst bio arrives we form a request and wait for source

arrives ? You mean "is submitted" ?

s/and wait for/and wait for the

> bio. Upon arrival of source bio we merge these two bio's and send

s/arrival/submission ?

s/of/of the
s/bio's/BIOs
s/and send/and send the
s/down to/down to the

> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.

Super unclear... What are you trying to say here ? That merging copy offload
BIOs with other BIOs is not allowed ? That is already handled. Only BIOs &
requests with the same operation can be merged. The code below also suggests
that you allow merging copy offloads... So I really do not understand this.

> 
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> ---
>  block/blk-core.c          |  7 +++++++
>  block/blk-merge.c         | 41 +++++++++++++++++++++++++++++++++++++++
>  block/blk.h               | 16 +++++++++++++++
>  block/elevator.h          |  1 +
>  include/linux/bio.h       |  6 +-----
>  include/linux/blk_types.h | 10 ++++++++++
>  6 files changed, 76 insertions(+), 5 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index ea44b13af9af..f18ee5f709c0 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -122,6 +122,8 @@ static const char *const blk_op_name[] = {
>  	REQ_OP_NAME(ZONE_FINISH),
>  	REQ_OP_NAME(ZONE_APPEND),
>  	REQ_OP_NAME(WRITE_ZEROES),
> +	REQ_OP_NAME(COPY_SRC),
> +	REQ_OP_NAME(COPY_DST),
>  	REQ_OP_NAME(DRV_IN),
>  	REQ_OP_NAME(DRV_OUT),
>  };
> @@ -838,6 +840,11 @@ void submit_bio_noacct(struct bio *bio)
>  		 * requests.
>  		 */
>  		fallthrough;
> +	case REQ_OP_COPY_SRC:
> +	case REQ_OP_COPY_DST:
> +		if (!q->limits.max_copy_sectors)
> +			goto not_supported;
> +		break;
>  	default:
>  		goto not_supported;
>  	}
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 8534c35e0497..f8dc48a03379 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -154,6 +154,20 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
>  	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
>  }
>  
> +static struct bio *bio_split_copy(struct bio *bio,
> +				  const struct queue_limits *lim,
> +				  unsigned int *nsegs)
> +{
> +	*nsegs = 1;
> +	if (bio_sectors(bio) <= lim->max_copy_sectors)
> +		return NULL;
> +	/*
> +	 * We don't support splitting for a copy bio. End it with EIO if
> +	 * splitting is required and return an error pointer.
> +	 */
> +	return ERR_PTR(-EIO);
> +}

Hmm... Why not check that the copy request is small enough and will not be split
when it is submitted ? Something like blk_check_zone_append() does with
REQ_OP_ZONE_APPEND ? So adding a blk_check_copy_offload(). That would also
include the limits check from the previous hunk.

> +
>  /*
>   * Return the maximum number of sectors from the start of a bio that may be
>   * submitted as a single request to a block device. If enough sectors remain,
> @@ -362,6 +376,12 @@ struct bio *__bio_split_to_limits(struct bio *bio,
>  	case REQ_OP_WRITE_ZEROES:
>  		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
>  		break;
> +	case REQ_OP_COPY_SRC:
> +	case REQ_OP_COPY_DST:
> +		split = bio_split_copy(bio, lim, nr_segs);
> +		if (IS_ERR(split))
> +			return NULL;
> +		break;

See above.

>  	default:
>  		split = bio_split_rw(bio, lim, nr_segs, bs,
>  				get_max_io_size(bio, lim) << SECTOR_SHIFT);
> @@ -925,6 +945,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
>  	if (!rq_mergeable(rq) || !bio_mergeable(bio))
>  		return false;
>  
> +	if (blk_copy_offload_mergable(rq, bio))
> +		return true;
> +
>  	if (req_op(rq) != bio_op(bio))
>  		return false;
>  
> @@ -958,6 +981,8 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
>  {
>  	if (blk_discard_mergable(rq))
>  		return ELEVATOR_DISCARD_MERGE;
> +	else if (blk_copy_offload_mergable(rq, bio))
> +		return ELEVATOR_COPY_OFFLOAD_MERGE;
>  	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
>  		return ELEVATOR_BACK_MERGE;
>  	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
> @@ -1065,6 +1090,20 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
>  	return BIO_MERGE_FAILED;
>  }
>  
> +static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
> +							    struct bio *bio)
> +{
> +	if (req->__data_len != bio->bi_iter.bi_size)
> +		return BIO_MERGE_FAILED;
> +
> +	req->biotail->bi_next = bio;
> +	req->biotail = bio;
> +	req->nr_phys_segments++;
> +	req->__data_len += bio->bi_iter.bi_size;

Arg... You seem to be assuming that the source BIO always comes right after the
destination request... What if copy offloads are being concurrently issued ?
Shouldn't you check somehow that the pair is a match ? Or are you relying on the
per-context plugging which prevents that from happening in the first place ? But
that would assumes that you never ever sleep trying to allocate the source BIO
after the destination BIO/request are prepared and plugged.

> +
> +	return BIO_MERGE_OK;
> +}
> +
>  static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>  						   struct request *rq,
>  						   struct bio *bio,
> @@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>  		break;
>  	case ELEVATOR_DISCARD_MERGE:
>  		return bio_attempt_discard_merge(q, rq, bio);
> +	case ELEVATOR_COPY_OFFLOAD_MERGE:
> +		return bio_attempt_copy_offload_merge(rq, bio);
>  	default:
>  		return BIO_MERGE_NONE;
>  	}
> diff --git a/block/blk.h b/block/blk.h
> index 189bc25beb50..6528a2779b84 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -174,6 +174,20 @@ static inline bool blk_discard_mergable(struct request *req)
>  	return false;
>  }
>  
> +/*
> + * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
> + * operation by taking a plug.
> + * Initially DST bio is sent which forms a request and
> + * waits for SRC bio to arrive. Once SRC bio arrives
> + * we merge it and send request down to driver.
> + */
> +static inline bool blk_copy_offload_mergable(struct request *req,
> +					     struct bio *bio)
> +{
> +	return (req_op(req) == REQ_OP_COPY_DST &&
> +		bio_op(bio) == REQ_OP_COPY_SRC);
> +}

This function is really not needed at all (used in one place only).

> +
>  static inline unsigned int blk_rq_get_max_segments(struct request *rq)
>  {
>  	if (req_op(rq) == REQ_OP_DISCARD)
> @@ -323,6 +337,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
>  	case REQ_OP_DISCARD:
>  	case REQ_OP_SECURE_ERASE:
>  	case REQ_OP_WRITE_ZEROES:
> +	case REQ_OP_COPY_SRC:
> +	case REQ_OP_COPY_DST:
>  		return true; /* non-trivial splitting decisions */

See above. Limits should be checked on submission.

>  	default:
>  		break;
> diff --git a/block/elevator.h b/block/elevator.h
> index e9a050a96e53..c7a45c1f4156 100644
> --- a/block/elevator.h
> +++ b/block/elevator.h
> @@ -18,6 +18,7 @@ enum elv_merge {
>  	ELEVATOR_FRONT_MERGE	= 1,
>  	ELEVATOR_BACK_MERGE	= 2,
>  	ELEVATOR_DISCARD_MERGE	= 3,
> +	ELEVATOR_COPY_OFFLOAD_MERGE	= 4,
>  };
>  
>  struct blk_mq_alloc_data;
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index d5379548d684..528ef22dd65b 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -53,11 +53,7 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs)
>   */
>  static inline bool bio_has_data(struct bio *bio)
>  {
> -	if (bio &&
> -	    bio->bi_iter.bi_size &&
> -	    bio_op(bio) != REQ_OP_DISCARD &&
> -	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
> -	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
> +	if (bio && (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE))
>  		return true;

This change seems completely broken and out of place. This would cause a return
of false for zone append operations.

>  
>  	return false;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 781c4500491b..7f692bade271 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -342,6 +342,10 @@ enum req_op {
>  	/* reset all the zone present on the device */
>  	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
>  
> +	/* copy offload src and dst operation */

s/src/source
s/dst/destination
s/operation/operations

> +	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
> +	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
> +
>  	/* Driver private requests */
>  	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
>  	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
> @@ -430,6 +434,12 @@ static inline bool op_is_write(blk_opf_t op)
>  	return !!(op & (__force blk_opf_t)1);
>  }
>  
> +static inline bool op_is_copy(blk_opf_t op)
> +{
> +	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
> +		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
> +}

May be use a switch here to avoid the double masking of op ?

> +
>  /*
>   * Check if the bio or request is one that needs special treatment in the
>   * flush state machine.
Bart Van Assche May 20, 2024, 11 p.m. UTC | #2
On 5/20/24 03:20, Nitesh Shetty wrote:
> Upon arrival of source bio we merge these two bio's and send
> corresponding request down to device driver.

bios with different operation types must not be merged.

> +static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
> +							    struct bio *bio)
> +{
> +	if (req->__data_len != bio->bi_iter.bi_size)
> +		return BIO_MERGE_FAILED;
> +
> +	req->biotail->bi_next = bio;
> +	req->biotail = bio;
> +	req->nr_phys_segments++;
> +	req->__data_len += bio->bi_iter.bi_size;
> +
> +	return BIO_MERGE_OK;
> +}

This function appends a bio to a request. Hence, the name of this function is
wrong.

> @@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>   		break;
>   	case ELEVATOR_DISCARD_MERGE:
>   		return bio_attempt_discard_merge(q, rq, bio);
> +	case ELEVATOR_COPY_OFFLOAD_MERGE:
> +		return bio_attempt_copy_offload_merge(rq, bio);
>   	default:
>   		return BIO_MERGE_NONE;
>   	}

Is any code added in this patch series that causes an I/O scheduler to return
ELEVATOR_COPY_OFFLOAD_MERGE?

> +static inline bool blk_copy_offload_mergable(struct request *req,
> +					     struct bio *bio)
> +{
> +	return (req_op(req) == REQ_OP_COPY_DST &&
> +		bio_op(bio) == REQ_OP_COPY_SRC);
> +}

bios with different operation types must not be merged. Please rename this function.

> +static inline bool op_is_copy(blk_opf_t op)
> +{
> +	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
> +		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
> +}

The above function is not used in this patch. Please introduce new functions in the
patch in which these are used for the first time.

Thanks,

Bart.
Hannes Reinecke May 21, 2024, 7:01 a.m. UTC | #3
On 5/20/24 12:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.
> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.
> Once the dst bio arrives we form a request and wait for source
> bio. Upon arrival of source bio we merge these two bio's and send
> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.
> 
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> ---
>   block/blk-core.c          |  7 +++++++
>   block/blk-merge.c         | 41 +++++++++++++++++++++++++++++++++++++++
>   block/blk.h               | 16 +++++++++++++++
>   block/elevator.h          |  1 +
>   include/linux/bio.h       |  6 +-----
>   include/linux/blk_types.h | 10 ++++++++++
>   6 files changed, 76 insertions(+), 5 deletions(-)
> 
I am a bit unsure about leveraging 'merge' here. As Bart pointed out, 
this is arguably as mis-use of the 'merge' functionality as we don't
actually merge bios, but rather use the information from these bios to
form the actual request.
Wouldn't it be better to use bio_chain here, and send out the eventual
request from the end_io function of the bio chain?

Cheers,

Hannes
Nitesh Shetty May 21, 2024, 10:50 a.m. UTC | #4
On 20/05/24 05:00PM, Damien Le Moal wrote:
>On 2024/05/20 12:20, Nitesh Shetty wrote:
>> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>> Since copy is a composite operation involving src and dst sectors/lba,
>> each needs to be represented by a separate bio to make it compatible
>> with device mapper.
>
>Why ? The beginning of the sentence isn't justification enough for the two new
>operation codes ? The 2 sentences should be reversed for easier reading:
>justification first naturally leads to the reader understanding why the codes
>are needed.
>
>Also: s/opcode/operations
>
>
Acked

>> We expect caller to take a plug and send bio with destination information,
>> followed by bio with source information.
>
>expect ? Plugging is optional. Does copy offload require it ? Please clarify this.
>
"caller" here refers to blkdev_copy_offload. The caller of blkdev_copy_offload
does not need to take the plug. We should have made this clear.
Sorry for the confusion, will update the description to reflect the same
in next version.

>> Once the dst bio arrives we form a request and wait for source
>
>arrives ? You mean "is submitted" ?
>
>s/and wait for/and wait for the
>
>> bio. Upon arrival of source bio we merge these two bio's and send
>
>s/arrival/submission ?
>
>s/of/of the
>s/bio's/BIOs
>s/and send/and send the
>s/down to/down to the
>
acked for above.

>> corresponding request down to device driver.
>> Merging non copy offload bio is avoided by checking for copy specific
>> opcodes in merge function.
>
>Super unclear... What are you trying to say here ? That merging copy offload
>BIOs with other BIOs is not allowed ? That is already handled. Only BIOs &
>requests with the same operation can be merged. The code below also suggests
>that you allow merging copy offloads... So I really do not understand this.
>
You are right, only BIOs & requests with same operation can be merged.
We are not merging copy offloads. We are only merging src and dst BIO to
form a copy offload request.
We will remove this in next version.

>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>> index 8534c35e0497..f8dc48a03379 100644
>> --- a/block/blk-merge.c
>> +++ b/block/blk-merge.c
>> @@ -154,6 +154,20 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
>>  	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
>>  }
>>
>> +static struct bio *bio_split_copy(struct bio *bio,
>> +				  const struct queue_limits *lim,
>> +				  unsigned int *nsegs)
>> +{
>> +	*nsegs = 1;
>> +	if (bio_sectors(bio) <= lim->max_copy_sectors)
>> +		return NULL;
>> +	/*
>> +	 * We don't support splitting for a copy bio. End it with EIO if
>> +	 * splitting is required and return an error pointer.
>> +	 */
>> +	return ERR_PTR(-EIO);
>> +}
>
>Hmm... Why not check that the copy request is small enough and will not be split
>when it is submitted ? Something like blk_check_zone_append() does with
>REQ_OP_ZONE_APPEND ? So adding a blk_check_copy_offload(). That would also
>include the limits check from the previous hunk.
Yes, that could be one way. But we followed approach similar to discard.

>
>> +
>>  /*
>>   * Return the maximum number of sectors from the start of a bio that may be
>>   * submitted as a single request to a block device. If enough sectors remain,
>> @@ -362,6 +376,12 @@ struct bio *__bio_split_to_limits(struct bio *bio,
>>  	case REQ_OP_WRITE_ZEROES:
>>  		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
>>  		break;
>> +	case REQ_OP_COPY_SRC:
>> +	case REQ_OP_COPY_DST:
>> +		split = bio_split_copy(bio, lim, nr_segs);
>> +		if (IS_ERR(split))
>> +			return NULL;
>> +		break;
>
>See above.
>
>>  	default:
>>  		split = bio_split_rw(bio, lim, nr_segs, bs,
>>  				get_max_io_size(bio, lim) << SECTOR_SHIFT);
>> @@ -925,6 +945,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
>>  	if (!rq_mergeable(rq) || !bio_mergeable(bio))
>>  		return false;
>>
>> +	if (blk_copy_offload_mergable(rq, bio))
>> +		return true;
>> +
>>  	if (req_op(rq) != bio_op(bio))
>>  		return false;
>>
>> @@ -958,6 +981,8 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
>>  {
>>  	if (blk_discard_mergable(rq))
>>  		return ELEVATOR_DISCARD_MERGE;
>> +	else if (blk_copy_offload_mergable(rq, bio))
>> +		return ELEVATOR_COPY_OFFLOAD_MERGE;
>>  	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
>>  		return ELEVATOR_BACK_MERGE;
>>  	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
>> @@ -1065,6 +1090,20 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
>>  	return BIO_MERGE_FAILED;
>>  }
>>
>> +static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
>> +							    struct bio *bio)
>> +{
>> +	if (req->__data_len != bio->bi_iter.bi_size)
>> +		return BIO_MERGE_FAILED;
>> +
>> +	req->biotail->bi_next = bio;
>> +	req->biotail = bio;
>> +	req->nr_phys_segments++;
>> +	req->__data_len += bio->bi_iter.bi_size;
>
>Arg... You seem to be assuming that the source BIO always comes right after the
>destination request... What if copy offloads are being concurrently issued ?
>Shouldn't you check somehow that the pair is a match ? Or are you relying on the
>per-context plugging which prevents that from happening in the first place ? But
>that would assumes that you never ever sleep trying to allocate the source BIO
>after the destination BIO/request are prepared and plugged.
>
Yes, we are rely on per-context plugging for copy to work.
Incase for any reason we are not able to merge src and dst BIOs, and the
request reaches driver layer with only one of the src or dst BIO,
we fail this request in driver layer.
This approach simplifies the overall copy plumbing.

>> +
>> +	return BIO_MERGE_OK;
>> +}
>> +
>>  static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>>  						   struct request *rq,
>>  						   struct bio *bio,
>> @@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>>  		break;
>>  	case ELEVATOR_DISCARD_MERGE:
>>  		return bio_attempt_discard_merge(q, rq, bio);
>> +	case ELEVATOR_COPY_OFFLOAD_MERGE:
>> +		return bio_attempt_copy_offload_merge(rq, bio);
>>  	default:
>>  		return BIO_MERGE_NONE;
>>  	}
>> diff --git a/block/blk.h b/block/blk.h
>> index 189bc25beb50..6528a2779b84 100644
>> --- a/block/blk.h
>> +++ b/block/blk.h
>> @@ -174,6 +174,20 @@ static inline bool blk_discard_mergable(struct request *req)
>>  	return false;
>>  }
>>
>> +/*
>> + * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
>> + * operation by taking a plug.
>> + * Initially DST bio is sent which forms a request and
>> + * waits for SRC bio to arrive. Once SRC bio arrives
>> + * we merge it and send request down to driver.
>> + */
>> +static inline bool blk_copy_offload_mergable(struct request *req,
>> +					     struct bio *bio)
>> +{
>> +	return (req_op(req) == REQ_OP_COPY_DST &&
>> +		bio_op(bio) == REQ_OP_COPY_SRC);
>> +}
>
>This function is really not needed at all (used in one place only).
>
This is used at two places and we felt this provides better readability,
similar to discard.

>> +
>>  static inline unsigned int blk_rq_get_max_segments(struct request *rq)
>>  {
>>  	if (req_op(rq) == REQ_OP_DISCARD)
>> @@ -323,6 +337,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
>>  	case REQ_OP_DISCARD:
>>  	case REQ_OP_SECURE_ERASE:
>>  	case REQ_OP_WRITE_ZEROES:
>> +	case REQ_OP_COPY_SRC:
>> +	case REQ_OP_COPY_DST:
>>  		return true; /* non-trivial splitting decisions */
>
>See above. Limits should be checked on submission.
>
>>  	default:
>>  		break;
>> diff --git a/block/elevator.h b/block/elevator.h
>> index e9a050a96e53..c7a45c1f4156 100644
>> --- a/block/elevator.h
>> +++ b/block/elevator.h
>> @@ -18,6 +18,7 @@ enum elv_merge {
>>  	ELEVATOR_FRONT_MERGE	= 1,
>>  	ELEVATOR_BACK_MERGE	= 2,
>>  	ELEVATOR_DISCARD_MERGE	= 3,
>> +	ELEVATOR_COPY_OFFLOAD_MERGE	= 4,
>>  };
>>
>>  struct blk_mq_alloc_data;
>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>> index d5379548d684..528ef22dd65b 100644
>> --- a/include/linux/bio.h
>> +++ b/include/linux/bio.h
>> @@ -53,11 +53,7 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs)
>>   */
>>  static inline bool bio_has_data(struct bio *bio)
>>  {
>> -	if (bio &&
>> -	    bio->bi_iter.bi_size &&
>> -	    bio_op(bio) != REQ_OP_DISCARD &&
>> -	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
>> -	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
>> +	if (bio && (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE))
>>  		return true;
>
>This change seems completely broken and out of place. This would cause a return
>of false for zone append operations.
>
We changed this based on previous review comments[1].
Idea is to replace this with a positive check.
But we did miss adding ZONE_APPEND in this.
We will add ZONE_APPEND check in next version.

[1] https://lore.kernel.org/linux-block/20230720074256.GA5042@lst.de/
>>
>>  	return false;
>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>> index 781c4500491b..7f692bade271 100644
>> --- a/include/linux/blk_types.h
>> +++ b/include/linux/blk_types.h
>> @@ -342,6 +342,10 @@ enum req_op {
>>  	/* reset all the zone present on the device */
>>  	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
>>
>> +	/* copy offload src and dst operation */
>
>s/src/source
>s/dst/destination
>s/operation/operations
>
Acked

>> +	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
>> +	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
>> +
>>  	/* Driver private requests */
>>  	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
>>  	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
>> @@ -430,6 +434,12 @@ static inline bool op_is_write(blk_opf_t op)
>>  	return !!(op & (__force blk_opf_t)1);
>>  }
>>
>> +static inline bool op_is_copy(blk_opf_t op)
>> +{
>> +	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
>> +		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
>> +}
>
>May be use a switch here to avoid the double masking of op ?
>
Acked

Thank you,
Nitesh Shetty
Nitesh Shetty May 21, 2024, 11:17 a.m. UTC | #5
On 20/05/24 04:00PM, Bart Van Assche wrote:
>On 5/20/24 03:20, Nitesh Shetty wrote:
>>Upon arrival of source bio we merge these two bio's and send
>>corresponding request down to device driver.
>
>bios with different operation types must not be merged.
>
Copy is a composite operation which has two operation read and
write combined, so we chose two operation types which reflects that.

>>+static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
>>+							    struct bio *bio)
>>+{
>>+	if (req->__data_len != bio->bi_iter.bi_size)
>>+		return BIO_MERGE_FAILED;
>>+
>>+	req->biotail->bi_next = bio;
>>+	req->biotail = bio;
>>+	req->nr_phys_segments++;
>>+	req->__data_len += bio->bi_iter.bi_size;
>>+
>>+	return BIO_MERGE_OK;
>>+}
>
>This function appends a bio to a request. Hence, the name of this function is
>wrong.
>
We followed the naming convention from discard(bio_attempt_discard_merge)
which does similar thing.
But we are open to renaming, if overall community also feels the same.

>>@@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>>  		break;
>>  	case ELEVATOR_DISCARD_MERGE:
>>  		return bio_attempt_discard_merge(q, rq, bio);
>>+	case ELEVATOR_COPY_OFFLOAD_MERGE:
>>+		return bio_attempt_copy_offload_merge(rq, bio);
>>  	default:
>>  		return BIO_MERGE_NONE;
>>  	}
>
>Is any code added in this patch series that causes an I/O scheduler to return
>ELEVATOR_COPY_OFFLOAD_MERGE?
>
yes, blk_try_merge returns ELEVATOR_COPY_OFFLOAD_MERGE.

>>+static inline bool blk_copy_offload_mergable(struct request *req,
>>+					     struct bio *bio)
>>+{
>>+	return (req_op(req) == REQ_OP_COPY_DST &&
>>+		bio_op(bio) == REQ_OP_COPY_SRC);
>>+}
>
>bios with different operation types must not be merged. Please rename this function.
>
As described above we need two different opcodes.
As far as function renaming, we followed discard's naming. But open to
any suggestion.

>>+static inline bool op_is_copy(blk_opf_t op)
>>+{
>>+	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
>>+		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
>>+}
>
>The above function is not used in this patch. Please introduce new functions in the
>patch in which these are used for the first time.
>
Acked

Thank You
Nitesh Shetty
Bart Van Assche May 22, 2024, 5:58 p.m. UTC | #6
On 5/21/24 04:17, Nitesh Shetty wrote:
> On 20/05/24 04:00PM, Bart Van Assche wrote:
>> On 5/20/24 03:20, Nitesh Shetty wrote:
>>> +static inline bool blk_copy_offload_mergable(struct request *req,
>>> +                         struct bio *bio)
>>> +{
>>> +    return (req_op(req) == REQ_OP_COPY_DST &&
>>> +        bio_op(bio) == REQ_OP_COPY_SRC);
>>> +}
>>
>> bios with different operation types must not be merged. Please rename this function.
>>
> As far as function renaming, we followed discard's naming. But open to
> any suggestion.

req_attempt_discard_merge() checks whether two REQ_OP_DISCARD bios can be merged.
The above function checks something else, namely whether REQ_OP_COPY_DST and
REQ_OP_COPY_SRC can be combined into a copy offload operation. Hence my request
not to use the verb "merge" for combining REQ_OP_COPY_SRC and REQ_OP_COPY_DST
operations.

Thanks,

Bart.
Bart Van Assche May 22, 2024, 6:05 p.m. UTC | #7
On 5/20/24 03:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.
> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.
> Once the dst bio arrives we form a request and wait for source
> bio. Upon arrival of source bio we merge these two bio's and send
> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.

Plugs are per task. Can the following happen?
* Task A calls blk_start_plug()
* Task B calls blk_start_plug()
* Task A submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
* Task B submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
* Task A calls blk_finish_plug()
* Task B calls blk_finish_plug()
* The REQ_OP_COPY_DST bio from task A and the REQ_OP_COPY_SRC bio from
   task B are combined into a single request.
* The REQ_OP_COPY_DST bio from task B and the REQ_OP_COPY_SRC bio from
   task A are combined into a single request.

Thanks,

Bart.
Nitesh Shetty May 23, 2024, 11:34 a.m. UTC | #8
On 22/05/24 11:05AM, Bart Van Assche wrote:
>On 5/20/24 03:20, Nitesh Shetty wrote:
>>We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>Since copy is a composite operation involving src and dst sectors/lba,
>>each needs to be represented by a separate bio to make it compatible
>>with device mapper.
>>We expect caller to take a plug and send bio with destination information,
>>followed by bio with source information.
>>Once the dst bio arrives we form a request and wait for source
>>bio. Upon arrival of source bio we merge these two bio's and send
>>corresponding request down to device driver.
>>Merging non copy offload bio is avoided by checking for copy specific
>>opcodes in merge function.
>
>Plugs are per task. Can the following happen?
We rely on per-context plugging to avoid this.
>* Task A calls blk_start_plug()
>* Task B calls blk_start_plug()
>* Task A submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
Lets say this forms request A and stored in plug A
>* Task B submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
Lets say this forms request B and stored in plug B
>* Task A calls blk_finish_plug()
>* Task B calls blk_finish_plug()
>* The REQ_OP_COPY_DST bio from task A and the REQ_OP_COPY_SRC bio from
>  task B are combined into a single request.
Here task A picks plug A and hence request A
>* The REQ_OP_COPY_DST bio from task B and the REQ_OP_COPY_SRC bio from
>  task A are combined into a single request.
same as above, request B

So we expect this case not to happen.

Thank you,
Nitesh Shetty
Nitesh Shetty May 24, 2024, 6:54 a.m. UTC | #9
On 21/05/24 09:01AM, Hannes Reinecke wrote:
>On 5/20/24 12:20, Nitesh Shetty wrote:
>>We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>Since copy is a composite operation involving src and dst sectors/lba,
>>each needs to be represented by a separate bio to make it compatible
>>with device mapper.
>>We expect caller to take a plug and send bio with destination information,
>>followed by bio with source information.
>>Once the dst bio arrives we form a request and wait for source
>>bio. Upon arrival of source bio we merge these two bio's and send
>>corresponding request down to device driver.
>>Merging non copy offload bio is avoided by checking for copy specific
>>opcodes in merge function.
>>
>>Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
>>---
>>  block/blk-core.c          |  7 +++++++
>>  block/blk-merge.c         | 41 +++++++++++++++++++++++++++++++++++++++
>>  block/blk.h               | 16 +++++++++++++++
>>  block/elevator.h          |  1 +
>>  include/linux/bio.h       |  6 +-----
>>  include/linux/blk_types.h | 10 ++++++++++
>>  6 files changed, 76 insertions(+), 5 deletions(-)
>>
>I am a bit unsure about leveraging 'merge' here. As Bart pointed out, 
>this is arguably as mis-use of the 'merge' functionality as we don't
>actually merge bios, but rather use the information from these bios to
>form the actual request.
>Wouldn't it be better to use bio_chain here, and send out the eventual
>request from the end_io function of the bio chain?
>
With above bio chain approach, I see a difficulty while dealing with
stacked/partitioned devices, as they might get remapped.
Or am I missing something here ?

Bart, Hannes,
Regarding merge, does it looks any better, if we use single request
operation such as REQ_OP_COPY and use op_flags(REQ_COPY_DST/REQ_COPY_SRC)
to identify dst and src bios ?


Regards,
Nitesh Shetty
Bart Van Assche May 24, 2024, 1:52 p.m. UTC | #10
On 5/23/24 23:54, Nitesh Shetty wrote:
> Regarding merge, does it looks any better, if we use single request
> operation such as REQ_OP_COPY and use op_flags(REQ_COPY_DST/REQ_COPY_SRC)
> to identify dst and src bios ?

I prefer to keep the current approach (REQ_COPY_DST/REQ_COPY_SRC) and to
use a more appropriate verb than "merge", e.g. "combine".

Thanks,

Bart.
Bart Van Assche May 24, 2024, 8:33 p.m. UTC | #11
On 5/20/24 03:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.
> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.
> Once the dst bio arrives we form a request and wait for source
> bio. Upon arrival of source bio we merge these two bio's and send
> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.

In this patch I don't see any changes for blk_attempt_bio_merge(). Does
this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?

Can it happen that the REQ_NOMERGE flag is set by __bio_split_to_limits()
for REQ_OP_COPY_DST or REQ_OP_COPY_SRC bios? Will this happen if the
following condition is met?

dst_bio->nr_phys_segs + src_bio->nr_phys_segs > max_segments

Is it allowed to set REQ_PREFLUSH or REQ_FUA for REQ_OP_COPY_DST or
REQ_OP_COPY_SRC bios? I'm asking this because these flags disable merging.

 From include/linux/blk_types.h:

#define REQ_NOMERGE_FLAGS (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)

Thanks,

Bart.
Nitesh Shetty May 27, 2024, 8:27 a.m. UTC | #12
On 24/05/24 06:52AM, Bart Van Assche wrote:
>On 5/23/24 23:54, Nitesh Shetty wrote:
>>Regarding merge, does it looks any better, if we use single request
>>operation such as REQ_OP_COPY and use op_flags(REQ_COPY_DST/REQ_COPY_SRC)
>>to identify dst and src bios ?
>
>I prefer to keep the current approach (REQ_COPY_DST/REQ_COPY_SRC) and to
>use a more appropriate verb than "merge", e.g. "combine".
>
Thanks for the suggesion, will do this in next version.

--Nitesh Shetty
Bart Van Assche May 28, 2024, 2:07 p.m. UTC | #13
On 5/21/24 00:01, Hannes Reinecke wrote:
> On 5/20/24 12:20, Nitesh Shetty wrote:
>> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>> Since copy is a composite operation involving src and dst sectors/lba,
>> each needs to be represented by a separate bio to make it compatible
>> with device mapper.
>> We expect caller to take a plug and send bio with destination 
>> information,
>> followed by bio with source information.
>> Once the dst bio arrives we form a request and wait for source
>> bio. Upon arrival of source bio we merge these two bio's and send
>> corresponding request down to device driver.
>> Merging non copy offload bio is avoided by checking for copy specific
>> opcodes in merge function.
>>
> I am a bit unsure about leveraging 'merge' here. As Bart pointed out, 
> this is arguably as mis-use of the 'merge' functionality as we don't
> actually merge bios, but rather use the information from these bios to
> form the actual request.
> Wouldn't it be better to use bio_chain here, and send out the eventual
> request from the end_io function of the bio chain?

Let me formulate this a bit stronger: I think this patch series abuses
the merge functionality and also that it should use another mechanism
for combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC. See also my email
with concerns about using the merge functionality:
https://lore.kernel.org/linux-block/eda6c198-3a29-4da4-94db-305cfe28d3d6@acm.org/.

Thanks,

Bart.
Nitesh Shetty May 29, 2024, 6:17 a.m. UTC | #14
On 24/05/24 01:33PM, Bart Van Assche wrote:
>On 5/20/24 03:20, Nitesh Shetty wrote:
>>We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>Since copy is a composite operation involving src and dst sectors/lba,
>>each needs to be represented by a separate bio to make it compatible
>>with device mapper.
>>We expect caller to take a plug and send bio with destination information,
>>followed by bio with source information.
>>Once the dst bio arrives we form a request and wait for source
>>bio. Upon arrival of source bio we merge these two bio's and send
>>corresponding request down to device driver.
>>Merging non copy offload bio is avoided by checking for copy specific
>>opcodes in merge function.
>
>In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>
Yes, in this case copy won't work, as both src and dst bio reach driver
as part of separate requests.
We will add this as part of documentation.

>Can it happen that the REQ_NOMERGE flag is set by __bio_split_to_limits()
>for REQ_OP_COPY_DST or REQ_OP_COPY_SRC bios? Will this happen if the
>following condition is met?
>
>dst_bio->nr_phys_segs + src_bio->nr_phys_segs > max_segments
>
No, this should not happen. We don't use bio_split_rw for copy.
We have added a separate function to check for split incase of
copy(bio_split_copy), which doesn't allow copy bio splits,
hence REQ_NOMERGE flag won't be set.

>Is it allowed to set REQ_PREFLUSH or REQ_FUA for REQ_OP_COPY_DST or
>REQ_OP_COPY_SRC bios? I'm asking this because these flags disable merging.
>
>From include/linux/blk_types.h:
>
>#define REQ_NOMERGE_FLAGS (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
>
No, setting these flags won't allow copy bio's to merge and hence copy
won't work.
We suggest to use helper API blkdev_copy_offload to achieve the
copy which won't be setting these flags.

Thank You,
Nitesh Shetty
Damien Le Moal May 29, 2024, 7:48 a.m. UTC | #15
On 5/29/24 15:17, Nitesh Shetty wrote:
> On 24/05/24 01:33PM, Bart Van Assche wrote:
>> On 5/20/24 03:20, Nitesh Shetty wrote:
>>> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>> Since copy is a composite operation involving src and dst sectors/lba,
>>> each needs to be represented by a separate bio to make it compatible
>>> with device mapper.
>>> We expect caller to take a plug and send bio with destination information,
>>> followed by bio with source information.
>>> Once the dst bio arrives we form a request and wait for source
>>> bio. Upon arrival of source bio we merge these two bio's and send
>>> corresponding request down to device driver.
>>> Merging non copy offload bio is avoided by checking for copy specific
>>> opcodes in merge function.
>>
>> In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>> this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>> happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>>
> Yes, in this case copy won't work, as both src and dst bio reach driver
> as part of separate requests.
> We will add this as part of documentation.

So that means that 2 major SAS HBAs which set this flag (megaraid and mpt3sas)
will not get support for copy offload ? Not ideal, by far.
Bart Van Assche May 29, 2024, 10:41 p.m. UTC | #16
On 5/29/24 12:48 AM, Damien Le Moal wrote:
> On 5/29/24 15:17, Nitesh Shetty wrote:
>> On 24/05/24 01:33PM, Bart Van Assche wrote:
>>> On 5/20/24 03:20, Nitesh Shetty wrote:
>>>> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>>> Since copy is a composite operation involving src and dst sectors/lba,
>>>> each needs to be represented by a separate bio to make it compatible
>>>> with device mapper.
>>>> We expect caller to take a plug and send bio with destination information,
>>>> followed by bio with source information.
>>>> Once the dst bio arrives we form a request and wait for source
>>>> bio. Upon arrival of source bio we merge these two bio's and send
>>>> corresponding request down to device driver.
>>>> Merging non copy offload bio is avoided by checking for copy specific
>>>> opcodes in merge function.
>>>
>>> In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>>> this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>>> happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>>>
>> Yes, in this case copy won't work, as both src and dst bio reach driver
>> as part of separate requests.
>> We will add this as part of documentation.
> 
> So that means that 2 major SAS HBAs which set this flag (megaraid and mpt3sas)
> will not get support for copy offload ? Not ideal, by far.

QUEUE_FLAG_NOMERGES can also be set through sysfs (see also
queue_nomerges_store()). This is one of the reasons why using the merge
infrastructure for combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC is
unacceptable.

Thanks,

Bart.
Nitesh Shetty May 30, 2024, 7:16 a.m. UTC | #17
On 29/05/24 03:41PM, Bart Van Assche wrote:
>On 5/29/24 12:48 AM, Damien Le Moal wrote:
>>On 5/29/24 15:17, Nitesh Shetty wrote:
>>>On 24/05/24 01:33PM, Bart Van Assche wrote:
>>>>On 5/20/24 03:20, Nitesh Shetty wrote:
>>>>>We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>>>>Since copy is a composite operation involving src and dst sectors/lba,
>>>>>each needs to be represented by a separate bio to make it compatible
>>>>>with device mapper.
>>>>>We expect caller to take a plug and send bio with destination information,
>>>>>followed by bio with source information.
>>>>>Once the dst bio arrives we form a request and wait for source
>>>>>bio. Upon arrival of source bio we merge these two bio's and send
>>>>>corresponding request down to device driver.
>>>>>Merging non copy offload bio is avoided by checking for copy specific
>>>>>opcodes in merge function.
>>>>
>>>>In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>>>>this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>>>>happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>>>>
>>>Yes, in this case copy won't work, as both src and dst bio reach driver
>>>as part of separate requests.
>>>We will add this as part of documentation.
>>
>>So that means that 2 major SAS HBAs which set this flag (megaraid and mpt3sas)
>>will not get support for copy offload ? Not ideal, by far.
>
>QUEUE_FLAG_NOMERGES can also be set through sysfs (see also
>queue_nomerges_store()). This is one of the reasons why using the merge
>infrastructure for combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC is
>unacceptable.
>

Bart, Damien, Hannes,
Thanks for your review.
We tried a slightly modified approach which simplifies this patch and
also avoids merge path.
Also with this, we should be able to solve the QUEUE_FLAG_MERGES issue.
Previously we also tried payload/token based approach,
which avoids merge path and tries to combine bios in driver.
But we received feedback that it wasn't the right approach [1].
Do below changes look any better or do you guys have anything else in mind ?

[1] https://lore.kernel.org/linux-block/ZIKphgDavKVPREnw@infradead.org/


diff --git a/block/blk-core.c b/block/blk-core.c
index 82c3ae22d76d..7158bac8cc57 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -122,6 +122,8 @@ static const char *const blk_op_name[] = {
  	REQ_OP_NAME(ZONE_FINISH),
  	REQ_OP_NAME(ZONE_APPEND),
  	REQ_OP_NAME(WRITE_ZEROES),
+	REQ_OP_NAME(COPY_SRC),
+	REQ_OP_NAME(COPY_DST),
  	REQ_OP_NAME(DRV_IN),
  	REQ_OP_NAME(DRV_OUT),
  };
@@ -839,6 +841,11 @@ void submit_bio_noacct(struct bio *bio)
  		 * requests.
  		 */
  		fallthrough;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		if (!q->limits.max_copy_sectors)
+			goto not_supported;
+		break;
  	default:
  		goto not_supported;
  	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8534c35e0497..a651e7c114d0 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -154,6 +154,20 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
  	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
  }
  
+static struct bio *bio_split_copy(struct bio *bio,
+				  const struct queue_limits *lim,
+				  unsigned int *nsegs)
+{
+	*nsegs = 1;
+	if (bio_sectors(bio) <= lim->max_copy_sectors)
+		return NULL;
+	/*
+	 * We don't support splitting for a copy bio. End it with EIO if
+	 * splitting is required and return an error pointer.
+	 */
+	return ERR_PTR(-EIO);
+}
+
  /*
   * Return the maximum number of sectors from the start of a bio that may be
   * submitted as a single request to a block device. If enough sectors remain,
@@ -362,6 +376,12 @@ struct bio *__bio_split_to_limits(struct bio *bio,
  	case REQ_OP_WRITE_ZEROES:
  		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
  		break;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		split = bio_split_copy(bio, lim, nr_segs);
+		if (IS_ERR(split))
+			return NULL;
+		break;
  	default:
  		split = bio_split_rw(bio, lim, nr_segs, bs,
  				get_max_io_size(bio, lim) << SECTOR_SHIFT);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b4df8e5ac9e..6d4ffbdade28 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2833,6 +2833,63 @@ static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
  		blk_mq_commit_rqs(hctx, queued, false);
  }
  
+/*
+ * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
+ * operation by taking a plug.
+ * Initially DST bio is sent which forms a request and
+ * waits for SRC bio to arrive. Once SRC bio arrives
+ * we combine it and send request down to driver.
+ */
+static inline bool blk_copy_offload_combine_ok(struct request *req,
+					      struct bio *bio)
+{
+	return (req_op(req) == REQ_OP_COPY_DST &&
+		bio_op(bio) == REQ_OP_COPY_SRC);
+}
+
+static int blk_copy_offload_combine(struct request *req, struct bio *bio)
+{
+	if (!blk_copy_offload_combine_ok(req, bio))
+		return 1;
+
+	if (req->__data_len != bio->bi_iter.bi_size)
+		return 1;
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->nr_phys_segments++;
+	req->__data_len += bio->bi_iter.bi_size;
+
+	return 0;
+}
+
+static inline bool blk_copy_offload_attempt_combine(struct request_queue *q,
+					     struct bio *bio)
+{
+	struct blk_plug *plug = current->plug;
+	struct request *rq;
+
+	if (!plug || rq_list_empty(plug->mq_list))
+		return false;
+
+	rq_list_for_each(&plug->mq_list, rq) {
+		if (rq->q == q) {
+			if (!blk_copy_offload_combine(rq, bio))
+				return true;
+			break;
+		}
+
+		/*
+		 * Only keep iterating plug list for combines if we have multiple
+		 * queues
+		 */
+		if (!plug->multiple_queues)
+			break;
+	}
+	return false;
+}
+
  static bool blk_mq_attempt_bio_merge(struct request_queue *q,
  				     struct bio *bio, unsigned int nr_segs)
  {
@@ -2977,6 +3034,9 @@ void blk_mq_submit_bio(struct bio *bio)
  	if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
  		goto queue_exit;
  
+	if (blk_copy_offload_attempt_combine(q, bio))
+		goto queue_exit;
+
  	if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
  		goto queue_exit;
  
diff --git a/block/blk.h b/block/blk.h
index 189bc25beb50..112c6736f44c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -323,6 +323,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
  	case REQ_OP_DISCARD:
  	case REQ_OP_SECURE_ERASE:
  	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
  		return true; /* non-trivial splitting decisions */
  	default:
  		break;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 781c4500491b..22a08425d13e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -342,6 +342,10 @@ enum req_op {
  	/* reset all the zone present on the device */
  	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
  
+	/* copy offload source and destination operations */
+	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
+	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
+
  	/* Driver private requests */
  	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
  	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
--·
2.34.1
Bart Van Assche May 30, 2024, 5:11 p.m. UTC | #18
On 5/30/24 00:16, Nitesh Shetty wrote:
> +static inline bool blk_copy_offload_attempt_combine(struct request_queue *q,
> +                         struct bio *bio)
> +{
> +    struct blk_plug *plug = current->plug;
> +    struct request *rq;
> +
> +    if (!plug || rq_list_empty(plug->mq_list))
> +        return false;
> +
> +    rq_list_for_each(&plug->mq_list, rq) {
> +        if (rq->q == q) {
> +            if (!blk_copy_offload_combine(rq, bio))
> +                return true;
> +            break;
> +        }
> +
> +        /*
> +         * Only keep iterating plug list for combines if we have multiple
> +         * queues
> +         */
> +        if (!plug->multiple_queues)
> +            break;
> +    }
> +    return false;
> +}

This new approach has the following two disadvantages:
* Without plug, REQ_OP_COPY_SRC and REQ_OP_COPY_DST are not combined. These two
   operation types are the only operation types for which not using a plug causes
   an I/O failure.
* A loop is required to combine the REQ_OP_COPY_SRC and REQ_OP_COPY_DST operations.

Please switch to the approach Hannes suggested, namely bio chaining. Chaining
REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios before these are submitted eliminates the
two disadvantages mentioned above.

Thanks,

Bart.
Nitesh Shetty May 31, 2024, 10:17 a.m. UTC | #19
On 30/05/24 10:11AM, Bart Van Assche wrote:
>On 5/30/24 00:16, Nitesh Shetty wrote:
>>+static inline bool blk_copy_offload_attempt_combine(struct request_queue *q,
>>+                         struct bio *bio)
>>+{
>>+    struct blk_plug *plug = current->plug;
>>+    struct request *rq;
>>+
>>+    if (!plug || rq_list_empty(plug->mq_list))
>>+        return false;
>>+
>>+    rq_list_for_each(&plug->mq_list, rq) {
>>+        if (rq->q == q) {
>>+            if (!blk_copy_offload_combine(rq, bio))
>>+                return true;
>>+            break;
>>+        }
>>+
>>+        /*
>>+         * Only keep iterating plug list for combines if we have multiple
>>+         * queues
>>+         */
>>+        if (!plug->multiple_queues)
>>+            break;
>>+    }
>>+    return false;
>>+}
>
>This new approach has the following two disadvantages:
>* Without plug, REQ_OP_COPY_SRC and REQ_OP_COPY_DST are not combined. These two
>  operation types are the only operation types for which not using a plug causes
>  an I/O failure.
>* A loop is required to combine the REQ_OP_COPY_SRC and REQ_OP_COPY_DST operations.
>
>Please switch to the approach Hannes suggested, namely bio chaining. Chaining
>REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios before these are submitted eliminates the
>two disadvantages mentioned above.
>
Bart, Hannes,

I see the following challenges with bio-chained approach.
1. partitioned device:
	We need to add the code which iterates over all bios and adjusts
	the sectors offsets.
2. dm/stacked device:
	We need to make major changes in dm, such as allocating cloned
	bios, IO splits, IO offset mappings. All of which need to
	iterate over chained BIOs.

Overall with chained BIOs we need to add a special handling only for copy
to iterate over chained BIOs and do the same thing which is being done
for single BIO at present.
Or am I missing something here ?

Thank You,
Nitesh Shetty
Bart Van Assche May 31, 2024, 11:45 p.m. UTC | #20
On 5/31/24 03:17, Nitesh Shetty wrote:
> I see the following challenges with bio-chained approach.
> 1. partitioned device:
>      We need to add the code which iterates over all bios and adjusts
>      the sectors offsets.
> 2. dm/stacked device:
>      We need to make major changes in dm, such as allocating cloned
>      bios, IO splits, IO offset mappings. All of which need to
>      iterate over chained BIOs.
> 
> Overall with chained BIOs we need to add a special handling only for copy
> to iterate over chained BIOs and do the same thing which is being done
> for single BIO at present.
> Or am I missing something here ?

Hmm ... aren't chained bios submitted individually? See e.g.
bio_chain_and_submit(). In other words, it shouldn't be necessary to
add any code that iterates over bio chains.

Thanks,

Bart.
Christoph Hellwig June 1, 2024, 5:57 a.m. UTC | #21
On Wed, May 29, 2024 at 04:48:18PM +0900, Damien Le Moal wrote:
> > Yes, in this case copy won't work, as both src and dst bio reach driver
> > as part of separate requests.
> > We will add this as part of documentation.
> 
> So that means that 2 major SAS HBAs which set this flag (megaraid and mpt3sas)
> will not get support for copy offload ? Not ideal, by far.

We really need to ignore the normerges flag for this.
As should we for the precedence in the discard merging.

And the drivers need to stop setting this flag as they have no business
to, but that's a separate discussion (and the flag can still be set
manually).
Christoph Hellwig June 1, 2024, 5:59 a.m. UTC | #22
On Thu, May 30, 2024 at 10:11:15AM -0700, Bart Van Assche wrote:
> This new approach has the following two disadvantages:
> * Without plug, REQ_OP_COPY_SRC and REQ_OP_COPY_DST are not combined. These two
>   operation types are the only operation types for which not using a plug causes
>   an I/O failure.

So?  We can clearly document that and even fail submission with a helpful
message trivially to enforce that.

> * A loop is required to combine the REQ_OP_COPY_SRC and REQ_OP_COPY_DST operations.

I don't see why that is a problem.  Especiallly once we allow multiple
sources which is really useful for GC workloads we'll need to do that
anyway.

> Please switch to the approach Hannes suggested, namely bio chaining. Chaining
> REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios before these are submitted eliminates the
> two disadvantages mentioned above.

No.
Bart Van Assche June 3, 2024, 5:12 p.m. UTC | #23
On 5/31/24 22:59, Christoph Hellwig wrote:
> On Thu, May 30, 2024 at 10:11:15AM -0700, Bart Van Assche wrote:
>> This new approach has the following two disadvantages:
>> * Without plug, REQ_OP_COPY_SRC and REQ_OP_COPY_DST are not combined. These two
>>    operation types are the only operation types for which not using a plug causes
>>    an I/O failure.
> 
> So?  We can clearly document that and even fail submission with a helpful
> message trivially to enforce that.

Consider the following use case:
* Task A calls blk_start_plug()
* Task B calls blk_start_plug()
* Task A submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
* Task B submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
* The stacking driver to which all REQ_OP_COPY_* operations have been
   submitted processes bios asynchronusly.
* Task A calls blk_finish_plug()
* Task B calls blk_finish_plug()
* The REQ_OP_COPY_DST bio from task A and the REQ_OP_COPY_SRC bio from
   task B are combined into a single request.
* The REQ_OP_COPY_DST bio from task B and the REQ_OP_COPY_SRC bio from
   task A are combined into a single request.

This results in silent and hard-to-debug data corruption. Do you agree
that we should not restrict copy offloading to stacking drivers that
process bios synchronously and also that this kind of data corruption
should be prevented?

Thanks,

Bart.
Christoph Hellwig June 4, 2024, 4:40 a.m. UTC | #24
On Mon, Jun 03, 2024 at 10:12:48AM -0700, Bart Van Assche wrote:
> Consider the following use case:
> * Task A calls blk_start_plug()
> * Task B calls blk_start_plug()
> * Task A submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
> * Task B submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
> * The stacking driver to which all REQ_OP_COPY_* operations have been
>   submitted processes bios asynchronusly.
> * Task A calls blk_finish_plug()
> * Task B calls blk_finish_plug()
> * The REQ_OP_COPY_DST bio from task A and the REQ_OP_COPY_SRC bio from
>   task B are combined into a single request.
> * The REQ_OP_COPY_DST bio from task B and the REQ_OP_COPY_SRC bio from
>   task A are combined into a single request.
>
> This results in silent and hard-to-debug data corruption. Do you agree
> that we should not restrict copy offloading to stacking drivers that
> process bios synchronously and also that this kind of data corruption
> should be prevented?

There is no requirement to process them synchronously, there is just
a requirement to preserve the order.  Note that my suggestion a few
arounds ago also included a copy id to match them up.  If we don't
need that I'm happy to leave it away.  If need it it to make stacking
drivers' lifes easier that suggestion still stands.

>
> Thanks,
>
> Bart.
---end quoted text---
Bart Van Assche June 4, 2024, 11:44 a.m. UTC | #25
On 6/3/24 21:40, Christoph Hellwig wrote:
> There is no requirement to process them synchronously, there is just
> a requirement to preserve the order.  Note that my suggestion a few
> arounds ago also included a copy id to match them up.  If we don't
> need that I'm happy to leave it away.  If need it it to make stacking
> drivers' lifes easier that suggestion still stands.

Including an ID in REQ_OP_COPY_DST and REQ_OP_COPY_SRC operations sounds
much better to me than abusing the merge infrastructure for combining
these two operations into a single request. With the ID-based approach
stacking drivers are allowed to process copy bios asynchronously and it
is no longer necessary to activate merging for copy operations if
merging is disabled (QUEUE_FLAG_NOMERGES).

Thanks,

Bart.
Christoph Hellwig June 5, 2024, 8:20 a.m. UTC | #26
On Tue, Jun 04, 2024 at 04:44:34AM -0700, Bart Van Assche wrote:
> On 6/3/24 21:40, Christoph Hellwig wrote:
>> There is no requirement to process them synchronously, there is just
>> a requirement to preserve the order.  Note that my suggestion a few
>> arounds ago also included a copy id to match them up.  If we don't
>> need that I'm happy to leave it away.  If need it it to make stacking
>> drivers' lifes easier that suggestion still stands.
>
> Including an ID in REQ_OP_COPY_DST and REQ_OP_COPY_SRC operations sounds
> much better to me than abusing the merge infrastructure for combining
> these two operations into a single request. With the ID-based approach
> stacking drivers are allowed to process copy bios asynchronously and it
> is no longer necessary to activate merging for copy operations if
> merging is disabled (QUEUE_FLAG_NOMERGES).

Again, we can decided on QUEUE_FLAG_NOMERGES per request type.  In fact
I think we should not use it for discards as that just like copy
is a very different kind of "merge".

I'm in fact much more happy about avoiding the copy_id IFF we can.  It
it a fair amout of extra overhead, so we should only add it if there
is a real need for it
Nitesh Shetty June 6, 2024, 7:28 a.m. UTC | #27
On 04/06/24 04:44AM, Bart Van Assche wrote:
>On 6/3/24 21:40, Christoph Hellwig wrote:
>>There is no requirement to process them synchronously, there is just
>>a requirement to preserve the order.  Note that my suggestion a few
>>arounds ago also included a copy id to match them up.  If we don't
>>need that I'm happy to leave it away.  If need it it to make stacking
>>drivers' lifes easier that suggestion still stands.
>
>Including an ID in REQ_OP_COPY_DST and REQ_OP_COPY_SRC operations sounds
>much better to me than abusing the merge infrastructure for combining
>these two operations into a single request. With the ID-based approach
>stacking drivers are allowed to process copy bios asynchronously and it
>is no longer necessary to activate merging for copy operations if
>merging is disabled (QUEUE_FLAG_NOMERGES).
>
Single request, with bio merging approach:
The current approach is to send a single request to driver,
which contains both destination and source information inside separate bios.
Do you have any different approach in mind ?

If we want to proceed with this single request based approach,
we need to merge the destination request with source BIOs at some point.
a. We chose to do it via plug approach.
b. Alternative I see is scheduler merging, but here we need some way to
hold the request which has destination info, until source bio is also
submitted.
c. Is there any other way, which I am missing here ?

Limitation of current plug based approach:
I missed the possibility of asynchronous submission by stacked device.
Since we enabled only dm-linear, we had synchronous submission till now
and our test cases worked fine.
But in future if we start enabling dm targets with asynchronous submission,
the current plug based approach won't work.
The case where Bart mentioned possibility of 2 different tasks sending
copy[1] and they are getting merged wrongly is valid in this case.
There will be corruption, copy ID approach can solve this wrong merging.

Copy ID based merging might preserve the order, but we still need the
copy destination request to wait for copy source bio to merge.

Copy ID approach:
We see 3 possibilities here:
1. No merging: If we include copy-id in src and dst bio, the bio's will get
submitted separately and reach to the driver as separate requests.
How do we plan to form a copy command in driver ?
2. Merging BIOs:
At some point we need to match the src bio with the dst bio and send this
information together to the driver. The current implementation.
This still does not solve the asynchronous submission problem, mentioned
above.
3. Chaining BIOs:
This won't work with stacked devices as there will be cloning, and hence
chain won't be maintained.

[1] https://lore.kernel.org/all/d7ae00c8-c038-4bed-937e-222251bc627a@acm.org/

Thank You,
Nitesh Shetty
Bart Van Assche June 6, 2024, 4:44 p.m. UTC | #28
On 6/6/24 01:28, Nitesh Shetty wrote:
> On 04/06/24 04:44AM, Bart Van Assche wrote:
>> On 6/3/24 21:40, Christoph Hellwig wrote:
>>> There is no requirement to process them synchronously, there is just
>>> a requirement to preserve the order.  Note that my suggestion a few
>>> arounds ago also included a copy id to match them up.  If we don't
>>> need that I'm happy to leave it away.  If need it it to make stacking
>>> drivers' lifes easier that suggestion still stands.
>>
>> Including an ID in REQ_OP_COPY_DST and REQ_OP_COPY_SRC operations sounds
>> much better to me than abusing the merge infrastructure for combining
>> these two operations into a single request. With the ID-based approach
>> stacking drivers are allowed to process copy bios asynchronously and it
>> is no longer necessary to activate merging for copy operations if
>> merging is disabled (QUEUE_FLAG_NOMERGES).
>>
> Single request, with bio merging approach:
> The current approach is to send a single request to driver,
> which contains both destination and source information inside separate bios.
> Do you have any different approach in mind ?

No. I did not propose to change how copy offload requests are sent to block
drivers (other than stacking drivers).

> If we want to proceed with this single request based approach,
> we need to merge the destination request with source BIOs at some point.
> a. We chose to do it via plug approach.
> b. Alternative I see is scheduler merging, but here we need some way to
> hold the request which has destination info, until source bio is also
> submitted.
> c. Is there any other way, which I am missing here ?

There are already exceptions in blk_mq_submit_bio() for zoned writes and for
flush bios. Another exception could be added for REQ_OP_COPY_* bios. I'm not
claiming that this is the best possible alternative. I'm only mentioning this
to show that there are alternatives.

> Copy ID approach:
> We see 3 possibilities here:
> 1. No merging: If we include copy-id in src and dst bio, the bio's will get
> submitted separately and reach to the driver as separate requests.
> How do we plan to form a copy command in driver ?
> 2. Merging BIOs:
> At some point we need to match the src bio with the dst bio and send this
> information together to the driver. The current implementation.
> This still does not solve the asynchronous submission problem, mentioned
> above.
> 3. Chaining BIOs:
> This won't work with stacked devices as there will be cloning, and hence
> chain won't be maintained.

I prefer option (2). Option (1) could result in a deadlock because the source
and destination bio both would have to be converted into a request before
these are sent to a request-based driver.

Thanks,

Bart.
Nitesh Shetty June 24, 2024, 10:44 a.m. UTC | #29
On 05/06/24 10:20AM, Christoph Hellwig wrote:
>On Tue, Jun 04, 2024 at 04:44:34AM -0700, Bart Van Assche wrote:
>> On 6/3/24 21:40, Christoph Hellwig wrote:
>>> There is no requirement to process them synchronously, there is just
>>> a requirement to preserve the order.  Note that my suggestion a few
>>> arounds ago also included a copy id to match them up.  If we don't
>>> need that I'm happy to leave it away.  If need it it to make stacking
>>> drivers' lifes easier that suggestion still stands.
>>
>> Including an ID in REQ_OP_COPY_DST and REQ_OP_COPY_SRC operations sounds
>> much better to me than abusing the merge infrastructure for combining
>> these two operations into a single request. With the ID-based approach
>> stacking drivers are allowed to process copy bios asynchronously and it
>> is no longer necessary to activate merging for copy operations if
>> merging is disabled (QUEUE_FLAG_NOMERGES).
>
>Again, we can decided on QUEUE_FLAG_NOMERGES per request type.  In fact
>I think we should not use it for discards as that just like copy
>is a very different kind of "merge".
>
>I'm in fact much more happy about avoiding the copy_id IFF we can.  It
>it a fair amout of extra overhead, so we should only add it if there
>is a real need for it

Christoph, Martin, Bart, Hannes, Damien

We have iterated over couple of designs for copy offload, but ended up
with changing them owing to some drawbacks. I would like to take your
opinion on how we can plumb copy offload in block and dm layer.

For reference, I have listed the approaches we have taken in the past.

a. Token/payload based approach:
1. Here we allocate a buffer/payload.
2. First source BIO is sent along with the buffer.
3. Once the buffer reaches driver, it is filled with the source LBA
and length and namespace info. And the request is completed.
4. Then destination BIO is sent with same buffer.
5. Once the buffer reaches driver, it retrieves the source information from
the BIO and forms a copy command and sends it down to device.

We received feedback that putting anything inside payload which is not
data, is not a good idea[1].


b. Plug based approach:
1. We take a plug.
2. Send a destination BIO, this forms a copy request and waits for source BIO
to arrive.
3. Send a source BIO, this merges with the copy request which was formed
by destination BIO.
4. We release the plug, then copy request reaches driver layer and forms
a copy command.

This approach won't work with stacked devices which has asynchronous
submission.
Overall taking plug and merging BIOs received not so good feedback from
community.

c. List/ctx based approach:
A new member is added to bio, bio_copy_ctx, which will a union with
bi_integrity. Idea is once a copy bio reaches blk_mq_submit_bio, it will
add the bio to this list.
1. Send the destination BIO, once this reaches blk_mq_submit_bio, this
will add the destination BIO to the list inside bi_copy_ctx and return
without forming any request.
2. Send source BIO, once this reaches blk_mq_submit_bio, this will
retrieve the destination BIO from bi_copy_ctx and form a request with
destination BIO and source BIO. After this request will be sent to
driver.

This work is still in POC phase[2]. But this approach makes lifetime
management of BIO complicated, especially during failure cases.

Thank You,
Nitesh Shetty

[1] https://lore.kernel.org/linux-block/20230605121732.28468-1-nj.shetty@samsung.com/
[2] https://github.com/SamsungDS/linux/tree/feat/co/v21/ctx
Bart Van Assche June 24, 2024, 4:25 p.m. UTC | #30
On 6/24/24 3:44 AM, Nitesh Shetty wrote:
> For reference, I have listed the approaches we have taken in the past.
> 
> a. Token/payload based approach:
> 1. Here we allocate a buffer/payload.
> 2. First source BIO is sent along with the buffer.
> 3. Once the buffer reaches driver, it is filled with the source LBA
> and length and namespace info. And the request is completed.
> 4. Then destination BIO is sent with same buffer.
> 5. Once the buffer reaches driver, it retrieves the source information from
> the BIO and forms a copy command and sends it down to device.
> 
> We received feedback that putting anything inside payload which is not
> data, is not a good idea[1].

A token-based approach (pairing copy_src and copy_dst based on a token)
is completely different from a payload-based approach (copy offload
parameters stored in the bio payload). From [1] (I agree with what has
been quoted): "In general every time we tried to come up with a request
payload that is not just data passed to the device it has been a
nightmare." [ ... ] "The only thing we'd need is a sequence number / idr
/ etc to find an input and output side match up, as long as we
stick to the proper namespace scope."

> c. List/ctx based approach:
> A new member is added to bio, bio_copy_ctx, which will a union with
> bi_integrity. Idea is once a copy bio reaches blk_mq_submit_bio, it will
> add the bio to this list.
> 1. Send the destination BIO, once this reaches blk_mq_submit_bio, this
> will add the destination BIO to the list inside bi_copy_ctx and return
> without forming any request.
> 2. Send source BIO, once this reaches blk_mq_submit_bio, this will
> retrieve the destination BIO from bi_copy_ctx and form a request with
> destination BIO and source BIO. After this request will be sent to
> driver.
> 
> This work is still in POC phase[2]. But this approach makes lifetime
> management of BIO complicated, especially during failure cases.

Associating src and dst operations by embedding a pointer to a third
data structure in struct bio is an implementation choice and is not the
only possibility for assocating src and dst operations. Hence, the
bio lifetime complexity mentioned above is not inherent to the list
based approach but is a result of the implementation choice made for
associating src and dst operations.

Has it been considered to combine the list-based approach for managing
unpaired copy operations with the token based approach for pairing copy
src and copy dst operations?

Thanks,

Bart.
Damien Le Moal June 24, 2024, 9:55 p.m. UTC | #31
On 2024/06/25 1:25, Bart Van Assche wrote:
> On 6/24/24 3:44 AM, Nitesh Shetty wrote:
>> For reference, I have listed the approaches we have taken in the past.
>>
>> a. Token/payload based approach:
>> 1. Here we allocate a buffer/payload.
>> 2. First source BIO is sent along with the buffer.
>> 3. Once the buffer reaches driver, it is filled with the source LBA
>> and length and namespace info. And the request is completed.
>> 4. Then destination BIO is sent with same buffer.
>> 5. Once the buffer reaches driver, it retrieves the source information from
>> the BIO and forms a copy command and sends it down to device.
>>
>> We received feedback that putting anything inside payload which is not
>> data, is not a good idea[1].
> 
> A token-based approach (pairing copy_src and copy_dst based on a token)
> is completely different from a payload-based approach (copy offload
> parameters stored in the bio payload). From [1] (I agree with what has
> been quoted): "In general every time we tried to come up with a request
> payload that is not just data passed to the device it has been a
> nightmare." [ ... ] "The only thing we'd need is a sequence number / idr
> / etc to find an input and output side match up, as long as we
> stick to the proper namespace scope."
> 
>> c. List/ctx based approach:
>> A new member is added to bio, bio_copy_ctx, which will a union with
>> bi_integrity. Idea is once a copy bio reaches blk_mq_submit_bio, it will
>> add the bio to this list.
>> 1. Send the destination BIO, once this reaches blk_mq_submit_bio, this
>> will add the destination BIO to the list inside bi_copy_ctx and return
>> without forming any request.
>> 2. Send source BIO, once this reaches blk_mq_submit_bio, this will
>> retrieve the destination BIO from bi_copy_ctx and form a request with
>> destination BIO and source BIO. After this request will be sent to
>> driver.
>>
>> This work is still in POC phase[2]. But this approach makes lifetime
>> management of BIO complicated, especially during failure cases.
> 
> Associating src and dst operations by embedding a pointer to a third
> data structure in struct bio is an implementation choice and is not the
> only possibility for assocating src and dst operations. Hence, the
> bio lifetime complexity mentioned above is not inherent to the list
> based approach but is a result of the implementation choice made for
> associating src and dst operations.
> 
> Has it been considered to combine the list-based approach for managing
> unpaired copy operations with the token based approach for pairing copy
> src and copy dst operations?

I am still a little confused as to why we need 2 BIOs, one for src and one for
dst... Is it because of the overly complex scsi extended copy support ?

Given that the main use case is copy offload for data within the same device,
using a single BIO which somehow can carry a list of LBA sources and a single
destination LBA would be far simpler and perfectly matching nvme simple copy and
ATA write gathered. And I think that this would also match the simplest case for
scsi extended copy as well.
Keith Busch June 24, 2024, 10:58 p.m. UTC | #32
On Mon, Jun 24, 2024 at 04:14:07PM +0530, Nitesh Shetty wrote:
> c. List/ctx based approach:
> A new member is added to bio, bio_copy_ctx, which will a union with
> bi_integrity. Idea is once a copy bio reaches blk_mq_submit_bio, it will
> add the bio to this list.

Is there a reason to tie this to CONFIG_BLK_DEV_INTEGRITY? Why not use
the bio_io_vec? There's no user data here, so that's unused, right?

> 1. Send the destination BIO, once this reaches blk_mq_submit_bio, this
> will add the destination BIO to the list inside bi_copy_ctx and return
> without forming any request.
> 2. Send source BIO, once this reaches blk_mq_submit_bio, this will
> retrieve the destination BIO from bi_copy_ctx and form a request with
> destination BIO and source BIO. After this request will be sent to
> driver.

Like Damien, I also don't see the point of the 2-bio requirement. Treat
it like discard, and drivers can allocate "special".
Bart Van Assche June 25, 2024, 6:18 p.m. UTC | #33
On 6/24/24 2:55 PM, Damien Le Moal wrote:
> I am still a little confused as to why we need 2 BIOs, one for src and one for
> dst... Is it because of the overly complex scsi extended copy support ?
> 
> Given that the main use case is copy offload for data within the same device,
> using a single BIO which somehow can carry a list of LBA sources and a single
> destination LBA would be far simpler and perfectly matching nvme simple copy and
> ATA write gathered. And I think that this would also match the simplest case for
> scsi extended copy as well.

Hi Damien,

What are the implications for the device mapper code if the copy source
and destination LBAs are encoded in the bio payload instead of in
bio->bi_sector?

Thanks,

Bart.
Damien Le Moal June 25, 2024, 9:18 p.m. UTC | #34
On 6/26/24 03:18, Bart Van Assche wrote:
> On 6/24/24 2:55 PM, Damien Le Moal wrote:
>> I am still a little confused as to why we need 2 BIOs, one for src and one for
>> dst... Is it because of the overly complex scsi extended copy support ?
>>
>> Given that the main use case is copy offload for data within the same device,
>> using a single BIO which somehow can carry a list of LBA sources and a single
>> destination LBA would be far simpler and perfectly matching nvme simple copy and
>> ATA write gathered. And I think that this would also match the simplest case for
>> scsi extended copy as well.
> 
> Hi Damien,
> 
> What are the implications for the device mapper code if the copy source
> and destination LBAs are encoded in the bio payload instead of in
> bio->bi_sector?

DM can deal with "abnormal" BIOs on its own. There is code for that.
See is_abnormal_io() and __process_abnormal_io(). Sure, that will need more code
compared to a bio sector+size based simple split, but I do not think it is a big
deal given the potential benefits of the offloading.
Christoph Hellwig June 26, 2024, 5:22 a.m. UTC | #35
On Wed, Jun 26, 2024 at 06:18:18AM +0900, Damien Le Moal wrote:
> 
> DM can deal with "abnormal" BIOs on its own. There is code for that.
> See is_abnormal_io() and __process_abnormal_io(). Sure, that will need more code
> compared to a bio sector+size based simple split, but I do not think it is a big
> deal given the potential benefits of the offloading.

It's not just dm.  You also need it in the partition remapping code
(mandatory), md (nice to have), etc.

And then we have the whole mess of what is in the payload for the I/O
stack vs what is in the payload for the on the wire protocol, which
will have different formatting and potentially also different sizes.
diff mbox series

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index ea44b13af9af..f18ee5f709c0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -122,6 +122,8 @@  static const char *const blk_op_name[] = {
 	REQ_OP_NAME(ZONE_FINISH),
 	REQ_OP_NAME(ZONE_APPEND),
 	REQ_OP_NAME(WRITE_ZEROES),
+	REQ_OP_NAME(COPY_SRC),
+	REQ_OP_NAME(COPY_DST),
 	REQ_OP_NAME(DRV_IN),
 	REQ_OP_NAME(DRV_OUT),
 };
@@ -838,6 +840,11 @@  void submit_bio_noacct(struct bio *bio)
 		 * requests.
 		 */
 		fallthrough;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		if (!q->limits.max_copy_sectors)
+			goto not_supported;
+		break;
 	default:
 		goto not_supported;
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8534c35e0497..f8dc48a03379 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -154,6 +154,20 @@  static struct bio *bio_split_write_zeroes(struct bio *bio,
 	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
 }
 
+static struct bio *bio_split_copy(struct bio *bio,
+				  const struct queue_limits *lim,
+				  unsigned int *nsegs)
+{
+	*nsegs = 1;
+	if (bio_sectors(bio) <= lim->max_copy_sectors)
+		return NULL;
+	/*
+	 * We don't support splitting for a copy bio. End it with EIO if
+	 * splitting is required and return an error pointer.
+	 */
+	return ERR_PTR(-EIO);
+}
+
 /*
  * Return the maximum number of sectors from the start of a bio that may be
  * submitted as a single request to a block device. If enough sectors remain,
@@ -362,6 +376,12 @@  struct bio *__bio_split_to_limits(struct bio *bio,
 	case REQ_OP_WRITE_ZEROES:
 		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
 		break;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		split = bio_split_copy(bio, lim, nr_segs);
+		if (IS_ERR(split))
+			return NULL;
+		break;
 	default:
 		split = bio_split_rw(bio, lim, nr_segs, bs,
 				get_max_io_size(bio, lim) << SECTOR_SHIFT);
@@ -925,6 +945,9 @@  bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (!rq_mergeable(rq) || !bio_mergeable(bio))
 		return false;
 
+	if (blk_copy_offload_mergable(rq, bio))
+		return true;
+
 	if (req_op(rq) != bio_op(bio))
 		return false;
 
@@ -958,6 +981,8 @@  enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
 {
 	if (blk_discard_mergable(rq))
 		return ELEVATOR_DISCARD_MERGE;
+	else if (blk_copy_offload_mergable(rq, bio))
+		return ELEVATOR_COPY_OFFLOAD_MERGE;
 	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
 		return ELEVATOR_BACK_MERGE;
 	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
@@ -1065,6 +1090,20 @@  static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
 	return BIO_MERGE_FAILED;
 }
 
+static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
+							    struct bio *bio)
+{
+	if (req->__data_len != bio->bi_iter.bi_size)
+		return BIO_MERGE_FAILED;
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->nr_phys_segments++;
+	req->__data_len += bio->bi_iter.bi_size;
+
+	return BIO_MERGE_OK;
+}
+
 static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
 						   struct request *rq,
 						   struct bio *bio,
@@ -1085,6 +1124,8 @@  static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
 		break;
 	case ELEVATOR_DISCARD_MERGE:
 		return bio_attempt_discard_merge(q, rq, bio);
+	case ELEVATOR_COPY_OFFLOAD_MERGE:
+		return bio_attempt_copy_offload_merge(rq, bio);
 	default:
 		return BIO_MERGE_NONE;
 	}
diff --git a/block/blk.h b/block/blk.h
index 189bc25beb50..6528a2779b84 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -174,6 +174,20 @@  static inline bool blk_discard_mergable(struct request *req)
 	return false;
 }
 
+/*
+ * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
+ * operation by taking a plug.
+ * Initially DST bio is sent which forms a request and
+ * waits for SRC bio to arrive. Once SRC bio arrives
+ * we merge it and send request down to driver.
+ */
+static inline bool blk_copy_offload_mergable(struct request *req,
+					     struct bio *bio)
+{
+	return (req_op(req) == REQ_OP_COPY_DST &&
+		bio_op(bio) == REQ_OP_COPY_SRC);
+}
+
 static inline unsigned int blk_rq_get_max_segments(struct request *rq)
 {
 	if (req_op(rq) == REQ_OP_DISCARD)
@@ -323,6 +337,8 @@  static inline bool bio_may_exceed_limits(struct bio *bio,
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
 		return true; /* non-trivial splitting decisions */
 	default:
 		break;
diff --git a/block/elevator.h b/block/elevator.h
index e9a050a96e53..c7a45c1f4156 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -18,6 +18,7 @@  enum elv_merge {
 	ELEVATOR_FRONT_MERGE	= 1,
 	ELEVATOR_BACK_MERGE	= 2,
 	ELEVATOR_DISCARD_MERGE	= 3,
+	ELEVATOR_COPY_OFFLOAD_MERGE	= 4,
 };
 
 struct blk_mq_alloc_data;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d5379548d684..528ef22dd65b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -53,11 +53,7 @@  static inline unsigned int bio_max_segs(unsigned int nr_segs)
  */
 static inline bool bio_has_data(struct bio *bio)
 {
-	if (bio &&
-	    bio->bi_iter.bi_size &&
-	    bio_op(bio) != REQ_OP_DISCARD &&
-	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
-	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
+	if (bio && (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE))
 		return true;
 
 	return false;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 781c4500491b..7f692bade271 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -342,6 +342,10 @@  enum req_op {
 	/* reset all the zone present on the device */
 	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
 
+	/* copy offload src and dst operation */
+	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
+	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
+
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
 	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
@@ -430,6 +434,12 @@  static inline bool op_is_write(blk_opf_t op)
 	return !!(op & (__force blk_opf_t)1);
 }
 
+static inline bool op_is_copy(blk_opf_t op)
+{
+	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
+		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
+}
+
 /*
  * Check if the bio or request is one that needs special treatment in the
  * flush state machine.