
[v20,02/12] Add infrastructure for copy offload in block and request layer.

Message ID 20240520102033.9361-3-nj.shetty@samsung.com (mailing list archive)
State New
Series [v20,01/12] block: Introduce queue limits and sysfs for copy-offload support

Commit Message

Nitesh Shetty May 20, 2024, 10:20 a.m. UTC
We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
Since copy is a composite operation involving src and dst sectors/lba,
each needs to be represented by a separate bio to make it compatible
with device mapper.
We expect caller to take a plug and send bio with destination information,
followed by bio with source information.
Once the dst bio arrives we form a request and wait for source
bio. Upon arrival of source bio we merge these two bio's and send
corresponding request down to device driver.
Merging non copy offload bio is avoided by checking for copy specific
opcodes in merge function.

Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 block/blk-core.c          |  7 +++++++
 block/blk-merge.c         | 41 +++++++++++++++++++++++++++++++++++++++
 block/blk.h               | 16 +++++++++++++++
 block/elevator.h          |  1 +
 include/linux/bio.h       |  6 +-----
 include/linux/blk_types.h | 10 ++++++++++
 6 files changed, 76 insertions(+), 5 deletions(-)
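
For illustration, a minimal sketch of the submission sequence described
above (a hedged sketch only: dst_bio and src_bio are assumed variables,
and blkdev_copy_offload, introduced later in this series, is expected to
do this internally):

	struct blk_plug plug;

	blk_start_plug(&plug);
	submit_bio(dst_bio);	/* REQ_OP_COPY_DST: forms the request */
	submit_bio(src_bio);	/* REQ_OP_COPY_SRC: merged into that request */
	blk_finish_plug(&plug);	/* the combined request reaches the driver */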

Comments

Damien Le Moal May 20, 2024, 3 p.m. UTC | #1
On 2024/05/20 12:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.

Why ? The beginning of the sentence isn't justification enough for the two new
operation codes ? The 2 sentences should be reversed for easier reading:
justification first naturally leads to the reader understanding why the codes
are needed.

Also: s/opcode/operations


> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.

expect ? Plugging is optional. Does copy offload require it ? Please clarify this.

> Once the dst bio arrives we form a request and wait for source

arrives ? You mean "is submitted" ?

s/and wait for/and wait for the

> bio. Upon arrival of source bio we merge these two bio's and send

s/arrival/submission ?

s/of/of the
s/bio's/BIOs
s/and send/and send the
s/down to/down to the

> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.

Super unclear... What are you trying to say here ? That merging copy offload
BIOs with other BIOs is not allowed ? That is already handled. Only BIOs &
requests with the same operation can be merged. The code below also suggests
that you allow merging copy offloads... So I really do not understand this.

> 
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> ---
>  block/blk-core.c          |  7 +++++++
>  block/blk-merge.c         | 41 +++++++++++++++++++++++++++++++++++++++
>  block/blk.h               | 16 +++++++++++++++
>  block/elevator.h          |  1 +
>  include/linux/bio.h       |  6 +-----
>  include/linux/blk_types.h | 10 ++++++++++
>  6 files changed, 76 insertions(+), 5 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index ea44b13af9af..f18ee5f709c0 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -122,6 +122,8 @@ static const char *const blk_op_name[] = {
>  	REQ_OP_NAME(ZONE_FINISH),
>  	REQ_OP_NAME(ZONE_APPEND),
>  	REQ_OP_NAME(WRITE_ZEROES),
> +	REQ_OP_NAME(COPY_SRC),
> +	REQ_OP_NAME(COPY_DST),
>  	REQ_OP_NAME(DRV_IN),
>  	REQ_OP_NAME(DRV_OUT),
>  };
> @@ -838,6 +840,11 @@ void submit_bio_noacct(struct bio *bio)
>  		 * requests.
>  		 */
>  		fallthrough;
> +	case REQ_OP_COPY_SRC:
> +	case REQ_OP_COPY_DST:
> +		if (!q->limits.max_copy_sectors)
> +			goto not_supported;
> +		break;
>  	default:
>  		goto not_supported;
>  	}
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 8534c35e0497..f8dc48a03379 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -154,6 +154,20 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
>  	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
>  }
>  
> +static struct bio *bio_split_copy(struct bio *bio,
> +				  const struct queue_limits *lim,
> +				  unsigned int *nsegs)
> +{
> +	*nsegs = 1;
> +	if (bio_sectors(bio) <= lim->max_copy_sectors)
> +		return NULL;
> +	/*
> +	 * We don't support splitting for a copy bio. End it with EIO if
> +	 * splitting is required and return an error pointer.
> +	 */
> +	return ERR_PTR(-EIO);
> +}

Hmm... Why not check that the copy request is small enough and will not be split
when it is submitted ? Something like blk_check_zone_append() does with
REQ_OP_ZONE_APPEND ? So adding a blk_check_copy_offload(). That would also
include the limits check from the previous hunk.
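
Something like the following, perhaps (a sketch only; the function name,
placement and return codes are assumptions, modeled on
blk_check_zone_append()):

	static blk_status_t blk_check_copy_offload(struct request_queue *q,
						   struct bio *bio)
	{
		if (!q->limits.max_copy_sectors)
			return BLK_STS_NOTSUPP;
		/* reject over-sized copy bios at submission; never split them */
		if (bio_sectors(bio) > q->limits.max_copy_sectors)
			return BLK_STS_IOERR;
		return BLK_STS_OK;
	}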

> +
>  /*
>   * Return the maximum number of sectors from the start of a bio that may be
>   * submitted as a single request to a block device. If enough sectors remain,
> @@ -362,6 +376,12 @@ struct bio *__bio_split_to_limits(struct bio *bio,
>  	case REQ_OP_WRITE_ZEROES:
>  		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
>  		break;
> +	case REQ_OP_COPY_SRC:
> +	case REQ_OP_COPY_DST:
> +		split = bio_split_copy(bio, lim, nr_segs);
> +		if (IS_ERR(split))
> +			return NULL;
> +		break;

See above.

>  	default:
>  		split = bio_split_rw(bio, lim, nr_segs, bs,
>  				get_max_io_size(bio, lim) << SECTOR_SHIFT);
> @@ -925,6 +945,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
>  	if (!rq_mergeable(rq) || !bio_mergeable(bio))
>  		return false;
>  
> +	if (blk_copy_offload_mergable(rq, bio))
> +		return true;
> +
>  	if (req_op(rq) != bio_op(bio))
>  		return false;
>  
> @@ -958,6 +981,8 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
>  {
>  	if (blk_discard_mergable(rq))
>  		return ELEVATOR_DISCARD_MERGE;
> +	else if (blk_copy_offload_mergable(rq, bio))
> +		return ELEVATOR_COPY_OFFLOAD_MERGE;
>  	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
>  		return ELEVATOR_BACK_MERGE;
>  	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
> @@ -1065,6 +1090,20 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
>  	return BIO_MERGE_FAILED;
>  }
>  
> +static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
> +							    struct bio *bio)
> +{
> +	if (req->__data_len != bio->bi_iter.bi_size)
> +		return BIO_MERGE_FAILED;
> +
> +	req->biotail->bi_next = bio;
> +	req->biotail = bio;
> +	req->nr_phys_segments++;
> +	req->__data_len += bio->bi_iter.bi_size;

Arg... You seem to be assuming that the source BIO always comes right after the
destination request... What if copy offloads are being concurrently issued ?
Shouldn't you check somehow that the pair is a match ? Or are you relying on the
per-context plugging which prevents that from happening in the first place ? But
that would assume that you never ever sleep trying to allocate the source BIO
after the destination BIO/request are prepared and plugged.

> +
> +	return BIO_MERGE_OK;
> +}
> +
>  static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>  						   struct request *rq,
>  						   struct bio *bio,
> @@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>  		break;
>  	case ELEVATOR_DISCARD_MERGE:
>  		return bio_attempt_discard_merge(q, rq, bio);
> +	case ELEVATOR_COPY_OFFLOAD_MERGE:
> +		return bio_attempt_copy_offload_merge(rq, bio);
>  	default:
>  		return BIO_MERGE_NONE;
>  	}
> diff --git a/block/blk.h b/block/blk.h
> index 189bc25beb50..6528a2779b84 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -174,6 +174,20 @@ static inline bool blk_discard_mergable(struct request *req)
>  	return false;
>  }
>  
> +/*
> + * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
> + * operation by taking a plug.
> + * Initially DST bio is sent which forms a request and
> + * waits for SRC bio to arrive. Once SRC bio arrives
> + * we merge it and send request down to driver.
> + */
> +static inline bool blk_copy_offload_mergable(struct request *req,
> +					     struct bio *bio)
> +{
> +	return (req_op(req) == REQ_OP_COPY_DST &&
> +		bio_op(bio) == REQ_OP_COPY_SRC);
> +}

This function is really not needed at all (used in one place only).

> +
>  static inline unsigned int blk_rq_get_max_segments(struct request *rq)
>  {
>  	if (req_op(rq) == REQ_OP_DISCARD)
> @@ -323,6 +337,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
>  	case REQ_OP_DISCARD:
>  	case REQ_OP_SECURE_ERASE:
>  	case REQ_OP_WRITE_ZEROES:
> +	case REQ_OP_COPY_SRC:
> +	case REQ_OP_COPY_DST:
>  		return true; /* non-trivial splitting decisions */

See above. Limits should be checked on submission.

>  	default:
>  		break;
> diff --git a/block/elevator.h b/block/elevator.h
> index e9a050a96e53..c7a45c1f4156 100644
> --- a/block/elevator.h
> +++ b/block/elevator.h
> @@ -18,6 +18,7 @@ enum elv_merge {
>  	ELEVATOR_FRONT_MERGE	= 1,
>  	ELEVATOR_BACK_MERGE	= 2,
>  	ELEVATOR_DISCARD_MERGE	= 3,
> +	ELEVATOR_COPY_OFFLOAD_MERGE	= 4,
>  };
>  
>  struct blk_mq_alloc_data;
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index d5379548d684..528ef22dd65b 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -53,11 +53,7 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs)
>   */
>  static inline bool bio_has_data(struct bio *bio)
>  {
> -	if (bio &&
> -	    bio->bi_iter.bi_size &&
> -	    bio_op(bio) != REQ_OP_DISCARD &&
> -	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
> -	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
> +	if (bio && (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE))
>  		return true;

This change seems completely broken and out of place. This would cause a return
of false for zone append operations.

>  
>  	return false;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 781c4500491b..7f692bade271 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -342,6 +342,10 @@ enum req_op {
>  	/* reset all the zone present on the device */
>  	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
>  
> +	/* copy offload src and dst operation */

s/src/source
s/dst/destination
s/operation/operations

> +	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
> +	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
> +
>  	/* Driver private requests */
>  	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
>  	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
> @@ -430,6 +434,12 @@ static inline bool op_is_write(blk_opf_t op)
>  	return !!(op & (__force blk_opf_t)1);
>  }
>  
> +static inline bool op_is_copy(blk_opf_t op)
> +{
> +	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
> +		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
> +}

Maybe use a switch here to avoid the double masking of op ?
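
I.e., something like this (sketch):

	static inline bool op_is_copy(blk_opf_t op)
	{
		switch (op & REQ_OP_MASK) {
		case REQ_OP_COPY_SRC:
		case REQ_OP_COPY_DST:
			return true;
		default:
			return false;
		}
	}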

> +
>  /*
>   * Check if the bio or request is one that needs special treatment in the
>   * flush state machine.
Bart Van Assche May 20, 2024, 11 p.m. UTC | #2
On 5/20/24 03:20, Nitesh Shetty wrote:
> Upon arrival of source bio we merge these two bio's and send
> corresponding request down to device driver.

bios with different operation types must not be merged.

> +static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
> +							    struct bio *bio)
> +{
> +	if (req->__data_len != bio->bi_iter.bi_size)
> +		return BIO_MERGE_FAILED;
> +
> +	req->biotail->bi_next = bio;
> +	req->biotail = bio;
> +	req->nr_phys_segments++;
> +	req->__data_len += bio->bi_iter.bi_size;
> +
> +	return BIO_MERGE_OK;
> +}

This function appends a bio to a request. Hence, the name of this function is
wrong.

> @@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>   		break;
>   	case ELEVATOR_DISCARD_MERGE:
>   		return bio_attempt_discard_merge(q, rq, bio);
> +	case ELEVATOR_COPY_OFFLOAD_MERGE:
> +		return bio_attempt_copy_offload_merge(rq, bio);
>   	default:
>   		return BIO_MERGE_NONE;
>   	}

Is any code added in this patch series that causes an I/O scheduler to return
ELEVATOR_COPY_OFFLOAD_MERGE?

> +static inline bool blk_copy_offload_mergable(struct request *req,
> +					     struct bio *bio)
> +{
> +	return (req_op(req) == REQ_OP_COPY_DST &&
> +		bio_op(bio) == REQ_OP_COPY_SRC);
> +}

bios with different operation types must not be merged. Please rename this function.

> +static inline bool op_is_copy(blk_opf_t op)
> +{
> +	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
> +		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
> +}

The above function is not used in this patch. Please introduce new functions in the
patch in which these are used for the first time.

Thanks,

Bart.
Hannes Reinecke May 21, 2024, 7:01 a.m. UTC | #3
On 5/20/24 12:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.
> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.
> Once the dst bio arrives we form a request and wait for source
> bio. Upon arrival of source bio we merge these two bio's and send
> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.
> 
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> ---
>   block/blk-core.c          |  7 +++++++
>   block/blk-merge.c         | 41 +++++++++++++++++++++++++++++++++++++++
>   block/blk.h               | 16 +++++++++++++++
>   block/elevator.h          |  1 +
>   include/linux/bio.h       |  6 +-----
>   include/linux/blk_types.h | 10 ++++++++++
>   6 files changed, 76 insertions(+), 5 deletions(-)
> 
I am a bit unsure about leveraging 'merge' here. As Bart pointed out, 
this is arguably a misuse of the 'merge' functionality as we don't
actually merge bios, but rather use the information from these bios to
form the actual request.
Wouldn't it be better to use bio_chain here, and send out the eventual
request from the end_io function of the bio chain?
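
A rough sketch of that idea (illustrative only; dst_bio and src_bio are
assumed variables, and the request formation in the completion path is
elided):

	/*
	 * bio_chain(a, b) makes b's completion wait for a's, so the pair
	 * completes as one unit; the combined copy request would then be
	 * issued from the final bi_end_io of the chain.
	 */
	bio_chain(dst_bio, src_bio);
	submit_bio(dst_bio);
	submit_bio(src_bio);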

Cheers,

Hannes
Nitesh Shetty May 21, 2024, 10:50 a.m. UTC | #4
On 20/05/24 05:00PM, Damien Le Moal wrote:
>On 2024/05/20 12:20, Nitesh Shetty wrote:
>> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>> Since copy is a composite operation involving src and dst sectors/lba,
>> each needs to be represented by a separate bio to make it compatible
>> with device mapper.
>
>Why ? The beginning of the sentence isn't justification enough for the two new
>operation codes ? The 2 sentences should be reversed for easier reading:
>justification first naturally leads to the reader understanding why the codes
>are needed.
>
>Also: s/opcode/operations
>
>
Acked

>> We expect caller to take a plug and send bio with destination information,
>> followed by bio with source information.
>
>expect ? Plugging is optional. Does copy offload require it ? Please clarify this.
>
"caller" here refers to blkdev_copy_offload. The caller of blkdev_copy_offload
does not need to take the plug. We should have made this clear.
Sorry for the confusion, we will update the description to reflect this
in the next version.

>> Once the dst bio arrives we form a request and wait for source
>
>arrives ? You mean "is submitted" ?
>
>s/and wait for/and wait for the
>
>> bio. Upon arrival of source bio we merge these two bio's and send
>
>s/arrival/submission ?
>
>s/of/of the
>s/bio's/BIOs
>s/and send/and send the
>s/down to/down to the
>
Acked for the above.

>> corresponding request down to device driver.
>> Merging non copy offload bio is avoided by checking for copy specific
>> opcodes in merge function.
>
>Super unclear... What are you trying to say here ? That merging copy offload
>BIOs with other BIOs is not allowed ? That is already handled. Only BIOs &
>requests with the same operation can be merged. The code below also suggests
>that you allow merging copy offloads... So I really do not understand this.
>
You are right, only BIOs & requests with same operation can be merged.
We are not merging copy offloads. We are only merging src and dst BIO to
form a copy offload request.
We will remove this in next version.

>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>> index 8534c35e0497..f8dc48a03379 100644
>> --- a/block/blk-merge.c
>> +++ b/block/blk-merge.c
>> @@ -154,6 +154,20 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
>>  	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
>>  }
>>
>> +static struct bio *bio_split_copy(struct bio *bio,
>> +				  const struct queue_limits *lim,
>> +				  unsigned int *nsegs)
>> +{
>> +	*nsegs = 1;
>> +	if (bio_sectors(bio) <= lim->max_copy_sectors)
>> +		return NULL;
>> +	/*
>> +	 * We don't support splitting for a copy bio. End it with EIO if
>> +	 * splitting is required and return an error pointer.
>> +	 */
>> +	return ERR_PTR(-EIO);
>> +}
>
>Hmm... Why not check that the copy request is small enough and will not be split
>when it is submitted ? Something like blk_check_zone_append() does with
>REQ_OP_ZONE_APPEND ? So adding a blk_check_copy_offload(). That would also
>include the limits check from the previous hunk.
Yes, that could be one way, but we followed an approach similar to discard.

>
>> +
>>  /*
>>   * Return the maximum number of sectors from the start of a bio that may be
>>   * submitted as a single request to a block device. If enough sectors remain,
>> @@ -362,6 +376,12 @@ struct bio *__bio_split_to_limits(struct bio *bio,
>>  	case REQ_OP_WRITE_ZEROES:
>>  		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
>>  		break;
>> +	case REQ_OP_COPY_SRC:
>> +	case REQ_OP_COPY_DST:
>> +		split = bio_split_copy(bio, lim, nr_segs);
>> +		if (IS_ERR(split))
>> +			return NULL;
>> +		break;
>
>See above.
>
>>  	default:
>>  		split = bio_split_rw(bio, lim, nr_segs, bs,
>>  				get_max_io_size(bio, lim) << SECTOR_SHIFT);
>> @@ -925,6 +945,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
>>  	if (!rq_mergeable(rq) || !bio_mergeable(bio))
>>  		return false;
>>
>> +	if (blk_copy_offload_mergable(rq, bio))
>> +		return true;
>> +
>>  	if (req_op(rq) != bio_op(bio))
>>  		return false;
>>
>> @@ -958,6 +981,8 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
>>  {
>>  	if (blk_discard_mergable(rq))
>>  		return ELEVATOR_DISCARD_MERGE;
>> +	else if (blk_copy_offload_mergable(rq, bio))
>> +		return ELEVATOR_COPY_OFFLOAD_MERGE;
>>  	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
>>  		return ELEVATOR_BACK_MERGE;
>>  	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
>> @@ -1065,6 +1090,20 @@ static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
>>  	return BIO_MERGE_FAILED;
>>  }
>>
>> +static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
>> +							    struct bio *bio)
>> +{
>> +	if (req->__data_len != bio->bi_iter.bi_size)
>> +		return BIO_MERGE_FAILED;
>> +
>> +	req->biotail->bi_next = bio;
>> +	req->biotail = bio;
>> +	req->nr_phys_segments++;
>> +	req->__data_len += bio->bi_iter.bi_size;
>
>Arg... You seem to be assuming that the source BIO always comes right after the
>destination request... What if copy offloads are being concurrently issued ?
>Shouldn't you check somehow that the pair is a match ? Or are you relying on the
>per-context plugging which prevents that from happening in the first place ? But
>that would assume that you never ever sleep trying to allocate the source BIO
>after the destination BIO/request are prepared and plugged.
>
Yes, we rely on per-context plugging for copy to work.
In case, for any reason, we are not able to merge the src and dst BIOs
and the request reaches the driver layer with only one of the src or
dst BIOs, we fail this request in the driver layer.
This approach simplifies the overall copy plumbing.

>> +
>> +	return BIO_MERGE_OK;
>> +}
>> +
>>  static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>>  						   struct request *rq,
>>  						   struct bio *bio,
>> @@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>>  		break;
>>  	case ELEVATOR_DISCARD_MERGE:
>>  		return bio_attempt_discard_merge(q, rq, bio);
>> +	case ELEVATOR_COPY_OFFLOAD_MERGE:
>> +		return bio_attempt_copy_offload_merge(rq, bio);
>>  	default:
>>  		return BIO_MERGE_NONE;
>>  	}
>> diff --git a/block/blk.h b/block/blk.h
>> index 189bc25beb50..6528a2779b84 100644
>> --- a/block/blk.h
>> +++ b/block/blk.h
>> @@ -174,6 +174,20 @@ static inline bool blk_discard_mergable(struct request *req)
>>  	return false;
>>  }
>>
>> +/*
>> + * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
>> + * operation by taking a plug.
>> + * Initially DST bio is sent which forms a request and
>> + * waits for SRC bio to arrive. Once SRC bio arrives
>> + * we merge it and send request down to driver.
>> + */
>> +static inline bool blk_copy_offload_mergable(struct request *req,
>> +					     struct bio *bio)
>> +{
>> +	return (req_op(req) == REQ_OP_COPY_DST &&
>> +		bio_op(bio) == REQ_OP_COPY_SRC);
>> +}
>
>This function is really not needed at all (used in one place only).
>
This is used in two places and we felt it provides better readability,
similar to discard.

>> +
>>  static inline unsigned int blk_rq_get_max_segments(struct request *rq)
>>  {
>>  	if (req_op(rq) == REQ_OP_DISCARD)
>> @@ -323,6 +337,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
>>  	case REQ_OP_DISCARD:
>>  	case REQ_OP_SECURE_ERASE:
>>  	case REQ_OP_WRITE_ZEROES:
>> +	case REQ_OP_COPY_SRC:
>> +	case REQ_OP_COPY_DST:
>>  		return true; /* non-trivial splitting decisions */
>
>See above. Limits should be checked on submission.
>
>>  	default:
>>  		break;
>> diff --git a/block/elevator.h b/block/elevator.h
>> index e9a050a96e53..c7a45c1f4156 100644
>> --- a/block/elevator.h
>> +++ b/block/elevator.h
>> @@ -18,6 +18,7 @@ enum elv_merge {
>>  	ELEVATOR_FRONT_MERGE	= 1,
>>  	ELEVATOR_BACK_MERGE	= 2,
>>  	ELEVATOR_DISCARD_MERGE	= 3,
>> +	ELEVATOR_COPY_OFFLOAD_MERGE	= 4,
>>  };
>>
>>  struct blk_mq_alloc_data;
>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>> index d5379548d684..528ef22dd65b 100644
>> --- a/include/linux/bio.h
>> +++ b/include/linux/bio.h
>> @@ -53,11 +53,7 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs)
>>   */
>>  static inline bool bio_has_data(struct bio *bio)
>>  {
>> -	if (bio &&
>> -	    bio->bi_iter.bi_size &&
>> -	    bio_op(bio) != REQ_OP_DISCARD &&
>> -	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
>> -	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
>> +	if (bio && (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE))
>>  		return true;
>
>This change seems completely broken and out of place. This would cause a return
>of false for zone append operations.
>
We changed this based on previous review comments [1].
The idea is to replace the negative checks with a positive check.
But we did miss adding ZONE_APPEND here.
We will add a ZONE_APPEND check in the next version.

[1] https://lore.kernel.org/linux-block/20230720074256.GA5042@lst.de/
>>
>>  	return false;
>> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
>> index 781c4500491b..7f692bade271 100644
>> --- a/include/linux/blk_types.h
>> +++ b/include/linux/blk_types.h
>> @@ -342,6 +342,10 @@ enum req_op {
>>  	/* reset all the zone present on the device */
>>  	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
>>
>> +	/* copy offload src and dst operation */
>
>s/src/source
>s/dst/destination
>s/operation/operations
>
Acked

>> +	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
>> +	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
>> +
>>  	/* Driver private requests */
>>  	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
>>  	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
>> @@ -430,6 +434,12 @@ static inline bool op_is_write(blk_opf_t op)
>>  	return !!(op & (__force blk_opf_t)1);
>>  }
>>
>> +static inline bool op_is_copy(blk_opf_t op)
>> +{
>> +	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
>> +		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
>> +}
>
>Maybe use a switch here to avoid the double masking of op ?
>
Acked

Thank you,
Nitesh Shetty
Nitesh Shetty May 21, 2024, 11:17 a.m. UTC | #5
On 20/05/24 04:00PM, Bart Van Assche wrote:
>On 5/20/24 03:20, Nitesh Shetty wrote:
>>Upon arrival of source bio we merge these two bio's and send
>>corresponding request down to device driver.
>
>bios with different operation types must not be merged.
>
Copy is a composite operation combining two operations, a read and a
write, so we chose two operation types to reflect that.

>>+static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
>>+							    struct bio *bio)
>>+{
>>+	if (req->__data_len != bio->bi_iter.bi_size)
>>+		return BIO_MERGE_FAILED;
>>+
>>+	req->biotail->bi_next = bio;
>>+	req->biotail = bio;
>>+	req->nr_phys_segments++;
>>+	req->__data_len += bio->bi_iter.bi_size;
>>+
>>+	return BIO_MERGE_OK;
>>+}
>
>This function appends a bio to a request. Hence, the name of this function is
>wrong.
>
We followed the naming convention from discard (bio_attempt_discard_merge),
which does a similar thing.
But we are open to renaming if the overall community also feels the same.

>>@@ -1085,6 +1124,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
>>  		break;
>>  	case ELEVATOR_DISCARD_MERGE:
>>  		return bio_attempt_discard_merge(q, rq, bio);
>>+	case ELEVATOR_COPY_OFFLOAD_MERGE:
>>+		return bio_attempt_copy_offload_merge(rq, bio);
>>  	default:
>>  		return BIO_MERGE_NONE;
>>  	}
>
>Is any code added in this patch series that causes an I/O scheduler to return
>ELEVATOR_COPY_OFFLOAD_MERGE?
>
Yes, blk_try_merge returns ELEVATOR_COPY_OFFLOAD_MERGE.

>>+static inline bool blk_copy_offload_mergable(struct request *req,
>>+					     struct bio *bio)
>>+{
>>+	return (req_op(req) == REQ_OP_COPY_DST &&
>>+		bio_op(bio) == REQ_OP_COPY_SRC);
>>+}
>
>bios with different operation types must not be merged. Please rename this function.
>
As described above, we need two different opcodes.
As for the function renaming, we followed discard's naming, but we are
open to any suggestion.

>>+static inline bool op_is_copy(blk_opf_t op)
>>+{
>>+	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
>>+		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
>>+}
>
>The above function is not used in this patch. Please introduce new functions in the
>patch in which these are used for the first time.
>
Acked

Thank You
Nitesh Shetty
Bart Van Assche May 22, 2024, 5:58 p.m. UTC | #6
On 5/21/24 04:17, Nitesh Shetty wrote:
> On 20/05/24 04:00PM, Bart Van Assche wrote:
>> On 5/20/24 03:20, Nitesh Shetty wrote:
>>> +static inline bool blk_copy_offload_mergable(struct request *req,
>>> +                         struct bio *bio)
>>> +{
>>> +    return (req_op(req) == REQ_OP_COPY_DST &&
>>> +        bio_op(bio) == REQ_OP_COPY_SRC);
>>> +}
>>
>> bios with different operation types must not be merged. Please rename this function.
>>
> As for the function renaming, we followed discard's naming, but we are
> open to any suggestion.

req_attempt_discard_merge() checks whether two REQ_OP_DISCARD bios can be merged.
The above function checks something else, namely whether REQ_OP_COPY_DST and
REQ_OP_COPY_SRC can be combined into a copy offload operation. Hence my request
not to use the verb "merge" for combining REQ_OP_COPY_SRC and REQ_OP_COPY_DST
operations.

Thanks,

Bart.
Bart Van Assche May 22, 2024, 6:05 p.m. UTC | #7
On 5/20/24 03:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.
> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.
> Once the dst bio arrives we form a request and wait for source
> bio. Upon arrival of source bio we merge these two bio's and send
> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.

Plugs are per task. Can the following happen?
* Task A calls blk_start_plug()
* Task B calls blk_start_plug()
* Task A submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
* Task B submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
* Task A calls blk_finish_plug()
* Task B calls blk_finish_plug()
* The REQ_OP_COPY_DST bio from task A and the REQ_OP_COPY_SRC bio from
   task B are combined into a single request.
* The REQ_OP_COPY_DST bio from task B and the REQ_OP_COPY_SRC bio from
   task A are combined into a single request.

Thanks,

Bart.
Nitesh Shetty May 23, 2024, 11:34 a.m. UTC | #8
On 22/05/24 11:05AM, Bart Van Assche wrote:
>On 5/20/24 03:20, Nitesh Shetty wrote:
>>We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>Since copy is a composite operation involving src and dst sectors/lba,
>>each needs to be represented by a separate bio to make it compatible
>>with device mapper.
>>We expect caller to take a plug and send bio with destination information,
>>followed by bio with source information.
>>Once the dst bio arrives we form a request and wait for source
>>bio. Upon arrival of source bio we merge these two bio's and send
>>corresponding request down to device driver.
>>Merging non copy offload bio is avoided by checking for copy specific
>>opcodes in merge function.
>
>Plugs are per task. Can the following happen?
We rely on per-context plugging to avoid this.
>* Task A calls blk_start_plug()
>* Task B calls blk_start_plug()
>* Task A submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
Let's say this forms request A, which is stored in plug A.
>* Task B submits a REQ_OP_COPY_DST bio and a REQ_OP_COPY_SRC bio.
Let's say this forms request B, which is stored in plug B.
>* Task A calls blk_finish_plug()
>* Task B calls blk_finish_plug()
>* The REQ_OP_COPY_DST bio from task A and the REQ_OP_COPY_SRC bio from
>  task B are combined into a single request.
Here task A picks plug A, and hence request A.
>* The REQ_OP_COPY_DST bio from task B and the REQ_OP_COPY_SRC bio from
>  task A are combined into a single request.
Same as above: request B.

So we expect this case not to happen.

Thank you,
Nitesh Shetty
Nitesh Shetty May 24, 2024, 6:54 a.m. UTC | #9
On 21/05/24 09:01AM, Hannes Reinecke wrote:
>On 5/20/24 12:20, Nitesh Shetty wrote:
>>We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>Since copy is a composite operation involving src and dst sectors/lba,
>>each needs to be represented by a separate bio to make it compatible
>>with device mapper.
>>We expect caller to take a plug and send bio with destination information,
>>followed by bio with source information.
>>Once the dst bio arrives we form a request and wait for source
>>bio. Upon arrival of source bio we merge these two bio's and send
>>corresponding request down to device driver.
>>Merging non copy offload bio is avoided by checking for copy specific
>>opcodes in merge function.
>>
>>Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
>>---
>>  block/blk-core.c          |  7 +++++++
>>  block/blk-merge.c         | 41 +++++++++++++++++++++++++++++++++++++++
>>  block/blk.h               | 16 +++++++++++++++
>>  block/elevator.h          |  1 +
>>  include/linux/bio.h       |  6 +-----
>>  include/linux/blk_types.h | 10 ++++++++++
>>  6 files changed, 76 insertions(+), 5 deletions(-)
>>
>I am a bit unsure about leveraging 'merge' here. As Bart pointed out, 
>this is arguably a misuse of the 'merge' functionality as we don't
>actually merge bios, but rather use the information from these bios to
>form the actual request.
>Wouldn't it be better to use bio_chain here, and send out the eventual
>request from the end_io function of the bio chain?
>
With the above bio chain approach, I see a difficulty in dealing with
stacked/partitioned devices, as the bios might get remapped.
Or am I missing something here ?

Bart, Hannes,
Regarding merge, does it look any better if we use a single request
operation such as REQ_OP_COPY and use op_flags (REQ_COPY_DST/REQ_COPY_SRC)
to identify the dst and src bios ?


Regards,
Nitesh Shetty
Bart Van Assche May 24, 2024, 1:52 p.m. UTC | #10
On 5/23/24 23:54, Nitesh Shetty wrote:
> Regarding merge, does it look any better if we use a single request
> operation such as REQ_OP_COPY and use op_flags (REQ_COPY_DST/REQ_COPY_SRC)
> to identify the dst and src bios ?

I prefer to keep the current approach (REQ_COPY_DST/REQ_COPY_SRC) and to
use a more appropriate verb than "merge", e.g. "combine".

Thanks,

Bart.
Bart Van Assche May 24, 2024, 8:33 p.m. UTC | #11
On 5/20/24 03:20, Nitesh Shetty wrote:
> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
> Since copy is a composite operation involving src and dst sectors/lba,
> each needs to be represented by a separate bio to make it compatible
> with device mapper.
> We expect caller to take a plug and send bio with destination information,
> followed by bio with source information.
> Once the dst bio arrives we form a request and wait for source
> bio. Upon arrival of source bio we merge these two bio's and send
> corresponding request down to device driver.
> Merging non copy offload bio is avoided by checking for copy specific
> opcodes in merge function.

In this patch I don't see any changes for blk_attempt_bio_merge(). Does
this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?

Can it happen that the REQ_NOMERGE flag is set by __bio_split_to_limits()
for REQ_OP_COPY_DST or REQ_OP_COPY_SRC bios? Will this happen if the
following condition is met?

dst_bio->nr_phys_segs + src_bio->nr_phys_segs > max_segments

Is it allowed to set REQ_PREFLUSH or REQ_FUA for REQ_OP_COPY_DST or
REQ_OP_COPY_SRC bios? I'm asking this because these flags disable merging.

From include/linux/blk_types.h:

#define REQ_NOMERGE_FLAGS (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)

Thanks,

Bart.
Nitesh Shetty May 27, 2024, 8:27 a.m. UTC | #12
On 24/05/24 06:52AM, Bart Van Assche wrote:
>On 5/23/24 23:54, Nitesh Shetty wrote:
>>Regarding merge, does it look any better if we use a single request
>>operation such as REQ_OP_COPY and use op_flags (REQ_COPY_DST/REQ_COPY_SRC)
>>to identify the dst and src bios ?
>
>I prefer to keep the current approach (REQ_COPY_DST/REQ_COPY_SRC) and to
>use a more appropriate verb than "merge", e.g. "combine".
>
Thanks for the suggesion, will do this in next version.

--Nitesh Shetty
Bart Van Assche May 28, 2024, 2:07 p.m. UTC | #13
On 5/21/24 00:01, Hannes Reinecke wrote:
> On 5/20/24 12:20, Nitesh Shetty wrote:
>> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>> Since copy is a composite operation involving src and dst sectors/lba,
>> each needs to be represented by a separate bio to make it compatible
>> with device mapper.
>> We expect caller to take a plug and send bio with destination 
>> information,
>> followed by bio with source information.
>> Once the dst bio arrives we form a request and wait for source
>> bio. Upon arrival of source bio we merge these two bio's and send
>> corresponding request down to device driver.
>> Merging non copy offload bio is avoided by checking for copy specific
>> opcodes in merge function.
>>
> I am a bit unsure about leveraging 'merge' here. As Bart pointed out, 
> this is arguably a misuse of the 'merge' functionality as we don't
> actually merge bios, but rather use the information from these bios to
> form the actual request.
> Wouldn't it be better to use bio_chain here, and send out the eventual
> request from the end_io function of the bio chain?

Let me formulate this a bit stronger: I think this patch series abuses
the merge functionality and also that it should use another mechanism
for combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC. See also my email
with concerns about using the merge functionality:
https://lore.kernel.org/linux-block/eda6c198-3a29-4da4-94db-305cfe28d3d6@acm.org/.

Thanks,

Bart.
Nitesh Shetty May 29, 2024, 6:17 a.m. UTC | #14
On 24/05/24 01:33PM, Bart Van Assche wrote:
>On 5/20/24 03:20, Nitesh Shetty wrote:
>>We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>Since copy is a composite operation involving src and dst sectors/lba,
>>each needs to be represented by a separate bio to make it compatible
>>with device mapper.
>>We expect caller to take a plug and send bio with destination information,
>>followed by bio with source information.
>>Once the dst bio arrives we form a request and wait for source
>>bio. Upon arrival of source bio we merge these two bio's and send
>>corresponding request down to device driver.
>>Merging non copy offload bio is avoided by checking for copy specific
>>opcodes in merge function.
>
>In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>
Yes, in this case copy won't work, as both src and dst bio reach driver
as part of separate requests.
We will add this as part of documentation.

>Can it happen that the REQ_NOMERGE flag is set by __bio_split_to_limits()
>for REQ_OP_COPY_DST or REQ_OP_COPY_SRC bios? Will this happen if the
>following condition is met?
>
>dst_bio->nr_phys_segs + src_bio->nr_phys_segs > max_segments
>
No, this should not happen. We don't use bio_split_rw for copy.
We have added a separate function to check for splits in case of
copy (bio_split_copy), which doesn't allow copy bio splits,
hence the REQ_NOMERGE flag won't be set.

>Is it allowed to set REQ_PREFLUSH or REQ_FUA for REQ_OP_COPY_DST or
>REQ_OP_COPY_SRC bios? I'm asking this because these flags disable merging.
>
>From include/linux/blk_types.h:
>
>#define REQ_NOMERGE_FLAGS (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
>
No, setting these flags won't allow copy bios to merge, and hence copy
won't work.
We suggest using the helper API blkdev_copy_offload to perform the
copy; it does not set these flags.

Thank You,
Nitesh Shetty
Damien Le Moal May 29, 2024, 7:48 a.m. UTC | #15
On 5/29/24 15:17, Nitesh Shetty wrote:
> On 24/05/24 01:33PM, Bart Van Assche wrote:
>> On 5/20/24 03:20, Nitesh Shetty wrote:
>>> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>> Since copy is a composite operation involving src and dst sectors/lba,
>>> each needs to be represented by a separate bio to make it compatible
>>> with device mapper.
>>> We expect caller to take a plug and send bio with destination information,
>>> followed by bio with source information.
>>> Once the dst bio arrives we form a request and wait for source
>>> bio. Upon arrival of source bio we merge these two bio's and send
>>> corresponding request down to device driver.
>>> Merging non copy offload bio is avoided by checking for copy specific
>>> opcodes in merge function.
>>
>> In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>> this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>> happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>>
> Yes, in this case copy won't work, as both src and dst bio reach driver
> as part of separate requests.
> We will add this as part of documentation.

So that means that 2 major SAS HBAs which set this flag (megaraid and mpt3sas)
will not get support for copy offload ? Not ideal, by far.
Bart Van Assche May 29, 2024, 10:41 p.m. UTC | #16
On 5/29/24 12:48 AM, Damien Le Moal wrote:
> On 5/29/24 15:17, Nitesh Shetty wrote:
>> On 24/05/24 01:33PM, Bart Van Assche wrote:
>>> On 5/20/24 03:20, Nitesh Shetty wrote:
>>>> We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>>> Since copy is a composite operation involving src and dst sectors/lba,
>>>> each needs to be represented by a separate bio to make it compatible
>>>> with device mapper.
>>>> We expect caller to take a plug and send bio with destination information,
>>>> followed by bio with source information.
>>>> Once the dst bio arrives we form a request and wait for source
>>>> bio. Upon arrival of source bio we merge these two bio's and send
>>>> corresponding request down to device driver.
>>>> Merging non copy offload bio is avoided by checking for copy specific
>>>> opcodes in merge function.
>>>
>>> In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>>> this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>>> happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>>>
>> Yes, in this case copy won't work, as both src and dst bio reach driver
>> as part of separate requests.
>> We will add this as part of documentation.
> 
> So that means that 2 major SAS HBAs which set this flag (megaraid and mpt3sas)
> will not get support for copy offload ? Not ideal, by far.

QUEUE_FLAG_NOMERGES can also be set through sysfs (see also
queue_nomerges_store()). This is one of the reasons why using the merge
infrastructure for combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC is
unacceptable.

Thanks,

Bart.
Nitesh Shetty May 30, 2024, 7:16 a.m. UTC | #17
On 29/05/24 03:41PM, Bart Van Assche wrote:
>On 5/29/24 12:48 AM, Damien Le Moal wrote:
>>On 5/29/24 15:17, Nitesh Shetty wrote:
>>>On 24/05/24 01:33PM, Bart Van Assche wrote:
>>>>On 5/20/24 03:20, Nitesh Shetty wrote:
>>>>>We add two new opcode REQ_OP_COPY_DST, REQ_OP_COPY_SRC.
>>>>>Since copy is a composite operation involving src and dst sectors/lba,
>>>>>each needs to be represented by a separate bio to make it compatible
>>>>>with device mapper.
>>>>>We expect caller to take a plug and send bio with destination information,
>>>>>followed by bio with source information.
>>>>>Once the dst bio arrives we form a request and wait for source
>>>>>bio. Upon arrival of source bio we merge these two bio's and send
>>>>>corresponding request down to device driver.
>>>>>Merging non copy offload bio is avoided by checking for copy specific
>>>>>opcodes in merge function.
>>>>
>>>>In this patch I don't see any changes for blk_attempt_bio_merge(). Does
>>>>this mean that combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC will never
>>>>happen if the QUEUE_FLAG_NOMERGES request queue flag has been set?
>>>>
>>>Yes, in this case copy won't work, as both src and dst bio reach driver
>>>as part of separate requests.
>>>We will add this as part of documentation.
>>
>>So that means that 2 major SAS HBAs which set this flag (megaraid and mpt3sas)
>>will not get support for copy offload ? Not ideal, by far.
>
>QUEUE_FLAG_NOMERGES can also be set through sysfs (see also
>queue_nomerges_store()). This is one of the reasons why using the merge
>infrastructure for combining REQ_OP_COPY_DST and REQ_OP_COPY_SRC is
>unacceptable.
>

Bart, Damien, Hannes,
Thanks for your review.
We tried a slightly modified approach which simplifies this patch and
also avoids the merge path.
Also with this, we should be able to solve the QUEUE_FLAG_NOMERGES issue.
Previously we also tried a payload/token based approach,
which avoids the merge path and tries to combine the bios in the driver.
But we received feedback that it wasn't the right approach [1].
Do the changes below look any better, or do you guys have anything else in mind ?

[1] https://lore.kernel.org/linux-block/ZIKphgDavKVPREnw@infradead.org/


diff --git a/block/blk-core.c b/block/blk-core.c
index 82c3ae22d76d..7158bac8cc57 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -122,6 +122,8 @@ static const char *const blk_op_name[] = {
  	REQ_OP_NAME(ZONE_FINISH),
  	REQ_OP_NAME(ZONE_APPEND),
  	REQ_OP_NAME(WRITE_ZEROES),
+	REQ_OP_NAME(COPY_SRC),
+	REQ_OP_NAME(COPY_DST),
  	REQ_OP_NAME(DRV_IN),
  	REQ_OP_NAME(DRV_OUT),
  };
@@ -839,6 +841,11 @@ void submit_bio_noacct(struct bio *bio)
  		 * requests.
  		 */
  		fallthrough;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		if (!q->limits.max_copy_sectors)
+			goto not_supported;
+		break;
  	default:
  		goto not_supported;
  	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8534c35e0497..a651e7c114d0 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -154,6 +154,20 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
  	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
  }
  
+static struct bio *bio_split_copy(struct bio *bio,
+				  const struct queue_limits *lim,
+				  unsigned int *nsegs)
+{
+	*nsegs = 1;
+	if (bio_sectors(bio) <= lim->max_copy_sectors)
+		return NULL;
+	/*
+	 * We don't support splitting for a copy bio. End it with EIO if
+	 * splitting is required and return an error pointer.
+	 */
+	return ERR_PTR(-EIO);
+}
+
  /*
   * Return the maximum number of sectors from the start of a bio that may be
   * submitted as a single request to a block device. If enough sectors remain,
@@ -362,6 +376,12 @@ struct bio *__bio_split_to_limits(struct bio *bio,
  	case REQ_OP_WRITE_ZEROES:
  		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
  		break;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		split = bio_split_copy(bio, lim, nr_segs);
+		if (IS_ERR(split))
+			return NULL;
+		break;
  	default:
  		split = bio_split_rw(bio, lim, nr_segs, bs,
  				get_max_io_size(bio, lim) << SECTOR_SHIFT);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b4df8e5ac9e..6d4ffbdade28 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2833,6 +2833,63 @@ static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
  		blk_mq_commit_rqs(hctx, queued, false);
  }
  
+/*
+ * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
+ * operation by taking a plug.
+ * Initially DST bio is sent which forms a request and
+ * waits for SRC bio to arrive. Once SRC bio arrives
+ * we combine it and send request down to driver.
+ */
+static inline bool blk_copy_offload_combine_ok(struct request *req,
+					      struct bio *bio)
+{
+	return (req_op(req) == REQ_OP_COPY_DST &&
+		bio_op(bio) == REQ_OP_COPY_SRC);
+}
+
+static int blk_copy_offload_combine(struct request *req, struct bio *bio)
+{
+	if (!blk_copy_offload_combine_ok(req, bio))
+		return 1;
+
+	if (req->__data_len != bio->bi_iter.bi_size)
+		return 1;
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->nr_phys_segments++;
+	req->__data_len += bio->bi_iter.bi_size;
+
+	return 0;
+}
+
+static inline bool blk_copy_offload_attempt_combine(struct request_queue *q,
+					     struct bio *bio)
+{
+	struct blk_plug *plug = current->plug;
+	struct request *rq;
+
+	if (!plug || rq_list_empty(plug->mq_list))
+		return false;
+
+	rq_list_for_each(&plug->mq_list, rq) {
+		if (rq->q == q) {
+			if (!blk_copy_offload_combine(rq, bio))
+				return true;
+			break;
+		}
+
+		/*
+		 * Only keep iterating plug list for combines if we have multiple
+		 * queues
+		 */
+		if (!plug->multiple_queues)
+			break;
+	}
+	return false;
+}
+
  static bool blk_mq_attempt_bio_merge(struct request_queue *q,
  				     struct bio *bio, unsigned int nr_segs)
  {
@@ -2977,6 +3034,9 @@ void blk_mq_submit_bio(struct bio *bio)
  	if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
  		goto queue_exit;
  
+	if (blk_copy_offload_attempt_combine(q, bio))
+		goto queue_exit;
+
  	if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
  		goto queue_exit;
  
diff --git a/block/blk.h b/block/blk.h
index 189bc25beb50..112c6736f44c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -323,6 +323,8 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
  	case REQ_OP_DISCARD:
  	case REQ_OP_SECURE_ERASE:
  	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
  		return true; /* non-trivial splitting decisions */
  	default:
  		break;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 781c4500491b..22a08425d13e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -342,6 +342,10 @@ enum req_op {
  	/* reset all the zone present on the device */
  	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
  
+	/* copy offload source and destination operations */
+	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
+	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
+
  	/* Driver private requests */
  	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
  	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
--
2.34.1
Bart Van Assche May 30, 2024, 5:11 p.m. UTC | #18
On 5/30/24 00:16, Nitesh Shetty wrote:
> +static inline bool blk_copy_offload_attempt_combine(struct request_queue *q,
> +                         struct bio *bio)
> +{
> +    struct blk_plug *plug = current->plug;
> +    struct request *rq;
> +
> +    if (!plug || rq_list_empty(plug->mq_list))
> +        return false;
> +
> +    rq_list_for_each(&plug->mq_list, rq) {
> +        if (rq->q == q) {
> +            if (!blk_copy_offload_combine(rq, bio))
> +                return true;
> +            break;
> +        }
> +
> +        /*
> +         * Only keep iterating plug list for combines if we have multiple
> +         * queues
> +         */
> +        if (!plug->multiple_queues)
> +            break;
> +    }
> +    return false;
> +}

This new approach has the following two disadvantages:
* Without plug, REQ_OP_COPY_SRC and REQ_OP_COPY_DST are not combined. These two
   operation types are the only operation types for which not using a plug causes
   an I/O failure.
* A loop is required to combine the REQ_OP_COPY_SRC and REQ_OP_COPY_DST operations.

Please switch to the approach Hannes suggested, namely bio chaining. Chaining
REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios before these are submitted eliminates the
two disadvantages mentioned above.

Thanks,

Bart.
Nitesh Shetty May 31, 2024, 10:17 a.m. UTC | #19
On 30/05/24 10:11AM, Bart Van Assche wrote:
>On 5/30/24 00:16, Nitesh Shetty wrote:
>>+static inline bool blk_copy_offload_attempt_combine(struct request_queue *q,
>>+                         struct bio *bio)
>>+{
>>+    struct blk_plug *plug = current->plug;
>>+    struct request *rq;
>>+
>>+    if (!plug || rq_list_empty(plug->mq_list))
>>+        return false;
>>+
>>+    rq_list_for_each(&plug->mq_list, rq) {
>>+        if (rq->q == q) {
>>+            if (!blk_copy_offload_combine(rq, bio))
>>+                return true;
>>+            break;
>>+        }
>>+
>>+        /*
>>+         * Only keep iterating plug list for combines if we have multiple
>>+         * queues
>>+         */
>>+        if (!plug->multiple_queues)
>>+            break;
>>+    }
>>+    return false;
>>+}
>
>This new approach has the following two disadvantages:
>* Without plug, REQ_OP_COPY_SRC and REQ_OP_COPY_DST are not combined. These two
>  operation types are the only operation types for which not using a plug causes
>  an I/O failure.
>* A loop is required to combine the REQ_OP_COPY_SRC and REQ_OP_COPY_DST operations.
>
>Please switch to the approach Hannes suggested, namely bio chaining. Chaining
>REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios before these are submitted eliminates the
>two disadvantages mentioned above.
>
Bart, Hannes,

I see the following challenges with the bio-chained approach.
1. partitioned device:
	We need to add code which iterates over all bios and adjusts
	the sector offsets.
2. dm/stacked device:
	We need to make major changes in dm, such as allocating cloned
	bios, IO splits, IO offset mappings, all of which need to
	iterate over chained BIOs.

Overall, with chained BIOs we need to add special handling only for copy,
to iterate over the chained BIOs and do the same thing which is being done
for a single BIO at present.
Or am I missing something here ?

Thank You,
Nitesh Shetty
Bart Van Assche May 31, 2024, 11:45 p.m. UTC | #20
On 5/31/24 03:17, Nitesh Shetty wrote:
> I see the following challenges with the bio-chained approach.
> 1. partitioned device:
>      We need to add code which iterates over all bios and adjusts
>      the sector offsets.
> 2. dm/stacked device:
>      We need to make major changes in dm, such as allocating cloned
>      bios, IO splits, IO offset mappings, all of which need to
>      iterate over chained BIOs.
> 
> Overall, with chained BIOs we need to add special handling only for copy,
> to iterate over the chained BIOs and do the same thing which is being done
> for a single BIO at present.
> Or am I missing something here ?

Hmm ... aren't chained bios submitted individually? See e.g.
bio_chain_and_submit(). In other words, it shouldn't be necessary to
add any code that iterates over bio chains.
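
For reference, bio_chain_and_submit() is roughly the following (based on
the helper in block/bio.c; shown here only for context):

	struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
	{
		if (prev) {
			bio_chain(prev, new);
			submit_bio(prev);
		}
		return new;
	}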

Thanks,

Bart.
Christoph Hellwig June 1, 2024, 5:57 a.m. UTC | #21
On Wed, May 29, 2024 at 04:48:18PM +0900, Damien Le Moal wrote:
> > Yes, in this case copy won't work, as both src and dst bio reach driver
> > as part of separate requests.
> > We will add this as part of documentation.
> 
> So that means that 2 major SAS HBAs which set this flag (megaraid and mpt3sas)
> will not get support for copy offload ? Not ideal, by far.

We really need to ignore the nomerges flag for this.
As we should for the precedence in the discard merging.

And the drivers need to stop setting this flag as they have no business
to, but that's a separate discussion (and the flag can still be set
manually).
Christoph Hellwig June 1, 2024, 5:59 a.m. UTC | #22
On Thu, May 30, 2024 at 10:11:15AM -0700, Bart Van Assche wrote:
> This new approach has the following two disadvantages:
> * Without plug, REQ_OP_COPY_SRC and REQ_OP_COPY_DST are not combined. These two
>   operation types are the only operation types for which not using a plug causes
>   an I/O failure.

So?  We can clearly document that and even fail submission with a helpful
message trivially to enforce that.

> * A loop is required to combine the REQ_OP_COPY_SRC and REQ_OP_COPY_DST operations.

I don't see why that is a problem.  Especially once we allow multiple
sources, which is really useful for GC workloads, we'll need to do that
anyway.

> Please switch to the approach Hannes suggested, namely bio chaining. Chaining
> REQ_OP_COPY_SRC and REQ_OP_COPY_DST bios before these are submitted eliminates the
> two disadvantages mentioned above.

No.

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index ea44b13af9af..f18ee5f709c0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -122,6 +122,8 @@  static const char *const blk_op_name[] = {
 	REQ_OP_NAME(ZONE_FINISH),
 	REQ_OP_NAME(ZONE_APPEND),
 	REQ_OP_NAME(WRITE_ZEROES),
+	REQ_OP_NAME(COPY_SRC),
+	REQ_OP_NAME(COPY_DST),
 	REQ_OP_NAME(DRV_IN),
 	REQ_OP_NAME(DRV_OUT),
 };
@@ -838,6 +840,11 @@  void submit_bio_noacct(struct bio *bio)
 		 * requests.
 		 */
 		fallthrough;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		if (!q->limits.max_copy_sectors)
+			goto not_supported;
+		break;
 	default:
 		goto not_supported;
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8534c35e0497..f8dc48a03379 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -154,6 +154,20 @@  static struct bio *bio_split_write_zeroes(struct bio *bio,
 	return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
 }
 
+static struct bio *bio_split_copy(struct bio *bio,
+				  const struct queue_limits *lim,
+				  unsigned int *nsegs)
+{
+	*nsegs = 1;
+	if (bio_sectors(bio) <= lim->max_copy_sectors)
+		return NULL;
+	/*
+	 * We don't support splitting for a copy bio. End it with EIO if
+	 * splitting is required and return an error pointer.
+	 */
+	return ERR_PTR(-EIO);
+}
+
 /*
  * Return the maximum number of sectors from the start of a bio that may be
  * submitted as a single request to a block device. If enough sectors remain,
@@ -362,6 +376,12 @@  struct bio *__bio_split_to_limits(struct bio *bio,
 	case REQ_OP_WRITE_ZEROES:
 		split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
 		break;
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
+		split = bio_split_copy(bio, lim, nr_segs);
+		if (IS_ERR(split))
+			return NULL;
+		break;
 	default:
 		split = bio_split_rw(bio, lim, nr_segs, bs,
 				get_max_io_size(bio, lim) << SECTOR_SHIFT);
@@ -925,6 +945,9 @@  bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (!rq_mergeable(rq) || !bio_mergeable(bio))
 		return false;
 
+	if (blk_copy_offload_mergable(rq, bio))
+		return true;
+
 	if (req_op(rq) != bio_op(bio))
 		return false;
 
@@ -958,6 +981,8 @@  enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
 {
 	if (blk_discard_mergable(rq))
 		return ELEVATOR_DISCARD_MERGE;
+	else if (blk_copy_offload_mergable(rq, bio))
+		return ELEVATOR_COPY_OFFLOAD_MERGE;
 	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
 		return ELEVATOR_BACK_MERGE;
 	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
@@ -1065,6 +1090,20 @@  static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
 	return BIO_MERGE_FAILED;
 }
 
+static enum bio_merge_status bio_attempt_copy_offload_merge(struct request *req,
+							    struct bio *bio)
+{
+	if (req->__data_len != bio->bi_iter.bi_size)
+		return BIO_MERGE_FAILED;
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->nr_phys_segments++;
+	req->__data_len += bio->bi_iter.bi_size;
+
+	return BIO_MERGE_OK;
+}
+
 static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
 						   struct request *rq,
 						   struct bio *bio,
@@ -1085,6 +1124,8 @@  static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
 		break;
 	case ELEVATOR_DISCARD_MERGE:
 		return bio_attempt_discard_merge(q, rq, bio);
+	case ELEVATOR_COPY_OFFLOAD_MERGE:
+		return bio_attempt_copy_offload_merge(rq, bio);
 	default:
 		return BIO_MERGE_NONE;
 	}
diff --git a/block/blk.h b/block/blk.h
index 189bc25beb50..6528a2779b84 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -174,6 +174,20 @@  static inline bool blk_discard_mergable(struct request *req)
 	return false;
 }
 
+/*
+ * Copy offload sends a pair of bio with REQ_OP_COPY_DST and REQ_OP_COPY_SRC
+ * operation by taking a plug.
+ * Initially DST bio is sent which forms a request and
+ * waits for SRC bio to arrive. Once SRC bio arrives
+ * we merge it and send request down to driver.
+ */
+static inline bool blk_copy_offload_mergable(struct request *req,
+					     struct bio *bio)
+{
+	return (req_op(req) == REQ_OP_COPY_DST &&
+		bio_op(bio) == REQ_OP_COPY_SRC);
+}
+
 static inline unsigned int blk_rq_get_max_segments(struct request *rq)
 {
 	if (req_op(rq) == REQ_OP_DISCARD)
@@ -323,6 +337,8 @@  static inline bool bio_may_exceed_limits(struct bio *bio,
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_COPY_SRC:
+	case REQ_OP_COPY_DST:
 		return true; /* non-trivial splitting decisions */
 	default:
 		break;
diff --git a/block/elevator.h b/block/elevator.h
index e9a050a96e53..c7a45c1f4156 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -18,6 +18,7 @@  enum elv_merge {
 	ELEVATOR_FRONT_MERGE	= 1,
 	ELEVATOR_BACK_MERGE	= 2,
 	ELEVATOR_DISCARD_MERGE	= 3,
+	ELEVATOR_COPY_OFFLOAD_MERGE	= 4,
 };
 
 struct blk_mq_alloc_data;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d5379548d684..528ef22dd65b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -53,11 +53,7 @@  static inline unsigned int bio_max_segs(unsigned int nr_segs)
  */
 static inline bool bio_has_data(struct bio *bio)
 {
-	if (bio &&
-	    bio->bi_iter.bi_size &&
-	    bio_op(bio) != REQ_OP_DISCARD &&
-	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
-	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
+	if (bio && (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE))
 		return true;
 
 	return false;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 781c4500491b..7f692bade271 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -342,6 +342,10 @@  enum req_op {
 	/* reset all the zone present on the device */
 	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
 
+	/* copy offload src and dst operation */
+	REQ_OP_COPY_SRC		= (__force blk_opf_t)18,
+	REQ_OP_COPY_DST		= (__force blk_opf_t)19,
+
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
 	REQ_OP_DRV_OUT		= (__force blk_opf_t)35,
@@ -430,6 +434,12 @@  static inline bool op_is_write(blk_opf_t op)
 	return !!(op & (__force blk_opf_t)1);
 }
 
+static inline bool op_is_copy(blk_opf_t op)
+{
+	return ((op & REQ_OP_MASK) == REQ_OP_COPY_SRC ||
+		(op & REQ_OP_MASK) == REQ_OP_COPY_DST);
+}
+
 /*
  * Check if the bio or request is one that needs special treatment in the
  * flush state machine.