diff mbox series

[V10,09/19] block: introduce bio_bvecs()

Message ID 20181115085306.9910-10-ming.lei@redhat.com (mailing list archive)
State Not Applicable
Headers show
Series block: support multi-page bvec | expand

Commit Message

Ming Lei Nov. 15, 2018, 8:52 a.m. UTC
There are still cases in which we need to use bio_bvecs() to get the
number of multi-page segments, so introduce it.

Cc: Dave Chinner <dchinner@redhat.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: dm-devel@redhat.com
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: Shaohua Li <shli@kernel.org>
Cc: linux-raid@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: David Sterba <dsterba@suse.com>
Cc: linux-btrfs@vger.kernel.org
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: linux-xfs@vger.kernel.org
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
Cc: Coly Li <colyli@suse.de>
Cc: linux-bcache@vger.kernel.org
Cc: Boaz Harrosh <ooo@electrozaur.com>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: cluster-devel@redhat.com
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 include/linux/bio.h | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

Comments

Omar Sandoval Nov. 16, 2018, 12:26 a.m. UTC | #1
On Thu, Nov 15, 2018 at 04:52:56PM +0800, Ming Lei wrote:
> There are still cases in which we need to use bio_bvecs() for get the
> number of multi-page segment, so introduce it.
> 
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Kent Overstreet <kent.overstreet@gmail.com>
> Cc: Mike Snitzer <snitzer@redhat.com>
> Cc: dm-devel@redhat.com
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: linux-fsdevel@vger.kernel.org
> Cc: Shaohua Li <shli@kernel.org>
> Cc: linux-raid@vger.kernel.org
> Cc: linux-erofs@lists.ozlabs.org
> Cc: David Sterba <dsterba@suse.com>
> Cc: linux-btrfs@vger.kernel.org
> Cc: Darrick J. Wong <darrick.wong@oracle.com>
> Cc: linux-xfs@vger.kernel.org
> Cc: Gao Xiang <gaoxiang25@huawei.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Theodore Ts'o <tytso@mit.edu>
> Cc: linux-ext4@vger.kernel.org
> Cc: Coly Li <colyli@suse.de>
> Cc: linux-bcache@vger.kernel.org
> Cc: Boaz Harrosh <ooo@electrozaur.com>
> Cc: Bob Peterson <rpeterso@redhat.com>
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval <osandov@fb.com>

> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  include/linux/bio.h | 30 +++++++++++++++++++++++++-----
>  1 file changed, 25 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 1f0dcf109841..3496c816946e 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -196,7 +196,6 @@ static inline unsigned bio_segments(struct bio *bio)
>  	 * We special case discard/write same/write zeroes, because they
>  	 * interpret bi_size differently:
>  	 */
> -
>  	switch (bio_op(bio)) {
>  	case REQ_OP_DISCARD:
>  	case REQ_OP_SECURE_ERASE:
> @@ -205,13 +204,34 @@ static inline unsigned bio_segments(struct bio *bio)
>  	case REQ_OP_WRITE_SAME:
>  		return 1;
>  	default:
> -		break;
> +		bio_for_each_segment(bv, bio, iter)
> +			segs++;
> +		return segs;
>  	}
> +}
>  
> -	bio_for_each_segment(bv, bio, iter)
> -		segs++;
> +static inline unsigned bio_bvecs(struct bio *bio)
> +{
> +	unsigned bvecs = 0;
> +	struct bio_vec bv;
> +	struct bvec_iter iter;
>  
> -	return segs;
> +	/*
> +	 * We special case discard/write same/write zeroes, because they
> +	 * interpret bi_size differently:
> +	 */
> +	switch (bio_op(bio)) {
> +	case REQ_OP_DISCARD:
> +	case REQ_OP_SECURE_ERASE:
> +	case REQ_OP_WRITE_ZEROES:
> +		return 0;
> +	case REQ_OP_WRITE_SAME:
> +		return 1;
> +	default:
> +		bio_for_each_bvec(bv, bio, iter)
> +			bvecs++;
> +		return bvecs;
> +	}
>  }
>  
>  /*
> -- 
> 2.9.5
>
Christoph Hellwig Nov. 16, 2018, 1:45 p.m. UTC | #2
On Thu, Nov 15, 2018 at 04:52:56PM +0800, Ming Lei wrote:
> There are still cases in which we need to use bio_bvecs() for get the
> number of multi-page segment, so introduce it.

The only user in your final tree seems to be the loop driver, and
even that one only uses the helper for read/write bios.

I think something like this would be much simpler in the end:

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d509902a8046..712511815ac6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -514,16 +514,18 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	struct bio *bio = rq->bio;
 	struct file *file = lo->lo_backing_file;
+	struct bvec_iter bvec_iter;
+	struct bio_vec tmp;
 	unsigned int offset;
 	int nr_bvec = 0;
 	int ret;
 
+	__rq_for_each_bio(bio, rq)
+		bio_for_each_bvec(tmp, bio, bvec_iter)
+			nr_bvec++;
+
 	if (rq->bio != rq->biotail) {
-		struct bvec_iter iter;
-		struct bio_vec tmp;
 
-		__rq_for_each_bio(bio, rq)
-			nr_bvec += bio_bvecs(bio);
 		bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
 				     GFP_NOIO);
 		if (!bvec)
@@ -537,7 +539,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		 * API will take care of all details for us.
 		 */
 		__rq_for_each_bio(bio, rq)
-			bio_for_each_bvec(tmp, bio, iter) {
+			bio_for_each_bvec(tmp, bio, bvec_iter) {
 				*bvec = tmp;
 				bvec++;
 			}
@@ -551,7 +553,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		 */
 		offset = bio->bi_iter.bi_bvec_done;
 		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-		nr_bvec = bio_bvecs(bio);
 	}
 	atomic_set(&cmd->ref, 2);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index dcad0b69f57a..379440d1ced0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -200,30 +200,6 @@ static inline unsigned bio_segments(struct bio *bio)
 	}
 }
 
-static inline unsigned bio_bvecs(struct bio *bio)
-{
-	unsigned bvecs = 0;
-	struct bio_vec bv;
-	struct bvec_iter iter;
-
-	/*
-	 * We special case discard/write same/write zeroes, because they
-	 * interpret bi_size differently:
-	 */
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		return 0;
-	case REQ_OP_WRITE_SAME:
-		return 1;
-	default:
-		bio_for_each_bvec(bv, bio, iter)
-			bvecs++;
-		return bvecs;
-	}
-}
-
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
  * something like:
Ming Lei Nov. 19, 2018, 8:21 a.m. UTC | #3
On Fri, Nov 16, 2018 at 02:45:41PM +0100, Christoph Hellwig wrote:
> On Thu, Nov 15, 2018 at 04:52:56PM +0800, Ming Lei wrote:
> > There are still cases in which we need to use bio_bvecs() for get the
> > number of multi-page segment, so introduce it.
> 
> The only user in your final tree seems to be the loop driver, and
> even that one only uses the helper for read/write bios.
> 
> I think something like this would be much simpler in the end:
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index d509902a8046..712511815ac6 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -514,16 +514,18 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	struct request *rq = blk_mq_rq_from_pdu(cmd);
>  	struct bio *bio = rq->bio;
>  	struct file *file = lo->lo_backing_file;
> +	struct bvec_iter bvec_iter;
> +	struct bio_vec tmp;
>  	unsigned int offset;
>  	int nr_bvec = 0;
>  	int ret;
>  
> +	__rq_for_each_bio(bio, rq)
> +		bio_for_each_bvec(tmp, bio, bvec_iter)
> +			nr_bvec++;
> +
>  	if (rq->bio != rq->biotail) {
> -		struct bvec_iter iter;
> -		struct bio_vec tmp;
>  
> -		__rq_for_each_bio(bio, rq)
> -			nr_bvec += bio_bvecs(bio);
>  		bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
>  				     GFP_NOIO);
>  		if (!bvec)
> @@ -537,7 +539,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  		 * API will take care of all details for us.
>  		 */
>  		__rq_for_each_bio(bio, rq)
> -			bio_for_each_bvec(tmp, bio, iter) {
> +			bio_for_each_bvec(tmp, bio, bvec_iter) {
>  				*bvec = tmp;
>  				bvec++;
>  			}
> @@ -551,7 +553,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  		 */
>  		offset = bio->bi_iter.bi_bvec_done;
>  		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
> -		nr_bvec = bio_bvecs(bio);
>  	}
>  	atomic_set(&cmd->ref, 2);
>  
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index dcad0b69f57a..379440d1ced0 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -200,30 +200,6 @@ static inline unsigned bio_segments(struct bio *bio)
>  	}
>  }
>  
> -static inline unsigned bio_bvecs(struct bio *bio)
> -{
> -	unsigned bvecs = 0;
> -	struct bio_vec bv;
> -	struct bvec_iter iter;
> -
> -	/*
> -	 * We special case discard/write same/write zeroes, because they
> -	 * interpret bi_size differently:
> -	 */
> -	switch (bio_op(bio)) {
> -	case REQ_OP_DISCARD:
> -	case REQ_OP_SECURE_ERASE:
> -	case REQ_OP_WRITE_ZEROES:
> -		return 0;
> -	case REQ_OP_WRITE_SAME:
> -		return 1;
> -	default:
> -		bio_for_each_bvec(bv, bio, iter)
> -			bvecs++;
> -		return bvecs;
> -	}
> -}
> -
>  /*
>   * get a reference to a bio, so it won't disappear. the intended use is
>   * something like:

OK, will do it in next version.

Thanks,
Ming
Sagi Grimberg Nov. 20, 2018, 12:49 a.m. UTC | #4
> The only user in your final tree seems to be the loop driver, and
> even that one only uses the helper for read/write bios.
> 
> I think something like this would be much simpler in the end:

The recently submitted nvme-tcp host driver should also be a user
of this. Does it make sense to keep it as a helper then?
Christoph Hellwig Nov. 20, 2018, 4:16 p.m. UTC | #5
On Mon, Nov 19, 2018 at 04:49:27PM -0800, Sagi Grimberg wrote:
>
>> The only user in your final tree seems to be the loop driver, and
>> even that one only uses the helper for read/write bios.
>>
>> I think something like this would be much simpler in the end:
>
> The recently submitted nvme-tcp host driver should also be a user
> of this. Does it make sense to keep it as a helper then?

I did take a brief look at the code, and I really don't understand
why the heck it even deals with bios to start with.  Like all the
other nvme transports it is a blk-mq driver and should iterate
over segments in a request and more or less ignore bios.  Something
is horribly wrong in the design.
Sagi Grimberg Nov. 20, 2018, 8:11 p.m. UTC | #6
>>> The only user in your final tree seems to be the loop driver, and
>>> even that one only uses the helper for read/write bios.
>>>
>>> I think something like this would be much simpler in the end:
>>
>> The recently submitted nvme-tcp host driver should also be a user
>> of this. Does it make sense to keep it as a helper then?
> 
> I did take a brief look at the code, and I really don't understand
> why the heck it even deals with bios to start with.  Like all the
> other nvme transports it is a blk-mq driver and should iterate
> over segments in a request and more or less ignore bios.  Something
> is horribly wrong in the design.

Can you explain a little more? I'm more than happy to change that but
I'm not completely clear how...

Before we begin a data transfer, we need to set our own iterator that
will advance with the progression of the data transfer. We also need to
keep in mind that all the data transfer (both send and recv) are
completely non blocking (and zero-copy when we send).

That means that every data movement needs to be able to suspend
and resume asynchronously. i.e. we cannot use the following pattern:
rq_for_each_segment(bvec, rq, rq_iter) {
	iov_iter_bvec(&iov_iter, WRITE, &bvec, 1, bvec.bv_len);
	send(sock, iov_iter);
}

Given that a request can hold more than a single bio, I'm not clear on
how we can achieve that without iterating over the bios in the request
ourselves.

Any useful insight?
Ming Lei Nov. 21, 2018, 12:59 a.m. UTC | #7
On Tue, Nov 20, 2018 at 12:11:35PM -0800, Sagi Grimberg wrote:
> 
> > > > The only user in your final tree seems to be the loop driver, and
> > > > even that one only uses the helper for read/write bios.
> > > > 
> > > > I think something like this would be much simpler in the end:
> > > 
> > > The recently submitted nvme-tcp host driver should also be a user
> > > of this. Does it make sense to keep it as a helper then?
> > 
> > I did take a brief look at the code, and I really don't understand
> > why the heck it even deals with bios to start with.  Like all the
> > other nvme transports it is a blk-mq driver and should iterate
> > over segments in a request and more or less ignore bios.  Something
> > is horribly wrong in the design.
> 
> Can you explain a little more? I'm more than happy to change that but
> I'm not completely clear how...
> 
> Before we begin a data transfer, we need to set our own iterator that
> will advance with the progression of the data transfer. We also need to
> keep in mind that all the data transfer (both send and recv) are
> completely non blocking (and zero-copy when we send).
> 
> That means that every data movement needs to be able to suspend
> and resume asynchronously. i.e. we cannot use the following pattern:
> rq_for_each_segment(bvec, rq, rq_iter) {
> 	iov_iter_bvec(&iov_iter, WRITE, &bvec, 1, bvec.bv_len);
> 	send(sock, iov_iter);
> }

Not sure I understand the 'blocking' problem in this case.

We can build a bvec table from this req, and send them all
in send(); can this avoid your blocking issue? You may see this
example in the 'rq->bio != rq->biotail' branch of lo_rw_aio().

If this is what you need, I think you are right; we may even
introduce the following helpers:

	rq_for_each_bvec()
	rq_bvecs()

So it looks like the nvme-tcp host driver might be the 2nd driver which
benefits from multi-page bvec directly.

The multi-page bvec V11 has passed my tests and addressed almost
all the comments during review on V10. I removed bio_bvecs() in V11,
but it won't be a big deal; we can introduce them anytime when there
is a requirement.

Thanks,
Ming
Sagi Grimberg Nov. 21, 2018, 3:20 a.m. UTC | #8
> Not sure I understand the 'blocking' problem in this case.
> 
> We can build a bvec table from this req, and send them all
> in send(),

I would like to avoid growing bvec tables and keep everything
preallocated. Plus, a bvec_iter operates on a bvec which means
we'll need a table there as well... Not liking it so far...

> can this way avoid your blocking issue? You may see this
> example in branch 'rq->bio != rq->biotail' of lo_rw_aio().

This is exactly an example of not ignoring the bios...

> If this way is what you need, I think you are right, even we may
> introduce the following helpers:
> 
> 	rq_for_each_bvec()
> 	rq_bvecs()

I'm not sure how this helps me either. Unless we can set a bvec_iter to
span bvecs or have an abstract bio crossing when we re-initialize the
bvec_iter I don't see how I can ignore bios completely...

> So looks nvme-tcp host driver might be the 2nd driver which benefits
> from multi-page bvec directly.
> 
> The multi-page bvec V11 has passed my tests and addressed almost
> all the comments during review on V10. I removed bio_vecs() in V11,
> but it won't be big deal, we can introduce them anytime when there
> is the requirement.

multipage-bvecs and nvme-tcp are going to conflict, so it would be good
to coordinate on this. I think that nvme-tcp host needs some adjustments
as setting a bvec_iter. I'm under the impression that the change is 
rather small and self-contained, but I'm not sure I have the full
picture here.
Ming Lei Nov. 21, 2018, 3:44 a.m. UTC | #9
On Tue, Nov 20, 2018 at 07:20:45PM -0800, Sagi Grimberg wrote:
> 
> > Not sure I understand the 'blocking' problem in this case.
> > 
> > We can build a bvec table from this req, and send them all
> > in send(),
> 
> I would like to avoid growing bvec tables and keep everything
> preallocated. Plus, a bvec_iter operates on a bvec which means
> we'll need a table there as well... Not liking it so far...

In case of multiple bios in one request, we can't know how many bvecs
there are except by calling rq_bvecs(), so it may not be suitable to
preallocate the table. If you have to send the IO request in one send(),
runtime allocation may be inevitable.

If you don't need to send the IO request in one send(), you may send
one bio at a time, and just use the bio's bvec table directly,
such as the single bio case in lo_rw_aio().

> 
> > can this way avoid your blocking issue? You may see this
> > example in branch 'rq->bio != rq->biotail' of lo_rw_aio().
> 
> This is exactly an example of not ignoring the bios...

Yeah, that is the most common example, given merge is enabled
in most of cases. If the driver or device doesn't care merge,
you can disable it and always get single bio request, then the
bio's bvec table can be reused for send().

> 
> > If this way is what you need, I think you are right, even we may
> > introduce the following helpers:
> > 
> > 	rq_for_each_bvec()
> > 	rq_bvecs()
> 
> I'm not sure how this helps me either. Unless we can set a bvec_iter to
> span bvecs or have an abstract bio crossing when we re-initialize the
> bvec_iter I don't see how I can ignore bios completely...

rq_for_each_bvec() will iterate over all bvecs from all bios, so you
needn't see any bio in this req.

rq_bvecs() will return how many bvecs there are in this request(cover
all bios in this req)

> 
> > So looks nvme-tcp host driver might be the 2nd driver which benefits
> > from multi-page bvec directly.
> > 
> > The multi-page bvec V11 has passed my tests and addressed almost
> > all the comments during review on V10. I removed bio_vecs() in V11,
> > but it won't be big deal, we can introduce them anytime when there
> > is the requirement.
> 
> multipage-bvecs and nvme-tcp are going to conflict, so it would be good
> to coordinate on this. I think that nvme-tcp host needs some adjustments
> as setting a bvec_iter. I'm under the impression that the change is rather
> small and self-contained, but I'm not sure I have the full
> picture here.

I guess I may not fully understand nvme-tcp's exact requirements on the
block IO iterator either, :-(

thanks,
Ming
Sagi Grimberg Nov. 21, 2018, 4:25 a.m. UTC | #10
>> I would like to avoid growing bvec tables and keep everything
>> preallocated. Plus, a bvec_iter operates on a bvec which means
>> we'll need a table there as well... Not liking it so far...
> 
> In case of bios in one request, we can't know how many bvecs there
> are except for calling rq_bvecs(), so it may not be suitable to
> preallocate the table. If you have to send the IO request in one send(),
> runtime allocation may be inevitable.

I don't want to do that, I want to work on a single bvec at a time like
the current implementation does.

> If you don't require to send the IO request in one send(), you may send
> one bio in one time, and just uses the bio's bvec table directly,
> such as the single bio case in lo_rw_aio().

we'd need some indication that we need to reinit my iter with the
new bvec, today we do:

static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
                 int len)
{
         req->snd.data_sent += len;
         req->pdu_sent += len;
         iov_iter_advance(&req->snd.iter, len);
         if (!iov_iter_count(&req->snd.iter) &&
             req->snd.data_sent < req->data_len) {
                 req->snd.curr_bio = req->snd.curr_bio->bi_next;
                 nvme_tcp_init_send_iter(req);
         }
}

and initialize the send iter. I imagine that now I will need to
switch to the next bvec and only if I'm on the last I need to
use the next bio...

Do you offer an API for that?


>>> can this way avoid your blocking issue? You may see this
>>> example in branch 'rq->bio != rq->biotail' of lo_rw_aio().
>>
>> This is exactly an example of not ignoring the bios...
> 
> Yeah, that is the most common example, given merge is enabled
> in most of cases. If the driver or device doesn't care merge,
> you can disable it and always get single bio request, then the
> bio's bvec table can be reused for send().

Does bvec_iter span bvecs with your patches? I didn't see that change?

>> I'm not sure how this helps me either. Unless we can set a bvec_iter to
>> span bvecs or have an abstract bio crossing when we re-initialize the
>> bvec_iter I don't see how I can ignore bios completely...
> 
> rq_for_each_bvec() will iterate over all bvecs from all bios, so you
> needn't to see any bio in this req.

But I don't need this iteration, I need a transparent API like:
bvec2 = rq_bvec_next(rq, bvec)

This way I can simply always reinit my iter without thinking about how
the request/bios/bvecs are constructed...

> rq_bvecs() will return how many bvecs there are in this request(cover
> all bios in this req)

Still not very useful given that I don't want to use a table...

>>> So looks nvme-tcp host driver might be the 2nd driver which benefits
>>> from multi-page bvec directly.
>>>
>>> The multi-page bvec V11 has passed my tests and addressed almost
>>> all the comments during review on V10. I removed bio_vecs() in V11,
>>> but it won't be big deal, we can introduce them anytime when there
>>> is the requirement.
>>
>> multipage-bvecs and nvme-tcp are going to conflict, so it would be good
>> to coordinate on this. I think that nvme-tcp host needs some adjustments
>> as setting a bvec_iter. I'm under the impression that the change is rather
>> small and self-contained, but I'm not sure I have the full
>> picture here.
> 
> I guess I may not get your exact requirement on block io iterator from nvme-tcp
> too, :-(

They are pretty much listed above. Today nvme-tcp sets an iterator with:

vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
nsegs = bio_segments(bio);
size = bio->bi_iter.bi_size;
offset = bio->bi_iter.bi_bvec_done;
iov_iter_bvec(&req->snd.iter, WRITE, vec, nsegs, size);

and when done, iterate to the next bio and do the same.

With multipage bvec it would be great if we can simply have
something like rq_bvec_next() that would pretty much satisfy
the requirements from the nvme-tcp side...
Sagi Grimberg Nov. 21, 2018, 4:42 a.m. UTC | #11
>> Yeah, that is the most common example, given merge is enabled
>> in most of cases. If the driver or device doesn't care merge,
>> you can disable it and always get single bio request, then the
>> bio's bvec table can be reused for send().
> 
> Does bvec_iter span bvecs with your patches? I didn't see that change?

Wait, I see that the bvec is still a single array per bio. When you said
a table I thought you meant a 2-dimensional array...

Unless I'm mistaken, I think that the change is pretty simple then.
However, nvme-tcp still needs to be bio aware unless we have some
abstraction in place. Which will mean that nvme-tcp will need to
open-code bio_bvecs.
Ming Lei Nov. 21, 2018, 5:04 a.m. UTC | #12
On Tue, Nov 20, 2018 at 08:42:04PM -0800, Sagi Grimberg wrote:
> 
> > > Yeah, that is the most common example, given merge is enabled
> > > in most of cases. If the driver or device doesn't care merge,
> > > you can disable it and always get single bio request, then the
> > > bio's bvec table can be reused for send().
> > 
> > Does bvec_iter span bvecs with your patches? I didn't see that change?
> 
> Wait, I see that the bvec is still a single array per bio. When you said
> a table I thought you meant a 2-dimentional array...

I mean a new 1-d table A has to be created for multiple bios in one rq,
and build it in the following way

           rq_for_each_bvec(tmp, rq, rq_iter)
                    *A = tmp;

Then you can pass A to iov_iter_bvec() & send().

Given it is over TCP, I guess it should be doable for you to preallocate one
256-bvec table in one page for each request, then sets the max segment size as
(unsigned int)-1, and max segment number as 256, the preallocated table
should work anytime.


Thanks,
Ming
Sagi Grimberg Nov. 21, 2018, 5:35 a.m. UTC | #13
>> Wait, I see that the bvec is still a single array per bio. When you said
>> a table I thought you meant a 2-dimentional array...
> 
> I mean a new 1-d table A has to be created for multiple bios in one rq,
> and build it in the following way
> 
>             rq_for_each_bvec(tmp, rq, rq_iter)
>                      *A = tmp;
> 
> Then you can pass A to iov_iter_bvec() & send().
> 
> Given it is over TCP, I guess it should be doable for you to preallocate one
> 256-bvec table in one page for each request, then sets the max segment size as
> (unsigned int)-1, and max segment number as 256, the preallocated table
> should work anytime.

A 256-bvec table is really a lot to preallocate, especially when it's not
needed; I can easily initialize the bvec_iter on the bio bvec. If this
involves preallocation of the worst case then I don't consider this to
be an improvement.
Christoph Hellwig Nov. 21, 2018, 8:46 a.m. UTC | #14
On Tue, Nov 20, 2018 at 09:35:07PM -0800, Sagi Grimberg wrote:
>> Given it is over TCP, I guess it should be doable for you to preallocate one
>> 256-bvec table in one page for each request, then sets the max segment size as
>> (unsigned int)-1, and max segment number as 256, the preallocated table
>> should work anytime.
>
> 256 bvec table is really a lot to preallocate, especially when its not
> needed, I can easily initialize the bvec_iter on the bio bvec. If this
> involves preallocation of the worst-case than I don't consider this to
> be an improvement.

Ok, I took a look at the nvme-tcp code and it seems you care about
bios because you want a contiguous bio chunk for sending it down
the networking code.  Yes, in that case we sort of need to iterate
over bios.  But you already have a special case for discard, so you
don't really need any of the magic in the bio_bvecs() helper either
and can just count bi_vcnt in the bio.
Ming Lei Nov. 21, 2018, 10:19 a.m. UTC | #15
On Tue, Nov 20, 2018 at 09:35:07PM -0800, Sagi Grimberg wrote:
> 
> > > Wait, I see that the bvec is still a single array per bio. When you said
> > > a table I thought you meant a 2-dimentional array...
> > 
> > I mean a new 1-d table A has to be created for multiple bios in one rq,
> > and build it in the following way
> > 
> >             rq_for_each_bvec(tmp, rq, rq_iter)
> >                      *A = tmp;
> > 
> > Then you can pass A to iov_iter_bvec() & send().
> > 
> > Given it is over TCP, I guess it should be doable for you to preallocate one
> > 256-bvec table in one page for each request, then sets the max segment size as
> > (unsigned int)-1, and max segment number as 256, the preallocated table
> > should work anytime.
> 
> 256 bvec table is really a lot to preallocate, especially when its not
> needed, I can easily initialize the bvec_iter on the bio bvec. If this
> involves preallocation of the worst-case than I don't consider this to
> be an improvement.

If you don't provide one single bvec table, I understand you may not send
this req via one send().

The bvec_iter initialization is easy to do:

	bvec_iter = bio->bi_iter

when you move to a new bio; please refer to __bio_for_each_bvec() or
__bio_for_each_segment().

Thanks,
Ming
diff mbox series

Patch

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1f0dcf109841..3496c816946e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -196,7 +196,6 @@  static inline unsigned bio_segments(struct bio *bio)
 	 * We special case discard/write same/write zeroes, because they
 	 * interpret bi_size differently:
 	 */
-
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
@@ -205,13 +204,34 @@  static inline unsigned bio_segments(struct bio *bio)
 	case REQ_OP_WRITE_SAME:
 		return 1;
 	default:
-		break;
+		bio_for_each_segment(bv, bio, iter)
+			segs++;
+		return segs;
 	}
+}
 
-	bio_for_each_segment(bv, bio, iter)
-		segs++;
+static inline unsigned bio_bvecs(struct bio *bio)
+{
+	unsigned bvecs = 0;
+	struct bio_vec bv;
+	struct bvec_iter iter;
 
-	return segs;
+	/*
+	 * We special case discard/write same/write zeroes, because they
+	 * interpret bi_size differently:
+	 */
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
+	case REQ_OP_WRITE_ZEROES:
+		return 0;
+	case REQ_OP_WRITE_SAME:
+		return 1;
+	default:
+		bio_for_each_bvec(bv, bio, iter)
+			bvecs++;
+		return bvecs;
+	}
 }
 
 /*