diff mbox series

block: optimise bvec_iter_advance()

Message ID 06b1b796b8d9bcaa6d5b325668525b7a5663035b.1574974574.git.asml.silence@gmail.com (mailing list archive)
State New, archived
Headers show
Series block: optimise bvec_iter_advance() | expand

Commit Message

Pavel Begunkov Nov. 28, 2019, 9:04 p.m. UTC
bvec_iter_advance() is quite popular, but compilers fail to do proper
alias analysis and optimise it good enough. The assembly is checked
for gcc 9.2, x86-64.

- remove @iter->bi_size from min(...), as it's always less than @bytes.
Modify at the beginning and forget about it.

- the compiler isn't able to collapse memory dependencies and remove
writes in the loop. Help it by explicitely using local vars.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/bvec.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

Comments

Arvind Sankar Nov. 29, 2019, 10:17 p.m. UTC | #1
On Fri, Nov 29, 2019 at 12:04:37AM +0300, Pavel Begunkov wrote:
> bvec_iter_advance() is quite popular, but compilers fail to do proper
> alias analysis and optimise it good enough. The assembly is checked
> for gcc 9.2, x86-64.
> 
> - remove @iter->bi_size from min(...), as it's always less than @bytes.
> Modify at the beginning and forget about it.
> 
> - the compiler isn't able to collapse memory dependencies and remove
> writes in the loop. Help it by explicitely using local vars.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  include/linux/bvec.h | 23 ++++++++++++++---------
>  1 file changed, 14 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/bvec.h b/include/linux/bvec.h
> index a032f01e928c..7b2f05faae14 100644
> --- a/include/linux/bvec.h
> +++ b/include/linux/bvec.h
> @@ -87,26 +87,31 @@ struct bvec_iter_all {
>  static inline bool bvec_iter_advance(const struct bio_vec *bv,
>  		struct bvec_iter *iter, unsigned bytes)
>  {
> +	unsigned int done = iter->bi_bvec_done;
> +	unsigned int idx = iter->bi_idx;
> +
>  	if (WARN_ONCE(bytes > iter->bi_size,
>  		     "Attempted to advance past end of bvec iter\n")) {
>  		iter->bi_size = 0;
>  		return false;
>  	}
>  
> +	iter->bi_size -= bytes;
> +
>  	while (bytes) {
> -		const struct bio_vec *cur = bv + iter->bi_idx;
> -		unsigned len = min3(bytes, iter->bi_size,
> -				    cur->bv_len - iter->bi_bvec_done);
> +		const struct bio_vec *cur = bv + idx;
> +		unsigned int len = min(bytes, cur->bv_len - done);
>  
>  		bytes -= len;
> -		iter->bi_size -= len;
> -		iter->bi_bvec_done += len;
> -
> -		if (iter->bi_bvec_done == cur->bv_len) {
> -			iter->bi_bvec_done = 0;
> -			iter->bi_idx++;
> +		done += len;
> +		if (done == cur->bv_len) {
> +			idx++;
> +			done = 0;
>  		}
>  	}
> +
> +	iter->bi_idx = idx;
> +	iter->bi_bvec_done = done;
>  	return true;
>  }
>  
> -- 
> 2.24.0
> 

The loop can be simplified a bit further, as done has to be 0 once we go
beyond the current bio_vec. See below for the simplified version.

I also check if bi_size became zero so we can skip the rest of the
calculations in that case. If we want to preserve the current behavior of
updating iter->bi_idx and iter->bi_bvec_done even if bi_size is going to
become zero, the loop condition should change to

		while (bytes && bytes >= cur->bv_len)

to ensure that we don't try to access past the end of the bio_vec array.

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index a032f01e928c..380d188dfbd2 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -87,25 +87,26 @@ struct bvec_iter_all {
 static inline bool bvec_iter_advance(const struct bio_vec *bv,
 		struct bvec_iter *iter, unsigned bytes)
 {
+	unsigned int idx = iter->bi_idx;
+	const struct bio_vec *cur = bv + idx;
+
 	if (WARN_ONCE(bytes > iter->bi_size,
 		     "Attempted to advance past end of bvec iter\n")) {
 		iter->bi_size = 0;
 		return false;
 	}
 
-	while (bytes) {
-		const struct bio_vec *cur = bv + iter->bi_idx;
-		unsigned len = min3(bytes, iter->bi_size,
-				    cur->bv_len - iter->bi_bvec_done);
-
-		bytes -= len;
-		iter->bi_size -= len;
-		iter->bi_bvec_done += len;
-
-		if (iter->bi_bvec_done == cur->bv_len) {
-			iter->bi_bvec_done = 0;
-			iter->bi_idx++;
+	iter->bi_size -= bytes;
+	if (iter->bi_size != 0) {
+		bytes += iter->bi_bvec_done;
+		while (bytes >= cur->bv_len) {
+			bytes -= cur->bv_len;
+			idx++;
+			cur++;
 		}
+
+		iter->bi_idx = idx;
+		iter->bi_bvec_done = bytes;
 	}
 	return true;
 }
Pavel Begunkov Nov. 29, 2019, 10:47 p.m. UTC | #2
On 30/11/2019 01:17, Arvind Sankar wrote:
> 
> The loop can be simplified a bit further, as done has to be 0 once we go
> beyond the current bio_vec. See below for the simplified version.
> 

Thanks for the suggestion! I thought about it, and decided to not
for several reasons. I prefer to not fine-tune and give compilers
more opportunity to do their job. And it's already fast enough with
modern architectures (MOVcc, complex addressing, etc).

Also need to consider code clarity and the fact, that this is inline,
so should be brief and register-friendly.


> I also check if bi_size became zero so we can skip the rest of the
> calculations in that case. If we want to preserve the current behavior of
> updating iter->bi_idx and iter->bi_bvec_done even if bi_size is going to
> become zero, the loop condition should change to
> 
> 		while (bytes && bytes >= cur->bv_len)

Probably, it's better to leave it in a consistent state. Shouldn't be
a problem, but never know when and who will screw it up. 

> 
> to ensure that we don't try to access past the end of the bio_vec array.
> 
> diff --git a/include/linux/bvec.h b/include/linux/bvec.h
> index a032f01e928c..380d188dfbd2 100644
> --- a/include/linux/bvec.h
> +++ b/include/linux/bvec.h
> @@ -87,25 +87,26 @@ struct bvec_iter_all {
>  static inline bool bvec_iter_advance(const struct bio_vec *bv,
>  		struct bvec_iter *iter, unsigned bytes)
>  {
> +	unsigned int idx = iter->bi_idx;
> +	const struct bio_vec *cur = bv + idx;
> +
>  	if (WARN_ONCE(bytes > iter->bi_size,
>  		     "Attempted to advance past end of bvec iter\n")) {
>  		iter->bi_size = 0;
>  		return false;
>  	}
>  
> -	while (bytes) {
> -		const struct bio_vec *cur = bv + iter->bi_idx;
> -		unsigned len = min3(bytes, iter->bi_size,
> -				    cur->bv_len - iter->bi_bvec_done);
> -
> -		bytes -= len;
> -		iter->bi_size -= len;
> -		iter->bi_bvec_done += len;
> -
> -		if (iter->bi_bvec_done == cur->bv_len) {
> -			iter->bi_bvec_done = 0;
> -			iter->bi_idx++;
> +	iter->bi_size -= bytes;
> +	if (iter->bi_size != 0) {
> +		bytes += iter->bi_bvec_done;
> +		while (bytes >= cur->bv_len) {
> +			bytes -= cur->bv_len;
> +			idx++;
> +			cur++;
>  		}
> +
> +		iter->bi_idx = idx;
> +		iter->bi_bvec_done = bytes;
>  	}
>  	return true;
>  }
>
Arvind Sankar Nov. 29, 2019, 11:24 p.m. UTC | #3
On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
> On 30/11/2019 01:17, Arvind Sankar wrote:
> > 
> > The loop can be simplified a bit further, as done has to be 0 once we go
> > beyond the current bio_vec. See below for the simplified version.
> > 
> 
> Thanks for the suggestion! I thought about it, and decided to not
> for several reasons. I prefer to not fine-tune and give compilers
> more opportunity to do their job. And it's already fast enough with
> modern architectures (MOVcc, complex addressing, etc).
> 
> Also need to consider code clarity and the fact, that this is inline,
> so should be brief and register-friendly.
> 

It should be more register-friendly, as it uses fewer variables, and I
think it's easier to see what the loop is doing, i.e. that we advance
one bio_vec per iteration: in the existing code, it takes a bit of
thinking to see that we won't spend more than one iteration within the
same bio_vec.

I don't see this as fine-tuning, rather simplifying the code. I do agree
that it's not going to make much difference for performance of the loop
itself, as the most common case I think is that we either stay in the
current bio_vec or advance by one.

> 
> > I also check if bi_size became zero so we can skip the rest of the
> > calculations in that case. If we want to preserve the current behavior of
> > updating iter->bi_idx and iter->bi_bvec_done even if bi_size is going to
> > become zero, the loop condition should change to
> > 
> > 		while (bytes && bytes >= cur->bv_len)
> 
> Probably, it's better to leave it in a consistent state. Shouldn't be
> a problem, but never know when and who will screw it up. 
> 

The WARN_ONCE case does leave it inconsistent, though that's not
supposed to happen, so less of a pitfall there.
Pavel Begunkov Nov. 30, 2019, 9:22 a.m. UTC | #4
On 30/11/2019 02:24, Arvind Sankar wrote:
> On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
>> On 30/11/2019 01:17, Arvind Sankar wrote:
>>>
>>> The loop can be simplified a bit further, as done has to be 0 once we go
>>> beyond the current bio_vec. See below for the simplified version.
>>>
>>
>> Thanks for the suggestion! I thought about it, and decided to not
>> for several reasons. I prefer to not fine-tune and give compilers
>> more opportunity to do their job. And it's already fast enough with
>> modern architectures (MOVcc, complex addressing, etc).
>>
>> Also need to consider code clarity and the fact, that this is inline,
>> so should be brief and register-friendly.
>>
> 
> It should be more register-friendly, as it uses fewer variables, and I
> think it's easier to see what the loop is doing, i.e. that we advance
> one bio_vec per iteration: in the existing code, it takes a bit of
> thinking to see that we won't spend more than one iteration within the
> same bio_vec.

Yeah, may be. It's more the matter of preference then. I don't think
it's simpler, and performance is entirely depends on a compiler and 
input. But, that's rather subjective and IMHO not worth of time.

Anyway, thanks for thinking this through!

> 
> I don't see this as fine-tuning, rather simplifying the code. I do agree
> that it's not going to make much difference for performance of the loop
> itself, as the most common case I think is that we either stay in the
> current bio_vec or advance by one.
> 
>>
>>> I also check if bi_size became zero so we can skip the rest of the
>>> calculations in that case. If we want to preserve the current behavior of
>>> updating iter->bi_idx and iter->bi_bvec_done even if bi_size is going to
>>> become zero, the loop condition should change to
>>>
>>> 		while (bytes && bytes >= cur->bv_len)
>>
>> Probably, it's better to leave it in a consistent state. Shouldn't be
>> a problem, but never know when and who will screw it up. 
>>
> 
> The WARN_ONCE case does leave it inconsistent, though that's not
> supposed to happen, so less of a pitfall there.
> 

But I hope, this WARN_ONCE won't ever happen, but I wouldn't be
suprised by code like:

last_page = (bv + iter->idx - 1)->page.
Arvind Sankar Nov. 30, 2019, 6:56 p.m. UTC | #5
On Sat, Nov 30, 2019 at 12:22:27PM +0300, Pavel Begunkov wrote:
> On 30/11/2019 02:24, Arvind Sankar wrote:
> > On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
> >> On 30/11/2019 01:17, Arvind Sankar wrote:
> >>>
> >>> The loop can be simplified a bit further, as done has to be 0 once we go
> >>> beyond the current bio_vec. See below for the simplified version.
> >>>
> >>
> >> Thanks for the suggestion! I thought about it, and decided to not
> >> for several reasons. I prefer to not fine-tune and give compilers
> >> more opportunity to do their job. And it's already fast enough with
> >> modern architectures (MOVcc, complex addressing, etc).
> >>
> >> Also need to consider code clarity and the fact, that this is inline,
> >> so should be brief and register-friendly.
> >>
> > 
> > It should be more register-friendly, as it uses fewer variables, and I
> > think it's easier to see what the loop is doing, i.e. that we advance
> > one bio_vec per iteration: in the existing code, it takes a bit of
> > thinking to see that we won't spend more than one iteration within the
> > same bio_vec.
> 
> Yeah, may be. It's more the matter of preference then. I don't think
> it's simpler, and performance is entirely depends on a compiler and 
> input. But, that's rather subjective and IMHO not worth of time.
> 
> Anyway, thanks for thinking this through!
> 

You don't find listing 1 simpler than listing 2? It does save one
register, as it doesn't have to keep track of done independently from
bytes. This is always going to be the case unless the compiler can
eliminate done by transforming Listing 2 into Listing 1. Unfortunately,
even if it gets much smarter, it's unlikely to be able to do that,
because they're equivalent only if there is no overflow, so it would
need to know that bytes + iter->bi_bvec_done cannot overflow, and that
iter->bi_bvec_done must be smaller than cur->bv_len initially.

Listing 1:

	bytes += iter->bi_bvec_done;
	while (bytes) {
		const struct bio_vec *cur = bv + idx;

		if (bytes < cur->bv_len)
			break;
		bytes -= cur->bv_len;
		idx++;
	}

	iter->bi_idx = idx;
	iter->bi_bvec_done = bytes;

Listing 2:

	while (bytes) {
		const struct bio_vec *cur = bv + idx;
		unsigned int len = min(bytes, cur->bv_len - done);

		bytes -= len;
		done += len;
		if (done == cur->bv_len) {
			idx++;
			done = 0;
		}
	}

	iter->bi_idx = idx;
	iter->bi_bvec_done = done;
Jens Axboe Nov. 30, 2019, 6:57 p.m. UTC | #6
On 11/30/19 10:56 AM, Arvind Sankar wrote:
> On Sat, Nov 30, 2019 at 12:22:27PM +0300, Pavel Begunkov wrote:
>> On 30/11/2019 02:24, Arvind Sankar wrote:
>>> On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
>>>> On 30/11/2019 01:17, Arvind Sankar wrote:
>>>>>
>>>>> The loop can be simplified a bit further, as done has to be 0 once we go
>>>>> beyond the current bio_vec. See below for the simplified version.
>>>>>
>>>>
>>>> Thanks for the suggestion! I thought about it, and decided to not
>>>> for several reasons. I prefer to not fine-tune and give compilers
>>>> more opportunity to do their job. And it's already fast enough with
>>>> modern architectures (MOVcc, complex addressing, etc).
>>>>
>>>> Also need to consider code clarity and the fact, that this is inline,
>>>> so should be brief and register-friendly.
>>>>
>>>
>>> It should be more register-friendly, as it uses fewer variables, and I
>>> think it's easier to see what the loop is doing, i.e. that we advance
>>> one bio_vec per iteration: in the existing code, it takes a bit of
>>> thinking to see that we won't spend more than one iteration within the
>>> same bio_vec.
>>
>> Yeah, may be. It's more the matter of preference then. I don't think
>> it's simpler, and performance is entirely depends on a compiler and
>> input. But, that's rather subjective and IMHO not worth of time.
>>
>> Anyway, thanks for thinking this through!
>>
> 
> You don't find listing 1 simpler than listing 2? It does save one
> register, as it doesn't have to keep track of done independently from
> bytes. This is always going to be the case unless the compiler can
> eliminate done by transforming Listing 2 into Listing 1. Unfortunately,
> even if it gets much smarter, it's unlikely to be able to do that,
> because they're equivalent only if there is no overflow, so it would
> need to know that bytes + iter->bi_bvec_done cannot overflow, and that
> iter->bi_bvec_done must be smaller than cur->bv_len initially.
> 
> Listing 1:
> 
> 	bytes += iter->bi_bvec_done;
> 	while (bytes) {
> 		const struct bio_vec *cur = bv + idx;
> 
> 		if (bytes < cur->bv_len)
> 			break;
> 		bytes -= cur->bv_len;
> 		idx++;
> 	}
> 
> 	iter->bi_idx = idx;
> 	iter->bi_bvec_done = bytes;
> 
> Listing 2:
> 
> 	while (bytes) {
> 		const struct bio_vec *cur = bv + idx;
> 		unsigned int len = min(bytes, cur->bv_len - done);
> 
> 		bytes -= len;
> 		done += len;
> 		if (done == cur->bv_len) {
> 			idx++;
> 			done = 0;
> 		}
> 	}
> 
> 	iter->bi_idx = idx;
> 	iter->bi_bvec_done = done;

Have yet to take a closer look (and benchmark) and the patches and
the generated code, but fwiw I do agree that case #1 is easier to
read.
Pavel Begunkov Nov. 30, 2019, 8:11 p.m. UTC | #7
On 30/11/2019 21:57, Jens Axboe wrote:
> On 11/30/19 10:56 AM, Arvind Sankar wrote:
>> On Sat, Nov 30, 2019 at 12:22:27PM +0300, Pavel Begunkov wrote:
>>> On 30/11/2019 02:24, Arvind Sankar wrote:
>>>> On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
>>>>> On 30/11/2019 01:17, Arvind Sankar wrote:
>>>>>>
>>>>>> The loop can be simplified a bit further, as done has to be 0 once we go
>>>>>> beyond the current bio_vec. See below for the simplified version.
>>>>>>
>>>>>
>>>>> Thanks for the suggestion! I thought about it, and decided to not
>>>>> for several reasons. I prefer to not fine-tune and give compilers
>>>>> more opportunity to do their job. And it's already fast enough with
>>>>> modern architectures (MOVcc, complex addressing, etc).
>>>>>
>>>>> Also need to consider code clarity and the fact, that this is inline,
>>>>> so should be brief and register-friendly.
>>>>>
>>>>
>>>> It should be more register-friendly, as it uses fewer variables, and I
>>>> think it's easier to see what the loop is doing, i.e. that we advance
>>>> one bio_vec per iteration: in the existing code, it takes a bit of
>>>> thinking to see that we won't spend more than one iteration within the
>>>> same bio_vec.
>>>
>>> Yeah, may be. It's more the matter of preference then. I don't think
>>> it's simpler, and performance is entirely depends on a compiler and
>>> input. But, that's rather subjective and IMHO not worth of time.
>>>
>>> Anyway, thanks for thinking this through!
>>>
>>
>> You don't find listing 1 simpler than listing 2? It does save one
>> register, as it doesn't have to keep track of done independently from
>> bytes. This is always going to be the case unless the compiler can
>> eliminate done by transforming Listing 2 into Listing 1. Unfortunately,
>> even if it gets much smarter, it's unlikely to be able to do that,
>> because they're equivalent only if there is no overflow, so it would
>> need to know that bytes + iter->bi_bvec_done cannot overflow, and that
>> iter->bi_bvec_done must be smaller than cur->bv_len initially.
>>
>> Listing 1:
>>
>>     bytes += iter->bi_bvec_done;
>>     while (bytes) {
>>         const struct bio_vec *cur = bv + idx;
>>
>>         if (bytes < cur->bv_len)
>>             break;
>>         bytes -= cur->bv_len;
>>         idx++;
>>     }
>>
>>     iter->bi_idx = idx;
>>     iter->bi_bvec_done = bytes;
>>
>> Listing 2:
>>
>>     while (bytes) {
>>         const struct bio_vec *cur = bv + idx;
>>         unsigned int len = min(bytes, cur->bv_len - done);
>>
>>         bytes -= len;
>>         done += len;
>>         if (done == cur->bv_len) {
>>             idx++;
>>             done = 0;
>>         }
>>     }
>>
>>     iter->bi_idx = idx;
>>     iter->bi_bvec_done = done;
> 
> Have yet to take a closer look (and benchmark) and the patches and
> the generated code, but fwiw I do agree that case #1 is easier to
> read.
> 
Ok, ok, I'm not keen on bike-shedding. I'll resend a simplified version
Jens Axboe Nov. 30, 2019, 8:56 p.m. UTC | #8
On 11/30/19 12:11 PM, Pavel Begunkov wrote:
> On 30/11/2019 21:57, Jens Axboe wrote:
>> On 11/30/19 10:56 AM, Arvind Sankar wrote:
>>> On Sat, Nov 30, 2019 at 12:22:27PM +0300, Pavel Begunkov wrote:
>>>> On 30/11/2019 02:24, Arvind Sankar wrote:
>>>>> On Sat, Nov 30, 2019 at 01:47:16AM +0300, Pavel Begunkov wrote:
>>>>>> On 30/11/2019 01:17, Arvind Sankar wrote:
>>>>>>>
>>>>>>> The loop can be simplified a bit further, as done has to be 0 once we go
>>>>>>> beyond the current bio_vec. See below for the simplified version.
>>>>>>>
>>>>>>
>>>>>> Thanks for the suggestion! I thought about it, and decided to not
>>>>>> for several reasons. I prefer to not fine-tune and give compilers
>>>>>> more opportunity to do their job. And it's already fast enough with
>>>>>> modern architectures (MOVcc, complex addressing, etc).
>>>>>>
>>>>>> Also need to consider code clarity and the fact, that this is inline,
>>>>>> so should be brief and register-friendly.
>>>>>>
>>>>>
>>>>> It should be more register-friendly, as it uses fewer variables, and I
>>>>> think it's easier to see what the loop is doing, i.e. that we advance
>>>>> one bio_vec per iteration: in the existing code, it takes a bit of
>>>>> thinking to see that we won't spend more than one iteration within the
>>>>> same bio_vec.
>>>>
>>>> Yeah, may be. It's more the matter of preference then. I don't think
>>>> it's simpler, and performance is entirely depends on a compiler and
>>>> input. But, that's rather subjective and IMHO not worth of time.
>>>>
>>>> Anyway, thanks for thinking this through!
>>>>
>>>
>>> You don't find listing 1 simpler than listing 2? It does save one
>>> register, as it doesn't have to keep track of done independently from
>>> bytes. This is always going to be the case unless the compiler can
>>> eliminate done by transforming Listing 2 into Listing 1. Unfortunately,
>>> even if it gets much smarter, it's unlikely to be able to do that,
>>> because they're equivalent only if there is no overflow, so it would
>>> need to know that bytes + iter->bi_bvec_done cannot overflow, and that
>>> iter->bi_bvec_done must be smaller than cur->bv_len initially.
>>>
>>> Listing 1:
>>>
>>>      bytes += iter->bi_bvec_done;
>>>      while (bytes) {
>>>          const struct bio_vec *cur = bv + idx;
>>>
>>>          if (bytes < cur->bv_len)
>>>              break;
>>>          bytes -= cur->bv_len;
>>>          idx++;
>>>      }
>>>
>>>      iter->bi_idx = idx;
>>>      iter->bi_bvec_done = bytes;
>>>
>>> Listing 2:
>>>
>>>      while (bytes) {
>>>          const struct bio_vec *cur = bv + idx;
>>>          unsigned int len = min(bytes, cur->bv_len - done);
>>>
>>>          bytes -= len;
>>>          done += len;
>>>          if (done == cur->bv_len) {
>>>              idx++;
>>>              done = 0;
>>>          }
>>>      }
>>>
>>>      iter->bi_idx = idx;
>>>      iter->bi_bvec_done = done;
>>
>> Have yet to take a closer look (and benchmark) and the patches and
>> the generated code, but fwiw I do agree that case #1 is easier to
>> read.
>>
> Ok, ok, I'm not keen on bike-shedding. I'll resend a simplified version

Sweet thanks. Make sure it's green.
diff mbox series

Patch

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index a032f01e928c..7b2f05faae14 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -87,26 +87,31 @@  struct bvec_iter_all {
 static inline bool bvec_iter_advance(const struct bio_vec *bv,
 		struct bvec_iter *iter, unsigned bytes)
 {
+	unsigned int done = iter->bi_bvec_done;
+	unsigned int idx = iter->bi_idx;
+
 	if (WARN_ONCE(bytes > iter->bi_size,
 		     "Attempted to advance past end of bvec iter\n")) {
 		iter->bi_size = 0;
 		return false;
 	}
 
+	iter->bi_size -= bytes;
+
 	while (bytes) {
-		const struct bio_vec *cur = bv + iter->bi_idx;
-		unsigned len = min3(bytes, iter->bi_size,
-				    cur->bv_len - iter->bi_bvec_done);
+		const struct bio_vec *cur = bv + idx;
+		unsigned int len = min(bytes, cur->bv_len - done);
 
 		bytes -= len;
-		iter->bi_size -= len;
-		iter->bi_bvec_done += len;
-
-		if (iter->bi_bvec_done == cur->bv_len) {
-			iter->bi_bvec_done = 0;
-			iter->bi_idx++;
+		done += len;
+		if (done == cur->bv_len) {
+			idx++;
+			done = 0;
 		}
 	}
+
+	iter->bi_idx = idx;
+	iter->bi_bvec_done = done;
 	return true;
 }