
[V4] Btrfs: enhance raid1/10 balance heuristic

Message ID 20180425002017.25523-1-nefelim4ag@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Timofey Titovets April 25, 2018, 12:20 a.m. UTC
Currently the btrfs raid1/10 balancer distributes read requests to mirrors
based on pid % number of mirrors.

Make the logic aware of:
 - whether any of the underlying devices is non-rotational
 - the queue length of the underlying devices

By default keep the pid % num_mirrors guess, but:
 - if one of the mirrors is non-rotational, repick it as optimal
 - if another mirror has a shorter queue length than the current
   optimal, repick that mirror

To avoid round-robin request balancing, round the queue length down
(illustrated by the sketch below):
 - by 8 for rotational devices
 - by 2 when all devices are non-rotational
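
For illustration, a minimal userspace sketch of the heuristic (nothing
below is part of the patch; all names are made up for the example, the
real implementation is guess_optimal() in the diff):

/*
 * Simplified model of the mirror selection described above,
 * assuming a flat array of mirrors instead of a map_lookup.
 */
#include <stdbool.h>
#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) - ((x) % (a)))

struct mirror {
	bool nonrot;	/* non-rotational (ssd) */
	int qlen;	/* in-flight requests */
};

static int pick_mirror(const struct mirror *m, int num, int pid)
{
	int optimal = pid % num;	/* default: spread by pid */
	int round_down = 8;		/* coarse rounding for spinning rust */
	int i;

	/* round down by 2 only if every mirror is non-rotational */
	for (i = 0; i < num; i++)
		if (!m[i].nonrot)
			break;
	if (i == num)
		round_down = 2;

	/* in a mixed set, start from a non-rotational mirror */
	for (i = 0; i < num; i++)
		if (m[i].nonrot && !m[optimal].nonrot)
			optimal = i;

	/* repick if another mirror has a shorter (rounded down) queue */
	for (i = 0; i < num; i++)
		if (ALIGN_DOWN(m[i].qlen, round_down) <
		    ALIGN_DOWN(m[optimal].qlen, round_down))
			optimal = i;

	return optimal;
}

int main(void)
{
	struct mirror m[2] = { { .nonrot = true,  .qlen = 9 },
			       { .nonrot = false, .qlen = 0 } };

	/* ssd queue rounds down to 8, hdd to 0 -> the idle hdd is picked */
	printf("picked mirror %d\n", pick_mirror(m, 2, 12345));
	return 0;
}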

Changes:
  v1 -> v2:
    - Use the part_in_flight() helper from genhd.c
      to get the queue length
    - Move the guessing code to guess_optimal()
    - Change the balancer logic: use pid % num_mirrors by default,
      and balance onto spinning rust only if one of the underlying
      devices is overloaded
  v2 -> v3:
    - Fix the arg for RAID10 - use sub_stripes instead of num_stripes
  v3 -> v4:
    - Rebased on latest misc-next

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
---
 block/genhd.c      |   1 +
 fs/btrfs/volumes.c | 111 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 110 insertions(+), 2 deletions(-)

Comments

Misono Tomohiro April 25, 2018, 7:54 a.m. UTC | #1
On 2018/04/25 9:20, Timofey Titovets wrote:
> Currently btrfs raid1/10 balancer bаlance requests to mirrors,
> based on pid % num of mirrors.
> 
> Make logic understood:
>  - if one of underline devices are non rotational
>  - Queue leght to underline devices
> 
> By default try use pid % num_mirrors guessing, but:
>  - If one of mirrors are non rotational, repick optimal to it
>  - If underline mirror have less queue leght then optimal,
>    repick to that mirror
> 
> For avoid round-robin request balancing,
> lets round down queue leght:
>  - By 8 for rotational devs
>  - By 2 for all non rotational devs
> 
> Changes:
>   v1 -> v2:
>     - Use helper part_in_flight() from genhd.c
>       to get queue lenght
>     - Move guess code to guess_optimal()
>     - Change balancer logic, try use pid % mirror by default
>       Make balancing on spinning rust if one of underline devices
>       are overloaded
>   v2 -> v3:
>     - Fix arg for RAID10 - use sub_stripes, instead of num_stripes
>   v3 -> v4:
>     - Rebased on latest misc-next
> 
> Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
> ---
>  block/genhd.c      |   1 +
>  fs/btrfs/volumes.c | 111 ++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 110 insertions(+), 2 deletions(-)
> 
> diff --git a/block/genhd.c b/block/genhd.c
> index 9656f9e9f99e..5ea5acc88d3c 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
>  				atomic_read(&part->in_flight[1]);
>  	}
>  }
> +EXPORT_SYMBOL_GPL(part_in_flight);
>  
>  struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
>  {
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index c95af358b71f..fa7dd6ac087f 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -16,6 +16,7 @@
>  #include <linux/semaphore.h>
>  #include <linux/uuid.h>
>  #include <linux/list_sort.h>
> +#include <linux/genhd.h>
>  #include <asm/div64.h>
>  #include "ctree.h"
>  #include "extent_map.h"
> @@ -5148,7 +5149,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>  		/*
>  		 * There could be two corrupted data stripes, we need
>  		 * to loop retry in order to rebuild the correct data.
> -		 * 
> +		 *
>  		 * Fail a stripe at a time on every retry except the
>  		 * stripe under reconstruction.
>  		 */
> @@ -5201,6 +5202,111 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>  	return ret;
>  }
>  
> +/**
> + * bdev_get_queue_len - return rounded down in flight queue lenght of bdev
> + *
> + * @bdev: target bdev
> + * @round_down: round factor big for hdd and small for ssd, like 8 and 2
> + */
> +static int bdev_get_queue_len(struct block_device *bdev, int round_down)
> +{
> +	int sum;
> +	struct hd_struct *bd_part = bdev->bd_part;
> +	struct request_queue *rq = bdev_get_queue(bdev);
> +	uint32_t inflight[2] = {0, 0};
> +
> +	part_in_flight(rq, bd_part, inflight);
> +
> +	sum = max_t(uint32_t, inflight[0], inflight[1]);
> +
> +	/*
> +	 * Try prevent switch for every sneeze
> +	 * By roundup output num by some value
> +	 */
> +	return ALIGN_DOWN(sum, round_down);
> +}
> +
> +/**
> + * guess_optimal - return guessed optimal mirror
> + *
> + * Optimal expected to be pid % num_stripes
> + *
> + * That's generaly ok for spread load
> + * Add some balancer based on queue leght to device
> + *
> + * Basic ideas:
> + *  - Sequential read generate low amount of request
> + *    so if load of drives are equal, use pid % num_stripes balancing
> + *  - For mixed rotate/non-rotate mirrors, pick non-rotate as optimal
> + *    and repick if other dev have "significant" less queue lenght

The code looks like it always chooses the mirror with the lowest queue length,
regardless of how large the queue length difference is. So this "significant"
may be wrong?

> + *  - Repick optimal if queue leght of other mirror are less
> + */
> +static int guess_optimal(struct map_lookup *map, int num, int optimal)
> +{
> +	int i;
> +	int round_down = 8;
> +	int qlen[num];
> +	bool is_nonrot[num];
> +	bool all_bdev_nonrot = true;
> +	bool all_bdev_rotate = true;
> +	struct block_device *bdev;
> +
> +	if (num == 1)
> +		return optimal;
> +
> +	/* Check accessible bdevs */
> +	for (i = 0; i < num; i++) {
> +		/* Init for missing bdevs */
> +		is_nonrot[i] = false;
> +		qlen[i] = INT_MAX;
> +		bdev = map->stripes[i].dev->bdev;
> +		if (bdev) {
> +			qlen[i] = 0;
> +			is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
> +			if (is_nonrot[i])
> +				all_bdev_rotate = false;
> +			else
> +				all_bdev_nonrot = false;
> +		}
> +	}
> +
> +	/*
> +	 * Don't bother with computation
> +	 * if only one of two bdevs are accessible
> +	 */
> +	if (num == 2 && qlen[0] != qlen[1]) {
> +		if (qlen[0] < qlen[1])
> +			return 0;
> +		else
> +			return 1;
> +	}
> +
> +	if (all_bdev_nonrot)
> +		round_down = 2;
> +
> +	for (i = 0; i < num; i++) {
> +		if (qlen[i])
> +			continue;
> +		bdev = map->stripes[i].dev->bdev;
> +		qlen[i] = bdev_get_queue_len(bdev, round_down);
> +	}
> +
> +	/* For mixed case, pick non rotational dev as optimal */
> +	if (all_bdev_rotate == all_bdev_nonrot) {
> +		for (i = 0; i < num; i++) {
> +			if (is_nonrot[i])
> +				optimal = i;
> +		}
> +	}
> +
> +	for (i = 0; i < num; i++) {
> +		if (qlen[optimal] > qlen[i])
> +			optimal = i;
> +	}
> +
> +	return optimal;
> +}
> +
>  static int find_live_mirror(struct btrfs_fs_info *fs_info,
>  			    struct map_lookup *map, int first,
>  			    int dev_replace_is_ongoing)
> @@ -5219,7 +5325,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
>  	else
>  		num_stripes = map->num_stripes;
>  
> -	preferred_mirror = first + current->pid % num_stripes;
> +	preferred_mirror = first + guess_optimal(map, num_stripes,
> +						 current->pid % num_stripes);
>  
>  	if (dev_replace_is_ongoing &&
>  	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
> 

Timofey Titovets April 25, 2018, 8:15 a.m. UTC | #2
2018-04-25 10:54 GMT+03:00 Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>:
> On 2018/04/25 9:20, Timofey Titovets wrote:
>> Currently btrfs raid1/10 balancer bаlance requests to mirrors,
>> based on pid % num of mirrors.
>>
>> Make logic understood:
>>  - if one of underline devices are non rotational
>>  - Queue leght to underline devices
>>
>> By default try use pid % num_mirrors guessing, but:
>>  - If one of mirrors are non rotational, repick optimal to it
>>  - If underline mirror have less queue leght then optimal,
>>    repick to that mirror
>>
>> For avoid round-robin request balancing,
>> lets round down queue leght:
>>  - By 8 for rotational devs
>>  - By 2 for all non rotational devs
>>
>> Changes:
>>   v1 -> v2:
>>     - Use helper part_in_flight() from genhd.c
>>       to get queue lenght
>>     - Move guess code to guess_optimal()
>>     - Change balancer logic, try use pid % mirror by default
>>       Make balancing on spinning rust if one of underline devices
>>       are overloaded
>>   v2 -> v3:
>>     - Fix arg for RAID10 - use sub_stripes, instead of num_stripes
>>   v3 -> v4:
>>     - Rebased on latest misc-next
>>
>> Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
>> ---
>>  block/genhd.c      |   1 +
>>  fs/btrfs/volumes.c | 111 ++++++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 110 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/genhd.c b/block/genhd.c
>> index 9656f9e9f99e..5ea5acc88d3c 100644
>> --- a/block/genhd.c
>> +++ b/block/genhd.c
>> @@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
>>                               atomic_read(&part->in_flight[1]);
>>       }
>>  }
>> +EXPORT_SYMBOL_GPL(part_in_flight);
>>
>>  struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
>>  {
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index c95af358b71f..fa7dd6ac087f 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -16,6 +16,7 @@
>>  #include <linux/semaphore.h>
>>  #include <linux/uuid.h>
>>  #include <linux/list_sort.h>
>> +#include <linux/genhd.h>
>>  #include <asm/div64.h>
>>  #include "ctree.h"
>>  #include "extent_map.h"
>> @@ -5148,7 +5149,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>>               /*
>>                * There could be two corrupted data stripes, we need
>>                * to loop retry in order to rebuild the correct data.
>> -              *
>> +              *
>>                * Fail a stripe at a time on every retry except the
>>                * stripe under reconstruction.
>>                */
>> @@ -5201,6 +5202,111 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>>       return ret;
>>  }
>>
>> +/**
>> + * bdev_get_queue_len - return rounded down in flight queue lenght of bdev
>> + *
>> + * @bdev: target bdev
>> + * @round_down: round factor big for hdd and small for ssd, like 8 and 2
>> + */
>> +static int bdev_get_queue_len(struct block_device *bdev, int round_down)
>> +{
>> +     int sum;
>> +     struct hd_struct *bd_part = bdev->bd_part;
>> +     struct request_queue *rq = bdev_get_queue(bdev);
>> +     uint32_t inflight[2] = {0, 0};
>> +
>> +     part_in_flight(rq, bd_part, inflight);
>> +
>> +     sum = max_t(uint32_t, inflight[0], inflight[1]);
>> +
>> +     /*
>> +      * Try prevent switch for every sneeze
>> +      * By roundup output num by some value
>> +      */
>> +     return ALIGN_DOWN(sum, round_down);
>> +}
>> +
>> +/**
>> + * guess_optimal - return guessed optimal mirror
>> + *
>> + * Optimal expected to be pid % num_stripes
>> + *
>> + * That's generaly ok for spread load
>> + * Add some balancer based on queue leght to device
>> + *
>> + * Basic ideas:
>> + *  - Sequential read generate low amount of request
>> + *    so if load of drives are equal, use pid % num_stripes balancing
>> + *  - For mixed rotate/non-rotate mirrors, pick non-rotate as optimal
>> + *    and repick if other dev have "significant" less queue lenght
>
> The code looks always choosing the queue with the lowest length regardless
> of the amount of queue length difference. So, this "significant" may be wrong?

Yes, but before the code looks at the queue length, we round it down by 8;
maybe that is confusing because the ALIGN_DOWN is hidden inside
bdev_get_queue_len().

I don't think this is the "best" queue leveling solution, but it works.
In theory a comparison like ABS(diff) > some_num could be used, but that
would send some requests to the hdd more or less at random.

i.e. for ABS() > 8:
ssd hdd
9    0 -> hdd
9    1 -> ssd
10  1 -> hdd
10  2 -> ssd
And so on.

So in general that looks like:
ssd round_down(7, 8) = 0, hdd round_down(0, 8) = 0
-> ssd will be used
And
ssd round_down(9, 8) = 8, hdd round_down(0, 8) = 0
-> hdd will be used, as long as the hdd qlen < 8.

i.e. 'significant' depends on the round_down factor.
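
A tiny userspace sketch of that comparison (illustration only, not part
of the patch): with the pairs above, the ABS() variant flips between
devices, while the round_down comparison keeps the choice stable until
the ssd queue actually reaches the rounding factor:

#include <stdio.h>
#include <stdlib.h>

#define ROUND_DOWN(x, a) ((x) - ((x) % (a)))

int main(void)
{
	/* (ssd qlen, hdd qlen) pairs from the example above */
	static const int pairs[][2] = {
		{ 9, 0 }, { 9, 1 }, { 10, 1 }, { 10, 2 }, { 7, 0 }
	};
	unsigned int i;

	for (i = 0; i < sizeof(pairs) / sizeof(pairs[0]); i++) {
		int ssd = pairs[i][0], hdd = pairs[i][1];

		printf("ssd=%2d hdd=%d  ABS()>8 picks %s, round_down picks %s\n",
		       ssd, hdd,
		       abs(ssd - hdd) > 8 ? "hdd" : "ssd",
		       ROUND_DOWN(hdd, 8) < ROUND_DOWN(ssd, 8) ? "hdd" : "ssd");
	}
	return 0;
}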

Thanks.

>> + *  - Repick optimal if queue leght of other mirror are less
>> + */
>> +static int guess_optimal(struct map_lookup *map, int num, int optimal)
>> +{
>> +     int i;
>> +     int round_down = 8;
>> +     int qlen[num];
>> +     bool is_nonrot[num];
>> +     bool all_bdev_nonrot = true;
>> +     bool all_bdev_rotate = true;
>> +     struct block_device *bdev;
>> +
>> +     if (num == 1)
>> +             return optimal;
>> +
>> +     /* Check accessible bdevs */
>> +     for (i = 0; i < num; i++) {
>> +             /* Init for missing bdevs */
>> +             is_nonrot[i] = false;
>> +             qlen[i] = INT_MAX;
>> +             bdev = map->stripes[i].dev->bdev;
>> +             if (bdev) {
>> +                     qlen[i] = 0;
>> +                     is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
>> +                     if (is_nonrot[i])
>> +                             all_bdev_rotate = false;
>> +                     else
>> +                             all_bdev_nonrot = false;
>> +             }
>> +     }
>> +
>> +     /*
>> +      * Don't bother with computation
>> +      * if only one of two bdevs are accessible
>> +      */
>> +     if (num == 2 && qlen[0] != qlen[1]) {
>> +             if (qlen[0] < qlen[1])
>> +                     return 0;
>> +             else
>> +                     return 1;
>> +     }
>> +
>> +     if (all_bdev_nonrot)
>> +             round_down = 2;
>> +
>> +     for (i = 0; i < num; i++) {
>> +             if (qlen[i])
>> +                     continue;
>> +             bdev = map->stripes[i].dev->bdev;
>> +             qlen[i] = bdev_get_queue_len(bdev, round_down);
>> +     }
>> +
>> +     /* For mixed case, pick non rotational dev as optimal */
>> +     if (all_bdev_rotate == all_bdev_nonrot) {
>> +             for (i = 0; i < num; i++) {
>> +                     if (is_nonrot[i])
>> +                             optimal = i;
>> +             }
>> +     }
>> +
>> +     for (i = 0; i < num; i++) {
>> +             if (qlen[optimal] > qlen[i])
>> +                     optimal = i;
>> +     }
>> +
>> +     return optimal;
>> +}
>> +
>>  static int find_live_mirror(struct btrfs_fs_info *fs_info,
>>                           struct map_lookup *map, int first,
>>                           int dev_replace_is_ongoing)
>> @@ -5219,7 +5325,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
>>       else
>>               num_stripes = map->num_stripes;
>>
>> -     preferred_mirror = first + current->pid % num_stripes;
>> +     preferred_mirror = first + guess_optimal(map, num_stripes,
>> +                                              current->pid % num_stripes);
>>
>>       if (dev_replace_is_ongoing &&
>>           fs_info->dev_replace.cont_reading_from_srcdev_mode ==
>>
>
Misono Tomohiro April 25, 2018, 8:31 a.m. UTC | #3
On 2018/04/25 17:15, Timofey Titovets wrote:
> 2018-04-25 10:54 GMT+03:00 Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>:
>> On 2018/04/25 9:20, Timofey Titovets wrote:
>>> Currently btrfs raid1/10 balancer bаlance requests to mirrors,
>>> based on pid % num of mirrors.
>>>
>>> Make logic understood:
>>>  - if one of underline devices are non rotational
>>>  - Queue leght to underline devices
>>>
>>> By default try use pid % num_mirrors guessing, but:
>>>  - If one of mirrors are non rotational, repick optimal to it
>>>  - If underline mirror have less queue leght then optimal,
>>>    repick to that mirror
>>>
>>> For avoid round-robin request balancing,
>>> lets round down queue leght:
>>>  - By 8 for rotational devs
>>>  - By 2 for all non rotational devs
>>>
>>> Changes:
>>>   v1 -> v2:
>>>     - Use helper part_in_flight() from genhd.c
>>>       to get queue lenght
>>>     - Move guess code to guess_optimal()
>>>     - Change balancer logic, try use pid % mirror by default
>>>       Make balancing on spinning rust if one of underline devices
>>>       are overloaded
>>>   v2 -> v3:
>>>     - Fix arg for RAID10 - use sub_stripes, instead of num_stripes
>>>   v3 -> v4:
>>>     - Rebased on latest misc-next
>>>
>>> Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
>>> ---
>>>  block/genhd.c      |   1 +
>>>  fs/btrfs/volumes.c | 111 ++++++++++++++++++++++++++++++++++++++++++++-
>>>  2 files changed, 110 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/block/genhd.c b/block/genhd.c
>>> index 9656f9e9f99e..5ea5acc88d3c 100644
>>> --- a/block/genhd.c
>>> +++ b/block/genhd.c
>>> @@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
>>>                               atomic_read(&part->in_flight[1]);
>>>       }
>>>  }
>>> +EXPORT_SYMBOL_GPL(part_in_flight);
>>>
>>>  struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
>>>  {
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index c95af358b71f..fa7dd6ac087f 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -16,6 +16,7 @@
>>>  #include <linux/semaphore.h>
>>>  #include <linux/uuid.h>
>>>  #include <linux/list_sort.h>
>>> +#include <linux/genhd.h>
>>>  #include <asm/div64.h>
>>>  #include "ctree.h"
>>>  #include "extent_map.h"
>>> @@ -5148,7 +5149,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>>>               /*
>>>                * There could be two corrupted data stripes, we need
>>>                * to loop retry in order to rebuild the correct data.
>>> -              *
>>> +              *
>>>                * Fail a stripe at a time on every retry except the
>>>                * stripe under reconstruction.
>>>                */
>>> @@ -5201,6 +5202,111 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>>>       return ret;
>>>  }
>>>
>>> +/**
>>> + * bdev_get_queue_len - return rounded down in flight queue lenght of bdev
>>> + *
>>> + * @bdev: target bdev
>>> + * @round_down: round factor big for hdd and small for ssd, like 8 and 2
>>> + */
>>> +static int bdev_get_queue_len(struct block_device *bdev, int round_down)
>>> +{
>>> +     int sum;
>>> +     struct hd_struct *bd_part = bdev->bd_part;
>>> +     struct request_queue *rq = bdev_get_queue(bdev);
>>> +     uint32_t inflight[2] = {0, 0};
>>> +
>>> +     part_in_flight(rq, bd_part, inflight);
>>> +
>>> +     sum = max_t(uint32_t, inflight[0], inflight[1]);
>>> +
>>> +     /*
>>> +      * Try prevent switch for every sneeze
>>> +      * By roundup output num by some value
>>> +      */
>>> +     return ALIGN_DOWN(sum, round_down);
>>> +}
>>> +
>>> +/**
>>> + * guess_optimal - return guessed optimal mirror
>>> + *
>>> + * Optimal expected to be pid % num_stripes
>>> + *
>>> + * That's generaly ok for spread load
>>> + * Add some balancer based on queue leght to device
>>> + *
>>> + * Basic ideas:
>>> + *  - Sequential read generate low amount of request
>>> + *    so if load of drives are equal, use pid % num_stripes balancing
>>> + *  - For mixed rotate/non-rotate mirrors, pick non-rotate as optimal
>>> + *    and repick if other dev have "significant" less queue lenght
>>
>> The code looks always choosing the queue with the lowest length regardless
>> of the amount of queue length difference. So, this "significant" may be wrong?
> 
> yes, but before code looks at queue len, we do round_down by 8,
> may be you confused because i hide ALIGN_DOWN in bdev_get_queue_len()
> 
> I'm not think what this is "best" leveling queue leveling solution,
> but that's works.
> In theory ABS() <> some_num, can be used, but that's will lead to
> random send of some requests to hdd.
> 
> i.e. for ABS() > 8:
> ssd hdd
> 9    0 -> hdd
> 9    1 -> ssd
> 10  1 -> hdd
> 10  2 -> ssd
> And so on.
> 
> So in general that will looks like:
> ssd rount_down(7, 8) = 0, hdd round_down(0, 8) = 0
> ssd will be used
> And
> ssd round_down(9, 8) = 8, hdd round_down(0,8) = 0
> hdd will be used, while hdd qlen < 8.
> 
> i.e. 'significant' depends on round_down factor.
> 
> Thanks.

I see, thanks for the explanation.
Tomohiro Misono

> 
>>> + *  - Repick optimal if queue leght of other mirror are less
>>> + */
>>> +static int guess_optimal(struct map_lookup *map, int num, int optimal)
>>> +{
>>> +     int i;
>>> +     int round_down = 8;
>>> +     int qlen[num];
>>> +     bool is_nonrot[num];
>>> +     bool all_bdev_nonrot = true;
>>> +     bool all_bdev_rotate = true;
>>> +     struct block_device *bdev;
>>> +
>>> +     if (num == 1)
>>> +             return optimal;
>>> +
>>> +     /* Check accessible bdevs */
>>> +     for (i = 0; i < num; i++) {
>>> +             /* Init for missing bdevs */
>>> +             is_nonrot[i] = false;
>>> +             qlen[i] = INT_MAX;
>>> +             bdev = map->stripes[i].dev->bdev;
>>> +             if (bdev) {
>>> +                     qlen[i] = 0;
>>> +                     is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
>>> +                     if (is_nonrot[i])
>>> +                             all_bdev_rotate = false;
>>> +                     else
>>> +                             all_bdev_nonrot = false;
>>> +             }
>>> +     }
>>> +
>>> +     /*
>>> +      * Don't bother with computation
>>> +      * if only one of two bdevs are accessible
>>> +      */
>>> +     if (num == 2 && qlen[0] != qlen[1]) {
>>> +             if (qlen[0] < qlen[1])
>>> +                     return 0;
>>> +             else
>>> +                     return 1;
>>> +     }
>>> +
>>> +     if (all_bdev_nonrot)
>>> +             round_down = 2;
>>> +
>>> +     for (i = 0; i < num; i++) {
>>> +             if (qlen[i])
>>> +                     continue;
>>> +             bdev = map->stripes[i].dev->bdev;
>>> +             qlen[i] = bdev_get_queue_len(bdev, round_down);
>>> +     }
>>> +
>>> +     /* For mixed case, pick non rotational dev as optimal */
>>> +     if (all_bdev_rotate == all_bdev_nonrot) {
>>> +             for (i = 0; i < num; i++) {
>>> +                     if (is_nonrot[i])
>>> +                             optimal = i;
>>> +             }
>>> +     }
>>> +
>>> +     for (i = 0; i < num; i++) {
>>> +             if (qlen[optimal] > qlen[i])
>>> +                     optimal = i;
>>> +     }
>>> +
>>> +     return optimal;
>>> +}
>>> +
>>>  static int find_live_mirror(struct btrfs_fs_info *fs_info,
>>>                           struct map_lookup *map, int first,
>>>                           int dev_replace_is_ongoing)
>>> @@ -5219,7 +5325,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
>>>       else
>>>               num_stripes = map->num_stripes;
>>>
>>> -     preferred_mirror = first + current->pid % num_stripes;
>>> +     preferred_mirror = first + guess_optimal(map, num_stripes,
>>> +                                              current->pid % num_stripes);
>>>
>>>       if (dev_replace_is_ongoing &&
>>>           fs_info->dev_replace.cont_reading_from_srcdev_mode ==
>>>
>>
> 
> 
> 


Patch

diff --git a/block/genhd.c b/block/genhd.c
index 9656f9e9f99e..5ea5acc88d3c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -81,6 +81,7 @@  void part_in_flight(struct request_queue *q, struct hd_struct *part,
 				atomic_read(&part->in_flight[1]);
 	}
 }
+EXPORT_SYMBOL_GPL(part_in_flight);
 
 struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c95af358b71f..fa7dd6ac087f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -16,6 +16,7 @@ 
 #include <linux/semaphore.h>
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
+#include <linux/genhd.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -5148,7 +5149,7 @@  int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		/*
 		 * There could be two corrupted data stripes, we need
 		 * to loop retry in order to rebuild the correct data.
-		 * 
+		 *
 		 * Fail a stripe at a time on every retry except the
 		 * stripe under reconstruction.
 		 */
@@ -5201,6 +5202,111 @@  int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+/**
+ * bdev_get_queue_len - return the rounded down in-flight queue length of a bdev
+ *
+ * @bdev: target bdev
+ * @round_down: rounding factor, big for hdd and small for ssd, e.g. 8 and 2
+ */
+static int bdev_get_queue_len(struct block_device *bdev, int round_down)
+{
+	int sum;
+	struct hd_struct *bd_part = bdev->bd_part;
+	struct request_queue *rq = bdev_get_queue(bdev);
+	uint32_t inflight[2] = {0, 0};
+
+	part_in_flight(rq, bd_part, inflight);
+
+	sum = max_t(uint32_t, inflight[0], inflight[1]);
+
+	/*
+	 * Try to prevent switching mirrors on every small fluctuation
+	 * by rounding the output down by some value
+	 */
+	return ALIGN_DOWN(sum, round_down);
+}
+
+/**
+ * guess_optimal - return guessed optimal mirror
+ *
+ * Optimal expected to be pid % num_stripes
+ *
+ * That's generaly ok for spread load
+ * Add some balancer based on queue leght to device
+ *
+ * Basic ideas:
+ *  - Sequential read generate low amount of request
+ *    so if load of drives are equal, use pid % num_stripes balancing
+ *  - For mixed rotate/non-rotate mirrors, pick non-rotate as optimal
+ *    and repick if other dev have "significant" less queue lenght
+ *  - Repick optimal if queue leght of other mirror are less
+ */
+static int guess_optimal(struct map_lookup *map, int num, int optimal)
+{
+	int i;
+	int round_down = 8;
+	int qlen[num];
+	bool is_nonrot[num];
+	bool all_bdev_nonrot = true;
+	bool all_bdev_rotate = true;
+	struct block_device *bdev;
+
+	if (num == 1)
+		return optimal;
+
+	/* Check accessible bdevs */
+	for (i = 0; i < num; i++) {
+		/* Init for missing bdevs */
+		is_nonrot[i] = false;
+		qlen[i] = INT_MAX;
+		bdev = map->stripes[i].dev->bdev;
+		if (bdev) {
+			qlen[i] = 0;
+			is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
+			if (is_nonrot[i])
+				all_bdev_rotate = false;
+			else
+				all_bdev_nonrot = false;
+		}
+	}
+
+	/*
+	 * Don't bother with computation
+	 * if only one of two bdevs is accessible
+	 */
+	if (num == 2 && qlen[0] != qlen[1]) {
+		if (qlen[0] < qlen[1])
+			return 0;
+		else
+			return 1;
+	}
+
+	if (all_bdev_nonrot)
+		round_down = 2;
+
+	for (i = 0; i < num; i++) {
+		if (qlen[i])
+			continue;
+		bdev = map->stripes[i].dev->bdev;
+		qlen[i] = bdev_get_queue_len(bdev, round_down);
+	}
+
+	/* For the mixed case, pick a non-rotational dev as optimal */
+	if (all_bdev_rotate == all_bdev_nonrot) {
+		for (i = 0; i < num; i++) {
+			if (is_nonrot[i])
+				optimal = i;
+		}
+	}
+
+	for (i = 0; i < num; i++) {
+		if (qlen[optimal] > qlen[i])
+			optimal = i;
+	}
+
+	return optimal;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first,
 			    int dev_replace_is_ongoing)
@@ -5219,7 +5325,8 @@  static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	else
 		num_stripes = map->num_stripes;
 
-	preferred_mirror = first + current->pid % num_stripes;
+	preferred_mirror = first + guess_optimal(map, num_stripes,
+						 current->pid % num_stripes);
 
 	if (dev_replace_is_ongoing &&
 	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==