@@ -906,7 +906,8 @@ static bool btrfs_strmatch(const char *given, const char *golden)
return false;
}
-static const char * const btrfs_read_policy_name[] = { "pid" };
+/* Must follow the order as in enum btrfs_read_policy */
+static const char * const btrfs_read_policy_name[] = { "pid", "latency" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -14,6 +14,7 @@
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include <linux/part_stat.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
@@ -5468,6 +5469,54 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+/*
+ * Select the stripe whose device currently shows the lowest average read
+ * wait time, using the block layer's cumulative READ stats (nsecs / ios).
+ *
+ * Returns an index in [first, first + num_stripe); falls back to @first
+ * when no stripe has usable stats (e.g. all devices are missing).
+ */
+static int btrfs_find_best_stripe(struct btrfs_fs_info *fs_info,
+				  struct map_lookup *map, int first,
+				  int num_stripe)
+{
+	u64 est_wait = U64_MAX;
+	int best_stripe = first;
+	int index;
+
+	for (index = first; index < first + num_stripe; index++) {
+		u64 read_wait;
+		u64 avg_wait = 0;
+		unsigned long read_ios;
+		struct btrfs_device *device = map->stripes[index].dev;
+
+		/* Missing devices (degraded mount) have no bdev to read. */
+		if (!device->bdev)
+			continue;
+
+		read_wait = part_stat_read(device->bdev->bd_part, nsecs[READ]);
+		read_ios = part_stat_read(device->bdev->bd_part, ios[READ]);
+
+		if (read_ios)
+			avg_wait = div_u64(read_wait, read_ios);
+		else
+			btrfs_info_rl(fs_info,
+	"devid: %llu avg_wait ZERO read_wait %llu read_ios %lu",
+				      device->devid, read_wait, read_ios);
+
+		/*
+		 * Strictly-smaller comparison: an idle device (avg_wait == 0)
+		 * must not be overridden by a busier stripe later in the loop.
+		 */
+		if (avg_wait < est_wait) {
+			est_wait = avg_wait;
+			best_stripe = index;
+		}
+	}
+
+	return best_stripe;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first,
 			    int dev_replace_is_ongoing)
@@ -5498,6 +5532,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + current->pid % num_stripes;
break;
+ case BTRFS_READ_POLICY_LATENCY:
+ preferred_mirror = btrfs_find_best_stripe(fs_info, map, first,
+ num_stripes);
+ break;
}
if (dev_replace_is_ongoing &&
@@ -6114,7 +6152,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
u32 factor = map->num_stripes / map->sub_stripes;
-
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
@@ -217,6 +217,7 @@ enum btrfs_chunk_allocation_policy {
*/
enum btrfs_read_policy {
BTRFS_READ_POLICY_PID,
+ BTRFS_READ_POLICY_LATENCY,
BTRFS_NR_READ_POLICY,
};
The read policy type latency routes a read IO to the device that has the lowest historical average wait time for read IOs, as obtained from the kernel block layer's per-device statistics. (An earlier revision additionally factored in 1/10 of the inflight commands in the device queue — chosen because block device queue depths are typically greater than 1, so commands can be queued before earlier ones complete — but that inflight factor has been dropped in v1.) Signed-off-by: Anand Jain <anand.jain@oracle.com> --- v1: Drop part_stat_read_all; use part_stat_read instead. Drop the inflight factor. fs/btrfs/sysfs.c | 3 ++- fs/btrfs/volumes.c | 39 ++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 1 + 3 files changed, 41 insertions(+), 2 deletions(-)