diff mbox series

[RFC,3/7] btrfs: add read_policy latency

Message ID de0a28ed406c84c84d40d4bdad5f45250aabfdea.1603751876.git.anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show
Series [RFC,1/7] block: export part_stat_read_all | expand

Commit Message

Anand Jain Oct. 26, 2020, 11:55 p.m. UTC
The read policy type latency routes the read IO based on the historical
average wait time experienced by the read IOs through the individual
device factored by 1/10 of inflight commands in the queue. The factor
1/10 is because generally the block device queue depth is more than 1,
so there can be commands in the queue even before the previous commands
have been completed. This patch obtains the historical read IO stats from
the kernel block layer.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/sysfs.c   |  3 +-
 fs/btrfs/volumes.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.h |  1 +
 3 files changed, 76 insertions(+), 2 deletions(-)

Comments

Josef Bacik Oct. 27, 2020, 6:20 p.m. UTC | #1
On 10/26/20 7:55 PM, Anand Jain wrote:
> The read policy type latency routes the read IO based on the historical
> average wait time experienced by the read IOs through the individual
> device factored by 1/10 of inflight commands in the queue. The factor
> 1/10 is because generally the block device queue depth is more than 1,
> so there can be commands in the queue even before the previous commands
> have been completed. This patch obtains the historical read IO stats from
> the kernel block layer.
> 
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> ---
>   fs/btrfs/sysfs.c   |  3 +-
>   fs/btrfs/volumes.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-
>   fs/btrfs/volumes.h |  1 +
>   3 files changed, 76 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index d159f7c70bcd..6690abeeb889 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -874,7 +874,8 @@ static int btrfs_strmatch(const char *given, const char *golden)
>   	return -EINVAL;
>   }
>   
> -static const char * const btrfs_read_policy_name[] = { "pid" };
> +/* Must follow the order as in enum btrfs_read_policy */
> +static const char * const btrfs_read_policy_name[] = { "pid", "latency"};
>   
>   static ssize_t btrfs_read_policy_show(struct kobject *kobj,
>   				      struct kobj_attribute *a, char *buf)
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index da31b11ceb61..9bab6080cebf 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -14,6 +14,7 @@
>   #include <linux/semaphore.h>
>   #include <linux/uuid.h>
>   #include <linux/list_sort.h>
> +#include <linux/part_stat.h>
>   #include "misc.h"
>   #include "ctree.h"
>   #include "extent_map.h"
> @@ -5465,6 +5466,66 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>   	return ret;
>   }
>   
> +static u64 btrfs_estimate_read(struct btrfs_device *device,
> +			       unsigned long *inflight)
> +{
> +	u64 read_wait;
> +	u64 avg_wait = 0;
> +	unsigned long read_ios;
> +	struct disk_stats stat;
> +
> +	/* Commands in flight on this partition/device */
> +	*inflight = part_stat_read_inflight(bdev_get_queue(device->bdev),
> +					    device->bdev->bd_part);
> +	part_stat_read_all(device->bdev->bd_part, &stat);
> +
> +	read_wait = stat.nsecs[STAT_READ];
> +	read_ios = stat.ios[STAT_READ];
> +
> +	if (read_wait && read_ios && read_wait >= read_ios)
> +		avg_wait = div_u64(read_wait, read_ios);
> +	else
> +		btrfs_info_rl(device->fs_devices->fs_info,
> +			"devid: %llu avg_wait ZERO read_wait %llu read_ios %lu",
> +			      device->devid, read_wait, read_ios);
> +
> +	return avg_wait;
> +}
> +
> +static int btrfs_find_best_stripe(struct btrfs_fs_info *fs_info,
> +				  struct map_lookup *map, int first,
> +				  int num_stripe)
> +{
> +	int index;
> +	int best_stripe = 0;
> +	int est_wait = -EINVAL;
> +	int last = first + num_stripe;
> +	unsigned long inflight;
> +
> +	for (index = first; index < last; index++) {
> +		struct btrfs_device *device = map->stripes[index].dev;
> +
> +		if (!blk_queue_io_stat(bdev_get_queue(device->bdev)))
> +			return -ENOENT;
> +	}
> +
> +	for (index = first; index < last; index++) {
> +		struct btrfs_device *device = map->stripes[index].dev;
> +		u64 avg_wait;
> +		u64 final_wait;
> +
> +		avg_wait = btrfs_estimate_read(device, &inflight);
> +		final_wait = avg_wait + (avg_wait * (inflight / 10));

Inflight is going to lag because it's only going to account for bios that 
actually have been attached to requests here.  Since we're already on fuzzy 
ground, why not just skip the inflight and go with the average latencies.  If we 
heavily load one side, its latencies will creep up and then we'll favor the 
other side.  If we really want to aim for the lowest latency, we could add our 
own inflight counter to the stripe itself, and this would account for actual 
IOs that we have inflight currently, and would be much less fuzzy than relying 
on the block inflight counters.  Thanks,

Josef
diff mbox series

Patch

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index d159f7c70bcd..6690abeeb889 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -874,7 +874,8 @@  static int btrfs_strmatch(const char *given, const char *golden)
 	return -EINVAL;
 }
 
-static const char * const btrfs_read_policy_name[] = { "pid" };
+/* Must follow the order as in enum btrfs_read_policy */
+static const char * const btrfs_read_policy_name[] = { "pid", "latency"};
 
 static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 				      struct kobj_attribute *a, char *buf)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index da31b11ceb61..9bab6080cebf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -14,6 +14,7 @@ 
 #include <linux/semaphore.h>
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
+#include <linux/part_stat.h>
 #include "misc.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -5465,6 +5466,66 @@  int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+static u64 btrfs_estimate_read(struct btrfs_device *device,
+			       unsigned long *inflight)
+{
+	u64 read_wait;
+	u64 avg_wait = 0;
+	unsigned long read_ios;
+	struct disk_stats stat;
+
+	/* Commands in flight on this partition/device */
+	*inflight = part_stat_read_inflight(bdev_get_queue(device->bdev),
+					    device->bdev->bd_part);
+	part_stat_read_all(device->bdev->bd_part, &stat);
+
+	read_wait = stat.nsecs[STAT_READ];
+	read_ios = stat.ios[STAT_READ];
+
+	if (read_wait && read_ios && read_wait >= read_ios)
+		avg_wait = div_u64(read_wait, read_ios);
+	else
+		btrfs_info_rl(device->fs_devices->fs_info,
+			"devid: %llu avg_wait ZERO read_wait %llu read_ios %lu",
+			      device->devid, read_wait, read_ios);
+
+	return avg_wait;
+}
+
+static int btrfs_find_best_stripe(struct btrfs_fs_info *fs_info,
+				  struct map_lookup *map, int first,
+				  int num_stripe)
+{
+	int index;
+	int best_stripe = 0;
+	int est_wait = -EINVAL;
+	int last = first + num_stripe;
+	unsigned long inflight;
+
+	for (index = first; index < last; index++) {
+		struct btrfs_device *device = map->stripes[index].dev;
+
+		if (!blk_queue_io_stat(bdev_get_queue(device->bdev)))
+			return -ENOENT;
+	}
+
+	for (index = first; index < last; index++) {
+		struct btrfs_device *device = map->stripes[index].dev;
+		u64 avg_wait;
+		u64 final_wait;
+
+		avg_wait = btrfs_estimate_read(device, &inflight);
+		final_wait = avg_wait + (avg_wait * (inflight / 10));
+
+		if (est_wait == 0 || est_wait > final_wait) {
+			est_wait = final_wait;
+			best_stripe = index;
+		}
+	}
+
+	return best_stripe;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first,
 			    int dev_replace_is_ongoing)
@@ -5491,6 +5552,18 @@  static int find_live_mirror(struct btrfs_fs_info *fs_info,
 		btrfs_warn_rl(fs_info,
 			      "unknown read_policy type %u, fallback to pid",
 			      fs_info->fs_devices->read_policy);
+		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+		preferred_mirror = first + current->pid % num_stripes;
+		break;
+	case BTRFS_READ_POLICY_LATENCY:
+		preferred_mirror = btrfs_find_best_stripe(fs_info, map, first,
+							  num_stripes);
+		if (preferred_mirror >= 0)
+			break;
+
+		btrfs_warn(fs_info,
+   "iostat is disabled, cannot set latency read_policy, fallback to pid");
+		fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
 		fallthrough;
 	case BTRFS_READ_POLICY_PID:
 		preferred_mirror = first + current->pid % num_stripes;
@@ -6111,7 +6184,6 @@  static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		u32 factor = map->num_stripes / map->sub_stripes;
-
 		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
 		stripe_index *= map->sub_stripes;
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index e3c36951742d..8705d755d148 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -217,6 +217,7 @@  enum btrfs_chunk_allocation_policy {
  */
 enum btrfs_read_policy {
 	BTRFS_READ_POLICY_PID,
+	BTRFS_READ_POLICY_LATENCY,
 	BTRFS_NR_READ_POLICY,
 };