diff mbox series

[v3,05/10] btrfs: introduce RAID1 round-robin read balancing

Message ID 995d4a9dd9f553825805efdac24dec4a9de20ef3.1731076425.git.anand.jain@oracle.com (mailing list archive)
State New
Headers show
Series raid1 balancing methods | expand

Commit Message

Anand Jain Nov. 15, 2024, 2:54 p.m. UTC
This feature balances I/O across the striped devices when reading from
RAID1 blocks.

   echo round-robin:[min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy

The min_contiguous_read parameter defines the minimum read size before
switching to the next mirrored device. This setting is optional, with a
default value of 256 KiB.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/sysfs.c   | 38 +++++++++++++++++++++++++++++
 fs/btrfs/volumes.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  9 +++++++
 3 files changed, 107 insertions(+)
diff mbox series

Patch

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 7907507b8ced..092a78298d1a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1305,7 +1305,11 @@  static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
 }
 BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static const char * const btrfs_read_policy_name[] = { "pid", "round-robin" };
+#else
 static const char * const btrfs_read_policy_name[] = { "pid" };
+#endif
 
 static enum btrfs_read_policy btrfs_read_policy_to_enum(const char *str, s64 *value)
 {
@@ -1367,6 +1371,12 @@  static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 
 		ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+		if (i == BTRFS_READ_POLICY_RR)
+			ret += sysfs_emit_at(buf, ret, ":%d",
+					     fs_devices->min_contiguous_read);
+#endif
+
 		if (i == policy)
 			ret += sysfs_emit_at(buf, ret, "]");
 	}
@@ -1388,6 +1398,34 @@  static ssize_t btrfs_read_policy_store(struct kobject *kobj,
 	if (index == -EINVAL)
 		return -EINVAL;
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	if (index == BTRFS_READ_POLICY_RR) {
+		if (value != -1) {
+			if ((value % fs_devices->fs_info->sectorsize) != 0) {
+				btrfs_err(fs_devices->fs_info,
+"read_policy: min_contiguous_read %lld should be multiples of the sectorsize %u",
+					  value, fs_devices->fs_info->sectorsize);
+				return -EINVAL;
+			}
+		} else {
+			/* value is not provided, set it to the default 256k */
+			value = 256 * 1024;
+		}
+
+		if (index != READ_ONCE(fs_devices->read_policy) ||
+		    value != READ_ONCE(fs_devices->min_contiguous_read)) {
+			WRITE_ONCE(fs_devices->read_policy, index);
+			WRITE_ONCE(fs_devices->min_contiguous_read, value);
+			atomic_set(&fs_devices->total_reads, 0);
+
+			btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
+				   btrfs_read_policy_name[index], value);
+
+		}
+
+		return len;
+	}
+#endif
 	if (index != READ_ONCE(fs_devices->read_policy)) {
 		WRITE_ONCE(fs_devices->read_policy, index);
 		btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fe5ceea2ba0b..97576a715191 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1328,6 +1328,10 @@  static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
 	fs_devices->total_rw_bytes = 0;
 	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
 	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* Set min_contiguous_read to a default 256kib */
+	fs_devices->min_contiguous_read = 256 * 1024;
+#endif
 
 	return 0;
 }
@@ -5959,6 +5963,57 @@  unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 	return len;
 }
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+struct stripe_mirror {
+	u64 devid;
+	int num;
+};
+
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+	struct stripe_mirror *s1 = (struct stripe_mirror *)a;
+	struct stripe_mirror *s2 = (struct stripe_mirror *)b;
+
+	if (s1->devid < s2->devid)
+		return -1;
+	if (s1->devid > s2->devid)
+		return 1;
+	return 0;
+}
+
+static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
+{
+	struct stripe_mirror stripes[4] = {0}; //4: max possible mirrors
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_device *device;
+	int j;
+	int read_cycle;
+	int index;
+	int ret_stripe;
+	int total_reads;
+	int reads_per_dev = 0;
+
+	device = map->stripes[first].dev;
+
+	fs_devices = device->fs_devices;
+	reads_per_dev = fs_devices->min_contiguous_read/fs_devices->fs_info->sectorsize;
+	index = 0;
+	for (j = first; j < first + num_stripe; j++) {
+		stripes[index].devid = map->stripes[j].dev->devid;
+		stripes[index].num = j;
+		index++;
+	}
+	sort(stripes, num_stripe, sizeof(struct stripe_mirror),
+	     btrfs_cmp_devid, NULL);
+
+	total_reads = atomic_inc_return(&fs_devices->total_reads);
+	read_cycle = total_reads/reads_per_dev;
+	ret_stripe = stripes[read_cycle % num_stripe].num;
+
+	return ret_stripe;
+}
+#endif
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct btrfs_chunk_map *map, int first,
 			    int dev_replace_is_ongoing)
@@ -5988,6 +6043,11 @@  static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	case BTRFS_READ_POLICY_PID:
 		preferred_mirror = first + (current->pid % num_stripes);
 		break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	case BTRFS_READ_POLICY_RR:
+		preferred_mirror = btrfs_read_rr(map, first, num_stripes);
+		break;
+#endif
 	}
 
 	if (dev_replace_is_ongoing &&
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3a416b1bc24c..05778361c270 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -303,6 +303,10 @@  enum btrfs_chunk_allocation_policy {
 enum btrfs_read_policy {
 	/* Use process PID to choose the stripe */
 	BTRFS_READ_POLICY_PID,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* Balancing raid1 reads across all striped devices (round-robin) */
+	BTRFS_READ_POLICY_RR,
+#endif
 	BTRFS_NR_READ_POLICY,
 };
 
@@ -431,6 +435,11 @@  struct btrfs_fs_devices {
 	enum btrfs_read_policy read_policy;
 
 #ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* IO stat, read counter. */
+	atomic_t total_reads;
+	/* Min contiguous reads before switching to next device. */
+	int min_contiguous_read;
+
 	/* Checksum mode - offload it or do it synchronously. */
 	enum btrfs_offload_csum_mode offload_csum_mode;
 #endif