@@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
}
BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
-static const char * const btrfs_read_policy_name[] = { "pid" };
+/*
+ * Read policy names as accepted and printed by the read_policy sysfs
+ * attribute, indexed by enum btrfs_read_policy.
+ *
+ * Both the strings and the table itself are read-only: keep the second
+ * const so the array cannot be modified at run time.
+ */
+static const char * const btrfs_read_policy_name[] = {
+	"pid",
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	"round-robin",
+#endif
+};
static int btrfs_read_policy_to_enum(const char *str, s64 *value)
{
@@ -1347,6 +1352,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ if (i == BTRFS_READ_POLICY_RR)
+ ret += sysfs_emit_at(buf, ret, ":%d",
+ READ_ONCE(fs_devices->rr_min_contiguous_read));
+#endif
+
if (i == policy)
ret += sysfs_emit_at(buf, ret, "]");
}
@@ -1368,6 +1379,42 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
if (index < 0)
return -EINVAL;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* If moving out of RR then disable fs_stats */
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR &&
+ index != BTRFS_READ_POLICY_RR)
+ fs_devices->fs_stats = false;
+
+ if (index == BTRFS_READ_POLICY_RR) {
+ if (value != -1) {
+ u32 sectorsize = fs_devices->fs_info->sectorsize;
+
+ if (!IS_ALIGNED(value, sectorsize)) {
+ u64 temp_value = round_up(value, sectorsize);
+
+ btrfs_warn(fs_devices->fs_info,
+"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu",
+ value, sectorsize, temp_value);
+ value = temp_value;
+ }
+ } else {
+ value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
+ }
+
+ if (index != READ_ONCE(fs_devices->read_policy) ||
+ value != READ_ONCE(fs_devices->rr_min_contiguous_read)) {
+ WRITE_ONCE(fs_devices->read_policy, index);
+ WRITE_ONCE(fs_devices->rr_min_contiguous_read, value);
+
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
+ btrfs_read_policy_name[index], value);
+ }
+
+ fs_devices->fs_stats = true;
+
+ return len;
+ }
+#endif
if (index != READ_ONCE(fs_devices->read_policy)) {
WRITE_ONCE(fs_devices->read_policy, index);
btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
@@ -1334,6 +1334,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
+#endif
return 0;
}
@@ -5965,6 +5968,70 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+/*
+ * Scratch entry used by btrfs_read_rr() to order the candidate stripes of a
+ * chunk map by the devid of their device.
+ */
+struct stripe_mirror {
+	/* devid of the device behind the stripe; sort key in btrfs_cmp_devid() */
+ u64 devid;
+	/* Index of the stripe in map->stripes[] */
+ int num;
+};
+
+/*
+ * sort() comparison callback: order two struct stripe_mirror entries by
+ * ascending devid.
+ *
+ * Returns -1, 0 or 1 when @a's devid is lower than, equal to or higher than
+ * @b's devid.
+ */
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+	const struct stripe_mirror *lhs = a;
+	const struct stripe_mirror *rhs = b;
+
+	if (lhs->devid == rhs->devid)
+		return 0;
+
+	return (lhs->devid < rhs->devid) ? -1 : 1;
+}
+
+/*
+ * btrfs_read_rr - select a stripe for reading using a round-robin algorithm.
+ *
+ * @map:        chunk map holding the candidate stripes
+ * @first:      index in @map->stripes of the first candidate stripe
+ * @num_stripe: number of candidate stripes, starting at @first
+ *
+ * 1. Compute the read cycle as the total sectors read divided by the minimum
+ *    sectors per device.
+ * 2. Determine the stripe number for the current read by taking the modulus
+ *    of the read cycle with the total number of stripes:
+ *
+ *      stripe index = (total sectors / min sectors per dev) % num stripes
+ *
+ * The calculated stripe index is then used to select the corresponding device
+ * from the list of candidate stripes, which is ordered by devid.
+ *
+ * All arithmetic is done on unsigned values: the read counter is a 64-bit sum
+ * that grows without bound, and a signed, truncated copy of it would
+ * eventually turn negative and produce a negative index into the on-stack
+ * table.
+ *
+ * Return: index in @map->stripes of the stripe to read from.
+ */
+static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
+{
+	struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0};
+	struct btrfs_device *device = map->stripes[first].dev;
+	struct btrfs_fs_devices *fs_devices = device->fs_devices;
+	unsigned int read_cycle;
+	unsigned int total_reads;
+	unsigned int min_reads_per_dev;
+	int index = 0;
+
+	/*
+	 * The table below lives on the stack and is used as a modulus; fall
+	 * back to the first stripe rather than overrunning it or dividing by
+	 * zero if the caller hands us an unexpected stripe count.
+	 */
+	if (num_stripe <= 0 || num_stripe > BTRFS_RAID1_MAX_MIRRORS)
+		return first;
+
+	/* Truncation of the s64 sum to unsigned int wraps, which is harmless. */
+	total_reads = percpu_counter_sum(&fs_devices->read_cnt_blocks);
+	min_reads_per_dev =
+		(unsigned int)READ_ONCE(fs_devices->rr_min_contiguous_read) >>
+		fs_devices->fs_info->sectorsize_bits;
+	/*
+	 * A configured value smaller than one sector (including 0) would make
+	 * the divisor zero; switch device after every sector in that case.
+	 */
+	if (min_reads_per_dev == 0)
+		min_reads_per_dev = 1;
+
+	for (int i = first; i < first + num_stripe; i++) {
+		stripes[index].devid = map->stripes[i].dev->devid;
+		stripes[index].num = i;
+		index++;
+	}
+	/* Order by devid so the rotation does not depend on stripe order. */
+	sort(stripes, num_stripe, sizeof(struct stripe_mirror),
+	     btrfs_cmp_devid, NULL);
+
+	read_cycle = total_reads / min_reads_per_dev;
+
+	return stripes[read_cycle % (unsigned int)num_stripe].num;
+}
+#endif
+
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
@@ -5994,6 +6061,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ case BTRFS_READ_POLICY_RR:
+ preferred_mirror = btrfs_read_rr(map, first, num_stripes);
+ break;
+#endif
}
if (dev_replace_is_ongoing &&
@@ -296,6 +296,9 @@ enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_ZONED,
};
+/* Default minimum contiguous read before switching device: 192KiB, in bytes. */
+#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ	(192 * 1024)
+/* Size of the on-stack stripe table used by the round-robin read policy. */
+#define BTRFS_RAID1_MAX_MIRRORS			(4)
/*
* Read policies for mirrored block group profiles, read picks the stripe based
* on these policies.
@@ -303,6 +306,10 @@ enum btrfs_chunk_allocation_policy {
enum btrfs_read_policy {
/* Use process PID to choose the stripe */
BTRFS_READ_POLICY_PID,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* Balance reads across all mirror stripes in round-robin order, by devid */
+ BTRFS_READ_POLICY_RR,
+#endif
BTRFS_NR_READ_POLICY,
};
@@ -436,6 +443,9 @@ struct btrfs_fs_devices {
enum btrfs_read_policy read_policy;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* Min contiguous reads before switching to next device. */
+ int rr_min_contiguous_read;
+
/* Checksum mode - offload it or do it synchronously. */
enum btrfs_offload_csum_mode offload_csum_mode;
#endif
This feature balances read I/O across the mirrored devices when reading from RAID1 block groups. To enable it: echo round-robin[:min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy The min_contiguous_read parameter defines the minimum read size, in bytes, before switching to the next mirrored device; a value that is not a multiple of the sector size is rounded up to one. This setting is optional, with a default value of 192KiB. Signed-off-by: Anand Jain <anand.jain@oracle.com> --- fs/btrfs/sysfs.c | 49 ++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 10 +++++++ 3 files changed, 130 insertions(+), 1 deletion(-)