Message ID | 90934f391bc1c9772f9e3a7902cf9d04f3b0d14a.1734370092.git.anand.jain@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | raid1 balancing methods | expand |
On Tue, Dec 17, 2024 at 02:13:13AM +0800, Anand Jain wrote: > This feature balances I/O across the striped devices when reading from > RAID1 blocks. > > echo round-robin[:min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy > > The min_contiguous_read parameter defines the minimum read size before > switching to the next mirrored device. This setting is optional, with a > default value of 256 KiB. > > Signed-off-by: Anand Jain <anand.jain@oracle.com> > --- > fs/btrfs/sysfs.c | 44 +++++++++++++++++++++++++++- > fs/btrfs/volumes.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ > fs/btrfs/volumes.h | 11 +++++++ > 3 files changed, 127 insertions(+), 1 deletion(-) > > diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c > index 9c7bedf974d2..b0e1fb787ce6 100644 > --- a/fs/btrfs/sysfs.c > +++ b/fs/btrfs/sysfs.c > @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, > } > BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); > > -static const char * const btrfs_read_policy_name[] = { "pid" }; > +static const char *btrfs_read_policy_name[] = { > + "pid", > +#ifdef CONFIG_BTRFS_EXPERIMENTAL > + "round-robin", > +#endif > +}; > > static int btrfs_read_policy_to_enum(const char *str, s64 *value) > { > @@ -1359,6 +1364,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, > > ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); > > +#ifdef CONFIG_BTRFS_EXPERIMENTAL > + if (i == BTRFS_READ_POLICY_RR) > + ret += sysfs_emit_at(buf, ret, ":%d", > + fs_devices->rr_min_contiguous_read); I guess we want READ_ONCE() here as well. 
> +#endif > + > if (i == policy) > ret += sysfs_emit_at(buf, ret, "]"); > } > @@ -1380,6 +1391,37 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, > if (index == -EINVAL) > return -EINVAL; > > +#ifdef CONFIG_BTRFS_EXPERIMENTAL > + if (index == BTRFS_READ_POLICY_RR) { > + if (value != -1) { > + u32 sectorsize = fs_devices->fs_info->sectorsize; > + > + if (!IS_ALIGNED(value, sectorsize)) { > + u64 temp_value = round_up(value, sectorsize); > + > + btrfs_warn(fs_devices->fs_info, > +"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu", > + value, sectorsize, temp_value); > + value = temp_value; > + } > + } else { > + value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; > + } > + > + if (index != READ_ONCE(fs_devices->read_policy) || > + value != READ_ONCE(fs_devices->rr_min_contiguous_read)) { > + WRITE_ONCE(fs_devices->read_policy, index); > + WRITE_ONCE(fs_devices->rr_min_contiguous_read, value); > + atomic_set(&fs_devices->total_reads, 0); > + > + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", > + btrfs_read_policy_name[index], value); > + > + } > + > + return len; > + } > +#endif > if (index != READ_ONCE(fs_devices->read_policy)) { > WRITE_ONCE(fs_devices->read_policy, index); > btrfs_info(fs_devices->fs_info, "read policy set to '%s'", > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index fe5ceea2ba0b..77c3b66d56a0 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -1328,6 +1328,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, > fs_devices->total_rw_bytes = 0; > fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; > fs_devices->read_policy = BTRFS_READ_POLICY_PID; > +#ifdef CONFIG_BTRFS_EXPERIMENTAL > + fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; > +#endif > > return 0; > } > @@ -5959,6 +5962,71 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, > return len; > } > > +#ifdef 
CONFIG_BTRFS_EXPERIMENTAL > +struct stripe_mirror { > + u64 devid; > + int num; > +}; > + > +static int btrfs_cmp_devid(const void *a, const void *b) > +{ > + const struct stripe_mirror *s1 = (struct stripe_mirror *)a; > + const struct stripe_mirror *s2 = (struct stripe_mirror *)b; > + > + if (s1->devid < s2->devid) > + return -1; > + if (s1->devid > s2->devid) > + return 1; > + return 0; > +} > + > +/* > + * btrfs_read_rr. > + * > + * Select a stripe for reading using a round-robin algorithm: > + * > + * 1. Compute the read cycle as the total sectors read divided by the minimum > + * sectors per device. > + * 2. Determine the stripe number for the current read by taking the modulus > + * of the read cycle with the total number of stripes: > + * > + * stripe index = (total sectors / min sectors per dev) % num stripes > + * > + * The calculated stripe index is then used to select the corresponding device > + * from the list of devices, which is ordered by devid. > + */ > +static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe) > +{ > + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0}; > + struct btrfs_fs_devices *fs_devices; > + struct btrfs_device *device; > + int read_cycle; > + int index; > + int ret_stripe; > + int total_reads; > + int reads_per_dev = 0; > + > + device = map->stripes[first].dev; > + > + fs_devices = device->fs_devices; > + reads_per_dev = fs_devices->rr_min_contiguous_read >> SECTOR_SHIFT; This wants READ_ONCE() as well. Also, is it OK to divide it by (1 << SECTOR_SHIFT), which is not necessarily equal to fs_info->sectorsize? 
> + index = 0; > + for (int i = first; i < first + num_stripe; i++) { > + stripes[index].devid = map->stripes[i].dev->devid; > + stripes[index].num = i; > + index++; > + } > + sort(stripes, num_stripe, sizeof(struct stripe_mirror), > + btrfs_cmp_devid, NULL); > + > + total_reads = atomic_inc_return(&fs_devices->total_reads); > + read_cycle = total_reads / reads_per_dev; > + ret_stripe = stripes[read_cycle % num_stripe].num; I'm not sure about the logic here. Since the code increments the total_reads counter by 1 on each call, can we assume this function is invoked once per fs_info->sectorsize of data read? > + > + return ret_stripe; > +} > +#endif > + > static int find_live_mirror(struct btrfs_fs_info *fs_info, > struct btrfs_chunk_map *map, int first, > int dev_replace_is_ongoing) > @@ -5988,6 +6056,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, > case BTRFS_READ_POLICY_PID: > preferred_mirror = first + (current->pid % num_stripes); > break; > +#ifdef CONFIG_BTRFS_EXPERIMENTAL > + case BTRFS_READ_POLICY_RR: > + preferred_mirror = btrfs_read_rr(map, first, num_stripes); > + break; > +#endif > } > > if (dev_replace_is_ongoing && > diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h > index 3a416b1bc24c..b7b130ce0b10 100644 > --- a/fs/btrfs/volumes.h > +++ b/fs/btrfs/volumes.h > @@ -296,6 +296,8 @@ enum btrfs_chunk_allocation_policy { > BTRFS_CHUNK_ALLOC_ZONED, > }; > > +#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ (SZ_256K) > +#define BTRFS_RAID1_MAX_MIRRORS (4) > /* > * Read policies for mirrored block group profiles, read picks the stripe based > * on these policies. 
> @@ -303,6 +305,10 @@ enum btrfs_chunk_allocation_policy { > enum btrfs_read_policy { > /* Use process PID to choose the stripe */ > BTRFS_READ_POLICY_PID, > +#ifdef CONFIG_BTRFS_EXPERIMENTAL > + /* Balancing raid1 reads across all striped devices (round-robin) */ > + BTRFS_READ_POLICY_RR, > +#endif > BTRFS_NR_READ_POLICY, > }; > > @@ -431,6 +437,11 @@ struct btrfs_fs_devices { > enum btrfs_read_policy read_policy; > > #ifdef CONFIG_BTRFS_EXPERIMENTAL > + /* IO stat, read counter. */ > + atomic_t total_reads; > + /* Min contiguous reads before switching to next device. */ > + int rr_min_contiguous_read; > + > /* Checksum mode - offload it or do it synchronously. */ > enum btrfs_offload_csum_mode offload_csum_mode; > #endif > -- > 2.47.0 >
On 18/12/24 11:23, Naohiro Aota wrote: > On Tue, Dec 17, 2024 at 02:13:13AM +0800, Anand Jain wrote: >> This feature balances I/O across the striped devices when reading from >> RAID1 blocks. >> >> echo round-robin[:min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy >> >> The min_contiguous_read parameter defines the minimum read size before >> switching to the next mirrored device. This setting is optional, with a >> default value of 256 KiB. >> >> Signed-off-by: Anand Jain <anand.jain@oracle.com> >> --- >> fs/btrfs/sysfs.c | 44 +++++++++++++++++++++++++++- >> fs/btrfs/volumes.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ >> fs/btrfs/volumes.h | 11 +++++++ >> 3 files changed, 127 insertions(+), 1 deletion(-) >> >> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c >> index 9c7bedf974d2..b0e1fb787ce6 100644 >> --- a/fs/btrfs/sysfs.c >> +++ b/fs/btrfs/sysfs.c >> @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, >> } >> BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); >> >> -static const char * const btrfs_read_policy_name[] = { "pid" }; >> +static const char *btrfs_read_policy_name[] = { >> + "pid", >> +#ifdef CONFIG_BTRFS_EXPERIMENTAL >> + "round-robin", >> +#endif >> +}; >> >> static int btrfs_read_policy_to_enum(const char *str, s64 *value) >> { >> @@ -1359,6 +1364,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, >> >> ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); >> >> +#ifdef CONFIG_BTRFS_EXPERIMENTAL >> + if (i == BTRFS_READ_POLICY_RR) >> + ret += sysfs_emit_at(buf, ret, ":%d", >> + fs_devices->rr_min_contiguous_read); > > I guess we want READ_ONCE() here as well. 
> >> +#endif >> + >> if (i == policy) >> ret += sysfs_emit_at(buf, ret, "]"); >> } >> @@ -1380,6 +1391,37 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, >> if (index == -EINVAL) >> return -EINVAL; >> >> +#ifdef CONFIG_BTRFS_EXPERIMENTAL >> + if (index == BTRFS_READ_POLICY_RR) { >> + if (value != -1) { >> + u32 sectorsize = fs_devices->fs_info->sectorsize; >> + >> + if (!IS_ALIGNED(value, sectorsize)) { >> + u64 temp_value = round_up(value, sectorsize); >> + >> + btrfs_warn(fs_devices->fs_info, >> +"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu", >> + value, sectorsize, temp_value); >> + value = temp_value; >> + } >> + } else { >> + value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; >> + } >> + >> + if (index != READ_ONCE(fs_devices->read_policy) || >> + value != READ_ONCE(fs_devices->rr_min_contiguous_read)) { >> + WRITE_ONCE(fs_devices->read_policy, index); >> + WRITE_ONCE(fs_devices->rr_min_contiguous_read, value); >> + atomic_set(&fs_devices->total_reads, 0); >> + >> + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", >> + btrfs_read_policy_name[index], value); >> + >> + } >> + >> + return len; >> + } >> +#endif >> if (index != READ_ONCE(fs_devices->read_policy)) { >> WRITE_ONCE(fs_devices->read_policy, index); >> btrfs_info(fs_devices->fs_info, "read policy set to '%s'", >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index fe5ceea2ba0b..77c3b66d56a0 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -1328,6 +1328,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, >> fs_devices->total_rw_bytes = 0; >> fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; >> fs_devices->read_policy = BTRFS_READ_POLICY_PID; >> +#ifdef CONFIG_BTRFS_EXPERIMENTAL >> + fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; >> +#endif >> >> return 0; >> } >> @@ -5959,6 +5962,71 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info 
*fs_info, >> return len; >> } >> >> +#ifdef CONFIG_BTRFS_EXPERIMENTAL >> +struct stripe_mirror { >> + u64 devid; >> + int num; >> +}; >> + >> +static int btrfs_cmp_devid(const void *a, const void *b) >> +{ >> + const struct stripe_mirror *s1 = (struct stripe_mirror *)a; >> + const struct stripe_mirror *s2 = (struct stripe_mirror *)b; >> + >> + if (s1->devid < s2->devid) >> + return -1; >> + if (s1->devid > s2->devid) >> + return 1; >> + return 0; >> +} >> + >> +/* >> + * btrfs_read_rr. >> + * >> + * Select a stripe for reading using a round-robin algorithm: >> + * >> + * 1. Compute the read cycle as the total sectors read divided by the minimum >> + * sectors per device. >> + * 2. Determine the stripe number for the current read by taking the modulus >> + * of the read cycle with the total number of stripes: >> + * >> + * stripe index = (total sectors / min sectors per dev) % num stripes >> + * >> + * The calculated stripe index is then used to select the corresponding device >> + * from the list of devices, which is ordered by devid. >> + */ >> +static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe) >> +{ >> + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0}; >> + struct btrfs_fs_devices *fs_devices; >> + struct btrfs_device *device; >> + int read_cycle; >> + int index; >> + int ret_stripe; >> + int total_reads; >> + int reads_per_dev = 0; >> + >> + device = map->stripes[first].dev; >> + >> + fs_devices = device->fs_devices; >> + reads_per_dev = fs_devices->rr_min_contiguous_read >> SECTOR_SHIFT; > > Want READ_ONCE() as well. Also, is it OK to divide it with (1 << > SECTOR_SHIFT), which is not necessary equal to fs_info->sectorsize? 
> >> + index = 0; >> + for (int i = first; i < first + num_stripe; i++) { >> + stripes[index].devid = map->stripes[i].dev->devid; >> + stripes[index].num = i; >> + index++; >> + } >> + sort(stripes, num_stripe, sizeof(struct stripe_mirror), >> + btrfs_cmp_devid, NULL); >> + >> + total_reads = atomic_inc_return(&fs_devices->total_reads); >> + read_cycle = total_reads / reads_per_dev; >> + ret_stripe = stripes[read_cycle % num_stripe].num; > > I'm not sure the logic here. Since the code increments the total_reads > counter by 1, can we assume this function is invoked per > fs_info->sectorsize? > You're right. To fix this, we need to track read I/O stat in `struct device` on our own. I avoided this earlier as the block layer already provides I/O stats (though they might be stale). Unless there is a better way. I'm trying. Thanks for your review. -Anand >> + >> + return ret_stripe; >> +} >> +#endif >> + >> static int find_live_mirror(struct btrfs_fs_info *fs_info, >> struct btrfs_chunk_map *map, int first, >> int dev_replace_is_ongoing) >> @@ -5988,6 +6056,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, >> case BTRFS_READ_POLICY_PID: >> preferred_mirror = first + (current->pid % num_stripes); >> break; >> +#ifdef CONFIG_BTRFS_EXPERIMENTAL >> + case BTRFS_READ_POLICY_RR: >> + preferred_mirror = btrfs_read_rr(map, first, num_stripes); >> + break; >> +#endif >> } >> >> if (dev_replace_is_ongoing && >> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h >> index 3a416b1bc24c..b7b130ce0b10 100644 >> --- a/fs/btrfs/volumes.h >> +++ b/fs/btrfs/volumes.h >> @@ -296,6 +296,8 @@ enum btrfs_chunk_allocation_policy { >> BTRFS_CHUNK_ALLOC_ZONED, >> }; >> >> +#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ (SZ_256K) >> +#define BTRFS_RAID1_MAX_MIRRORS (4) >> /* >> * Read policies for mirrored block group profiles, read picks the stripe based >> * on these policies. 
>> @@ -303,6 +305,10 @@ enum btrfs_chunk_allocation_policy { >> enum btrfs_read_policy { >> /* Use process PID to choose the stripe */ >> BTRFS_READ_POLICY_PID, >> +#ifdef CONFIG_BTRFS_EXPERIMENTAL >> + /* Balancing raid1 reads across all striped devices (round-robin) */ >> + BTRFS_READ_POLICY_RR, >> +#endif >> BTRFS_NR_READ_POLICY, >> }; >> >> @@ -431,6 +437,11 @@ struct btrfs_fs_devices { >> enum btrfs_read_policy read_policy; >> >> #ifdef CONFIG_BTRFS_EXPERIMENTAL >> + /* IO stat, read counter. */ >> + atomic_t total_reads; >> + /* Min contiguous reads before switching to next device. */ >> + int rr_min_contiguous_read; >> + >> /* Checksum mode - offload it or do it synchronously. */ >> enum btrfs_offload_csum_mode offload_csum_mode; >> #endif >> -- >> 2.47.0
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9c7bedf974d2..b0e1fb787ce6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", +#endif +}; static int btrfs_read_policy_to_enum(const char *str, s64 *value) { @@ -1359,6 +1364,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%d", + fs_devices->rr_min_contiguous_read); +#endif + if (i == policy) ret += sysfs_emit_at(buf, ret, "]"); } @@ -1380,6 +1391,37 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, if (index == -EINVAL) return -EINVAL; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_warn(fs_devices->fs_info, +"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; + } + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contiguous_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contiguous_read, value); + atomic_set(&fs_devices->total_reads, 0); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); + + } + + return len; + } +#endif if (index != READ_ONCE(fs_devices->read_policy)) { WRITE_ONCE(fs_devices->read_policy, index); 
btrfs_info(fs_devices->fs_info, "read policy set to '%s'", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fe5ceea2ba0b..77c3b66d56a0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1328,6 +1328,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ; +#endif return 0; } @@ -5959,6 +5962,71 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * btrfs_read_rr. + * + * Select a stripe for reading using a round-robin algorithm: + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. 
+ */ +static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0}; + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + int read_cycle; + int index; + int ret_stripe; + int total_reads; + int reads_per_dev = 0; + + device = map->stripes[first].dev; + + fs_devices = device->fs_devices; + reads_per_dev = fs_devices->rr_min_contiguous_read >> SECTOR_SHIFT; + index = 0; + for (int i = first; i < first + num_stripe; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; + } + sort(stripes, num_stripe, sizeof(struct stripe_mirror), + btrfs_cmp_devid, NULL); + + total_reads = atomic_inc_return(&fs_devices->total_reads); + read_cycle = total_reads / reads_per_dev; + ret_stripe = stripes[read_cycle % num_stripe].num; + + return ret_stripe; +} +#endif + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5988,6 +6056,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3a416b1bc24c..b7b130ce0b10 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -296,6 +296,8 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ (SZ_256K) +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. 
@@ -303,6 +305,10 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing raid1 reads across all striped devices (round-robin) */ + BTRFS_READ_POLICY_RR, +#endif BTRFS_NR_READ_POLICY, }; @@ -431,6 +437,11 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* IO stat, read counter. */ + atomic_t total_reads; + /* Min contiguous reads before switching to next device. */ + int rr_min_contiguous_read; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif
This feature balances I/O across the striped devices when reading from RAID1 blocks. echo round-robin[:min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy The min_contiguous_read parameter defines the minimum read size before switching to the next mirrored device. This setting is optional, with a default value of 256 KiB. Signed-off-by: Anand Jain <anand.jain@oracle.com> --- fs/btrfs/sysfs.c | 44 +++++++++++++++++++++++++++- fs/btrfs/volumes.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 11 +++++++ 3 files changed, 127 insertions(+), 1 deletion(-)