diff mbox series

[v4,5/9] btrfs: introduce RAID1 round-robin read balancing

Message ID 90934f391bc1c9772f9e3a7902cf9d04f3b0d14a.1734370092.git.anand.jain@oracle.com (mailing list archive)
State New
Headers show
Series raid1 balancing methods | expand

Commit Message

Anand Jain Dec. 16, 2024, 6:13 p.m. UTC
This feature balances I/O across the striped devices when reading from
RAID1 blocks.

   echo round-robin[:min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy

The min_contiguous_read parameter defines the minimum read size before
switching to the next mirrored device. This setting is optional, with a
default value of 256 KiB.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/sysfs.c   | 44 +++++++++++++++++++++++++++-
 fs/btrfs/volumes.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h | 11 +++++++
 3 files changed, 127 insertions(+), 1 deletion(-)

Comments

Naohiro Aota Dec. 18, 2024, 5:53 a.m. UTC | #1
On Tue, Dec 17, 2024 at 02:13:13AM +0800, Anand Jain wrote:
> This feature balances I/O across the striped devices when reading from
> RAID1 blocks.
> 
>    echo round-robin[:min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy
> 
> The min_contiguous_read parameter defines the minimum read size before
> switching to the next mirrored device. This setting is optional, with a
> default value of 256 KiB.
> 
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> ---
>  fs/btrfs/sysfs.c   | 44 +++++++++++++++++++++++++++-
>  fs/btrfs/volumes.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/volumes.h | 11 +++++++
>  3 files changed, 127 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index 9c7bedf974d2..b0e1fb787ce6 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
>  }
>  BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
>  
> -static const char * const btrfs_read_policy_name[] = { "pid" };
> +static const char *btrfs_read_policy_name[] = {
> +	"pid",
> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
> +	"round-robin",
> +#endif
> +};
>  
>  static int btrfs_read_policy_to_enum(const char *str, s64 *value)
>  {
> @@ -1359,6 +1364,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
>  
>  		ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
>  
> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
> +		if (i == BTRFS_READ_POLICY_RR)
> +			ret += sysfs_emit_at(buf, ret, ":%d",
> +					     fs_devices->rr_min_contiguous_read);

I guess we want READ_ONCE() here as well.

> +#endif
> +
>  		if (i == policy)
>  			ret += sysfs_emit_at(buf, ret, "]");
>  	}
> @@ -1380,6 +1391,37 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
>  	if (index == -EINVAL)
>  		return -EINVAL;
>  
> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
> +	if (index == BTRFS_READ_POLICY_RR) {
> +		if (value != -1) {
> +			u32 sectorsize = fs_devices->fs_info->sectorsize;
> +
> +			if (!IS_ALIGNED(value, sectorsize)) {
> +				u64 temp_value = round_up(value, sectorsize);
> +
> +				btrfs_warn(fs_devices->fs_info,
> +"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu",
> +					  value, sectorsize, temp_value);
> +				value = temp_value;
> +			}
> +		} else {
> +			value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
> +		}
> +
> +		if (index != READ_ONCE(fs_devices->read_policy) ||
> +		    value != READ_ONCE(fs_devices->rr_min_contiguous_read)) {
> +			WRITE_ONCE(fs_devices->read_policy, index);
> +			WRITE_ONCE(fs_devices->rr_min_contiguous_read, value);
> +			atomic_set(&fs_devices->total_reads, 0);
> +
> +			btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
> +				   btrfs_read_policy_name[index], value);
> +
> +		}
> +
> +		return len;
> +	}
> +#endif
>  	if (index != READ_ONCE(fs_devices->read_policy)) {
>  		WRITE_ONCE(fs_devices->read_policy, index);
>  		btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index fe5ceea2ba0b..77c3b66d56a0 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1328,6 +1328,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
>  	fs_devices->total_rw_bytes = 0;
>  	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
>  	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
> +	fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
> +#endif
>  
>  	return 0;
>  }
> @@ -5959,6 +5962,71 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
>  	return len;
>  }
>  
> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
> +struct stripe_mirror {
> +	u64 devid;
> +	int num;
> +};
> +
> +static int btrfs_cmp_devid(const void *a, const void *b)
> +{
> +	const struct stripe_mirror *s1 = (struct stripe_mirror *)a;
> +	const struct stripe_mirror *s2 = (struct stripe_mirror *)b;
> +
> +	if (s1->devid < s2->devid)
> +		return -1;
> +	if (s1->devid > s2->devid)
> +		return 1;
> +	return 0;
> +}
> +
> +/*
> + * btrfs_read_rr.
> + *
> + * Select a stripe for reading using a round-robin algorithm:
> + *
> + *  1. Compute the read cycle as the total sectors read divided by the minimum
> + *  sectors per device.
> + *  2. Determine the stripe number for the current read by taking the modulus
> + *  of the read cycle with the total number of stripes:
> + *
> + *      stripe index = (total sectors / min sectors per dev) % num stripes
> + *
> + * The calculated stripe index is then used to select the corresponding device
> + * from the list of devices, which is ordered by devid.
> + */
> +static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
> +{
> +	struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0};
> +	struct btrfs_fs_devices *fs_devices;
> +	struct btrfs_device *device;
> +	int read_cycle;
> +	int index;
> +	int ret_stripe;
> +	int total_reads;
> +	int reads_per_dev = 0;
> +
> +	device = map->stripes[first].dev;
> +
> +	fs_devices = device->fs_devices;
> +	reads_per_dev = fs_devices->rr_min_contiguous_read >> SECTOR_SHIFT;

We want READ_ONCE() here as well. Also, is it OK to divide it by (1 <<
SECTOR_SHIFT), which is not necessarily equal to fs_info->sectorsize?

> +	index = 0;
> +	for (int i = first; i < first + num_stripe; i++) {
> +		stripes[index].devid = map->stripes[i].dev->devid;
> +		stripes[index].num = i;
> +		index++;
> +	}
> +	sort(stripes, num_stripe, sizeof(struct stripe_mirror),
> +	     btrfs_cmp_devid, NULL);
> +
> +	total_reads = atomic_inc_return(&fs_devices->total_reads);
> +	read_cycle = total_reads / reads_per_dev;
> +	ret_stripe = stripes[read_cycle % num_stripe].num;

I'm not sure about the logic here. Since the code increments the total_reads
counter by 1, can we assume this function is invoked once per
fs_info->sectorsize?

> +
> +	return ret_stripe;
> +}
> +#endif
> +
>  static int find_live_mirror(struct btrfs_fs_info *fs_info,
>  			    struct btrfs_chunk_map *map, int first,
>  			    int dev_replace_is_ongoing)
> @@ -5988,6 +6056,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
>  	case BTRFS_READ_POLICY_PID:
>  		preferred_mirror = first + (current->pid % num_stripes);
>  		break;
> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
> +	case BTRFS_READ_POLICY_RR:
> +		preferred_mirror = btrfs_read_rr(map, first, num_stripes);
> +		break;
> +#endif
>  	}
>  
>  	if (dev_replace_is_ongoing &&
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 3a416b1bc24c..b7b130ce0b10 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -296,6 +296,8 @@ enum btrfs_chunk_allocation_policy {
>  	BTRFS_CHUNK_ALLOC_ZONED,
>  };
>  
> +#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ	(SZ_256K)
> +#define BTRFS_RAID1_MAX_MIRRORS			(4)
>  /*
>   * Read policies for mirrored block group profiles, read picks the stripe based
>   * on these policies.
> @@ -303,6 +305,10 @@ enum btrfs_chunk_allocation_policy {
>  enum btrfs_read_policy {
>  	/* Use process PID to choose the stripe */
>  	BTRFS_READ_POLICY_PID,
> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
> +	/* Balancing raid1 reads across all striped devices (round-robin) */
> +	BTRFS_READ_POLICY_RR,
> +#endif
>  	BTRFS_NR_READ_POLICY,
>  };
>  
> @@ -431,6 +437,11 @@ struct btrfs_fs_devices {
>  	enum btrfs_read_policy read_policy;
>  
>  #ifdef CONFIG_BTRFS_EXPERIMENTAL
> +	/* IO stat, read counter. */
> +	atomic_t total_reads;
> +	/* Min contiguous reads before switching to next device. */
> +	int rr_min_contiguous_read;
> +
>  	/* Checksum mode - offload it or do it synchronously. */
>  	enum btrfs_offload_csum_mode offload_csum_mode;
>  #endif
> -- 
> 2.47.0
>
Anand Jain Dec. 18, 2024, 3:20 p.m. UTC | #2
On 18/12/24 11:23, Naohiro Aota wrote:
> On Tue, Dec 17, 2024 at 02:13:13AM +0800, Anand Jain wrote:
>> This feature balances I/O across the striped devices when reading from
>> RAID1 blocks.
>>
>>     echo round-robin[:min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy
>>
>> The min_contiguous_read parameter defines the minimum read size before
>> switching to the next mirrored device. This setting is optional, with a
>> default value of 256 KiB.
>>
>> Signed-off-by: Anand Jain <anand.jain@oracle.com>
>> ---
>>   fs/btrfs/sysfs.c   | 44 +++++++++++++++++++++++++++-
>>   fs/btrfs/volumes.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/volumes.h | 11 +++++++
>>   3 files changed, 127 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
>> index 9c7bedf974d2..b0e1fb787ce6 100644
>> --- a/fs/btrfs/sysfs.c
>> +++ b/fs/btrfs/sysfs.c
>> @@ -1305,7 +1305,12 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
>>   }
>>   BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
>>   
>> -static const char * const btrfs_read_policy_name[] = { "pid" };
>> +static const char *btrfs_read_policy_name[] = {
>> +	"pid",
>> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
>> +	"round-robin",
>> +#endif
>> +};
>>   
>>   static int btrfs_read_policy_to_enum(const char *str, s64 *value)
>>   {
>> @@ -1359,6 +1364,12 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
>>   
>>   		ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
>>   
>> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
>> +		if (i == BTRFS_READ_POLICY_RR)
>> +			ret += sysfs_emit_at(buf, ret, ":%d",
>> +					     fs_devices->rr_min_contiguous_read);
> 
> I guess we want READ_ONCE() here as well.
> 
>> +#endif
>> +
>>   		if (i == policy)
>>   			ret += sysfs_emit_at(buf, ret, "]");
>>   	}
>> @@ -1380,6 +1391,37 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
>>   	if (index == -EINVAL)
>>   		return -EINVAL;
>>   
>> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
>> +	if (index == BTRFS_READ_POLICY_RR) {
>> +		if (value != -1) {
>> +			u32 sectorsize = fs_devices->fs_info->sectorsize;
>> +
>> +			if (!IS_ALIGNED(value, sectorsize)) {
>> +				u64 temp_value = round_up(value, sectorsize);
>> +
>> +				btrfs_warn(fs_devices->fs_info,
>> +"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu",
>> +					  value, sectorsize, temp_value);
>> +				value = temp_value;
>> +			}
>> +		} else {
>> +			value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
>> +		}
>> +
>> +		if (index != READ_ONCE(fs_devices->read_policy) ||
>> +		    value != READ_ONCE(fs_devices->rr_min_contiguous_read)) {
>> +			WRITE_ONCE(fs_devices->read_policy, index);
>> +			WRITE_ONCE(fs_devices->rr_min_contiguous_read, value);
>> +			atomic_set(&fs_devices->total_reads, 0);
>> +
>> +			btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
>> +				   btrfs_read_policy_name[index], value);
>> +
>> +		}
>> +
>> +		return len;
>> +	}
>> +#endif
>>   	if (index != READ_ONCE(fs_devices->read_policy)) {
>>   		WRITE_ONCE(fs_devices->read_policy, index);
>>   		btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index fe5ceea2ba0b..77c3b66d56a0 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -1328,6 +1328,9 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
>>   	fs_devices->total_rw_bytes = 0;
>>   	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
>>   	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
>> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
>> +	fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
>> +#endif
>>   
>>   	return 0;
>>   }
>> @@ -5959,6 +5962,71 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
>>   	return len;
>>   }
>>   
>> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
>> +struct stripe_mirror {
>> +	u64 devid;
>> +	int num;
>> +};
>> +
>> +static int btrfs_cmp_devid(const void *a, const void *b)
>> +{
>> +	const struct stripe_mirror *s1 = (struct stripe_mirror *)a;
>> +	const struct stripe_mirror *s2 = (struct stripe_mirror *)b;
>> +
>> +	if (s1->devid < s2->devid)
>> +		return -1;
>> +	if (s1->devid > s2->devid)
>> +		return 1;
>> +	return 0;
>> +}
>> +
>> +/*
>> + * btrfs_read_rr.
>> + *
>> + * Select a stripe for reading using a round-robin algorithm:
>> + *
>> + *  1. Compute the read cycle as the total sectors read divided by the minimum
>> + *  sectors per device.
>> + *  2. Determine the stripe number for the current read by taking the modulus
>> + *  of the read cycle with the total number of stripes:
>> + *
>> + *      stripe index = (total sectors / min sectors per dev) % num stripes
>> + *
>> + * The calculated stripe index is then used to select the corresponding device
>> + * from the list of devices, which is ordered by devid.
>> + */
>> +static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
>> +{
>> +	struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0};
>> +	struct btrfs_fs_devices *fs_devices;
>> +	struct btrfs_device *device;
>> +	int read_cycle;
>> +	int index;
>> +	int ret_stripe;
>> +	int total_reads;
>> +	int reads_per_dev = 0;
>> +
>> +	device = map->stripes[first].dev;
>> +
>> +	fs_devices = device->fs_devices;
>> +	reads_per_dev = fs_devices->rr_min_contiguous_read >> SECTOR_SHIFT;
> 
> We want READ_ONCE() here as well. Also, is it OK to divide it by (1 <<
> SECTOR_SHIFT), which is not necessarily equal to fs_info->sectorsize?
> 
>> +	index = 0;
>> +	for (int i = first; i < first + num_stripe; i++) {
>> +		stripes[index].devid = map->stripes[i].dev->devid;
>> +		stripes[index].num = i;
>> +		index++;
>> +	}
>> +	sort(stripes, num_stripe, sizeof(struct stripe_mirror),
>> +	     btrfs_cmp_devid, NULL);
>> +
>> +	total_reads = atomic_inc_return(&fs_devices->total_reads);
>> +	read_cycle = total_reads / reads_per_dev;
>> +	ret_stripe = stripes[read_cycle % num_stripe].num;
> 
> I'm not sure about the logic here. Since the code increments the total_reads
> counter by 1, can we assume this function is invoked once per
> fs_info->sectorsize?
> 

You're right. To fix this, we need to track the read I/O stats in
`struct device` on our own, unless there is a better way. I avoided
this earlier because the block layer already provides I/O stats
(though they might be stale). I'm looking into it.

Thanks for your review.
-Anand


>> +
>> +	return ret_stripe;
>> +}
>> +#endif
>> +
>>   static int find_live_mirror(struct btrfs_fs_info *fs_info,
>>   			    struct btrfs_chunk_map *map, int first,
>>   			    int dev_replace_is_ongoing)
>> @@ -5988,6 +6056,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
>>   	case BTRFS_READ_POLICY_PID:
>>   		preferred_mirror = first + (current->pid % num_stripes);
>>   		break;
>> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
>> +	case BTRFS_READ_POLICY_RR:
>> +		preferred_mirror = btrfs_read_rr(map, first, num_stripes);
>> +		break;
>> +#endif
>>   	}
>>   
>>   	if (dev_replace_is_ongoing &&
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 3a416b1bc24c..b7b130ce0b10 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -296,6 +296,8 @@ enum btrfs_chunk_allocation_policy {
>>   	BTRFS_CHUNK_ALLOC_ZONED,
>>   };
>>   
>> +#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ	(SZ_256K)
>> +#define BTRFS_RAID1_MAX_MIRRORS			(4)
>>   /*
>>    * Read policies for mirrored block group profiles, read picks the stripe based
>>    * on these policies.
>> @@ -303,6 +305,10 @@ enum btrfs_chunk_allocation_policy {
>>   enum btrfs_read_policy {
>>   	/* Use process PID to choose the stripe */
>>   	BTRFS_READ_POLICY_PID,
>> +#ifdef CONFIG_BTRFS_EXPERIMENTAL
>> +	/* Balancing raid1 reads across all striped devices (round-robin) */
>> +	BTRFS_READ_POLICY_RR,
>> +#endif
>>   	BTRFS_NR_READ_POLICY,
>>   };
>>   
>> @@ -431,6 +437,11 @@ struct btrfs_fs_devices {
>>   	enum btrfs_read_policy read_policy;
>>   
>>   #ifdef CONFIG_BTRFS_EXPERIMENTAL
>> +	/* IO stat, read counter. */
>> +	atomic_t total_reads;
>> +	/* Min contiguous reads before switching to next device. */
>> +	int rr_min_contiguous_read;
>> +
>>   	/* Checksum mode - offload it or do it synchronously. */
>>   	enum btrfs_offload_csum_mode offload_csum_mode;
>>   #endif
>> -- 
>> 2.47.0
diff mbox series

Patch

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 9c7bedf974d2..b0e1fb787ce6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1305,7 +1305,12 @@  static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
 }
 BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
 
-static const char * const btrfs_read_policy_name[] = { "pid" };
+static const char *btrfs_read_policy_name[] = {
+	"pid",
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	"round-robin",
+#endif
+};
 
 static int btrfs_read_policy_to_enum(const char *str, s64 *value)
 {
@@ -1359,6 +1364,12 @@  static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 
 		ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+		if (i == BTRFS_READ_POLICY_RR)
+			ret += sysfs_emit_at(buf, ret, ":%d",
+					     fs_devices->rr_min_contiguous_read);
+#endif
+
 		if (i == policy)
 			ret += sysfs_emit_at(buf, ret, "]");
 	}
@@ -1380,6 +1391,37 @@  static ssize_t btrfs_read_policy_store(struct kobject *kobj,
 	if (index == -EINVAL)
 		return -EINVAL;
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	if (index == BTRFS_READ_POLICY_RR) {
+		if (value != -1) {
+			u32 sectorsize = fs_devices->fs_info->sectorsize;
+
+			if (!IS_ALIGNED(value, sectorsize)) {
+				u64 temp_value = round_up(value, sectorsize);
+
+				btrfs_warn(fs_devices->fs_info,
+"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu",
+					  value, sectorsize, temp_value);
+				value = temp_value;
+			}
+		} else {
+			value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
+		}
+
+		if (index != READ_ONCE(fs_devices->read_policy) ||
+		    value != READ_ONCE(fs_devices->rr_min_contiguous_read)) {
+			WRITE_ONCE(fs_devices->read_policy, index);
+			WRITE_ONCE(fs_devices->rr_min_contiguous_read, value);
+			atomic_set(&fs_devices->total_reads, 0);
+
+			btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
+				   btrfs_read_policy_name[index], value);
+
+		}
+
+		return len;
+	}
+#endif
 	if (index != READ_ONCE(fs_devices->read_policy)) {
 		WRITE_ONCE(fs_devices->read_policy, index);
 		btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fe5ceea2ba0b..77c3b66d56a0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1328,6 +1328,9 @@  static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
 	fs_devices->total_rw_bytes = 0;
 	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
 	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
+#endif
 
 	return 0;
 }
@@ -5959,6 +5962,71 @@  unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 	return len;
 }
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+struct stripe_mirror {
+	u64 devid;
+	int num;
+};
+
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+	const struct stripe_mirror *s1 = (struct stripe_mirror *)a;
+	const struct stripe_mirror *s2 = (struct stripe_mirror *)b;
+
+	if (s1->devid < s2->devid)
+		return -1;
+	if (s1->devid > s2->devid)
+		return 1;
+	return 0;
+}
+
+/*
+ * btrfs_read_rr.
+ *
+ * Select a stripe for reading using a round-robin algorithm:
+ *
+ *  1. Compute the read cycle as the total sectors read divided by the minimum
+ *  sectors per device.
+ *  2. Determine the stripe number for the current read by taking the modulus
+ *  of the read cycle with the total number of stripes:
+ *
+ *      stripe index = (total sectors / min sectors per dev) % num stripes
+ *
+ * The calculated stripe index is then used to select the corresponding device
+ * from the list of devices, which is ordered by devid.
+ */
+static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
+{
+	struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0};
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_device *device;
+	int read_cycle;
+	int index;
+	int ret_stripe;
+	int total_reads;
+	int reads_per_dev = 0;
+
+	device = map->stripes[first].dev;
+
+	fs_devices = device->fs_devices;
+	reads_per_dev = fs_devices->rr_min_contiguous_read >> SECTOR_SHIFT;
+	index = 0;
+	for (int i = first; i < first + num_stripe; i++) {
+		stripes[index].devid = map->stripes[i].dev->devid;
+		stripes[index].num = i;
+		index++;
+	}
+	sort(stripes, num_stripe, sizeof(struct stripe_mirror),
+	     btrfs_cmp_devid, NULL);
+
+	total_reads = atomic_inc_return(&fs_devices->total_reads);
+	read_cycle = total_reads / reads_per_dev;
+	ret_stripe = stripes[read_cycle % num_stripe].num;
+
+	return ret_stripe;
+}
+#endif
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct btrfs_chunk_map *map, int first,
 			    int dev_replace_is_ongoing)
@@ -5988,6 +6056,11 @@  static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	case BTRFS_READ_POLICY_PID:
 		preferred_mirror = first + (current->pid % num_stripes);
 		break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	case BTRFS_READ_POLICY_RR:
+		preferred_mirror = btrfs_read_rr(map, first, num_stripes);
+		break;
+#endif
 	}
 
 	if (dev_replace_is_ongoing &&
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3a416b1bc24c..b7b130ce0b10 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -296,6 +296,8 @@  enum btrfs_chunk_allocation_policy {
 	BTRFS_CHUNK_ALLOC_ZONED,
 };
 
+#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ	(SZ_256K)
+#define BTRFS_RAID1_MAX_MIRRORS			(4)
 /*
  * Read policies for mirrored block group profiles, read picks the stripe based
  * on these policies.
@@ -303,6 +305,10 @@  enum btrfs_chunk_allocation_policy {
 enum btrfs_read_policy {
 	/* Use process PID to choose the stripe */
 	BTRFS_READ_POLICY_PID,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* Balancing raid1 reads across all striped devices (round-robin) */
+	BTRFS_READ_POLICY_RR,
+#endif
 	BTRFS_NR_READ_POLICY,
 };
 
@@ -431,6 +437,11 @@  struct btrfs_fs_devices {
 	enum btrfs_read_policy read_policy;
 
 #ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* IO stat, read counter. */
+	atomic_t total_reads;
+	/* Min contiguous reads before switching to next device. */
+	int rr_min_contiguous_read;
+
 	/* Checksum mode - offload it or do it synchronously. */
 	enum btrfs_offload_csum_mode offload_csum_mode;
 #endif