diff mbox series

[RFC,4/4] btrfs: introduce new read_policy round-robin

Message ID 8e0afaa33f33d1a5efbf37fa4465954056ce3f59.1610324448.git.anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show
Series [v3,1/4] btrfs: add read_policy latency | expand

Commit Message

Anand Jain Jan. 11, 2021, 9:41 a.m. UTC
Add round-robin read policy to route the read IO to the next device in the
round-robin order. The chunk allocation and thus the stripe-index follows
the order of free space available on devices. So to make the round-robin
effective it shall follow the devid order instead of the stripe-index
order.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
--
RFC because: Provides terrible performance with the fio tests.
I am not yet sure if there is any io workload or a block layer
tuning that shall make this policy better. As of now just an
experimental patch.

 fs/btrfs/sysfs.c   |  2 +-
 fs/btrfs/volumes.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  3 +++
 3 files changed, 54 insertions(+), 1 deletion(-)

Comments

Josef Bacik Jan. 19, 2021, 7:41 p.m. UTC | #1
On 1/11/21 4:41 AM, Anand Jain wrote:
> Add round-robin read policy to route the read IO to the next device in the
> round-robin order. The chunk allocation and thus the stripe-index follows
> the order of free space available on devices. So to make the round-robin
> effective it shall follow the devid order instead of the stripe-index
> order.
> 
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> --
> RFC because: Provides terrible performance with the fio tests.
> I am not yet sure if there is any io workload or a block layer
> tuning that shall make this policy better. As of now just an
> experimental patch.
> 

Just drop this one, if we can't find a reason to use it then don't bother adding 
the code.  The other options have real world valuable uses, so stick with those. 
  Thanks,

Josef
Anand Jain Jan. 20, 2021, 2:40 a.m. UTC | #2
On 20/1/21 3:41 am, Josef Bacik wrote:
> On 1/11/21 4:41 AM, Anand Jain wrote:
>> Add round-robin read policy to route the read IO to the next device in 
>> the
>> round-robin order. The chunk allocation and thus the stripe-index follows
>> the order of free space available on devices. So to make the round-robin
>> effective it shall follow the devid order instead of the stripe-index
>> order.
>>
>> Signed-off-by: Anand Jain <anand.jain@oracle.com>
>> -- 
>> RFC because: Provides terrible performance with the fio tests.
>> I am not yet sure if there is any io workload or a block layer
>> tuning that shall make this policy better. As of now just an
>> experimental patch.
>>
> 
> Just drop this one, if we can't find a reason to use it then don't 
> bother adding the code.  The other options have real world valuable 
> uses, so stick with those.  Thanks,
> 

  Yep. I will drop this patch in the next iteration.

  The low performance is attributed to the low number of read IO
  merges in the block layer. The consecutive blocks in my test case
  (fio random read) were read from the other disk, so the block layer
  lost the opportunity to merge the IOs.

Thanks, Anand

> Josef
>
diff mbox series

Patch

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 899b66c83db1..d40b0ff054ca 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -917,7 +917,7 @@  static bool strmatch(const char *buffer, const char *string)
 
 /* Must follow the order as in enum btrfs_read_policy */
 static const char * const btrfs_read_policy_name[] = { "pid", "latency",
-						       "device" };
+						       "device", "roundrobin" };
 
 static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 				      struct kobj_attribute *a, char *buf)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 50d4d54f7abd..60370b9121e0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5491,6 +5491,52 @@  int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+struct stripe_mirror {
+	u64 devid;
+	int map;
+};
+
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+	struct stripe_mirror *s1 = (struct stripe_mirror *)a;
+	struct stripe_mirror *s2 = (struct stripe_mirror *)b;
+
+	if (s1->devid < s2->devid)
+		return -1;
+	if (s1->devid > s2->devid)
+		return 1;
+	return 0;
+}
+
+static int btrfs_find_read_round_robin(struct map_lookup *map, int first,
+				       int num_stripe)
+{
+	struct stripe_mirror stripes[4] = {0}; //4: for testing, works for now.
+	struct btrfs_fs_devices *fs_devices;
+	u64 devid;
+	int index, j, cnt;
+	int next_stripe;
+
+	index = 0;
+	for (j = first; j < first + num_stripe; j++) {
+		devid = map->stripes[j].dev->devid;
+
+		stripes[index].devid = devid;
+		stripes[index].map = j;
+
+		index++;
+	}
+
+	sort(stripes, num_stripe, sizeof(struct stripe_mirror),
+	     btrfs_cmp_devid, NULL);
+
+	fs_devices = map->stripes[first].dev->fs_devices;
+	cnt = atomic_inc_return(&fs_devices->total_reads);
+	next_stripe = stripes[cnt % num_stripe].map;
+
+	return next_stripe;
+}
+
 static int btrfs_find_best_stripe(struct btrfs_fs_info *fs_info,
 				  struct map_lookup *map, int first,
 				  int num_stripe)
@@ -5579,6 +5625,10 @@  static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	case BTRFS_READ_POLICY_DEVICE:
 		preferred_mirror = btrfs_find_read_preferred(map, first, num_stripes);
 		break;
+	case BTRFS_READ_POLICY_ROUND_ROBIN:
+		preferred_mirror = btrfs_find_read_round_robin(map, first,
+							       num_stripes);
+		break;
 	}
 
 	if (dev_replace_is_ongoing &&
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8d5a2cddc0ab..ce4490437f53 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -227,6 +227,8 @@  enum btrfs_read_policy {
 	BTRFS_READ_POLICY_LATENCY,
 	/* Use the device marked with READ_PREFERRED state */
 	BTRFS_READ_POLICY_DEVICE,
+	/* Distribute read IO equally across striped devices */
+	BTRFS_READ_POLICY_ROUND_ROBIN,
 	BTRFS_NR_READ_POLICY,
 };
 
@@ -286,6 +288,7 @@  struct btrfs_fs_devices {
 
 	/* Policy used to read the mirrored stripes */
 	enum btrfs_read_policy read_policy;
+	atomic_t total_reads;
 };
 
 #define BTRFS_BIO_INLINE_CSUM_SIZE	64