diff mbox series

[2/2,v2,RFC] btrfs: create chunk device type aware

Message ID 007dccb0651ea5d278d88d9f991214543c1a14c5.1657536723.git.anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show
Series device type and create chunk | expand

Commit Message

Anand Jain July 11, 2022, 11:14 a.m. UTC
A mixed device-type use case prefers that data chunks are allocated on the
higher-latency device type and metadata chunks on the faster (lower-latency)
device type when possible.

As of now, in the function gather_device_info() called from
btrfs_create_chunk(), we sort the devices based on unallocated space only.
After this patch, the function will also check for mixed device types.

First, it sorts the devices based on the latency. That is, sort
ascending if the allocation type is metadata and reverse-sort if the
allocation type is data. Next, within a device type, sort the devices by
their free space.

The btrfs_devices_by_latency[] array lists the enum btrfs_dev_types values
in ascending order of latency. It is a simple static list that helps in the
most common cases. User-configurable options can be added later.

When one of the device types runs out of free space, devices of that type
will not make it into the available device list, and allocation continues on
the next preferred device type that still has free space. At some point
later, a user option could change this behaviour to fail with ENOSPC or to
warn() instead.

The advantage of this method is that the distribution of data and metadata
allocations across device types happens automatically, for better
performance, without any manual configuration.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
v2: Initialize btrfs_dev_types array btrfs_devices_by_latency to hold
     latency value. (Kdave).
    Sort devices by type and then by latency. (Kdave).

 fs/btrfs/volumes.c | 89 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 84 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b8ab13127caf..838ebf62e517 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5027,6 +5027,47 @@  static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+/* Device types in ascending latency order; metadata prefers the top entry. */
+static const enum btrfs_dev_types btrfs_devices_by_latency[] = {
+	BTRFS_DEV_TYPE_NVME,
+	BTRFS_DEV_TYPE_NONROT,
+	BTRFS_DEV_TYPE_ZNS,
+	BTRFS_DEV_TYPE_ROT,
+	BTRFS_DEV_TYPE_ZONED,
+};
+
+/* Rank @type by its slot in btrfs_devices_by_latency[]; -EINVAL if absent. */
+static int btrfs_dev_type_to_latency(enum btrfs_dev_types type)
+{
+	unsigned int p;
+
+	/* Bound by the table size, not the enum count, to stay in range. */
+	for (p = 0; p < ARRAY_SIZE(btrfs_devices_by_latency); p++)
+		if (btrfs_devices_by_latency[p] == type)
+			return p;
+	return -EINVAL;
+}
+
+/* sort() comparator: order device infos from lowest to highest latency. */
+static int btrfs_device_latency_asc(const void *a, const void *b)
+{
+	const struct btrfs_device_info *lhs = a;
+	const struct btrfs_device_info *rhs = b;
+	int lhs_rank;
+	int rhs_rank;
+
+	lhs_rank = btrfs_dev_type_to_latency(lhs->dev->dev_type);
+	rhs_rank = btrfs_dev_type_to_latency(rhs->dev->dev_type);
+
+	/* Yields 1, -1 or 0 without any subtraction of the two ranks. */
+	return (lhs_rank > rhs_rank) - (lhs_rank < rhs_rank);
+}
+
+static int btrfs_device_latency_des(const void *a, const void *b)
+{
+	return btrfs_device_latency_asc(b, a);	/* swapped: descending */
+}
+
 /*
  * sort the devices in descending order by max_avail, total_avail
  */
@@ -5185,6 +5226,7 @@  static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
 	int ret;
 	int ndevs = 0;
+	unsigned int mixed_type = 0;
 	u64 max_avail;
 	u64 dev_offset;
 
@@ -5239,15 +5281,52 @@  static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
+		mixed_type |= 1 << device->dev_type;
 		++ndevs;
 	}
 	ctl->ndevs = ndevs;
 
-	/*
-	 * now sort the devices by hole size / available space
-	 */
-	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
-	     btrfs_cmp_device_info, NULL);
+	/* Check if the gathered devices have mixed device types. */
+	if (mixed_type && !is_power_of_2(mixed_type)) {
+		u64 cur_index;
+		u64 start_index;
+		int start_type;
+
+		/*
+		 * Sort devices by their type. Ascending for metadata and descending
+		 * for the data chunks.
+		 */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+		     ctl->type & BTRFS_BLOCK_GROUP_DATA ?
+		     btrfs_device_latency_des : btrfs_device_latency_asc,
+		     NULL);
+
+		/* Now sort devices in each type by its available space */
+		start_index = 0;
+		start_type = devices_info[0].dev->dev_type;
+		for (cur_index = 1; cur_index < ndevs; cur_index++) {
+			int cur_type = devices_info[cur_index].dev->dev_type;
+
+			if (cur_type == start_type)
+				continue;
+
+			sort(&devices_info[start_index],
+			     cur_index - start_index,
+			     sizeof(struct btrfs_device_info),
+			     btrfs_cmp_device_info, NULL);
+
+			start_index = cur_index;
+			start_type = cur_type;
+		}
+		if (cur_index - start_index > 1)
+			sort(&devices_info[start_index], cur_index - start_index,
+			     sizeof(struct btrfs_device_info),
+			     btrfs_cmp_device_info, NULL);
+	} else {
+		/* Sort the devices by hole size / available space */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+		     btrfs_cmp_device_info, NULL);
+	}
 
 	return 0;
 }