@@ -5027,6 +5027,47 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
return 0;
}
+/* Device types in ascending latency order; metadata prefers the top, data the bottom. */
+static const enum btrfs_dev_types btrfs_devices_by_latency[] = {
+	BTRFS_DEV_TYPE_NVME,
+	BTRFS_DEV_TYPE_NONROT,
+	BTRFS_DEV_TYPE_ZNS,
+	BTRFS_DEV_TYPE_ROT,
+	BTRFS_DEV_TYPE_ZONED,
+};
+
+/* Index of @type in the latency table, or -EINVAL; bounded by the table's own length. */
+static int btrfs_dev_type_to_latency(enum btrfs_dev_types type)
+{
+	int p;
+
+	for (p = 0; p < ARRAY_SIZE(btrfs_devices_by_latency); p++) {
+		if (btrfs_devices_by_latency[p] == type)
+			return p;
+	}
+	return -EINVAL;
+}
+
+/*
+ * Comparator for sort(): orders struct btrfs_device_info entries by the
+ * ascending latency rank of their device type.
+ */
+static int btrfs_device_latency_asc(const void *a, const void *b)
+{
+	const struct btrfs_device_info *lhs = a;
+	const struct btrfs_device_info *rhs = b;
+	int lhs_rank = btrfs_dev_type_to_latency(lhs->dev->dev_type);
+	int rhs_rank = btrfs_dev_type_to_latency(rhs->dev->dev_type);
+
+	/* Evaluates to 1, -1 or 0 without subtracting the two ranks. */
+	return (lhs_rank > rhs_rank) - (lhs_rank < rhs_rank);
+}
+
+static int btrfs_device_latency_des(const void *a, const void *b)
+{
+	return btrfs_device_latency_asc(b, a);	/* swapped operands => descending */
+}
+
/*
* sort the devices in descending order by max_avail, total_avail
*/
@@ -5185,6 +5226,7 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
int ret;
int ndevs = 0;
+ unsigned int mixed_type = 0;
u64 max_avail;
u64 dev_offset;
@@ -5239,15 +5281,52 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
devices_info[ndevs].max_avail = max_avail;
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
+ mixed_type |= 1 << device->dev_type;
++ndevs;
}
ctl->ndevs = ndevs;
- /*
- * now sort the devices by hole size / available space
- */
- sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
- btrfs_cmp_device_info, NULL);
+ /* Check if the gathered devices have mixed device types. */
+ if (mixed_type && !is_power_of_2(mixed_type)) {
+ u64 cur_index;
+ u64 start_index;
+ int start_type;
+
+ /*
+ * Sort devices by their type. Ascending for metadata and descending
+ * for the data chunks.
+ */
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+ ctl->type & BTRFS_BLOCK_GROUP_DATA ?
+ btrfs_device_latency_des : btrfs_device_latency_asc,
+ NULL);
+
+ /* Now sort devices in each type by its available space */
+ start_index = 0;
+ start_type = devices_info[0].dev->dev_type;
+ for (cur_index = 1; cur_index < ndevs; cur_index++) {
+ int cur_type = devices_info[cur_index].dev->dev_type;
+
+ if (cur_type == start_type)
+ continue;
+
+ sort(&devices_info[start_index],
+ cur_index - start_index,
+ sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
+
+ start_index = cur_index;
+ start_type = cur_type;
+ }
+ if (cur_index - start_index > 1)
+ sort(&devices_info[start_index], cur_index - start_index,
+ sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
+ } else {
+ /* Sort the devices by hole size / available space */
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
+ }
return 0;
}
The mixed device-types use case prefers that data chunks are allocated on the higher-latency (slower) device type and metadata chunks on the lower-latency (faster) device type when possible. As of now, in the function gather_device_info() called from btrfs_create_chunk(), we sort the devices based on unallocated space only. After this patch, the function also checks for mixed device types. First, it sorts the devices based on latency: ascending if the allocation type is metadata, and descending if the allocation type is data. Next, within each device type, it sorts the devices by their free space. The btrfs_devices_by_latency[] array lists the enum btrfs_dev_types values in ascending order of latency. It is a simple static list that helps in the most common cases; user options can be added later. When one of the device types runs out of free space, its devices will not make it to the available device list, and allocation will continue on the next preferred device type that has free space. At some point later, we can make this behaviour configurable by a user option, to fail with ENOSPC or to warn() instead. The advantage of this method is that data/metadata allocation is distributed by device type automatically, for performance, without any manual configuration. Signed-off-by: Anand Jain <anand.jain@oracle.com> --- v2: Initialize btrfs_dev_types array btrfs_devices_by_latency to hold latency value. (Kdave). Sort devices by type and then by latency. (Kdave). fs/btrfs/volumes.c | 89 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 5 deletions(-)