diff mbox series

[1/2] btrfs: keep device type in the struct btrfs_device

Message ID c815946b0b05990230e9342cc50da3d146268b65.1642518245.git.anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show
Series device type and create chunk | expand

Commit Message

Anand Jain Jan. 18, 2022, 3:18 p.m. UTC
Preparation to make data/metadata chunks allocations based on the device
types- keep the identified device type in the struct btrfs_device.

This patch adds a member 'dev_type' to hold the defined device types in
the struct btrfs_device.

Also, add a helper function and a struct btrfs_fs_devices member
'mixed_dev_types' to know if the filesystem contains mixed device
types.

Struct btrfs_device has an existing member 'type' that stages and writes
back to the on-disk format. This patch does not use it, as in-memory-only
data suffices for the requirement here.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/dev-replace.c |  2 ++
 fs/btrfs/volumes.c     | 45 ++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h     | 26 +++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 1 deletion(-)

Comments

David Sterba Jan. 26, 2022, 4:53 p.m. UTC | #1
On Tue, Jan 18, 2022 at 11:18:01PM +0800, Anand Jain wrote:
> Preparation to make data/metadata chunks allocations based on the device
> types- keep the identified device type in the struct btrfs_device.
> 
> This patch adds a member 'dev_type' to hold the defined device types in
> the struct btrfs_devices.
> 
> Also, add a helper function and a struct btrfs_fs_devices member
> 'mixed_dev_type' to know if the filesystem contains the mixed device
> types.
> 
> Struct btrfs_device has an existing member 'type' that stages and writes
> back to the on-disk format. This patch does not use it. As just an
> in-memory only data will suffice the requirement here.
> 
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> ---
>  fs/btrfs/dev-replace.c |  2 ++
>  fs/btrfs/volumes.c     | 45 ++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/volumes.h     | 26 +++++++++++++++++++++++-
>  3 files changed, 72 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index 71fd99b48283..3731c7d1c6b7 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -325,6 +325,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>  	device->dev_stats_valid = 1;
>  	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>  	device->fs_devices = fs_devices;
> +	device->dev_type = btrfs_get_device_type(device);
>  
>  	ret = btrfs_get_dev_zone_info(device, false);
>  	if (ret)
> @@ -334,6 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>  	list_add(&device->dev_list, &fs_devices->devices);
>  	fs_devices->num_devices++;
>  	fs_devices->open_devices++;
> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>  	mutex_unlock(&fs_devices->device_list_mutex);
>  
>  	*device_out = device;
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 9d50e035e61d..da3d6d0f5bc3 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1041,6 +1041,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
>  			     device->generation > (*latest_dev)->generation)) {
>  				*latest_dev = device;
>  			}
> +			device->dev_type = btrfs_get_device_type(device);
>  			continue;
>  		}
>  
> @@ -1084,6 +1085,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
>  		__btrfs_free_extra_devids(seed_dev, &latest_dev);
>  
>  	fs_devices->latest_dev = latest_dev;
> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>  
>  	mutex_unlock(&uuid_mutex);
>  }
> @@ -2183,6 +2185,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>  
>  	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
>  	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
> +
> +	cur_devices->mixed_dev_types = btrfs_has_mixed_dev_types(cur_devices);
> +
>  	mutex_unlock(&fs_devices->device_list_mutex);
>  
>  	/*
> @@ -2584,6 +2589,44 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
>  	return ret;
>  }
>  
> +bool btrfs_has_mixed_dev_types(struct btrfs_fs_devices *fs_devices)
> +{
> +	struct btrfs_device *device;
> +	int type_rot = 0;
> +	int type_nonrot = 0;
> +
> +	list_for_each_entry(device, &fs_devices->devices, dev_list) {
> +
> +		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
> +			continue;
> +
> +		switch (device->dev_type) {
> +		case BTRFS_DEV_TYPE_ROT:
> +			type_rot++;
> +			break;
> +		case BTRFS_DEV_TYPE_NONROT:
> +		default:
> +			type_nonrot++;
> +		}
> +	}
> +
> +	if (type_rot && type_nonrot)
> +		return true;
> +	else
> +		return false;
> +}
> +
> +enum btrfs_dev_types btrfs_get_device_type(struct btrfs_device *device)
> +{
> +	if (bdev_is_zoned(device->bdev))
> +		return BTRFS_DEV_TYPE_ZONED;
> +
> +	if (blk_queue_nonrot(bdev_get_queue(device->bdev)))
> +		return BTRFS_DEV_TYPE_NONROT;
> +
> +	return BTRFS_DEV_TYPE_ROT;
> +}
> +
>  int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
>  {
>  	struct btrfs_root *root = fs_info->dev_root;
> @@ -2675,6 +2718,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>  	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
>  	device->mode = FMODE_EXCL;
>  	device->dev_stats_valid = 1;
> +	device->dev_type = btrfs_get_device_type(device);
>  	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>  
>  	if (seeding_dev) {
> @@ -2710,6 +2754,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>  	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
>  
>  	fs_devices->rotating = !blk_queue_nonrot(bdev_get_queue(bdev));
> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>  
>  	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
>  	btrfs_set_super_total_bytes(fs_info->super_copy,
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 6a790b85edd8..5be31161601d 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -52,6 +52,16 @@ struct btrfs_io_geometry {
>  #define BTRFS_DEV_STATE_FLUSH_SENT	(4)
>  #define BTRFS_DEV_STATE_NO_READA	(5)
>  
> +/*
> + * Device class types arranged by their IO latency from low to high.
> + */
> +enum btrfs_dev_types {
> +	BTRFS_DEV_TYPE_MEM = 1,
> +	BTRFS_DEV_TYPE_NONROT,
> +	BTRFS_DEV_TYPE_ROT,
> +	BTRFS_DEV_TYPE_ZONED,

I think this should be separate, in one list define all sensible device
types and then add arrays sorted by latency, or other factors.

The zoned devices are mostly HDD but there are also SSD-like ones using ZNS,
so that can't be both under BTRFS_DEV_TYPE_ZONED and behind
BTRFS_DEV_TYPE_ROT as if it had worse latency.

I'm not sure how much we should try to guess the device types, the ones
you have so far are almost all we can get without peeking too much into
the devices/queues internals.

Also here's the terminology question if we should still consider
rotational device status as the main criteria, because then SSD, NVMe,
RAM-backed are all non-rotational but with different latency
characteristics.

> +};
> +
>  struct btrfs_zoned_device_info;
>  
>  struct btrfs_device {
> @@ -101,9 +111,16 @@ struct btrfs_device {
>  
>  	/* optimal io width for this device */
>  	u32 io_width;
> -	/* type and info about this device */
> +
> +	/* Type and info about this device, on-disk (currently unused) */
>  	u64 type;
>  
> +	/*
> +	 * Device type (in memory only) at some point, merge to the on-disk
> +	 * member 'type' above.
> +	 */
> +	enum btrfs_dev_types dev_type;

So at some point this is planned to be user configurable? We should be
always able to detect the device type at mount time, so I wonder if this
needs to be stored on disk.
Anand Jain Jan. 29, 2022, 4:24 p.m. UTC | #2
On 27/01/2022 00:53, David Sterba wrote:
> On Tue, Jan 18, 2022 at 11:18:01PM +0800, Anand Jain wrote:
>> Preparation to make data/metadata chunks allocations based on the device
>> types- keep the identified device type in the struct btrfs_device.
>>
>> This patch adds a member 'dev_type' to hold the defined device types in
>> the struct btrfs_devices.
>>
>> Also, add a helper function and a struct btrfs_fs_devices member
>> 'mixed_dev_type' to know if the filesystem contains the mixed device
>> types.
>>
>> Struct btrfs_device has an existing member 'type' that stages and writes
>> back to the on-disk format. This patch does not use it. As just an
>> in-memory only data will suffice the requirement here.
>>
>> Signed-off-by: Anand Jain <anand.jain@oracle.com>
>> ---
>>   fs/btrfs/dev-replace.c |  2 ++
>>   fs/btrfs/volumes.c     | 45 ++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/volumes.h     | 26 +++++++++++++++++++++++-
>>   3 files changed, 72 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
>> index 71fd99b48283..3731c7d1c6b7 100644
>> --- a/fs/btrfs/dev-replace.c
>> +++ b/fs/btrfs/dev-replace.c
>> @@ -325,6 +325,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>>   	device->dev_stats_valid = 1;
>>   	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>>   	device->fs_devices = fs_devices;
>> +	device->dev_type = btrfs_get_device_type(device);
>>   
>>   	ret = btrfs_get_dev_zone_info(device, false);
>>   	if (ret)
>> @@ -334,6 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>>   	list_add(&device->dev_list, &fs_devices->devices);
>>   	fs_devices->num_devices++;
>>   	fs_devices->open_devices++;
>> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>>   	mutex_unlock(&fs_devices->device_list_mutex);
>>   
>>   	*device_out = device;
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 9d50e035e61d..da3d6d0f5bc3 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -1041,6 +1041,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
>>   			     device->generation > (*latest_dev)->generation)) {
>>   				*latest_dev = device;
>>   			}
>> +			device->dev_type = btrfs_get_device_type(device);
>>   			continue;
>>   		}
>>   
>> @@ -1084,6 +1085,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
>>   		__btrfs_free_extra_devids(seed_dev, &latest_dev);
>>   
>>   	fs_devices->latest_dev = latest_dev;
>> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>>   
>>   	mutex_unlock(&uuid_mutex);
>>   }
>> @@ -2183,6 +2185,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>>   
>>   	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
>>   	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
>> +
>> +	cur_devices->mixed_dev_types = btrfs_has_mixed_dev_types(cur_devices);
>> +
>>   	mutex_unlock(&fs_devices->device_list_mutex);
>>   
>>   	/*
>> @@ -2584,6 +2589,44 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
>>   	return ret;
>>   }
>>   
>> +bool btrfs_has_mixed_dev_types(struct btrfs_fs_devices *fs_devices)
>> +{
>> +	struct btrfs_device *device;
>> +	int type_rot = 0;
>> +	int type_nonrot = 0;
>> +
>> +	list_for_each_entry(device, &fs_devices->devices, dev_list) {
>> +
>> +		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>> +			continue;
>> +
>> +		switch (device->dev_type) {
>> +		case BTRFS_DEV_TYPE_ROT:
>> +			type_rot++;
>> +			break;
>> +		case BTRFS_DEV_TYPE_NONROT:
>> +		default:
>> +			type_nonrot++;
>> +		}
>> +	}
>> +
>> +	if (type_rot && type_nonrot)
>> +		return true;
>> +	else
>> +		return false;
>> +}
>> +
>> +enum btrfs_dev_types btrfs_get_device_type(struct btrfs_device *device)
>> +{
>> +	if (bdev_is_zoned(device->bdev))
>> +		return BTRFS_DEV_TYPE_ZONED;
>> +
>> +	if (blk_queue_nonrot(bdev_get_queue(device->bdev)))
>> +		return BTRFS_DEV_TYPE_NONROT;
>> +
>> +	return BTRFS_DEV_TYPE_ROT;
>> +}
>> +
>>   int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
>>   {
>>   	struct btrfs_root *root = fs_info->dev_root;
>> @@ -2675,6 +2718,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>>   	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
>>   	device->mode = FMODE_EXCL;
>>   	device->dev_stats_valid = 1;
>> +	device->dev_type = btrfs_get_device_type(device);
>>   	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>>   
>>   	if (seeding_dev) {
>> @@ -2710,6 +2754,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>>   	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
>>   
>>   	fs_devices->rotating = !blk_queue_nonrot(bdev_get_queue(bdev));
>> +	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
>>   
>>   	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
>>   	btrfs_set_super_total_bytes(fs_info->super_copy,
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 6a790b85edd8..5be31161601d 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -52,6 +52,16 @@ struct btrfs_io_geometry {
>>   #define BTRFS_DEV_STATE_FLUSH_SENT	(4)
>>   #define BTRFS_DEV_STATE_NO_READA	(5)
>>   
>> +/*
>> + * Device class types arranged by their IO latency from low to high.
>> + */
>> +enum btrfs_dev_types {
>> +	BTRFS_DEV_TYPE_MEM = 1,
>> +	BTRFS_DEV_TYPE_NONROT,
>> +	BTRFS_DEV_TYPE_ROT,
>> +	BTRFS_DEV_TYPE_ZONED,
> 
> I think this should be separate, in one list define all sensible device
> types and then add arrays sorted by latency, or other factors.
>
> The zoned devices are mostly HDD but ther are also SSD-like using ZNS,
> so that can't be both under BTRFS_DEV_TYPE_ZONED and behind
> BTRFS_DEV_TYPE_ROT as if it had worse latency.

I still do not have a complete idea of its implementation using an array.
Do you mean something like as below,

enum btrfs_dev_types {
	::
};

struct btrfs_dev_type {
	enum btrfs_dev_types dev_type;
	int priority_latency;
};


I think we can identify a ZNS and set its relative latency to a value 
lower (better) than rotational.


> I'm not sure how much we should try to guess the device types, the ones
> you have so far are almost all we can get without peeking too much into
> the devices/queues internals.

Agree.

> Also here's the terminology question if we should still consider
> rotational device status as the main criteria, because then SSD, NVMe,
> RAM-backed are all non-rotational but with different latency
> characteristics.

Right. The expected performance also depends on the interconnect type
which is undetectable.

IMO we shouldn't worry about the non-rotational's interconnect types
(M2/PCIe/NVMe/SATA/SAS) even though they have different performances.

Because some of these interconnects are compatible with each other, a
medium might change its interconnect during the lifecycle of the data.

Then left with are the types of mediums- rotational, non-rotational and
zoned which, we can identify safely.

Use-cases plan to mix these types of mediums to achieve the
cost-per-byte advantage. As of now, I think our target can be to help this
kind of planned mixing.

>> +};
>> +
>>   struct btrfs_zoned_device_info;
>>   
>>   struct btrfs_device {
>> @@ -101,9 +111,16 @@ struct btrfs_device {
>>   
>>   	/* optimal io width for this device */
>>   	u32 io_width;
>> -	/* type and info about this device */
>> +
>> +	/* Type and info about this device, on-disk (currently unused) */
>>   	u64 type;
>>   
>> +	/*
>> +	 * Device type (in memory only) at some point, merge to the on-disk
>> +	 * member 'type' above.
>> +	 */
>> +	enum btrfs_dev_types dev_type;
> 
> So at some point this is planned to be user configurable? We should be
> always able to detect the device type at mount type so I wonder if this
> needs to be stored on disk.

I didn't find any planned configs (white papers) discussing the
advantages of mixing non-rotational drives with different interconnect
types. If any then, we may have to provide the manual configuration.
(If the mixing is across the medium types like non-rotational with
either zoned or HDD, then the users don't have to configure as we can
optimize the allocation automatically for performance.)

Also, hot cache device or device grouping (when implemented) need the
user to configure.

And so maybe part of in-memory 'dev_type' would be in-sync with on-disk
'type' at some point. IMO.

Thanks, Anand
David Sterba Feb. 1, 2022, 5:06 p.m. UTC | #3
On Sun, Jan 30, 2022 at 12:24:15AM +0800, Anand Jain wrote:
> On 27/01/2022 00:53, David Sterba wrote:
> > On Tue, Jan 18, 2022 at 11:18:01PM +0800, Anand Jain wrote:
> >> +/*
> >> + * Device class types arranged by their IO latency from low to high.
> >> + */
> >> +enum btrfs_dev_types {
> >> +	BTRFS_DEV_TYPE_MEM = 1,
> >> +	BTRFS_DEV_TYPE_NONROT,
> >> +	BTRFS_DEV_TYPE_ROT,
> >> +	BTRFS_DEV_TYPE_ZONED,
> > 
> > I think this should be separate, in one list define all sensible device
> > types and then add arrays sorted by latency, or other factors.
> >
> > The zoned devices are mostly HDD but ther are also SSD-like using ZNS,
> > so that can't be both under BTRFS_DEV_TYPE_ZONED and behind
> > BTRFS_DEV_TYPE_ROT as if it had worse latency.
> 
> I still do not have a complete idea of its implantation using an array. 
> Do you mean something like as below,
> 
> enum btrfs_dev_types {
> 	::
> };
> 
> struct btrfs_dev_type {
> 	enum btrfs_dev_types dev_type;
> 	int priority_latency;
> };

	enum btrfs_dev_types {
		BTRFS_DEV_TYPE_HDD,
		BTRFS_DEV_TYPE_SSD,
		BTRFS_DEV_TYPE_NVME,
		BTRFS_DEV_TYPE_ZONED_HDD,
		BTRFS_DEV_TYPE_ZONED_ZNS,
	};

	enum btrfs_dev_types btrfs_devices_by_latency[] = {
		BTRFS_DEV_TYPE_NVME,
		BTRFS_DEV_TYPE_SSD,
		BTRFS_DEV_TYPE_ZONED_ZNS,
		BTRFS_DEV_TYPE_HDD,
		BTRFS_DEV_TYPE_ZONED_HDD,
	};

	enum btrfs_dev_types btrfs_devices_by_capacity[] = {
		BTRFS_DEV_TYPE_ZONED_HDD,
		BTRFS_DEV_TYPE_HDD,
		BTRFS_DEV_TYPE_SSD,
		BTRFS_DEV_TYPE_ZONED_ZNS,
		BTRFS_DEV_TYPE_NVME,
	};

Then if the chunk allocator has a selected policy (here by latency and
by capacity), it would use the list as additional sorting criteria.

Optimizing for latency: device 1 (SSD) vs device 2 (NVME), pick NVME
even if device 1 would be otherwise picked due to the capacity criteria
(as we have it now).

> I think we can identify a ZNS and set its relative latency to a value 
> lower (better) than rotational.

The device classes are general; I don't mean to measure the latency
exactly — it's usually typical for the whole class, e.g. NVME is
considered better than SSD and SSD better than HDD.

> > I'm not sure how much we should try to guess the device types, the ones
> > you have so far are almost all we can get without peeking too much into
> > the devices/queues internals.
> 
> Agree.
> 
> > Also here's the terminology question if we should still consider
> > rotational device status as the main criteria, because then SSD, NVMe,
> > RAM-backed are all non-rotational but with different latency
> > characteristics.
> 
> Right. The expected performance also depends on the interconnect type
> which is undetectable.
> 
> IMO we shouldn't worry about the non-rational's interconnect types
> (M2/PCIe/NVMe/SATA/SAS) even though they have different performances.

Yeah, that's why I'd stay just with the general storage type, not the
type of connection.

> Because some of these interconnects are compatible with each-other 
> medium might change its interconnect during the lifecycle of the data.
> 
> Then left with are the types of mediums- rotational, non-rotational and
> zoned which, we can identify safely.
> 
> Use-cases plan to mix these types of mediums to achieve the 
> cost-per-byte advantage. As of now, I think our target can be to help these
> kind of planned mixing.
> 
> >> +};
> >> +
> >>   struct btrfs_zoned_device_info;
> >>   
> >>   struct btrfs_device {
> >> @@ -101,9 +111,16 @@ struct btrfs_device {
> >>   
> >>   	/* optimal io width for this device */
> >>   	u32 io_width;
> >> -	/* type and info about this device */
> >> +
> >> +	/* Type and info about this device, on-disk (currently unused) */
> >>   	u64 type;
> >>   
> >> +	/*
> >> +	 * Device type (in memory only) at some point, merge to the on-disk
> >> +	 * member 'type' above.
> >> +	 */
> >> +	enum btrfs_dev_types dev_type;
> > 
> > So at some point this is planned to be user configurable? We should be
> > always able to detect the device type at mount type so I wonder if this
> > needs to be stored on disk.
> 
> I didn't find any planned configs (white papers) discussing the
> advantages of mixing non-rotational drives with different interconnect
> types. If any then, we may have to provide the manual configuration.
> (If the mixing is across the medium types like non-rotational with
> either zoned or HDD, then the users don't have to configure as we can
> optimize the allocation automatically for performance.)
> 
> Also, hot cache device or device grouping (when implemented) need the
> user to configure.
> 
> And so maybe part of in-memory 'dev_type' would be in-sync with on-disk
> 'type' at some point. IMO.

Yeah, the tentative plan for such features is to first make it in-memory
only so we can test it without also defining the permanent on-disk
format. From user perspective if there's a reasonable auto-tuning
behaviour without a need for configuration then it should be
implemented. Once we start adding knobs for various storage types and
what should be stored where, it's a beginning of chaos.
Anand Jain Feb. 3, 2022, 12:56 p.m. UTC | #4
On 02/02/2022 01:06, David Sterba wrote:
> On Sun, Jan 30, 2022 at 12:24:15AM +0800, Anand Jain wrote:
>> On 27/01/2022 00:53, David Sterba wrote:
>>> On Tue, Jan 18, 2022 at 11:18:01PM +0800, Anand Jain wrote:
>>>> +/*
>>>> + * Device class types arranged by their IO latency from low to high.
>>>> + */
>>>> +enum btrfs_dev_types {
>>>> +	BTRFS_DEV_TYPE_MEM = 1,
>>>> +	BTRFS_DEV_TYPE_NONROT,
>>>> +	BTRFS_DEV_TYPE_ROT,
>>>> +	BTRFS_DEV_TYPE_ZONED,
>>>
>>> I think this should be separate, in one list define all sensible device
>>> types and then add arrays sorted by latency, or other factors.
>>>
>>> The zoned devices are mostly HDD but ther are also SSD-like using ZNS,
>>> so that can't be both under BTRFS_DEV_TYPE_ZONED and behind
>>> BTRFS_DEV_TYPE_ROT as if it had worse latency.
>>
>> I still do not have a complete idea of its implantation using an array.
>> Do you mean something like as below,
>>
>> enum btrfs_dev_types {
>> 	::
>> };
>>
>> struct btrfs_dev_type {
>> 	enum btrfs_dev_types dev_type;
>> 	int priority_latency;
>> };
> 
> 	enum btrfs_dev_types {
> 		BTRFS_DEV_TYPE_HDD,
> 		BTRFS_DEV_TYPE_SSD,
> 		BTRFS_DEV_TYPE_NVME,
> 		BTRFS_DEV_TYPE_ZONED_HDD,
> 		BTRFS_DEV_TYPE_ZONED_ZNS,
> 	};
> 
> 	enum btrfs_dev_types btrfs_devices_by_latency[] = {
> 		BTRFS_DEV_TYPE_NVME,
> 		BTRFS_DEV_TYPE_SSD,
> 		BTRFS_DEV_TYPE_ZONED_ZNS,
> 		BTRFS_DEV_TYPE_HDD,
> 		BTRFS_DEV_TYPE_ZONED_HDD,
> 	};
> 

  Ah. It is a cleaner way. Thanks.

> 	enum btrfs_dev_types btrfs_devices_by_capacity[] = {
> 		BTRFS_DEV_TYPE_ZONED_HDD,
> 		BTRFS_DEV_TYPE_HDD,
> 		BTRFS_DEV_TYPE_SSD,
> 		BTRFS_DEV_TYPE_ZONED_ZNS,
> 		BTRFS_DEV_TYPE_NVME,
> 	};
> 
 >
> Then if the chunk allocator has a selected policy (here by latency and
> by capacity), it would use the list as additional sorting criteria.

> Optimizing for latency: device 1 (SSD) vs device 2 (NVME), pick NVME
> even if device 1 would be otherwise picked due to the capacity criteria
> (as we have it now).

  Hmm. Above, btrfs_devices_by_capacity[] contains device types
  (not the devices themselves).
  Do you mean btrfs_devices_by_capacity[] is used as the device-type
  priority list, and then the devices are sorted by capacity within each
  type?

>> I think we can identify a ZNS and set its relative latency to a value
>> lower (better) than rotational.
> 
> The device classes are general I don't mean to measure the latecy
> exactly, it's usually typical for the whole class and eg. NVME is
> considered better than SSD and SSD better than HDD.
> 

  Got it.

>>> I'm not sure how much we should try to guess the device types, the ones
>>> you have so far are almost all we can get without peeking too much into
>>> the devices/queues internals.
>>
>> Agree.
>>
>>> Also here's the terminology question if we should still consider
>>> rotational device status as the main criteria, because then SSD, NVMe,
>>> RAM-backed are all non-rotational but with different latency
>>> characteristics.
>>
>> Right. The expected performance also depends on the interconnect type
>> which is undetectable.
>>
>> IMO we shouldn't worry about the non-rational's interconnect types
>> (M2/PCIe/NVMe/SATA/SAS) even though they have different performances.
> 
> Yeah, that's why I'd stay just with the general storage type, not the
> type of connection.
> 

  Hmm. We might have challenges distinguishing between SSD and NVME.
  Both are non-rotational; they differ by interface type. Unless we read
  the class name from
     struct block_device::struct device bd_device::class
  which may be considered a dirty hack.

>> Because some of these interconnects are compatible with each-other
>> medium might change its interconnect during the lifecycle of the data.
>>
>> Then left with are the types of mediums- rotational, non-rotational and
>> zoned which, we can identify safely.
>>
>> Use-cases plan to mix these types of mediums to achieve the
>> cost-per-byte advantage. As of now, I think our target can be to help these
>> kind of planned mixing.
>>
>>>> +};
>>>> +
>>>>    struct btrfs_zoned_device_info;
>>>>    
>>>>    struct btrfs_device {
>>>> @@ -101,9 +111,16 @@ struct btrfs_device {
>>>>    
>>>>    	/* optimal io width for this device */
>>>>    	u32 io_width;
>>>> -	/* type and info about this device */
>>>> +
>>>> +	/* Type and info about this device, on-disk (currently unused) */
>>>>    	u64 type;
>>>>    
>>>> +	/*
>>>> +	 * Device type (in memory only) at some point, merge to the on-disk
>>>> +	 * member 'type' above.
>>>> +	 */
>>>> +	enum btrfs_dev_types dev_type;
>>>
>>> So at some point this is planned to be user configurable? We should be
>>> always able to detect the device type at mount type so I wonder if this
>>> needs to be stored on disk.
>>
>> I didn't find any planned configs (white papers) discussing the
>> advantages of mixing non-rotational drives with different interconnect
>> types. If any then, we may have to provide the manual configuration.
>> (If the mixing is across the medium types like non-rotational with
>> either zoned or HDD, then the users don't have to configure as we can
>> optimize the allocation automatically for performance.)
>>
>> Also, hot cache device or device grouping (when implemented) need the
>> user to configure.
>>
>> And so maybe part of in-memory 'dev_type' would be in-sync with on-disk
>> 'type' at some point. IMO.
> 
> Yeah, the tentative plan for such features is to first make it in-memory
> only so we can test it without also defining the permanent on-disk
> format. From user perspective if there's a reasonable auto-tuning
> behaviour without a need for configuration then it should be
> implemented. Once we start adding knobs for various storage types and
> what should be stored where, it's a beginning of chaos.

Yep. Thanks.

Anand
diff mbox series

Patch

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 71fd99b48283..3731c7d1c6b7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -325,6 +325,7 @@  static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 	device->fs_devices = fs_devices;
+	device->dev_type = btrfs_get_device_type(device);
 
 	ret = btrfs_get_dev_zone_info(device, false);
 	if (ret)
@@ -334,6 +335,7 @@  static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	list_add(&device->dev_list, &fs_devices->devices);
 	fs_devices->num_devices++;
 	fs_devices->open_devices++;
+	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
 	mutex_unlock(&fs_devices->device_list_mutex);
 
 	*device_out = device;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9d50e035e61d..da3d6d0f5bc3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1041,6 +1041,7 @@  static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 			     device->generation > (*latest_dev)->generation)) {
 				*latest_dev = device;
 			}
+			device->dev_type = btrfs_get_device_type(device);
 			continue;
 		}
 
@@ -1084,6 +1085,7 @@  void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
 		__btrfs_free_extra_devids(seed_dev, &latest_dev);
 
 	fs_devices->latest_dev = latest_dev;
+	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
 
 	mutex_unlock(&uuid_mutex);
 }
@@ -2183,6 +2185,9 @@  int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 
 	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
 	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
+
+	cur_devices->mixed_dev_types = btrfs_has_mixed_dev_types(cur_devices);
+
 	mutex_unlock(&fs_devices->device_list_mutex);
 
 	/*
@@ -2584,6 +2589,44 @@  static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
 	return ret;
 }
 
+bool btrfs_has_mixed_dev_types(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *device;
+	int type_rot = 0;
+	int type_nonrot = 0;
+
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+
+		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+			continue;
+
+		switch (device->dev_type) {
+		case BTRFS_DEV_TYPE_ROT:
+			type_rot++;
+			break;
+		case BTRFS_DEV_TYPE_NONROT:
+		default:
+			type_nonrot++;
+		}
+	}
+
+	if (type_rot && type_nonrot)
+		return true;
+	else
+		return false;
+}
+
+enum btrfs_dev_types btrfs_get_device_type(struct btrfs_device *device)
+{
+	if (bdev_is_zoned(device->bdev))
+		return BTRFS_DEV_TYPE_ZONED;
+
+	if (blk_queue_nonrot(bdev_get_queue(device->bdev)))
+		return BTRFS_DEV_TYPE_NONROT;
+
+	return BTRFS_DEV_TYPE_ROT;
+}
+
 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
 {
 	struct btrfs_root *root = fs_info->dev_root;
@@ -2675,6 +2718,7 @@  int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 	device->mode = FMODE_EXCL;
 	device->dev_stats_valid = 1;
+	device->dev_type = btrfs_get_device_type(device);
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 
 	if (seeding_dev) {
@@ -2710,6 +2754,7 @@  int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
 
 	fs_devices->rotating = !blk_queue_nonrot(bdev_get_queue(bdev));
+	fs_devices->mixed_dev_types = btrfs_has_mixed_dev_types(fs_devices);
 
 	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
 	btrfs_set_super_total_bytes(fs_info->super_copy,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6a790b85edd8..5be31161601d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -52,6 +52,16 @@  struct btrfs_io_geometry {
 #define BTRFS_DEV_STATE_FLUSH_SENT	(4)
 #define BTRFS_DEV_STATE_NO_READA	(5)
 
+/*
+ * Device class types arranged by their IO latency from low to high.
+ */
+enum btrfs_dev_types {
+	BTRFS_DEV_TYPE_MEM = 1,
+	BTRFS_DEV_TYPE_NONROT,
+	BTRFS_DEV_TYPE_ROT,
+	BTRFS_DEV_TYPE_ZONED,
+};
+
 struct btrfs_zoned_device_info;
 
 struct btrfs_device {
@@ -101,9 +111,16 @@  struct btrfs_device {
 
 	/* optimal io width for this device */
 	u32 io_width;
-	/* type and info about this device */
+
+	/* Type and info about this device, on-disk (currently unused) */
 	u64 type;
 
+	/*
+	 * Device type (in memory only) at some point, merge to the on-disk
+	 * member 'type' above.
+	 */
+	enum btrfs_dev_types dev_type;
+
 	/* minimal io size for this device */
 	u32 sector_size;
 
@@ -296,6 +313,11 @@  struct btrfs_fs_devices {
 	 */
 	bool rotating;
 
+	/*
+	 * True when devices belong more than one type.
+	 */
+	bool mixed_dev_types;
+
 	struct btrfs_fs_info *fs_info;
 	/* sysfs kobjects */
 	struct kobject fsid_kobj;
@@ -636,5 +658,7 @@  int btrfs_bg_type_to_factor(u64 flags);
 const char *btrfs_bg_type_to_raid_name(u64 flags);
 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+enum btrfs_dev_types btrfs_get_device_type(struct btrfs_device *device);
+bool btrfs_has_mixed_dev_types(struct btrfs_fs_devices *fs_devices);
 
 #endif