
btrfs: add ssd_metadata mode

Message ID 20200401200316.9917-2-kreijack@libero.it (mailing list archive)
State New, archived
Series btrfs: add ssd_metadata mode

Commit Message

Goffredo Baroncelli April 1, 2020, 8:03 p.m. UTC
From: Goffredo Baroncelli <kreijack@inwind.it>

When this mode is enabled, the chunk allocation policy is
modified as follows:
- when a metadata chunk is allocated, priority is given to
non-rotational (ssd) disks;
- when a data chunk is allocated, priority is given to
rotational disks.

When a striped profile is involved (like RAID0/5/6), the logic
is a bit more complex. If there are enough disks, the data profiles
are stored on the rotational disks only and the metadata profiles
on the non-rotational disks only.
If there are not enough disks of one kind, the profile is stored
on all the disks.

Example: assume that sda, sdb and sdc are ssd disks, and sde and sdf
are rotational ones.
A raid5 data profile will be stored on sda, sdb, sdc, sde and sdf
(sde and sdf alone are not enough to host a raid5 profile).
A raid5 metadata profile will be stored on sda, sdb and sdc (these
are enough to host a raid5 profile).

To enable this mode pass -o ssd_metadata at mount time.
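
For example (the device and mount point below are only illustrative):

  mount -o ssd_metadata /dev/sda /mnt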

Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/super.c   |  8 +++++
 fs/btrfs/volumes.c | 89 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h |  1 +
 4 files changed, 97 insertions(+), 2 deletions(-)

Comments

Goffredo Baroncelli April 1, 2020, 8:53 p.m. UTC | #1
On 4/1/20 10:03 PM, Goffredo Baroncelli wrote:
> From: Goffredo Baroncelli <kreijack@inwind.it>
> 
> When this mode is enabled, the chunk allocation policy is
> modified as follows:
> - when a metadata chunk is allocated, priority is given to
> non-rotational (ssd) disks;
> - when a data chunk is allocated, priority is given to
> rotational disks.
> 
> When a striped profile is involved (like RAID0/5/6), the logic
> is a bit more complex. If there are enough disks, the data profiles
> are stored on the rotational disks only and the metadata profiles
> on the non-rotational disks only.
> If there are not enough disks of one kind, the profile is stored
> on all the disks.
> 
> Example: assume that sda, sdb and sdc are ssd disks, and sde and sdf
> are rotational ones.
> A raid5 data profile will be stored on sda, sdb, sdc, sde and sdf
> (sde and sdf alone are not enough to host a raid5 profile).
> A raid5 metadata profile will be stored on sda, sdb and sdc (these
> are enough to host a raid5 profile).

The example is a bit wrong: replace raid5 with raid6. With raid6,
which needs at least three devices, sde and sdf alone are not enough
to host the data profile, while sda, sdb and sdc are enough to host
the metadata profile. I hope the sense is still clear.

> 
> To enable this mode pass -o ssd_metadata at mount time.
> 
> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
> [...]
> @@ -4911,14 +4965,45 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>   		devices_info[ndevs].max_avail = max_avail;
>   		devices_info[ndevs].total_avail = total_avail;
>   		devices_info[ndevs].dev = device;
> +		devices_info[ndevs].rotational = !test_bit(QUEUE_FLAG_NONROT,
> +				&(bdev_get_queue(device->bdev)->queue_flags));
> +		if (devices_info[ndevs].rotational)
> +			nr_rotational++;
>   		++ndevs;
>   	}
>   
> +	BUG_ON(nr_rotational > ndevs);
>   	/*
>   	 * now sort the devices by hole size / available space
>   	 */
> -	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
> -	     btrfs_cmp_device_info, NULL);
> +	if (((type & BTRFS_BLOCK_GROUP_DATA) &&
> +	     (type & BTRFS_BLOCK_GROUP_METADATA)) ||
> +	    !btrfs_test_opt(info, SSD_METADATA)) {
> +		/* mixed bg or SSD_METADATA not set */
> +		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
> +			     btrfs_cmp_device_info, NULL);
> +	} else {
> +		/*
> +		 * if SSD_METADATA is set, sort the device considering also the
> +		 * kind (ssd or not). Limit the availables devices to the ones
> +		 * of the same kind, to avoid that a striped profile like raid5
> +		 * spans to all kind of devices (ssd and rotational).
> +		 * It is allowed to span different kind of devices if the ones of
> +		 * the same kind are not enough alone.
> +		 */
> +		if (type & BTRFS_BLOCK_GROUP_DATA) {
> +			sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
> +				     btrfs_cmp_device_info_data, NULL);
> +			if (nr_rotational > devs_min)

It should be
			if (nr_rotational >= devs_min)
because devs_min rotational disks are already enough to host the profile.

> +				ndevs = nr_rotational;
> +		} else {
> +			int nr_norot = ndevs - nr_rotational;
> +			sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
> +				     btrfs_cmp_device_info_metadata, NULL);
> +			if (nr_norot > devs_min)

It should be
			if (nr_norot >= devs_min)
for the same reason.



> +				ndevs = nr_norot;
> +		}
> +	}
> [...]
Steven Davies April 2, 2020, 9:33 a.m. UTC | #2
On 01/04/2020 21:03, Goffredo Baroncelli wrote:
> From: Goffredo Baroncelli <kreijack@inwind.it>
>
> [...]
>
> To enable this mode pass -o ssd_metadata at mount time.
>
> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>

The idea of this sounds similar to what Anand has been working on with 
the readmirror patchset[1] which was originally designed to prefer 
reading from SSD devices in a RAID1 configuration but has evolved into 
allowing the read policy to be configured through sysfs, at least partly 
because detecting SSDs correctly is not an exact science. Also, there 
may be more considerations than just HDD or SSD: for example in my 
system I use a SATA SSD and an NVMe SSD in RAID1 where the NVMe device 
is twice the speed of the SSD.

I would therefore vote for configurability of this rather than always 
choosing SSD over HDD.

[1] https://patchwork.kernel.org/project/linux-btrfs/list/?series=245121
Goffredo Baroncelli April 2, 2020, 4:39 p.m. UTC | #3
On 4/2/20 11:33 AM, Steven Davies wrote:
> On 01/04/2020 21:03, Goffredo Baroncelli wrote:
>> From: Goffredo Baroncelli <kreijack@inwind.it>
>>
>> [...]
>>
>> To enable this mode pass -o ssd_metadata at mount time.
>>
>> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
> 
> The idea of this sounds similar to what Anand has been working on with the readmirror patchset[1] which was originally designed to prefer reading from SSD devices in a RAID1 configuration but has evolved into allowing the read policy to be configured through sysfs, 

The work done by Anand is very different. He is working on developing a better policy for selecting which mirror to read from.

I am investigating the possibility of storing (reading and *writing*) the metadata on SSDs and the data on rotational disks.

> at least partly because detecting SSDs correctly is not an exact science. Also, there may be more considerations than just HDD or SSD: for example in my system I use a SATA SSD and an NVMe SSD in RAID1 where the NVMe device is twice the speed of the SSD.

There is the "rotational" attribute, which can be set either by the kernel or by appropriate udev rules.
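
For example, the current value can be inspected via sysfs, and a udev
rule can override the kernel's detection (the device name sdb and the
rules file name are only examples):

  $ cat /sys/block/sdb/queue/rotational
  1

and, e.g. in /etc/udev/rules.d/60-rotational.rules:

  ACTION=="add|change", KERNEL=="sdb", ATTR{queue/rotational}="0"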

> 
> I would therefore vote for configurability of this rather than always choosing SSD over HDD.

As stated above, these are two completely different patch sets, which address two different problems.
> 
> [1] https://patchwork.kernel.org/project/linux-btrfs/list/?series=245121
>
Martin Svec April 2, 2020, 6:01 p.m. UTC | #4
Hi Goffredo,

we've been using a similar in-house patch on our backup servers for years and the performance
impact is HUGE. Our backup workload includes rsyncs of mailservers/webservers and image-based
CBT vSphere backups; we have hundreds of millions of files and thousands of daily snapshots on
every backup server. None of this would be possible without dedicated metadata SSDs. A typical
btrfs backup server has two 500 GB NVMe SSDs in btrfs RAID1 and twelve 10 TB SATA drives in
hardware RAID6.

Our chunk allocation logic is fairly simple: if btrfs contains both rotational and non-rotational
drives and there's a metadata chunk allocation request, ignore rotational drives in
__btrfs_alloc_chunk(); in the same way, ignore non-rotational drives when allocating a data chunk.
If the allocation request cannot be satisfied, fall back to the standard __btrfs_alloc_chunk()
implementation which uses all available drives.
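
As a rough sketch of that shape (alloc_chunk_with_fallback(),
do_alloc_chunk() and the filter values below are hypothetical names,
not the actual in-house code):

  enum alloc_dev_filter {
          ALLOC_ANY_DEV,          /* consider every device */
          ALLOC_SKIP_ROTATIONAL,  /* metadata: prefer non-rotational */
          ALLOC_SKIP_NONROT,      /* data: prefer rotational */
  };

  static int alloc_chunk_with_fallback(struct btrfs_trans_handle *trans,
                                       u64 type)
  {
          enum alloc_dev_filter filter;
          int ret;

          /* first pass: restrict the candidate devices by kind */
          if (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
                  filter = ALLOC_SKIP_ROTATIONAL;
          else
                  filter = ALLOC_SKIP_NONROT;

          ret = do_alloc_chunk(trans, type, filter);

          /* second pass: the preferred kind alone was not enough */
          if (ret == -ENOSPC)
                  ret = do_alloc_chunk(trans, type, ALLOC_ANY_DEV);

          return ret;
  }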

Martin

On 1.4.2020 at 22:03, Goffredo Baroncelli wrote:
> From: Goffredo Baroncelli <kreijack@inwind.it>
>
> [...]
Michael April 3, 2020, 8:43 a.m. UTC | #5
02.04.2020 12:33, Steven Davies wrote:
> On 01/04/2020 21:03, Goffredo Baroncelli wrote:
>> From: Goffredo Baroncelli <kreijack@inwind.it>
>>
>> [...]
>>
>> To enable this mode pass -o ssd_metadata at mount time.
>>
>> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
>
> The idea of this sounds similar to what Anand has been working on with 
> the readmirror patchset[1] which was originally designed to prefer 
> reading from SSD devices in a RAID1 configuration but has evolved into 
> allowing the read policy to be configured through sysfs, at least 
> partly because detecting SSDs correctly is not an exact science. Also, 
> there may be more considerations than just HDD or SSD: for example in 
> my system I use a SATA SSD and an NVMe SSD in RAID1 where the NVMe 
> device is twice the speed of the SSD.
Maybe something like -o 
metadata_preferred_devices=device_id[,device_id[,device_id]]... ?
>
> I would therefore vote for configurability of this rather than always 
> choosing SSD over HDD.
>
> [1] https://patchwork.kernel.org/project/linux-btrfs/list/?series=245121
>
Steven Davies April 3, 2020, 10:08 a.m. UTC | #6
On 03/04/2020 09:43, Michael wrote:
> 02.04.2020 12:33, Steven Davies wrote:
>> On 01/04/2020 21:03, Goffredo Baroncelli wrote:
>>> From: Goffredo Baroncelli <kreijack@inwind.it>
>>>
>>> [...]
>>>
>>> To enable this mode pass -o ssd_metadata at mount time.
>>>
>>> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
>>
>> The idea of this sounds similar to what Anand has been working on with 
>> the readmirror patchset[1] which was originally designed to prefer 
>> reading from SSD devices in a RAID1 configuration but has evolved into 
>> allowing the read policy to be configured through sysfs, at least 
>> partly because detecting SSDs correctly is not an exact science. Also, 
>> there may be more considerations than just HDD or SSD: for example in 
>> my system I use a SATA SSD and an NVMe SSD in RAID1 where the NVMe 
>> device is twice the speed of the SSD.
> Maybe something like -o 
> metadata_preferred_devices=device_id[,device_id[,device_id]]... ?

Yes, that's what I was thinking of.
Goffredo Baroncelli April 3, 2020, 4:19 p.m. UTC | #7
On 4/3/20 10:43 AM, Michael wrote:
> 02.04.2020 12:33, Steven Davies wrote:
>> On 01/04/2020 21:03, Goffredo Baroncelli wrote:
>>> From: Goffredo Baroncelli <kreijack@inwind.it>
[...]
>>> To enable this mode pass -o ssd_metadata at mount time.
>>>
>>> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
>>
>> The idea of this sounds similar to what Anand has been working on with the readmirror patchset[1] which was originally designed to prefer reading from SSD devices in a RAID1 configuration but has evolved into allowing the read policy to be configured through sysfs, at least partly because detecting SSDs correctly is not an exact science. Also, there may be more considerations than just HDD or SSD: for example in my system I use a SATA SSD and an NVMe SSD in RAID1 where the NVMe device is twice the speed of the SSD.
> Maybe something like -o metadata_preferred_devices=device_id[,device_id[,device_id]]... ?

I think that it should be a device property instead of a device list passed at mount time.
Looking at the btrfs_dev_item structure (which is embedded in the super block), there are several promising fields:
- seek_speed
- bandwidth
- dev_group

currently these fields are unused. So we are free to use them as we wish. For example, if we use 2 bits of
dev_group as markers for "not for metadata" and "not for data", we can have the following combinations:
- 0 (default) -> the disk is suitable for both data and metadata
- "not for metadata" -> the disk has a high priority for "data"
- "not for data" -> the disk has a high priority for "metadata"
- "not for data" and "not for metadata" -> invalid

As a kernel<->user space interface, I like the idea of exporting these bits via sysfs... unfortunately there is no directory for the devices yet... :-(


>>
>> I would therefore vote for configurability of this rather than always choosing SSD over HDD.
>>
>> [1] https://patchwork.kernel.org/project/linux-btrfs/list/?series=245121
>>
>
Hugo Mills April 3, 2020, 4:28 p.m. UTC | #8
On Fri, Apr 03, 2020 at 06:19:59PM +0200, Goffredo Baroncelli wrote:
> On 4/3/20 10:43 AM, Michael wrote:
> > 02.04.2020 12:33, Steven Davies wrote:
> > > On 01/04/2020 21:03, Goffredo Baroncelli wrote:
> > > > From: Goffredo Baroncelli <kreijack@inwind.it>
> [...]
> > > > To enable this mode pass -o ssd_metadata at mount time.
> > > > 
> > > > Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
> > > 
> > > The idea of this sounds similar to what Anand has been working on with the readmirror patchset[1] which was originally designed to prefer reading from SSD devices in a RAID1 configuration but has evolved into allowing the read policy to be configured through sysfs, at least partly because detecting SSDs correctly is not an exact science. Also, there may be more considerations than just HDD or SSD: for example in my system I use a SATA SSD and an NVMe SSD in RAID1 where the NVMe device is twice the speed of the SSD.
> > Maybe something like -o metadata_preferred_devices=device_id[,device_id[,device_id]]... ?
> 
> I think that it should be a device property instead of a device list passed at mount time.
> Looking at the btrfs_dev_item structure (which is embedded in the super block), there are several promising fields:
> - seek_speed
> - bandwidth
> - dev_group
> 
> currently these fields are unused.

   The first two of these, at least, were left over from the last
attempt at this that got anywhere near code.

   If you're going to do this kind of thing, please read (at least) my
(sadly code-less; sorry) proposal from a few years ago:

https://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg33632.html

   Essentially, allowing for multiple policies for chunk allocation,
of which this case is a small subset.

   I'd envisaged putting the relevant config data into properties, but
since we didn't actually have them in any sense at the time, I got
bogged down in that part of it.

> [...]
Hans van Kranenburg April 3, 2020, 4:36 p.m. UTC | #9
On 4/3/20 6:28 PM, Hugo Mills wrote:
> On Fri, Apr 03, 2020 at 06:19:59PM +0200, Goffredo Baroncelli wrote:
>> On 4/3/20 10:43 AM, Michael wrote:
>>> 02.04.2020 12:33, Steven Davies wrote:
>>>> On 01/04/2020 21:03, Goffredo Baroncelli wrote:
>>>>> From: Goffredo Baroncelli <kreijack@inwind.it>
>> [...]
>>>>> To enable this mode pass -o ssd_metadata at mount time.
>>>>>
>>>>> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
>>>>
>>>> The idea of this sounds similar to what Anand has been working on with the readmirror patchset[1] which was originally designed to prefer reading from SSD devices in a RAID1 configuration but has evolved into allowing the read policy to be configured through sysfs, at least partly because detecting SSDs correctly is not an exact science. Also, there may be more considerations than just HDD or SSD: for example in my system I use a SATA SSD and an NVMe SSD in RAID1 where the NVMe device is twice the speed of the SSD.
>>> Maybe something like -o metadata_preferred_devices=device_id[,device_id[,device_id]]... ?
>>
>> I think that it should be a device property instead of a device list passed at mount time.
>> Looking at the btrfs_dev_item structure (which is embedded in the super block), there are several promising fields:
>> - seek_speed
>> - bandwidth
>> - dev_group
>>
>> currently these fields are unused.
> 
>    The first two of these, at least, were left over from the last
> attempt at this that got anywhere near code.
> 
>    If you're going to do this kind of thing, please read (at least) my
> (sadly code-less; sorry) proposal from a few years ago:
> 
> https://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg33632.html
> 
>    Essentially, allowing for multiple policies for chunk allocation,
> of which this case is a small subset.
> 
>    I'd envisaged putting the relevant config data into properties, but
> since we didn't actually have them in any sense at the time, I got
> bogged down in that part of it.

Ideas about properties:

https://github.com/kdave/drafts/blob/master/btrfs/properties.txt

>> [...]

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2e9f938508e9..0f3c09cc4863 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1187,6 +1187,7 @@  static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
 #define BTRFS_MOUNT_FREE_SPACE_TREE	(1 << 26)
 #define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
 #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
+#define BTRFS_MOUNT_SSD_METADATA	(1 << 29)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(2048)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c6557d44907a..d0a5cf496f90 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -346,6 +346,7 @@  enum {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	Opt_ref_verify,
 #endif
+	Opt_ssd_metadata,
 	Opt_err,
 };
 
@@ -416,6 +417,7 @@  static const match_table_t tokens = {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	{Opt_ref_verify, "ref_verify"},
 #endif
+	{Opt_ssd_metadata, "ssd_metadata"},
 	{Opt_err, NULL},
 };
 
@@ -853,6 +855,10 @@  int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			btrfs_set_opt(info->mount_opt, REF_VERIFY);
 			break;
 #endif
+		case Opt_ssd_metadata:
+			btrfs_set_and_info(info, SSD_METADATA,
+					"enabling ssd_metadata");
+			break;
 		case Opt_err:
 			btrfs_info(info, "unrecognized mount option '%s'", p);
 			ret = -EINVAL;
@@ -1369,6 +1375,8 @@  static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 #endif
 	if (btrfs_test_opt(info, REF_VERIFY))
 		seq_puts(seq, ",ref_verify");
+	if (btrfs_test_opt(info, SSD_METADATA))
+		seq_puts(seq, ",ssd_metadata");
 	seq_printf(seq, ",subvolid=%llu",
 		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
 	seq_puts(seq, ",subvol=");
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a8b71ded4d21..678dc3366711 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4758,6 +4758,58 @@  static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }
 
+/*
+ * sort the devices in descending order by rotational,
+ * max_avail, total_avail
+ */
+static int btrfs_cmp_device_info_metadata(const void *a, const void *b)
+{
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	/* metadata -> non rotational first */
+	if (!di_a->rotational && di_b->rotational)
+		return -1;
+	if (di_a->rotational && !di_b->rotational)
+		return 1;
+	if (di_a->max_avail > di_b->max_avail)
+		return -1;
+	if (di_a->max_avail < di_b->max_avail)
+		return 1;
+	if (di_a->total_avail > di_b->total_avail)
+		return -1;
+	if (di_a->total_avail < di_b->total_avail)
+		return 1;
+	return 0;
+}
+
+/*
+ * sort the devices in descending order by !rotational,
+ * max_avail, total_avail
+ */
+static int btrfs_cmp_device_info_data(const void *a, const void *b)
+{
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	/* data -> non rotational last */
+	if (!di_a->rotational && di_b->rotational)
+		return 1;
+	if (di_a->rotational && !di_b->rotational)
+		return -1;
+	if (di_a->max_avail > di_b->max_avail)
+		return -1;
+	if (di_a->max_avail < di_b->max_avail)
+		return 1;
+	if (di_a->total_avail > di_b->total_avail)
+		return -1;
+	if (di_a->total_avail < di_b->total_avail)
+		return 1;
+	return 0;
+}
+
+
+
 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 {
 	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
@@ -4805,6 +4857,7 @@  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int i;
 	int j;
 	int index;
+	int nr_rotational;
 
 	BUG_ON(!alloc_profile_is_valid(type, 0));
 
@@ -4860,6 +4913,7 @@  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 * about the available holes on each device.
 	 */
 	ndevs = 0;
+	nr_rotational = 0;
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 max_avail;
 		u64 dev_offset;
@@ -4911,14 +4965,45 @@  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
+		devices_info[ndevs].rotational = !test_bit(QUEUE_FLAG_NONROT,
+				&(bdev_get_queue(device->bdev)->queue_flags));
+		if (devices_info[ndevs].rotational)
+			nr_rotational++;
 		++ndevs;
 	}
 
+	BUG_ON(nr_rotational > ndevs);
 	/*
 	 * now sort the devices by hole size / available space
 	 */
-	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
-	     btrfs_cmp_device_info, NULL);
+	if (((type & BTRFS_BLOCK_GROUP_DATA) &&
+	     (type & BTRFS_BLOCK_GROUP_METADATA)) ||
+	    !btrfs_test_opt(info, SSD_METADATA)) {
+		/* mixed bg or SSD_METADATA not set */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+			     btrfs_cmp_device_info, NULL);
+	} else {
+		/*
+		 * if SSD_METADATA is set, sort the device considering also the
+		 * kind (ssd or not). Limit the availables devices to the ones
+		 * of the same kind, to avoid that a striped profile like raid5
+		 * spans to all kind of devices (ssd and rotational).
+		 * It is allowed to span different kind of devices if the ones of
+		 * the same kind are not enough alone.
+		 */
+		if (type & BTRFS_BLOCK_GROUP_DATA) {
+			sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+				     btrfs_cmp_device_info_data, NULL);
+			if (nr_rotational > devs_min)
+				ndevs = nr_rotational;
+		} else {
+			int nr_norot = ndevs - nr_rotational;
+			sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+				     btrfs_cmp_device_info_metadata, NULL);
+			if (nr_norot > devs_min)
+				ndevs = nr_norot;
+		}
+	}
 
 	/*
 	 * Round down to number of usable stripes, devs_increment can be any
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index fc1b564b9cfe..bc1cfa0c27ea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -340,6 +340,7 @@  struct btrfs_device_info {
 	u64 dev_offset;
 	u64 max_avail;
 	u64 total_avail;
+	int rotational:1;
 };
 
 struct btrfs_raid_attr {