diff mbox series

[4/4] btrfs: add preferred_metadata mode

Message ID 20200528183451.16654-5-kreijack@libero.it (mailing list archive)
State New, archived
Headers show
Series [1/4] Add an ioctl to set/retrieve the device properties | expand

Commit Message

Goffredo Baroncelli May 28, 2020, 6:34 p.m. UTC
From: Goffredo Baroncelli <kreijack@inwind.it>

When this mode is enabled, the chunk allocation policy is modified
as follows:
- allocation of metadata chunk: priority is given to preferred_metadata
  disks.
- allocation of data chunk: priority is given to a non preferred_metadata
  disk.

When a striped profile is involved (like RAID0,5,6), the logic
is a bit more complex. If there are enough disks of the matching kind,
the data profiles are stored only on the non preferred_metadata disks,
while the metadata profiles are stored only on the preferred_metadata
disks. If the disks of the matching kind are not enough, then the
profile is allocated on all the disks.

Example: assuming that sda, sdb, sdc are ssd disks flagged as
preferred_metadata, and sde, sdf are non preferred_metadata ones.
A data profile raid6 will be stored on sda, sdb, sdc, sde, sdf (sde
and sdf alone are not enough to host a raid6 profile).
A metadata profile raid6 will be stored on sda, sdb, sdc (these
are enough to host a raid6 profile).

To enable this mode pass -o preferred_metadata at mount time.

Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/super.c   |  8 +++++
 fs/btrfs/volumes.c | 89 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h |  1 +
 4 files changed, 97 insertions(+), 2 deletions(-)

Comments

Hans van Kranenburg May 28, 2020, 10:02 p.m. UTC | #1
Hi,

On 5/28/20 8:34 PM, Goffredo Baroncelli wrote:
> From: Goffredo Baroncelli <kreijack@inwind.it>
> 
> When this mode is enabled,

The commit message does not mention whether this is only a convenience
during development and testing of the feature, to be able to quickly turn
it on/off, or whether you intend to keep it in the final change set.

> the allocation policy of the chunk
> is so modified:
> - allocation of metadata chunk: priority is given to preferred_metadata
>   disks.
> - allocation of data chunk: priority is given to a non preferred_metadata
>   disk.
> 
> When a striped profile is involved (like RAID0,5,6), the logic
> is a bit more complex. If there are enough disks, the data profiles
> are stored on the non preferred_metadata disks; instead the metadata
> profiles are stored on the preferred_metadata disk.
> If the disks are not enough, then the profile is allocated on all
> the disks.
> 
> Example: assuming that sda, sdb, sdc are ssd disks, and sde, sdf are
> non preferred_metadata ones.
> A data profile raid6, will be stored on sda, sdb, sdc, sde, sdf (sde
> and sdf are not enough to host a raid5 profile).
> A metadata profile raid6, will be stored on sda, sdb, sdc (these
> are enough to host a raid6 profile).
> 
> To enable this mode pass -o dedicated_metadata at mount time.

Is it dedicated_metadata or preferred_metadata?

> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
> ---
>  fs/btrfs/ctree.h   |  1 +
>  fs/btrfs/super.c   |  8 +++++
>  fs/btrfs/volumes.c | 89 ++++++++++++++++++++++++++++++++++++++++++++--
>  fs/btrfs/volumes.h |  1 +
>  4 files changed, 97 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 03ea7370aea7..779760fd27b1 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1239,6 +1239,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
>  #define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
>  #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
>  #define BTRFS_MOUNT_DISCARD_ASYNC	(1 << 29)
> +#define BTRFS_MOUNT_PREFERRED_METADATA	(1 << 30)
>  
>  #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
>  #define BTRFS_DEFAULT_MAX_INLINE	(2048)
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 438ecba26557..80700dc9dcf8 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -359,6 +359,7 @@ enum {
>  #ifdef CONFIG_BTRFS_FS_REF_VERIFY
>  	Opt_ref_verify,
>  #endif
> +	Opt_preferred_metadata,
>  	Opt_err,
>  };
>  
> @@ -430,6 +431,7 @@ static const match_table_t tokens = {
>  #ifdef CONFIG_BTRFS_FS_REF_VERIFY
>  	{Opt_ref_verify, "ref_verify"},
>  #endif
> +	{Opt_preferred_metadata, "preferred_metadata"},
>  	{Opt_err, NULL},
>  };
>  
> @@ -881,6 +883,10 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
>  			btrfs_set_opt(info->mount_opt, REF_VERIFY);
>  			break;
>  #endif
> +		case Opt_preferred_metadata:
> +			btrfs_set_and_info(info, PREFERRED_METADATA,
> +					"enabling preferred_metadata");
> +			break;
>  		case Opt_err:
>  			btrfs_err(info, "unrecognized mount option '%s'", p);
>  			ret = -EINVAL;
> @@ -1403,6 +1409,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
>  #endif
>  	if (btrfs_test_opt(info, REF_VERIFY))
>  		seq_puts(seq, ",ref_verify");
> +	if (btrfs_test_opt(info, PREFERRED_METADATA))
> +		seq_puts(seq, ",preferred_metadata");
>  	seq_printf(seq, ",subvolid=%llu",
>  		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
>  	seq_puts(seq, ",subvol=");
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 5265f54c2931..c68efb15e473 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -4770,6 +4770,56 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
>  	return 0;
>  }
>  
> +/*
> + * sort the devices in descending order by preferred_metadata,
> + * max_avail, total_avail
> + */
> +static int btrfs_cmp_device_info_metadata(const void *a, const void *b)
> +{
> +	const struct btrfs_device_info *di_a = a;
> +	const struct btrfs_device_info *di_b = b;
> +
> +	/* metadata -> preferred_metadata first */
> +	if (di_a->preferred_metadata && !di_b->preferred_metadata)
> +		return -1;
> +	if (!di_a->preferred_metadata && di_b->preferred_metadata)
> +		return 1;
> +	if (di_a->max_avail > di_b->max_avail)
> +		return -1;
> +	if (di_a->max_avail < di_b->max_avail)
> +		return 1;
> +	if (di_a->total_avail > di_b->total_avail)
> +		return -1;
> +	if (di_a->total_avail < di_b->total_avail)
> +		return 1;
> +	return 0;
> +}
> +
> +/*
> + * sort the devices in descending order by !preferred_metadata,
> + * max_avail, total_avail
> + */
> +static int btrfs_cmp_device_info_data(const void *a, const void *b)
> +{
> +	const struct btrfs_device_info *di_a = a;
> +	const struct btrfs_device_info *di_b = b;
> +
> +	/* data -> preferred_metadata last */
> +	if (di_a->preferred_metadata && !di_b->preferred_metadata)
> +		return 1;
> +	if (!di_a->preferred_metadata && di_b->preferred_metadata)
> +		return -1;
> +	if (di_a->max_avail > di_b->max_avail)
> +		return -1;
> +	if (di_a->max_avail < di_b->max_avail)
> +		return 1;
> +	if (di_a->total_avail > di_b->total_avail)
> +		return -1;
> +	if (di_a->total_avail < di_b->total_avail)
> +		return 1;
> +	return 0;
> +}
> +
>  static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
>  {
>  	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
> @@ -4885,6 +4935,7 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
>  	int ndevs = 0;
>  	u64 max_avail;
>  	u64 dev_offset;
> +	int nr_preferred_metadata = 0;
>  
>  	/*
>  	 * in the first pass through the devices list, we gather information
> @@ -4937,15 +4988,49 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
>  		devices_info[ndevs].max_avail = max_avail;
>  		devices_info[ndevs].total_avail = total_avail;
>  		devices_info[ndevs].dev = device;
> +		devices_info[ndevs].preferred_metadata = !!(device->type &
> +			BTRFS_DEV_PREFERRED_METADATA);
> +		if (devices_info[ndevs].preferred_metadata)
> +			nr_preferred_metadata++;
>  		++ndevs;
>  	}
>  	ctl->ndevs = ndevs;
>  
> +	BUG_ON(nr_preferred_metadata > ndevs);
>  	/*
>  	 * now sort the devices by hole size / available space
>  	 */
> -	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
> -	     btrfs_cmp_device_info, NULL);
> +	if (((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
> +	     (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) ||
> +	    !btrfs_test_opt(info, PREFERRED_METADATA)) {
> +		/* mixed bg or PREFERRED_METADATA not set */
> +		sort(devices_info, ctl->ndevs, sizeof(struct btrfs_device_info),
> +			     btrfs_cmp_device_info, NULL);
> +	} else {
> +		/*
> +		 * if PREFERRED_METADATA is set, sort the device considering
> +		 * also the kind (preferred_metadata or not). Limit the
> +		 * availables devices to the ones of the same kind, to avoid
> +		 * that a striped profile, like raid5, spreads to all kind of
> +		 * devices.
> +		 * It is allowed to use different kinds of devices if the ones
> +		 * of the same kind are not enough alone.
> +		 */
> +		if (ctl->type & BTRFS_BLOCK_GROUP_DATA) {
> +			int nr_data = ctl->ndevs - nr_preferred_metadata;
> +			sort(devices_info, ctl->ndevs,
> +				     sizeof(struct btrfs_device_info),
> +				     btrfs_cmp_device_info_data, NULL);
> +			if (nr_data >= ctl->devs_min)
> +				ctl->ndevs = nr_data;
> +		} else { /* non data -> metadata and system */
> +			sort(devices_info, ctl->ndevs,
> +				     sizeof(struct btrfs_device_info),
> +				     btrfs_cmp_device_info_metadata, NULL);
> +			if (nr_preferred_metadata >= ctl->devs_min)
> +				ctl->ndevs = nr_preferred_metadata;
> +		}
> +	}
>  
>  	return 0;
>  }
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 0ac5bf2b95e6..d39c3b0e7569 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -347,6 +347,7 @@ struct btrfs_device_info {
>  	u64 dev_offset;
>  	u64 max_avail;
>  	u64 total_avail;
> +	int preferred_metadata:1;
>  };
>  
>  struct btrfs_raid_attr {
>
Goffredo Baroncelli May 29, 2020, 4:26 p.m. UTC | #2
On 5/29/20 12:02 AM, Hans van Kranenburg wrote:
> Hi,
> 
> On 5/28/20 8:34 PM, Goffredo Baroncelli wrote:
>> From: Goffredo Baroncelli <kreijack@inwind.it>
>>
>> When this mode is enabled,
> 
> The commit message does not mention if this is either only a convenience
> during development and testing of the feature to be able to quickly turn
> it on/off, or if you intend to have this into the final change set.

Good question. IMHO, for the initial development phase it is useful to have
preferred_metadata as an opt-in flag. Later we could reverse the logic and
default to preferred_metadata. Of course, we would then have a
no-preferred_metadata flag (opt-out).
> 
>> the allocation policy of the chunk
>> is so modified:
>> - allocation of metadata chunk: priority is given to preferred_metadata
>>    disks.
>> - allocation of data chunk: priority is given to a non preferred_metadata
>>    disk.
>>
>> When a striped profile is involved (like RAID0,5,6), the logic
>> is a bit more complex. If there are enough disks, the data profiles
>> are stored on the non preferred_metadata disks; instead the metadata
>> profiles are stored on the preferred_metadata disk.
>> If the disks are not enough, then the profile is allocated on all
>> the disks.
>>
>> Example: assuming that sda, sdb, sdc are ssd disks, and sde, sdf are
>> non preferred_metadata ones.
>> A data profile raid6, will be stored on sda, sdb, sdc, sde, sdf (sde
>> and sdf are not enough to host a raid5 profile).
>> A metadata profile raid6, will be stored on sda, sdb, sdc (these
>> are enough to host a raid6 profile).
>>
>> To enable this mode pass -o dedicated_metadata at mount time.
> 
> Is it dedicated_metadata or preferred_metadata?

It was a copy&paste error. It should be preferred_metadata.
> 
>> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
>> ---
>>   fs/btrfs/ctree.h   |  1 +
>>   fs/btrfs/super.c   |  8 +++++
>>   fs/btrfs/volumes.c | 89 ++++++++++++++++++++++++++++++++++++++++++++--
>>   fs/btrfs/volumes.h |  1 +
>>   4 files changed, 97 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 03ea7370aea7..779760fd27b1 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -1239,6 +1239,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
>>   #define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
>>   #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
>>   #define BTRFS_MOUNT_DISCARD_ASYNC	(1 << 29)
>> +#define BTRFS_MOUNT_PREFERRED_METADATA	(1 << 30)
>>   
>>   #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
>>   #define BTRFS_DEFAULT_MAX_INLINE	(2048)
>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>> index 438ecba26557..80700dc9dcf8 100644
>> --- a/fs/btrfs/super.c
>> +++ b/fs/btrfs/super.c
>> @@ -359,6 +359,7 @@ enum {
>>   #ifdef CONFIG_BTRFS_FS_REF_VERIFY
>>   	Opt_ref_verify,
>>   #endif
>> +	Opt_preferred_metadata,
>>   	Opt_err,
>>   };
>>   
>> @@ -430,6 +431,7 @@ static const match_table_t tokens = {
>>   #ifdef CONFIG_BTRFS_FS_REF_VERIFY
>>   	{Opt_ref_verify, "ref_verify"},
>>   #endif
>> +	{Opt_preferred_metadata, "preferred_metadata"},
>>   	{Opt_err, NULL},
>>   };
>>   
>> @@ -881,6 +883,10 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
>>   			btrfs_set_opt(info->mount_opt, REF_VERIFY);
>>   			break;
>>   #endif
>> +		case Opt_preferred_metadata:
>> +			btrfs_set_and_info(info, PREFERRED_METADATA,
>> +					"enabling preferred_metadata");
>> +			break;
>>   		case Opt_err:
>>   			btrfs_err(info, "unrecognized mount option '%s'", p);
>>   			ret = -EINVAL;
>> @@ -1403,6 +1409,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
>>   #endif
>>   	if (btrfs_test_opt(info, REF_VERIFY))
>>   		seq_puts(seq, ",ref_verify");
>> +	if (btrfs_test_opt(info, PREFERRED_METADATA))
>> +		seq_puts(seq, ",preferred_metadata");
>>   	seq_printf(seq, ",subvolid=%llu",
>>   		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
>>   	seq_puts(seq, ",subvol=");
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 5265f54c2931..c68efb15e473 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -4770,6 +4770,56 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
>>   	return 0;
>>   }
>>   
>> +/*
>> + * sort the devices in descending order by preferred_metadata,
>> + * max_avail, total_avail
>> + */
>> +static int btrfs_cmp_device_info_metadata(const void *a, const void *b)
>> +{
>> +	const struct btrfs_device_info *di_a = a;
>> +	const struct btrfs_device_info *di_b = b;
>> +
>> +	/* metadata -> preferred_metadata first */
>> +	if (di_a->preferred_metadata && !di_b->preferred_metadata)
>> +		return -1;
>> +	if (!di_a->preferred_metadata && di_b->preferred_metadata)
>> +		return 1;
>> +	if (di_a->max_avail > di_b->max_avail)
>> +		return -1;
>> +	if (di_a->max_avail < di_b->max_avail)
>> +		return 1;
>> +	if (di_a->total_avail > di_b->total_avail)
>> +		return -1;
>> +	if (di_a->total_avail < di_b->total_avail)
>> +		return 1;
>> +	return 0;
>> +}
>> +
>> +/*
>> + * sort the devices in descending order by !preferred_metadata,
>> + * max_avail, total_avail
>> + */
>> +static int btrfs_cmp_device_info_data(const void *a, const void *b)
>> +{
>> +	const struct btrfs_device_info *di_a = a;
>> +	const struct btrfs_device_info *di_b = b;
>> +
>> +	/* data -> preferred_metadata last */
>> +	if (di_a->preferred_metadata && !di_b->preferred_metadata)
>> +		return 1;
>> +	if (!di_a->preferred_metadata && di_b->preferred_metadata)
>> +		return -1;
>> +	if (di_a->max_avail > di_b->max_avail)
>> +		return -1;
>> +	if (di_a->max_avail < di_b->max_avail)
>> +		return 1;
>> +	if (di_a->total_avail > di_b->total_avail)
>> +		return -1;
>> +	if (di_a->total_avail < di_b->total_avail)
>> +		return 1;
>> +	return 0;
>> +}
>> +
>>   static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
>>   {
>>   	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
>> @@ -4885,6 +4935,7 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
>>   	int ndevs = 0;
>>   	u64 max_avail;
>>   	u64 dev_offset;
>> +	int nr_preferred_metadata = 0;
>>   
>>   	/*
>>   	 * in the first pass through the devices list, we gather information
>> @@ -4937,15 +4988,49 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
>>   		devices_info[ndevs].max_avail = max_avail;
>>   		devices_info[ndevs].total_avail = total_avail;
>>   		devices_info[ndevs].dev = device;
>> +		devices_info[ndevs].preferred_metadata = !!(device->type &
>> +			BTRFS_DEV_PREFERRED_METADATA);
>> +		if (devices_info[ndevs].preferred_metadata)
>> +			nr_preferred_metadata++;
>>   		++ndevs;
>>   	}
>>   	ctl->ndevs = ndevs;
>>   
>> +	BUG_ON(nr_preferred_metadata > ndevs);
>>   	/*
>>   	 * now sort the devices by hole size / available space
>>   	 */
>> -	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
>> -	     btrfs_cmp_device_info, NULL);
>> +	if (((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
>> +	     (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) ||
>> +	    !btrfs_test_opt(info, PREFERRED_METADATA)) {
>> +		/* mixed bg or PREFERRED_METADATA not set */
>> +		sort(devices_info, ctl->ndevs, sizeof(struct btrfs_device_info),
>> +			     btrfs_cmp_device_info, NULL);
>> +	} else {
>> +		/*
>> +		 * if PREFERRED_METADATA is set, sort the device considering
>> +		 * also the kind (preferred_metadata or not). Limit the
>> +		 * availables devices to the ones of the same kind, to avoid
>> +		 * that a striped profile, like raid5, spreads to all kind of
>> +		 * devices.
>> +		 * It is allowed to use different kinds of devices if the ones
>> +		 * of the same kind are not enough alone.
>> +		 */
>> +		if (ctl->type & BTRFS_BLOCK_GROUP_DATA) {
>> +			int nr_data = ctl->ndevs - nr_preferred_metadata;
>> +			sort(devices_info, ctl->ndevs,
>> +				     sizeof(struct btrfs_device_info),
>> +				     btrfs_cmp_device_info_data, NULL);
>> +			if (nr_data >= ctl->devs_min)
>> +				ctl->ndevs = nr_data;
>> +		} else { /* non data -> metadata and system */
>> +			sort(devices_info, ctl->ndevs,
>> +				     sizeof(struct btrfs_device_info),
>> +				     btrfs_cmp_device_info_metadata, NULL);
>> +			if (nr_preferred_metadata >= ctl->devs_min)
>> +				ctl->ndevs = nr_preferred_metadata;
>> +		}
>> +	}
>>   
>>   	return 0;
>>   }
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 0ac5bf2b95e6..d39c3b0e7569 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -347,6 +347,7 @@ struct btrfs_device_info {
>>   	u64 dev_offset;
>>   	u64 max_avail;
>>   	u64 total_avail;
>> +	int preferred_metadata:1;
>>   };
>>   
>>   struct btrfs_raid_attr {
>>
>
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03ea7370aea7..779760fd27b1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1239,6 +1239,7 @@  static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
 #define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
 #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
 #define BTRFS_MOUNT_DISCARD_ASYNC	(1 << 29)
+#define BTRFS_MOUNT_PREFERRED_METADATA	(1 << 30)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(2048)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 438ecba26557..80700dc9dcf8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -359,6 +359,7 @@  enum {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	Opt_ref_verify,
 #endif
+	Opt_preferred_metadata,
 	Opt_err,
 };
 
@@ -430,6 +431,7 @@  static const match_table_t tokens = {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	{Opt_ref_verify, "ref_verify"},
 #endif
+	{Opt_preferred_metadata, "preferred_metadata"},
 	{Opt_err, NULL},
 };
 
@@ -881,6 +883,10 @@  int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			btrfs_set_opt(info->mount_opt, REF_VERIFY);
 			break;
 #endif
+		case Opt_preferred_metadata:
+			btrfs_set_and_info(info, PREFERRED_METADATA,
+					"enabling preferred_metadata");
+			break;
 		case Opt_err:
 			btrfs_err(info, "unrecognized mount option '%s'", p);
 			ret = -EINVAL;
@@ -1403,6 +1409,8 @@  static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 #endif
 	if (btrfs_test_opt(info, REF_VERIFY))
 		seq_puts(seq, ",ref_verify");
+	if (btrfs_test_opt(info, PREFERRED_METADATA))
+		seq_puts(seq, ",preferred_metadata");
 	seq_printf(seq, ",subvolid=%llu",
 		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
 	seq_puts(seq, ",subvol=");
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5265f54c2931..c68efb15e473 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4770,6 +4770,56 @@  static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }
 
+/*
+ * sort the devices in descending order by preferred_metadata,
+ * max_avail, total_avail
+ */
+static int btrfs_cmp_device_info_metadata(const void *a, const void *b)
+{
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	/* metadata -> preferred_metadata first */
+	if (di_a->preferred_metadata && !di_b->preferred_metadata)
+		return -1;
+	if (!di_a->preferred_metadata && di_b->preferred_metadata)
+		return 1;
+	if (di_a->max_avail > di_b->max_avail)
+		return -1;
+	if (di_a->max_avail < di_b->max_avail)
+		return 1;
+	if (di_a->total_avail > di_b->total_avail)
+		return -1;
+	if (di_a->total_avail < di_b->total_avail)
+		return 1;
+	return 0;
+}
+
+/*
+ * sort the devices in descending order by !preferred_metadata,
+ * max_avail, total_avail
+ */
+static int btrfs_cmp_device_info_data(const void *a, const void *b)
+{
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	/* data -> preferred_metadata last */
+	if (di_a->preferred_metadata && !di_b->preferred_metadata)
+		return 1;
+	if (!di_a->preferred_metadata && di_b->preferred_metadata)
+		return -1;
+	if (di_a->max_avail > di_b->max_avail)
+		return -1;
+	if (di_a->max_avail < di_b->max_avail)
+		return 1;
+	if (di_a->total_avail > di_b->total_avail)
+		return -1;
+	if (di_a->total_avail < di_b->total_avail)
+		return 1;
+	return 0;
+}
+
 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 {
 	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
@@ -4885,6 +4935,7 @@  static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 	int ndevs = 0;
 	u64 max_avail;
 	u64 dev_offset;
+	int nr_preferred_metadata = 0;
 
 	/*
 	 * in the first pass through the devices list, we gather information
@@ -4937,15 +4988,49 @@  static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
+		devices_info[ndevs].preferred_metadata = !!(device->type &
+			BTRFS_DEV_PREFERRED_METADATA);
+		if (devices_info[ndevs].preferred_metadata)
+			nr_preferred_metadata++;
 		++ndevs;
 	}
 	ctl->ndevs = ndevs;
 
+	BUG_ON(nr_preferred_metadata > ndevs);
 	/*
 	 * now sort the devices by hole size / available space
 	 */
-	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
-	     btrfs_cmp_device_info, NULL);
+	if (((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
+	     (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) ||
+	    !btrfs_test_opt(info, PREFERRED_METADATA)) {
+		/* mixed bg or PREFERRED_METADATA not set */
+		sort(devices_info, ctl->ndevs, sizeof(struct btrfs_device_info),
+			     btrfs_cmp_device_info, NULL);
+	} else {
+		/*
+		 * if PREFERRED_METADATA is set, sort the device considering
+		 * also the kind (preferred_metadata or not). Limit the
+		 * availables devices to the ones of the same kind, to avoid
+		 * that a striped profile, like raid5, spreads to all kind of
+		 * devices.
+		 * It is allowed to use different kinds of devices if the ones
+		 * of the same kind are not enough alone.
+		 */
+		if (ctl->type & BTRFS_BLOCK_GROUP_DATA) {
+			int nr_data = ctl->ndevs - nr_preferred_metadata;
+			sort(devices_info, ctl->ndevs,
+				     sizeof(struct btrfs_device_info),
+				     btrfs_cmp_device_info_data, NULL);
+			if (nr_data >= ctl->devs_min)
+				ctl->ndevs = nr_data;
+		} else { /* non data -> metadata and system */
+			sort(devices_info, ctl->ndevs,
+				     sizeof(struct btrfs_device_info),
+				     btrfs_cmp_device_info_metadata, NULL);
+			if (nr_preferred_metadata >= ctl->devs_min)
+				ctl->ndevs = nr_preferred_metadata;
+		}
+	}
 
 	return 0;
 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 0ac5bf2b95e6..d39c3b0e7569 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -347,6 +347,7 @@  struct btrfs_device_info {
 	u64 dev_offset;
 	u64 max_avail;
 	u64 total_avail;
+	int preferred_metadata:1;
 };
 
 struct btrfs_raid_attr {