@@ -574,6 +574,20 @@ enum btrfs_exclusive_operation {
BTRFS_EXCLOP_SWAP_ACTIVATE,
};
+/*
+ * tier policy for btrfs data/metadata
+ *
+ * Selects how the tier_score of each device influences which devices a
+ * new chunk is allocated from. Devices with the highest tier_score form
+ * the "top tier"; all remaining devices are the "other tier(s)".
+ *
+ * FIXME: per-subvol tier policy for full tier support.
+ * FIXME: per-subvol profile(RAID) is needed for full tier support too.
+ */
+enum btrfs_tier_policy
+{
+ NOT_TIERING, /* tier_score is not used when ordering the devices */
+ TOP_TIER_ONLY, /* TO: metadata/system use only top-tier devices */
+ TOP_TIER_FIRSTLY, /* TF: metadata/system use top-tier devices first */
+ OTHER_TIER_FIRSTLY, /* OF: data uses other-tier devices first */
+ OTHER_TIER_ONLY, /* OO: data uses only other-tier devices */
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -831,6 +845,9 @@ struct btrfs_fs_info {
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
+ enum btrfs_tier_policy data_tier_policy;
+ enum btrfs_tier_policy metadata_tier_policy;
+
/* restriper state */
spinlock_t balance_lock;
struct mutex balance_mutex;
@@ -375,6 +375,7 @@ enum {
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
#endif
+ Opt_tier, Opt_tier_policy,
Opt_err,
};
@@ -449,6 +450,8 @@ static const match_table_t tokens = {
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
{Opt_ref_verify, "ref_verify"},
#endif
+ {Opt_tier, "tier"},
+ {Opt_tier_policy, "tier=%s"},
{Opt_err, NULL},
};
@@ -501,6 +504,40 @@ out:
return ret;
}
+/*
+ * Two-letter names of the tier policies, indexed by enum btrfs_tier_policy.
+ * They are used to build the "tier=<data>/<metadata>" text printed at mount
+ * time and reported in the mount options.
+ */
+static const char * const btrfs_tier_policy_names[] = {
+	[NOT_TIERING]		= "NO",
+	[TOP_TIER_ONLY]		= "TO",
+	/* must be "TF": "OF" belongs to OTHER_TIER_FIRSTLY below */
+	[TOP_TIER_FIRSTLY]	= "TF",
+	[OTHER_TIER_FIRSTLY]	= "OF",
+	[OTHER_TIER_ONLY]	= "OO",
+};
+
+/*
+ * One accepted value of the "tier=" mount option together with the pair
+ * of policies that value selects.
+ */
+struct btrfs_tier_option {
+	const char *name;
+	enum btrfs_tier_policy data_tier_policy;
+	enum btrfs_tier_policy metadata_tier_policy;
+};
+
+/* All values accepted after "tier="; "auto" selects the same pair as "OF/TF". */
+static const struct btrfs_tier_option btrfs_tier_options[] = {
+	{
+		.name			= "off",
+		.data_tier_policy	= NOT_TIERING,
+		.metadata_tier_policy	= NOT_TIERING,
+	},
+	{
+		.name			= "auto",
+		.data_tier_policy	= OTHER_TIER_FIRSTLY,
+		.metadata_tier_policy	= TOP_TIER_FIRSTLY,
+	},
+	{
+		.name			= "OF/TF",
+		.data_tier_policy	= OTHER_TIER_FIRSTLY,
+		.metadata_tier_policy	= TOP_TIER_FIRSTLY,
+	},
+	{
+		.name			= "OO/TF",
+		.data_tier_policy	= OTHER_TIER_ONLY,
+		.metadata_tier_policy	= TOP_TIER_FIRSTLY,
+	},
+};
+
+/*
+ * Parse the value of a "tier=<value>" mount option.
+ *
+ * @info:   filesystem whose data/metadata tier policies are updated
+ * @option: NUL-terminated text that follows "tier="
+ *
+ * @option is compared (exact, case-sensitive) against the names in
+ * btrfs_tier_options[].
+ *
+ * Return: 0 when a name matched and both policies were stored in @info,
+ * -EINVAL when no name matched (in which case @info is left unchanged).
+ */
+static int parse_tier_options(struct btrfs_fs_info *info, const char *option)
+{
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(btrfs_tier_options); i++) {
+		const struct btrfs_tier_option *opt = &btrfs_tier_options[i];
+
+		if (strcmp(option, opt->name) != 0)
+			continue;
+
+		info->data_tier_policy = opt->data_tier_policy;
+		info->metadata_tier_policy = opt->metadata_tier_policy;
+		return 0;
+	}
+
+	/* Tell the admin why the mount failed instead of a bare -EINVAL. */
+	btrfs_err(info, "unrecognized tier value '%s'", option);
+	return -EINVAL;
+}
+
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
@@ -527,6 +564,10 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
else if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ /* default tier=auto */
+ info->data_tier_policy = OTHER_TIER_FIRSTLY;
+ info->metadata_tier_policy = TOP_TIER_FIRSTLY;
+
/*
* Even the options are empty, we still need to do extra check
* against new flags
@@ -959,6 +1000,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, REF_VERIFY);
break;
#endif
+ case Opt_tier:
+ info->data_tier_policy = OTHER_TIER_FIRSTLY;
+ info->metadata_tier_policy = TOP_TIER_FIRSTLY;
+ break;
+ case Opt_tier_policy:
+ ret = parse_tier_options(info, args[0].from);
+ if (ret < 0)
+ goto out;
+ break;
case Opt_err:
btrfs_err(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
@@ -988,6 +1038,18 @@ out:
btrfs_info(info, "disk space caching is enabled");
if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
btrfs_info(info, "using free space tree");
+ if (!ret){
+ if(info->data_tier_policy == NOT_TIERING &&
+ info->metadata_tier_policy == NOT_TIERING)
+ btrfs_info(info, "disabled tiering(tier=off)");
+ else if(info->data_tier_policy == OTHER_TIER_FIRSTLY &&
+ info->metadata_tier_policy == TOP_TIER_FIRSTLY)
+ btrfs_info(info, "enabling tiering(tier=auto)");
+ else
+ btrfs_info(info, "enabling tiering(tier=%s/%s)",
+ btrfs_tier_policy_names[info->data_tier_policy],
+ btrfs_tier_policy_names[info->metadata_tier_policy]);
+ }
return ret;
}
@@ -1472,6 +1534,16 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
+ if(info->data_tier_policy == NOT_TIERING &&
+ info->metadata_tier_policy == NOT_TIERING)
+ seq_puts(seq, ",tier=off");
+ else if(info->data_tier_policy == OTHER_TIER_FIRSTLY &&
+ info->metadata_tier_policy == TOP_TIER_FIRSTLY)
+ seq_puts(seq, ",tier"); /* or ",tier=auto"? */
+ else
+ seq_printf(seq, ",tier=%s/%s",
+ btrfs_tier_policy_names[info->data_tier_policy],
+ btrfs_tier_policy_names[info->metadata_tier_policy]);
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
subvol_name = btrfs_get_subvol_name_from_objectid(info,
@@ -4816,6 +4816,44 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
+/*
+ * sort() comparator for metadata/system chunks: the device with the higher
+ * tier_score is placed first; devices with an equal tier_score are ordered
+ * by btrfs_cmp_device_info().
+ */
+static int btrfs_cmp_device_info_metadata(const void *a, const void *b)
+{
+	const struct btrfs_device_info *lhs = a;
+	const struct btrfs_device_info *rhs = b;
+
+	if (lhs->dev->tier_score != rhs->dev->tier_score)
+		return lhs->dev->tier_score > rhs->dev->tier_score ? -1 : 1;
+
+	/* same tier: fall back to the regular ordering */
+	return btrfs_cmp_device_info(a, b);
+}
+
+/*
+ * sort() comparator for data chunks: the device with the lower tier_score
+ * is placed first; devices with an equal tier_score are ordered by
+ * btrfs_cmp_device_info().
+ */
+static int btrfs_cmp_device_info_data(const void *a, const void *b)
+{
+	const struct btrfs_device_info *lhs = a;
+	const struct btrfs_device_info *rhs = b;
+
+	if (lhs->dev->tier_score != rhs->dev->tier_score)
+		return lhs->dev->tier_score < rhs->dev->tier_score ? -1 : 1;
+
+	/* same tier: fall back to the regular ordering */
+	return btrfs_cmp_device_info(a, b);
+}
+
+
+
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
@@ -4931,6 +4969,8 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
int ndevs = 0;
u64 max_avail;
u64 dev_offset;
+ int top_tier_score = 0;
+ int nr_top_tier = 0;
/*
* in the first pass through the devices list, we gather information
@@ -4983,15 +5023,51 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
devices_info[ndevs].max_avail = max_avail;
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
+ if (devices_info[ndevs].dev->tier_score > top_tier_score) {
+ top_tier_score = devices_info[ndevs].dev->tier_score;
+ nr_top_tier = 1;
+ } else if (devices_info[ndevs].dev->tier_score == top_tier_score) {
+ nr_top_tier++;
+ }
++ndevs;
}
ctl->ndevs = ndevs;
+ BUG_ON(nr_top_tier > ndevs);
/*
* now sort the devices by hole size / available space
*/
- sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
- btrfs_cmp_device_info, NULL);
+	if (nr_top_tier == ndevs ||
+	    ((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
+	     info->data_tier_policy == NOT_TIERING) ||
+	    (!(ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
+	     info->metadata_tier_policy == NOT_TIERING) ||
+	    ((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
+	     (ctl->type & BTRFS_BLOCK_GROUP_METADATA))) {
+		/* 1 tier only; NOT_TIERING; mixed bg */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+		     btrfs_cmp_device_info, NULL);
+	} else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) {
+		int nr_other_tier = ndevs - nr_top_tier;
+
+		/*
+		 * When tiering, sort the devices by tier_score as well and
+		 * limit the usable devices to the ones of the same tier, so
+		 * that a striped profile like raid5 does not span every kind
+		 * of device. Spanning different tiers is still allowed when
+		 * one tier alone does not provide enough devices, unless the
+		 * policy is an "only" one.
+		 *
+		 * ctl->ndevs was already stored above, so the limit must be
+		 * written to ctl->ndevs; changing the local ndevs here would
+		 * have no effect.
+		 */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+		     btrfs_cmp_device_info_data, NULL);
+		if (nr_other_tier >= ctl->devs_min ||
+		    info->data_tier_policy == OTHER_TIER_ONLY)
+			ctl->ndevs = nr_other_tier;
+	} else {
+		/*
+		 * non data -> metadata and system: same restriction as for
+		 * data, but towards the top tier. The limit is written to
+		 * ctl->ndevs because the local ndevs is not read again.
+		 */
+		sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+		     btrfs_cmp_device_info_metadata, NULL);
+		if (nr_top_tier >= ctl->devs_min ||
+		    info->metadata_tier_policy == TOP_TIER_ONLY)
+			ctl->ndevs = nr_top_tier;
+	}
return 0;
}
This is based on the patch 'btrfs: add ssd_metadata mode' from Goffredo Baroncelli <kreijack@libero.it>. In most cases, only 1 or 2 tiers are used at the same time, so we group them into the top tier and the other tier(s). We define a mount option to tier data/metadata onto slower/faster device(s). When there is only 1 tier, tiering is automatically disabled. mount option: tier[={off|auto|data_tier_X/metadata_tier_Y}] default is 'tier[=auto]'. 'tier' is the same as 'tier=auto' and 'tier=OF/TF'. The policies to use the device(s): Top-tier-Only(TO) : metadata only uses top-tier devices. Top-tier-Firstly(TF) : metadata uses top-tier devices first. Other-tier-Firstly(OF) : data uses other-tier devices first. Other-tier-Only(OO) : data only uses other-tier devices. data_tier_X is the policy for data; it supports OF and OO. metadata_tier_Y is the policy for metadata and system; it supports TF. Signed-off-by: wangyugui <wangyugui@e16-tech.com> --- fs/btrfs/ctree.h | 17 ++++++++++ fs/btrfs/super.c | 72 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 80 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 167 insertions(+), 2 deletions(-)