diff mbox series

btrfs-progs: add --subvol option to mkfs.btrfs

Message ID 20240627095455.315620-1-maharmstone@fb.com (mailing list archive)
State New
Headers show
Series btrfs-progs: add --subvol option to mkfs.btrfs | expand

Commit Message

Mark Harmstone June 27, 2024, 9:54 a.m. UTC
From: Mark Harmstone <maharmstone@meta.com>

This patch adds a --subvol option, which tells mkfs.btrfs to create the
specified directories as subvolumes.

Given a populated directory img, the command

$ mkfs.btrfs --rootdir img --subvol usr --subvol home --subvol home/username /dev/loop0

will create subvolumes usr and home within the FS root, and subvolume
username within the home subvolume. It will fail if any of the
directories do not yet exist.

Signed-off-by: Mark Harmstone <maharmstone@meta.com>
---
 convert/main.c                              |   4 +-
 kernel-shared/ctree.h                       |   3 +-
 kernel-shared/inode.c                       |  46 +--
 mkfs/main.c                                 | 357 +++++++++++++++++++-
 mkfs/rootdir.c                              |  31 +-
 mkfs/rootdir.h                              |  16 +-
 tests/mkfs-tests/034-rootdir-subvol/test.sh |  33 ++
 7 files changed, 463 insertions(+), 27 deletions(-)
 create mode 100755 tests/mkfs-tests/034-rootdir-subvol/test.sh

Comments

Qu Wenruo June 28, 2024, 4:12 a.m. UTC | #1
在 2024/6/27 19:24, Mark Harmstone 写道:
> From: Mark Harmstone <maharmstone@meta.com>
>
> This patch adds a --subvol option, which tells mkfs.btrfs to create the
> specified directories as subvolumes.

I have considered this feature in the past, but I do not have a good
enough UI for that.

>
> Given a populated directory img, the command
>
> $ mkfs.btrfs --rootdir img --subvol usr --subvol home --subvol home/username /dev/loop0

Initially I thought the UI could be a little confusing, but I have no
better alternative, and it is flexible enough to handle all the cases I
can think of.

So a nice solution.

>
> will create subvolumes usr and home within the FS root, and subvolume
> username within the home subvolume. It will fail if any of the
> directories do not yet exist.
>
> Signed-off-by: Mark Harmstone <maharmstone@meta.com>
> ---
>   convert/main.c                              |   4 +-
>   kernel-shared/ctree.h                       |   3 +-
>   kernel-shared/inode.c                       |  46 +--
>   mkfs/main.c                                 | 357 +++++++++++++++++++-
>   mkfs/rootdir.c                              |  31 +-
>   mkfs/rootdir.h                              |  16 +-
>   tests/mkfs-tests/034-rootdir-subvol/test.sh |  33 ++
>   7 files changed, 463 insertions(+), 27 deletions(-)
>   create mode 100755 tests/mkfs-tests/034-rootdir-subvol/test.sh
>
> diff --git a/convert/main.c b/convert/main.c
> index 8e73aa25..7249c793 100644
> --- a/convert/main.c
> +++ b/convert/main.c
> @@ -1314,7 +1314,9 @@ static int do_convert(const char *devname, u32 convert_flags, u32 nodesize,
>   	}
>
>   	image_root = btrfs_mksubvol(root, subvol_name,
> -				    CONV_IMAGE_SUBVOL_OBJECTID, true);
> +				    CONV_IMAGE_SUBVOL_OBJECTID, true,
> +				    btrfs_root_dirid(&root->root_item),
> +				    false);
>   	if (!image_root) {
>   		error("unable to link subvolume %s", subvol_name);
>   		goto fail;
> diff --git a/kernel-shared/ctree.h b/kernel-shared/ctree.h
> index 1341a418..8a5ddcdb 100644
> --- a/kernel-shared/ctree.h
> +++ b/kernel-shared/ctree.h
> @@ -1230,7 +1230,8 @@ int btrfs_add_orphan_item(struct btrfs_trans_handle *trans,
>   int btrfs_mkdir(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>   		char *name, int namelen, u64 parent_ino, u64 *ino, int mode);
>   struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root, const char *base,
> -				  u64 root_objectid, bool convert);
> +				  u64 root_objectid, bool convert, u64 dirid,
> +				  bool dont_change_size);
>   int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
>   			     struct btrfs_root *fs_root,
>   			     u64 dirid, u64 *objectid);
> diff --git a/kernel-shared/inode.c b/kernel-shared/inode.c
> index 91b4f629..99965558 100644
> --- a/kernel-shared/inode.c
> +++ b/kernel-shared/inode.c
> @@ -584,7 +584,8 @@ out:
>
>   struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
>   				  const char *base, u64 root_objectid,
> -				  bool convert)
> +				  bool convert, u64 dirid,
> +				  bool dont_change_size)

Is there a reason for adding this new parameter?

Normally it's quite natural to increase the directory inode's size
when new entries are added.
Just like btrfs_add_link().



>   {
>   	struct btrfs_trans_handle *trans;
>   	struct btrfs_fs_info *fs_info = root->fs_info;
> @@ -594,7 +595,6 @@ struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
>   	struct btrfs_inode_item *inode_item;
>   	struct extent_buffer *leaf;
>   	struct btrfs_key key;
> -	u64 dirid = btrfs_root_dirid(&root->root_item);
>   	u64 index = 2;
>   	char buf[BTRFS_NAME_LEN + 1]; /* for snprintf null */
>   	int len;
> @@ -632,20 +632,6 @@ struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
>   		goto fail;
>   	}
>
> -	key.objectid = dirid;
> -	key.type =  BTRFS_INODE_ITEM_KEY;
> -	key.offset = 0;
> -
> -	ret = btrfs_lookup_inode(trans, root, &path, &key, 1);
> -	if (ret) {
> -		error("search for INODE_ITEM %llu failed: %d",
> -				(unsigned long long)dirid, ret);
> -		goto fail;
> -	}
> -	leaf = path.nodes[0];
> -	inode_item = btrfs_item_ptr(leaf, path.slots[0],
> -				    struct btrfs_inode_item);
> -
>   	key.objectid = root_objectid;
>   	key.type = BTRFS_ROOT_ITEM_KEY;
>   	key.offset = (u64)-1;
> @@ -670,10 +656,26 @@ struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
>   	if (ret)
>   		goto fail;
>
> -	btrfs_set_inode_size(leaf, inode_item, len * 2 +
> -			     btrfs_inode_size(leaf, inode_item));
> -	btrfs_mark_buffer_dirty(leaf);
> -	btrfs_release_path(&path);
> +	if (!dont_change_size) {
> +		key.objectid = dirid;
> +		key.type =  BTRFS_INODE_ITEM_KEY;
> +		key.offset = 0;
> +
> +		ret = btrfs_lookup_inode(trans, root, &path, &key, 1);
> +		if (ret) {
> +			error("search for INODE_ITEM %llu failed: %d",
> +					(unsigned long long)dirid, ret);
> +			goto fail;
> +		}
> +		leaf = path.nodes[0];
> +		inode_item = btrfs_item_ptr(leaf, path.slots[0],
> +					struct btrfs_inode_item);
> +
> +		btrfs_set_inode_size(leaf, inode_item, len * 2 +
> +				btrfs_inode_size(leaf, inode_item));
> +		btrfs_mark_buffer_dirty(leaf);
> +		btrfs_release_path(&path);
> +	}
>
>   	/* add the backref first */
>   	ret = btrfs_add_root_ref(trans, tree_root, root_objectid,
> @@ -703,6 +705,10 @@ struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
>   		goto fail;
>   	}
>
> +	key.objectid = root_objectid;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = (u64)-1;
> +
>   	new_root = btrfs_read_fs_root(fs_info, &key);
>   	if (IS_ERR(new_root)) {
>   		error("unable to fs read root: %lu", PTR_ERR(new_root));
> diff --git a/mkfs/main.c b/mkfs/main.c
> index b40f7432..63119fc3 100644
> --- a/mkfs/main.c
> +++ b/mkfs/main.c
> @@ -440,6 +440,7 @@ static const char * const mkfs_usage[] = {
>   	"Creation:",
>   	OPTLINE("-b|--byte-count SIZE", "set size of each device to SIZE (filesystem size is sum of all device sizes)"),
>   	OPTLINE("-r|--rootdir DIR", "copy files from DIR to the image root directory"),
> +	OPTLINE("-u|--subvol SUBDIR", "create SUBDIR as subvolume rather than normal directory"),
>   	OPTLINE("--shrink", "(with --rootdir) shrink the filled filesystem to minimal size"),
>   	OPTLINE("-K|--nodiscard", "do not perform whole device TRIM"),
>   	OPTLINE("-f|--force", "force overwrite of existing filesystem"),
> @@ -1168,6 +1169,67 @@ static void *prepare_one_device(void *ctx)
>   	return NULL;
>   }
>
> +static int create_subvol(struct btrfs_trans_handle *trans,
> +			 struct btrfs_root *root, u64 root_objectid)
> +{
> +	struct extent_buffer *tmp;
> +	struct btrfs_root *new_root;
> +	struct btrfs_key key;
> +	struct btrfs_root_item root_item;
> +	u8 uuid[BTRFS_UUID_SIZE];
> +	int ret;
> +
> +	ret = btrfs_copy_root(trans, root, root->node, &tmp,
> +			      root_objectid);

I'm not a super big fan of copying root just to skip the initialization
of some members.

Can't we just use btrfs_create_root() instead?

> +	if (ret)
> +		return ret;
> +
> +	uuid_generate(uuid);
> +
> +	memcpy(&root_item, &root->root_item, sizeof(root_item));
> +	btrfs_set_root_bytenr(&root_item, tmp->start);
> +	btrfs_set_root_level(&root_item, btrfs_header_level(tmp));
> +	btrfs_set_root_generation(&root_item, trans->transid);
> +	memcpy(&root_item.uuid, uuid, BTRFS_UUID_SIZE);
> +
> +	free_extent_buffer(tmp);
> +
> +	key.objectid = root_objectid;
> +	key.type = BTRFS_ROOT_ITEM_KEY;
> +	key.offset = trans->transid;
> +	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
> +				&key, &root_item);
> +
> +	key.offset = (u64)-1;
> +	new_root = btrfs_read_fs_root(root->fs_info, &key);
> +	if (!new_root || IS_ERR(new_root)) {
> +		error("unable to fs read root: %lu", PTR_ERR(new_root));
> +		return PTR_ERR(new_root);
> +	}
> +
> +	ret = btrfs_uuid_tree_add(trans, uuid, BTRFS_UUID_KEY_SUBVOL,
> +				  root_objectid);
> +	if (ret < 0) {
> +		error("failed to add uuid entry");
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int subvol_compar(const void *p1, const void *p2)
> +{
> +	const struct rootdir_subvol *s1 = *(const struct rootdir_subvol**)p1;
> +	const struct rootdir_subvol *s2 = *(const struct rootdir_subvol**)p2;
> +
> +	if (s1->depth < s2->depth)
> +		return 1;
> +	else if (s1->depth > s2->depth)
> +		return -1;
> +	else
> +		return 0;
> +}
> +
>   int BOX_MAIN(mkfs)(int argc, char **argv)
>   {
>   	char *file;
> @@ -1209,6 +1271,7 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>   	char *label = NULL;
>   	int nr_global_roots = sysconf(_SC_NPROCESSORS_ONLN);
>   	char *source_dir = NULL;
> +	LIST_HEAD(subvols);
>
>   	cpu_detect_flags();
>   	hash_init_accel();
> @@ -1239,6 +1302,7 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>   			{ "data", required_argument, NULL, 'd' },
>   			{ "version", no_argument, NULL, 'V' },
>   			{ "rootdir", required_argument, NULL, 'r' },
> +			{ "subvol", required_argument, NULL, 'u' },
>   			{ "nodiscard", no_argument, NULL, 'K' },
>   			{ "features", required_argument, NULL, 'O' },
>   			{ "runtime-features", required_argument, NULL, 'R' },
> @@ -1360,6 +1424,25 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>   				free(source_dir);
>   				source_dir = strdup(optarg);
>   				break;
> +			case 'u': {
> +				struct rootdir_subvol *s;
> +
> +				s = malloc(sizeof(struct rootdir_subvol));
> +				if (!s) {
> +					error("out of memory");
> +					goto error;
> +				}
> +
> +				s->dir = strdup(optarg);
> +				s->fullpath = NULL;
> +				s->parent = NULL;
> +				s->parent_inum = 0;
> +				INIT_LIST_HEAD(&s->children);
> +				s->root = NULL;
> +
> +				list_add_tail(&s->list, &subvols);
> +				break;
> +				}
>   			case 'U':
>   				strncpy_null(fs_uuid, optarg, BTRFS_UUID_UNPARSED_SIZE);
>   				break;
> @@ -1420,6 +1503,159 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>   		error("the option --shrink must be used with --rootdir");
>   		goto error;
>   	}
> +	if (!list_empty(&subvols) && source_dir == NULL) {
> +		error("the option --subvol must be used with --rootdir");
> +		goto error;
> +	}
> +
> +	if (source_dir) {
> +		char *canonical = realpath(source_dir, NULL);
> +
> +		if (!canonical) {
> +			error("could not get canonical path to %s", source_dir);
> +			goto error;
> +		}
> +
> +		free(source_dir);
> +		source_dir = canonical;
> +	}
> +
> +	if (!list_empty(&subvols)) {
> +		unsigned int num_subvols = 0;
> +		size_t source_dir_len = strlen(source_dir);
> +		struct rootdir_subvol **arr, **ptr, *s;
> +
> +		list_for_each_entry(s, &subvols, list) {
> +			size_t tmp_len;
> +			char *tmp, *path;
> +			struct rootdir_subvol *s2;
> +
> +			tmp_len = source_dir_len + 1 + strlen(s->dir) + 1;
> +
> +			tmp = malloc(tmp_len);
> +			if (!tmp) {
> +				error("out of memory");
> +				goto error;
> +			}
> +
> +			strcpy(tmp, source_dir);
> +			strcat(tmp, "/");
> +			strcat(tmp, s->dir);
> +
> +			if (!path_exists(tmp)) {
> +				error("subvol %s does not exist within rootdir",
> +				      s->dir);
> +				free(tmp);
> +				goto error;
> +			}
> +
> +			if (!path_is_dir(tmp)) {
> +				error("subvol %s is not a directory", s->dir);
> +				free(tmp);
> +				goto error;
> +			}
> +
> +			path = realpath(tmp, NULL);
> +
> +			free(tmp);
> +
> +			if (!path) {
> +				error("could not get canonical path to %s",
> +				      s->dir);
> +				goto error;
> +			}
> +
> +			if (strlen(path) < source_dir_len + 1 ||
> +			    memcmp(path, source_dir, source_dir_len) ||
> +			    path[source_dir_len] != '/') {
> +				error("subvol %s is not a child of %s",
> +				      s->dir, source_dir);
> +				free(path);
> +				goto error;
> +			}
> +
> +			for (s2 = list_first_entry(&subvols, struct rootdir_subvol, list);
> +			     s2 != s; s2 = list_next_entry(s2, list)) {
> +				if (!strcmp(s2->fullpath, path)) {
> +					error("subvol %s specified more than once",
> +					      s->dir);
> +					free(path);
> +					goto error;
> +				}
> +			}
> +
> +			s->fullpath = path;
> +
> +			s->depth = 0;
> +			for (i = source_dir_len + 1; i < strlen(s->fullpath); i++) {
> +				if (s->fullpath[i] == '/')
> +					s->depth++;
> +			}
> +
> +			num_subvols++;
> +		}
> +
> +		/* Reorder subvol list by depth. */
> +
> +		arr = malloc(sizeof(struct rootdir_subvol*) * num_subvols);
> +		if (!arr) {
> +			error("out of memory");
> +			goto error;
> +		}
> +
> +		ptr = arr;
> +
> +		list_for_each_entry(s, &subvols, list) {
> +			*ptr = s;
> +			ptr++;
> +		}
> +
> +		qsort(arr, num_subvols, sizeof(struct rootdir_subvol*),
> +		      subvol_compar);
> +
> +		INIT_LIST_HEAD(&subvols);
> +		for (i = 0; i < num_subvols; i++) {
> +			list_add_tail(&arr[i]->list, &subvols);
> +		}
> +
> +		free(arr);
> +
> +		/* Assign subvols to parents. */
> +
> +		list_for_each_entry(s, &subvols, list) {
> +			size_t len1;
> +
> +			if (s->depth == 0)
> +				break;
> +
> +			len1 = strlen(s->fullpath);
> +
> +			for (struct rootdir_subvol *s2 = list_next_entry(s, list);
> +			     !list_entry_is_head(s2, &subvols, list);
> +			     s2 = list_next_entry(s2, list)) {
> +				size_t len2;
> +
> +				if (s2->depth == s->depth)
> +					continue;
> +
> +				len2 = strlen(s2->fullpath);
> +
> +				if (len1 <= len2 + 1)
> +					continue;
> +
> +				if (s->fullpath[len2] != '/')
> +					continue;
> +
> +				if (memcmp(s->fullpath, s2->fullpath, len2))
> +					continue;
> +
> +				s->parent = s2;
> +				list_add_tail(&s->child_list, &s2->children);
> +
> +				break;
> +			}
> +		}
> +	}
>
>   	if (*fs_uuid) {
>   		uuid_t dummy_uuid;
> @@ -1964,9 +2200,68 @@ raid_groups:
>   		goto out;
>   	}
>
> +	if (!list_empty(&subvols)) {
> +		struct rootdir_subvol *s;
> +		u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
> +
> +		list_for_each_entry_reverse(s, &subvols, list) {
> +			struct btrfs_key key;
> +
> +			s->objectid = objectid;
> +
> +			trans = btrfs_start_transaction(root, 1);
> +			if (IS_ERR(trans)) {
> +				errno = -PTR_ERR(trans);
> +				error_msg(ERROR_MSG_START_TRANS, "%m");
> +				goto error;
> +			}
> +
> +			ret = create_subvol(trans, root, objectid);

Would it be possible to do the subvolume creation during regular
directory traversal?

That way we can just treat a target subvolume as a slightly different
kind of directory creation.
The biggest problem here is that we only insert the root items without any
backref and immediately commit the transaction, and we would need special
handling for target subvolumes anyway.

If the mkfs is somehow interrupted, what we get is a corrupted fs
with a lot of subvolumes which cannot be accessed.
(Well, not a huge problem: since the mkfs is not done, its super magic is
not a valid one, so the kernel won't be able to mount it anyway)

Thanks,
Qu
> +			if (ret < 0) {
> +				error("failed to create subvolume: %d", ret);
> +				goto out;
> +			}
> +
> +			ret = btrfs_commit_transaction(trans, root);
> +			if (ret) {
> +				errno = -ret;
> +				error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
> +				goto out;
> +			}
> +
> +			key.objectid = objectid;
> +			key.type = BTRFS_ROOT_ITEM_KEY;
> +			key.offset = (u64)-1;
> +
> +			s->root = btrfs_read_fs_root(fs_info, &key);
> +			if (IS_ERR(s->root)) {
> +				error("unable to fs read root: %lu", PTR_ERR(s->root));
> +				goto out;
> +			}
> +
> +			objectid++;
> +		}
> +	}
> +
>   	if (source_dir) {
> +		LIST_HEAD(subvol_children);
> +
>   		pr_verbose(LOG_DEFAULT, "Rootdir from:       %s\n", source_dir);
> -		ret = btrfs_mkfs_fill_dir(source_dir, root);
> +
> +		if (!list_empty(&subvols)) {
> +			struct rootdir_subvol *s;
> +
> +			list_for_each_entry(s, &subvols, list) {
> +				if (s->parent)
> +					continue;
> +
> +				list_add_tail(&s->child_list,
> +					      &subvol_children);
> +			}
> +		}
> +
> +		ret = btrfs_mkfs_fill_dir(source_dir, root,
> +					  &subvol_children);
>   		if (ret) {
>   			error("error while filling filesystem: %d", ret);
>   			goto out;
> @@ -1983,6 +2278,41 @@ raid_groups:
>   		} else {
>   			pr_verbose(LOG_DEFAULT, "  Shrink:           no\n");
>   		}
> +
> +		if (!list_empty(&subvols)) {
> +			struct rootdir_subvol *s;
> +
> +			list_for_each_entry_reverse(s, &subvols, list) {
> +				pr_verbose(LOG_DEFAULT,
> +					   "  Subvol from:      %s\n",
> +					   s->fullpath);
> +			}
> +		}
> +	}
> +
> +	if (!list_empty(&subvols)) {
> +		struct rootdir_subvol *s;
> +
> +		list_for_each_entry(s, &subvols, list) {
> +			ret = btrfs_mkfs_fill_dir(s->fullpath, s->root,
> +						  &s->children);
> +			if (ret) {
> +				error("error while filling filesystem: %d",
> +				      ret);
> +				goto out;
> +			}
> +		}
> +
> +		list_for_each_entry_reverse(s, &subvols, list) {
> +			if (!btrfs_mksubvol(s->parent ? s->parent->root : root,
> +					    path_basename(s->dir), s->objectid,
> +					    false, s->parent_inum,
> +					    true)) {
> +				error("unable to link subvolume %s",
> +				      path_basename(s->dir));
> +				goto out;
> +			}
> +		}
>   	}
>
>   	if (features.runtime_flags & BTRFS_FEATURE_RUNTIME_QUOTA ||
> @@ -2076,6 +2406,18 @@ out:
>   	free(label);
>   	free(source_dir);
>
> +	while (!list_empty(&subvols)) {
> +		struct rootdir_subvol *head = list_entry(subvols.next,
> +					      struct rootdir_subvol,
> +					      list);
> +
> +		free(head->dir);
> +		free(head->fullpath);
> +
> +		list_del(&head->list);
> +		free(head);
> +	}
> +
>   	return !!ret;
>
>   error:
> @@ -2087,6 +2429,19 @@ error:
>   	free(prepare_ctx);
>   	free(label);
>   	free(source_dir);
> +
> +	while (!list_empty(&subvols)) {
> +		struct rootdir_subvol *head = list_entry(subvols.next,
> +					      struct rootdir_subvol,
> +					      list);
> +
> +		free(head->dir);
> +		free(head->fullpath);
> +
> +		list_del(&head->list);
> +		free(head);
> +	}
> +
>   	exit(1);
>   success:
>   	exit(0);
> diff --git a/mkfs/rootdir.c b/mkfs/rootdir.c
> index 617a7efd..3377bec5 100644
> --- a/mkfs/rootdir.c
> +++ b/mkfs/rootdir.c
> @@ -493,7 +493,8 @@ error:
>
>   static int traverse_directory(struct btrfs_trans_handle *trans,
>   			      struct btrfs_root *root, const char *dir_name,
> -			      struct directory_name_entry *dir_head)
> +			      struct directory_name_entry *dir_head,
> +			      struct list_head *subvol_children)
>   {
>   	int ret = 0;
>
> @@ -570,6 +571,28 @@ static int traverse_directory(struct btrfs_trans_handle *trans,
>   				pr_verbose(LOG_INFO, "ADD: %s\n", tmp);
>   			}
>
> +			/* Omit child if it is going to be a subvolume. */
> +			if (!list_empty(subvol_children) && S_ISDIR(st.st_mode)) {
> +				struct rootdir_subvol *s;
> +				bool skip = false;
> +
> +				if (bconf.verbose < LOG_INFO) {
> +					path_cat_out(tmp, parent_dir_entry->path,
> +						     cur_file->d_name);
> +				}
> +
> +				list_for_each_entry(s, subvol_children, child_list) {
> +					if (!strcmp(tmp, s->fullpath)) {
> +						s->parent_inum = parent_inum;
> +						skip = true;
> +						break;
> +					}
> +				}
> +
> +				if (skip)
> +					continue;
> +			}
> +
>   			/*
>   			 * We can not directly use the source ino number,
>   			 * as there is a chance that the ino is smaller than
> @@ -680,7 +703,8 @@ fail_no_dir:
>   	goto out;
>   }
>
> -int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root)
> +int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root,
> +			struct list_head *subvol_children)
>   {
>   	int ret;
>   	struct btrfs_trans_handle *trans;
> @@ -705,7 +729,8 @@ int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root)
>   		goto fail;
>   }
>
> -	ret = traverse_directory(trans, root, source_dir, &dir_head);
> +	ret = traverse_directory(trans, root, source_dir, &dir_head,
> +				 subvol_children);
>   	if (ret) {
>   		error("unable to traverse directory %s: %d", source_dir, ret);
>   		goto fail;
> diff --git a/mkfs/rootdir.h b/mkfs/rootdir.h
> index 8d5f6896..598eb1a7 100644
> --- a/mkfs/rootdir.h
> +++ b/mkfs/rootdir.h
> @@ -36,7 +36,21 @@ struct directory_name_entry {
>   	struct list_head list;
>   };
>
> -int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root);
> +struct rootdir_subvol {
> +	struct list_head list;
> +	struct list_head child_list;
> +	char *dir;
> +	char *fullpath;
> +	struct rootdir_subvol *parent;
> +	u64 parent_inum;
> +	struct list_head children;
> +	unsigned int depth;
> +	u64 objectid;
> +	struct btrfs_root *root;
> +};
> +
> +int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root,
> +			struct list_head *subvol_children);
>   u64 btrfs_mkfs_size_dir(const char *dir_name, u32 sectorsize, u64 min_dev_size,
>   			u64 meta_profile, u64 data_profile);
>   int btrfs_mkfs_shrink_fs(struct btrfs_fs_info *fs_info, u64 *new_size_ret,
> diff --git a/tests/mkfs-tests/034-rootdir-subvol/test.sh b/tests/mkfs-tests/034-rootdir-subvol/test.sh
> new file mode 100755
> index 00000000..d8085659
> --- /dev/null
> +++ b/tests/mkfs-tests/034-rootdir-subvol/test.sh
> @@ -0,0 +1,33 @@
> +#!/bin/bash
> +# smoke test for mkfs.btrfs --subvol option
> +
> +source "$TEST_TOP/common" || exit
> +
> +check_prereq mkfs.btrfs
> +check_prereq btrfs
> +
> +setup_root_helper
> +prepare_test_dev
> +
> +tmp=$(_mktemp_dir mkfs-rootdir)
> +
> +touch $tmp/foo
> +mkdir $tmp/dir
> +mkdir $tmp/dir/subvol
> +touch $tmp/dir/subvol/bar
> +
> +run_check_mkfs_test_dev --rootdir "$tmp" --subvol dir/subvol
> +run_check $SUDO_HELPER "$TOP/btrfs" check "$TEST_DEV"
> +
> +run_check_mount_test_dev
> +run_check_stdout $SUDO_HELPER "$TOP/btrfs" subvolume list "$TEST_MNT" | \
> +	cut -d\  -f9 > "$tmp/output"
> +run_check_umount_test_dev
> +
> +result=$(cat "$tmp/output")
> +
> +if [ "$result" != "dir/subvol" ]; then
> +	_fail "dir/subvol not in subvolume list"
> +fi
> +
> +rm -rf -- "$tmp"
Mark Harmstone June 28, 2024, 9:47 a.m. UTC | #2
Thanks Qu.

> >   struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
> >   				  const char *base, u64 root_objectid,
> > -				  bool convert)
> > +				  bool convert, u64 dirid,
> > +				  bool dont_change_size)
> 
> Any reason why adding this new parameter?
> 
> Normally it's pretty nature that we increase the directory inode's size with new
> entries.
> Just like btrfs_add_link().

Because btrfs_mkfs_fill_dir calculates the inode size as normal, but skips any
directories that are going to be subvols - so we'd be double-counting otherwise.

Another way to do it would have been to pass the subvol list to
calculate_dir_inode_size, but this would have meant a lot more recursing,
and writing the parent dir inode item more than once.

> > +	ret = btrfs_copy_root(trans, root, root->node, &tmp,
> > +			      root_objectid);
> 
> I'm not a super big fan of copying root just to skip the initialization of some
> members.
> 
> Can't we just use btrfs_create_root() instead?

I was copying what we do in convert.c here. It's something I can look into in
a later patch, if necessary.
 
> Would it be possible to do the subvolume creation during regular directory
> traversal?
> 
> By that we can just treat a target subvolume as a slightly different directory
> creation.
> The biggest problem here is, we only insert the root items without any backref,
> and immediately commits the transaction, and would need special handling for
> target subvolumes anyway.

I think the issue was that btrfs_mksubvol does its own transaction, so can't
be called from within btrfs_mkfs_fill_dir. Quite possibly it could be done with
some refactoring, but I've tried to keep the code changes to a minimum...

> If by somehow the mkfs is interrupted, what we got is a corrupted fs with a lot
> of subvolume which can not be accessed.
> (Well, not a huge problem since the mkfs is not done, its super magic is not a
> valid one, kernel won't be able to mount them anyway)

Yes, I don't think transactions are all that important when it comes to mkfs,
as you either have a valid filesystem or unmountable nonsense.

Thanks

Mark
Goffredo Baroncelli June 28, 2024, 5:06 p.m. UTC | #3
On 27/06/2024 11.54, Mark Harmstone wrote:
> From: Mark Harmstone <maharmstone@meta.com>
> 
> This patch adds a --subvol option, which tells mkfs.btrfs to create the
> specified directories as subvolumes.
> 
> Given a populated directory img, the command
> 
> $ mkfs.btrfs --rootdir img --subvol usr --subvol home --subvol home/username /dev/loop0
> 
> will create subvolumes usr and home within the FS root, and subvolume
> username within the home subvolume. It will fail if any of the
> directories do not yet exist.
> 

Would it be possible to decouple the "--rootdir" and "--subvol" options?
I.e. do a first pass where only the subvolumes/subdirs are created, and a second one where
all the subvolumes are populated.

The use case is creating only the subvol, without --rootdir. My goal is to populate a btrfs
filesystem with a "root" subvol and make it the default. This is to simplify later snapshots.

Until now, subvol=0 has been special because it can be snapshotted but cannot be deleted.

Having / in a default subvol could simplify snapshotting the filesystem.

Otherwise, mkfs.btrfs could create a (temporary) root image with the minimal directories needed...
but that seems a bit overkill.


> Signed-off-by: Mark Harmstone <maharmstone@meta.com>
> ---
>   convert/main.c                              |   4 +-
>   kernel-shared/ctree.h                       |   3 +-
[...]

>   
>   	if (features.runtime_flags & BTRFS_FEATURE_RUNTIME_QUOTA ||
> @@ -2076,6 +2406,18 @@ out:
>   	free(label);
>   	free(source_dir);
>   
> +	while (!list_empty(&subvols)) {
> +		struct rootdir_subvol *head = list_entry(subvols.next,
> +					      struct rootdir_subvol,
> +					      list);
> +
> +		free(head->dir);
> +		free(head->fullpath);
> +
> +		list_del(&head->list);
> +		free(head);
> +	}
> +

Because this code appears more than once, it can be refactored into a dedicated function:

void free_subvols_list(list_head *subvols) {
	while (!list_empty(subvols)) {
		struct rootdir_subvol *head = list_entry(subvols.next,
					      struct rootdir_subvol,
					      list);

		free(head->dir);
		free(head->fullpath);

		list_del(&head->list);
		free(head);
}

>   	return !!ret;
>   
>   error:
> @@ -2087,6 +2429,19 @@ error:
>   	free(prepare_ctx);
>   	free(label);
>   	free(source_dir);
> +
> +	while (!list_empty(&subvols)) {
> +		struct rootdir_subvol *head = list_entry(subvols.next,
> +					      struct rootdir_subvol,
> +					      list);
> +
> +		free(head->dir);
> +		free(head->fullpath);
> +
> +		list_del(&head->list);
> +		free(head);
> +	}
> +

Same as above

[...]
diff mbox series

Patch

diff --git a/convert/main.c b/convert/main.c
index 8e73aa25..7249c793 100644
--- a/convert/main.c
+++ b/convert/main.c
@@ -1314,7 +1314,9 @@  static int do_convert(const char *devname, u32 convert_flags, u32 nodesize,
 	}
 
 	image_root = btrfs_mksubvol(root, subvol_name,
-				    CONV_IMAGE_SUBVOL_OBJECTID, true);
+				    CONV_IMAGE_SUBVOL_OBJECTID, true,
+				    btrfs_root_dirid(&root->root_item),
+				    false);
 	if (!image_root) {
 		error("unable to link subvolume %s", subvol_name);
 		goto fail;
diff --git a/kernel-shared/ctree.h b/kernel-shared/ctree.h
index 1341a418..8a5ddcdb 100644
--- a/kernel-shared/ctree.h
+++ b/kernel-shared/ctree.h
@@ -1230,7 +1230,8 @@  int btrfs_add_orphan_item(struct btrfs_trans_handle *trans,
 int btrfs_mkdir(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		char *name, int namelen, u64 parent_ino, u64 *ino, int mode);
 struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root, const char *base,
-				  u64 root_objectid, bool convert);
+				  u64 root_objectid, bool convert, u64 dirid,
+				  bool dont_change_size);
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
 			     u64 dirid, u64 *objectid);
diff --git a/kernel-shared/inode.c b/kernel-shared/inode.c
index 91b4f629..99965558 100644
--- a/kernel-shared/inode.c
+++ b/kernel-shared/inode.c
@@ -584,7 +584,8 @@  out:
 
 struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
 				  const char *base, u64 root_objectid,
-				  bool convert)
+				  bool convert, u64 dirid,
+				  bool dont_change_size)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -594,7 +595,6 @@  struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
-	u64 dirid = btrfs_root_dirid(&root->root_item);
 	u64 index = 2;
 	char buf[BTRFS_NAME_LEN + 1]; /* for snprintf null */
 	int len;
@@ -632,20 +632,6 @@  struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
 		goto fail;
 	}
 
-	key.objectid = dirid;
-	key.type =  BTRFS_INODE_ITEM_KEY;
-	key.offset = 0;
-
-	ret = btrfs_lookup_inode(trans, root, &path, &key, 1);
-	if (ret) {
-		error("search for INODE_ITEM %llu failed: %d",
-				(unsigned long long)dirid, ret);
-		goto fail;
-	}
-	leaf = path.nodes[0];
-	inode_item = btrfs_item_ptr(leaf, path.slots[0],
-				    struct btrfs_inode_item);
-
 	key.objectid = root_objectid;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
@@ -670,10 +656,26 @@  struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
 	if (ret)
 		goto fail;
 
-	btrfs_set_inode_size(leaf, inode_item, len * 2 +
-			     btrfs_inode_size(leaf, inode_item));
-	btrfs_mark_buffer_dirty(leaf);
-	btrfs_release_path(&path);
+	if (!dont_change_size) {
+		key.objectid = dirid;
+		key.type =  BTRFS_INODE_ITEM_KEY;
+		key.offset = 0;
+
+		ret = btrfs_lookup_inode(trans, root, &path, &key, 1);
+		if (ret) {
+			error("search for INODE_ITEM %llu failed: %d",
+					(unsigned long long)dirid, ret);
+			goto fail;
+		}
+		leaf = path.nodes[0];
+		inode_item = btrfs_item_ptr(leaf, path.slots[0],
+					struct btrfs_inode_item);
+
+		btrfs_set_inode_size(leaf, inode_item, len * 2 +
+				btrfs_inode_size(leaf, inode_item));
+		btrfs_mark_buffer_dirty(leaf);
+		btrfs_release_path(&path);
+	}
 
 	/* add the backref first */
 	ret = btrfs_add_root_ref(trans, tree_root, root_objectid,
@@ -703,6 +705,10 @@  struct btrfs_root *btrfs_mksubvol(struct btrfs_root *root,
 		goto fail;
 	}
 
+	key.objectid = root_objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
 	new_root = btrfs_read_fs_root(fs_info, &key);
 	if (IS_ERR(new_root)) {
 		error("unable to fs read root: %lu", PTR_ERR(new_root));
diff --git a/mkfs/main.c b/mkfs/main.c
index b40f7432..63119fc3 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -440,6 +440,7 @@  static const char * const mkfs_usage[] = {
 	"Creation:",
 	OPTLINE("-b|--byte-count SIZE", "set size of each device to SIZE (filesystem size is sum of all device sizes)"),
 	OPTLINE("-r|--rootdir DIR", "copy files from DIR to the image root directory"),
+	OPTLINE("-u|--subvol SUBDIR", "create SUBDIR as subvolume rather than normal directory"),
 	OPTLINE("--shrink", "(with --rootdir) shrink the filled filesystem to minimal size"),
 	OPTLINE("-K|--nodiscard", "do not perform whole device TRIM"),
 	OPTLINE("-f|--force", "force overwrite of existing filesystem"),
@@ -1168,6 +1169,67 @@  static void *prepare_one_device(void *ctx)
 	return NULL;
 }
 
+/*
+ * Create a new subvolume tree with id @root_objectid.
+ *
+ * The tree's initial content is a copy of @root's current root node
+ * (btrfs_copy_root()). A ROOT_ITEM derived from @root's root item is inserted
+ * into the tree root, and a freshly generated UUID is recorded for the new
+ * subvolume via btrfs_uuid_tree_add().
+ *
+ * Returns 0 on success, a negative error code on failure.
+ */
+static int create_subvol(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, u64 root_objectid)
+{
+	struct extent_buffer *tmp;
+	struct btrfs_root *new_root;
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	u8 uuid[BTRFS_UUID_SIZE];
+	int ret;
+
+	ret = btrfs_copy_root(trans, root, root->node, &tmp,
+			      root_objectid);
+	if (ret)
+		return ret;
+
+	uuid_generate(uuid);
+
+	/* Start from @root's root item, then point it at the copied node. */
+	memcpy(&root_item, &root->root_item, sizeof(root_item));
+	btrfs_set_root_bytenr(&root_item, tmp->start);
+	btrfs_set_root_level(&root_item, btrfs_header_level(tmp));
+	btrfs_set_root_generation(&root_item, trans->transid);
+	memcpy(&root_item.uuid, uuid, BTRFS_UUID_SIZE);
+
+	free_extent_buffer(tmp);
+
+	key.objectid = root_objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = trans->transid;
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+				&key, &root_item);
+	if (ret) {
+		error("failed to insert root item for subvolume %llu: %d",
+		      (unsigned long long)root_objectid, ret);
+		return ret;
+	}
+
+	/* Check that the new root can be read back. */
+	key.offset = (u64)-1;
+	new_root = btrfs_read_fs_root(root->fs_info, &key);
+	if (!new_root || IS_ERR(new_root)) {
+		/* PTR_ERR(NULL) is 0, which the caller would take as success. */
+		ret = new_root ? PTR_ERR(new_root) : -ENOENT;
+		error("unable to fs read root: %d", ret);
+		return ret;
+	}
+
+	ret = btrfs_uuid_tree_add(trans, uuid, BTRFS_UUID_KEY_SUBVOL,
+				  root_objectid);
+	if (ret < 0) {
+		error("failed to add uuid entry");
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * qsort() comparator for an array of struct rootdir_subvol pointers.
+ * Orders by descending depth: the deeper subvolume sorts first.
+ */
+static int subvol_compar(const void *p1, const void *p2)
+{
+	const struct rootdir_subvol *a = *(const struct rootdir_subvol * const *)p1;
+	const struct rootdir_subvol *b = *(const struct rootdir_subvol * const *)p2;
+
+	/* 1 if a is shallower, -1 if a is deeper, 0 if equal. */
+	return (a->depth < b->depth) - (a->depth > b->depth);
+}
+
 int BOX_MAIN(mkfs)(int argc, char **argv)
 {
 	char *file;
@@ -1209,6 +1271,7 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 	char *label = NULL;
 	int nr_global_roots = sysconf(_SC_NPROCESSORS_ONLN);
 	char *source_dir = NULL;
+	LIST_HEAD(subvols);
 
 	cpu_detect_flags();
 	hash_init_accel();
@@ -1239,6 +1302,7 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 			{ "data", required_argument, NULL, 'd' },
 			{ "version", no_argument, NULL, 'V' },
 			{ "rootdir", required_argument, NULL, 'r' },
+			{ "subvol", required_argument, NULL, 'u' },
 			{ "nodiscard", no_argument, NULL, 'K' },
 			{ "features", required_argument, NULL, 'O' },
 			{ "runtime-features", required_argument, NULL, 'R' },
@@ -1360,6 +1424,25 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 				free(source_dir);
 				source_dir = strdup(optarg);
 				break;
+			case 'u': {
+				struct rootdir_subvol *s;
+
+				s = malloc(sizeof(*s));
+				if (!s) {
+					error("out of memory");
+					goto error;
+				}
+
+				/*
+				 * Keep a private copy of the argument; the
+				 * path is validated against --rootdir once
+				 * all options have been parsed.
+				 */
+				s->dir = strdup(optarg);
+				if (!s->dir) {
+					error("out of memory");
+					free(s);
+					goto error;
+				}
+				s->fullpath = NULL;
+				s->parent = NULL;
+				s->parent_inum = 0;
+				INIT_LIST_HEAD(&s->children);
+				s->root = NULL;
+
+				list_add_tail(&s->list, &subvols);
+				break;
+				}
 			case 'U':
 				strncpy_null(fs_uuid, optarg, BTRFS_UUID_UNPARSED_SIZE);
 				break;
@@ -1420,6 +1503,159 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 		error("the option --shrink must be used with --rootdir");
 		goto error;
 	}
+	if (!list_empty(&subvols) && source_dir == NULL) {
+		error("the option --subvol must be used with --rootdir");
+		goto error;
+	}
+
+	if (source_dir) {
+		char *canonical = realpath(source_dir, NULL);
+
+		if (!canonical) {
+			error("could not get canonical path to %s", source_dir);
+			goto error;
+		}
+
+		free(source_dir);
+		source_dir = canonical;
+	}
+
+	if (!list_empty(&subvols)) {
+		unsigned int num_subvols = 0;
+		size_t source_dir_len = strlen(source_dir);
+		struct rootdir_subvol **arr, **ptr, *s;
+
+		list_for_each_entry(s, &subvols, list) {
+			size_t tmp_len;
+			char *tmp, *path;
+			struct rootdir_subvol *s2;
+
+			tmp_len = source_dir_len + 1 + strlen(s->dir) + 1;
+
+			tmp = malloc(tmp_len);
+			if (!tmp) {
+				error("out of memory");
+				goto error;
+			}
+
+			strcpy(tmp, source_dir);
+			strcat(tmp, "/");
+			strcat(tmp, s->dir);
+
+			if (!path_exists(tmp)) {
+				error("subvol %s does not exist within rootdir",
+				      s->dir);
+				free(tmp);
+				goto error;
+			}
+
+			if (!path_is_dir(tmp)) {
+				error("subvol %s is not a directory", s->dir);
+				free(tmp);
+				goto error;
+			}
+
+			path = realpath(tmp, NULL);
+
+			free(tmp);
+
+			if (!path) {
+				error("could not get canonical path to %s",
+				      s->dir);
+				goto error;
+			}
+
+			if (strlen(path) < source_dir_len + 1 ||
+			    memcmp(path, source_dir, source_dir_len) ||
+			    path[source_dir_len] != '/') {
+				error("subvol %s is not a child of %s",
+				      s->dir, source_dir);
+				free(path);
+				goto error;
+			}
+
+			for (s2 = list_first_entry(&subvols, struct rootdir_subvol, list);
+			     s2 != s; s2 = list_next_entry(s2, list)) {
+				if (!strcmp(s2->fullpath, path)) {
+					error("subvol %s specified more than once",
+					      s->dir);
+					free(path);
+					goto error;
+				}
+			}
+
+			s->fullpath = path;
+
+			s->depth = 0;
+			for (i = source_dir_len + 1; i < strlen(s->fullpath); i++) {
+				if (s->fullpath[i] == '/')
+					s->depth++;
+			}
+
+			num_subvols++;
+		}
+
+		/* Reorder subvol list by depth. */
+
+		arr = malloc(sizeof(struct rootdir_subvol*) * num_subvols);
+		if (!arr) {
+			error("out of memory");
+			goto error;
+		}
+
+		ptr = arr;
+
+		list_for_each_entry(s, &subvols, list) {
+			*ptr = s;
+			ptr++;
+		}
+
+		qsort(arr, num_subvols, sizeof(struct rootdir_subvol*),
+		      subvol_compar);
+
+		INIT_LIST_HEAD(&subvols);
+		for (i = 0; i < num_subvols; i++) {
+			list_add_tail(&arr[i]->list, &subvols);
+		}
+
+		free(arr);
+
+		/* Assign subvols to parents. */
+
+		list_for_each_entry(s, &subvols, list) {
+			size_t len1;
+
+			if (s->depth == 0)
+				break;
+
+			len1 = strlen(s->fullpath);
+
+			for (struct rootdir_subvol *s2 = list_next_entry(s, list);
+			     !list_entry_is_head(s2, &subvols, list);
+			     s2 = list_next_entry(s2, list)) {
+				size_t len2;
+
+				if (s2->depth == s->depth)
+					continue;
+
+				len2 = strlen(s2->fullpath);
+
+				if (len1 <= len2 + 1)
+					continue;
+
+				if (s->fullpath[len2] != '/')
+					continue;
+
+				if (memcmp(s->fullpath, s2->fullpath, len2))
+					continue;
+
+				s->parent = s2;
+				list_add_tail(&s->child_list, &s2->children);
+
+				break;
+			}
+		}
+	}
 
 	if (*fs_uuid) {
 		uuid_t dummy_uuid;
@@ -1964,9 +2200,68 @@  raid_groups:
 		goto out;
 	}
 
+	if (!list_empty(&subvols)) {
+		struct rootdir_subvol *s;
+		u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+
+		/*
+		 * Create one tree per requested subvolume, each in its own
+		 * transaction, handing out consecutive objectids. The list is
+		 * sorted deepest-first, so it is walked backwards here.
+		 */
+		list_for_each_entry_reverse(s, &subvols, list) {
+			struct btrfs_key key;
+
+			s->objectid = objectid;
+
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				errno = -PTR_ERR(trans);
+				error_msg(ERROR_MSG_START_TRANS, "%m");
+				goto error;
+			}
+
+			ret = create_subvol(trans, root, objectid);
+			if (ret < 0) {
+				error("failed to create subvolume: %d", ret);
+				goto out;
+			}
+
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret) {
+				errno = -ret;
+				error_msg(ERROR_MSG_COMMIT_TRANS, "%m");
+				goto out;
+			}
+
+			key.objectid = objectid;
+			key.type = BTRFS_ROOT_ITEM_KEY;
+			key.offset = (u64)-1;
+
+			s->root = btrfs_read_fs_root(fs_info, &key);
+			if (IS_ERR(s->root)) {
+				/*
+				 * ret is 0 after the successful commit; set it
+				 * so that the "out" path reports the failure.
+				 */
+				ret = PTR_ERR(s->root);
+				error("unable to fs read root: %d", ret);
+				goto out;
+			}
+
+			objectid++;
+		}
+	}
+
 	if (source_dir) {
+		LIST_HEAD(subvol_children);
+
 		pr_verbose(LOG_DEFAULT, "Rootdir from:       %s\n", source_dir);
-		ret = btrfs_mkfs_fill_dir(source_dir, root);
+
+		if (!list_empty(&subvols)) {
+			struct rootdir_subvol *s;
+
+			list_for_each_entry(s, &subvols, list) {
+				if (s->parent)
+					continue;
+
+				list_add_tail(&s->child_list,
+					      &subvol_children);
+			}
+		}
+
+		ret = btrfs_mkfs_fill_dir(source_dir, root,
+					  &subvol_children);
 		if (ret) {
 			error("error while filling filesystem: %d", ret);
 			goto out;
@@ -1983,6 +2278,41 @@  raid_groups:
 		} else {
 			pr_verbose(LOG_DEFAULT, "  Shrink:           no\n");
 		}
+
+		if (!list_empty(&subvols)) {
+			struct rootdir_subvol *s;
+
+			list_for_each_entry_reverse(s, &subvols, list) {
+				pr_verbose(LOG_DEFAULT,
+					   "  Subvol from:      %s\n",
+					   s->fullpath);
+			}
+		}
+	}
+
+	if (!list_empty(&subvols)) {
+		struct rootdir_subvol *s;
+
+		/*
+		 * Populate every subvolume from its own source directory.
+		 * Nested subvolumes (s->children) are passed along so that
+		 * they are left out of the copy.
+		 */
+		list_for_each_entry(s, &subvols, list) {
+			ret = btrfs_mkfs_fill_dir(s->fullpath, s->root,
+						  &s->children);
+			if (ret) {
+				error("error while filling filesystem: %d",
+				      ret);
+				goto out;
+			}
+		}
+
+		/*
+		 * Link each subvolume into the directory it was found in
+		 * (s->parent_inum) of its parent subvolume, or of the
+		 * top-level tree when it has no parent.
+		 */
+		list_for_each_entry_reverse(s, &subvols, list) {
+			if (!btrfs_mksubvol(s->parent ? s->parent->root : root,
+					    path_basename(s->dir), s->objectid,
+					    false, s->parent_inum,
+					    true)) {
+				error("unable to link subvolume %s",
+				      path_basename(s->dir));
+				/*
+				 * btrfs_mksubvol() only signals failure with
+				 * NULL and ret is still 0 from the fill loop
+				 * above, so set it for the exit status.
+				 */
+				ret = -1;
+				goto out;
+			}
+		}
+	}
 
 	if (features.runtime_flags & BTRFS_FEATURE_RUNTIME_QUOTA ||
@@ -2076,6 +2406,18 @@  out:
 	free(label);
 	free(source_dir);
 
+	while (!list_empty(&subvols)) {
+		struct rootdir_subvol *head = list_entry(subvols.next,
+					      struct rootdir_subvol,
+					      list);
+
+		free(head->dir);
+		free(head->fullpath);
+
+		list_del(&head->list);
+		free(head);
+	}
+
 	return !!ret;
 
 error:
@@ -2087,6 +2429,19 @@  error:
 	free(prepare_ctx);
 	free(label);
 	free(source_dir);
+
+	while (!list_empty(&subvols)) {
+		struct rootdir_subvol *head = list_entry(subvols.next,
+					      struct rootdir_subvol,
+					      list);
+
+		free(head->dir);
+		free(head->fullpath);
+
+		list_del(&head->list);
+		free(head);
+	}
+
 	exit(1);
 success:
 	exit(0);
diff --git a/mkfs/rootdir.c b/mkfs/rootdir.c
index 617a7efd..3377bec5 100644
--- a/mkfs/rootdir.c
+++ b/mkfs/rootdir.c
@@ -493,7 +493,8 @@  error:
 
 static int traverse_directory(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root, const char *dir_name,
-			      struct directory_name_entry *dir_head)
+			      struct directory_name_entry *dir_head,
+			      struct list_head *subvol_children)
 {
 	int ret = 0;
 
@@ -570,6 +571,28 @@  static int traverse_directory(struct btrfs_trans_handle *trans,
 				pr_verbose(LOG_INFO, "ADD: %s\n", tmp);
 			}
 
+			/* Omit child if it is going to be a subvolume. */
+			if (!list_empty(subvol_children) && S_ISDIR(st.st_mode)) {
+				struct rootdir_subvol *s;
+				bool skip = false;
+
+				if (bconf.verbose < LOG_INFO) {
+					path_cat_out(tmp, parent_dir_entry->path,
+						     cur_file->d_name);
+				}
+
+				list_for_each_entry(s, subvol_children, child_list) {
+					if (!strcmp(tmp, s->fullpath)) {
+						s->parent_inum = parent_inum;
+						skip = true;
+						break;
+					}
+				}
+
+				if (skip)
+					continue;
+			}
+
 			/*
 			 * We can not directly use the source ino number,
 			 * as there is a chance that the ino is smaller than
@@ -680,7 +703,8 @@  fail_no_dir:
 	goto out;
 }
 
-int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root)
+int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root,
+			struct list_head *subvol_children)
 {
 	int ret;
 	struct btrfs_trans_handle *trans;
@@ -705,7 +729,8 @@  int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root)
 		goto fail;
 }
 
-	ret = traverse_directory(trans, root, source_dir, &dir_head);
+	ret = traverse_directory(trans, root, source_dir, &dir_head,
+				 subvol_children);
 	if (ret) {
 		error("unable to traverse directory %s: %d", source_dir, ret);
 		goto fail;
diff --git a/mkfs/rootdir.h b/mkfs/rootdir.h
index 8d5f6896..598eb1a7 100644
--- a/mkfs/rootdir.h
+++ b/mkfs/rootdir.h
@@ -36,7 +36,21 @@  struct directory_name_entry {
 	struct list_head list;
 };
 
-int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root);
+/*
+ * One directory requested via --subvol, tracked from option parsing until the
+ * subvolume has been created, filled and linked into the new filesystem.
+ */
+struct rootdir_subvol {
+	/* Entry in mkfs's list of all requested subvolumes. */
+	struct list_head list;
+	/*
+	 * Entry in the parent's @children list, or in the list of top-level
+	 * subvolumes handed to btrfs_mkfs_fill_dir() for the rootdir itself.
+	 */
+	struct list_head child_list;
+	/* Heap-allocated copy of the path given on the command line. */
+	char *dir;
+	/* Canonical path of the source directory, as returned by realpath(). */
+	char *fullpath;
+	/* Closest enclosing requested subvolume, or NULL if there is none. */
+	struct rootdir_subvol *parent;
+	/*
+	 * Inode number of the containing directory, recorded by
+	 * traverse_directory() when it skips this directory; 0 until then.
+	 */
+	u64 parent_inum;
+	/* Subvolumes whose @parent is this one, linked via @child_list. */
+	struct list_head children;
+	/* Number of '/' in @fullpath below the rootdir; 0 for a direct child. */
+	unsigned int depth;
+	/* Objectid assigned to the subvolume's tree. */
+	u64 objectid;
+	/* The subvolume's tree once it has been created; NULL before that. */
+	struct btrfs_root *root;
+};
+
+int btrfs_mkfs_fill_dir(const char *source_dir, struct btrfs_root *root,
+			struct list_head *subvol_children);
 u64 btrfs_mkfs_size_dir(const char *dir_name, u32 sectorsize, u64 min_dev_size,
 			u64 meta_profile, u64 data_profile);
 int btrfs_mkfs_shrink_fs(struct btrfs_fs_info *fs_info, u64 *new_size_ret,
diff --git a/tests/mkfs-tests/034-rootdir-subvol/test.sh b/tests/mkfs-tests/034-rootdir-subvol/test.sh
new file mode 100755
index 00000000..d8085659
--- /dev/null
+++ b/tests/mkfs-tests/034-rootdir-subvol/test.sh
@@ -0,0 +1,33 @@ 
+#!/bin/bash
+# smoke test for mkfs.btrfs --subvol option
+
+source "$TEST_TOP/common" || exit
+
+check_prereq mkfs.btrfs
+check_prereq btrfs
+
+setup_root_helper
+prepare_test_dev
+
+tmp=$(_mktemp_dir mkfs-rootdir)
+
+# Source tree: a regular file at the top level and a nested directory that is
+# requested as a subvolume.
+touch "$tmp/foo"
+mkdir "$tmp/dir"
+mkdir "$tmp/dir/subvol"
+touch "$tmp/dir/subvol/bar"
+
+run_check_mkfs_test_dev --rootdir "$tmp" --subvol dir/subvol
+run_check $SUDO_HELPER "$TOP/btrfs" check "$TEST_DEV"
+
+# Keep only the 9th space-separated field of the listing, which is compared
+# against the expected subvolume path below.
+run_check_mount_test_dev
+run_check_stdout $SUDO_HELPER "$TOP/btrfs" subvolume list "$TEST_MNT" | \
+	cut -d\  -f9 > "$tmp/output"
+run_check_umount_test_dev
+
+result=$(cat "$tmp/output")
+
+if [ "$result" != "dir/subvol" ]; then
+	_fail "dir/subvol not in subvolume list"
+fi
+
+rm -rf -- "$tmp"