diff mbox series

[2/7] btrfs: fix qgroup prealloc rsv leak in subvolume operations

Message ID c39e9aea5fa5e22c42fcea3d810d78cb499312ff.1711488980.git.boris@bur.io (mailing list archive)
State New, archived
Headers show
Series btrfs: various qg meta rsv leak fixes | expand

Commit Message

Boris Burkov March 26, 2024, 9:39 p.m. UTC
Create subvol, create snapshot and delete subvol all use
btrfs_subvolume_reserve_metadata to reserve metadata for the changes
done to the parent subvolume's fs tree, which cannot be mediated in the
normal way via start_transaction. When quota groups (squota or qgroups)
are enabled, this reserves qgroup metadata of type PREALLOC. Once the
operation is associated to a transaction, we convert PREALLOC to
PERTRANS, which gets cleared in bulk at the end of the transaction.

However, the error paths of these three operations were not implementing
this lifecycle correctly. They unconditionally converted the PREALLOC to
PERTRANS in a generic cleanup step regardless of errors or whether the
operation was fully associated to a transaction or not. This resulted in
error paths occasionally converting this rsv to PERTRANS without calling
record_root_in_trans successfully, which meant that unless that root got
recorded in the transaction by some other thread, the end of the
transaction would not free that root's PERTRANS, leaking it. Ultimately,
this resulted in hitting a WARN in CONFIG_BTRFS_DEBUG builds at unmount
for the leaked reservation.

The fix is to ensure that every qgroup PREALLOC reservation observes the
following properties:
1. any failure before record_root_in_trans is called successfully
   results in freeing the PREALLOC reservation.
2. after record_root_in_trans, we convert to PERTRANS, and now the
   transaction owns freeing the reservation.

This patch enforces those properties on the three operations. Without
it, generic/269 with squotas enabled at MKFS time would fail in ~5-10
runs on my system. With this patch, it ran successfully 1000 times in a
row.

Fixes: e85fde5162bf ("btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations")
Signed-off-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/inode.c     | 13 ++++++++++++-
 fs/btrfs/ioctl.c     | 37 ++++++++++++++++++++++++++++---------
 fs/btrfs/root-tree.c | 10 ----------
 fs/btrfs/root-tree.h |  2 --
 4 files changed, 40 insertions(+), 22 deletions(-)

Comments

Qu Wenruo March 26, 2024, 10:07 p.m. UTC | #1
在 2024/3/27 08:09, Boris Burkov 写道:
> Create subvol, create snapshot and delete subvol all use
> btrfs_subvolume_reserve_metadata to reserve metadata for the changes
> done to the parent subvolume's fs tree, which cannot be mediated in the
> normal way via start_transaction. When quota groups (squota or qgroups)
> are enabled, this reserves qgroup metadata of type PREALLOC. Once the
> operation is associated to a transaction, we convert PREALLOC to
> PERTRANS, which gets cleared in bulk at the end of the transaction.
> 
> However, the error paths of these three operations were not implementing
> this lifecycle correctly. They unconditionally converted the PREALLOC to
> PERTRANS in a generic cleanup step regardless of errors or whether the
> operation was fully associated to a transaction or not. This resulted in
> error paths occasionally converting this rsv to PERTRANS without calling
> record_root_in_trans successfully, which meant that unless that root got
> recorded in the transaction by some other thread, the end of the
> transaction would not free that root's PERTRANS, leaking it. Ultimately,
> this resulted in hitting a WARN in CONFIG_BTRFS_DEBUG builds at unmount
> for the leaked reservation.
> 
> The fix is to ensure that every qgroup PREALLOC reservation observes the
> following properties:
> 1. any failure before record_root_in_trans is called successfully
>     results in freeing the PREALLOC reservation.
> 2. after record_root_in_trans, we convert to PERTRANS, and now the
>     transaction owns freeing the reservation.
> 
> This patch enforces those properties on the three operations. Without
> it, generic/269 with squotas enabled at MKFS time would fail in ~5-10
> runs on my system. With this patch, it ran successfully 1000 times in a
> row.
> 
> Fixes: e85fde5162bf ("btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations")

Thanks for pinning down the bug, and it looks fine to me.

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu

> Signed-off-by: Boris Burkov <boris@bur.io>
> ---
>   fs/btrfs/inode.c     | 13 ++++++++++++-
>   fs/btrfs/ioctl.c     | 37 ++++++++++++++++++++++++++++---------
>   fs/btrfs/root-tree.c | 10 ----------
>   fs/btrfs/root-tree.h |  2 --
>   4 files changed, 40 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 09f0a20b5ce8..2587a2e25e44 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -4503,6 +4503,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
>   	struct btrfs_trans_handle *trans;
>   	struct btrfs_block_rsv block_rsv;
>   	u64 root_flags;
> +	u64 qgroup_reserved = 0;
>   	int ret;
>   
>   	down_write(&fs_info->subvol_sem);
> @@ -4547,12 +4548,20 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
>   	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
>   	if (ret)
>   		goto out_undead;
> +	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
>   
>   	trans = btrfs_start_transaction(root, 0);
>   	if (IS_ERR(trans)) {
>   		ret = PTR_ERR(trans);
>   		goto out_release;
>   	}
> +	ret = btrfs_record_root_in_trans(trans, root);
> +	if (ret) {
> +		btrfs_abort_transaction(trans, ret);
> +		goto out_end_trans;
> +	}
> +	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
> +	qgroup_reserved = 0;
>   	trans->block_rsv = &block_rsv;
>   	trans->bytes_reserved = block_rsv.size;
>   
> @@ -4611,7 +4620,9 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
>   	ret = btrfs_end_transaction(trans);
>   	inode->i_flags |= S_DEAD;
>   out_release:
> -	btrfs_subvolume_release_metadata(root, &block_rsv);
> +	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
> +	if (qgroup_reserved)
> +		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
>   out_undead:
>   	if (ret) {
>   		spin_lock(&dest->root_item_lock);
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index e3a72292eda9..888dc92c6c75 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -613,6 +613,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
>   	int ret;
>   	dev_t anon_dev;
>   	u64 objectid;
> +	u64 qgroup_reserved = 0;
>   
>   	root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
>   	if (!root_item)
> @@ -650,13 +651,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
>   					       trans_num_items, false);
>   	if (ret)
>   		goto out_new_inode_args;
> +	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
>   
>   	trans = btrfs_start_transaction(root, 0);
>   	if (IS_ERR(trans)) {
>   		ret = PTR_ERR(trans);
> -		btrfs_subvolume_release_metadata(root, &block_rsv);
> -		goto out_new_inode_args;
> +		goto out_release_rsv;
>   	}
> +	ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
> +	if (ret)
> +		goto out;
> +	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
> +	qgroup_reserved = 0;
>   	trans->block_rsv = &block_rsv;
>   	trans->bytes_reserved = block_rsv.size;
>   	/* Tree log can't currently deal with an inode which is a new root. */
> @@ -767,9 +773,11 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
>   out:
>   	trans->block_rsv = NULL;
>   	trans->bytes_reserved = 0;
> -	btrfs_subvolume_release_metadata(root, &block_rsv);
> -
>   	btrfs_end_transaction(trans);
> +out_release_rsv:
> +	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
> +	if (qgroup_reserved)
> +		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
>   out_new_inode_args:
>   	btrfs_new_inode_args_destroy(&new_inode_args);
>   out_inode:
> @@ -791,6 +799,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
>   	struct btrfs_pending_snapshot *pending_snapshot;
>   	unsigned int trans_num_items;
>   	struct btrfs_trans_handle *trans;
> +	struct btrfs_block_rsv *block_rsv;
> +	u64 qgroup_reserved = 0;
>   	int ret;
>   
>   	/* We do not support snapshotting right now. */
> @@ -827,19 +837,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
>   		goto free_pending;
>   	}
>   
> -	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
> -			     BTRFS_BLOCK_RSV_TEMP);
> +	block_rsv = &pending_snapshot->block_rsv;
> +	btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP);
>   	/*
>   	 * 1 to add dir item
>   	 * 1 to add dir index
>   	 * 1 to update parent inode item
>   	 */
>   	trans_num_items = create_subvol_num_items(inherit) + 3;
> -	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
> -					       &pending_snapshot->block_rsv,
> +	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv,
>   					       trans_num_items, false);
>   	if (ret)
>   		goto free_pending;
> +	qgroup_reserved = block_rsv->qgroup_rsv_reserved;
>   
>   	pending_snapshot->dentry = dentry;
>   	pending_snapshot->root = root;
> @@ -852,6 +862,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
>   		ret = PTR_ERR(trans);
>   		goto fail;
>   	}
> +	ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
> +	if (ret) {
> +		btrfs_end_transaction(trans);
> +		goto fail;
> +	}
> +	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
> +	qgroup_reserved = 0;
>   
>   	trans->pending_snapshot = pending_snapshot;
>   
> @@ -881,7 +898,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
>   	if (ret && pending_snapshot->snap)
>   		pending_snapshot->snap->anon_dev = 0;
>   	btrfs_put_root(pending_snapshot->snap);
> -	btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
> +	btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL);
> +	if (qgroup_reserved)
> +		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
>   free_pending:
>   	if (pending_snapshot->anon_dev)
>   		free_anon_bdev(pending_snapshot->anon_dev);
> diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
> index 4bb538a372ce..7007f9e0c972 100644
> --- a/fs/btrfs/root-tree.c
> +++ b/fs/btrfs/root-tree.c
> @@ -548,13 +548,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
>   	}
>   	return ret;
>   }
> -
> -void btrfs_subvolume_release_metadata(struct btrfs_root *root,
> -				      struct btrfs_block_rsv *rsv)
> -{
> -	struct btrfs_fs_info *fs_info = root->fs_info;
> -	u64 qgroup_to_release;
> -
> -	btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
> -	btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
> -}
> diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
> index 6f929cf3bd49..8f5739e732b9 100644
> --- a/fs/btrfs/root-tree.h
> +++ b/fs/btrfs/root-tree.h
> @@ -18,8 +18,6 @@ struct btrfs_trans_handle;
>   int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
>   				     struct btrfs_block_rsv *rsv,
>   				     int nitems, bool use_global_rsv);
> -void btrfs_subvolume_release_metadata(struct btrfs_root *root,
> -				      struct btrfs_block_rsv *rsv);
>   int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
>   		       u64 ref_id, u64 dirid, u64 sequence,
>   		       const struct fscrypt_str *name);
diff mbox series

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09f0a20b5ce8..2587a2e25e44 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4503,6 +4503,7 @@  int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_block_rsv block_rsv;
 	u64 root_flags;
+	u64 qgroup_reserved = 0;
 	int ret;
 
 	down_write(&fs_info->subvol_sem);
@@ -4547,12 +4548,20 @@  int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
 	if (ret)
 		goto out_undead;
+	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto out_release;
 	}
+	ret = btrfs_record_root_in_trans(trans, root);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out_end_trans;
+	}
+	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+	qgroup_reserved = 0;
 	trans->block_rsv = &block_rsv;
 	trans->bytes_reserved = block_rsv.size;
 
@@ -4611,7 +4620,9 @@  int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 	ret = btrfs_end_transaction(trans);
 	inode->i_flags |= S_DEAD;
 out_release:
-	btrfs_subvolume_release_metadata(root, &block_rsv);
+	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
+	if (qgroup_reserved)
+		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
 out_undead:
 	if (ret) {
 		spin_lock(&dest->root_item_lock);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e3a72292eda9..888dc92c6c75 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -613,6 +613,7 @@  static noinline int create_subvol(struct mnt_idmap *idmap,
 	int ret;
 	dev_t anon_dev;
 	u64 objectid;
+	u64 qgroup_reserved = 0;
 
 	root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
 	if (!root_item)
@@ -650,13 +651,18 @@  static noinline int create_subvol(struct mnt_idmap *idmap,
 					       trans_num_items, false);
 	if (ret)
 		goto out_new_inode_args;
+	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		btrfs_subvolume_release_metadata(root, &block_rsv);
-		goto out_new_inode_args;
+		goto out_release_rsv;
 	}
+	ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
+	if (ret)
+		goto out;
+	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+	qgroup_reserved = 0;
 	trans->block_rsv = &block_rsv;
 	trans->bytes_reserved = block_rsv.size;
 	/* Tree log can't currently deal with an inode which is a new root. */
@@ -767,9 +773,11 @@  static noinline int create_subvol(struct mnt_idmap *idmap,
 out:
 	trans->block_rsv = NULL;
 	trans->bytes_reserved = 0;
-	btrfs_subvolume_release_metadata(root, &block_rsv);
-
 	btrfs_end_transaction(trans);
+out_release_rsv:
+	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
+	if (qgroup_reserved)
+		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
 out_new_inode_args:
 	btrfs_new_inode_args_destroy(&new_inode_args);
 out_inode:
@@ -791,6 +799,8 @@  static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	struct btrfs_pending_snapshot *pending_snapshot;
 	unsigned int trans_num_items;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_block_rsv *block_rsv;
+	u64 qgroup_reserved = 0;
 	int ret;
 
 	/* We do not support snapshotting right now. */
@@ -827,19 +837,19 @@  static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 		goto free_pending;
 	}
 
-	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
-			     BTRFS_BLOCK_RSV_TEMP);
+	block_rsv = &pending_snapshot->block_rsv;
+	btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP);
 	/*
 	 * 1 to add dir item
 	 * 1 to add dir index
 	 * 1 to update parent inode item
 	 */
 	trans_num_items = create_subvol_num_items(inherit) + 3;
-	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
-					       &pending_snapshot->block_rsv,
+	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv,
 					       trans_num_items, false);
 	if (ret)
 		goto free_pending;
+	qgroup_reserved = block_rsv->qgroup_rsv_reserved;
 
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
@@ -852,6 +862,13 @@  static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 		ret = PTR_ERR(trans);
 		goto fail;
 	}
+	ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
+	if (ret) {
+		btrfs_end_transaction(trans);
+		goto fail;
+	}
+	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+	qgroup_reserved = 0;
 
 	trans->pending_snapshot = pending_snapshot;
 
@@ -881,7 +898,9 @@  static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (ret && pending_snapshot->snap)
 		pending_snapshot->snap->anon_dev = 0;
 	btrfs_put_root(pending_snapshot->snap);
-	btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
+	btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL);
+	if (qgroup_reserved)
+		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
 free_pending:
 	if (pending_snapshot->anon_dev)
 		free_anon_bdev(pending_snapshot->anon_dev);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 4bb538a372ce..7007f9e0c972 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,13 +548,3 @@  int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	}
 	return ret;
 }
-
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
-				      struct btrfs_block_rsv *rsv)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	u64 qgroup_to_release;
-
-	btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
-	btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
-}
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index 6f929cf3bd49..8f5739e732b9 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -18,8 +18,6 @@  struct btrfs_trans_handle;
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     struct btrfs_block_rsv *rsv,
 				     int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
-				      struct btrfs_block_rsv *rsv);
 int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
 		       u64 ref_id, u64 dirid, u64 sequence,
 		       const struct fscrypt_str *name);