diff mbox series

[v3,14/54] btrfs: convert BUG_ON()'s in select_reloc_root() to proper errors

Message ID a9346ccd6f5de1a6cac12918ccace014b7f3bd6c.1606938211.git.josef@toxicpanda.com (mailing list archive)
State New, archived
Headers show
Series Cleanup error handling in relocation | expand

Commit Message

Josef Bacik Dec. 2, 2020, 7:50 p.m. UTC
We have several BUG_ON()'s in select_reloc_root() that can be tripped if
you have extent tree corruption.  Convert these to ASSERT()'s, because
if we hit it during testing it really is bad, or could indicate a
problem with the backref walking code.

However if users hit these problems it generally indicates corruption,
I've hit a few machines in the fleet that trip over these with clearly
corrupted extent trees, so be nice and spit out an error message and
return an error instead of bringing the whole box down.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/relocation.c | 51 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 4 deletions(-)

Comments

Qu Wenruo Dec. 3, 2020, 2:29 a.m. UTC | #1
On 2020/12/3 上午3:50, Josef Bacik wrote:
> We have several BUG_ON()'s in select_reloc_root() that can be tripped if
> you have extent tree corruption.  Convert these to ASSERT()'s, because
> if we hit it during testing it really is bad, or could indicate a
> problem with the backref walking code.
> 
> However if users hit these problems it generally indicates corruption,
> I've hit a few machines in the fleet that trip over these with clearly
> corrupted extent trees, so be nice and spit out an error message and
> return an error instead of bringing the whole box down.
> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/relocation.c | 51 +++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 47 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index 66515ccc04fe..bf4e1018356a 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -1996,8 +1996,35 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
>  		cond_resched();
>  		next = walk_up_backref(next, edges, &index);
>  		root = next->root;
> -		BUG_ON(!root);
> -		BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
> +
> +		/*
> +		 * If there is no root, then our references for this block are
> +		 * incomplete, as we should be able to walk all the way up to a
> +		 * block that is owned by a root.
> +		 *
> +		 * This path is only for SHAREABLE roots, so if we come upon a
> +		 * non-SHAREABLE root then we have backrefs that resolve
> +		 * improperly.
> +		 *
> +		 * Both of these cases indicate file system corruption, or a bug
> +		 * in the backref walking code.  The ASSERT() is to make sure
> +		 * developers get bitten as soon as possible, proper error
> +		 * handling is for users who may have corrupt file systems.
> +		 */
> +		if (!root) {
> +			ASSERT(root);

ASSERT(0); maybe a little less confusing.

> +			btrfs_err(trans->fs_info,
> +		"bytenr %llu doesn't have a backref path ending in a root",
> +				  node->bytenr);
> +			return ERR_PTR(-EUCLEAN);
> +		}
> +		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
> +			ASSERT(test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
Same here.

> +			btrfs_err(trans->fs_info,
> +"bytenr %llu has multiple refs with one ending in a non shareable root",
> +				  node->bytenr);
> +			return ERR_PTR(-EUCLEAN);
> +		}
>  
>  		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
>  			record_reloc_root_in_trans(trans, root);
> @@ -2008,8 +2035,24 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
>  		root = root->reloc_root;
>  
>  		if (next->new_bytenr != root->node->start) {
> -			BUG_ON(next->new_bytenr);
> -			BUG_ON(!list_empty(&next->list));
> +			/*
> +			 * We just created the reloc root, so we shouldn't have
> +			 * ->new_bytenr set and this shouldn't be in the changed
> +			 *  list.  If it is then we have multiple roots pointing
> +			 *  at the same bytenr, or we've made a mistake in the
> +			 *  backref walking code.  ASSERT() for developers,
> +			 *  error out for users, as it indicates corruption or a
> +			 *  bad bug.

The ASSERT() comment mentioned everywhere seems a little overkilled.

> +			 */
> +			ASSERT(next->new_bytenr == 0);
> +			ASSERT(list_empty(&next->list));
> +			if (next->new_bytenr || !list_empty(&next->list)) {

Just ASSERT(0); here would be good enough.

Despite that, the new ASSERT() for developer and do error handling
properly is really awesome behavior.

Thanks,
Qu

> +				btrfs_err(trans->fs_info,
> +"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
> +					  node->bytenr, next->bytenr);
> +				return ERR_PTR(-EUCLEAN);
> +			}
> +
>  			next->new_bytenr = root->node->start;
>  			btrfs_put_root(next->root);
>  			next->root = btrfs_grab_root(root);
>
diff mbox series

Patch

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 66515ccc04fe..bf4e1018356a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1996,8 +1996,35 @@  struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
 		cond_resched();
 		next = walk_up_backref(next, edges, &index);
 		root = next->root;
-		BUG_ON(!root);
-		BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
+
+		/*
+		 * If there is no root, then our references for this block are
+		 * incomplete, as we should be able to walk all the way up to a
+		 * block that is owned by a root.
+		 *
+		 * This path is only for SHAREABLE roots, so if we come upon a
+		 * non-SHAREABLE root then we have backrefs that resolve
+		 * improperly.
+		 *
+		 * Both of these cases indicate file system corruption, or a bug
+		 * in the backref walking code.  The ASSERT() is to make sure
+		 * developers get bitten as soon as possible, proper error
+		 * handling is for users who may have corrupt file systems.
+		 */
+		if (!root) {
+			ASSERT(root);
+			btrfs_err(trans->fs_info,
+		"bytenr %llu doesn't have a backref path ending in a root",
+				  node->bytenr);
+			return ERR_PTR(-EUCLEAN);
+		}
+		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+			ASSERT(test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
+			btrfs_err(trans->fs_info,
+"bytenr %llu has multiple refs with one ending in a non shareable root",
+				  node->bytenr);
+			return ERR_PTR(-EUCLEAN);
+		}
 
 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
 			record_reloc_root_in_trans(trans, root);
@@ -2008,8 +2035,24 @@  struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
 		root = root->reloc_root;
 
 		if (next->new_bytenr != root->node->start) {
-			BUG_ON(next->new_bytenr);
-			BUG_ON(!list_empty(&next->list));
+			/*
+			 * We just created the reloc root, so we shouldn't have
+			 * ->new_bytenr set and this shouldn't be in the changed
+			 *  list.  If it is then we have multiple roots pointing
+			 *  at the same bytenr, or we've made a mistake in the
+			 *  backref walking code.  ASSERT() for developers,
+			 *  error out for users, as it indicates corruption or a
+			 *  bad bug.
+			 */
+			ASSERT(next->new_bytenr == 0);
+			ASSERT(list_empty(&next->list));
+			if (next->new_bytenr || !list_empty(&next->list)) {
+				btrfs_err(trans->fs_info,
+"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
+					  node->bytenr, next->bytenr);
+				return ERR_PTR(-EUCLEAN);
+			}
+
 			next->new_bytenr = root->node->start;
 			btrfs_put_root(next->root);
 			next->root = btrfs_grab_root(root);