diff mbox series

btrfs: output mirror number for bad metadata

Message ID ae3c7264a3aefe55c64e3c6a0426289800023742.1655646447.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: output mirror number for bad metadata | expand

Commit Message

Qu Wenruo June 19, 2022, 1:47 p.m. UTC
When handling a real world transid mismatch image, it's hard to know
which copy is corrupted, as the error messages just look like this:

BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0

We don't even know whether the retry is caused by btrfs or by the VFS.

To make things a little easier to read, this patch adds the mirror
number to all related tree block read error messages.

So the above messages would look like this:

 BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
 BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
 BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
 BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/disk-io.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

Comments

Wang Yugui June 19, 2022, 2:10 p.m. UTC | #1
Hi,

> When handling a real world transid mismatch image, it's hard to know
> which copy is corrupted, as the error messages just look like this:
> 
> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0

Is this case like the following:
metadata mirror 1: updated with new data
metadata mirror 2: old data
data mirror 1: old data
data mirror 2: old data

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2022/06/19


> We don't even know if the retry is caused by btrfs or the VFS retry.
> 
> To make things a little easier to read, this patch will add mirror
> number for all related tree block read errors.
> 
> So the above messages would look like this:
> 
>  BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>  BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>  BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>  BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/disk-io.c | 25 +++++++++++++------------
>  1 file changed, 13 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 800ad3a9c68e..506d48b5fd7e 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -220,8 +220,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
>  		goto out;
>  	}
>  	btrfs_err_rl(eb->fs_info,
> -		"parent transid verify failed on %llu wanted %llu found %llu",
> -			eb->start,
> +	"parent transid verify failed on %llu mirror %u wanted %llu found %llu",
> +			eb->start, eb->read_mirror,
>  			parent_transid, btrfs_header_generation(eb));
>  	ret = 1;
>  	clear_extent_buffer_uptodate(eb);
> @@ -551,21 +551,22 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>  
>  	found_start = btrfs_header_bytenr(eb);
>  	if (found_start != eb->start) {
> -		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
> -			     eb->start, found_start);
> +		btrfs_err_rl(fs_info,
> +			"bad tree block start, mirror %u want %llu have %llu",
> +			     eb->read_mirror, eb->start, found_start);
>  		ret = -EIO;
>  		goto out;
>  	}
>  	if (check_tree_block_fsid(eb)) {
> -		btrfs_err_rl(fs_info, "bad fsid on block %llu",
> -			     eb->start);
> +		btrfs_err_rl(fs_info, "bad fsid on block %llu mirror %u",
> +			     eb->start, eb->read_mirror);
>  		ret = -EIO;
>  		goto out;
>  	}
>  	found_level = btrfs_header_level(eb);
>  	if (found_level >= BTRFS_MAX_LEVEL) {
> -		btrfs_err(fs_info, "bad tree block level %d on %llu",
> -			  (int)btrfs_header_level(eb), eb->start);
> +		btrfs_err(fs_info, "bad tree block mirror %u level %d on %llu",
> +			  eb->read_mirror, btrfs_header_level(eb), eb->start);
>  		ret = -EIO;
>  		goto out;
>  	}
> @@ -576,8 +577,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>  
>  	if (memcmp(result, header_csum, csum_size) != 0) {
>  		btrfs_warn_rl(fs_info,
> -	"checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
> -			      eb->start,
> +	"checksum verify failed on %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
> +			      eb->start, eb->read_mirror,
>  			      CSUM_FMT_VALUE(csum_size, header_csum),
>  			      CSUM_FMT_VALUE(csum_size, result),
>  			      btrfs_header_level(eb));
> @@ -602,8 +603,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>  		set_extent_buffer_uptodate(eb);
>  	else
>  		btrfs_err(fs_info,
> -			  "block=%llu read time tree block corruption detected",
> -			  eb->start);
> +		"block=%llu mirror %u read time tree block corruption detected",
> +			  eb->start, eb->read_mirror);
>  out:
>  	return ret;
>  }
> -- 
> 2.36.1
Qu Wenruo June 19, 2022, 9:37 p.m. UTC | #2
On 2022/6/19 22:10, Wang Yugui wrote:
> Hi,
>
>> When handling a real world transid mismatch image, it's hard to know
>> which copy is corrupted, as the error messages just look like this:
>>
>> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>
> Is this case like the following:
> metadata mirror 1: updated with new data
> metadata mirror 2: old data
> data mirror 1: old data
> data mirror 2: old data

It's a crafted case in which all metadata copies are corrupted (0xcdcd is
the default xfs_io pwrite pattern).
>
> Best Regards
> Wang Yugui (wangyugui@e16-tech.com)
> 2022/06/19
>
>
>> We don't even know if the retry is caused by btrfs or the VFS retry.
>>
>> To make things a little easier to read, this patch will add mirror
>> number for all related tree block read errors.
>>
>> So the above messages would look like this:
>>
>>   BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>>   BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>>   BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>>   BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/disk-io.c | 25 +++++++++++++------------
>>   1 file changed, 13 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 800ad3a9c68e..506d48b5fd7e 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -220,8 +220,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
>>   		goto out;
>>   	}
>>   	btrfs_err_rl(eb->fs_info,
>> -		"parent transid verify failed on %llu wanted %llu found %llu",
>> -			eb->start,
>> +	"parent transid verify failed on %llu mirror %u wanted %llu found %llu",
>> +			eb->start, eb->read_mirror,
>>   			parent_transid, btrfs_header_generation(eb));
>>   	ret = 1;
>>   	clear_extent_buffer_uptodate(eb);
>> @@ -551,21 +551,22 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>>
>>   	found_start = btrfs_header_bytenr(eb);
>>   	if (found_start != eb->start) {
>> -		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
>> -			     eb->start, found_start);
>> +		btrfs_err_rl(fs_info,
>> +			"bad tree block start, mirror %u want %llu have %llu",
>> +			     eb->read_mirror, eb->start, found_start);
>>   		ret = -EIO;
>>   		goto out;
>>   	}
>>   	if (check_tree_block_fsid(eb)) {
>> -		btrfs_err_rl(fs_info, "bad fsid on block %llu",
>> -			     eb->start);
>> +		btrfs_err_rl(fs_info, "bad fsid on block %llu mirror %u",
>> +			     eb->start, eb->read_mirror);
>>   		ret = -EIO;
>>   		goto out;
>>   	}
>>   	found_level = btrfs_header_level(eb);
>>   	if (found_level >= BTRFS_MAX_LEVEL) {
>> -		btrfs_err(fs_info, "bad tree block level %d on %llu",
>> -			  (int)btrfs_header_level(eb), eb->start);
>> +		btrfs_err(fs_info, "bad tree block mirror %u level %d on %llu",
>> +			  eb->read_mirror, btrfs_header_level(eb), eb->start);
>>   		ret = -EIO;
>>   		goto out;
>>   	}
>> @@ -576,8 +577,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>>
>>   	if (memcmp(result, header_csum, csum_size) != 0) {
>>   		btrfs_warn_rl(fs_info,
>> -	"checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
>> -			      eb->start,
>> +	"checksum verify failed on %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
>> +			      eb->start, eb->read_mirror,
>>   			      CSUM_FMT_VALUE(csum_size, header_csum),
>>   			      CSUM_FMT_VALUE(csum_size, result),
>>   			      btrfs_header_level(eb));
>> @@ -602,8 +603,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>>   		set_extent_buffer_uptodate(eb);
>>   	else
>>   		btrfs_err(fs_info,
>> -			  "block=%llu read time tree block corruption detected",
>> -			  eb->start);
>> +		"block=%llu mirror %u read time tree block corruption detected",
>> +			  eb->start, eb->read_mirror);
>>   out:
>>   	return ret;
>>   }
>> --
>> 2.36.1
>
>
David Sterba June 22, 2022, 4:23 p.m. UTC | #3
On Sun, Jun 19, 2022 at 09:47:56PM +0800, Qu Wenruo wrote:
> When handling a real world transid mismatch image, it's hard to know
> which copy is corrupted, as the error messages just look like this:
> 
> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> 
> We don't even know if the retry is caused by btrfs or the VFS retry.
> 
> To make things a little easier to read, this patch will add mirror
> number for all related tree block read errors.
> 
> So the above messages would look like this:
> 
>  BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>  BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>  BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
>  BTRFS warning (device dm-3): checksum verify failed on 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>

Added to misc-next, thanks.

> ---
>  fs/btrfs/disk-io.c | 25 +++++++++++++------------
>  1 file changed, 13 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 800ad3a9c68e..506d48b5fd7e 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -220,8 +220,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
>  		goto out;
>  	}
>  	btrfs_err_rl(eb->fs_info,
> -		"parent transid verify failed on %llu wanted %llu found %llu",
> -			eb->start,
> +	"parent transid verify failed on %llu mirror %u wanted %llu found %llu",

I've added "logical" in front of all %llu specifiers that refer to a logical block address.


> +			eb->start, eb->read_mirror,
>  			parent_transid, btrfs_header_generation(eb));
>  	ret = 1;
>  	clear_extent_buffer_uptodate(eb);
> @@ -551,21 +551,22 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>  
>  	found_start = btrfs_header_bytenr(eb);
>  	if (found_start != eb->start) {
> -		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
> -			     eb->start, found_start);
> +		btrfs_err_rl(fs_info,
> +			"bad tree block start, mirror %u want %llu have %llu",
> +			     eb->read_mirror, eb->start, found_start);
>  		ret = -EIO;
>  		goto out;
>  	}
>  	if (check_tree_block_fsid(eb)) {
> -		btrfs_err_rl(fs_info, "bad fsid on block %llu",
> -			     eb->start);
> +		btrfs_err_rl(fs_info, "bad fsid on block %llu mirror %u",

Though this says 'block', it's not in fact a block address (i.e. a multiple
of the block size) but the logical address.

> +			     eb->start, eb->read_mirror);
>  		ret = -EIO;
>  		goto out;
>  	}
>  	found_level = btrfs_header_level(eb);
>  	if (found_level >= BTRFS_MAX_LEVEL) {
> -		btrfs_err(fs_info, "bad tree block level %d on %llu",
> -			  (int)btrfs_header_level(eb), eb->start);
> +		btrfs_err(fs_info, "bad tree block mirror %u level %d on %llu",
> +			  eb->read_mirror, btrfs_header_level(eb), eb->start);
>  		ret = -EIO;
>  		goto out;
>  	}
> @@ -576,8 +577,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>  
>  	if (memcmp(result, header_csum, csum_size) != 0) {
>  		btrfs_warn_rl(fs_info,
> -	"checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
> -			      eb->start,
> +	"checksum verify failed on %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
> +			      eb->start, eb->read_mirror,
>  			      CSUM_FMT_VALUE(csum_size, header_csum),
>  			      CSUM_FMT_VALUE(csum_size, result),
>  			      btrfs_header_level(eb));
> @@ -602,8 +603,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
>  		set_extent_buffer_uptodate(eb);
>  	else
>  		btrfs_err(fs_info,
> -			  "block=%llu read time tree block corruption detected",
> -			  eb->start);
> +		"block=%llu mirror %u read time tree block corruption detected",
> +			  eb->start, eb->read_mirror);
>  out:
>  	return ret;
>  }
> -- 
> 2.36.1
diff mbox series

Patch

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 800ad3a9c68e..506d48b5fd7e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -220,8 +220,8 @@  static int verify_parent_transid(struct extent_io_tree *io_tree,
 		goto out;
 	}
 	btrfs_err_rl(eb->fs_info,
-		"parent transid verify failed on %llu wanted %llu found %llu",
-			eb->start,
+	"parent transid verify failed on %llu mirror %u wanted %llu found %llu",
+			eb->start, eb->read_mirror,
 			parent_transid, btrfs_header_generation(eb));
 	ret = 1;
 	clear_extent_buffer_uptodate(eb);
@@ -551,21 +551,22 @@  static int validate_extent_buffer(struct extent_buffer *eb)
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != eb->start) {
-		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
-			     eb->start, found_start);
+		btrfs_err_rl(fs_info,
+			"bad tree block start, mirror %u want %llu have %llu",
+			     eb->read_mirror, eb->start, found_start);
 		ret = -EIO;
 		goto out;
 	}
 	if (check_tree_block_fsid(eb)) {
-		btrfs_err_rl(fs_info, "bad fsid on block %llu",
-			     eb->start);
+		btrfs_err_rl(fs_info, "bad fsid on block %llu mirror %u",
+			     eb->start, eb->read_mirror);
 		ret = -EIO;
 		goto out;
 	}
 	found_level = btrfs_header_level(eb);
 	if (found_level >= BTRFS_MAX_LEVEL) {
-		btrfs_err(fs_info, "bad tree block level %d on %llu",
-			  (int)btrfs_header_level(eb), eb->start);
+		btrfs_err(fs_info, "bad tree block mirror %u level %d on %llu",
+			  eb->read_mirror, btrfs_header_level(eb), eb->start);
 		ret = -EIO;
 		goto out;
 	}
@@ -576,8 +577,8 @@  static int validate_extent_buffer(struct extent_buffer *eb)
 
 	if (memcmp(result, header_csum, csum_size) != 0) {
 		btrfs_warn_rl(fs_info,
-	"checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
-			      eb->start,
+	"checksum verify failed on %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+			      eb->start, eb->read_mirror,
 			      CSUM_FMT_VALUE(csum_size, header_csum),
 			      CSUM_FMT_VALUE(csum_size, result),
 			      btrfs_header_level(eb));
@@ -602,8 +603,8 @@  static int validate_extent_buffer(struct extent_buffer *eb)
 		set_extent_buffer_uptodate(eb);
 	else
 		btrfs_err(fs_info,
-			  "block=%llu read time tree block corruption detected",
-			  eb->start);
+		"block=%llu mirror %u read time tree block corruption detected",
+			  eb->start, eb->read_mirror);
 out:
 	return ret;
 }